6 changed files with 277 additions and 57 deletions
-
55htmldata_get/api_main.py
-
118htmldata_get/api_main.py_
-
20htmldata_get/main.py
-
80htmldata_get/res.json
-
39htmldata_get/tools/class_int.py
-
22htmldata_get/tools/deal_html.py
@ -0,0 +1,118 @@ |
|||||
|
from main import Start |
||||
|
from fastapi import FastAPI |
||||
|
from pydantic import BaseModel |
||||
|
from threading import Thread |
||||
|
import queue |
||||
|
import uuid |
||||
|
import sqlite3 |
||||
|
import json |
||||
|
import hashlib |
||||
|
|
||||
|
# FastAPI application instance (run with: uvicorn api_main:app --host 0.0.0.0 --port 8000).
app = FastAPI()

# In-process FIFO of pending crawl jobs; each item is a (task_id, url) tuple
# produced by put_task and consumed by the single daemon worker thread.
task_queue = queue.Queue()
||||
|
|
||||
|
|
||||
|
def get_conn():
    """Open and return a fresh connection to the task database.

    A new connection is created per call, so no connection object is ever
    shared between threads; check_same_thread=False additionally allows a
    connection to be used off the thread that created it (worker thread).
    """
    db_file = "spider.db"
    return sqlite3.connect(db_file, check_same_thread=False)
||||
|
|
||||
|
|
||||
|
class Spider(BaseModel):
    """Request body for POST /crawler/put."""

    url: str  # the URL to crawl
||||
|
|
||||
|
|
||||
|
# 🔁 Background worker: consumes (task_id, url) jobs from task_queue,
# runs the crawler and persists the outcome in the `tasks` table.
def worker():
    """Daemon loop that executes queued crawl tasks one at a time.

    Status encoding (shared contract with put_task / get_result):
        -2      queued / in progress (written by put_task)
         0      success, `result` holds the crawled data as JSON
        "error" failure, `error` holds str(exception)
    """
    while True:
        task_id, url = task_queue.get()
        conn = get_conn()
        cur = conn.cursor()
        try:
            data = Start(url).run()
            # Write url back too: a plain REPLACE without it would null
            # out the url column that put_task stored.
            cur.execute("""
                INSERT OR REPLACE INTO tasks (task_id, url, status, result)
                VALUES (?, ?, ?, ?)
            """, (task_id, url, 0, json.dumps(data, ensure_ascii=False)))
        except Exception as e:
            # NOTE(review): status is an int everywhere else but the literal
            # string "error" here; get_result special-cases that string —
            # keep the two in sync if this encoding ever changes.
            cur.execute("""
                UPDATE tasks
                SET status=?,
                    error=?
                WHERE task_id = ?
            """, ("error", str(e), task_id))
        finally:
            # Guarantee cleanup even if the except-branch UPDATE itself
            # raised; otherwise the connection leaks and task_done() is
            # never balanced against the matching get().
            conn.commit()
            conn.close()
            task_queue.task_done()


# Single consumer; daemon=True so the thread never blocks interpreter exit.
Thread(target=worker, daemon=True).start()
||||
|
|
||||
|
|
||||
|
# ✅ 提交任务
@app.post("/crawler/put", summary='提交爬虫')
def put_task(req: Spider):
    """
    url: the URL to crawl.

    The task id is the MD5 hex digest of the url, so re-submitting the
    same url resets and reuses the same row instead of creating a new one.
    """
    # hexdigest() already returns str — no extra str() wrapper needed.
    task_id = hashlib.md5(req.url.encode('utf-8')).hexdigest()

    conn = get_conn()
    try:
        cur = conn.cursor()
        # -2 == queued / in progress (see get_result for the encoding).
        cur.execute("""
            INSERT OR REPLACE INTO tasks (task_id, url, status)
            VALUES (?, ?, ?)
        """, (task_id, req.url, -2))
        conn.commit()
    finally:
        # Close even if the insert raises, so connections never leak.
        conn.close()

    # Enqueue only after the row exists, so the worker's UPDATE-on-error
    # always has a row to hit.
    task_queue.put((task_id, req.url))

    return {'code': 0, 'data': {"task_id": task_id}, 'msg': '操作成功'}
||||
|
|
||||
|
|
||||
|
# ✅ 获取结果
@app.get("/crawler/get/{task_id}", summary='获取数据')
def get_result(task_id: str):
    """
    task_id: the task_id returned by /crawler/put.
    code: -1 error / unknown id, -2 still processing, 0 success.
    """
    conn = get_conn()
    try:
        cur = conn.cursor()
        cur.execute(
            "SELECT status, result, error FROM tasks WHERE task_id=?",
            (task_id,),
        )
        row = cur.fetchone()
    finally:
        # Close even if the query raises, so connections never leak.
        conn.close()

    if not row:
        return {"code": -1, 'data': task_id, 'msg': '无此task_id'}

    status, result, error = row
    # The worker stores the literal string "error" on failure; every other
    # status value is an int (-2 queued, 0 done).
    if status == "error":
        return {"code": -1, 'data': task_id, 'msg': '任务处理失败'}

    msg = '操作成功' if int(status) == 0 else '任务正在处理'
    return {
        "code": int(status),
        "data": json.loads(result) if result else None,
        "msg": msg,
    }
||||
|
|
||||
|
# uvicorn api_main:app --host 0.0.0.0 --port 8000 |
||||
|
# uvicorn api_main:app --host 0.0.0.0 --port 32000 --log-level debug |
||||
80
htmldata_get/res.json
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
Write
Preview
Loading…
Cancel
Save
Reference in new issue