6 changed files with 277 additions and 57 deletions
-
53htmldata_get/api_main.py
-
118htmldata_get/api_main.py_
-
20htmldata_get/main.py
-
80htmldata_get/res.json
-
39htmldata_get/tools/class_int.py
-
22htmldata_get/tools/deal_html.py
@ -0,0 +1,118 @@ |
|||
from main import Start |
|||
from fastapi import FastAPI |
|||
from pydantic import BaseModel |
|||
from threading import Thread |
|||
import queue |
|||
import uuid |
|||
import sqlite3 |
|||
import json |
|||
import hashlib |
|||
|
|||
app = FastAPI() |
|||
|
|||
task_queue = queue.Queue() |
|||
|
|||
|
|||
def get_conn(): |
|||
return sqlite3.connect("spider.db", check_same_thread=False) |
|||
|
|||
|
|||
class Spider(BaseModel): |
|||
url: str |
|||
|
|||
|
|||
# 🔁 worker |
|||
def worker(): |
|||
while True: |
|||
task_id, url = task_queue.get() |
|||
conn = get_conn() |
|||
cur = conn.cursor() |
|||
|
|||
try: |
|||
data = Start(url).run() |
|||
|
|||
# cur.execute(""" |
|||
# UPDATE tasks |
|||
# SET status=?, |
|||
# result=? |
|||
# WHERE task_id = ? |
|||
# """, (0, json.dumps(data, ensure_ascii=False), task_id)) |
|||
cur.execute(""" |
|||
INSERT OR REPLACE INTO tasks (task_id, status, result) |
|||
VALUES (?, ?, ?) |
|||
""", (task_id, 0, json.dumps(data, ensure_ascii=False))) |
|||
|
|||
except Exception as e: |
|||
cur.execute(""" |
|||
UPDATE tasks |
|||
SET status=?, |
|||
error=? |
|||
WHERE task_id = ? |
|||
""", ("error", str(e), task_id)) |
|||
|
|||
conn.commit() |
|||
conn.close() |
|||
task_queue.task_done() |
|||
|
|||
|
|||
Thread(target=worker, daemon=True).start() |
|||
|
|||
|
|||
# ✅ 提交任务 |
|||
@app.post("/crawler/put", summary='提交爬虫') |
|||
def put_task(req: Spider): |
|||
""" |
|||
url:爬取的url |
|||
""" |
|||
task_id = str(hashlib.md5(req.url.encode('utf-8')).hexdigest()) |
|||
print(task_id) |
|||
|
|||
conn = get_conn() |
|||
cur = conn.cursor() |
|||
|
|||
cur.execute(""" |
|||
INSERT OR REPLACE INTO tasks (task_id, url, status) |
|||
VALUES (?, ?, ?) |
|||
""", (task_id, req.url, -2)) |
|||
|
|||
conn.commit() |
|||
conn.close() |
|||
|
|||
task_queue.put((task_id, req.url)) |
|||
|
|||
return {'code': 0, 'data': {"task_id": task_id}, 'msg': '操作成功'} |
|||
|
|||
|
|||
# ✅ 获取结果 |
|||
@app.get("/crawler/get/{task_id}", summary='获取数据') |
|||
def get_result(task_id: str): |
|||
""" |
|||
task_id: 提交返回的的task_id |
|||
code: 异常有问题:-1 正在处理:-2 成功:0 |
|||
""" |
|||
conn = get_conn() |
|||
cur = conn.cursor() |
|||
|
|||
cur.execute("SELECT status, result, error FROM tasks WHERE task_id=?", (task_id,)) |
|||
row = cur.fetchone() |
|||
conn.close() |
|||
|
|||
if not row: |
|||
return {"code": -1, 'data': task_id, 'msg': '无此task_id'} |
|||
|
|||
status, result, error = row |
|||
if status == "error": |
|||
return {"code": -1, 'data': task_id, 'msg': '任务处理失败'} |
|||
if int(status) == 0: |
|||
msg = '操作成功' |
|||
else: |
|||
msg = '任务正在处理' |
|||
|
|||
return { |
|||
"code": int(status), |
|||
"data": json.loads(result) if result else None, |
|||
"msg": msg |
|||
} |
|||
|
|||
# uvicorn api_main:app --host 0.0.0.0 --port 8000 |
|||
# uvicorn api_main:app --host 0.0.0.0 --port 32000 --log-level debug |
|||
80
htmldata_get/res.json
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
Write
Preview
Loading…
Cancel
Save
Reference in new issue