爬虫相关
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

118 lines
2.9 KiB

from main import Start
from fastapi import FastAPI
from pydantic import BaseModel
from threading import Thread
import queue
import uuid
import sqlite3
import json
import hashlib
# ASGI application object; the route decorators in this module register
# their handlers on it.
app = FastAPI()
# In-process hand-off between the submit endpoint and the worker thread.
# Items are (task_id, url) tuples; no maxsize is given, so put() never blocks.
task_queue = queue.Queue()
def get_conn(db_path: str = "spider.db") -> sqlite3.Connection:
    """Open a new SQLite connection to the task database.

    Args:
        db_path: Database to open. Defaults to ``"spider.db"``, which is
            resolved relative to the process's working directory. Passing
            another path (or ``":memory:"``) is useful for tests.

    Returns:
        A fresh connection created with ``check_same_thread=False``, so it
        is not tied to the thread that opened it. The caller owns the
        connection and is responsible for closing it.
    """
    return sqlite3.connect(db_path, check_same_thread=False)
class Spider(BaseModel):
    # Request body accepted by the task-submission endpoint.

    # Address of the page to crawl.
    url: str
# 🔁 worker
def worker():
    """Consume crawl tasks from ``task_queue`` forever and persist outcomes.

    Each queue item is a ``(task_id, url)`` tuple. For every item the URL is
    crawled with ``Start(url).run()`` and the outcome is written to the
    ``tasks`` table:

    * success: status ``0`` and the JSON-encoded result;
    * crawl failure: status ``"error"`` and the exception text.

    The function never returns. A failure while talking to the database is
    logged and the loop moves on to the next task, so one bad write cannot
    kill the worker thread. The connection is always closed and
    ``task_done()`` is always called, whatever happens.
    """
    import logging

    log = logging.getLogger(__name__)

    while True:
        task_id, url = task_queue.get()
        conn = None
        try:
            conn = get_conn()
            cur = conn.cursor()
            try:
                data = Start(url).run()
                # NOTE(review): if task_id is the table's unique key (schema
                # not visible here), INSERT OR REPLACE rewrites the whole row,
                # so columns not listed below (e.g. url) are reset to their
                # defaults. Confirm that is intended; otherwise use an UPDATE.
                cur.execute("""
                    INSERT OR REPLACE INTO tasks (task_id, status, result)
                    VALUES (?, ?, ?)
                """, (task_id, 0, json.dumps(data, ensure_ascii=False)))
            except Exception as e:
                # Crawl or serialisation failed: record the reason on the row.
                cur.execute("""
                    UPDATE tasks
                    SET status=?,
                        error=?
                    WHERE task_id = ?
                """, ("error", str(e), task_id))
            conn.commit()
        except Exception:
            # Storage itself failed (connect, error UPDATE, or commit). Log it
            # and keep the thread alive for the remaining tasks.
            log.exception("failed to persist outcome of task %s", task_id)
        finally:
            if conn is not None:
                conn.close()
            task_queue.task_done()
# Start the background worker as soon as this module is imported.
# daemon=True: the thread does not keep the interpreter alive at shutdown.
Thread(target=worker, daemon=True).start()
# ✅ Submit a task
@app.post("/crawler/put", summary='提交爬虫')
def put_task(req: Spider):
    """
    Register a crawl task for the given URL and queue it for the worker.

    url: the URL to crawl

    The task_id is the MD5 hex digest of the URL, so submitting the same URL
    again reuses the same task_id and resets its row to status -2
    (processing). Returns the task_id to poll for the result.
    """
    task_id = hashlib.md5(req.url.encode('utf-8')).hexdigest()
    print(task_id)
    conn = get_conn()
    try:
        cur = conn.cursor()
        cur.execute("""
            INSERT OR REPLACE INTO tasks (task_id, url, status)
            VALUES (?, ?, ?)
        """, (task_id, req.url, -2))
        conn.commit()
    finally:
        # Close the connection even when the insert or commit raises.
        conn.close()
    # Enqueue only after the row is committed, so the worker's later write
    # finds the task already recorded.
    task_queue.put((task_id, req.url))
    return {'code': 0, 'data': {"task_id": task_id}, 'msg': '操作成功'}
# ✅ Fetch the result
@app.get("/crawler/get/{task_id}", summary='获取数据')
def get_result(task_id: str):
    """
    Report the current state of a task and, once finished, its data.

    task_id: the task_id returned when the task was submitted
    code: failure or unknown task: -1, still processing: -2, success: 0
    """
    conn = get_conn()
    try:
        cur = conn.cursor()
        cur.execute("SELECT status, result, error FROM tasks WHERE task_id=?", (task_id,))
        row = cur.fetchone()
    finally:
        # Close the connection even when the query raises.
        conn.close()

    if not row:
        return {"code": -1, 'data': task_id, 'msg': '无此task_id'}

    # The stored error text is read but not returned to the client.
    status, result, _error = row
    if status == "error":
        return {"code": -1, 'data': task_id, 'msg': '任务处理失败'}

    code = int(status)
    msg = '操作成功' if code == 0 else '任务正在处理'
    return {
        "code": code,
        "data": json.loads(result) if result else None,
        "msg": msg
    }
# uvicorn api_main:app --host 0.0.0.0 --port 8000
# uvicorn api_main:app --host 0.0.0.0 --port 32000 --log-level debug