You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
270 lines
8.3 KiB
270 lines
8.3 KiB
import os
|
|
import json
|
|
import time
|
|
import threading
|
|
from datetime import datetime
|
|
from typing import Dict, List
|
|
|
|
import requests
|
|
from loguru import logger
|
|
|
|
from utlit.encrpty import encrypt_payload
|
|
from utlit.retry import retry
|
|
|
|
# 配置日志
|
|
cwd = os.path.dirname(os.path.abspath(__file__))
|
|
logger.add(f"{cwd}/wxyy.log",
|
|
level="DEBUG",
|
|
rotation="00:00",
|
|
retention="3 days",
|
|
compression="zip",
|
|
backtrace=True)
|
|
|
|
BASE = 'https://api.granking.com/api/third'
|
|
HOST = 'api.granking.com'
|
|
|
|
|
|
class ToolsLoad:
|
|
"""工具类:处理任务获取、提交等操作"""
|
|
|
|
@retry('获取文心一言任务', 5, time_sleep=10)
|
|
def get_task(self):
|
|
url = f"{BASE}/getTask?app_id=aa65700299848d6f21b969dbc9f6cf7c&secret=5588071d36f0bc61af849c311a03f2c4&platform_ids=6"
|
|
|
|
resp = requests.get(url, timeout=(5, 20))
|
|
resp.raise_for_status()
|
|
return resp.json()
|
|
|
|
@retry('提交文心一言结果', 5, time_sleep=5)
|
|
def post_task(self, data):
|
|
url = f"{BASE}/submitProjectTask"
|
|
headers = {
|
|
"User-Agent": "Apifox/1.0.0",
|
|
"Content-Type": "application/json",
|
|
"Host": HOST
|
|
}
|
|
|
|
resp = requests.post(url, headers=headers, json=data, timeout=(5, 300))
|
|
resp.raise_for_status()
|
|
return resp.json()
|
|
|
|
@retry('获取文心一言首页cookie', 0, time_sleep=5)
|
|
def index(self):
|
|
"""获取文心一言首页cookie"""
|
|
url = "https://yiyan.baidu.com/"
|
|
headers = {
|
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36",
|
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
"Accept-Language": "zh-CN,zh;q=0.9",
|
|
}
|
|
resp = requests.get(url, headers=headers, timeout=15, verify=False)
|
|
cookie_str = '; '.join([f'{k}={v}' for k, v in resp.cookies.items()])
|
|
return cookie_str, resp.headers.get("Set-Cookie", "")
|
|
|
|
@retry('文心一言对话接口', 3, time_sleep=5)
|
|
def conversation(self, Token, text, cookie_str):
|
|
"""文心一言对话接口"""
|
|
url = "https://yiyan.baidu.com/eb/chat/conversation/v2"
|
|
headers = {
|
|
"Host": "yiyan.baidu.com",
|
|
"Device-Type": "pc",
|
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36",
|
|
"Accept": "text/event-stream,application/json",
|
|
"Acs-Token": Token,
|
|
"Content-Type": "application/json",
|
|
"Referer": "https://yiyan.baidu.com/chat/",
|
|
"Cookie": cookie_str,
|
|
}
|
|
|
|
post_data = {
|
|
"sign": Token,
|
|
"timestamp": int(time.time() * 1000),
|
|
"deviceType": "pc",
|
|
"text": text,
|
|
"sessionId": "",
|
|
"sessionName": text,
|
|
"type": 10,
|
|
"deepThoughtStatus": 2,
|
|
"model": "EB45T",
|
|
"parentChatId": "0",
|
|
"isNewYiyan": True
|
|
}
|
|
|
|
answer_acc = ""
|
|
refs_raw: List[Dict[str, str]] = []
|
|
|
|
with requests.post(url, headers=headers, json=post_data, stream=True, timeout=300) as resp:
|
|
resp.raise_for_status()
|
|
resp.encoding = "utf-8"
|
|
|
|
for raw in resp.iter_lines(decode_unicode=True):
|
|
if not raw or not raw.startswith("data:"):
|
|
continue
|
|
|
|
data_str = raw[5:].strip()
|
|
if "已达文心大模型使用上限" in data_str or "用户访问被限制" in data_str:
|
|
return "", []
|
|
|
|
try:
|
|
ev = json.loads(data_str)
|
|
except Exception:
|
|
continue
|
|
|
|
if ev.get("searchCitations"):
|
|
for c in ev.get("searchCitations", {}).get("list", []):
|
|
refs_raw.append({
|
|
"title": c.get("title", ""),
|
|
"url": c.get("url", ""),
|
|
"name": c.get("site", ""),
|
|
"body": c.get("wild_abstract", ""),
|
|
"publishTime": c.get("date", "")
|
|
})
|
|
|
|
if isinstance(ev.get("data"), Dict):
|
|
content = ev.get("data").get("content")
|
|
if isinstance(content, str) and content:
|
|
answer_acc += content
|
|
|
|
return answer_acc, refs_raw
|
|
|
|
|
|
class WenxinChatClient:
|
|
"""文心一言聊天客户端"""
|
|
|
|
def __init__(self):
|
|
self.tools = ToolsLoad()
|
|
|
|
def generate_token(self, baiduid):
|
|
"""生成Token"""
|
|
t = int(time.time() * 1000)
|
|
e = {
|
|
"d0": "ka0oitptemc1jfshcdsx",
|
|
"ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
|
|
"baiduid": baiduid,
|
|
"platform": "Win32",
|
|
"clientTs": t,
|
|
"version": "1.4.0.3",
|
|
}
|
|
token_sign = encrypt_payload(e)
|
|
return f"1769396406334_{t}_{token_sign}"
|
|
|
|
def chat(self, platform_id, keyword, brand, task_id):
|
|
"""执行对话任务"""
|
|
# logger.info(f"开始处理任务: {keyword} - {task_id}")
|
|
|
|
cookie_str, baiduid = self.tools.index()
|
|
if not cookie_str:
|
|
logger.error("获取cookie失败")
|
|
return False
|
|
|
|
# 生成Token
|
|
Token = self.generate_token(baiduid)
|
|
|
|
# 执行对话
|
|
answer, refs = self.tools.conversation(Token, keyword, cookie_str)
|
|
|
|
if not answer:
|
|
logger.warning(f"未获取到回答: {keyword}")
|
|
return False
|
|
|
|
# 构造结果
|
|
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
result = {
|
|
"app_id": "aa65700299848d6f21b969dbc9f6cf7c",
|
|
"secret": "5588071d36f0bc61af849c311a03f2c4",
|
|
"platform_id": platform_id,
|
|
"platform_name": "文心一言",
|
|
"prompt": keyword,
|
|
"keyword": brand,
|
|
"answer": answer,
|
|
"search_result": refs,
|
|
"run_status": True,
|
|
"task_id": task_id,
|
|
"rank": 0,
|
|
"start_time": now,
|
|
"end_time": now,
|
|
"screenshot_url": "",
|
|
"words": []
|
|
}
|
|
|
|
return result
|
|
|
|
|
|
class Start:
|
|
"""主启动类"""
|
|
|
|
def __init__(self):
|
|
self.tools = ToolsLoad()
|
|
self.client = WenxinChatClient()
|
|
|
|
@retry('处理文心一言任务', for_work=10, time_sleep=10)
|
|
def process_task(self, task):
|
|
"""处理单个任务"""
|
|
task_id = task.get("id", "")
|
|
keyword = task.get("keyword", "")
|
|
platform_id = task.get("platform_id", "")
|
|
brand = task.get("brand", "")
|
|
|
|
logger.info(f"开始处理任务: {keyword} - {task_id}")
|
|
|
|
# 执行对话获取结果
|
|
result = self.client.chat(
|
|
platform_id=platform_id,
|
|
keyword=keyword,
|
|
brand=brand,
|
|
task_id=task_id
|
|
)
|
|
|
|
if not result:
|
|
logger.warning(f"任务结果为空,重新处理: {keyword}")
|
|
return False
|
|
print(result)
|
|
# 提交结果
|
|
post_resp = self.tools.post_task(result)
|
|
logger.info(f"任务 {task_id} 提交返回: {post_resp}")
|
|
|
|
return result
|
|
|
|
@retry('主运行窗口', for_work=3, time_sleep=5)
|
|
def start_task_msg(self):
|
|
"""获取并处理任务"""
|
|
task_resp = self.tools.get_task()
|
|
|
|
if not task_resp:
|
|
logger.info("get_task 未返回有效数据,等待后重试")
|
|
time.sleep(5)
|
|
return True
|
|
|
|
tasks = task_resp.get("data", False)
|
|
if not tasks:
|
|
logger.info("没有任务数据,等待下一轮")
|
|
time.sleep(30)
|
|
return True
|
|
|
|
logger.info(f"获取到任务: {tasks}")
|
|
|
|
return self.process_task(tasks)
|
|
|
|
def run(self):
|
|
"""主循环"""
|
|
logger.info("文心一言爬虫启动...")
|
|
while True:
|
|
try:
|
|
self.start_task_msg()
|
|
except Exception as e:
|
|
logger.error(f"主循环异常: {e}")
|
|
time.sleep(10)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
from threading import Thread
|
|
|
|
ts = []
|
|
for i in range(5):
|
|
t = Thread(target=Start().run)
|
|
ts.append(t)
|
|
t.start()
|
|
|
|
time.sleep(2)
|
|
for t in ts:
|
|
t.join()
|