You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

272 lines
12 KiB

# coding=utf-8
import asyncio
import json
import time
from functools import partial, wraps
from json import JSONDecodeError
import requests
from glom import glom
from playwright.async_api import Browser
from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger, css_to_dict
from utils.image_utils import crop_image_left,remove_bottom_part
import re
logger = create_logger(__name__)
class WenxiaoyanSpider(AbstractAiSeoSpider):
    """Spider for the "wenxiaoyan" platform (platform id 8).

    The answer text is fetched from the ``yiyanapp.baidu.com`` chat endpoint
    with ``requests`` (server-sent-event stream).  When the answer carries a
    reference block, a share page is created for the conversation, opened in
    the Playwright page and screenshotted, and the referenced search results
    are converted into ``AiSearchResult`` objects.

    NOTE(review): the request URLs, the payload ids (``parentId``,
    ``context``, ``sessionId``, ``botId`` ...) and the device parameters are
    hard-coded values taken from a packet capture; they presumably belong to
    one specific account/session -- confirm before reusing.
    """

    def __init__(self, browser: Browser, prompt: str, keyword: str):
        super().__init__(browser, prompt, keyword)

    def get_home_url(self) -> str:
        """Return the home URL; empty because this spider uses the HTTP API."""
        return ''

    def get_platform_id(self) -> int:
        """Return the numeric platform id of this spider."""
        return 8

    def get_platform_name(self) -> str:
        """Return the platform name of this spider."""
        return 'wenxiaoyan'

    @staticmethod
    def _parse_sse_line(raw_line):
        """Return the ``data.data`` dict of one SSE line, or None.

        Lines that are not ``data:`` events, are not valid UTF-8/JSON, or do
        not contain a nested ``data.data`` object yield None so the caller
        can simply skip them.
        """
        try:
            text = raw_line.decode("utf-8")
        except UnicodeDecodeError:
            return None
        if not text.startswith("data:"):
            return None
        try:
            event = json.loads(text[5:])
        except json.JSONDecodeError:
            return None
        logger.debug(f"{event}")
        outer = event.get("data") if isinstance(event, dict) else None
        inner = outer.get("data") if isinstance(outer, dict) else None
        return inner if isinstance(inner, dict) else None

    def get_answer(self, question):
        """Send ``question`` to the chat endpoint and collect the streamed answer.

        Args:
            question: Text placed in the ``content`` field of the payload.

        Returns:
            A dict with the keys ``messageId``, ``parentId``, ``reference``
            and ``answer``; an empty dict when the HTTP request fails.
        """
        # NOTE: replace the trailing "jt" parameter with the value from your
        # own packet capture.
        url = "https://yiyanapp.baidu.com/chat/completions?appmode=nor&appname=newapp&cfrom=1027594s&currentLLMModel=auto&ds_lv=0&ds_stc=0.0000&from=1027594s&llmMode=auto&matrixstyle=0&network=1_0&sid=148615_1-999987_2-999992_1-159581_2-999991_1-164345_3-160429_1-999989_1-165121_1-999990_1-162727_1-999999_71-999988_1-163324_4-999994_2-999993_1-160037_5&st=2&ua=1170_2532_iphone_4.20.0.10_0&uid=808BA951B6493F89F2D28A62C7438A9332566F6AAOHBGTHSHBE&ut=iPhone13%2C2_17.6.1&zid=U8HLkcXuwbe2-gJK1hw1Goz5QuMxZdMQJSqN2Fs15rnzb0Vss7pxP-zc-wBRYuJI8qmsMDWWRwkcd-EivBG0P8Q&jt="
        # The captured "Acs-Token" and "X-Bd-Alive" headers were disabled in
        # the original code and are intentionally not sent.
        headers = {
            "Host": "yiyanapp.baidu.com",
            "Accept": "text/event-stream",
            "Sec-Fetch-Site": "same-origin",
            "Accept-Language": "zh-CN,zh-Hans;q=0.9",
            "Sec-Fetch-Mode": "cors",
            "Accept-Encoding": "gzip, deflate, br",
            "Origin": "https://yiyanapp.baidu.com",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 light/1.0 newapp/4.20.0.10 (Baidu; P2 17.6.1)",
            "Referer": "https://yiyanapp.baidu.com/talk/chat",
            "Connection": "keep-alive",
            "Content-Type": "application/json",
            "Cookie": self.session_info.get("cookie"),
        }
        payload = {
            "parentId": "7955151",
            "inputSource": "user_input",
            "querySource": "user_input",
            "context": [
                "7955151",
                "7955150",
                "7955113",
                "7955112",
                "7955075",
                "7955074",
                "7955051",
                "7955050",
                "7955045",
                "7955044",
                "7955015",
                "7955014",
                "7954965",
                "7954964",
                "7954865",
                "7954864",
                "7954863",
                "7954862",
                "7954831",
                "7954830",
            ],
            "contentType": "text",
            "from": "app",
            "plugins": {},
            "models": {
                "dispatch": "",
                "force": "",
                "select": "auto",
                "replyMode": "normal",
            },
            "sessionId": "1159258",
            "chooses": None,
            "promptId": "",
            "botId": "144038",
            "characterId": "1034",
            "environment": {
                "isNewAB": 1,
                "depthAnalysis": 0,
                "hostTag": "main_chat",
                "composition": None,
                "sceneFrom": "",
                "individual": "0",
                "live": {
                    "visOnline": {
                        "per": "4189",
                        "pdt": "10170",
                        "audio_ctrl": "{\"mid\":\"\",\"sampling_rate\":24000}",
                        "spd": "5",
                    }
                },
                "isReplyModeChange": 0,
                "memory": {
                    "noMemoryExtraction": 0
                },
                "depthMsgId": "",
                "clarify": {
                    "enable": 1,
                    "stage": 1
                },
                "interveneId": "",
            },
            "content": question,
        }

        answer_parts = []
        reference = None
        messageId = ''
        parentId = ''
        try:
            # The context manager releases the streamed connection even when
            # iteration stops early or raises.
            with requests.post(url, headers=headers, json=payload, stream=True, timeout=30) as r:
                if not r.ok:
                    logger.warning(f"chat request returned HTTP {r.status_code}")
                for line in r.iter_lines():
                    if not line:
                        continue
                    event_data = self._parse_sse_line(line)
                    if event_data is None:
                        continue
                    # NOTE(review): the ids are captured together with the
                    # reference block; the original nesting was lost with the
                    # file's indentation -- confirm.
                    if event_data.get("reference"):
                        reference = event_data.get("reference")
                        messageId = event_data.get("messageId")
                        parentId = event_data.get("parentId")
                    content = event_data.get("content", "")
                    if isinstance(content, str):
                        answer_parts.append(content)
        except requests.RequestException as e:
            logger.warning(f"❌ 请求失败:{e}")
            return {}

        answer = "".join(answer_parts)
        logger.info(f"✅ 回答:{answer}...\n")
        return {
            'messageId': messageId,
            'parentId': parentId,
            'reference': reference,
            'answer': answer,
        }

    def get_url(self, idList):
        """Create a share page for the given messages and return its URL.

        Args:
            idList: Comma-separated message ids sent as the ``idList`` field.

        Returns:
            The ``data.url`` value of the JSON response, or None when the
            response does not contain one.

        Raises:
            requests.RequestException: If the HTTP request fails.
            ValueError: If the response body is not valid JSON.
        """
        # NOTE(review): "ut" is encoded as %252C here but as %2C in the chat
        # URL; kept exactly as captured -- confirm which one is intended.
        url = "https://yiyanapp.baidu.com/api/share/create?appmode=nor&appname=newapp&cfrom=1027594s&currentLLMModel=auto&ds_lv=0&ds_stc=0.0000&from=1027594s&llmMode=auto&matrixstyle=0&network=1_0&sid=148615_1-999987_2-999992_1-159581_2-999991_1-164345_3-160429_1-999989_1-165121_1-999990_1-162727_1-999999_71-999988_1-163324_4-999994_2-999993_1-160037_5&st=2&ua=1170_2532_iphone_4.20.0.10_0&uid=808BA951B6493F89F2D28A62C7438A9332566F6AAOHBGTHSHBE&ut=iPhone13%252C2_17.6.1&zid=U8HLkcXuwbe2-gJK1hw1Goz5QuMxZdMQJSqN2Fs15rnzb0Vss7pxP-zc-wBRYuJI8qmsMDWWRwkcd-EivBG0P8Q"
        # The captured "Acs-Token" header was disabled in the original code
        # and is intentionally not sent.
        headers = {
            "Host": "yiyanapp.baidu.com",
            "Accept": "application/json, text/plain, */*",
            "Sec-Fetch-Site": "same-origin",
            "Accept-Language": "zh-CN,zh-Hans;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Sec-Fetch-Mode": "cors",
            "Content-Type": "application/json",
            "Origin": "https://yiyanapp.baidu.com",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 light/1.0 newapp/4.20.0.10 (Baidu; P2 17.6.1)",
            "Referer": "https://yiyanapp.baidu.com/talk/chat",
            "Connection": "keep-alive",
            "Sec-Fetch-Dest": "empty",
            "Cookie": self.session_info.get("cookie"),
        }
        payload = {
            "idList": idList,
            "watermark": "qYbvOVmx",
            "vp": 0,
        }
        # A timeout keeps a stalled share request from hanging forever.
        resp = requests.post(url, headers=headers, json=payload, timeout=30)
        body = resp.json()  # parse once instead of once per access
        logger.debug(f"{body}")
        data = body.get("data") if isinstance(body, dict) else None
        share_url = data.get("url") if isinstance(data, dict) else None
        logger.debug(f"返回内容: {share_url}")
        return share_url

    async def _do_spider(self) -> AiAnswer:
        """Fetch the answer, screenshot its share page and collect references.

        Returns:
            ``self.ai_answer``.  It is filled with the answer text, the
            screenshot path and the search results only when the answer
            carries a reference block and a share page could be created;
            otherwise it is returned as initialised by ``_init_data``.
        """
        self._init_data()
        self.search_result_count = 0
        logger.debug(f"self.prompt {self.prompt}")

        # requests is blocking; run it in the default executor so the event
        # loop is not stalled while the answer streams in.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, self.get_answer, self.prompt)

        reference = result.get("reference")
        parent_id = result.get("parentId")
        message_id = result.get("messageId")
        if not (reference and parent_id and message_id):
            # NOTE(review): without a reference block nothing is recorded,
            # which is the only reading of the original (whose nesting was
            # lost) that does not raise on this path -- confirm whether the
            # bare answer text should be kept instead.
            logger.warning("no reference/message ids in answer; skipping share page")
            return self.ai_answer

        share_url = await loop.run_in_executor(
            None, self.get_url, f"{parent_id},{message_id}"
        )
        if not share_url:
            logger.warning("share page could not be created; skipping screenshot")
            return self.ai_answer

        await self.browser_page.goto(share_url, timeout=600000)
        await asyncio.sleep(3)
        clickable = self.browser_page.locator('//*[@id="35"]/div[1]/div/div/div[2]')
        await clickable.click()

        # Locate the answer element and grow the viewport so that the whole
        # answer fits into a single screenshot.
        answer_element = self.browser_page.locator('//*[@id="app"]/div/div[1]').nth(-1)
        box = await answer_element.bounding_box()
        if box:
            await self.browser_page.set_viewport_size({
                'width': 1920,
                'height': int(box['height']) + 500
            })
        else:
            # bounding_box() yields None for an element that is not visible.
            logger.warning("answer element has no bounding box; keeping viewport size")

        # Take the screenshot.
        screenshot_path = self._get_screenshot_path()
        await self.browser_page.screenshot(path=screenshot_path)

        # Numbers following "citation:" in the answer are compared with the
        # 1-based position of each search result.
        answer_text = result.get("answer") or ""
        cited_numbers = set(re.findall(r'citation:(\d+)', answer_text))

        search_infos = reference.get("searchInfo") if isinstance(reference, dict) else None
        ai_search_result_list = []
        for index, search_result in enumerate(search_infos or []):
            url = search_result.get('url', '')
            title = search_result.get('title', '')
            body = search_result.get('snippet', '')
            publish_time = search_result.get('published_at', '')
            host_name = search_result.get('site_name', '未知')
            is_referenced = "1" if str(index + 1) in cited_numbers else "0"
            ai_result = AiSearchResult(url=url, title=title, body=body, publish_time=publish_time, host_name=host_name, is_referenced=is_referenced)
            # Entries without both a title and a URL are dropped.
            if ai_result.title and ai_result.url:
                ai_search_result_list.append(ai_result)
                logger.debug(f"ai参考资料: [{host_name}]{title}({url})")
        self.ai_answer.search_result = ai_search_result_list
        self.search_result_count = len(self.ai_answer.search_result)

        # Crop the bottom part of the screenshot.
        remove_bottom_part(screenshot_path, 82)
        self.ai_answer.answer = answer_text
        self.ai_answer.screenshot_file = screenshot_path
        return self.ai_answer