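# NOTE: the lines below are residue from the repository web page this file was copied from; they are not part of the module.
# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
#
# 210 lines
# 12 KiB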

# coding=utf-8
import asyncio
import json
import time
from functools import partial, wraps
from json import JSONDecodeError
import requests
from glom import glom
from playwright.async_api import Browser
from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger, css_to_dict
from utils.image_utils import crop_image_left,remove_bottom_part
import re
logger = create_logger(__name__)
class WenxiaoyanSpider(AbstractAiSeoSpider):
    """Spider for the 'wenxiaoyan' platform (Baidu Yiyan app endpoints).

    The answer text is fetched from the app's streaming HTTP endpoint with
    ``requests``. When the answer carries references, a share page is created
    for the conversation, opened in the Playwright page and screenshotted, and
    the references are converted into ``AiSearchResult`` entries.
    """

    def __init__(self, browser: Browser, prompt: str, keyword: str):
        super().__init__(browser, prompt, keyword)

    def get_home_url(self) -> str:
        """Return the home URL; empty because this spider does not open one."""
        return ''

    def get_platform_id(self) -> int:
        """Return the numeric id of this platform."""
        return 8

    def get_platform_name(self) -> str:
        """Return the short name of this platform."""
        return 'wenxiaoyan'

    def get_answer(self, question: str) -> dict:
        """Send *question* to the chat endpoint and collect the streamed reply.

        The endpoint answers with a server-sent-event stream; every ``data:``
        line is a JSON document whose ``data.data`` object may carry a
        ``content`` fragment and, at some point, a ``reference`` object.

        Args:
            question: The text to ask.

        Returns:
            A dict with the keys ``messageId``, ``parentId``, ``reference``
            and ``answer``. An empty dict is returned when the request fails.
        """
        url = "https://yiyanapp.baidu.com/chat/completions?appmode=nor&appname=newapp&cfrom=1027594s&currentLLMModel=auto&ds_lv=0&ds_stc=0.0000&from=1027594s&llmMode=auto&matrixstyle=0&network=1_0&sid=148615_1-999987_2-999992_1-999991_1-999999_71-164345_3-999989_1-165121_1-999990_1-999988_1-167537_2-999994_2-999993_1&st=2&ua=1170_2532_iphone_4.20.0.10_0&uid=808BA951B6493F89F2D28A62C7438A9332566F6AAOHBGTHSHBE&ut=iPhone13%2C2_17.6.1&zid=U8HLkcXuwbe2-gJK1hw1Goz5QuMxZdMQJSqN2Fs15rnxRNfQ727QKdNMr45YRMLu2XeEydINMJvMu1WKw5cU5kg&jt=31%24eyJrIj4iOCI0Iix5IkciQEdGRUVITkpITlBMSyJJIkFqIjwiNTw7Ojo9Qz89Q0VARiI%2BIjYzIlEiSlFQT09SWDQyNzs2NyIzIit5IkYiPiI8IjYiTiJHRUlRSEtKTyJLImsiPSI0PD9AelRyb2NPUWNrZFc2RTBHLCw4YjN3eDJhdlpQU3FQUlBQcipfMWhdRURWSEk1NTFjWy5oO0JncUl5cEo2K2xZcmtsK255Z1k7VCxGYjZdOz1hai5iaTkuMFtSb1U2bWhQXFFOUngvN1lqY2Y3WU1QZF4xYGM8MkpBWVJxPHp2Nzs6bmNVUUV3ZjFsa0xaXl9bTnNkOzFuQFRVMm9qb1Vxalc5UFJCRj5gVTowZmwtbkldcXFCN0MvbDlvMlM0Tk9cWkJzPDExVmJXNUl3Wkc5aGhLYUQ6NGxlcDVpSk0uMklPK3opVGN6dzlTNzdlR1pLZTBzbz1XdWRSNUZWdkxIP2tVYFl5RVZ2Vkoqa3hwK180ZitudWdHY3BLTzdPP0xNY3cyc0RUMFxgYEZtLF9ibF1kK2JjZ3N3Uipxa0locj5tX1xyWVB4RXkyQngvLC9RLFE%2FdFZCNlN0dFFZVy0uLTtYWixFLjBNfGNWbCxhR05fb19tbjhrVCloakdHS0thY0JYcj5lZUJyVmgsYC0xR01oPjlafFAwfDo7amZraj88P0FEPz9GSHd4RkZNT0xJT1FNLU5SU2MzIn0%3D"
        # SECURITY NOTE(review): the Acs-Token and Cookie below are live
        # credentials committed to source. They are kept verbatim so behavior
        # does not change, but they should be moved to configuration (get_url
        # already reads its cookie from self.session_info) and rotated.
        headers = {
            "Host": "yiyanapp.baidu.com",
            "Accept": "text/event-stream",
            "Acs-Token": "1753247060952_1753249454932_Sx0qWvhpSEqXzeFVZfWjhajgI9acfaMFCrYjZzGN7ixGcXuCVLvn0QPuy/sSb1aJd5sutmLZqf6lJXHjsj3/Ls0hixZCVNgp5zSKGpxaOxK+DRWKizPtHHB2jDLYRGXsFIdtJfiqs37Ju/oieeKHevteS5Xrs1w+qE5gpc+z3AcQ9HG1pjbysU37KEUKy8n9UTg0xhpfslG0kat8tRM02up4+QsBDHoci9AP+9zYSRXd/Ay1Uh5u0AHeHY2nRNMv6+txzzdfED7IlPbWj1a7xsF3ak7pCqvWdjLelCXgVBWvc/wJwQw4hLr5Sfr9AeJSNN2Q7+GCL3yM4OVypUsEyBziASqAiLTe87j9JPTx/IanvUxg0HgxmVZJarY91Pm0puQhBVUR+DSIuj/zrdpCxdffx1m+0Lq9JskPhYXw4KZn8jwukybqCSGXbAjq7vrePNuuFgVXVStruZ4C0X94NYtv85sMpRdEw1TN8lbI7kt2B/1fdASUF94Nrbg7fx9+Ihvl4S9AoY5t/g1RVZqfhQ==",
            "X-Bd-Alive": "bbd8e3a50a890b80110fcff0fd674290",
            "Accept-Language": "zh-CN,zh-Hans;q=0.9",
            "Sec-Fetch-Mode": "cors",
            "Accept-Encoding": "gzip, deflate, br",
            "Origin": "https://yiyanapp.baidu.com",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 light/1.0 newapp/4.20.0.10 (Baidu; P2 17.6.1)",
            "Referer": "https://yiyanapp.baidu.com/talk/chat",
            "Connection": "keep-alive",
            "Content-Type": "application/json",
            "Sec-Fetch-Dest": "empty",
            "Cookie": "__bid_n=197e279d29a43687ff36d1; ab_sr=1.0.1_YzNmZDY5MmMzZDdiYWM5NmIyYjkzMGE5YWMwYjRiMjZiZjg3OGVmYzIyMWE2ZmI0OGZlYjg3Njc3MDVhMmY5NTM4NDMxZDFlODg1MGI2MDFjYjkwM2Q3NzMxNTRjZjQ1MmEyY2RhNDExNmUxM2Q2MjUxMTRjMjY0NmFkMDg5MDZjNjk3MDkyYmUyNmNiMmUyNTkyYzYxNTE5NzU4ODFjNjI4NGYyYTFkYmJhZjdmYjRiNDY4ZmM3NjE0ODgwNDRi; BDUSS=w3QXkyMjFUTXdtYThpYUdkTHlGWWpiMFU1TGZjZWxaUEV-Mk1hV0hRS0ZBNmhvSVFBQUFBJCQAAAAAAQAAAAEAAABz6-yOAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIV2gGiFdoBoUE; passtheme=light; SP_FW_VER=4.220.1; BAIDUCUID=Ya-oalOgvi0javtz_8BHf0axH8je828Elav7ijiSB8_Qu2ir_8vValtVWu039QRR0zXmA; matrixstyle=0; BAIDUID=976311C6B3BF9DB2D54171E6D3D928DF:FG=1; ST=2",
        }
        payload = {
            "content": question,
            "sessionId": "",
            "contentType": "text",
            "parentId": "",
            "inputSource": "user_input",
            "querySource": "user_input",
            "context": [],
            "plugins": {},
            "chooses": None,
            "models": {
                "dispatch": "",
                "force": "",
                "select": "auto",
                "replyMode": "normal",
            },
            "newBot": True,
            "from": "app",
            "promptId": "",
            "botId": "",
            "characterId": "1034",
            "environment": {
                "clarify": {"enable": 1, "stage": 1},
                "isNewAB": 1,
                "depthAnalysis": 0,
                "depthMsgId": "",
                "live": {
                    "visOnline": {"pdt": "10170", "per": "4189", "audio_ctrl": "{\"mid\":\"\",\"sampling_rate\":24000}", "spd": "5"}
                },
                "interveneId": "",
                "hostTag": "main_chat",
                "individual": "0",
                "composition": None,
                "memory": {"noMemoryExtraction": 0},
                "sceneFrom": "",
                "isReplyModeChange": 0,
            },
        }
        dic = {}
        try:
            # The context manager releases the streamed connection even when
            # reading stops early or an error is raised mid-stream.
            with requests.post(url, headers=headers, json=payload, stream=True, timeout=30) as r:
                r.raise_for_status()
                answer_parts = []
                reference = None
                messageId = ''
                parentId = ''
                for line in r.iter_lines():
                    if not line:
                        continue
                    try:
                        text = line.decode("utf-8")
                    except UnicodeDecodeError:
                        continue
                    if not text.startswith("data:"):
                        continue
                    try:
                        # Parse each event exactly once.
                        event = json.loads(text[5:])
                    except JSONDecodeError:
                        logger.debug(f"skipping non-JSON stream line: {text[:200]}")
                        continue
                    logger.debug(f"stream event: {event}")
                    outer = event.get("data") if isinstance(event, dict) else None
                    inner = outer.get("data") if isinstance(outer, dict) else None
                    if not isinstance(inner, dict):
                        # Events without a data.data object carry nothing we use.
                        continue
                    # NOTE(review): nesting reconstructed from an
                    # indentation-stripped source - confirm that the ids are
                    # meant to be captured only together with `reference`.
                    if inner.get("reference"):
                        reference = inner.get("reference")
                        messageId = inner.get("messageId")
                        parentId = inner.get("parentId")
                    fragment = inner.get("content", "")
                    if isinstance(fragment, str):
                        answer_parts.append(fragment)
                answer = "".join(answer_parts)
            logger.debug(f"✅ 回答:{answer}...")
            dic['messageId'] = messageId
            dic['parentId'] = parentId
            dic['reference'] = reference
            dic['answer'] = answer
        except Exception as e:
            # Best-effort boundary: callers receive an empty dict on failure.
            logger.error(f"❌ 请求失败:{e}")
        return dic

    def get_url(self, idList: str) -> str:
        """Create a share page for the given messages and return its URL.

        Args:
            idList: Comma-separated message ids to include in the share page.

        Returns:
            The share URL found at ``data.url`` in the JSON response.

        Raises:
            requests.RequestException: On network failure, timeout or a
                non-success HTTP status.
            ValueError: If the response does not contain a share URL.
        """
        url = "https://yiyanapp.baidu.com/api/share/create?appmode=nor&appname=newapp&cfrom=1027594s&currentLLMModel=auto&ds_lv=0&ds_stc=0.0000&from=1027594s&llmMode=auto&matrixstyle=0&network=1_0&sid=148615_1-999987_2-999992_1-159581_2-999991_1-164345_3-160429_1-999989_1-165121_1-999990_1-162727_1-999999_71-999988_1-163324_4-999994_2-999993_1-160037_5&st=2&ua=1170_2532_iphone_4.20.0.10_0&uid=808BA951B6493F89F2D28A62C7438A9332566F6AAOHBGTHSHBE&ut=iPhone13%252C2_17.6.1&zid=U8HLkcXuwbe2-gJK1hw1Goz5QuMxZdMQJSqN2Fs15rnzb0Vss7pxP-zc-wBRYuJI8qmsMDWWRwkcd-EivBG0P8Q"
        # NOTE: no Acs-Token header is sent for this endpoint.
        headers = {
            "Host": "yiyanapp.baidu.com",
            "Accept": "application/json, text/plain, */*",
            "Sec-Fetch-Site": "same-origin",
            "Accept-Language": "zh-CN,zh-Hans;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Sec-Fetch-Mode": "cors",
            "Content-Type": "application/json",
            "Origin": "https://yiyanapp.baidu.com",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 light/1.0 newapp/4.20.0.10 (Baidu; P2 17.6.1)",
            "Referer": "https://yiyanapp.baidu.com/talk/chat",
            "Connection": "keep-alive",
            "Sec-Fetch-Dest": "empty",
            "Cookie": self.session_info.get("cookie"),
        }
        payload = {
            "idList": idList,
            "watermark": "xlSKFmsf",
            "vp": 0,
        }
        # A timeout keeps a stalled server from hanging the spider forever.
        resp = requests.post(url, headers=headers, json=payload, timeout=30)
        resp.raise_for_status()
        body = resp.json()
        logger.debug(f"share/create response: {body}")
        data = body.get("data") if isinstance(body, dict) else None
        share_url = data.get("url") if isinstance(data, dict) else None
        if not share_url:
            raise ValueError(f"share/create response contains no url: {body}")
        logger.debug(f"返回内容: {share_url}")
        return share_url

    async def _do_spider(self) -> AiAnswer:
        """Fetch the answer, screenshot its share page and collect references.

        Returns:
            ``self.ai_answer``. It is only filled in when the answer carries
            references; otherwise it is returned as ``_init_data`` left it.
        """
        self._init_data()
        self.search_result_count = 0
        logger.debug(f"self.prompt {self.prompt}")

        # `requests` is blocking; run it in the default executor so the event
        # loop (and any other coroutine sharing it) is not stalled.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, self.get_answer, self.prompt)

        reference = result.get("reference")
        parent_id = result.get("parentId")
        message_id = result.get("messageId")
        # NOTE(review): nesting reconstructed from an indentation-stripped
        # source - everything below only runs when references exist, so an
        # answer without references is not recorded. Confirm this is intended.
        if not isinstance(reference, dict) or not reference:
            return self.ai_answer
        if not parent_id or not message_id:
            logger.warning("answer has references but no message ids; skipping share page")
            return self.ai_answer

        # f-string formatting also works if the ids arrive as numbers.
        id_list = f"{parent_id},{message_id}"
        share_url = await loop.run_in_executor(None, self.get_url, id_list)
        await self.browser_page.goto(share_url, timeout=600000)
        await asyncio.sleep(3)
        chat_input_element = self.browser_page.locator('//*[@id="35"]/div[1]/div/div/div[2]')
        await chat_input_element.click()

        # Locate the answer element.
        answer_element = self.browser_page.locator('//*[@id="app"]/div/div[1]').nth(-1)
        box = await answer_element.bounding_box()
        if box is None:
            # bounding_box() yields None for an element that is not visible.
            logger.warning("answer element has no bounding box; using a minimal viewport")
        content_height = int(box['height']) if box else 0
        # Resize the viewport so the screenshot covers the whole answer.
        await self.browser_page.set_viewport_size({
            'width': 1920,
            'height': content_height + 500,
        })
        # Take the screenshot.
        screenshot_path = self._get_screenshot_path()
        await self.browser_page.screenshot(path=screenshot_path)

        # Collect the numbers that follow "citation:" in the answer text.
        answer_text = result.get("answer") or ""
        cited = set(re.findall(r'citation:(\d+)', answer_text))

        ai_search_result_list = []
        # Citation numbers are 1-based positions in the searchInfo list.
        for position, search_result in enumerate(reference.get("searchInfo") or [], start=1):
            url = search_result.get('url', '')
            title = search_result.get('title', '')
            body = search_result.get('snippet', '')
            publish_time = search_result.get('published_at', '')
            host_name = search_result.get('site_name', '未知')
            is_referenced = "1" if str(position) in cited else "0"
            ai_result = AiSearchResult(url=url, title=title, body=body, publish_time=publish_time, host_name=host_name, is_referenced=is_referenced)
            # Entries without both a title and a URL are dropped.
            if ai_result.title and ai_result.url:
                ai_search_result_list.append(ai_result)
            logger.debug(f"ai参考资料: [{host_name}]{title}({url})")
        self.ai_answer.search_result = ai_search_result_list
        self.search_result_count = len(self.ai_answer.search_result)

        # Crop the bottom of the screenshot.
        remove_bottom_part(screenshot_path, 82)
        self.ai_answer.answer = answer_text
        self.ai_answer.screenshot_file = screenshot_path
        return self.ai_answer