You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
210 lines
12 KiB
210 lines
12 KiB
# coding=utf-8
|
|
import asyncio
|
|
import json
|
|
import time
|
|
from functools import partial, wraps
|
|
from json import JSONDecodeError
|
|
|
|
import requests
|
|
from glom import glom
|
|
from playwright.async_api import Browser
|
|
|
|
from abs_spider import AbstractAiSeoSpider
|
|
from domain.ai_seo import AiAnswer, AiSearchResult
|
|
from utils import create_logger, css_to_dict
|
|
from utils.image_utils import crop_image_left,remove_bottom_part
|
|
import re
|
|
# Module-level logger, named after this module, used by the spider class in this file.
logger = create_logger(__name__)
|
|
|
|
class WenxiaoyanSpider(AbstractAiSeoSpider):
    """Spider for the ``wenxiaoyan`` platform (platform id 8).

    The crawl has three steps:

    1. ``get_answer`` posts the prompt to the yiyanapp.baidu.com chat endpoint
       and collects the streamed answer plus its reference block.
    2. ``get_url`` asks the share endpoint for a public URL of that exchange.
    3. ``_do_spider`` opens the share URL in the browser page, screenshots it
       and converts the reference block into ``AiSearchResult`` entries.

    NOTE(review): the endpoint query strings, ``Acs-Token`` and the cookie used
    by ``get_answer`` are hard-coded values that appear to be captured from an
    iOS app session (see the User-Agent). They are credentials living in
    source control and will presumably expire; consider loading them from
    configuration or ``self.session_info`` as ``get_url`` already does.
    """

    # Chat (SSE) endpoint, including the captured device/query parameters.
    _CHAT_URL = (
        "https://yiyanapp.baidu.com/chat/completions?appmode=nor&appname=newapp&cfrom=1027594s"
        "&currentLLMModel=auto&ds_lv=0&ds_stc=0.0000&from=1027594s&llmMode=auto&matrixstyle=0&network=1_0"
        "&sid=148615_1-999987_2-999992_1-999991_1-999999_71-164345_3-999989_1-165121_1-999990_1-999988_1-167537_2-999994_2-999993_1"
        "&st=2&ua=1170_2532_iphone_4.20.0.10_0&uid=808BA951B6493F89F2D28A62C7438A9332566F6AAOHBGTHSHBE&ut=iPhone13%2C2_17.6.1"
        "&zid=U8HLkcXuwbe2-gJK1hw1Goz5QuMxZdMQJSqN2Fs15rnxRNfQ727QKdNMr45YRMLu2XeEydINMJvMu1WKw5cU5kg"
        "&jt=31%24eyJrIj4iOCI0Iix5IkciQEdGRUVITkpITlBMSyJJIkFqIjwiNTw7Ojo9Qz89Q0VARiI%2BIjYzIlEiSlFQT09SWDQyNzs2NyIzIit5IkYiPiI8IjYiTiJHRUlRSEtKTyJLImsiPSI0PD9AelRyb2NPUWNrZFc2RTBHLCw4YjN3eDJhdlpQU3FQUlBQcipfMWhdRURWSEk1NTFjWy5oO0JncUl5cEo2K2xZcmtsK255Z1k7VCxGYjZdOz1hai5iaTkuMFtSb1U2bWhQXFFOUngvN1lqY2Y3WU1QZF4xYGM8MkpBWVJxPHp2Nzs6bmNVUUV3ZjFsa0xaXl9bTnNkOzFuQFRVMm9qb1Vxalc5UFJCRj5gVTowZmwtbkldcXFCN0MvbDlvMlM0Tk9cWkJzPDExVmJXNUl3Wkc5aGhLYUQ6NGxlcDVpSk0uMklPK3opVGN6dzlTNzdlR1pLZTBzbz1XdWRSNUZWdkxIP2tVYFl5RVZ2Vkoqa3hwK180ZitudWdHY3BLTzdPP0xNY3cyc0RUMFxgYEZtLF9ibF1kK2JjZ3N3Uipxa0locj5tX1xyWVB4RXkyQngvLC9RLFE%2FdFZCNlN0dFFZVy0uLTtYWixFLjBNfGNWbCxhR05fb19tbjhrVCloakdHS0thY0JYcj5lZUJyVmgsYC0xR01oPjlafFAwfDo7amZraj88P0FEPz9GSHd4RkZNT0xJT1FNLU5SU2MzIn0%3D"
    )

    # Share-link creation endpoint, including the captured device/query parameters.
    _SHARE_URL = (
        "https://yiyanapp.baidu.com/api/share/create?appmode=nor&appname=newapp&cfrom=1027594s"
        "&currentLLMModel=auto&ds_lv=0&ds_stc=0.0000&from=1027594s&llmMode=auto&matrixstyle=0&network=1_0"
        "&sid=148615_1-999987_2-999992_1-159581_2-999991_1-164345_3-160429_1-999989_1-165121_1-999990_1-162727_1-999999_71-999988_1-163324_4-999994_2-999993_1-160037_5"
        "&st=2&ua=1170_2532_iphone_4.20.0.10_0&uid=808BA951B6493F89F2D28A62C7438A9332566F6AAOHBGTHSHBE&ut=iPhone13%252C2_17.6.1"
        "&zid=U8HLkcXuwbe2-gJK1hw1Goz5QuMxZdMQJSqN2Fs15rnzb0Vss7pxP-zc-wBRYuJI8qmsMDWWRwkcd-EivBG0P8Q"
    )

    # Timeout (seconds) applied to both HTTP calls.
    _HTTP_TIMEOUT = 30

    def __init__(self, browser: Browser, prompt: str, keyword: str):
        """Forward the browser, prompt and keyword to the base spider."""
        super().__init__(browser, prompt, keyword)

    def get_home_url(self) -> str:
        """Return an empty string; the crawl navigates to a generated share URL instead."""
        return ''

    def get_platform_id(self) -> int:
        """Return the numeric platform id of this spider (8)."""
        return 8

    def get_platform_name(self) -> str:
        """Return the platform name of this spider (``'wenxiaoyan'``)."""
        return 'wenxiaoyan'

    @staticmethod
    def _parse_stream_line(raw_line):
        """Return the inner ``data.data`` dict of one SSE line, or ``None``.

        ``None`` is returned for keep-alive/blank lines, lines that are not
        ``data:`` events, undecodable or non-JSON payloads, and payloads whose
        ``data.data`` member is missing or not an object.
        """
        if not raw_line:
            return None
        try:
            text = raw_line.decode("utf-8")
        except UnicodeDecodeError:
            return None
        if not text.startswith("data:"):
            return None
        try:
            event = json.loads(text[5:])
        except JSONDecodeError:
            return None
        outer = event.get("data") if isinstance(event, dict) else None
        inner = outer.get("data") if isinstance(outer, dict) else None
        return inner if isinstance(inner, dict) else None

    def get_answer(self, question):
        """Ask the chat endpoint *question* and collect the streamed reply.

        Args:
            question: Text sent as the ``content`` of the chat request.

        Returns:
            A dict with the keys ``messageId``, ``parentId``, ``reference``
            and ``answer``. ``reference``/``messageId``/``parentId`` are taken
            from the last stream event that carried a non-empty ``reference``;
            ``answer`` is the concatenation of every event's ``content``.
            An empty dict is returned when the HTTP request fails.
        """
        headers = {
            "Host": "yiyanapp.baidu.com",
            "Accept": "text/event-stream",
            # SECURITY: hard-coded session credentials (Acs-Token / Cookie below).
            "Acs-Token": "1753247060952_1753249454932_Sx0qWvhpSEqXzeFVZfWjhajgI9acfaMFCrYjZzGN7ixGcXuCVLvn0QPuy/sSb1aJd5sutmLZqf6lJXHjsj3/Ls0hixZCVNgp5zSKGpxaOxK+DRWKizPtHHB2jDLYRGXsFIdtJfiqs37Ju/oieeKHevteS5Xrs1w+qE5gpc+z3AcQ9HG1pjbysU37KEUKy8n9UTg0xhpfslG0kat8tRM02up4+QsBDHoci9AP+9zYSRXd/Ay1Uh5u0AHeHY2nRNMv6+txzzdfED7IlPbWj1a7xsF3ak7pCqvWdjLelCXgVBWvc/wJwQw4hLr5Sfr9AeJSNN2Q7+GCL3yM4OVypUsEyBziASqAiLTe87j9JPTx/IanvUxg0HgxmVZJarY91Pm0puQhBVUR+DSIuj/zrdpCxdffx1m+0Lq9JskPhYXw4KZn8jwukybqCSGXbAjq7vrePNuuFgVXVStruZ4C0X94NYtv85sMpRdEw1TN8lbI7kt2B/1fdASUF94Nrbg7fx9+Ihvl4S9AoY5t/g1RVZqfhQ==",
            "X-Bd-Alive": "bbd8e3a50a890b80110fcff0fd674290",
            "Accept-Language": "zh-CN,zh-Hans;q=0.9",
            "Sec-Fetch-Mode": "cors",
            "Accept-Encoding": "gzip, deflate, br",
            "Origin": "https://yiyanapp.baidu.com",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 light/1.0 newapp/4.20.0.10 (Baidu; P2 17.6.1)",
            "Referer": "https://yiyanapp.baidu.com/talk/chat",
            "Connection": "keep-alive",
            "Content-Type": "application/json",
            "Sec-Fetch-Dest": "empty",
            "Cookie": "__bid_n=197e279d29a43687ff36d1; ab_sr=1.0.1_YzNmZDY5MmMzZDdiYWM5NmIyYjkzMGE5YWMwYjRiMjZiZjg3OGVmYzIyMWE2ZmI0OGZlYjg3Njc3MDVhMmY5NTM4NDMxZDFlODg1MGI2MDFjYjkwM2Q3NzMxNTRjZjQ1MmEyY2RhNDExNmUxM2Q2MjUxMTRjMjY0NmFkMDg5MDZjNjk3MDkyYmUyNmNiMmUyNTkyYzYxNTE5NzU4ODFjNjI4NGYyYTFkYmJhZjdmYjRiNDY4ZmM3NjE0ODgwNDRi; BDUSS=w3QXkyMjFUTXdtYThpYUdkTHlGWWpiMFU1TGZjZWxaUEV-Mk1hV0hRS0ZBNmhvSVFBQUFBJCQAAAAAAQAAAAEAAABz6-yOAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIV2gGiFdoBoUE; passtheme=light; SP_FW_VER=4.220.1; BAIDUCUID=Ya-oalOgvi0javtz_8BHf0axH8je828Elav7ijiSB8_Qu2ir_8vValtVWu039QRR0zXmA; matrixstyle=0; BAIDUID=976311C6B3BF9DB2D54171E6D3D928DF:FG=1; ST=2",
        }

        payload = {
            "content": question,
            "sessionId": "",
            "contentType": "text",
            "parentId": "",
            "inputSource": "user_input",
            "querySource": "user_input",
            "context": [],
            "plugins": {},
            "chooses": None,
            "models": {
                "dispatch": "",
                "force": "",
                "select": "auto",
                "replyMode": "normal",
            },
            "newBot": True,
            "from": "app",
            "promptId": "",
            "botId": "",
            "characterId": "1034",
            "environment": {
                "clarify": {"enable": 1, "stage": 1},
                "isNewAB": 1,
                "depthAnalysis": 0,
                "depthMsgId": "",
                "live": {
                    "visOnline": {"pdt": "10170", "per": "4189", "audio_ctrl": "{\"mid\":\"\",\"sampling_rate\":24000}", "spd": "5"},
                },
                "interveneId": "",
                "hostTag": "main_chat",
                "individual": "0",
                "composition": None,
                "memory": {"noMemoryExtraction": 0},
                "sceneFrom": "",
                "isReplyModeChange": 0,
            },
        }

        answer_parts = []
        reference = None
        message_id = ''
        parent_id = ''
        try:
            # The context manager releases the streamed connection even if
            # iteration stops early.
            with requests.post(self._CHAT_URL, headers=headers, json=payload,
                               stream=True, timeout=self._HTTP_TIMEOUT) as response:
                response.raise_for_status()
                for raw_line in response.iter_lines():
                    event = self._parse_stream_line(raw_line)
                    if event is None:
                        continue
                    logger.debug(f"wenxiaoyan stream event: {event}")
                    if event.get("reference"):
                        # The ids are only recorded from events that carry the
                        # reference block; they are later used to build the share link.
                        reference = event["reference"]
                        message_id = event.get("messageId")
                        parent_id = event.get("parentId")
                    content = event.get("content")
                    if isinstance(content, str):
                        answer_parts.append(content)
        except requests.RequestException as e:
            # Best effort: a failed request yields an empty result, not an exception.
            logger.error(f"❌ 请求失败:{e}")
            return {}

        answer = "".join(answer_parts)
        logger.info(f"✅ 回答:{answer}...\n")
        return {
            'messageId': message_id,
            'parentId': parent_id,
            'reference': reference,
            'answer': answer,
        }

    def get_url(self, idList):
        """Create a share link for the messages in *idList* and return its URL.

        Args:
            idList: Comma-separated message ids, e.g. ``"7691531,7691532"``.

        Returns:
            The value of ``data.url`` in the JSON response, or ``None`` when
            the response does not contain one.

        Raises:
            requests.RequestException: if the HTTP request itself fails
                (including the timeout) or the body is not valid JSON.
        """
        headers = {
            "Host": "yiyanapp.baidu.com",
            "Accept": "application/json, text/plain, */*",
            "Sec-Fetch-Site": "same-origin",
            # No Acs-Token header is sent to this endpoint.
            "Accept-Language": "zh-CN,zh-Hans;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Sec-Fetch-Mode": "cors",
            "Content-Type": "application/json",
            "Origin": "https://yiyanapp.baidu.com",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 light/1.0 newapp/4.20.0.10 (Baidu; P2 17.6.1)",
            "Referer": "https://yiyanapp.baidu.com/talk/chat",
            "Connection": "keep-alive",
            "Sec-Fetch-Dest": "empty",
            "Cookie": self.session_info.get("cookie"),
        }
        payload = {
            "idList": idList,
            "watermark": "xlSKFmsf",
            "vp": 0,
        }

        resp = requests.post(self._SHARE_URL, headers=headers, json=payload,
                             timeout=self._HTTP_TIMEOUT)
        body = resp.json()  # parse once; reused for logging and extraction
        logger.debug(f"wenxiaoyan share response: {body}")

        data = body.get("data") if isinstance(body, dict) else None
        share_url = data.get("url") if isinstance(data, dict) else None
        if not share_url:
            logger.warning(f"wenxiaoyan share response has no url: {body}")
            return None
        logger.debug(f"返回内容: {share_url}")
        return share_url

    def _build_search_results(self, reference, answer_text):
        """Convert the reference block into a list of ``AiSearchResult``.

        An entry is flagged ``is_referenced="1"`` when its 1-based position in
        ``reference["searchInfo"]`` appears as ``citation:<n>`` in *answer_text*.
        Entries without both a title and a URL are dropped.
        """
        search_info = reference.get("searchInfo") if isinstance(reference, dict) else None
        if not search_info:
            return []

        # Match the numbers that follow "citation:" in the answer text.
        cited_positions = set(re.findall(r'citation:(\d+)', answer_text))

        results = []
        for position, item in enumerate(search_info, start=1):
            if not isinstance(item, dict):
                continue
            url = item.get('url', '')
            title = item.get('title', '')
            host_name = item.get('site_name', '未知')
            ai_result = AiSearchResult(
                url=url,
                title=title,
                body=item.get('snippet', ''),
                publish_time=item.get('published_at', ''),
                host_name=host_name,
                is_referenced="1" if str(position) in cited_positions else "0",
            )
            if ai_result.title and ai_result.url:
                results.append(ai_result)
                logger.debug(f"ai参考资料: [{host_name}]{title}({url})")
        return results

    async def _capture_share_page(self, share_url):
        """Open *share_url* in the browser page, screenshot it and return the file path."""
        await self.browser_page.goto(share_url, timeout=600000)
        await asyncio.sleep(3)

        # NOTE(review): the purpose of this click is not evident from the
        # selector — presumably it expands part of the shared answer; confirm.
        chat_input_element = self.browser_page.locator('//*[@id="35"]/div[1]/div/div/div[2]')
        await chat_input_element.click()

        # Locate the answer element and grow the viewport so it fits in one shot.
        answer = self.browser_page.locator('//*[@id="app"]/div/div[1]').nth(-1)
        box = await answer.bounding_box()
        if box:
            # bounding_box() yields None for a non-visible element; in that
            # case the current viewport is kept.
            await self.browser_page.set_viewport_size({
                'width': 1920,
                'height': int(box['height']) + 500,
            })

        screenshot_path = self._get_screenshot_path()
        await self.browser_page.screenshot(path=screenshot_path)
        # Crop the bottom of the image (82, presumably pixels — confirm in image_utils).
        remove_bottom_part(screenshot_path, 82)
        return screenshot_path

    async def _do_spider(self,) -> AiAnswer:
        """Run the crawl for ``self.prompt`` and return the populated ``self.ai_answer``.

        The answer text and search results are always filled from the chat
        response. A screenshot is only taken when the response carried a
        reference block with both message ids and a share URL could be
        created; otherwise ``screenshot_file`` is left untouched.
        NOTE(review): confirm that callers accept an answer without a screenshot.
        """
        self._init_data()
        self.search_result_count = 0
        logger.debug(f"self.prompt {self.prompt}")

        # get_answer/get_url use blocking `requests`; run them in the default
        # executor so the event loop is not stalled while they wait on the network.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, self.get_answer, self.prompt)

        answer_text = result.get("answer") or ""
        reference = result.get("reference")

        self.ai_answer.answer = answer_text
        self.ai_answer.search_result = self._build_search_results(reference, answer_text)
        self.search_result_count = len(self.ai_answer.search_result)

        parent_id = result.get("parentId")
        message_id = result.get("messageId")
        share_url = None
        if reference and parent_id and message_id:
            id_list = f"{parent_id},{message_id}"
            share_url = await loop.run_in_executor(None, self.get_url, id_list)

        if share_url:
            self.ai_answer.screenshot_file = await self._capture_share_page(share_url)

        return self.ai_answer
|
|
|
|
|
|
|
|
|