7 changed files with 301 additions and 5 deletions
-
8abs_spider.py
-
4config.py
-
1domain/ai_seo.py
-
2spiders/ai_seo/nanometer.py
-
272spiders/ai_seo/wenxiaoyan.py
-
17utils/image_utils.py
-
2utils/session_utils.py
@ -0,0 +1,272 @@ |
|||
# coding=utf-8 |
|||
import asyncio |
|||
import json |
|||
import time |
|||
from functools import partial, wraps |
|||
from json import JSONDecodeError |
|||
|
|||
import requests |
|||
from glom import glom |
|||
from playwright.async_api import Browser |
|||
|
|||
from abs_spider import AbstractAiSeoSpider |
|||
from domain.ai_seo import AiAnswer, AiSearchResult |
|||
from utils import create_logger, css_to_dict |
|||
from utils.image_utils import crop_image_left,remove_bottom_part |
|||
import re |
|||
logger = create_logger(__name__) |
|||
|
|||
class WenxiaoyanSpider(AbstractAiSeoSpider):
    """Spider for the Baidu "wenxiaoyan" (yiyanapp) chat service.

    The answer is fetched through the app's HTTP API (a server-sent-event
    stream) rather than by driving the web UI. When the answer carries web
    references, a share page is created for the conversation and opened in
    the browser so that a screenshot can be taken.

    NOTE(review): the request URLs and the chat payload embed identifiers
    (uid, zid, sid, sessionId, parentId, context, botId) that look like they
    were captured from one app session; they are kept verbatim. Confirm they
    remain valid for the account behind ``self.session_info``.
    """

    # Seconds to wait for either HTTP endpoint before giving up.
    _REQUEST_TIMEOUT = 30

    # Chat completion endpoint (SSE stream).
    # NOTE: the trailing ``jt`` parameter is empty; per the original author's
    # note it should be replaced with the value captured from app traffic.
    _CHAT_URL = (
        "https://yiyanapp.baidu.com/chat/completions"
        "?appmode=nor&appname=newapp&cfrom=1027594s&currentLLMModel=auto"
        "&ds_lv=0&ds_stc=0.0000&from=1027594s&llmMode=auto&matrixstyle=0&network=1_0"
        "&sid=148615_1-999987_2-999992_1-159581_2-999991_1-164345_3-160429_1-999989_1-165121_1-999990_1-162727_1-999999_71-999988_1-163324_4-999994_2-999993_1-160037_5"
        "&st=2&ua=1170_2532_iphone_4.20.0.10_0"
        "&uid=808BA951B6493F89F2D28A62C7438A9332566F6AAOHBGTHSHBE"
        "&ut=iPhone13%2C2_17.6.1"
        "&zid=U8HLkcXuwbe2-gJK1hw1Goz5QuMxZdMQJSqN2Fs15rnzb0Vss7pxP-zc-wBRYuJI8qmsMDWWRwkcd-EivBG0P8Q"
        "&jt="
    )

    # Share-link creation endpoint.
    # NOTE(review): ``ut`` is double-encoded here (%252C) but single-encoded
    # in the chat URL (%2C); kept as captured — confirm which one is intended.
    _SHARE_URL = (
        "https://yiyanapp.baidu.com/api/share/create"
        "?appmode=nor&appname=newapp&cfrom=1027594s&currentLLMModel=auto"
        "&ds_lv=0&ds_stc=0.0000&from=1027594s&llmMode=auto&matrixstyle=0&network=1_0"
        "&sid=148615_1-999987_2-999992_1-159581_2-999991_1-164345_3-160429_1-999989_1-165121_1-999990_1-162727_1-999999_71-999988_1-163324_4-999994_2-999993_1-160037_5"
        "&st=2&ua=1170_2532_iphone_4.20.0.10_0"
        "&uid=808BA951B6493F89F2D28A62C7438A9332566F6AAOHBGTHSHBE"
        "&ut=iPhone13%252C2_17.6.1"
        "&zid=U8HLkcXuwbe2-gJK1hw1Goz5QuMxZdMQJSqN2Fs15rnzb0Vss7pxP-zc-wBRYuJI8qmsMDWWRwkcd-EivBG0P8Q"
    )

    _USER_AGENT = (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 "
        "light/1.0 newapp/4.20.0.10 (Baidu; P2 17.6.1)"
    )

    def __init__(self, browser: Browser, prompt: str, keyword: str):
        super().__init__(browser, prompt, keyword)

    def get_home_url(self) -> str:
        """Return the home URL; empty because the answer comes from the HTTP API."""
        return ''

    def get_platform_id(self) -> int:
        """Return the numeric platform id of this spider."""
        return 8

    def get_platform_name(self) -> str:
        """Return the platform name of this spider."""
        return 'wenxiaoyan'

    @staticmethod
    def _build_chat_payload(question):
        """Build the JSON body of a chat request asking ``question``."""
        return {
            "parentId": "7955151",
            "inputSource": "user_input",
            "querySource": "user_input",
            "context": [
                "7955151",
                "7955150",
                "7955113",
                "7955112",
                "7955075",
                "7955074",
                "7955051",
                "7955050",
                "7955045",
                "7955044",
                "7955015",
                "7955014",
                "7954965",
                "7954964",
                "7954865",
                "7954864",
                "7954863",
                "7954862",
                "7954831",
                "7954830",
            ],
            "contentType": "text",
            "from": "app",
            "plugins": {},
            "models": {
                "dispatch": "",
                "force": "",
                "select": "auto",
                "replyMode": "normal",
            },
            "sessionId": "1159258",
            "chooses": None,
            "promptId": "",
            "botId": "144038",
            "characterId": "1034",
            "environment": {
                "isNewAB": 1,
                "depthAnalysis": 0,
                "hostTag": "main_chat",
                "composition": None,
                "sceneFrom": "",
                "individual": "0",
                "live": {
                    "visOnline": {
                        "per": "4189",
                        "pdt": "10170",
                        "audio_ctrl": "{\"mid\":\"\",\"sampling_rate\":24000}",
                        "spd": "5",
                    },
                },
                "isReplyModeChange": 0,
                "memory": {
                    "noMemoryExtraction": 0,
                },
                "depthMsgId": "",
                "clarify": {
                    "enable": 1,
                    "stage": 1,
                },
                "interveneId": "",
            },
            "content": question,
        }

    @staticmethod
    def _parse_sse_line(raw_line):
        """Decode one SSE line and return its nested ``data.data`` dict.

        Returns ``None`` for lines that are not ``data:`` events, are not
        valid UTF-8 / JSON, or do not have the expected nested structure.
        """
        try:
            text = raw_line.decode("utf-8")
        except UnicodeDecodeError:
            return None
        if not text.startswith("data:"):
            return None
        try:
            event = json.loads(text[5:])
        except JSONDecodeError:
            return None
        if not isinstance(event, dict):
            return None
        outer = event.get("data")
        if not isinstance(outer, dict):
            return None
        inner = outer.get("data")
        return inner if isinstance(inner, dict) else None

    def get_answer(self, question):
        """Ask ``question`` through the chat API and collect the streamed reply.

        This call is blocking (it uses ``requests``); run it in an executor
        when calling from a coroutine.

        :param question: the prompt text sent as the message content.
        :return: a dict with the keys ``messageId``, ``parentId``,
            ``reference`` and ``answer``. ``messageId``/``parentId`` are only
            filled from events that carry a ``reference``. An empty dict is
            returned when the HTTP request fails.
        """
        # NOTE: captured traffic also carried "Acs-Token" and "X-Bd-Alive"
        # headers; they are intentionally not sent (they were commented out).
        headers = {
            "Host": "yiyanapp.baidu.com",
            "Accept": "text/event-stream",
            "Sec-Fetch-Site": "same-origin",
            "Accept-Language": "zh-CN,zh-Hans;q=0.9",
            "Sec-Fetch-Mode": "cors",
            "Accept-Encoding": "gzip, deflate, br",
            "Origin": "https://yiyanapp.baidu.com",
            "User-Agent": self._USER_AGENT,
            "Referer": "https://yiyanapp.baidu.com/talk/chat",
            "Connection": "keep-alive",
            "Content-Type": "application/json",
            "Cookie": self.session_info.get("cookie"),
        }
        payload = self._build_chat_payload(question)

        answer_parts = []
        reference = None
        message_id = ''
        parent_id = ''
        try:
            # The context manager releases the streamed connection even if
            # parsing stops early.
            with requests.post(
                self._CHAT_URL,
                headers=headers,
                json=payload,
                stream=True,
                timeout=self._REQUEST_TIMEOUT,
            ) as resp:
                resp.raise_for_status()
                for raw_line in resp.iter_lines():
                    if not raw_line:
                        continue
                    chunk = self._parse_sse_line(raw_line)
                    if chunk is None:
                        continue
                    logger.debug(f"{chunk}")
                    if chunk.get("reference"):
                        reference = chunk.get("reference")
                        message_id = chunk.get("messageId")
                        parent_id = chunk.get("parentId")
                    content = chunk.get("content", "")
                    if isinstance(content, str):
                        answer_parts.append(content)
        except requests.RequestException as e:
            logger.error(f"❌ 请求失败:{e}")
            return {}

        answer = "".join(answer_parts)
        logger.debug(f"✅ 回答:{answer}...\n")
        return {
            'messageId': message_id,
            'parentId': parent_id,
            'reference': reference,
            'answer': answer,
        }

    def get_url(self, idList):
        """Create a share page for the given messages and return its URL.

        This call is blocking (it uses ``requests``); run it in an executor
        when calling from a coroutine.

        :param idList: comma-separated message ids, e.g. ``"7924487,7924488"``.
        :return: the share URL, or ``None`` when the response has no
            ``data.url`` field.
        :raises requests.RequestException: when the HTTP request fails or
            times out.
        :raises ValueError: when the response body is not valid JSON.
        """
        # NOTE: captured traffic also carried an "Acs-Token" header; it is
        # intentionally not sent (it was commented out).
        headers = {
            "Host": "yiyanapp.baidu.com",
            "Accept": "application/json, text/plain, */*",
            "Sec-Fetch-Site": "same-origin",
            "Accept-Language": "zh-CN,zh-Hans;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Sec-Fetch-Mode": "cors",
            "Content-Type": "application/json",
            "Origin": "https://yiyanapp.baidu.com",
            "User-Agent": self._USER_AGENT,
            "Referer": "https://yiyanapp.baidu.com/talk/chat",
            "Connection": "keep-alive",
            "Sec-Fetch-Dest": "empty",
            "Cookie": self.session_info.get("cookie"),
        }
        payload = {
            "idList": idList,
            "watermark": "qYbvOVmx",
            "vp": 0,
        }
        resp = requests.post(
            self._SHARE_URL,
            headers=headers,
            json=payload,
            timeout=self._REQUEST_TIMEOUT,
        )
        body = resp.json()  # parse once instead of once per access
        logger.debug(f"{body}")

        data = body.get("data") if isinstance(body, dict) else None
        share_url = data.get("url") if isinstance(data, dict) else None
        logger.debug(f"返回内容: {share_url}")
        return share_url

    async def _capture_share_page(self, share_url):
        """Open ``share_url`` in the browser page, screenshot it and return the file path."""
        await self.browser_page.goto(share_url, timeout=600000)
        await asyncio.sleep(3)

        # NOTE(review): the purpose of this click is not visible from here
        # (the original variable was named ``chat_input_element``) — confirm.
        click_target = self.browser_page.locator('//*[@id="35"]/div[1]/div/div/div[2]')
        await click_target.click()

        # Locate the answer element and grow the viewport so that the whole
        # answer fits into a single screenshot.
        answer_element = self.browser_page.locator('//*[@id="app"]/div/div[1]').nth(-1)
        box = await answer_element.bounding_box()
        if box:  # bounding_box() yields None when the element is not visible
            await self.browser_page.set_viewport_size({
                'width': 1920,
                'height': int(box['height']) + 500,
            })
        else:
            logger.warning("answer element has no bounding box; keeping current viewport")

        screenshot_path = self._get_screenshot_path()
        await self.browser_page.screenshot(path=screenshot_path)
        # Crop the image; presumably trims 82px of page footer — TODO confirm.
        remove_bottom_part(screenshot_path, 82)
        return screenshot_path

    @staticmethod
    def _build_search_results(reference, answer_text):
        """Convert the ``searchInfo`` entries of ``reference`` into AiSearchResult objects.

        An entry is flagged as referenced when its 1-based position appears
        as ``citation:<n>`` in ``answer_text``. Entries without a title or
        URL are dropped.
        """
        cited = set(re.findall(r'citation:(\d+)', answer_text))
        results = []
        for position, item in enumerate(reference.get("searchInfo") or [], start=1):
            url = item.get('url', '')
            title = item.get('title', '')
            body = item.get('snippet', '')
            publish_time = item.get('published_at', '')
            host_name = item.get('site_name', '未知')
            is_referenced = "1" if str(position) in cited else "0"
            ai_result = AiSearchResult(
                url=url,
                title=title,
                body=body,
                publish_time=publish_time,
                host_name=host_name,
                is_referenced=is_referenced,
            )
            if ai_result.title and ai_result.url:
                results.append(ai_result)
            logger.debug(f"ai参考资料: [{host_name}]{title}({url})")
        return results

    async def _do_spider(self,) -> AiAnswer:
        """Fetch the answer for ``self.prompt`` and fill ``self.ai_answer``.

        The screenshot and the search results are only produced when the
        answer carries references, because the share page is built from the
        message ids delivered together with them.

        :return: the populated ``self.ai_answer``.
        :raises RuntimeError: when the answer request fails.
        """
        self._init_data()
        self.search_result_count = 0
        logger.debug(f"self.prompt {self.prompt}")

        # The HTTP helpers are blocking; run them in the default executor so
        # the event loop stays responsive.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, self.get_answer, self.prompt)
        if not result:
            raise RuntimeError("wenxiaoyan answer request failed")

        answer_text = result.get("answer") or ""
        reference = result.get("reference") or {}

        share_url = None
        if reference and result.get("parentId") and result.get("messageId"):
            id_list = f"{result.get('parentId')},{result.get('messageId')}"
            share_url = await loop.run_in_executor(None, self.get_url, id_list)

        if share_url:
            self.ai_answer.screenshot_file = await self._capture_share_page(share_url)
        else:
            logger.warning("no share url available; skipping screenshot")

        self.ai_answer.search_result = self._build_search_results(reference, answer_text)
        self.search_result_count = len(self.ai_answer.search_result)
        self.ai_answer.answer = answer_text
        return self.ai_answer
|||
|
|||
|
|||
|
|||
|
|||
Write
Preview
Loading…
Cancel
Save
Reference in new issue