2 Commits
ff97264f85
...
09e874bc24
| Author | SHA1 | Message | Date |
|---|---|---|---|
|
|
09e874bc24 |
Merge remote-tracking branch 'origin/master'
|
4 months ago |
|
|
9e995d0fcb |
修改秘塔开启深度思考,新增文小言采集
|
4 months ago |
7 changed files with 301 additions and 5 deletions
-
6abs_spider.py
-
4config.py
-
1domain/ai_seo.py
-
2spiders/ai_seo/nanometer.py
-
272spiders/ai_seo/wenxiaoyan.py
-
17utils/image_utils.py
-
2utils/session_utils.py
@ -0,0 +1,272 @@ |
|||||
|
# coding=utf-8 |
||||
|
import asyncio |
||||
|
import json |
||||
|
import time |
||||
|
from functools import partial, wraps |
||||
|
from json import JSONDecodeError |
||||
|
|
||||
|
import requests |
||||
|
from glom import glom |
||||
|
from playwright.async_api import Browser |
||||
|
|
||||
|
from abs_spider import AbstractAiSeoSpider |
||||
|
from domain.ai_seo import AiAnswer, AiSearchResult |
||||
|
from utils import create_logger, css_to_dict |
||||
|
from utils.image_utils import crop_image_left,remove_bottom_part |
||||
|
import re |
||||
|
logger = create_logger(__name__) |
||||
|
|
||||
|
class WenxiaoyanSpider(AbstractAiSeoSpider):
    """Spider for the Wenxiaoyan (Baidu Yiyan mobile app) chat service.

    The answer text and its search references are fetched directly from the
    app's HTTP API with ``requests``.  When the answer carries references, a
    share page is created for the conversation and opened in the Playwright
    page so that a screenshot of the rendered answer can be taken.
    """

    _API_ORIGIN = "https://yiyanapp.baidu.com"

    # Query-string parameters shared by both endpoints.  They come from a
    # captured app request; NOTE(review): sid/uid look device specific -
    # confirm they stay valid for the account whose cookie is used.
    _COMMON_QUERY = (
        "appmode=nor&appname=newapp&cfrom=1027594s&currentLLMModel=auto"
        "&ds_lv=0&ds_stc=0.0000&from=1027594s&llmMode=auto&matrixstyle=0"
        "&network=1_0"
        "&sid=148615_1-999987_2-999992_1-159581_2-999991_1-164345_3-160429_1"
        "-999989_1-165121_1-999990_1-162727_1-999999_71-999988_1-163324_4"
        "-999994_2-999993_1-160037_5"
        "&st=2&ua=1170_2532_iphone_4.20.0.10_0"
        "&uid=808BA951B6493F89F2D28A62C7438A9332566F6AAOHBGTHSHBE"
    )
    _ZID = (
        "U8HLkcXuwbe2-gJK1hw1Goz5QuMxZdMQJSqN2Fs15rnzb0Vss7pxP"
        "-zc-wBRYuJI8qmsMDWWRwkcd-EivBG0P8Q"
    )

    # The trailing ``jt`` parameter is empty; replace it with the value from
    # your own packet capture if the server starts requiring it.
    _CHAT_URL = (
        f"{_API_ORIGIN}/chat/completions?{_COMMON_QUERY}"
        f"&ut=iPhone13%2C2_17.6.1&zid={_ZID}&jt="
    )
    # NOTE(review): ``ut`` is double-encoded (%252C) here, unlike in the chat
    # URL.  Kept exactly as it was - confirm against a fresh capture.
    _SHARE_URL = (
        f"{_API_ORIGIN}/api/share/create?{_COMMON_QUERY}"
        f"&ut=iPhone13%252C2_17.6.1&zid={_ZID}"
    )

    _USER_AGENT = (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 "
        "light/1.0 newapp/4.20.0.10 (Baidu; P2 17.6.1)"
    )

    _REQUEST_TIMEOUT = 30  # seconds, applied to both HTTP calls
    _CITATION_PATTERN = re.compile(r'citation:(\d+)')

    def __init__(self, browser: Browser, prompt: str, keyword: str):
        super().__init__(browser, prompt, keyword)

    def get_home_url(self) -> str:
        """Return the home URL; empty because the answer is fetched via the API."""
        return ''

    def get_platform_id(self) -> int:
        """Return the numeric platform id used for this spider."""
        return 8

    def get_platform_name(self) -> str:
        """Return the platform name used for this spider."""
        return 'wenxiaoyan'

    def _build_headers(self, accept: str) -> dict:
        """Build the request headers shared by the chat and share endpoints.

        The ``Acs-Token`` and ``X-Bd-Alive`` headers are not sent; add them
        from a fresh packet capture if the server starts rejecting requests.

        :param accept: value for the ``Accept`` header.
        :return: a new header dict including the session cookie.
        """
        return {
            "Host": "yiyanapp.baidu.com",
            "Accept": accept,
            "Sec-Fetch-Site": "same-origin",
            "Accept-Language": "zh-CN,zh-Hans;q=0.9",
            "Sec-Fetch-Mode": "cors",
            "Accept-Encoding": "gzip, deflate, br",
            "Origin": "https://yiyanapp.baidu.com",
            "User-Agent": self._USER_AGENT,
            "Referer": "https://yiyanapp.baidu.com/talk/chat",
            "Connection": "keep-alive",
            "Content-Type": "application/json",
            "Cookie": self.session_info.get("cookie"),
        }

    @staticmethod
    def _build_chat_payload(question: str) -> dict:
        """Build the JSON body of a chat-completion request for ``question``.

        NOTE(review): parentId / context / sessionId / botId are hard-coded
        values that look like they were captured from one real conversation -
        confirm they remain valid for other accounts and sessions.
        """
        return {
            "parentId": "7955151",
            "inputSource": "user_input",
            "querySource": "user_input",
            "context": [
                "7955151", "7955150", "7955113", "7955112",
                "7955075", "7955074", "7955051", "7955050",
                "7955045", "7955044", "7955015", "7955014",
                "7954965", "7954964", "7954865", "7954864",
                "7954863", "7954862", "7954831", "7954830",
            ],
            "contentType": "text",
            "from": "app",
            "plugins": {},
            "models": {
                "dispatch": "",
                "force": "",
                "select": "auto",
                "replyMode": "normal",
            },
            "sessionId": "1159258",
            "chooses": None,
            "promptId": "",
            "botId": "144038",
            "characterId": "1034",
            "environment": {
                "isNewAB": 1,
                "depthAnalysis": 0,
                "hostTag": "main_chat",
                "composition": None,
                "sceneFrom": "",
                "individual": "0",
                "live": {
                    "visOnline": {
                        "per": "4189",
                        "pdt": "10170",
                        "audio_ctrl": "{\"mid\":\"\",\"sampling_rate\":24000}",
                        "spd": "5",
                    },
                },
                "isReplyModeChange": 0,
                "memory": {"noMemoryExtraction": 0},
                "depthMsgId": "",
                "clarify": {"enable": 1, "stage": 1},
                "interveneId": "",
            },
            "content": question,
        }

    @staticmethod
    def _event_payload(event) -> dict:
        """Return ``event["data"]["data"]`` as a dict, or ``{}`` if absent."""
        outer = event.get("data") if isinstance(event, dict) else None
        inner = outer.get("data") if isinstance(outer, dict) else None
        return inner if isinstance(inner, dict) else {}

    def get_answer(self, question):
        """Ask ``question`` through the streaming chat API and collect the reply.

        The response is read as a server-sent-event stream; each ``data:``
        line is parsed once and its ``content`` fragment appended to the
        answer.

        :param question: the prompt text to send.
        :return: a dict with keys ``messageId``, ``parentId``, ``reference``
            and ``answer``; an empty dict when the HTTP request fails.
        """
        fragments = []
        reference = None
        message_id = ''
        parent_id = ''
        try:
            # ``with`` guarantees the streamed connection is released.
            with requests.post(
                self._CHAT_URL,
                headers=self._build_headers("text/event-stream"),
                json=self._build_chat_payload(question),
                stream=True,
                timeout=self._REQUEST_TIMEOUT,
            ) as resp:
                resp.raise_for_status()
                for raw_line in resp.iter_lines():
                    if not raw_line:
                        continue
                    try:
                        text = raw_line.decode("utf-8")
                        if not text.startswith("data:"):
                            continue
                        event = json.loads(text[5:])
                    except (UnicodeDecodeError, json.JSONDecodeError):
                        # Best effort: skip lines that are not JSON events.
                        logger.debug(f"skip unparsable stream line: {raw_line!r}")
                        continue
                    logger.debug(event)
                    data = self._event_payload(event)
                    if data.get("reference"):
                        reference = data["reference"]
                    # Keep the last non-empty ids so that a later event
                    # without them does not wipe the values out.
                    message_id = data.get("messageId") or message_id
                    parent_id = data.get("parentId") or parent_id
                    content = data.get("content")
                    if isinstance(content, str):
                        fragments.append(content)
        except requests.RequestException as e:
            logger.error(f"❌ 请求失败:{e}")
            return {}

        answer = "".join(fragments)
        logger.info(f"✅ 回答:{answer}...\n")
        return {
            'messageId': message_id,
            'parentId': parent_id,
            'reference': reference,
            'answer': answer,
        }

    def get_url(self, idList):
        """Create a share page for the given messages and return its URL.

        :param idList: comma separated message ids, e.g. ``"7924487,7924488"``.
        :return: the share URL, or ``None`` when the response carries none.
        :raises requests.RequestException: when the HTTP request fails.
        """
        headers = self._build_headers("application/json, text/plain, */*")
        headers["Sec-Fetch-Dest"] = "empty"
        payload = {
            "idList": idList,
            "watermark": "qYbvOVmx",
            "vp": 0,
        }
        resp = requests.post(
            self._SHARE_URL,
            headers=headers,
            json=payload,
            timeout=self._REQUEST_TIMEOUT,
        )
        resp.raise_for_status()
        body = resp.json()  # parse the body once and reuse it
        logger.debug(body)
        data = body.get("data") if isinstance(body, dict) else None
        share_url = data.get("url") if isinstance(data, dict) else None
        if not share_url:
            logger.warning(f"share url missing in response: {body}")
            return None
        logger.debug(f"返回内容: {share_url}")
        return share_url

    async def _capture_share_page(self, share_url):
        """Open ``share_url`` in the browser page and screenshot the answer.

        :return: the path of the cropped screenshot file.
        """
        page = self.browser_page
        await page.goto(share_url, timeout=600000)
        await asyncio.sleep(3)
        # NOTE(review): the purpose of this click is not visible from this
        # file - presumably it expands the answer on the share page.
        await page.locator('//*[@id="35"]/div[1]/div/div/div[2]').click()
        # Locate the answer element and size the viewport to fit it.
        answer_element = page.locator('//*[@id="app"]/div/div[1]').nth(-1)
        box = await answer_element.bounding_box()
        if box:  # bounding_box() returns None for an invisible element
            await page.set_viewport_size({
                'width': 1920,
                'height': int(box['height']) + 500,
            })
        else:
            logger.warning("answer element has no bounding box; viewport unchanged")
        screenshot_path = self._get_screenshot_path()
        await page.screenshot(path=screenshot_path)
        # Crop the bottom strip of the image.
        remove_bottom_part(screenshot_path, 82)
        return screenshot_path

    def _build_search_results(self, answer_text, reference):
        """Convert the API ``reference`` block into ``AiSearchResult`` items.

        A result is flagged as referenced when its 1-based position appears
        as ``citation:<n>`` in ``answer_text``.
        """
        cited = set(self._CITATION_PATTERN.findall(answer_text))
        search_info = reference.get("searchInfo") if isinstance(reference, dict) else None
        results = []
        for position, item in enumerate(search_info or [], start=1):
            url = item.get('url', '')
            title = item.get('title', '')
            host_name = item.get('site_name', '未知')
            ai_result = AiSearchResult(
                url=url,
                title=title,
                body=item.get('snippet', ''),
                publish_time=item.get('published_at', ''),
                host_name=host_name,
                is_referenced="1" if str(position) in cited else "0",
            )
            # Entries without both a title and a URL are dropped.
            if ai_result.title and ai_result.url:
                results.append(ai_result)
            logger.debug(f"ai参考资料: [{host_name}]{title}({url})")
        return results

    async def _do_spider(self,) -> AiAnswer:
        """Fetch the answer, screenshot its share page and fill ``self.ai_answer``.

        The screenshot step only runs when the answer carries references and
        both message ids are known; otherwise the answer is returned without
        a screenshot.
        """
        self._init_data()
        self.search_result_count = 0
        logger.debug(f"self.prompt {self.prompt}")

        # requests is blocking; run it in a worker thread so the event loop
        # is not stalled while the answer streams in.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, self.get_answer, self.prompt)

        answer_text = result.get("answer") or ""
        reference = result.get("reference")
        parent_id = result.get("parentId")
        message_id = result.get("messageId")

        screenshot_path = None
        if reference and parent_id and message_id:
            share_url = await loop.run_in_executor(
                None, self.get_url, f"{parent_id},{message_id}"
            )
            if share_url:
                screenshot_path = await self._capture_share_page(share_url)

        self.ai_answer.search_result = self._build_search_results(answer_text, reference)
        self.search_result_count = len(self.ai_answer.search_result)
        self.ai_answer.answer = answer_text
        if screenshot_path is not None:
            self.ai_answer.screenshot_file = screenshot_path
        return self.ai_answer
||||
|
|
||||
|
|
||||
|
|
||||
|
|
||||
Write
Preview
Loading…
Cancel
Save
Reference in new issue