2 Commits

  1. 8
      abs_spider.py
  2. 4
      config.py
  3. 1
      domain/ai_seo.py
  4. 2
      spiders/ai_seo/nanometer.py
  5. 272
      spiders/ai_seo/wenxiaoyan.py
  6. 17
      utils/image_utils.py
  7. 2
      utils/session_utils.py

8
abs_spider.py

@ -54,7 +54,8 @@ class AbstractAiSeoSpider(ABC):
4: "kimi", 4: "kimi",
2: "tongyi", 2: "tongyi",
6: "yiyan", 6: "yiyan",
3: "yuanbao"
3: "yuanbao",
8: "wenxiaoyan"
} }
# todo 支持多session管理 # todo 支持多session管理
@ -69,7 +70,10 @@ class AbstractAiSeoSpider(ABC):
async def __init_page(self): async def __init_page(self):
if self.load_session: if self.load_session:
self.session_info = await get_spider_session(self.platform_id) self.session_info = await get_spider_session(self.platform_id)
self.browser_content = await self.browser.new_context(storage_state=self.session_info['session_path'])
if self.platform_id != 8:
self.browser_content = await self.browser.new_context(storage_state=self.session_info['session_path'])
else:
self.browser_content = await self.browser.new_context()
else: else:
self.browser_content = await self.browser.new_context() self.browser_content = await self.browser.new_context()
self.browser_page = await self.browser_content.new_page() self.browser_page = await self.browser_content.new_page()

4
config.py

@ -30,7 +30,7 @@ AI_SEO_JOB_RANGE = {
# aiseo任务是否启用 # aiseo任务是否启用
AI_SEO_JOB_ENABLE = True AI_SEO_JOB_ENABLE = True
# aiseo任务运行间隔 # aiseo任务运行间隔
AI_SEO_JOB_INTERVAL = 5
AI_SEO_JOB_INTERVAL = 20
# aiseo任务获取平台 # aiseo任务获取平台
AI_SEO_JOB_PLATFORM_IDS = [ '2', '3', '4', '5', '7', '13'] AI_SEO_JOB_PLATFORM_IDS = [ '2', '3', '4', '5', '7', '13']
# aiseo任务最大并发量 # aiseo任务最大并发量
@ -42,7 +42,7 @@ DEEPSEEK_SEO_JOB_RANGE = {
'end_time': '23:59' 'end_time': '23:59'
} }
# deepseek任务是否启用 # deepseek任务是否启用
DEEPSEEK_JOB_ENABLE = True
DEEPSEEK_JOB_ENABLE = False
# deepseek任务获取间隔 # deepseek任务获取间隔
DEEPSEEK_JOB_INTERVAL = 30 DEEPSEEK_JOB_INTERVAL = 30
# deepseek任务获取平台 # deepseek任务获取平台

1
domain/ai_seo.py

@ -26,7 +26,6 @@ class AiSearchResult:
is_referenced: str = '0' is_referenced: str = '0'
#情感倾向" 1- 中立 2- 正面 3- 负面 #情感倾向" 1- 中立 2- 正面 3- 负面
sentiment_type = 0 sentiment_type = 0
#情感类型 #情感类型
type = 0 type = 0
def __post_init__(self): def __post_init__(self):

2
spiders/ai_seo/nanometer.py

@ -31,6 +31,8 @@ class NanometerSpider(AbstractAiSeoSpider):
self._init_data() self._init_data()
# 开始操作 # 开始操作
await self.browser_page.goto(self.get_home_url(), timeout=600000) await self.browser_page.goto(self.get_home_url(), timeout=600000)
#开启深度思考
await self.browser_page.locator('//*[@id="nworld-app-container"]/div/div[1]/div[1]/div/div/div/div/div[2]/div[1]/div[1]/div[2]/div[1]/section/div').click()
chat_input_element = self.browser_page.locator("//textarea[@id='composition-input']") chat_input_element = self.browser_page.locator("//textarea[@id='composition-input']")
# 输入提问词 # 输入提问词
await chat_input_element.fill(self.prompt) await chat_input_element.fill(self.prompt)

272
spiders/ai_seo/wenxiaoyan.py

@ -0,0 +1,272 @@
# coding=utf-8
import asyncio
import json
import time
from functools import partial, wraps
from json import JSONDecodeError
import requests
from glom import glom
from playwright.async_api import Browser
from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger, css_to_dict
from utils.image_utils import crop_image_left,remove_bottom_part
import re
logger = create_logger(__name__)
class WenxiaoyanSpider(AbstractAiSeoSpider):
    """Spider for the "wenxiaoyan" platform (platform id 8).

    Unlike the browser-driven spiders, the answer is fetched by calling the
    yiyanapp.baidu.com HTTP API directly with the cookie stored in
    ``self.session_info``. The browser page is only used to open the share
    page created for the answer and take a screenshot of it.
    """

    # Timeout (seconds) applied to every direct HTTP call made by this spider.
    _REQUEST_TIMEOUT = 30

    # NOTE(review): both URLs embed device/session identifiers (uid, zid, sid)
    # taken from a captured request; they probably belong in configuration.
    # NOTE: the trailing `jt` parameter is empty; replace it with the value
    # from a captured request if the endpoint requires it.
    _CHAT_URL = "https://yiyanapp.baidu.com/chat/completions?appmode=nor&appname=newapp&cfrom=1027594s&currentLLMModel=auto&ds_lv=0&ds_stc=0.0000&from=1027594s&llmMode=auto&matrixstyle=0&network=1_0&sid=148615_1-999987_2-999992_1-159581_2-999991_1-164345_3-160429_1-999989_1-165121_1-999990_1-162727_1-999999_71-999988_1-163324_4-999994_2-999993_1-160037_5&st=2&ua=1170_2532_iphone_4.20.0.10_0&uid=808BA951B6493F89F2D28A62C7438A9332566F6AAOHBGTHSHBE&ut=iPhone13%2C2_17.6.1&zid=U8HLkcXuwbe2-gJK1hw1Goz5QuMxZdMQJSqN2Fs15rnzb0Vss7pxP-zc-wBRYuJI8qmsMDWWRwkcd-EivBG0P8Q&jt="
    _SHARE_URL = "https://yiyanapp.baidu.com/api/share/create?appmode=nor&appname=newapp&cfrom=1027594s&currentLLMModel=auto&ds_lv=0&ds_stc=0.0000&from=1027594s&llmMode=auto&matrixstyle=0&network=1_0&sid=148615_1-999987_2-999992_1-159581_2-999991_1-164345_3-160429_1-999989_1-165121_1-999990_1-162727_1-999999_71-999988_1-163324_4-999994_2-999993_1-160037_5&st=2&ua=1170_2532_iphone_4.20.0.10_0&uid=808BA951B6493F89F2D28A62C7438A9332566F6AAOHBGTHSHBE&ut=iPhone13%252C2_17.6.1&zid=U8HLkcXuwbe2-gJK1hw1Goz5QuMxZdMQJSqN2Fs15rnzb0Vss7pxP-zc-wBRYuJI8qmsMDWWRwkcd-EivBG0P8Q"
    _USER_AGENT = "Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 light/1.0 newapp/4.20.0.10 (Baidu; P2 17.6.1)"

    # Matches the numeric index of every "citation:<n>" marker in an answer.
    _CITATION_RE = re.compile(r'citation:(\d+)')

    def __init__(self, browser: Browser, prompt: str, keyword: str):
        super().__init__(browser, prompt, keyword)

    def get_home_url(self) -> str:
        """Return an empty string: this spider does not open a home page."""
        return ''

    def get_platform_id(self) -> int:
        """Return the numeric id of this platform (8)."""
        return 8

    def get_platform_name(self) -> str:
        """Return the platform name, 'wenxiaoyan'."""
        return 'wenxiaoyan'

    @staticmethod
    def _build_chat_payload(question):
        """Build the JSON body of a chat-completions request for ``question``.

        Everything except ``content`` is a fixed value replayed from a
        captured request.
        """
        return {
            "parentId": "7955151",
            "inputSource": "user_input",
            "querySource": "user_input",
            "context": [
                "7955151",
                "7955150",
                "7955113",
                "7955112",
                "7955075",
                "7955074",
                "7955051",
                "7955050",
                "7955045",
                "7955044",
                "7955015",
                "7955014",
                "7954965",
                "7954964",
                "7954865",
                "7954864",
                "7954863",
                "7954862",
                "7954831",
                "7954830",
            ],
            "contentType": "text",
            "from": "app",
            "plugins": {},
            "models": {
                "dispatch": "",
                "force": "",
                "select": "auto",
                "replyMode": "normal",
            },
            "sessionId": "1159258",
            "chooses": None,
            "promptId": "",
            "botId": "144038",
            "characterId": "1034",
            "environment": {
                "isNewAB": 1,
                "depthAnalysis": 0,
                "hostTag": "main_chat",
                "composition": None,
                "sceneFrom": "",
                "individual": "0",
                "live": {
                    "visOnline": {
                        "per": "4189",
                        "pdt": "10170",
                        "audio_ctrl": '{"mid":"","sampling_rate":24000}',
                        "spd": "5",
                    },
                },
                "isReplyModeChange": 0,
                "memory": {
                    "noMemoryExtraction": 0,
                },
                "depthMsgId": "",
                "clarify": {
                    "enable": 1,
                    "stage": 1,
                },
                "interveneId": "",
            },
            "content": question,
        }

    @staticmethod
    def _parse_stream_event(line):
        """Decode one raw stream line into its nested ``data.data`` dict.

        :param line: raw bytes of one line of the event-stream response.
        :return: the dict found at ``event["data"]["data"]``, or ``None`` when
            the line is not a ``data:`` line, is not valid UTF-8 / JSON, or
            does not have that nested structure.
        """
        try:
            text = line.decode("utf-8")
        except UnicodeDecodeError:
            return None
        if not text.startswith("data:"):
            return None
        try:
            # Parse once and reuse, instead of re-parsing for every field.
            event = json.loads(text[5:])
        except JSONDecodeError:
            return None
        outer = event.get("data") if isinstance(event, dict) else None
        inner = outer.get("data") if isinstance(outer, dict) else None
        return inner if isinstance(inner, dict) else None

    def get_answer(self, question):
        """Ask ``question`` through the chat-completions API and collect the reply.

        This is a blocking call (it uses ``requests``); run it in an executor
        when calling from a coroutine.

        :param question: the prompt text to send.
        :return: a dict with keys ``messageId``, ``parentId``, ``reference``
            and ``answer``, or an empty dict when the HTTP request failed.
        """
        headers = {
            "Host": "yiyanapp.baidu.com",
            "Accept": "text/event-stream",
            "Sec-Fetch-Site": "same-origin",
            # The captured request also carried Acs-Token and X-Bd-Alive
            # headers; they are not sent here.
            "Accept-Language": "zh-CN,zh-Hans;q=0.9",
            "Sec-Fetch-Mode": "cors",
            "Accept-Encoding": "gzip, deflate, br",
            "Origin": "https://yiyanapp.baidu.com",
            "User-Agent": self._USER_AGENT,
            "Referer": "https://yiyanapp.baidu.com/talk/chat",
            "Connection": "keep-alive",
            "Content-Type": "application/json",
            "Cookie": self.session_info.get("cookie"),
        }
        payload = self._build_chat_payload(question)

        answer_parts = []
        reference = None
        message_id = ''
        parent_id = ''
        try:
            # The context manager releases the streamed connection even if
            # reading the stream fails half-way.
            with requests.post(self._CHAT_URL, headers=headers, json=payload,
                               stream=True, timeout=self._REQUEST_TIMEOUT) as response:
                response.raise_for_status()
                for line in response.iter_lines():
                    if not line:
                        continue
                    event = self._parse_stream_event(line)
                    if event is None:
                        # Malformed or irrelevant line: skip it, keep streaming.
                        continue
                    if event.get("reference"):
                        reference = event["reference"]
                    # Keep the last non-empty id so that a later event without
                    # ids cannot wipe out values already received.
                    message_id = event.get("messageId") or message_id
                    parent_id = event.get("parentId") or parent_id
                    content = event.get("content")
                    if isinstance(content, str):
                        answer_parts.append(content)
        except requests.RequestException as e:
            logger.error(f"wenxiaoyan chat request failed: {e}")
            return {}

        answer = "".join(answer_parts)
        logger.debug(f"wenxiaoyan answer received: {answer}")
        return {
            'messageId': message_id,
            'parentId': parent_id,
            'reference': reference,
            'answer': answer,
        }

    def get_url(self, idList):
        """Create a share page for the given messages and return its URL.

        This is a blocking call (it uses ``requests``).

        :param idList: comma-separated message ids, e.g. ``"7924487,7924488"``.
        :return: the share page URL taken from ``data.url`` of the response.
        :raises requests.RequestException: when the HTTP request fails.
        :raises ValueError: when the response carries no share URL.
        """
        headers = {
            "Host": "yiyanapp.baidu.com",
            "Accept": "application/json, text/plain, */*",
            "Sec-Fetch-Site": "same-origin",
            # The captured request also carried an Acs-Token header; it is
            # not sent here.
            "Accept-Language": "zh-CN,zh-Hans;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Sec-Fetch-Mode": "cors",
            "Content-Type": "application/json",
            "Origin": "https://yiyanapp.baidu.com",
            "User-Agent": self._USER_AGENT,
            "Referer": "https://yiyanapp.baidu.com/talk/chat",
            "Connection": "keep-alive",
            "Sec-Fetch-Dest": "empty",
            "Cookie": self.session_info.get("cookie"),
        }
        payload = {
            "idList": idList,
            "watermark": "qYbvOVmx",
            "vp": 0,
        }
        resp = requests.post(self._SHARE_URL, headers=headers, json=payload,
                             timeout=self._REQUEST_TIMEOUT)
        resp.raise_for_status()
        body = resp.json()  # parse once, reuse below
        logger.debug(f"wenxiaoyan share response: {body}")
        data = body.get("data") if isinstance(body, dict) else None
        share_url = data.get("url") if isinstance(data, dict) else None
        if not share_url:
            raise ValueError(f"wenxiaoyan share response has no url: {body}")
        return share_url

    async def _capture_share_page(self, share_url):
        """Open ``share_url`` in the browser page and screenshot the answer.

        :return: path of the saved screenshot, with the bottom 82px cut off.
        """
        await self.browser_page.goto(share_url, timeout=600000)
        await asyncio.sleep(3)
        # NOTE(review): purpose of this click is not visible from here
        # (presumably expands the answer) - confirm against the share page.
        clickable = self.browser_page.locator('//*[@id="35"]/div[1]/div/div/div[2]')
        await clickable.click()
        # Locate the answer element and size the viewport to fit it.
        answer_element = self.browser_page.locator('//*[@id="app"]/div/div[1]').nth(-1)
        box = await answer_element.bounding_box()
        if box is not None:
            await self.browser_page.set_viewport_size({
                'width': 1920,
                'height': int(box['height']) + 500
            })
        else:
            # bounding_box() returns None when the element is not visible.
            logger.warning("wenxiaoyan answer element has no bounding box; keeping current viewport")
        screenshot_path = self._get_screenshot_path()
        await self.browser_page.screenshot(path=screenshot_path)
        # Trim the bottom strip of the screenshot.
        remove_bottom_part(screenshot_path, 82)
        return screenshot_path

    def _build_search_results(self, reference, answer_text):
        """Convert the API ``reference`` payload into ``AiSearchResult`` items.

        A result is flagged as referenced when its 1-based position appears
        in a ``citation:<n>`` marker inside ``answer_text``. Results without
        a title or url are dropped.
        """
        cited = set(self._CITATION_RE.findall(answer_text))
        results = []
        for position, item in enumerate(reference.get("searchInfo") or [], start=1):
            host_name = item.get('site_name', '未知')
            ai_result = AiSearchResult(
                url=item.get('url', ''),
                title=item.get('title', ''),
                body=item.get('snippet', ''),
                publish_time=item.get('published_at', ''),
                host_name=host_name,
                is_referenced="1" if str(position) in cited else "0",
            )
            if ai_result.title and ai_result.url:
                results.append(ai_result)
                logger.debug(f"ai参考资料: [{host_name}]{ai_result.title}({ai_result.url})")
        return results

    async def _do_spider(self,) -> AiAnswer:
        """Fetch the answer for ``self.prompt`` and fill ``self.ai_answer``.

        The search results and the screenshot are only produced when the
        answer carries a reference; otherwise only the answer text is set.

        :raises RuntimeError: when the chat request failed.
        """
        self._init_data()
        self.search_result_count = 0
        logger.debug(f"wenxiaoyan prompt: {self.prompt}")

        # get_answer / get_url are blocking HTTP calls; run them in the
        # default executor so the event loop is not stalled meanwhile.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, self.get_answer, self.prompt)
        if not result:
            raise RuntimeError("wenxiaoyan chat request failed, no answer received")

        answer_text = result.get("answer") or ""
        reference = result.get("reference")
        if reference:
            parent_id = result.get("parentId")
            message_id = result.get("messageId")
            if parent_id and message_id:
                id_list = f"{parent_id},{message_id}"
                share_url = await loop.run_in_executor(None, self.get_url, id_list)
                self.ai_answer.screenshot_file = await self._capture_share_page(share_url)
            else:
                logger.warning("wenxiaoyan answer has no message ids; skipping screenshot")
            self.ai_answer.search_result = self._build_search_results(reference, answer_text)
            self.search_result_count = len(self.ai_answer.search_result)
        else:
            logger.warning("wenxiaoyan answer has no reference; skipping search results and screenshot")

        self.ai_answer.answer = answer_text
        return self.ai_answer

17
utils/image_utils.py

@ -41,3 +41,20 @@ def crop_image_left(image_path, crop_width):
print(f"处理图片时出错: {e}") print(f"处理图片时出错: {e}")
if 'temp_path' in locals() and os.path.exists(temp_path): if 'temp_path' in locals() and os.path.exists(temp_path):
os.remove(temp_path) os.remove(temp_path)
from PIL import Image
def remove_bottom_part(image_path, crop_height):
    """Cut ``crop_height`` pixels off the bottom of an image, in place.

    The cropped image is first written to a temporary file next to the
    original (``image_path + ".png"``) and then moved over the original.

    :param image_path: path of the image file to crop and overwrite.
    :param crop_height: number of pixels to remove from the bottom edge.
    :raises ValueError: when ``crop_height`` would remove the whole image.
    """
    temp_path = image_path + ".png"
    try:
        # Close the source file before replacing it; an open handle blocks
        # os.replace on Windows and would otherwise be left dangling.
        with Image.open(image_path) as img:
            width, height = img.size
            if crop_height >= height:
                raise ValueError(
                    f"crop_height ({crop_height}) must be smaller than the image height ({height})"
                )
            # Keep everything from the top down to height - crop_height.
            cropped_img = img.crop((0, 0, width, height - crop_height))
            cropped_img.save(temp_path, quality=95)
        # Overwrite the original file.
        os.replace(temp_path, image_path)
    except Exception:
        # Do not leave a half-written temporary file behind on failure.
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise
    logger.info(f"成功从底部切割 {crop_height}px 并覆盖原图")

2
utils/session_utils.py

@ -16,6 +16,8 @@ async def get_spider_session(platform_id):
session_info = await AiSeoApis.get_spider_session(platform_id) session_info = await AiSeoApis.get_spider_session(platform_id)
if not session_info: if not session_info:
raise Exception(f"平台id: {platform_id} 没有可用的爬虫session") raise Exception(f"平台id: {platform_id} 没有可用的爬虫session")
if platform_id == 8:
return session_info
# 根据id去爬虫文件夹中找 # 根据id去爬虫文件夹中找
target = search_session_file(session_info['id'], base_path) target = search_session_file(session_info['id'], base_path)
# 如果没有找到 下载这个文件并保存 # 如果没有找到 下载这个文件并保存

Loading…
Cancel
Save