Upload project code

master · zhurunlin · 4 months ago · commit 2d51402f2d
  1. abs_spider.py (+135)
  2. config.py (+76)
  3. config.py.example (+92)
  4. data/session/deepseek.json (+1)
  5. data/session/doubao.json (+1)
  6. data/session/kimi.json (+1)
  7. data/session/metaso.json (+1)
  8. data/session/tongyi.json (+1)
  9. data/session/yiyan.json (+1)
  10. data/session/yuanbao.json (+1)
  11. data/session_data/deepseek.json (+1)
  12. data/session_data/doubao.json (+1)
  13. data/session_data/kimi.json (+1)
  14. data/session_data/metaso.json (+1)
  15. data/session_data/tongyi.json (+1)
  16. data/session_data/yiyan.json (+1)
  17. data/session_data/yuanbao.json (+1)
  18. domain/__init__.py (+1)
  19. domain/ai_seo.py (+68)
  20. login.py (+100)
  21. main.py (+85)
  22. requirements.txt (+12)
  23. resave.py (+78)
  24. run.py (+238)
  25. run_deepseek.py (+119)
  26. spiders/__init__.py (+1)
  27. spiders/ai_seo/__init__.py (+9)
  28. spiders/ai_seo/deepseek.py (+189)
  29. spiders/ai_seo/doubao.py (+164)
  30. spiders/ai_seo/kimi.py (+148)
  31. spiders/ai_seo/metaso.py (+196)
  32. spiders/ai_seo/nanometer.py (+174)
  33. spiders/ai_seo/tongyi.py (+176)
  34. spiders/ai_seo/yiyan.py (+213)
  35. spiders/ai_seo/yuanbao.py (+174)
  36. static/stealth.min.js (+7)
  37. utils/__init__.py (+111)
  38. utils/ai.py (+114)
  39. utils/ai_seo_api_utils.py (+275)
  40. utils/image_utils.py (+43)
  41. utils/logger_utils.py (+48)
  42. utils/session_utils.py (+42)

135
abs_spider.py

@@ -0,0 +1,135 @@
# coding=utf-8
import asyncio
import os
import uuid
from abc import ABC, abstractmethod
from asyncio import Event
from playwright.async_api import Browser, BrowserContext, Page
import config
from domain.ai_seo import AiAnswer
from utils import create_logger
from utils.session_utils import get_spider_session
logger = create_logger("abs_spider")
class AbstractAiSeoSpider(ABC):
browser: Browser
browser_content: BrowserContext
browser_page: Page
platform_id: int
platform_name: str
prompt: str
keyword: str
completed_event: Event | None = None
ai_answer: AiAnswer | None = None
fail_status: bool = False
fail_exception: Exception | None = None
load_session: bool = True
session_info: dict | None = None
task_id: int = 0
think: bool = False
def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False, load_session: bool = True):
self.browser = browser
self.platform_id = self.get_platform_id()
self.platform_name = self.get_platform_name()
self.prompt = prompt
self.keyword = keyword
self.load_session = load_session
self.think = think
def _init_data(self):
self.completed_event = asyncio.Event()
self.ai_answer = AiAnswer(self.get_platform_id(), self.get_platform_name(), self.prompt, self.keyword)
self.index_data = None
def _get_session_path(self):
sessions = {
1: "deepseek",
5: "doubao",
4: "kimi",
2: "tongyi",
6: "yiyan",
3: "yuanbao"
}
# TODO: support managing multiple sessions per platform
session_path = f"./data/session/{sessions.get(self.platform_id, 'deepseek')}.json"
return session_path
def _get_screenshot_path(self):
unique_id = str(uuid.uuid4()).replace('-', '')
screenshot_path = f'{config.SCREENSHOT_BASE_PATH}/{self.platform_name}_{unique_id}.png'
return screenshot_path
async def __init_page(self):
if self.load_session:
self.session_info = await get_spider_session(self.platform_id)
self.browser_content = await self.browser.new_context(storage_state=self.session_info['session_path'])
else:
self.browser_content = await self.browser.new_context()
self.browser_page = await self.browser_content.new_page()
await self.browser_page.set_viewport_size(config.PAGE_INIT_VIEWPORT_SIZE)
# Load anti-detection (stealth) scripts
await self.browser_page.add_init_script("""
Object.defineProperties(navigator, {webdriver:{get:()=>false}});
""")
# Pass the stealth script as a file path; a bare positional string would be injected as inline script source
await self.browser_page.add_init_script(path='static/stealth.min.js')
async def _close(self):
await self.browser_page.close()
await self.browser_content.close()
async def _login(self):
"""
Open the platform home page, wait for a manual login, then persist the session state to a JSON file.
"""
await self.__init_page()
await self.browser_page.goto(self.get_home_url())
unique_id = str(uuid.uuid4()).replace('-', '')
session_path = f"./data/session/{self.get_platform_name()}/{unique_id}.json"
input("Log in manually, then press Enter to continue...")
# Make sure the per-platform session directory exists before saving
os.makedirs(os.path.dirname(session_path), exist_ok=True)
await self.browser_content.storage_state(path=session_path)
logger.info(f"[{self.platform_name}] login succeeded: {session_path}")
await self._close()
async def run(self) -> AiAnswer | None:
"""
Open a fresh page, run the platform-specific spider, and always close the page/context afterwards.
"""
try:
await self.__init_page()
logger.info(f"{self.platform_name} spider started, prompt: {self.prompt}")
return await self._do_spider()
except Exception as e:
logger.error(f"{self.platform_name} spider failed, args: {self.prompt, self.keyword}")
logger.error(f"Exception: {str(e)}")
raise e
finally:
await self._close()
@abstractmethod
async def _do_spider(self) -> AiAnswer:
"""
Platform-specific crawling logic; must return the populated AiAnswer.
"""
pass
@abstractmethod
def get_platform_id(self) -> int:
pass
@abstractmethod
def get_platform_name(self) -> str:
pass
@abstractmethod
def get_home_url(self) -> str:
pass
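For orientation, a minimal sketch of a concrete subclass (illustrative only, not a file in this commit; the class name and platform id 99 are hypothetical). Session loading, stealth scripts, screenshots and teardown all come from the base class; a platform spider only implements the four abstract members:

class ExampleSpider(AbstractAiSeoSpider):
    def get_platform_id(self) -> int:
        return 99  # hypothetical platform id

    def get_platform_name(self) -> str:
        return 'Example'

    def get_home_url(self) -> str:
        return 'https://example.com/'

    async def _do_spider(self) -> AiAnswer:
        self._init_data()
        await self.browser_page.goto(self.get_home_url())
        # A real spider would drive the chat UI here and fill in the answer
        self.ai_answer.answer = await self.browser_page.title()
        return self.ai_answer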

76
config.py

@@ -0,0 +1,76 @@
# coding=utf-8
LOG_LEVEL = 'DEBUG'
DC_ID = 'dev-01'
ROOT_PATH = r'C:\Users\Administrator\Desktop\spider_ai_seo'
SCREENSHOT_BASE_PATH = 'screenshot'
BROWSER_HANDLESS = False
BROWSER_ENABLE_SANDBOX = False
BROWSER_IGNORE_DEFAULT_ARGS = ["enable-automation"]
BROWSER_ARGS = ["--start-maximized", "--window-size=1920,1080"]  # Chrome expects "width,height"
PAGE_INIT_VIEWPORT_SIZE = {
'width': 1920,
'height': 1080
}
AI_SEO_BASE_URL = 'https://geo-api.neicela.com'
AI_SEO_API_AUTH = {
'app_id': 'aa65700299848d6f21b969dbc9f6cf7c',
'secret': '5588071d36f0bc61af849c311a03f2c4'
}
OPENAI_API_KEY = 'sk-d0107243adbb43f482cdb14d694b434f'
AI_SEO_JOB_RANGE = {
'start_time': '00:10',
'end_time': '23:59'
}
# Whether the aiseo job is enabled
AI_SEO_JOB_ENABLE = True
# aiseo job run interval (seconds)
AI_SEO_JOB_INTERVAL = 5
# Platform ids the aiseo job pulls tasks for
AI_SEO_JOB_PLATFORM_IDS = ['2', '3', '4', '5', '7', '13']
# Max concurrent aiseo job instances
AI_SEO_JOB_MAX_INSTANCES = 2
DEEPSEEK_SEO_JOB_RANGE = {
'start_time': '00:10',
'end_time': '23:59'
}
# Whether the deepseek job is enabled
DEEPSEEK_JOB_ENABLE = True
# deepseek job run interval (seconds)
DEEPSEEK_JOB_INTERVAL = 30
# Platform ids the deepseek job pulls tasks for
DEEPSEEK_JOB_PLATFORM_IDS = ['1']
# Max concurrent deepseek job instances
DEEPSEEK_JOB_MAX_INSTANCES = 1
# Test prompts
TEST_KEYWORDS = [
'正典燕窝',
'燕窝供货商',
'燕窝供应链平台',
'燕窝加盟',
'燕窝加盟选哪个品牌好',
]
# Test platforms
TEST_PLATFORM = [6]
# TEST_PLATFORM = []
# Test interval (seconds)
TEST_INTERVAL = 10
RESAVE_CONFIG = {
'platform_ids': '1',
'dates': ['2025-04-16']
}
# Redis config for the task queue (arq)
ARQ_REDIS_HOST = 'localhost'
ARQ_REDIS_PORT = 6379
ARQ_REDIS_DB = 5
ARQ_REDIS_PASSWORD = None

92
config.py.example

@@ -0,0 +1,92 @@
# coding=utf-8
LOG_LEVEL = 'INFO'
DC_ID = 'dev-01'
SCREENSHOT_BASE_PATH = 'screenshot'
BROWSER_HANDLESS = False
BROWSER_ENABLE_SANDBOX = False
BROWSER_IGNORE_DEFAULT_ARGS = ["enable-automation"]
BROWSER_ARGS = ["--start-maximized", "--window-size=1920,1080"]  # Chrome expects "width,height"
PAGE_INIT_VIEWPORT_SIZE = {
'width': 1920,
'height': 1080
}
AI_SEO_BASE_URL = 'https://aiseo-api.neicela.com'
AI_SEO_API_AUTH = {
'app_id': 'aa65700299848d6f21b969dbc9f6cf7c',
'secret': '5588071d36f0bc61af849c311a03f2c4'
}
OPENAI_API_KEY = 'sk-d0107243adbb43f482cdb14d694b434f'
AI_SEO_JOB_DATE = '2025-04-21'
# aiseo job run interval (seconds)
AI_SEO_JOB_INTERVAL = 20
# Platform ids the aiseo job pulls tasks for
AI_SEO_JOB_PLATFORM_IDS = ['2', '3', '4', '5', '6', '7']
# Max concurrent aiseo job instances
AI_SEO_JOB_MAX_INSTANCES = 2
# deepseek job run interval (seconds)
DEEPSEEK_JOB_INTERVAL = 120
# Platform ids the deepseek job pulls tasks for
DEEPSEEK_JOB_PLATFORM_IDS = ['1']
# Max concurrent deepseek job instances
DEEPSEEK_JOB_MAX_INSTANCES = 1
# Test prompts
TEST_KEYWORDS = [
# '行业咨询公司排行榜',
# '咨询公司排行榜',
# '中国有哪些知名的行业咨询公司',
# '请推荐中国十大行业咨询公司',
# '中国行业咨询机构中,哪些是最具影响力的',
# '行业顾问排行榜上,哪些公司名列前茅',
# '在行业咨询领域,哪些公司被誉为行业领军者',
# '我想了解中国行业咨询公司的排名,有吗',
# '请给我一份中国行业顾问公司的权威榜单',
# '中国有哪些行业咨询公司备受业界认可',
# '能否介绍一下中国行业咨询领域的佼佼者',
# '中国行业咨询公司的排名情况如何',
# '中国领先的行业顾问公司有哪些',
# '中国行业顾问排行榜上的公司都是哪些',
# '有没有一份关于中国行业咨询机构的综合排名',
# '十大行业研究机构排名',
# '行业研究机构推荐',
# '中国权威的行业研究机构有哪些',
# '能否列出一些中国顶级的行业研究机构',
# '哪些行业研究机构在中国市场上最具竞争力',
# '行业研究机构中,哪些在业界享有盛誉',
# '我想找一家权威的行业研究机构,有推荐吗',
# '请列举一些在行业研究方面有着深厚底蕴的中国公司',
# '我想了解中国行业研究领域的权威机构,能介绍一下吗?',
# '高端卫浴品牌',
# '数智卫浴品牌',
# '科技卫浴品牌',
# '智能马桶品牌',
# '智能淋浴器推荐',
# '小孩湿疹用什么药膏',
# '皮肤湿疹用什么药膏',
# '特应性皮炎用什么药膏最有效',
# '湿疹药膏排行榜',
# '皮炎性湿疹治疗药膏',
'大学生买什么平板电脑'
]
# Test platforms
# TEST_PLATFORM = [2,3,4,5,6,7]
TEST_PLATFORM = [7]
# Test interval (seconds)
TEST_INTERVAL = 15
RESAVE_CONFIG = {
'platform_ids': '1',
'dates': ['2025-04-16']
}
# Redis config for the task queue (arq)
ARQ_REDIS_HOST = 'localhost'
ARQ_REDIS_PORT = 6379
ARQ_REDIS_DB = 5
ARQ_REDIS_PASSWORD = None

1
data/session/deepseek.json
File diff suppressed because it is too large

1
data/session/doubao.json
File diff suppressed because it is too large

1
data/session/kimi.json
File diff suppressed because it is too large

1
data/session/metaso.json
File diff suppressed because it is too large

1
data/session/tongyi.json
File diff suppressed because it is too large

1
data/session/yiyan.json
File diff suppressed because it is too large

1
data/session/yuanbao.json
File diff suppressed because it is too large

1
data/session_data/deepseek.json
File diff suppressed because it is too large

1
data/session_data/doubao.json
File diff suppressed because it is too large

1
data/session_data/kimi.json
File diff suppressed because it is too large

1
data/session_data/metaso.json
File diff suppressed because it is too large

1
data/session_data/tongyi.json
File diff suppressed because it is too large

1
data/session_data/yiyan.json
File diff suppressed because it is too large

1
data/session_data/yuanbao.json
File diff suppressed because it is too large

1
domain/__init__.py

@@ -0,0 +1 @@
# coding=utf-8

68
domain/ai_seo.py

@@ -0,0 +1,68 @@
# coding=utf-8
from dataclasses import dataclass, field
import os
import config
import utils
from datetime import datetime
@dataclass
class AiSearchResult:
"""
A single AI web-search result.
"""
# Title
title: str = ''
# URL
url: str = ''
# Source site
host_name: str = ''
# Snippet / description
body: str = ''
# Publish time
publish_time: str | int | float = ''
# Whether the AI answer cites this result ("1" cited, "0" not)
is_referenced: str = '0'
# Sentiment: 1 - neutral, 2 - positive, 3 - negative
sentiment_type: str = ""
# Sentiment category (annotated so both fields are real dataclass fields and survive asdict())
type: str = ''
def __post_init__(self):
if isinstance(self.publish_time, float):
self.publish_time = int(self.publish_time)
if isinstance(self.publish_time, int):
self.publish_time = utils.convert_timestamp(self.publish_time).strftime('%Y-%m-%d')
if isinstance(self.publish_time, str):
try:
now = datetime.now()
# Only bare 'MM-DD' strings are normalized; anything else is left unchanged
publish = datetime.strptime(self.publish_time, '%m-%d')
except ValueError:
return
self.publish_time = publish.strftime(f'{now.year}-%m-%d')
@dataclass
class AiAnswer:
"""
An AI answer together with its run metadata.
"""
platform_id: int
platform_name: str
prompt: str
keyword: str
answer: str = ''
search_result: list[AiSearchResult] = field(default_factory=list)
screenshot_file: str = ''
# Run status
run_status: bool = True
def __post_init__(self):
self.screenshot_file = os.path.abspath(self.screenshot_file)
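A quick illustration of how __post_init__ normalizes publish_time, assuming utils.convert_timestamp interprets ints as unix timestamps (illustrative values, not from the repo):

r1 = AiSearchResult(title='t', url='u', publish_time=1713225600)    # int -> '2024-04-16' (modulo timezone)
r2 = AiSearchResult(title='t', url='u', publish_time='04-16')       # 'MM-DD' -> '<current year>-04-16'
r3 = AiSearchResult(title='t', url='u', publish_time='2024-04-16')  # other strings are left unchanged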

100
login.py

@@ -0,0 +1,100 @@
# coding=utf-8
import asyncio
import os
import time
from playwright.async_api import async_playwright
from abs_spider import AbstractAiSeoSpider
from spiders.ai_seo import *
import config
from utils import make_sha256_hash
from utils.ai_seo_api_utils import AiSeoApis
SPIDER_CLS = {
1: DeepseekSpider,
2: TongyiSpider,
3: YuanBaoSpider,
4: KimiSpider,
5: DouBaoSpider,
6: YiYanSpider,
7: NanometerSpider,
13: MetasoSpider
}
async def init_browser() -> tuple:
"""
Start Playwright and launch a Chrome instance; returns (playwright, browser).
"""
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=False,
chromium_sandbox=config.BROWSER_ENABLE_SANDBOX,
ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS,
channel="chrome",
args=config.BROWSER_ARGS)
return playwright, browser
async def main():
playwright, browser = await init_browser()
main_info = """
Select the platform to log in to:
1. DeepSeek
2. Tongyi
3. YuanBao
4. Kimi
5. DouBao
6. YiYan
13. Metaso AI Search
"""
print(main_info)
platform_id = input()
cls = SPIDER_CLS.get(int(platform_id), None)
# Abort if there is no spider class for this platform id
if not cls:
print('Unknown platform id')
return
# Ask for the login account
account = input('Login account: ')
# Create a spider instance (prompt/keyword are not needed here)
spider = cls(browser, '', '')
# Home page url
home_url = spider.get_home_url()
# Open the page
browser_content = await browser.new_context()
browser_page = await browser_content.new_page()
await browser_page.set_viewport_size(config.PAGE_INIT_VIEWPORT_SIZE)
print('Browser context created')
# Load anti-detection (stealth) scripts
await browser_page.add_init_script("""
Object.defineProperties(navigator, {webdriver:{get:()=>false}});
""")
await browser_page.add_init_script(path='static/stealth.min.js')
print('Stealth scripts loaded')
await browser_page.goto(home_url, timeout=6000000)
print('Home page loaded')
input('Log in with your phone number, submit the verification code, then press Enter to continue...')
# Save the logged-in context
session_path = f"{config.ROOT_PATH}/data/tmp/session_{time.time()}.json"
# Ensure the target directory exists
dir_path = os.path.dirname(session_path)
os.makedirs(dir_path, exist_ok=True)
await browser_content.storage_state(path=session_path)
await browser_page.close()
await browser_content.close()
await browser.close()
print(f"Login succeeded, session saved to {session_path}")
# Upload the saved session file
upload_data = await AiSeoApis.upload_session_file(session_path)
session_url = upload_data['url']
print(f"Session file uploaded, url: {session_url}")
# Compute the file hash
file_hash = make_sha256_hash(session_path)
result = await AiSeoApis.save_spider_session(platform_id, session_url, file_hash, account)
print("Session record saved")
print(result)
if __name__ == '__main__':
asyncio.run(main())

85
main.py

@@ -0,0 +1,85 @@
# coding=utf-8
import asyncio
import json
import os
from dataclasses import asdict
from datetime import datetime
from playwright.async_api import async_playwright
import config
from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer
from spiders.ai_seo import *
from utils.logger_utils import create_logger
logger = create_logger("app")
SPIDER_CLS = {
1: DeepseekSpider,
2: TongyiSpider,
3: YuanBaoSpider,
4: KimiSpider,
5: DouBaoSpider,
6: YiYanSpider,
7: NanometerSpider,
13: MetasoSpider
}
async def init_browser() -> tuple:
"""
Start Playwright and launch a Chrome instance; returns (playwright, browser).
"""
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=config.BROWSER_HANDLESS,
chromium_sandbox=config.BROWSER_ENABLE_SANDBOX,
ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS,
channel="chrome",
args=config.BROWSER_ARGS)
return playwright, browser
def get_spider(platform_id, prompt, brand, browser) -> AbstractAiSeoSpider:
cls = SPIDER_CLS.get(int(platform_id), None)
if not cls:
raise ValueError(f"No spider class registered for platform_id={platform_id}")
return cls(browser, prompt, brand, True)
def save_local(ai_answer: AiAnswer):
now = datetime.now().strftime("%Y-%m-%d")
base_path = f'./data/{ai_answer.platform_name}/{now}'
if not os.path.exists(base_path):
os.makedirs(base_path)
json_file_path = f'{base_path}/{ai_answer.prompt}.json'
_dict = asdict(ai_answer)
json_str = json.dumps(_dict, indent=4, ensure_ascii=False)
with open(json_file_path, 'w', encoding='utf-8') as f:
f.write(json_str)
logger.info(f"[{ai_answer.platform_name}]{ai_answer.prompt} saved to: {base_path}")
async def test():
playwright, browser = await init_browser()
prompts = config.TEST_KEYWORDS
index = 1
for prompt in prompts:
logger.info(f"[{index}/{len(prompts)}] {prompt}")
for platform in config.TEST_PLATFORM:
spider = get_spider(platform, prompt, '品牌词', browser)
ai_answer = await spider.run()
if ai_answer:
save_local(ai_answer)
await asyncio.sleep(config.TEST_INTERVAL)
index = index + 1
await asyncio.sleep(config.TEST_INTERVAL * 6)
if __name__ == '__main__':
asyncio.run(test())

12
requirements.txt

@@ -0,0 +1,12 @@
ftfy==6.3.1
glom==24.11.0
httpx==0.28.1
json_repair==0.40.0
loguru==0.7.3
openai~=1.72.0
playwright==1.51.0
arq~=0.26.3
APScheduler~=3.11.0
pillow~=11.2.1
pyperclip~=1.9.0
PyJWT~=2.10.1

78
resave.py

@@ -0,0 +1,78 @@
# coding=utf-8
import asyncio
import json
import os
from datetime import datetime
import random
import config
from utils import create_logger
from utils.ai import read_rank
from utils.ai_seo_api_utils import AiSeoApis
logger = create_logger(__name__)
platform_names = {
1: 'Deepseek',
2: 'TongYi',
3: 'YuanBao',
4: 'Kimi',
5: 'DouBao',
6: 'YiYan',
7: 'Nano'
}
async def main():
dates = config.RESAVE_CONFIG['dates']
for date in dates:
logger.info(f'start: {date}')
while True:
task_data = await AiSeoApis.get_one_task(date=date, platform_ids=config.RESAVE_CONFIG['platform_ids'])
if not task_data:
logger.info('No more tasks')
break
logger.info(f"Got task: id: {task_data['id']} keyword: {task_data['keyword']} brand: {task_data['brand']}")
await save(task_data)
await asyncio.sleep(1)
async def save(task_data):
keyword = task_data.get('keyword')
platform_id = task_data.get('platform_id')
gather_date_str = task_data.get('gather_filter')
gather_date = datetime.strptime(gather_date_str, "%Y-%m-%d %H:%M:%S")
dir_name = gather_date.strftime("%Y-%m-%d")
platform_name = platform_names.get(platform_id)
data_path = f'./data/{platform_name}/{dir_name}/{keyword}.json'
# Read the locally saved answer
if not os.path.exists(data_path):
logger.info(f'File not found: {data_path}')
return
json_data = {}
with open(data_path, 'r', encoding='utf-8') as file:
json_data = json.loads(file.read())
upload_data = await AiSeoApis.upload_screenshot_file(json_data['screenshot_file'])
json_data = {
**config.AI_SEO_API_AUTH,
**json_data,
'task_id': task_data['id'],
'rank': random.randint(0, 15),
'start_time': gather_date.strftime("%Y-%m-%d 06:10:15"),
'end_time': gather_date.strftime("%Y-%m-%d 06:12:15"),
'screenshot_url': upload_data['url']
}
if not json_data.get('answer', ''):
json_data['answer'] = '未知'
json_data['rank'] = 0
else:
brands, rank = await read_rank(json_data['answer'], task_data['brand'])
json_data['rank'] = rank
json_data['words'] = brands
result = await AiSeoApis.submit_task(json_data)
logger.info(f"Task resubmitted: id: {task_data['id']}")
if __name__ == '__main__':
asyncio.run(main())

238
run.py

@@ -0,0 +1,238 @@
# coding=utf-8
import asyncio
import uuid
from dataclasses import asdict
from datetime import datetime, timedelta
import requests
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from playwright.async_api import async_playwright, Browser
import config
from abs_spider import AbstractAiSeoSpider
from spiders.ai_seo import *
from utils import create_logger
from utils.ai import read_rank
from utils.ai_seo_api_utils import AiSeoApis
logger = create_logger(__name__)
scheduler = AsyncIOScheduler()
SPIDER_CLS = {
1: DeepseekSpider,
2: TongyiSpider,
3: YuanBaoSpider,
4: KimiSpider,
5: DouBaoSpider,
6: YiYanSpider,
7: NanometerSpider,
13: MetasoSpider
}
spider_pool: dict = {}
async def init_browser() -> tuple:
"""
Start Playwright and launch a Chrome instance; returns (playwright, browser).
"""
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=config.BROWSER_HANDLESS,
chromium_sandbox=config.BROWSER_ENABLE_SANDBOX,
ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS,
channel="chrome",
args=config.BROWSER_ARGS)
return playwright, browser
def get_spider(platform_id, prompt, brand, browser) -> AbstractAiSeoSpider | None:
"""
Build the spider instance for a platform id.
:param platform_id: platform id
:param prompt: prompt to ask
:param brand: brand keyword
:param browser: shared Browser instance
:return: an AI SEO spider instance, or None when the platform id is unknown
"""
# Look up the spider class by platform id
cls = SPIDER_CLS.get(int(platform_id), None)
# No matching spider class
if not cls:
return None
# Create and return the spider instance
return cls(browser, prompt, brand)
async def ai_seo_job(browser, platform_ids, time_range, job_id, type_name, run_id):
status, date = calc_task_date(time_range)
# if not status:
# # Check for urgent tasks
# task_result = await AiSeoApis.get_urgent_task_count()
# if task_result['count'] <= 0:
# return
current_job = scheduler.get_job(job_id)
# current_job.pause()
platform_str = ','.join(platform_ids)
# Fetch a task
task_data = await AiSeoApis.get_one_task(date=date, platform_ids=platform_str)
if not task_data:
logger.info(f'[{type_name}] no task available')
# current_job.resume()
return
task_id = task_data['id']
logger.info(f"Got a task from project {task_data['project_id']}: id: {task_data['id']} platform_id: {task_data['platform_id']} "
f"keyword: {task_data['keyword']} brand: {task_data['brand']}")
# Record the start time
start_time = datetime.now()
# Create the spider instance
spider = get_spider(task_data['platform_id'], task_data['keyword'], task_data['brand'], browser)
# Validate the platform id before using the instance
if not spider:
await AiSeoApis.update_task_status(task_id, 5)
logger.error(f"No spider class for this task, check the task data: id: {task_data['id']} platform_id: {task_data['platform_id']}")
return
# Record the task id
spider.task_id = task_id
spider_pool[run_id] = spider
logger.info(f"RunId registered: TaskId: {task_id} platform: {spider.platform_name}")
# Enable deep thinking if the task asks for it
if task_data['thinking'] == 1:
spider.think = True
ai_answer = None
try:
# Run the spider and collect the result
ai_answer = await spider.run()
except Exception as e:
await AiSeoApis.update_task_status(task_id, 4)
logger.info(f"Task status rolled back: id: {task_id} ({e})")
spider_pool.pop(run_id, None)
return
if not ai_answer:
await AiSeoApis.update_task_status(task_id, 4)
logger.error(f"Spider run failed: id: {task_data['id']} platform_id: {task_data['platform_id']}")
spider_pool.pop(run_id, None)
return
# Record the end time
end_time = datetime.now()
# Submit the spider result
answer_data = asdict(ai_answer)
# Upload the screenshot
upload_data = await AiSeoApis.upload_screenshot_file(answer_data['screenshot_file'])
# Result payload
answer_data = {
**config.AI_SEO_API_AUTH,
**answer_data,
'task_id': task_data['id'],
'rank': 0,
'start_time': start_time.strftime("%Y-%m-%d %H:%M:%S"),
'end_time': end_time.strftime("%Y-%m-%d %H:%M:%S"),
'screenshot_url': upload_data['url']
}
if not answer_data.get('answer', ''):
answer_data['answer'] = '未知'
answer_data['rank'] = 0
else:
brands, rank = await read_rank(answer_data['answer'], task_data['brand'])
answer_data['rank'] = rank
answer_data['words'] = brands
search_results = list()
for data in answer_data.get("search_result"):
data_ = {**config.AI_SEO_API_AUTH, "content": data.get("title")}
# Default to an empty dict so .get() below is safe when the request fails
rest = {}
try:
# Note: requests is synchronous, so this call blocks the event loop for up to `timeout` seconds
resp = requests.post(url='https://geo-api.neicela.com/api/third/getSentimentType', json=data_, timeout=600)
rest = resp.json()
except Exception as e:
logger.error(f"getSentimentType call failed: {e}")
if rest.get("code") == 0:
data.update(rest.get("data"))
search_results.append(data)
answer_data['search_result'] = search_results
result = await AiSeoApis.submit_task(answer_data)
logger.info(f"Task submitted: id: {task_data['id']}")
spider_pool.pop(run_id, None)
async def ai_seo_job_with_timeout(browser, platform_ids, time_range, job_id, type_name, timeout=1200):
# Generate a unique run id
run_id = str(uuid.uuid4()).replace("-", "")
try:
await asyncio.wait_for(ai_seo_job(browser, platform_ids, time_range, job_id, type_name, run_id), timeout)
except asyncio.TimeoutError:
spider = spider_pool.get(run_id, None)
if spider:
await spider._close()
logger.error(f"Task timed out: platform: {spider.platform_id}")
spider_pool.pop(run_id, None)
await AiSeoApis.update_task_status(spider.task_id, 4)
logger.info(f"Task status rolled back: id: {spider.task_id}")
async def heartbeat(browser: Browser):
load_count = len(browser.contexts)
result = await AiSeoApis.heartbeat(config.DC_ID, load_count)
logger.success(f"Heartbeat: dc id: {config.DC_ID} load: {load_count} sent at: {result.get('send_time', '')}")
def calc_task_date(time_range):
# Parse the window bounds
start_time = datetime.strptime(time_range['start_time'], "%H:%M").time()
end_time = datetime.strptime(time_range['end_time'], "%H:%M").time()
# Current local time (naive; no timezone attached)
now = datetime.now()
current_time = now.time()
# Window check
if end_time < start_time:
# Window crosses midnight
if current_time >= start_time or current_time <= end_time:
# Before the end time on the following day, the task date is yesterday
start_date = (now - timedelta(days=1)).date() if current_time <= end_time else now.date()
return True, start_date.strftime("%Y-%m-%d")
else:
# Same-day window
if start_time <= current_time <= end_time:
return True, now.date().strftime("%Y-%m-%d")
return False, None
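# Illustration (hypothetical window, not part of the original source):
# with {'start_time': '22:00', 'end_time': '02:00'} the window crosses midnight, so
#   at 01:30 -> (True, yesterday's date)   # still inside the window that started yesterday
#   at 23:00 -> (True, today's date)
#   at 12:00 -> (False, None)              # outside the window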
async def main():
# Initialize the browser instance
playwright, browser = await init_browser()
logger.info('Browser initialized')
if config.AI_SEO_JOB_ENABLE:
# Schedule the generic-platform aiseo job
scheduler.add_job(ai_seo_job_with_timeout, 'interval',
id='ai_seo_job', seconds=config.AI_SEO_JOB_INTERVAL, max_instances=config.AI_SEO_JOB_MAX_INSTANCES, coalesce=False,
args=[browser, config.AI_SEO_JOB_PLATFORM_IDS, config.AI_SEO_JOB_RANGE, 'ai_seo_job', 'generic AI platforms'])
logger.success('Generic AI platform job scheduled')
if config.DEEPSEEK_JOB_ENABLE:
# Schedule the deepseek aiseo job
scheduler.add_job(ai_seo_job_with_timeout, 'interval',
id='deepseek_ai_seo_job', seconds=config.DEEPSEEK_JOB_INTERVAL,
max_instances=config.DEEPSEEK_JOB_MAX_INSTANCES, coalesce=False,
args=[browser, config.DEEPSEEK_JOB_PLATFORM_IDS, config.DEEPSEEK_SEO_JOB_RANGE,
'deepseek_ai_seo_job', 'DeepSeek'])
logger.success('DeepSeek job scheduled')
# Schedule the heartbeat job
# scheduler.add_job(heartbeat, 'interval', id='heartbeat', seconds=30, args=[browser])
# logger.info('Heartbeat job scheduled')
scheduler.start()
await asyncio.Future()  # keep the event loop alive
if __name__ == '__main__':
asyncio.run(main())  # start the event loop

119
run_deepseek.py

@@ -0,0 +1,119 @@
# coding=utf-8
import asyncio
import json
import os
from dataclasses import asdict
from datetime import datetime
from playwright.async_api import async_playwright
import config
from domain.ai_seo import AiAnswer
from spiders.ai_seo import *
from utils.logger_utils import create_logger
logger = create_logger("app")
async def init_browser() -> tuple:
"""
Start Playwright and launch a Chrome instance; returns (playwright, browser).
"""
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=config.BROWSER_HANDLESS,
chromium_sandbox=config.BROWSER_ENABLE_SANDBOX,
ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS,
channel="chrome",
args=config.BROWSER_ARGS)
return playwright, browser
def save_local(ai_answer: AiAnswer):
now = datetime.now().strftime("%Y-%m-%d")
base_path = f'./data/{ai_answer.platform_name}/{now}'
if not os.path.exists(base_path):
os.makedirs(base_path)
json_file_path = f'{base_path}/{ai_answer.prompt}.json'
_dict = asdict(ai_answer)
json_str = json.dumps(_dict, indent=4, ensure_ascii=False)
with open(json_file_path, 'w', encoding='utf-8') as f:
f.write(json_str)
logger.info(f"[{ai_answer.platform_name}]{ai_answer.prompt} saved to: {base_path}")
async def main():
playwright, browser = await init_browser()
ai_answer = await KimiSpider(browser, '2025前十的电动自行车推荐', '美的').run()
if ai_answer:
save_local(ai_answer)
async def test():
prompts = [
# '行业咨询公司排行榜',
# # '咨询公司排行榜',
# '中国有哪些知名的行业咨询公司',
# # '请推荐中国十大行业咨询公司',
# '中国行业咨询机构中,哪些是最具影响力的',
# '行业顾问排行榜上,哪些公司名列前茅',
# # '在行业咨询领域,哪些公司被誉为行业领军者',
# '我想了解中国行业咨询公司的排名,有吗',
# '请给我一份中国行业顾问公司的权威榜单',
# # '中国有哪些行业咨询公司备受业界认可',
# '能否介绍一下中国行业咨询领域的佼佼者',
# '中国行业咨询公司的排名情况如何',
# # '中国领先的行业顾问公司有哪些',
# '中国行业顾问排行榜上的公司都是哪些',
# # '有没有一份关于中国行业咨询机构的综合排名',
# '十大行业研究机构排名',
# '行业研究机构推荐',
# # '中国权威的行业研究机构有哪些',
# '能否列出一些中国顶级的行业研究机构',
'哪些行业研究机构在中国市场上最具竞争力',
# '行业研究机构中,哪些在业界享有盛誉',
'我想找一家权威的行业研究机构,有推荐吗',
# '请列举一些在行业研究方面有着深厚底蕴的中国公司',
'我想了解中国行业研究领域的权威机构,能介绍一下吗?',
# '高端卫浴品牌',
'数智卫浴品牌',
# '科技卫浴品牌',
'智能马桶品牌',
'智能淋浴器推荐',
# '小孩湿疹用什么药膏',
# '皮肤湿疹用什么药膏',
# '特应性皮炎用什么药膏最有效',
# '湿疹药膏排行榜',
# '皮炎性湿疹治疗药膏',
]
retry_prompts = []
playwright, browser = await init_browser()
index = 1
for prompt in prompts:
logger.info(f"[{index}/{len(prompts)}] {prompt}")
ai_answer = await DeepseekSpider(browser, prompt, '头豹,沙利文').run()
if ai_answer and ai_answer.run_status:
save_local(ai_answer)
if not ai_answer.run_status:
retry_prompts.append(prompt)
logger.info(f"[{len(prompts)}] {prompt} crawl failed")
index = index + 1
await asyncio.sleep(300)
for prompt in retry_prompts:
logger.info(f"Retry [{index}/{len(prompts)}] {prompt}")
ai_answer = await DeepseekSpider(browser, prompt, '头豹,沙利文').run()
if ai_answer and ai_answer.run_status:
save_local(ai_answer)
if not ai_answer.run_status:
logger.info(f"[{len(prompts)}] {prompt} crawl failed")
index = index + 1
await asyncio.sleep(300)
if __name__ == '__main__':
asyncio.run(test())

1
spiders/__init__.py

@@ -0,0 +1 @@
# coding=utf-8

9
spiders/ai_seo/__init__.py

@@ -0,0 +1,9 @@
# coding=utf-8
from .kimi import KimiSpider
from .deepseek import DeepseekSpider
from .nanometer import NanometerSpider
from .yiyan import YiYanSpider
from .yuanbao import YuanBaoSpider
from .tongyi import TongyiSpider
from .doubao import DouBaoSpider
from .metaso import MetasoSpider

189
spiders/ai_seo/deepseek.py

@@ -0,0 +1,189 @@
# coding=utf-8
import asyncio
import json
import re
from functools import partial, wraps
from json import JSONDecodeError
from glom import glom
from playwright.async_api import Browser
from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger, css_to_dict
from utils.image_utils import crop_image_left
logger = create_logger(__name__)
class DeepseekSpider(AbstractAiSeoSpider):
def __init__(self, browser: Browser, prompt: str, keyword: str):
super().__init__(browser, prompt, keyword)
self.__listen_response = self.handle_listen_response_error(self.__listen_response)
def get_home_url(self) -> str:
return 'https://chat.deepseek.com/'
def get_platform_id(self) -> int:
return 1
def get_platform_name(self) -> str:
return 'DeepSeek'
async def _do_spider(self) -> AiAnswer:
self._init_data()
self.search_result_count = 0
await self.browser_page.goto(self.get_home_url(), timeout=600000)
await asyncio.sleep(3)
# Enable web search
search_btn = self.browser_page.locator("span:text('联网搜索')").locator('..')
if await search_btn.is_visible():
await search_btn.click()
if self.think:
# Enable deep thinking
think_btn = self.browser_page.locator("span:text('深度思考 (R1)')").locator('..')
if await think_btn.is_visible():
styles = css_to_dict(await think_btn.get_attribute('style'))
# White button color means the toggle is still off
if styles.get('--ds-button-color') == '#fff':
await think_btn.click()
await asyncio.sleep(1)
chat_input_element = self.browser_page.locator("//textarea[@id='chat-input']")
await chat_input_element.click()
# Type the prompt
await self.browser_page.keyboard.type(self.prompt)
await asyncio.sleep(1)
await self.browser_page.keyboard.press('Enter')
# Listen for responses
self.browser_page.on('response', partial(self.__listen_response))
await self.completed_event.wait()
# Check for failures
if self.fail_status:
raise self.fail_exception
# Open the search-results panel
search_btn_text = f'已搜索到 {self.search_result_count} 个网页'
search_btn = self.browser_page.locator(f"div:text('{search_btn_text}')")
# search_btn = self.browser_page.locator('div:has-text("搜索到")')
if await search_btn.count() > 0:
await search_btn.click()
await asyncio.sleep(2)
if self.think:
# Expand the deep-think block
think_element = self.browser_page.locator("text=已深度思考(")
think_element_count = await think_element.count()
if think_element_count > 0:
await think_element.nth(-1).click()
await asyncio.sleep(2)
# Answer element
answer = self.browser_page.locator("//div[@class='ds-markdown ds-markdown--block']").nth(-1)
box = await answer.bounding_box()
# Resize the viewport to fit the answer
await self.browser_page.set_viewport_size({
'width': 1920,
'height': int(box['height']) + 500
})
# Screenshot
screenshot_path = self._get_screenshot_path()
await self.browser_page.screenshot(path=screenshot_path)
# Crop off the left sidebar
crop_image_left(screenshot_path, 250)
self.ai_answer.screenshot_file = screenshot_path
return self.ai_answer
def handle_listen_response_error(self, func):
"""
Wrap the response listener so any exception marks the run as failed and releases the completed event.
:param func: listener coroutine to wrap
:return: the wrapped coroutine
"""
@wraps(func)
async def wrapper(*args, **kwargs):
try:
return await func(*args, **kwargs)
except Exception as e:
logger.error(f"DeepSeek响应异常: {e}", exc_info=True)
# Mark the failure and keep the exception
self.fail_status = True
self.fail_exception = e
self.completed_event.set()
return wrapper
async def __listen_response(self, response):
if '/api/v0/chat/completion' not in response.url:
return
# Read the streamed body
response_text = ''
search_result_lists = list()
start_content = False
stream = await response.body()
body = stream.decode('utf-8')
datas = body.split("\n\n")
for data_str in datas:
if not data_str:
continue
data_str = data_str.replace('data: ', '')
try:
data = json.loads(data_str)
if glom(data, 'v.0.v', default='') == 'TIMEOUT':
self.fail_status = True
logger.error("DeepSeek reported a busy server (TIMEOUT)")
except JSONDecodeError:
continue
# AI search results chunk
if data.get('p', '') == 'response/search_results' or isinstance(data.get('v', ''), list):
logger.debug(f"Got web search results")
search_result_list = data.get('v', [])
# Keep the raw list so citation flags can be applied after the stream ends
search_result_lists = search_result_list
# Save the search results
ai_search_result_list = []
for search_result in search_result_list:
url = search_result.get('url', '')
title = search_result.get('title', '')
body = search_result.get('snippet', '')
publish_time = search_result.get('published_at', '')
host_name = search_result.get('site_name', '未知')
ai_result = AiSearchResult(url=url, title=title, body=body, publish_time=publish_time, host_name=host_name)
if ai_result.title and ai_result.url:
ai_search_result_list.append(ai_result)
logger.debug(f"AI reference: [{host_name}]{title}({url})")
if ai_search_result_list:
self.ai_answer.search_result = ai_search_result_list
self.search_result_count = len(self.ai_answer.search_result)
continue
# Content chunks start here
if data.get('p', '') == 'response/content':
start_content = True
if start_content:
# Append the answer delta
value = data.get('v', None)
if isinstance(value, dict):
continue
if value is None:
target = 'choices.0.delta.content'
value = glom(data, target, default="")
response_text = response_text + str(value)
# Extract the cited indices from the citation: markers
citation = list()
citations = re.findall(r'citation:(\d+)', response_text)
if citations:
citation = list(set(citations))
# Re-save the search results with citation flags
ai_search_result_list = []
for index, search_result in enumerate(search_result_lists):
url = search_result.get('url', '')
title = search_result.get('title', '')
body = search_result.get('snippet', '')
publish_time = search_result.get('published_at', '')
host_name = search_result.get('site_name', '未知')
if str(index + 1) in citation:
is_referenced = "1"
else:
is_referenced = "0"
ai_result = AiSearchResult(url=url, title=title, body=body, publish_time=publish_time, host_name=host_name, is_referenced=is_referenced)
if ai_result.title and ai_result.url:
ai_search_result_list.append(ai_result)
logger.debug(f"AI reference: [{host_name}]{title}({url})")
if ai_search_result_list:
self.ai_answer.search_result = ai_search_result_list
self.search_result_count = len(self.ai_answer.search_result)
logger.debug(response_text)
self.ai_answer.answer = response_text
self.completed_event.set()
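DeepSeek streams citation markers such as [citation:3] inside the answer text; the regex above reduces them to the cited indices. A toy check (illustrative input, not a captured payload):

import re
text = '推荐正典燕窝[citation:3],另见[citation:1][citation:3]'
print(sorted(set(re.findall(r'citation:(\d+)', text))))  # ['1', '3']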

164
spiders/ai_seo/doubao.py

@@ -0,0 +1,164 @@
# -*- coding: utf-8 -*-
import asyncio
from functools import partial, wraps
from json import JSONDecodeError
import ftfy
from glom import glom
from playwright.async_api import Browser
from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger, parse_nested_json
logger = create_logger(__name__)
class DouBaoSpider(AbstractAiSeoSpider):
def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False):
super().__init__(browser, prompt, keyword, think)
self.__listen_response = self.handle_listen_response_error(self.__listen_response)
def get_home_url(self) -> str:
return 'https://www.doubao.com/chat'
async def _do_spider(self) -> AiAnswer:
# Initialize run data
self._init_data()
await self.browser_page.goto(self.get_home_url(), timeout=600000)
await asyncio.sleep(3)
if self.think:
think_btn = self.browser_page.locator("//button[@title='深度思考']")
if await think_btn.is_visible():
clazz = (await think_btn.get_attribute('class')).split(' ')
# Check the active-state class so an enabled toggle is not switched off
target_class = [c for c in clazz if c.startswith("active-")]
if not target_class:
await think_btn.click()
await asyncio.sleep(2)
# Start interacting
chat_input_element = self.browser_page.locator("//textarea[@data-testid='chat_input_input']")
# Type the prompt
await chat_input_element.fill(self.prompt)
await self.browser_page.keyboard.press('Enter')
# Listen for responses
self.browser_page.on('response', partial(self.__listen_response))
await asyncio.sleep(2)
await self.completed_event.wait()
# Check for failures
if self.fail_status:
raise self.fail_exception
# Close the sidebar
sider_bar_element = self.browser_page.locator("//button[@data-testid='siderbar_close_btn']")
if await sider_bar_element.is_visible():
await sider_bar_element.click()
# References popup
search_result_popup_element = self.browser_page.locator("//div[contains(@class, 'search-item-transition-')]")
# References button
search_result_btn_list = self.browser_page.locator("//div[contains(@class, 'entry-btn-')]")
if await search_result_btn_list.count() > 0 and not await search_result_popup_element.count() > 0:
await search_result_btn_list.nth(-1).click()
await asyncio.sleep(2)
# Search result elements
search_result_element_list = self.browser_page.locator("//a[contains(@class, 'search-')]")
ai_search_result_list = []
if await search_result_element_list.count() > 0:
for index,search_result_element in enumerate(await search_result_element_list.all()):
url = await search_result_element.get_attribute('href')
title = ''
desc = ''
host_name = ''
title_element = search_result_element.locator("xpath=.//div[contains(@class, 'search-item-title-')]")
desc_element = search_result_element.locator("xpath=.//div[contains(@class, 'search-item-summary-')]")
host_name_element = search_result_element.locator("xpath=.//span[contains(@class, 'footer-title-')]")
# Title
if await title_element.is_visible():
title = await title_element.inner_text()
# Snippet
if await desc_element.is_visible():
desc = await desc_element.inner_text()
# Source
if await host_name_element.is_visible():
host_name = await host_name_element.inner_text()
# index_data is only populated when citation markers were parsed; guard against None
if self.index_data and index + 1 in self.index_data:
is_referenced = "1"
else:
is_referenced = "0"
ai_search_result_list.append(AiSearchResult(
title=title,
url=url,
host_name=host_name,
body=desc,
is_referenced=is_referenced
))
logger.debug(f'Search result: [{host_name}]{title}({url})')
self.ai_answer.search_result = ai_search_result_list
# Answer element
answer_element = self.browser_page.locator("//div[@data-testid='receive_message']").nth(-1)
box = await answer_element.bounding_box()
logger.debug(f'answer_element: {box}')
view_port_height = box['height'] + 500
# Resize the viewport
await self.browser_page.set_viewport_size({
'width': 1920,
'height': int(view_port_height)
})
# Screenshot
screenshot_path = self._get_screenshot_path()
await self.browser_page.screenshot(path=screenshot_path, full_page=True)
self.ai_answer.screenshot_file = screenshot_path
return self.ai_answer
def handle_listen_response_error(self, func):
"""
Wrap the response listener so any exception marks the run as failed and releases the completed event.
:param func: listener coroutine to wrap
:return: the wrapped coroutine
"""
@wraps(func)
async def wrapper(*args, **kwargs):
try:
return await func(*args, **kwargs)
except Exception as e:
logger.error(f"{self.get_platform_name()}响应异常: {e}", exc_info=True)
# Mark the failure and keep the exception
self.fail_status = True
self.fail_exception = e
self.completed_event.set()
return wrapper
async def __listen_response(self, response):
# Read the streamed body
if '/samantha/chat/completion' in response.url:
answer = ''
datas = []
response_text = ftfy.fix_text(await response.text())
lines = response_text.split("\n\n")
for line in lines:
if line.startswith('data: '):
line = line[6:]
try:
data = parse_nested_json(line)
datas.append(data)
event_data = data.get('event_data', {})
target_key = 'message.content.text'
text = glom(event_data, target_key, default=None)
if text is not None:
answer = answer + str(text)
except JSONDecodeError:
continue
logger.debug(f"AI answer: {answer}")
self.ai_answer.answer = answer
self.completed_event.set()
def get_platform_id(self) -> int:
return 5
def get_platform_name(self) -> str:
return 'DouBao'
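ftfy.fix_text is used above to repair mojibake in the streamed body before JSON parsing. The canonical example from ftfy's own documentation shows the effect (illustrative; the Doubao stream may or may not need this):

import ftfy
print(ftfy.fix_text('âœ” No problems'))  # '✔ No problems'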

148
spiders/ai_seo/kimi.py

@@ -0,0 +1,148 @@
# coding=utf-8
import asyncio
from functools import partial, wraps
from playwright.async_api import Browser
from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger
from glom import glom, Coalesce
logger = create_logger(__name__)
class KimiSpider(AbstractAiSeoSpider):
def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False):
super().__init__(browser, prompt, keyword, think)
self.__listen_response = self.handle_listen_response_error(self.__listen_response)
def get_home_url(self) -> str:
return 'https://www.kimi.ai'
def get_platform_id(self) -> int:
return 4
def get_platform_name(self) -> str:
return 'Kimi'
async def _do_spider(self) -> AiAnswer:
# Initialize run data (completed_event, ai_answer, index_data)
self._init_data()
await self.browser_page.goto(self.get_home_url(), timeout=600000)
await asyncio.sleep(3)
if self.think:
think_btn = self.browser_page.locator("span:text('长思考 (k1.5)')").locator('..')
if await think_btn.is_visible():
clazz = (await think_btn.get_attribute('class')).split(' ')
if 'open' not in clazz:
await think_btn.click()
await asyncio.sleep(2)
chat_input_element = self.browser_page.locator("//div[@class='chat-input']")
await chat_input_element.click()
# Type the prompt
await self.browser_page.keyboard.type(self.prompt)
await asyncio.sleep(2)
await self.browser_page.keyboard.press('Enter')
# Listen for responses
self.browser_page.on('response', partial(self.__listen_response))
await self.completed_event.wait()
await asyncio.sleep(2)
# Check for failures
if self.fail_status:
raise self.fail_exception
# Collapse the sidebar
sidebar_element = self.browser_page.locator("//div[@class='expand-btn']")
if await sidebar_element.is_visible():
await sidebar_element.click()
# Answer element
answer_element = self.browser_page.locator("//div[@class='segment-container']").nth(-1)
box = await answer_element.bounding_box()
logger.debug(f'answer_element: {box}')
view_port_height = box['height'] + 500
# Resize the viewport
await self.browser_page.set_viewport_size({
'width': 1920,
'height': int(view_port_height)
})
# Open the search results panel
search_list_content_element = self.browser_page.locator("//div[contains(@class, 'side-console-container')]")
search_list_element = self.browser_page.locator("//div[@class='search-plus']")
if await search_list_element.is_visible() and not await search_list_content_element.is_visible():
await search_list_element.click()
# Screenshot
screenshot_path = self._get_screenshot_path()
self.ai_answer.screenshot_file = screenshot_path
await self.browser_page.screenshot(path=screenshot_path)
return self.ai_answer
async def __listen_response(self, response):
if '/segment/scroll' in response.url:
json_data = await response.json()
if json_data['items']:
logger.debug(json_data)
detail = json_data['items'][-1]
content = detail['content']
if self.think:
self.ai_answer.search_result = self.get_search_list_enable_think(detail)
else:
self.ai_answer.search_result = self.get_search_list_disable_think(detail)
self.ai_answer.answer = content
logger.debug(f"AI answer: {content}")
self.completed_event.set()
def handle_listen_response_error(self, func):
"""
Wrap the response listener so any exception marks the run as failed and releases the completed event.
:param func: listener coroutine to wrap
:return: the wrapped coroutine
"""
@wraps(func)
async def wrapper(*args, **kwargs):
try:
return await func(*args, **kwargs)
except Exception as e:
logger.error(f"{self.get_platform_name()}响应异常: {e}", exc_info=True)
# Mark the failure and keep the exception
self.fail_status = True
self.fail_exception = e
self.completed_event.set()
return wrapper
def get_search_list_disable_think(self, detail):
"""
Parse the search results when deep thinking is disabled.
:param detail: the last item of the /segment/scroll payload
:return: list of AiSearchResult
"""
answer_search_list = []
search_result_list = detail.get('search_plus', [])
for search_result in search_result_list:
event = search_result.get('event', '')
msg = search_result.get('msg', {})
msg_type = msg.get('type', '')
if event == 'search_plus' and msg_type == 'get_res':
answer_search_list.append(
AiSearchResult(msg['title'], msg['url'], msg['site_name'], msg['snippet'], msg['date']))
logger.debug(f"AI reference: {msg['title']}({msg['url']})")
return answer_search_list
def get_search_list_enable_think(self, detail):
"""
Parse the search results when deep thinking is enabled.
:param detail: the last item of the /segment/scroll payload
:return: list of AiSearchResult
"""
answer_search_list = []
keys = 'contents.zones.0.sections.0.k1.search_results'
search_result_list = glom(detail, keys, default=[])
for search_result in search_result_list:
answer_search_list.append(
AiSearchResult(search_result['title'], search_result['url'], search_result['site_name'], search_result['snippet'], search_result['date']))
logger.debug(f"AI reference: {search_result['title']}({search_result['url']})")
return answer_search_list
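glom resolves dotted paths through nested dicts and lists (numeric segments index into lists), which is why 'contents.zones.0.sections.0.k1.search_results' works on the Kimi payload. A toy example with a hypothetical payload shape:

from glom import glom
detail = {'contents': {'zones': [{'sections': [{'k1': {'search_results': [{'title': 't1'}]}}]}]}}
print(glom(detail, 'contents.zones.0.sections.0.k1.search_results', default=[]))  # [{'title': 't1'}]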

196
spiders/ai_seo/metaso.py

@@ -0,0 +1,196 @@
# -*- coding: utf-8 -*-
import asyncio
import json
import re
from functools import partial, wraps
from json import JSONDecodeError
import ftfy
import pyperclip
from playwright.async_api import Browser, async_playwright, TimeoutError as PlaywrightTimeoutError
import config
from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger
logger = create_logger(__name__)
class MetasoSpider(AbstractAiSeoSpider):
def __init__(self, browser: Browser, prompt: str, keyword: str, load_session: bool = True):
# Pass load_session by keyword; the fourth positional slot of the base class is `think`
super().__init__(browser, prompt, keyword, load_session=load_session)
self.__listen_response = self.handle_listen_response_error(self.__listen_response)
def get_home_url(self) -> str:
return 'https://metaso.cn/'
async def _do_spider(self) -> AiAnswer:
# Initialize run data
self._init_data()
await self.browser_page.goto(self.get_home_url(), timeout=600000)
await asyncio.sleep(2)
# Start interacting
chat_input_element = self.browser_page.locator("//textarea[contains(@class, 'search-consult-textarea')]")
# Type the prompt
await chat_input_element.fill(self.prompt)
await self.browser_page.keyboard.press('Enter')
# Response listening is disabled; the copy button is read instead
await asyncio.sleep(2)
# self.browser_page.on('response', partial(self.__listen_response))
await self.browser_page.reload()
# await self.completed_event.wait()
# Wait for the copy button next to the report button
copy_button = await self.browser_page.wait_for_selector("//button[@id='generateInteractiveReportButton']/preceding-sibling::div[1]/button", timeout=600000)
# Click the copy button
await copy_button.click()
# Read the answer from the clipboard
self.ai_answer.answer = pyperclip.paste()
logger.debug(f'AI answer: {self.ai_answer}')
# Collect the reference list
try:
await self.browser_page.wait_for_selector("//div[contains(@class, 'meta-ordered-list_list-item')]/span", timeout=60000)
search_items = self.browser_page.locator("//div[contains(@class, 'meta-ordered-list_list-item')]/span")
search_item_count = await search_items.count()
logger.debug(f'Reference count: {search_item_count}')
await asyncio.sleep(5)
search_results = []
for i in range(search_item_count):
search_result = AiSearchResult()
search_item = search_items.nth(i)
# Extract link and title
a = search_item.locator("xpath=./a")
# Extract publish date
publish_date_element = search_item.locator("xpath=./span")
if await a.is_visible():
search_result.title = await a.text_content()
search_result.url = await a.get_attribute('href')
if await publish_date_element.count() > 0:
publish_date_element = search_item.locator("xpath=./span").nth(-1)
publish_str = await publish_date_element.text_content()
search_result.publish_time = publish_str.replace('[', '').replace(']', '')
search_results.append(search_result)
self.ai_answer.search_result = search_results
except PlaywrightTimeoutError:
# The built-in TimeoutError would never match Playwright's timeout exception
logger.error('No search results found')
# Check for failures
if self.fail_status:
raise self.fail_exception
# Answer element
answer_element = self.browser_page.locator("//div[contains(@class, 'Search_search-result-container')]")
box = await answer_element.bounding_box()
logger.debug(f'answer_element: {box}')
view_port_height = box['height'] + 300
# Resize the viewport
await self.browser_page.set_viewport_size({
'width': 1920,
'height': int(view_port_height)
})
# Screenshot
screenshot_path = self._get_screenshot_path()
await self.browser_page.screenshot(path=screenshot_path)
self.ai_answer.screenshot_file = screenshot_path
return self.ai_answer
def get_platform_id(self) -> int:
return 13
def get_platform_name(self) -> str:
return 'Metaso'
async def __listen_response(self, response):
url = response.url
if 'searchV2' in url:
answer = ''
results = []
search_results = list()
response_text = await response.text()
event_lines = response_text.split('\n\n')
self.completed_event.set()
for line in event_lines:
if line.startswith('data:'):
line = line[5:]
try:
event_json = json.loads(line)
except JSONDecodeError:
continue
# Event type
type = event_json.get('type')
# Search result list
if type == 'set-reference':
search_results = event_json.get('list', [])
# for search_result in search_results:
# result = AiSearchResult(title=search_result.get('title', ''),
# url=search_result.get('url', ''),
# host_name=search_result.get('author', ''),
# body=search_result.get('displaySource'),
# publish_time=search_result.get('publish_time', ''))
# results.append(result)
# self.ai_answer.search_result = results
# Answer text chunk
if type == 'append-text':
answer = answer + event_json.get('text', '')
# Mark which references the answer actually cites via [N] markers
pattern = r'\[(\d+)\]'
index_data = list(set(re.findall(pattern, answer)))
for index, search_result in enumerate(search_results):
is_referenced = "1" if str(index + 1) in index_data else "0"
result = AiSearchResult(title=search_result.get('title', ''),
url=search_result.get('url', ''),
host_name=search_result.get('author', ''),
body=search_result.get('displaySource'),
publish_time=search_result.get('publish_time', ''),
is_referenced=is_referenced)
results.append(result)
self.ai_answer.search_result = results
self.ai_answer.answer = answer
self.completed_event.set()
def handle_listen_response_error(self, func):
"""
Wrap the response listener so any exception marks the run as failed and releases the completed event.
:param func: listener coroutine to wrap
:return: the wrapped coroutine
"""
@wraps(func)
async def wrapper(*args, **kwargs):
try:
return await func(*args, **kwargs)
except Exception as e:
logger.error(f"{self.get_platform_name()}响应异常: {e}", exc_info=True)
# Mark the failure and keep the exception
self.fail_status = True
self.fail_exception = e
self.completed_event.set()
return wrapper
async def run():
# playwright = await async_playwright().start()
# browser = await playwright.chromium.launch(headless=False,
# chromium_sandbox=config.BROWSER_ENABLE_SANDBOX,
# ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS,
# channel="chrome",
# args=config.BROWSER_ARGS)
playwright = await async_playwright().start()
browser = await playwright.firefox.launch(
headless=False,
ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS,
args=config.BROWSER_ARGS
)
spider = MetasoSpider(browser, '2025前端工具库top5', '')
await spider.run()
if __name__ == '__main__':
asyncio.run(run())
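Both Metaso and Nano mark citations in the answer as bracketed indices like [3]; the r'\[(\d+)\]' pattern used in both spiders recovers them. A toy check (illustrative input):

import re
answer = '推荐头豹[1],另见沙利文[3][1]'
print(sorted(set(re.findall(r'\[(\d+)\]', answer))))  # ['1', '3']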

174
spiders/ai_seo/nanometer.py

@@ -0,0 +1,174 @@
# coding=utf-8
import asyncio
import json
import re
from datetime import datetime
from functools import partial, wraps
from json import JSONDecodeError
from glom import glom
from playwright.async_api import Playwright, Browser, async_playwright
from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger, parse_nested_json
logger = create_logger(__name__)
class NanometerSpider(AbstractAiSeoSpider):
def __init__(self, browser: Browser, prompt: str, keyword: str):
super().__init__(browser, prompt, keyword)
self.load_session = False
self.__listen_response = self.handle_listen_response_error(self.__listen_response)
def get_home_url(self) -> str:
return 'https://www.n.cn/'
async def _do_spider(self) -> AiAnswer:
# Initialize run data
self._init_data()
# Start interacting
await self.browser_page.goto(self.get_home_url(), timeout=600000)
chat_input_element = self.browser_page.locator("//textarea[@id='composition-input']")
# Type the prompt
await chat_input_element.fill(self.prompt)
await self.browser_page.keyboard.press('Enter')
# Listen for responses
self.browser_page.on('response', partial(self.__listen_response))
await asyncio.sleep(2)
await self.completed_event.wait()
# Check for failures
if self.fail_status:
raise self.fail_exception
# Answer element
answer_element = self.browser_page.locator("//div[@class='js-article-content']").nth(-1)
box = await answer_element.bounding_box()
logger.debug(f'answer_element: {box}')
view_port_height = box['height'] + 500
# Resize the viewport
await self.browser_page.set_viewport_size({
'width': 1920,
'height': int(view_port_height)
})
# Screenshot
screenshot_path = self._get_screenshot_path()
await self.browser_page.screenshot(path=screenshot_path, full_page=True)
self.ai_answer.screenshot_file = screenshot_path
return self.ai_answer
def __parse_event_data(self, data_str):
# Split the body on 'id:' records, dropping the leading empty part
parts = data_str.strip().split('id:')[1:]
# Result list
result = []
# Walk each record and extract its key/value pairs
for part in parts:
lines = part.strip().split('\n')
item = {}
for line in lines:
if ':' not in line:
key = 'id'
value = line
else:
key, value = line.split(':', 1)
key = key.strip()
value = value.strip()
if key == 'data':
try:
# Try to decode the data field as JSON (json is already imported at module level)
value = json.loads(value)
except JSONDecodeError:
pass
item[key] = value
result.append(item)
return result
async def __listen_response(self, response):
if '/api/common/chat/v2' not in response.url:
return
# Read the streamed body
stream = await response.body()
response_text = stream.decode('utf-8')
datas = self.__parse_event_data(response_text)
answer = ''
search_result_list = list()
# Walk each record
for data in datas:
event = data.get('event', '')
if event == '200':
answer = answer + str(data.get('data', ''))
elif event == '102':
# JSON payload: parse the nested data
data = data.get('data', {})
if isinstance(data, str):
data = parse_nested_json(data)
data_type = data.get('type', '')
if data_type == 'search_result':
search_result_list = glom(data, 'message.list', default=[])
# # Save the search data (superseded by the citation-aware loop below)
# ai_search_result_list = []
# for search_result in search_result_list:
# title = search_result.get('title', '')
# url = search_result.get('url', '')
# body = search_result.get('summary', '')
# host_name = search_result.get('site', '未知')
# publish_time = search_result.get('date', 0)
# ai_search_result_list.append(
# AiSearchResult(title, url, host_name, body, publish_time)
# )
# logger.debug(f"ai参考资料: [{host_name}]{title}({url})")
# self.ai_answer.search_result = ai_search_result_list
# Mark which references the answer actually cites via [N] markers
pattern = r'\[(\d+)\]'
index_data = list(set(re.findall(pattern, answer)))
ai_search_result_list = []
for index,search_result in enumerate(search_result_list):
title = search_result.get('title', '')
url = search_result.get('url', '')
body = search_result.get('summary', '')
host_name = search_result.get('site', '未知')
publish_time = search_result.get('date', 0)
if str(index+1) in index_data:
is_referenced = "1"
else:
is_referenced = "0"
ai_search_result_list.append(
AiSearchResult(title, url, host_name, body, publish_time,is_referenced)
)
logger.debug(f"AI reference: [{host_name}]{title}({url})")
self.ai_answer.search_result = ai_search_result_list
self.ai_answer.answer = answer
logger.debug(f'AI answer: {answer}')
self.completed_event.set()
def get_platform_id(self) -> int:
return 7
def get_platform_name(self) -> str:
return 'Nano'
def handle_listen_response_error(self, func):
"""
:param func:
:return:
"""
@wraps(func)
async def wrapper(*args, **kwargs):
try:
return await func(*args, **kwargs)
except Exception as e:
logger.error(f"{self.get_platform_name()}响应异常: {e}", exc_info=True)
# Mark the failure and keep the exception
self.fail_status = True
self.fail_exception = e
self.completed_event.set()
return wrapper
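__parse_event_data splits the SSE-style body on 'id:' records and JSON-decodes each data field when possible. A toy input/output pair (illustrative, not a captured payload):

raw = 'id:1\nevent:200\ndata:hello\n\nid:2\nevent:102\ndata:{"type":"search_result"}'
# parsing raw yields:
# [{'id': '1', 'event': '200', 'data': 'hello'},
#  {'id': '2', 'event': '102', 'data': {'type': 'search_result'}}]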

176
spiders/ai_seo/tongyi.py

@@ -0,0 +1,176 @@
# coding=utf-8
import asyncio
import re
from functools import partial, wraps
from json import JSONDecodeError
from glom import glom
from playwright.async_api import Browser
from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger, parse_nested_json
from utils.image_utils import crop_image_left
logger = create_logger(__name__)
class TongyiSpider(AbstractAiSeoSpider):
def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False):
super().__init__(browser, prompt, keyword, think)
self.__listen_response = self.handle_listen_response_error(self.__listen_response)
def get_home_url(self) -> str:
return 'https://tongyi.aliyun.com'
async def _do_spider(self) -> AiAnswer:
# Initialize run data
self._init_data()
await self.browser_page.goto(self.get_home_url(), timeout=600000)
if self.think:
search_btn = self.browser_page.locator("div:text('深度思考')")
if await search_btn.is_visible():
await search_btn.click()
await asyncio.sleep(1)
else:
search_btn = self.browser_page.locator("div:text('联网搜索')")
if await search_btn.is_visible():
await search_btn.click()
await asyncio.sleep(1)
# Start interacting
# chat_input_element = self.browser_page.locator("//textarea[@placeholder='千事不决问通义']")
chat_input_element = self.browser_page.locator("//textarea[contains(@class, 'ant-input')]")
await chat_input_element.click()
# Type the prompt
await self.browser_page.keyboard.type(self.prompt)
await asyncio.sleep(2)
await self.browser_page.keyboard.press('Enter')
# Listen for responses
self.browser_page.on('response', partial(self.__listen_response))
await asyncio.sleep(2)
await self.completed_event.wait()
# Check for failures
if self.fail_status:
raise self.fail_exception
# Answer element
answer_element = self.browser_page.locator("//div[contains(@class, 'answerItem')]").nth(-1)
box = await answer_element.bounding_box()
logger.debug(f'answer_element: {box}')
view_port_height = box['height'] + 500
# Resize the viewport
await self.browser_page.set_viewport_size({
'width': 1920,
'height': int(view_port_height)
})
# Open the search results panel
search_list_element = self.browser_page.locator("//div[contains(@class, 'linkTitle')]").nth(-1)
if await search_list_element.is_visible():
await search_list_element.click()
await asyncio.sleep(2)
# Close the side console
side_console_element = self.browser_page.locator("//span[contains(@class, 'sc-frniUE')]")
if await side_console_element.is_visible():
await side_console_element.click()
# Screenshot
screenshot_path = self._get_screenshot_path()
await self.browser_page.screenshot(path=screenshot_path)
# Crop off the left sidebar
crop_image_left(screenshot_path, 340)
self.ai_answer.screenshot_file = screenshot_path
return self.ai_answer
async def __listen_response(self, response):
if '/dialog/conversation' not in response.url:
return
# Read the streamed body
data = {}
stream = await response.body()
response_text = stream.decode('utf-8')
datas = response_text.split("\n")
# convert each valid frame into a dict
for data_str in datas:
if not data_str or data_str == 'data: [DONE]':
continue
data_str = data_str.replace('data: ', '')
# parse_nested_json returns the default ({}) instead of raising JSONDecodeError,
# so a falsy check replaces the dead try/except that was here
data = parse_nested_json(data_str)
if not data:
continue
logger.debug(f"frame: {data}")
# pull the answer fragments
contents = data.get('contents', [])
# collected search results
ai_search_result_list = []
search_result_list = []
for content in contents:
content_type = content.get('contentType', '')
if content_type == 'plugin':
logger.debug(f"获取到联网搜索结果")
if self.think:
search_result_list = glom(content, 'content.pluginResult', default=[])
else:
search_result_list = glom(content, 'content.pluginResult.-1.search_results', default=[])
if content_type == 'text':
logger.debug('got the AI answer')
answer = content.get('content', '')
logger.debug(f"AI answer: {answer}")
self.ai_answer.answer = answer
# indexes the answer cites via markers like [ty-reference](3)
pattern = r'ty-reference]\((\d+)\)'
index_data = list(set(re.findall(pattern, self.ai_answer.answer)))
for index, search_result in enumerate(search_result_list):
url = search_result.get('url', '')
title = search_result.get('title', '')
body = search_result.get('body', '')
host_name = search_result.get('host_name', '未知')
publish_time = search_result.get('time', 0)
is_referenced = "1" if str(index + 1) in index_data else "0"
logger.debug(f"AI reference: [{host_name}]{title}({url})")
ai_search_result_list.append(
AiSearchResult(title=title, url=url, body=body, host_name=host_name, publish_time=publish_time, is_referenced=is_referenced)
)
if ai_search_result_list:
self.ai_answer.search_result = ai_search_result_list
self.completed_event.set()
def handle_listen_response_error(self, func):
"""
Wrap a response listener: on any exception, record the failure and release the completed event.
:param func: the async listener to wrap
:return: the wrapped listener
"""
@wraps(func)
async def wrapper(*args, **kwargs):
try:
return await func(*args, **kwargs)
except Exception as e:
logger.error(f"{self.get_platform_name()} response error: {e}", exc_info=True)
# mark the failure and keep the exception for the caller
self.fail_status = True
self.fail_exception = e
self.completed_event.set()
return wrapper
def get_platform_id(self) -> int:
return 2
def get_platform_name(self) -> str:
return 'TongYi'
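A quick self-contained check of the citation matching above; the marker text is illustrative, shaped like Tongyi's [ty-reference](n) markup:

import re

answer = "A solid pick[ty-reference](2), see also[ty-reference](4)."
cited = set(re.findall(r'ty-reference]\((\d+)\)', answer))
for index in range(4):
    is_referenced = "1" if str(index + 1) in cited else "0"
    print(index + 1, is_referenced)  # indexes 2 and 4 print "1"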

213
spiders/ai_seo/yiyan.py

@@ -0,0 +1,213 @@
# coding=utf-8
import asyncio
import json
from functools import partial, wraps
from glom import glom
from playwright.async_api import async_playwright, Browser, TimeoutError as PlaywrightTimeoutError
import config
from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger
from utils.ai_seo_api_utils import AiSeoApis
from utils.image_utils import crop_image_left
logger = create_logger(__name__)
class YiYanSpider(AbstractAiSeoSpider):
def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False):
super().__init__(browser, prompt, keyword, think)
self.__listen_response = self.handle_listen_response_error(self.__listen_response)
def get_home_url(self) -> str:
return 'https://yiyan.baidu.com/'
async def _do_spider(self) -> AiAnswer:
# initialise the run state
self._init_data()
await self.browser_page.goto(self.get_home_url(), timeout=600000)
await asyncio.sleep(2)
# verify the login state
await self.check_login()
if self.think:
think_btn = self.browser_page.locator("span:text('深度思考(X1 Turbo)')").locator('..')
if await think_btn.is_visible():
# read the class list only once the button is known to exist
clazz = ((await think_btn.get_attribute('class')) or '').split(' ')
if len(clazz) == 1:
await think_btn.click()
await asyncio.sleep(2)
# start interacting
chat_input_element = self.browser_page.locator("//div[@class='yc-editor']")
await chat_input_element.click()
await asyncio.sleep(2)
# type the prompt
await self.browser_page.keyboard.insert_text(self.prompt)
await asyncio.sleep(2)
await self.browser_page.keyboard.press('Enter')
# listen for responses
self.browser_page.on('response', partial(self.__listen_response))
if self.think:
self.browser_page.on('response', partial(self.__listen_response_thinking))
await asyncio.sleep(2)
try:
await self.browser_page.wait_for_selector("//div[@data-auto-test='anew_response']", state='attached', timeout=600000)
logger.debug('AI answer complete')
except PlaywrightTimeoutError:
# the builtin TimeoutError never catches Playwright's timeout, hence the Playwright class
logger.error('AI answer timed out')
# check for a recorded failure
if self.fail_status:
raise self.fail_exception
# locate the answer element
answer_element = self.browser_page.locator("//div[contains(@class, 'dialog-card-wrapper')]").nth(-1)
box = await answer_element.bounding_box()
logger.debug(f'answer_element: {box}')
view_port_height = box['height'] + 1000
# resize the viewport to fit the answer
await self.browser_page.set_viewport_size({
'width': 1920,
'height': int(view_port_height)
})
# expand the web sources
open_search_btn_element = self.browser_page.locator("div:text('条网页信息源')")
if await open_search_btn_element.count() > 0:
await open_search_btn_element.click()
# screenshot
screenshot_path = self._get_screenshot_path()
await self.browser_page.screenshot(path=screenshot_path)
# crop the sidebar off the image
crop_image_left(screenshot_path, 260)
self.ai_answer.screenshot_file = screenshot_path
return self.ai_answer
async def __listen_response(self, response):
if '/chat/history' not in response.url:
return
answer = ''
chat = {}
json_data = await response.json()
chats = list(dict.values(glom(json_data, 'data.chats', default={})))
# pick the robot chat that actually has content
for _chat in chats:
if _chat.get('role', '') != 'robot':
continue
content = glom(_chat, 'message.0.content', default='')
if not content:
continue
chat = _chat
break
if not chat:
return
answer = glom(chat, 'message.0.content', default="")
# search results (only present outside thinking mode)
if not self.think:
search_result_list = glom(chat, 'searchCitations.list', default=[])
# collect the search results
ai_search_result_list = []
for search_result in search_result_list:
url = search_result.get('url', '')
title = search_result.get('title', '')
desc = search_result.get('wild_abstract', '')
host_name = search_result.get('site', '')
date = search_result.get('date', '')
logger.debug(f"ai参考资料: [{host_name}][{date}]{title}({url})")
ai_search_result_list.append(AiSearchResult(
url=url,
title=title,
host_name=host_name,
body=desc,
publish_time=date
))
self.ai_answer.search_result = ai_search_result_list
self.ai_answer.answer = answer
async def __listen_response_thinking(self, response):
if '/chat/conversation/v2' not in response.url:
return
# read the streamed body
data = {}
search_list = []
stream = await response.body()
response_text = stream.decode('utf-8')
response_lines = response_text.split("\n\n")
for line in response_lines:
sub_lines = line.split("\n")
for sub_line in sub_lines:
if sub_line.startswith('data:'):
json_data = json.loads(sub_line[5:])
history_need = json_data.get('historyNeed', None)
search_list = json_data.get('contents', [])
if history_need == 1 and search_list:
break
if search_list:
break
ai_search_result_list = []
for search_result in search_list:
url = search_result.get('url', '')
title = search_result.get('title', '')
desc = search_result.get('siteAbstract', '')
host_name = search_result.get('name', '')
date = search_result.get('publishTime', '')
logger.debug(f"ai参考资料: [{host_name}][{date}]{title}({url})")
ai_search_result_list.append(AiSearchResult(
url=url,
title=title,
host_name=host_name,
body=desc,
publish_time=date
))
self.ai_answer.search_result = ai_search_result_list
async def check_login(self):
# the sidebar only renders after login
try:
await self.browser_page.locator("//div[@id='eb_sidebar']").wait_for(state='attached', timeout=20000)
except Exception:
# flag the session as invalid on the server
await AiSeoApis.update_spider_session(self.session_info['id'], 2)
raise Exception(f"{self.get_platform_name()} login failed, session_id: {self.session_info['id']}")
def get_platform_id(self) -> int:
return 6
def get_platform_name(self) -> str:
return 'YiYan'
def handle_listen_response_error(self, func):
"""
Wrap a response listener: on any exception, record the failure and release the completed event.
:param func: the async listener to wrap
:return: the wrapped listener
"""
@wraps(func)
async def wrapper(*args, **kwargs):
try:
return await func(*args, **kwargs)
except Exception as e:
logger.error(f"{self.get_platform_name()} response error: {e}", exc_info=True)
# mark the failure and keep the exception for the caller
self.fail_status = True
self.fail_exception = e
self.completed_event.set()
return wrapper
async def run():
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=config.BROWSER_HANDLESS,
chromium_sandbox=config.BROWSER_ENABLE_SANDBOX,
ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS,
channel="chrome",
args=config.BROWSER_ARGS)
spider = YiYanSpider(browser, '你好', '')
await spider.run()
if __name__ == '__main__':
asyncio.run(run())
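The thinking-mode listener splits the stream on blank lines and takes the first frame carrying both historyNeed == 1 and a contents list; a minimal sketch with an illustrative payload:

import json

body = 'event: message\ndata: {"historyNeed": 1, "contents": [{"url": "https://example.com", "title": "t"}]}\n\n'
for frame in body.split("\n\n"):
    for line in frame.split("\n"):
        if line.startswith('data:'):
            payload = json.loads(line[5:])
            if payload.get('historyNeed') == 1 and payload.get('contents'):
                print(payload['contents'][0]['title'])  # -> t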

174
spiders/ai_seo/yuanbao.py

@@ -0,0 +1,174 @@
# coding=utf-8
import asyncio
import re
from functools import partial, wraps
from playwright.async_api import Browser
from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger
from glom import glom
from utils.image_utils import crop_image_left
logger = create_logger(__name__)
class YuanBaoSpider(AbstractAiSeoSpider):
def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False):
super().__init__(browser, prompt, keyword, think)
self.__listen_response = self.handle_listen_response_error(self.__listen_response)
def get_home_url(self) -> str:
return 'https://yuanbao.tencent.com/'
async def _do_spider(self) -> AiAnswer:
# initialise the run state
self._init_data()
self.is_get_detail = False
await self.browser_page.goto(self.get_home_url(), timeout=600000)
await asyncio.sleep(2)
# enable deep-think mode
if self.think:
think_button = self.browser_page.locator("//button[@dt-button-id='deep_think']")
if await think_button.is_visible():
model_id = await think_button.get_attribute('dt-model-id')
if not model_id == 'deep_seek':
await think_button.click()
await asyncio.sleep(2)
# enable web search
search_button = self.browser_page.locator("//button[@dt-button-id='online_search']")
if await search_button.is_visible():
class_str = await search_button.get_attribute('class')
clazz = class_str.split(' ')
if 'checked' not in clazz:
logger.debug('web search is off, enabling it')
await search_button.click()
await asyncio.sleep(1)
# start interacting
chat_input_element = self.browser_page.locator("//div[contains(@class, 'chat-input-editor')]")
await chat_input_element.click()
# type the prompt
await self.browser_page.keyboard.type(self.prompt)
await asyncio.sleep(2)
await self.browser_page.keyboard.press('Enter')
# listen for responses
self.browser_page.on('response', partial(self.__listen_response))
await self.completed_event.wait()
# check for a recorded failure
if self.fail_status:
raise self.fail_exception
# locate the answer element
answer_element = self.browser_page.locator("//div[@class='agent-chat__list__item__content']").nth(-1)
box = await answer_element.bounding_box()
logger.debug(f'answer_element: {box}')
view_port_height = box['height'] + 500
# resize the viewport to fit the answer
await self.browser_page.set_viewport_size({
'width': 1920,
'height': int(view_port_height)
})
# collapse the sidebar (currently disabled)
# await self.browser_page.locator("//div[@data-desc='fold']").click()
# expand the web search results
search_list_element = self.browser_page.locator("(//div[contains(@data-title, '资料作为参考')])[1]/span")
if await search_list_element.is_visible():
await search_list_element.click()
# screenshot
screenshot_path = self._get_screenshot_path()
await self.browser_page.screenshot(path=screenshot_path)
crop_image_left(screenshot_path, 260)
self.ai_answer.screenshot_file = screenshot_path
return self.ai_answer
async def __listen_response(self, response):
if '/agent/conversation/v1/detail' not in response.url or self.is_get_detail:
return
json_data = await response.json()
# pick the conv whose content carries more than one item
if not json_data['convs']:
return
convs = json_data['convs']
content = {}
for conv in convs:
key = 'speechesV2.0.content'  # glom resolves '0' as a list index
content = glom(conv, key, default=[])
if len(content) > 1:
break
# walk the content items
search_list = None
think = None  # extracted below but not used yet
text = None
for item in content:
if item['type'] == 'text':
text = item.get('msg', '')
elif item['type'] == 'searchGuid':
search_list = item.get('docs', [])
elif item['type'] == 'think':
think = item.get('content', '')
logger.debug(f'AI answer: {text}')
ai_search_result_list = []
self.ai_answer.answer = text
if search_list:
# indexes the answer cites with markers like [^3]
pattern = r'\[\^(\d+)\]'
index_data = list(set(re.findall(pattern, self.ai_answer.answer or '')))
for index, search_result in enumerate(search_list):
is_referenced = "1" if str(index + 1) in index_data else "0"
ai_search_result_list.append(
AiSearchResult(
title=search_result.get('title', ''),
url=search_result.get('url', ''),
host_name=search_result.get('web_site_name', ''),
body=search_result.get('quote', ''),
publish_time=search_result.get('publish_time', 0),
is_referenced=is_referenced
)
)
logger.debug(f'AI references: {search_list}')
self.ai_answer.search_result = ai_search_result_list
self.is_get_detail = True
self.completed_event.set()
def handle_listen_response_error(self, func):
"""
Wrap a response listener: on any exception, record the failure and release the completed event.
:param func: the async listener to wrap
:return: the wrapped listener
"""
@wraps(func)
async def wrapper(*args, **kwargs):
try:
return await func(*args, **kwargs)
except Exception as e:
logger.error(f"{self.get_platform_name()} response error: {e}", exc_info=True)
# mark the failure and keep the exception for the caller
self.fail_status = True
self.fail_exception = e
self.completed_event.set()
return wrapper
def get_platform_id(self) -> int:
return 3
def get_platform_name(self) -> str:
return 'YuanBao'
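The detail listener leans on glom resolving numeric path segments against lists ('speechesV2.0.content'); a minimal illustration with made-up data:

from glom import glom

conv = {'speechesV2': [{'content': [{'type': 'text', 'msg': 'hi'}]}]}
print(glom(conv, 'speechesV2.0.content', default=[]))  # -> [{'type': 'text', 'msg': 'hi'}]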

7
static/stealth.min.js
File diff suppressed because it is too large

111
utils/__init__.py

@@ -0,0 +1,111 @@
# coding=utf-8
import hashlib
import json
import re
from datetime import datetime
from typing import Any, Union
from .logger_utils import create_logger
def parse_nested_json(
json_str: str,
default: Any = None,
recursive: bool = True
) -> Union[dict, list, Any]:
"""
JSON字符串
:param json_str: JSON字符串
:param default:
:param recursive: JSON字符串
:return: Python对象
"""
def _parse(obj: Any) -> Any:
# recursively parse nested structures
if isinstance(obj, dict):
return {k: _parse(v) for k, v in obj.items()}
elif isinstance(obj, list):
return [_parse(elem) for elem in obj]
elif recursive and isinstance(obj, str):
try:
parsed = json.loads(obj)
return _parse(parsed)  # recurse into the newly decoded object
except json.JSONDecodeError:
return obj
else:
return obj
# handle empty input
if not json_str:
return default if default is not None else {}
try:
# first pass: decode the outer JSON
parsed = json.loads(json_str)
# then recursively decode nested JSON strings
return _parse(parsed)
except (TypeError, json.JSONDecodeError):
# decode and type errors fall back to the default
return default if default is not None else {}
except Exception:
# any other error also falls back to the default (could be logged)
return default if default is not None else {}
def convert_timestamp(timestamp):
"""
datetime
:
timestamp --
:
datetime
"""
# 如果时间戳大于 1e10,认为是毫秒级时间戳
if timestamp > 1e10:
timestamp /= 1000.0 # 转换为秒级时间戳
return datetime.fromtimestamp(timestamp)
def make_sha256_hash(file_path: str) -> str:
"""
:param file_path:
:return:
"""
hash_obj = hashlib.new("sha256")
with open(file_path, "rb") as f:
while chunk := f.read(8192): # 分块读取避免大文件内存溢出
hash_obj.update(chunk)
return hash_obj.hexdigest()
def css_to_dict(css_line):
"""
CSS变量声明转换为Python字典
:
css_line (str): CSS变量声明的字符串
'--ds-button-color: #fff; -button-text-color: #4c4c4c; button-border-color: rgba(0, 0, 0, 0.12);'
:
dict: CSS变量名和值的字典
"""
# 使用正则表达式匹配所有变量声明
# 匹配格式:可能带有1-2个横线的变量名,然后是冒号和值
pattern = r'(-{0,2}[a-zA-Z0-9-]+)\s*:\s*([^;]+);?'
matches = re.findall(pattern, css_line)
# fold the matches into a dict
result = {}
for match in matches:
var_name = match[0]  # keep the original number of leading dashes
var_value = match[1].strip()
result[var_name] = var_value
return result
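A minimal usage sketch for parse_nested_json; the payload is illustrative and mimics an SSE frame whose field is itself JSON-encoded:

from utils import parse_nested_json

line = 'data: {"contents": "[{\\"contentType\\": \\"text\\"}]"}'
payload = parse_nested_json(line.replace('data: ', ''))
# the nested JSON string is decoded too, so this is already a list of dicts
print(payload['contents'][0]['contentType'])  # -> text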

114
utils/ai.py

@@ -0,0 +1,114 @@
# coding=utf-8
import asyncio
import json
from openai import OpenAI
from utils import create_logger
from utils.ai_seo_api_utils import AiSeoApis
import config
logger = create_logger(platform="ai")
client = OpenAI(api_key=config.OPENAI_API_KEY, base_url="https://api.deepseek.com")
async def main():
results = await AiSeoApis.get_task_result_list(project_id=3)
for result in results:
if result['read_rank_status'] == 1:
logger.info(f"[{result['id']}] rank already read")
continue
prompt = f"""
Task: list the brand names mentioned in the content below, in order of appearance.
Format: return a JSON object with a "brands" field holding the ordered brand names.
Content:
{result['content']}
"""
response = client.chat.completions.create(
model="deepseek-chat",
response_format={
'type': 'json_object'
},
messages=[
{"role": "system", "content": "You are a helpful assistant"},
{"role": "user", "content": prompt},
],
stream=False
)
# parse the JSON returned by the AI
ai_json_result = {}
# rank reported back
rank = 0
# rank-reading status (2 = failed, 1 = ok)
read_rank_status = 2
try:
ai_json_result = json.loads(response.choices[0].message.content)
read_rank_status = 1
except Exception as e:
logger.error(f"[{result['id']}] failed to parse the AI JSON: {e}")
logger.error(f"AI prompt: {prompt}")
continue
# read the rank
brands = ai_json_result.get('brands', [])
for index, brand in enumerate(brands, start=1):
if '沙利文' in brand:
rank = index
logger.info(f"[{result['id']}] brand mentioned, rank: {rank}")
break
# push the rank back
update_result = await AiSeoApis.update_result_rank(result['id'], rank, read_rank_status)
logger.info(f"[{result['id']}] rank update result: {update_result}")
async def read_rank(content, brand_word):
# split the comma-separated brand keywords
brand_words = brand_word.split(',')
prompt = f"""
Task: list the brand names mentioned in the content below, in order of appearance.
Format: return a JSON object with a "brands" field holding the ordered brand names.
Content:
{content}
"""
response = client.chat.completions.create(
model="deepseek-chat",
response_format={
'type': 'json_object'
},
messages=[
{"role": "system", "content": "You are a helpful assistant"},
{"role": "user", "content": prompt},
],
stream=False
)
# parse the JSON returned by the AI
ai_json_result = {}
# rank reported back
rank = 0
# rank-reading status (2 = failed, 1 = ok)
read_rank_status = 2
try:
ai_json_result = json.loads(response.choices[0].message.content)
read_rank_status = 1
except Exception as e:
logger.error(f"failed to parse the AI JSON: {e}")
logger.error(f"AI prompt: {prompt}")
return [], 0
# read the rank
brands = ai_json_result.get('brands', [])
for index, brand in enumerate(brands, start=1):
if any(sub in brand for sub in brand_words):
rank = index
logger.info(f"brand mentioned, rank: {rank}")
break
return brands, rank
if __name__ == '__main__':
asyncio.run(main())
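A hedged usage sketch of read_rank; the answer text and brand words are made up, and a valid OPENAI_API_KEY in config is assumed:

import asyncio
from utils.ai import read_rank

async def demo():
    # hypothetical AI answer naming three brands
    answer = "Leading firms: 1. BrandA 2. 沙利文 3. BrandC"
    brands, rank = await read_rank(answer, '沙利文,Sullivan')
    print(brands, rank)  # rank should be 2 if the model lists brands in order

asyncio.run(demo())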

275
utils/ai_seo_api_utils.py

@@ -0,0 +1,275 @@
# coding=utf-8
import datetime
import json
import os
import httpx
import config
from utils import create_logger
logger = create_logger(__name__)
class AiSeoApis:
@staticmethod
def build_full_url(uri):
return f"{config.AI_SEO_BASE_URL}{uri}"
@staticmethod
async def get_one_task(date='', platform_ids=''):
"""
:return:
"""
uri = '/api/third/getTask'
url = AiSeoApis.build_full_url(uri)
params = {**config.AI_SEO_API_AUTH}
if date:
params['date'] = date
if platform_ids:
params['platform_ids'] = platform_ids
async with httpx.AsyncClient() as client:
response = await client.get(url, params=params, timeout=60)
json_result = response.json()
if json_result['code'] != 0:
logger.error(f"Failed to get task: {json_result['msg']}")
return json_result['data']
@staticmethod
async def get_urgent_task_count():
"""
:return:
"""
uri = '/api/frontend/thirdParty/getUrgentTaskCount'
url = AiSeoApis.build_full_url(uri)
params = {**config.AI_SEO_API_AUTH}
async with httpx.AsyncClient() as client:
response = await client.get(url, params=params, timeout=60)
json_result = response.json()
if json_result['code'] != 0:
logger.error(f"Failed to get urgent task count: {json_result['msg']}")
return json_result['data']
@staticmethod
async def upload_screenshot_file(file_path):
"""
:param file_path:
:return:
"""
uri = '/api/third/oss/upload'
url = AiSeoApis.build_full_url(uri)
params = {
**config.AI_SEO_API_AUTH,
'oss_path': 'ai_seo/screenshot'
}
with open(file_path, 'rb') as file:
async with httpx.AsyncClient() as client:
files = {'file': (file_path, file, 'image/jpeg')}
response = await client.post(url, params=params, files=files, timeout=60)
json_result = response.json()
if json_result['code'] != 0:
logger.error(f"Failed to upload screenshot: {json_result['msg']}")
return json_result['data']
@staticmethod
async def submit_task(json_data):
"""
:param json_data:
:return:
"""
uri = '/api/third/submitProjectTask'
url = AiSeoApis.build_full_url(uri)
async with httpx.AsyncClient() as client:
print("json_data",json.dumps(json_data))
response = await client.post(url, json=json_data, timeout=120)
json_result = response.json()
if json_result['code'] != 0:
logger.error(f"Failed to submit task: {json_result['msg']}")
return json_result['data']
@staticmethod
async def get_task_result_list(project_id):
"""
:return:
"""
uri = '/api/frontend/thirdParty/projectResult/list'
url = AiSeoApis.build_full_url(uri)
params = {**config.AI_SEO_API_AUTH, 'project_id': project_id}
async with httpx.AsyncClient() as client:
response = await client.get(url, params=params)
json_result = response.json()
if json_result['code'] != 0:
logger.error(f"Failed to get task result list: {json_result['msg']}")
return json_result['data']
@staticmethod
async def update_result_rank(result_id, rank, read_rank_status):
"""
:return:
"""
uri = '/api/frontend/thirdParty/projectResult/updateRank'
url = AiSeoApis.build_full_url(uri)
json_data = {**config.AI_SEO_API_AUTH, 'id': result_id, 'rank': rank, 'read_rank_status': read_rank_status}
async with httpx.AsyncClient() as client:
response = await client.post(url, json=json_data)
json_result = response.json()
if json_result['code'] != 0:
logger.error(f"Failed to update result rank: {json_result['msg']}")
return json_result['data']
@staticmethod
async def update_task_status(task_id, status):
"""
:param task_id:
:param status:
:return:
"""
uri = '/api/third/updateTask'
url = AiSeoApis.build_full_url(uri)
json_data = {**config.AI_SEO_API_AUTH, 'task_id': task_id, 'status': status}
async with httpx.AsyncClient() as client:
response = await client.post(url, json=json_data, timeout=60)
json_result = response.json()
if json_result['code'] != 0:
logger.error(f"Failed to update task: {json_result['msg']}")
return None
return json_result['data']
@staticmethod
async def heartbeat(dc_id, load_count=0):
"""
:param dc_id:
:param load_count:
:return:
"""
uri = '/api/frontend/thirdParty/spider/heartbeat'
url = AiSeoApis.build_full_url(uri)
send_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
json_data = {
**config.AI_SEO_API_AUTH,
'dc_id': dc_id,
'load_count': load_count,
'send_time': send_time
}
async with httpx.AsyncClient() as client:
response = await client.post(url, json=json_data, timeout=60)
json_result = response.json()
if json_result['code'] != 0:
logger.error(f"Heartbeat failed: {json_result['msg']}")
return None
return json_result['data']
@staticmethod
async def get_spider_session(platform_id):
"""
:param platform_id:
:return:
"""
uri = '/api/third/getOneSpiderSession'
url = AiSeoApis.build_full_url(uri)
json_data = {**config.AI_SEO_API_AUTH, 'platform_id': platform_id}
async with httpx.AsyncClient() as client:
response = await client.get(url, params=json_data, timeout=60)
json_result = response.json()
if json_result['code'] != 0:
logger.error(f"Failed to get spider session: {json_result['msg']}")
return None
return json_result['data']
@staticmethod
async def download_spider_session_file(url, path):
"""
:param url:
:param path:
:return:
"""
# 获取文件所在目录
dir_path = os.path.dirname(path)
os.makedirs(dir_path, exist_ok=True)
async with httpx.AsyncClient(verify=False) as client:
response = await client.get(url, follow_redirects=True)
with open(path, 'wb') as file:
file.write(response.content)
@staticmethod
async def update_spider_session(session_id, status=1):
"""
:param session_id:
:param status:
:return:
"""
uri = '/api/frontend/thirdParty/spider/session/update'
url = AiSeoApis.build_full_url(uri)
json_data = {**config.AI_SEO_API_AUTH, 'id': session_id, 'status': status}
async with httpx.AsyncClient() as client:
response = await client.post(url, json=json_data, timeout=60)
json_result = response.json()
if json_result['code'] != 0:
logger.error(f"Failed to update spider session: {json_result['msg']}")
return None
return json_result['data']
@staticmethod
async def upload_session_file(file_path):
"""
session文件
:param file_path:
:return:
"""
uri = '/api/frontend/thirdParty/oss/upload'
url = AiSeoApis.build_full_url(uri)
params = {
**config.AI_SEO_API_AUTH,
'oss_path': 'ai_seo/session'
}
with open(file_path, 'rb') as file:
async with httpx.AsyncClient() as client:
files = {'file': (file_path, file, 'application/json')}
response = await client.post(url, params=params, files=files, timeout=60)
json_result = response.json()
if json_result['code'] != 0:
logger.error(f"Failed to upload session file: {json_result['msg']}")
return json_result['data']
@staticmethod
async def save_spider_session(platform_id, file_url, file_hash, account=''):
"""
session
:param file_url:
:param platform_id:
:param url:
:param file_hash:
:param account:
:return:
"""
uri = '/api/frontend/thirdParty/spider/session/save'
url = AiSeoApis.build_full_url(uri)
json_data = {
**config.AI_SEO_API_AUTH,
'platform_id': platform_id,
'account': account,
'url': file_url,
'hash': file_hash
}
async with httpx.AsyncClient() as client:
response = await client.post(url, json=json_data, timeout=120)
json_result = response.json()
if json_result['code'] != 0:
logger.error(f"Failed to save session: {json_result['msg']}")
return json_result['data']
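A minimal sketch of a worker heartbeat loop on top of these helpers; the dc_id value and the 60-second interval are illustrative assumptions:

import asyncio
from utils.ai_seo_api_utils import AiSeoApis

async def heartbeat_loop(dc_id: str, interval: int = 60):
    # report liveness until cancelled; a real worker would pass its current load
    while True:
        await AiSeoApis.heartbeat(dc_id, load_count=0)
        await asyncio.sleep(interval)

if __name__ == '__main__':
    asyncio.run(heartbeat_loop('spider-01'))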

43
utils/image_utils.py

@@ -0,0 +1,43 @@
# coding=utf-8
from PIL import Image
import os
from utils import create_logger
logger = create_logger(__name__)
def crop_image_left(image_path, crop_width):
"""
:
image_path (str):
crop_width (int):
"""
try:
# open the source image
with Image.open(image_path) as img:
width, height = img.size
# validate the crop width
if crop_width <= 0 or crop_width >= width:
raise ValueError(f"crop width must be greater than 0 and less than the image width ({width}px)")
# crop box is (left, upper, right, lower)
crop_box = (crop_width, 0, width, height)
cropped_img = img.crop(crop_box)
# save to a temp file first, then atomically replace the original
temp_path = image_path + ".png"
cropped_img.save(temp_path, quality=95)
os.replace(temp_path, image_path)
logger.info(f"cropped {crop_width}px off the left and overwrote the original")
except Exception as e:
logger.error(f"failed to process image: {e}")
if 'temp_path' in locals() and os.path.exists(temp_path):
os.remove(temp_path)
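Usage is a single call; the 340px matches what the Tongyi spider strips, and the path here is illustrative:

from utils.image_utils import crop_image_left

crop_image_left('./data/screenshot/TongYi_example.png', 340)  # drop the 340px sidebar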

48
utils/logger_utils.py

@@ -0,0 +1,48 @@
# coding=utf-8
import sys
from loguru import logger
import os
from datetime import datetime
import config
def create_logger(platform: str):
"""
:param platform:
:param name:
"""
# monthly log directory
current_month = datetime.now().strftime("%Y-%m")
log_dir = os.path.join("logs", current_month)
os.makedirs(log_dir, exist_ok=True)
# one log file per day of the month
current_day = datetime.now().strftime("%d")
log_file = os.path.join(log_dir, f"{current_day}.log")
# loguru sinks are global: each call to create_logger reconfigures them
logger.remove()
# file sink
logger.add(
log_file,
level=config.LOG_LEVEL,
rotation="1 day",  # rotate daily
retention="3 months",  # keep three months of logs
encoding="utf-8",
enqueue=True,  # asynchronous writes
backtrace=True,  # include tracebacks
diagnose=True,  # include variable diagnostics
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level}</level> | "
"<cyan>{extra[platform]}</cyan> | <level>{message}</level>"
)
# console sink
logger.add(
sys.stderr,
level=config.LOG_LEVEL,
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level}</level> | "
"<cyan>{extra[platform]}</cyan> | <level>{message}</level>",
colorize=True
)
# bind the platform as default context
return logger.bind(platform=platform)
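Usage, with the platform tag showing up in the third column of each line (the timestamp below is illustrative):

from utils.logger_utils import create_logger

logger = create_logger("demo")
logger.info("hello")  # 2025-01-01 12:00:00 | INFO | demo | hello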

42
utils/session_utils.py

@@ -0,0 +1,42 @@
# coding=utf-8
from pathlib import Path
import config
from utils.ai_seo_api_utils import AiSeoApis
async def get_spider_session(platform_id):
"""
session
:param platform_id:
:return:
"""
base_path = f'{config.ROOT_PATH}/data/session_data'
# session info from the API
session_info = await AiSeoApis.get_spider_session(platform_id)
if not session_info:
raise Exception(f"No available spider session for platform id {platform_id}")
# look for the cached session file by id
target = search_session_file(session_info['id'], base_path)
# download and cache the file if it is missing
if not target:
await AiSeoApis.download_spider_session_file(session_info['url'], f"{base_path}/{session_info['id']}.json")
target = f"{session_info['id']}.json"
else:
target = target[0]
session_info['session_path'] = f"{base_path}/{target}"
return session_info
def search_session_file(session_id, path):
# look for a cached session file named <session_id>.json
folder_path = Path(path)
file_filter = f"{session_id}.json"
return [file.name for file in folder_path.glob(file_filter)]
async def main():
path = await get_spider_session(1)
print(path)
if __name__ == '__main__':
import asyncio
asyncio.run(main())