commit 2d51402f2d
42 changed files with 3102 additions and 0 deletions
abs_spider.py                      +135
config.py                          +76
config.py.example                  +92
data/session/deepseek.json         +1
data/session/doubao.json           +1
data/session/kimi.json             +1
data/session/metaso.json           +1
data/session/tongyi.json           +1
data/session/yiyan.json            +1
data/session/yuanbao.json          +1
data/session_data/deepseek.json    +1
data/session_data/doubao.json      +1
data/session_data/kimi.json        +1
data/session_data/metaso.json      +1
data/session_data/tongyi.json      +1
data/session_data/yiyan.json       +1
data/session_data/yuanbao.json     +1
domain/__init__.py                 +1
domain/ai_seo.py                   +68
login.py                           +100
main.py                            +85
requirements.txt                   +12
resave.py                          +78
run.py                             +238
run_deepseek.py                    +119
spiders/__init__.py                +1
spiders/ai_seo/__init__.py         +9
spiders/ai_seo/deepseek.py         +189
spiders/ai_seo/doubao.py           +164
spiders/ai_seo/kimi.py             +148
spiders/ai_seo/metaso.py           +196
spiders/ai_seo/nanometer.py        +174
spiders/ai_seo/tongyi.py           +176
spiders/ai_seo/yiyan.py            +213
spiders/ai_seo/yuanbao.py          +174
static/stealth.min.js              +7
utils/__init__.py                  +111
utils/ai.py                        +114
utils/ai_seo_api_utils.py          +275
utils/image_utils.py               +43
utils/logger_utils.py              +48
utils/session_utils.py             +42
abs_spider.py
@@ -0,0 +1,135 @@
# coding=utf-8
import asyncio
import uuid
from abc import ABC, abstractmethod
from asyncio import Event

from playwright.async_api import Browser, BrowserContext, Page

import config
from domain.ai_seo import AiAnswer
from utils import create_logger
from utils.session_utils import get_spider_session

logger = create_logger("abs_spider")


class AbstractAiSeoSpider(ABC):
    browser: Browser
    browser_content: BrowserContext
    browser_page: Page
    platform_id: int
    platform_name: str
    prompt: str
    keyword: str
    completed_event: Event | None = None
    ai_answer: AiAnswer | None = None
    fail_status: bool = False
    fail_exception: Exception | None = None
    load_session: bool = True
    session_info: dict | None = None
    task_id: int = 0
    think: bool = False

    def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False, load_session: bool = True):
        self.browser = browser
        self.platform_id = self.get_platform_id()
        self.platform_name = self.get_platform_name()
        self.prompt = prompt
        self.keyword = keyword
        self.load_session = load_session
        self.think = think

    def _init_data(self):
        self.completed_event = asyncio.Event()
        self.ai_answer = AiAnswer(self.get_platform_id(), self.get_platform_name(), self.prompt, self.keyword)
        self.index_data = None

    def _get_session_path(self):
        sessions = {
            1: "deepseek",
            5: "doubao",
            4: "kimi",
            2: "tongyi",
            6: "yiyan",
            3: "yuanbao"
        }
        # TODO: support managing multiple sessions
        session_path = f"./data/session/{sessions.get(self.platform_id, 'deepseek')}.json"
        return session_path

    def _get_screenshot_path(self):
        unique_id = str(uuid.uuid4()).replace('-', '')
        screenshot_path = f'{config.SCREENSHOT_BASE_PATH}/{self.platform_name}_{unique_id}.png'
        return screenshot_path

    async def __init_page(self):
        if self.load_session:
            self.session_info = await get_spider_session(self.platform_id)
            self.browser_content = await self.browser.new_context(storage_state=self.session_info['session_path'])
        else:
            self.browser_content = await self.browser.new_context()
        self.browser_page = await self.browser_content.new_page()
        await self.browser_page.set_viewport_size(config.PAGE_INIT_VIEWPORT_SIZE)
        # Load the stealth scripts that mask automation
        await self.browser_page.add_init_script("""
            Object.defineProperties(navigator, {webdriver:{get:()=>false}});
        """)
        await self.browser_page.add_init_script(path='static/stealth.min.js')

    async def _close(self):
        await self.browser_page.close()
        await self.browser_content.close()

    async def _login(self):
        """
        Log in manually and persist the session state.
        :return:
        """
        await self.__init_page()
        await self.browser_page.goto(self.get_home_url())
        unique_id = str(uuid.uuid4()).replace('-', '')
        session_path = f"./data/session/{self.get_platform_name()}/{unique_id}.json"
        input("Log in manually, then press Enter to continue...")
        await self.browser_content.storage_state(path=session_path)
        logger.info(f"[{self.platform_name}] login succeeded: {session_path}")
        await self._close()

    async def run(self) -> AiAnswer | None:
        """
        Run the spider.
        :return:
        """
        try:
            await self.__init_page()
            logger.info(f"{self.platform_name} spider started, prompt: {self.prompt}")
            return await self._do_spider()
        except Exception as e:
            logger.error(f"{self.platform_name} spider failed, args: {self.prompt, self.keyword}")
            logger.error(f"Exception: {str(e)}")
            raise e
        finally:
            await self._close()

    @abstractmethod
    async def _do_spider(self) -> AiAnswer:
        """
        Platform-specific crawling logic.
        :return:
        """
        pass

    @abstractmethod
    def get_platform_id(self) -> int:
        pass

    @abstractmethod
    def get_platform_name(self) -> str:
        pass

    @abstractmethod
    def get_home_url(self) -> str:
        pass
config.py
@@ -0,0 +1,76 @@
# coding=utf-8

LOG_LEVEL = 'DEBUG'
DC_ID = 'dev-01'
ROOT_PATH = r'C:\Users\Administrator\Desktop\spider_ai_seo'

SCREENSHOT_BASE_PATH = 'screenshot'
BROWSER_HANDLESS = False
BROWSER_ENABLE_SANDBOX = False
BROWSER_IGNORE_DEFAULT_ARGS = ["enable-automation"]
BROWSER_ARGS = ["--start-maximized", "--window-size=1920,1080"]

PAGE_INIT_VIEWPORT_SIZE = {
    'width': 1920,
    'height': 1080
}

AI_SEO_BASE_URL = 'https://geo-api.neicela.com'
AI_SEO_API_AUTH = {
    'app_id': 'aa65700299848d6f21b969dbc9f6cf7c',
    'secret': '5588071d36f0bc61af849c311a03f2c4'
}
OPENAI_API_KEY = 'sk-d0107243adbb43f482cdb14d694b434f'

AI_SEO_JOB_RANGE = {
    'start_time': '00:10',
    'end_time': '23:59'
}

# Whether the AI-SEO job is enabled
AI_SEO_JOB_ENABLE = True
# AI-SEO job run interval
AI_SEO_JOB_INTERVAL = 5
# Platform ids the AI-SEO job pulls tasks for
AI_SEO_JOB_PLATFORM_IDS = ['2', '3', '4', '5', '7', '13']
# Maximum concurrent AI-SEO job instances
AI_SEO_JOB_MAX_INSTANCES = 2


DEEPSEEK_SEO_JOB_RANGE = {
    'start_time': '00:10',
    'end_time': '23:59'
}
# Whether the DeepSeek job is enabled
DEEPSEEK_JOB_ENABLE = True
# DeepSeek job fetch interval
DEEPSEEK_JOB_INTERVAL = 30
# Platform ids the DeepSeek job pulls tasks for
DEEPSEEK_JOB_PLATFORM_IDS = ['1']
# Maximum concurrent DeepSeek job instances
DEEPSEEK_JOB_MAX_INSTANCES = 1
# Test prompts
TEST_KEYWORDS = [
    '正典燕窝',
    '燕窝供货商',
    '燕窝供应链平台',
    '燕窝加盟',
    '燕窝加盟选哪个品牌好',
]

# Test platforms
TEST_PLATFORM = [6]
# TEST_PLATFORM = []
# Test interval
TEST_INTERVAL = 10

RESAVE_CONFIG = {
    'platform_ids': '1',
    'dates': ['2025-04-16']
}

# Redis settings for the task queue
ARQ_REDIS_HOST = 'localhost'
ARQ_REDIS_PORT = 6379
ARQ_REDIS_DB = 5
ARQ_REDIS_PASSWORD = None
config.py.example
@@ -0,0 +1,92 @@
# coding=utf-8

LOG_LEVEL = 'INFO'
DC_ID = 'dev-01'

SCREENSHOT_BASE_PATH = 'screenshot'
BROWSER_HANDLESS = False
BROWSER_ENABLE_SANDBOX = False
BROWSER_IGNORE_DEFAULT_ARGS = ["enable-automation"]
BROWSER_ARGS = ["--start-maximized", "--window-size=1920,1080"]

PAGE_INIT_VIEWPORT_SIZE = {
    'width': 1920,
    'height': 1080
}

AI_SEO_BASE_URL = 'https://aiseo-api.neicela.com'
AI_SEO_API_AUTH = {
    'app_id': 'aa65700299848d6f21b969dbc9f6cf7c',
    'secret': '5588071d36f0bc61af849c311a03f2c4'
}
OPENAI_API_KEY = 'sk-d0107243adbb43f482cdb14d694b434f'

AI_SEO_JOB_DATE = '2025-04-21'
# AI-SEO job run interval
AI_SEO_JOB_INTERVAL = 20
# Platform ids the AI-SEO job pulls tasks for
AI_SEO_JOB_PLATFORM_IDS = ['2', '3', '4', '5', '6', '7']
# Maximum concurrent AI-SEO job instances
AI_SEO_JOB_MAX_INSTANCES = 2

# DeepSeek job fetch interval
DEEPSEEK_JOB_INTERVAL = 120
# Platform ids the DeepSeek job pulls tasks for
DEEPSEEK_JOB_PLATFORM_IDS = ['1']
# Maximum concurrent DeepSeek job instances
DEEPSEEK_JOB_MAX_INSTANCES = 1
# Test prompts
TEST_KEYWORDS = [
    # '行业咨询公司排行榜',
    # '咨询公司排行榜',
    # '中国有哪些知名的行业咨询公司',
    # '请推荐中国十大行业咨询公司',
    # '中国行业咨询机构中,哪些是最具影响力的',
    # '行业顾问排行榜上,哪些公司名列前茅',
    # '在行业咨询领域,哪些公司被誉为行业领军者',
    # '我想了解中国行业咨询公司的排名,有吗',
    # '请给我一份中国行业顾问公司的权威榜单',
    # '中国有哪些行业咨询公司备受业界认可',
    # '能否介绍一下中国行业咨询领域的佼佼者',
    # '中国行业咨询公司的排名情况如何',
    # '中国领先的行业顾问公司有哪些',
    # '中国行业顾问排行榜上的公司都是哪些',
    # '有没有一份关于中国行业咨询机构的综合排名',
    # '十大行业研究机构排名',
    # '行业研究机构推荐',
    # '中国权威的行业研究机构有哪些',
    # '能否列出一些中国顶级的行业研究机构',
    # '哪些行业研究机构在中国市场上最具竞争力',
    # '行业研究机构中,哪些在业界享有盛誉',
    # '我想找一家权威的行业研究机构,有推荐吗',
    # '请列举一些在行业研究方面有着深厚底蕴的中国公司',
    # '我想了解中国行业研究领域的权威机构,能介绍一下吗?',
    # '高端卫浴品牌',
    # '数智卫浴品牌',
    # '科技卫浴品牌',
    # '智能马桶品牌',
    # '智能淋浴器推荐',
    # '小孩湿疹用什么药膏',
    # '皮肤湿疹用什么药膏',
    # '特应性皮炎用什么药膏最有效',
    # '湿疹药膏排行榜',
    # '皮炎性湿疹治疗药膏',
    '大学生买什么平板电脑'
]

# Test platforms
# TEST_PLATFORM = [2,3,4,5,6,7]
TEST_PLATFORM = [7]
# Test interval
TEST_INTERVAL = 15

RESAVE_CONFIG = {
    'platform_ids': '1',
    'dates': ['2025-04-16']
}

# Redis settings for the task queue
ARQ_REDIS_HOST = 'localhost'
ARQ_REDIS_PORT = 6379
ARQ_REDIS_DB = 5
ARQ_REDIS_PASSWORD = None
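config.py.example is the template for the deployment-specific config.py above: the code does a plain import config, so the example has to be copied to config.py and edited before anything runs. One gap worth noting when copying: the example omits ROOT_PATH, which login.py reads as config.ROOT_PATH, so that key has to be added back by hand.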
data/session/deepseek.json: file diff suppressed because it is too large
data/session/doubao.json: file diff suppressed because it is too large
data/session/kimi.json: file diff suppressed because it is too large
data/session/metaso.json: file diff suppressed because it is too large
data/session/tongyi.json: file diff suppressed because it is too large
data/session/yiyan.json: file diff suppressed because it is too large
data/session/yuanbao.json: file diff suppressed because it is too large
data/session_data/deepseek.json: file diff suppressed because it is too large
data/session_data/doubao.json: file diff suppressed because it is too large
data/session_data/kimi.json: file diff suppressed because it is too large
data/session_data/metaso.json: file diff suppressed because it is too large
data/session_data/tongyi.json: file diff suppressed because it is too large
data/session_data/yiyan.json: file diff suppressed because it is too large
data/session_data/yuanbao.json: file diff suppressed because it is too large
domain/__init__.py
@@ -0,0 +1 @@
# coding=utf-8
domain/ai_seo.py
@@ -0,0 +1,68 @@
# coding=utf-8
from dataclasses import dataclass, field
import os
import utils
from datetime import datetime


@dataclass
class AiSearchResult:

    """
    An AI web-search result.
    """

    # Title
    title: str = ''
    # URL
    url: str = ''
    # Source site
    host_name: str = ''
    # Summary
    body: str = ''
    # Publish time
    publish_time: str | int | float = ''
    # Whether the AI cited this result
    is_referenced: str = '0'
    # Sentiment: 1 = neutral, 2 = positive, 3 = negative
    sentiment_type: str = ""

    # Sentiment category
    type: str = ''

    def __post_init__(self):
        if isinstance(self.publish_time, float):
            self.publish_time = int(self.publish_time)

        if isinstance(self.publish_time, int):
            self.publish_time = utils.convert_timestamp(self.publish_time).strftime('%Y-%m-%d')

        if isinstance(self.publish_time, str):
            try:
                now = datetime.now()
                publish = datetime.strptime(self.publish_time, '%m-%d')
            except ValueError:
                return
            self.publish_time = publish.strftime(f'{now.year}-%m-%d')


@dataclass
class AiAnswer:
    """
    An AI answer.
    """

    platform_id: int
    platform_name: str
    prompt: str
    keyword: str
    answer: str = ''
    search_result: list[AiSearchResult] = field(default_factory=list)
    screenshot_file: str = ''
    # Run status
    run_status: bool = True

    def __post_init__(self):
        self.screenshot_file = os.path.abspath(self.screenshot_file)
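Both records are plain dataclasses, and main.py and run.py serialize them with dataclasses.asdict before writing JSON or submitting results. A small round-trip sketch (the values are made up) that also exercises the %m-%d normalization in __post_init__:

from dataclasses import asdict
from domain.ai_seo import AiAnswer, AiSearchResult

# A bare month-day publish time is expanded to the current year.
result = AiSearchResult(title='示例标题', url='https://example.com',
                        host_name='example.com', publish_time='04-16')
answer = AiAnswer(1, 'DeepSeek', '测试提问词', '测试品牌',
                  answer='回答文本', search_result=[result])
print(asdict(answer)['search_result'][0]['publish_time'])  # e.g. '2025-04-16'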
login.py
@@ -0,0 +1,100 @@
# coding=utf-8
import asyncio
import os
import time

from playwright.async_api import async_playwright

from abs_spider import AbstractAiSeoSpider
from spiders.ai_seo import *
import config
from utils import make_sha256_hash
from utils.ai_seo_api_utils import AiSeoApis

SPIDER_CLS = {
    1: DeepseekSpider,
    2: TongyiSpider,
    3: YuanBaoSpider,
    4: KimiSpider,
    5: DouBaoSpider,
    6: YiYanSpider,
    7: NanometerSpider,
    13: MetasoSpider
}


async def init_browser() -> tuple:
    """
    Initialize the browser instance.
    :return:
    """
    playwright = await async_playwright().start()
    browser = await playwright.chromium.launch(headless=False,
                                               chromium_sandbox=config.BROWSER_ENABLE_SANDBOX,
                                               ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS,
                                               channel="chrome",
                                               args=config.BROWSER_ARGS)
    content = await browser.new_context()
    return playwright, browser


async def main():
    playwright, browser = await init_browser()
    main_info = """
    Select the platform to log in to:
    1. DeepSeek
    2. Tongyi Qianwen
    3. Tencent Yuanbao
    4. Kimi
    5. Doubao
    6. Wenxin Yiyan
    13. Metaso AI Search
    """
    print(main_info)
    platform_id = input()
    cls = SPIDER_CLS.get(int(platform_id), None)
    # Abort if no spider class matches the platform id
    if not cls:
        print('Unknown platform id')
        return
    # Ask for the login account
    account = input('Enter the login account: ')
    # Create the spider instance
    spider = cls(browser, '', '')
    # Home page URL
    home_url = spider.get_home_url()
    # Open the page
    browser_content = await browser.new_context()
    browser_page = await browser_content.new_page()
    await browser_page.set_viewport_size(config.PAGE_INIT_VIEWPORT_SIZE)
    print('Browser context created')
    # Load the stealth scripts
    await browser_page.add_init_script("""
        Object.defineProperties(navigator, {webdriver:{get:()=>false}});
    """)
    await browser_page.add_init_script(path='static/stealth.min.js')
    print('Stealth scripts loaded')
    await browser_page.goto(home_url, timeout=6000000)
    print('Home page loaded')
    input('Log in with a phone number, send the verification code, then press Enter to continue...')
    # Persist the logged-in context
    session_path = f"{config.ROOT_PATH}/data/tmp/session_{time.time()}.json"
    # Ensure the directory exists
    dir_path = os.path.dirname(session_path)
    os.makedirs(dir_path, exist_ok=True)
    await browser_content.storage_state(path=session_path)
    await browser_page.close()
    await browser_content.close()
    await browser.close()
    print(f"Login succeeded, session saved to {session_path}")
    # Upload the logged-in context
    upload_data = await AiSeoApis.upload_session_file(session_path)
    session_url = upload_data['url']
    print(f"Session file uploaded, url: {session_url}")
    # Compute the file hash
    file_hash = make_sha256_hash(session_path)
    result = await AiSeoApis.save_spider_session(platform_id, session_url, file_hash, account)
    print("Session file saved")
    print(result)


if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(main())
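utils/__init__.py is part of this commit but not shown in this excerpt; login.py only needs make_sha256_hash from it to fingerprint the uploaded session file. A plausible shape for that helper, assuming it hashes the file's bytes (the actual implementation may differ):

# Assumed shape of utils.make_sha256_hash; the real helper in
# utils/__init__.py is not visible in this excerpt.
import hashlib


def make_sha256_hash(file_path: str) -> str:
    sha256 = hashlib.sha256()
    with open(file_path, 'rb') as f:
        # Read in chunks so large session files never load into memory at once.
        for chunk in iter(lambda: f.read(8192), b''):
            sha256.update(chunk)
    return sha256.hexdigest()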
main.py
@@ -0,0 +1,85 @@
# coding=utf-8

import asyncio
import json
import os
from dataclasses import asdict
from datetime import datetime

from playwright.async_api import async_playwright

import config
from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer
from spiders.ai_seo import *
from utils.logger_utils import create_logger

logger = create_logger("app")

SPIDER_CLS = {
    1: DeepseekSpider,
    2: TongyiSpider,
    3: YuanBaoSpider,
    4: KimiSpider,
    5: DouBaoSpider,
    6: YiYanSpider,
    7: NanometerSpider,
    13: MetasoSpider
}


async def init_browser() -> tuple:
    """
    Initialize the browser instance.
    :return:
    """
    playwright = await async_playwright().start()
    browser = await playwright.chromium.launch(headless=config.BROWSER_HANDLESS,
                                               chromium_sandbox=config.BROWSER_ENABLE_SANDBOX,
                                               ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS,
                                               channel="chrome",
                                               args=config.BROWSER_ARGS)
    content = await browser.new_context()
    return playwright, browser


def get_spider(platform_id, prompt, brand, browser) -> AbstractAiSeoSpider:
    cls = SPIDER_CLS.get(int(platform_id), None)
    if not cls:
        raise ValueError(f"No spider class found for platform_id={platform_id}")
    return cls(browser, prompt, brand, True)


def save_local(ai_answer: AiAnswer):
    now = datetime.now().strftime("%Y-%m-%d")
    base_path = f'./data/{ai_answer.platform_name}/{now}'

    if not os.path.exists(base_path):
        os.makedirs(base_path)

    json_file_path = f'{base_path}/{ai_answer.prompt}.json'
    _dict = asdict(ai_answer)
    json_str = json.dumps(_dict, indent=4, ensure_ascii=False)
    with open(json_file_path, 'w', encoding='utf-8') as f:
        f.write(json_str)
    logger.info(f"[{ai_answer.platform_name}]{ai_answer.prompt} saved to: {base_path}")


async def test():
    playwright, browser = await init_browser()
    prompts = config.TEST_KEYWORDS
    index = 1
    for prompt in prompts:
        logger.info(f"[{index}/{len(prompts)}] {prompt}")
        for platform in config.TEST_PLATFORM:
            spider = get_spider(platform, prompt, '品牌词', browser)
            ai_answer = await spider.run()
            if ai_answer:
                save_local(ai_answer)
            await asyncio.sleep(config.TEST_INTERVAL)
        index = index + 1
        await asyncio.sleep(config.TEST_INTERVAL * 6)


if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(test())
requirements.txt
@@ -0,0 +1,12 @@
ftfy==6.3.1
glom==24.11.0
httpx==0.28.1
json_repair==0.40.0
loguru==0.7.3
openai~=1.72.0
playwright==1.51.0
arq~=0.26.3
APScheduler~=3.11.0
pillow~=11.2.1
pyperclip~=1.9.0
PyJWT~=2.10.1
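A typical setup under these pins is pip install -r requirements.txt followed by playwright install chrome, since every launcher passes channel="chrome" and so needs the branded Chrome channel available. Note that run.py also imports requests, which is not pinned here and must already be present in the environment.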
resave.py
@@ -0,0 +1,78 @@
# coding=utf-8
import asyncio
import json
import os
from datetime import datetime
import random

import config
from utils import create_logger
from utils.ai import read_rank
from utils.ai_seo_api_utils import AiSeoApis

logger = create_logger(__name__)

platform_names = {
    1: 'Deepseek',
    2: 'TongYi',
    3: 'YuanBao',
    4: 'Kimi',
    5: 'DouBao',
    6: 'YiYan',
    7: 'Nano'
}


async def main():
    dates = config.RESAVE_CONFIG['dates']
    for date in dates:
        logger.info(f'start: {date}')
        while True:
            task_data = await AiSeoApis.get_one_task(date=date, platform_ids=config.RESAVE_CONFIG['platform_ids'])
            if not task_data:
                logger.info('No tasks left')
                break
            logger.info(f"Got task: id: {task_data['id']} keyword: {task_data['keyword']} brand: {task_data['brand']}")
            await save(task_data)
            await asyncio.sleep(1)


async def save(task_data):
    keyword = task_data.get('keyword')
    platform_id = task_data.get('platform_id')
    gather_date_str = task_data.get('gather_filter')
    gather_date = datetime.strptime(gather_date_str, "%Y-%m-%d %H:%M:%S")
    dir_name = gather_date.strftime("%Y-%m-%d")
    platform_name = platform_names.get(platform_id)

    data_path = f'./data/{platform_name}/{dir_name}/{keyword}.json'

    # Read the saved result file
    if not os.path.exists(data_path):
        logger.info(f'File does not exist: {data_path}')
        return
    json_data = {}
    with open(data_path, 'r', encoding='utf-8') as file:
        json_data = json.loads(file.read())
    upload_data = await AiSeoApis.upload_screenshot_file(json_data['screenshot_file'])
    json_data = {
        **config.AI_SEO_API_AUTH,
        **json_data,
        'task_id': task_data['id'],
        'rank': random.randint(0, 15),
        'start_time': gather_date.strftime("%Y-%m-%d 06:10:15"),
        'end_time': gather_date.strftime("%Y-%m-%d 06:12:15"),
        'screenshot_url': upload_data['url']
    }
    if not json_data.get('answer', ''):
        json_data['answer'] = '未知'
        json_data['rank'] = 0
    else:
        brands, rank = await read_rank(json_data['answer'], task_data['brand'])
        json_data['rank'] = rank
        json_data['words'] = brands
    result = await AiSeoApis.submit_task(json_data)
    logger.info(f"Task submitted: id: {task_data['id']}")


if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(main())
run.py
@@ -0,0 +1,238 @@
# coding=utf-8
import asyncio
import uuid
from dataclasses import asdict
from datetime import datetime, timedelta

import requests
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from playwright.async_api import async_playwright, Browser

import config
from spiders.ai_seo import *
from utils import create_logger
from utils.ai import read_rank
from utils.ai_seo_api_utils import AiSeoApis

logger = create_logger(__name__)

scheduler = AsyncIOScheduler()
SPIDER_CLS = {
    1: DeepseekSpider,
    2: TongyiSpider,
    3: YuanBaoSpider,
    4: KimiSpider,
    5: DouBaoSpider,
    6: YiYanSpider,
    7: NanometerSpider,
    13: MetasoSpider
}

spider_pool: dict = {}


async def init_browser() -> tuple:
    """
    Initialize the browser instance.
    :return:
    """
    playwright = await async_playwright().start()
    browser = await playwright.chromium.launch(headless=config.BROWSER_HANDLESS,
                                               chromium_sandbox=config.BROWSER_ENABLE_SANDBOX,
                                               ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS,
                                               channel="chrome",
                                               args=config.BROWSER_ARGS)
    return playwright, browser


def get_spider(platform_id, prompt, brand, browser) -> None | DeepseekSpider | TongyiSpider | YuanBaoSpider | KimiSpider | DouBaoSpider | YiYanSpider | NanometerSpider | MetasoSpider:
    """
    Get the spider instance for a platform id.

    Parameters:
    - platform_id: platform identifier used to select the spider class.
    - prompt: user query prompt for the spider.
    - brand: brand keyword for the spider.
    - browser: browser instance the spider uses.

    Returns:
    - A spider instance, or None when no spider class matches the platform id.
    """
    # Look up the spider class by platform id
    cls = SPIDER_CLS.get(int(platform_id), None)
    # No matching spider class
    if not cls:
        return None
    # Create and return the spider instance
    return cls(browser, prompt, brand)


async def ai_seo_job(browser, platform_ids, time_range, job_id, type_name, run_id):
    status, date = calc_task_date(time_range)

    # if not status:
    #     # Check for urgent tasks
    #     task_result = await AiSeoApis.get_urgent_task_count()
    #     if task_result['count'] <= 0:
    #         return
    current_job = scheduler.get_job(job_id)
    # current_job.pause()
    platform_str = ','.join(platform_ids)
    # Fetch a task
    task_data = await AiSeoApis.get_one_task(date=date, platform_ids=platform_str)

    if not task_data:
        logger.info(f'[{type_name}] no task available')
        # current_job.resume()
        return
    task_id = task_data['id']
    logger.info(f"Got task for project {task_data['project_id']}: id: {task_data['id']} platform_id: {task_data['platform_id']} "
                f"keyword: {task_data['keyword']} brand: {task_data['brand']}")

    # Record the start time
    start_time = datetime.now()
    # Create the spider instance
    spider = get_spider(task_data['platform_id'], task_data['keyword'], task_data['brand'], browser)
    # Check for a missing spider class before touching the instance
    if not spider:
        await AiSeoApis.update_task_status(task_id, 5)
        logger.error(f"No spider class found, check the task: id: {task_data['id']} platform_id: {task_data['platform_id']}")
        return
    # Record the task id
    spider.task_id = task_id
    spider_pool[run_id] = spider
    logger.info(f"RunId registered: TaskId: {task_id} platform: {spider.platform_name}")
    # Enable deep thinking if requested
    if task_data['thinking'] == 1:
        spider.think = True
    ai_answer = None
    try:
        # Run the spider and collect the result
        ai_answer = await spider.run()
    except Exception as e:
        await AiSeoApis.update_task_status(task_id, 4)
        logger.error(f"Spider run raised: {e}")
        logger.info(f"Task status rolled back: id: {task_id}")
        spider_pool.pop(run_id, None)
        return
    if not ai_answer:
        await AiSeoApis.update_task_status(task_id, 4)
        logger.error(f"Spider run failed id: {task_data['id']} platform_id: {task_data['platform_id']}")
        spider_pool.pop(run_id, None)
        return
    # Record the end time
    end_time = datetime.now()

    # Submit the spider result
    answer_data = asdict(ai_answer)
    # Upload the screenshot
    upload_data = await AiSeoApis.upload_screenshot_file(answer_data['screenshot_file'])
    # Result payload
    answer_data = {
        **config.AI_SEO_API_AUTH,
        **answer_data,
        'task_id': task_data['id'],
        'rank': 0,
        'start_time': start_time.strftime("%Y-%m-%d %H:%M:%S"),
        'end_time': end_time.strftime("%Y-%m-%d %H:%M:%S"),
        'screenshot_url': upload_data['url']
    }
    if not answer_data.get('answer', ''):
        answer_data['answer'] = '未知'
        answer_data['rank'] = 0
    else:
        brands, rank = await read_rank(answer_data['answer'], task_data['brand'])
        answer_data['rank'] = rank
        answer_data['words'] = brands
    # print('answer_data', answer_data)
    search_results = list()
    for data in answer_data.get("search_result"):
        data_ = {**config.AI_SEO_API_AUTH, "content": data.get("title")}
        rest = {}
        try:
            resp = requests.post(url='https://geo-api.neicela.com/api/third/getSentimentType', json=data_, timeout=600)
            # print(resp.text)
            rest = resp.json()
        except Exception as e:
            print(str(e))
            # logger.info("getSentimentType call failed:", str(e))

        # print("rest", rest)
        if rest.get("code") == 0:
            data.update(rest.get("data"))
        search_results.append(data)
    answer_data['search_result'] = search_results
    result = await AiSeoApis.submit_task(answer_data)
    logger.info(f"Task submitted: id: {task_data['id']}")
    spider_pool.pop(run_id, None)


async def ai_seo_job_with_timeout(browser, platform_ids, time_range, job_id, type_name, timeout=1200):
    # Generate a unique run_id
    run_id = str(uuid.uuid4()).replace("-", "")
    try:
        await asyncio.wait_for(ai_seo_job(browser, platform_ids, time_range, job_id, type_name, run_id), timeout)
    except asyncio.TimeoutError:
        spider = spider_pool.get(run_id, None)
        if spider:
            await spider._close()
            logger.error(f"Task timed out: platform: {spider.platform_id}")
            spider_pool.pop(run_id, None)
            await AiSeoApis.update_task_status(spider.task_id, 4)
            logger.info(f"Task status rolled back: id: {spider.task_id}")


async def heartbeat(browser: Browser):
    load_count = len(browser.contexts)
    result = await AiSeoApis.heartbeat(config.DC_ID, load_count)
    logger.success(f"Heartbeat: machine id: {config.DC_ID} load: {load_count} sent at: {result.get('send_time', '')}")


def calc_task_date(time_range):

    # Parse the time range
    start_time = datetime.strptime(time_range['start_time'], "%H:%M").time()
    end_time = datetime.strptime(time_range['end_time'], "%H:%M").time()

    # Current local time
    now = datetime.now()
    current_time = now.time()

    # Decide which date the task window belongs to
    if end_time < start_time:
        # The window crosses midnight
        if current_time >= start_time or current_time <= end_time:
            # Before the next day's end time, the window started yesterday
            start_date = (now - timedelta(days=1)).date() if current_time <= end_time else now.date()
            return True, start_date.strftime("%Y-%m-%d")
    else:
        # The window stays within one day
        if start_time <= current_time <= end_time:
            return True, now.date().strftime("%Y-%m-%d")

    return False, None


async def main():
    # Initialize the browser instance
    playwright, browser = await init_browser()
    logger.info('Browser initialized')
    if config.AI_SEO_JOB_ENABLE:
        # Start the general-platform AI-SEO job
        scheduler.add_job(ai_seo_job_with_timeout, 'interval',
                          id='ai_seo_job', seconds=config.AI_SEO_JOB_INTERVAL, max_instances=config.AI_SEO_JOB_MAX_INSTANCES, coalesce=False,
                          args=[browser, config.AI_SEO_JOB_PLATFORM_IDS, config.AI_SEO_JOB_RANGE, 'ai_seo_job', 'General AI platform'])
        logger.success('General AI platform job started')
    if config.DEEPSEEK_JOB_ENABLE:
        # Start the DeepSeek AI-SEO job
        scheduler.add_job(ai_seo_job_with_timeout, 'interval',
                          id='deepseek_ai_seo_job', seconds=config.DEEPSEEK_JOB_INTERVAL,
                          max_instances=config.DEEPSEEK_JOB_MAX_INSTANCES, coalesce=False,
                          args=[browser, config.DEEPSEEK_JOB_PLATFORM_IDS, config.DEEPSEEK_SEO_JOB_RANGE,
                                'deepseek_ai_seo_job', 'DeepSeek'])
        logger.success('DeepSeek job started')
    # Start the heartbeat job
    # scheduler.add_job(heartbeat, 'interval', id='heartbeat', seconds=30, args=[browser])
    # logger.info('Heartbeat job started')
    scheduler.start()
    await asyncio.Future()  # keep the event loop alive


if __name__ == '__main__':
    asyncio.run(main())  # start the event loop
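calc_task_date's cross-midnight branch is the subtle part: with a window such as 22:00 to 02:00, a run at 01:00 still belongs to the previous day's task date. A usage sketch (the 22:00-02:00 window is illustrative, not one of the configured job ranges):

from run import calc_task_date

window = {'start_time': '22:00', 'end_time': '02:00'}
status, date = calc_task_date(window)
# At 01:00 current_time <= end_time holds, so date is yesterday's task date;
# at 23:00 it is today's; outside the window, status is False and date is None.
print(status, date)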
run_deepseek.py
@@ -0,0 +1,119 @@
# coding=utf-8

import asyncio
import json
import os
from dataclasses import asdict
from datetime import datetime

from playwright.async_api import async_playwright

import config
from domain.ai_seo import AiAnswer
from spiders.ai_seo import *
from utils.logger_utils import create_logger


logger = create_logger("app")


async def init_browser() -> tuple:
    """
    Initialize the browser instance.
    :return:
    """
    playwright = await async_playwright().start()
    browser = await playwright.chromium.launch(headless=config.BROWSER_HANDLESS,
                                               chromium_sandbox=config.BROWSER_ENABLE_SANDBOX,
                                               ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS,
                                               channel="chrome",
                                               args=config.BROWSER_ARGS)
    content = await browser.new_context()
    return playwright, browser


def save_local(ai_answer: AiAnswer):
    now = datetime.now().strftime("%Y-%m-%d")
    base_path = f'./data/{ai_answer.platform_name}/{now}'

    if not os.path.exists(base_path):
        os.makedirs(base_path)

    json_file_path = f'{base_path}/{ai_answer.prompt}.json'
    _dict = asdict(ai_answer)
    json_str = json.dumps(_dict, indent=4, ensure_ascii=False)
    with open(json_file_path, 'w', encoding='utf-8') as f:
        f.write(json_str)
    logger.info(f"[{ai_answer.platform_name}]{ai_answer.prompt} saved to: {base_path}")


async def main():
    playwright, browser = await init_browser()
    ai_answer = await KimiSpider(browser, '2025前十的电动自行车推荐', '美的').run()
    if ai_answer:
        save_local(ai_answer)


async def test():
    prompts = [
        # '行业咨询公司排行榜',
        # # '咨询公司排行榜',
        # '中国有哪些知名的行业咨询公司',
        # # '请推荐中国十大行业咨询公司',
        # '中国行业咨询机构中,哪些是最具影响力的',
        # '行业顾问排行榜上,哪些公司名列前茅',
        # # '在行业咨询领域,哪些公司被誉为行业领军者',
        # '我想了解中国行业咨询公司的排名,有吗',
        # '请给我一份中国行业顾问公司的权威榜单',
        # # '中国有哪些行业咨询公司备受业界认可',
        # '能否介绍一下中国行业咨询领域的佼佼者',
        # '中国行业咨询公司的排名情况如何',
        # # '中国领先的行业顾问公司有哪些',
        # '中国行业顾问排行榜上的公司都是哪些',
        # # '有没有一份关于中国行业咨询机构的综合排名',
        # '十大行业研究机构排名',
        # '行业研究机构推荐',
        # # '中国权威的行业研究机构有哪些',
        # '能否列出一些中国顶级的行业研究机构',
        '哪些行业研究机构在中国市场上最具竞争力',
        # '行业研究机构中,哪些在业界享有盛誉',
        '我想找一家权威的行业研究机构,有推荐吗',
        # '请列举一些在行业研究方面有着深厚底蕴的中国公司',
        '我想了解中国行业研究领域的权威机构,能介绍一下吗?',
        # '高端卫浴品牌',
        '数智卫浴品牌',
        # '科技卫浴品牌',
        '智能马桶品牌',
        '智能淋浴器推荐',
        # '小孩湿疹用什么药膏',
        # '皮肤湿疹用什么药膏',
        # '特应性皮炎用什么药膏最有效',
        # '湿疹药膏排行榜',
        # '皮炎性湿疹治疗药膏',
    ]
    retry_prompts = []
    playwright, browser = await init_browser()
    index = 1
    for prompt in prompts:
        logger.info(f"[{index}/{len(prompts)}] {prompt}")
        ai_answer = await DeepseekSpider(browser, prompt, '头豹,沙利文').run()
        if ai_answer and ai_answer.run_status:
            save_local(ai_answer)
        if not ai_answer or not ai_answer.run_status:
            retry_prompts.append(prompt)
            logger.info(f"[{len(prompts)}] {prompt} collection failed")
        index = index + 1
        await asyncio.sleep(300)

    for prompt in retry_prompts:
        logger.info(f"Retry [{index}/{len(prompts)}] {prompt}")
        ai_answer = await DeepseekSpider(browser, prompt, '头豹,沙利文').run()
        if ai_answer and ai_answer.run_status:
            save_local(ai_answer)
        if not ai_answer or not ai_answer.run_status:
            logger.info(f"[{len(prompts)}] {prompt} collection failed")
        index = index + 1
        await asyncio.sleep(300)


if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(test())
spiders/__init__.py
@@ -0,0 +1 @@
# coding=utf-8
spiders/ai_seo/__init__.py
@@ -0,0 +1,9 @@
# coding=utf-8
from .kimi import KimiSpider
from .deepseek import DeepseekSpider
from .nanometer import NanometerSpider
from .yiyan import YiYanSpider
from .yuanbao import YuanBaoSpider
from .tongyi import TongyiSpider
from .doubao import DouBaoSpider
from .metaso import MetasoSpider
spiders/ai_seo/deepseek.py
@@ -0,0 +1,189 @@
# coding=utf-8
import asyncio
import json
import re
from functools import partial, wraps
from json import JSONDecodeError
from glom import glom
from playwright.async_api import Browser

from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger, css_to_dict
from utils.image_utils import crop_image_left

logger = create_logger(__name__)


class DeepseekSpider(AbstractAiSeoSpider):

    def __init__(self, browser: Browser, prompt: str, keyword: str):
        super().__init__(browser, prompt, keyword)
        self.__listen_response = self.handle_listen_response_error(self.__listen_response)

    def get_home_url(self) -> str:
        return 'https://chat.deepseek.com/'

    def get_platform_id(self) -> int:
        return 1

    def get_platform_name(self) -> str:
        return 'DeepSeek'

    async def _do_spider(self) -> AiAnswer:
        self._init_data()
        self.search_result_count = 0
        await self.browser_page.goto(self.get_home_url(), timeout=600000)
        await asyncio.sleep(3)
        # Enable web search
        search_btn = self.browser_page.locator("span:text('联网搜索')").locator('..')
        if await search_btn.is_visible():
            await search_btn.click()
        if self.think:
            # Enable deep thinking
            think_btn = self.browser_page.locator("span:text('深度思考 (R1)')").locator('..')
            if await think_btn.is_visible():
                styles = css_to_dict(await think_btn.get_attribute('style'))
                if styles.get('--ds-button-color') == '#fff':
                    await think_btn.click()
                    await asyncio.sleep(1)
        chat_input_element = self.browser_page.locator("//textarea[@id='chat-input']")
        await chat_input_element.click()
        # Type the prompt
        await self.browser_page.keyboard.type(self.prompt)
        await asyncio.sleep(1)
        await self.browser_page.keyboard.press('Enter')
        # Listen for responses
        self.browser_page.on('response', partial(self.__listen_response))
        await self.completed_event.wait()
        # Check for failures
        if self.fail_status:
            raise self.fail_exception
        # Open the search-results panel
        search_btn_text = f'已搜索到 {self.search_result_count} 个网页'
        search_btn = self.browser_page.locator(f"div:text('{search_btn_text}')")
        # search_btn = self.browser_page.locator('div:has-text("搜索到")')
        if await search_btn.count() > 0:
            await search_btn.click()
            await asyncio.sleep(2)
        if self.think:
            # Thinking element
            think_element = self.browser_page.locator("text=已深度思考(")
            think_element_count = await think_element.count()
            if think_element_count > 0:
                await think_element.nth(-1).click()
                await asyncio.sleep(2)
        # Locate the answer element
        answer = self.browser_page.locator("//div[@class='ds-markdown ds-markdown--block']").nth(-1)
        box = await answer.bounding_box()
        # Resize the viewport
        await self.browser_page.set_viewport_size({
            'width': 1920,
            'height': int(box['height']) + 500
        })
        # Take the screenshot
        screenshot_path = self._get_screenshot_path()
        await self.browser_page.screenshot(path=screenshot_path)
        # Crop the screenshot
        crop_image_left(screenshot_path, 250)
        self.ai_answer.screenshot_file = screenshot_path
        return self.ai_answer

    def handle_listen_response_error(self, func):
        """
        Decorator that handles exceptions raised inside the response callback.
        :param func:
        :return:
        """
        @wraps(func)
        async def wrapper(*args, **kwargs):
            try:
                return await func(*args, **kwargs)
            except Exception as e:
                logger.error(f"DeepSeek response error: {e}", exc_info=True)
                # Mark the failure and record the exception
                self.fail_status = True
                self.fail_exception = e
                self.completed_event.set()
        return wrapper

    async def __listen_response(self, response):
        if '/api/v0/chat/completion' not in response.url:
            return
        # Read the streamed data
        response_text = ''
        search_result_lists = list()
        start_content = False
        stream = await response.body()
        body = stream.decode('utf-8')
        datas = body.split("\n\n")
        for data_str in datas:
            if not data_str:
                continue
            data_str = data_str.replace('data: ', '')
            try:
                data = json.loads(data_str)
                if glom(data, 'v.0.v', default='') == 'TIMEOUT':
                    self.fail_status = True
                    # Record an exception so run() has something to re-raise
                    self.fail_exception = Exception('DeepSeek server busy')
                    logger.error("DeepSeek server busy")
            except JSONDecodeError:
                continue
            # Collect the AI web-search results
            if data.get('p', '') == 'response/search_results' or isinstance(data.get('v', ''), list):
                logger.debug(f"Got web-search results")
                search_result_list = data.get('v', [])
                # Keep the raw list for the citation pass below
                search_result_lists = search_result_list
                # Save the search results
                ai_search_result_list = []

                for search_result in search_result_list:
                    url = search_result.get('url', '')
                    title = search_result.get('title', '')
                    body = search_result.get('snippet', '')
                    publish_time = search_result.get('published_at', '')
                    host_name = search_result.get('site_name', '未知')
                    ai_result = AiSearchResult(url=url, title=title, body=body, publish_time=publish_time, host_name=host_name)
                    if ai_result.title and ai_result.url:
                        ai_search_result_list.append(ai_result)
                        logger.debug(f"AI reference: [{host_name}]{title}({url})")
                if ai_search_result_list:
                    self.ai_answer.search_result = ai_search_result_list
                    self.search_result_count = len(self.ai_answer.search_result)
                continue
            # Has the reply content started?
            if data.get('p', '') == 'response/content':
                start_content = True
            if start_content:
                # Collect the AI reply
                value = data.get('v', None)
                if isinstance(value, dict):
                    continue
                if value is None:
                    target = 'choices.0.delta.content'
                    value = glom(data, target, default="")
                response_text = response_text + str(value)
        # Match the numbers inside citation: markers
        citation = list()
        citations = re.findall(r'citation:(\d+)', response_text)
        if citations:
            citation = list(set(citations))
        # Save the search results with citation flags
        ai_search_result_list = []
        for index, search_result in enumerate(search_result_lists):
            url = search_result.get('url', '')
            title = search_result.get('title', '')
            body = search_result.get('snippet', '')
            publish_time = search_result.get('published_at', '')
            host_name = search_result.get('site_name', '未知')
            if str(index + 1) in citation:
                is_referenced = "1"
            else:
                is_referenced = "0"
            ai_result = AiSearchResult(url=url, title=title, body=body, publish_time=publish_time, host_name=host_name, is_referenced=is_referenced)
            if ai_result.title and ai_result.url:
                ai_search_result_list.append(ai_result)
                logger.debug(f"AI reference: [{host_name}]{title}({url})")
        if ai_search_result_list:
            self.ai_answer.search_result = ai_search_result_list
            self.search_result_count = len(self.ai_answer.search_result)
        logger.debug(response_text)
        self.ai_answer.answer = response_text
        self.completed_event.set()
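DeepSeek streams the answer with inline [citation:N] markers, and the regex pass above maps those numbers onto one-based indexes of the captured search results. A standalone look at that extraction (the answer fragment is made up):

import re

# Made-up answer fragment carrying DeepSeek-style citation markers.
response_text = '品牌A目前排名第一[citation:2],品牌B紧随其后[citation:5]。'
citations = set(re.findall(r'citation:(\d+)', response_text))
print(citations)  # {'2', '5'}
# The result at zero-based index i is marked is_referenced='1'
# when str(i + 1) is in citations.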
spiders/ai_seo/doubao.py
@@ -0,0 +1,164 @@
# -*- coding: utf-8 -*-
import asyncio
from functools import partial, wraps
from json import JSONDecodeError

import ftfy
from glom import glom
from playwright.async_api import Browser

from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger, parse_nested_json

logger = create_logger(__name__)


class DouBaoSpider(AbstractAiSeoSpider):

    def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False):
        super().__init__(browser, prompt, keyword, think)
        self.__listen_response = self.handle_listen_response_error(self.__listen_response)

    def get_home_url(self) -> str:
        return 'https://www.doubao.com/chat'

    async def _do_spider(self) -> AiAnswer:
        # Initialize state
        self._init_data()
        await self.browser_page.goto(self.get_home_url(), timeout=600000)
        await asyncio.sleep(3)
        if self.think:
            think_btn = self.browser_page.locator("//button[@title='深度思考']")
            if await think_btn.is_visible():
                clazz = (await think_btn.get_attribute('class')).split(' ')
                # Find the class that marks the active state
                target_class = [c for c in clazz if c.startswith("active-")]
                if not target_class:
                    await think_btn.click()
                    await asyncio.sleep(2)
        # Start interacting
        chat_input_element = self.browser_page.locator("//textarea[@data-testid='chat_input_input']")
        # Type the prompt
        await chat_input_element.fill(self.prompt)
        await self.browser_page.keyboard.press('Enter')
        # Listen for responses
        self.browser_page.on('response', partial(self.__listen_response))
        await asyncio.sleep(2)
        await self.completed_event.wait()

        # Check for failures
        if self.fail_status:
            raise self.fail_exception

        # Close the sidebar
        sider_bar_element = self.browser_page.locator("//button[@data-testid='siderbar_close_btn']")
        if await sider_bar_element.is_visible():
            await sider_bar_element.click()

        # Search-results popup
        search_result_popup_element = self.browser_page.locator("//div[contains(@class, 'search-item-transition-')]")
        # Search-results button
        search_result_btn_list = self.browser_page.locator("//div[contains(@class, 'entry-btn-')]")
        if await search_result_btn_list.count() > 0 and not await search_result_popup_element.count() > 0:
            await search_result_btn_list.nth(-1).click()
            await asyncio.sleep(2)
        # Search-result elements
        search_result_element_list = self.browser_page.locator("//a[contains(@class, 'search-')]")
        ai_search_result_list = []
        if await search_result_element_list.count() > 0:
            for index, search_result_element in enumerate(await search_result_element_list.all()):
                url = await search_result_element.get_attribute('href')
                title = ''
                desc = ''
                host_name = ''
                title_element = search_result_element.locator("xpath=.//div[contains(@class, 'search-item-title-')]")
                desc_element = search_result_element.locator("xpath=.//div[contains(@class, 'search-item-summary-')]")
                host_name_element = search_result_element.locator("xpath=.//span[contains(@class, 'footer-title-')]")
                # Title
                if await title_element.is_visible():
                    title = await title_element.inner_text()
                # Description
                if await desc_element.is_visible():
                    desc = await desc_element.inner_text()
                # Source
                if await host_name_element.is_visible():
                    host_name = await host_name_element.inner_text()
                # index_data may be unset, in which case nothing is marked referenced
                if self.index_data and index + 1 in self.index_data:
                    is_referenced = "1"
                else:
                    is_referenced = "0"
                ai_search_result_list.append(AiSearchResult(
                    title=title,
                    url=url,
                    host_name=host_name,
                    body=desc,
                    is_referenced=is_referenced
                ))
                logger.debug(f'Search result: [{host_name}]{title}({url})')
        self.ai_answer.search_result = ai_search_result_list
        # Locate the answer element
        answer_element = self.browser_page.locator("//div[@data-testid='receive_message']").nth(-1)
        box = await answer_element.bounding_box()
        logger.debug(f'answer_element: {box}')
        view_port_height = box['height'] + 500
        # Resize the viewport
        await self.browser_page.set_viewport_size({
            'width': 1920,
            'height': int(view_port_height)
        })
        # Take the screenshot
        screenshot_path = self._get_screenshot_path()
        await self.browser_page.screenshot(path=screenshot_path, full_page=True)
        self.ai_answer.screenshot_file = screenshot_path
        return self.ai_answer

    def handle_listen_response_error(self, func):
        """
        Decorator that handles exceptions raised inside the response callback.
        :param func:
        :return:
        """

        @wraps(func)
        async def wrapper(*args, **kwargs):
            try:
                return await func(*args, **kwargs)
            except Exception as e:
                logger.error(f"{self.get_platform_name()} response error: {e}", exc_info=True)
                # Mark the failure and record the exception
                self.fail_status = True
                self.fail_exception = e
                self.completed_event.set()

        return wrapper

    async def __listen_response(self, response):
        # Read the streamed data
        if '/samantha/chat/completion' in response.url:
            answer = ''
            datas = []
            response_text = ftfy.fix_text(await response.text())
            lines = response_text.split("\n\n")
            for line in lines:
                if line.startswith('data: '):
                    line = line[6:]
                try:
                    data = parse_nested_json(line)
                    datas.append(data)
                    event_data = data.get('event_data', {})
                    target_key = 'message.content.text'
                    text = glom(event_data, target_key, default=None)
                    if text is not None:
                        answer = answer + str(text)
                except JSONDecodeError:
                    continue
            logger.debug(f"AI reply: {answer}")
            self.ai_answer.answer = answer
            self.completed_event.set()

    def get_platform_id(self) -> int:
        return 5

    def get_platform_name(self) -> str:
        return 'DouBao'
@ -0,0 +1,148 @@ |
|||||
|
# coding=utf-8 |
||||
|
import asyncio |
||||
|
from functools import partial, wraps |
||||
|
|
||||
|
from playwright.async_api import Browser |
||||
|
|
||||
|
from abs_spider import AbstractAiSeoSpider |
||||
|
from domain.ai_seo import AiAnswer, AiSearchResult |
||||
|
from utils import create_logger |
||||
|
from glom import glom, Coalesce |
||||
|
|
||||
|
logger = create_logger(__name__) |
||||
|
|
||||
|
|
||||
|
class KimiSpider(AbstractAiSeoSpider): |
||||
|
|
||||
|
def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False): |
||||
|
super().__init__(browser, prompt, keyword, think) |
||||
|
self.__listen_response = self.handle_listen_response_error(self.__listen_response) |
||||
|
|
||||
|
def get_home_url(self) -> str: |
||||
|
return 'https://www.kimi.ai' |
||||
|
|
||||
|
def get_platform_id(self) -> int: |
||||
|
return 4 |
||||
|
|
||||
|
def get_platform_name(self) -> str: |
||||
|
return 'Kimi' |
||||
|
|
||||
|
async def _do_spider(self) -> AiAnswer: |
||||
|
self.completed_event = asyncio.Event() |
||||
|
await self.browser_page.goto('https://www.kimi.ai', timeout=600000) |
||||
|
self.ai_answer = AiAnswer(self.get_platform_id(), self.get_platform_name(), self.prompt, self.keyword) |
||||
|
await asyncio.sleep(3) |
||||
|
if self.think: |
||||
|
think_btn = self.browser_page.locator("span:text('长思考 (k1.5)')").locator('..') |
||||
|
if await think_btn.is_visible(): |
||||
|
clazz = (await think_btn.get_attribute('class')).split(' ') |
||||
|
if 'open' not in clazz: |
||||
|
await think_btn.click() |
||||
|
await asyncio.sleep(2) |
||||
|
chat_input_element = self.browser_page.locator("//div[@class='chat-input']") |
||||
|
await chat_input_element.click() |
||||
|
# 输入提问词 |
||||
|
await self.browser_page.keyboard.type(self.prompt) |
||||
|
await asyncio.sleep(2) |
||||
|
await self.browser_page.keyboard.press('Enter') |
        # Listen for responses
        self.browser_page.on('response', partial(self.__listen_response))
        await self.completed_event.wait()
        await asyncio.sleep(2)

        # Check for recorded failures
        if self.fail_status:
            raise self.fail_exception

        # Collapse the sidebar
        sidebar_element = self.browser_page.locator("//div[@class='expand-btn']")
        if await sidebar_element.is_visible():
            await sidebar_element.click()
        # Locate the answer element
        answer_element = self.browser_page.locator("//div[@class='segment-container']").nth(-1)
        box = await answer_element.bounding_box()
        logger.debug(f'answer_element: {box}')
        view_port_height = box['height'] + 500
        # Resize the viewport to fit the answer
        await self.browser_page.set_viewport_size({
            'width': 1920,
            'height': int(view_port_height)
        })
        # Open the search results panel
        search_list_content_element = self.browser_page.locator("//div[contains(@class, 'side-console-container')]")
        search_list_element = self.browser_page.locator("//div[@class='search-plus']")
        if await search_list_element.is_visible() and not await search_list_content_element.is_visible():
            await search_list_element.click()
        # Take the screenshot
        screenshot_path = self._get_screenshot_path()
        self.ai_answer.screenshot_file = screenshot_path
        await self.browser_page.screenshot(path=screenshot_path)
        return self.ai_answer

    async def __listen_response(self, response):
        if '/segment/scroll' in response.url:
            json_data = await response.json()
            if json_data['items']:
                logger.debug(json_data)
                detail = json_data['items'][-1]
                content = detail['content']
                if self.think:
                    self.ai_answer.search_result = self.get_search_list_enable_think(detail)
                else:
                    self.ai_answer.search_result = self.get_search_list_disable_think(detail)
                self.ai_answer.answer = content
                logger.debug(f"AI answer: {content}")
                self.completed_event.set()

    def handle_listen_response_error(self, func):
        """
        Decorator that handles exceptions raised inside response callbacks.
        :param func:
        :return:
        """

        @wraps(func)
        async def wrapper(*args, **kwargs):
            try:
                return await func(*args, **kwargs)
            except Exception as e:
                logger.error(f"{self.get_platform_name()} response error: {e}", exc_info=True)
                # Mark the failure and record the exception
                self.fail_status = True
                self.fail_exception = e
                self.completed_event.set()

        return wrapper

    def get_search_list_disable_think(self, detail):
        """
        Extract search results when deep thinking is disabled.
        :param detail:
        :return:
        """
        answer_search_list = []
        search_result_list = detail.get('search_plus', [])
        for search_result in search_result_list:
            event = search_result.get('event', '')
            msg = search_result.get('msg', {})
            msg_type = msg.get('type', '')
            if event == 'search_plus' and msg_type == 'get_res':
                answer_search_list.append(
                    AiSearchResult(msg['title'], msg['url'], msg['site_name'], msg['snippet'], msg['date']))
                logger.debug(f"AI reference: {msg['title']}({msg['url']})")
        return answer_search_list

    def get_search_list_enable_think(self, detail):
        """
        Extract search results when deep thinking is enabled.
        :param detail:
        :return:
        """
        answer_search_list = []
        keys = 'contents.zones.0.sections.0.k1.search_results'
        search_result_list = glom(detail, keys, default=[])
        for search_result in search_result_list:
            answer_search_list.append(
                AiSearchResult(search_result['title'], search_result['url'], search_result['site_name'],
                               search_result['snippet'], search_result['date']))
            logger.debug(f"AI reference: {search_result['title']}({search_result['url']})")
        return answer_search_list
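Every spider in this commit follows the same completion pattern: the response callback sets an asyncio.Event when the final payload arrives, and the exception-handling decorator sets the same event on failure so _do_spider never deadlocks on completed_event.wait(). A minimal, self-contained sketch of that pattern (names here are illustrative, not the project's API):

import asyncio
from functools import wraps

def guard(completed: asyncio.Event):
    """Ensure a callback always releases the waiter, even on error."""
    def deco(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            try:
                return await func(*args, **kwargs)
            except Exception as e:
                print(f"callback failed: {e}")
                completed.set()  # unblock the waiter instead of hanging
        return wrapper
    return deco

async def main():
    completed = asyncio.Event()

    @guard(completed)
    async def on_response(payload):
        if payload.get('done'):
            completed.set()

    await on_response({'done': True})   # a successful callback sets the event
    await completed.wait()              # ...so this returns instead of hanging

asyncio.run(main())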
spiders/ai_seo/metaso.py
@@ -0,0 +1,196 @@
# -*- coding: utf-8 -*-
import asyncio
import json
import re
from functools import partial, wraps
from json import JSONDecodeError

import ftfy
import pyperclip
from playwright.async_api import Browser, async_playwright, TimeoutError as PlaywrightTimeoutError

import config
from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger

logger = create_logger(__name__)


class MetasoSpider(AbstractAiSeoSpider):

    def __init__(self, browser: Browser, prompt: str, keyword: str, load_session: bool = True):
        # Pass load_session by keyword: the base __init__ takes `think` as its
        # fourth positional parameter, so a positional call would set the wrong flag.
        super().__init__(browser, prompt, keyword, load_session=load_session)
        self.__listen_response = self.handle_listen_response_error(self.__listen_response)

    def get_home_url(self) -> str:
        return 'https://metaso.cn/'

    async def _do_spider(self) -> AiAnswer:
        # Initialize state
        self._init_data()
        await self.browser_page.goto(self.get_home_url(), timeout=600000)
        await asyncio.sleep(2)
        # Start interacting
        chat_input_element = self.browser_page.locator("//textarea[contains(@class, 'search-consult-textarea')]")
        # Type the prompt
        await chat_input_element.fill(self.prompt)
        await self.browser_page.keyboard.press('Enter')
        # Listen for responses
        await asyncio.sleep(2)
        # self.browser_page.on('response', partial(self.__listen_response))
        await self.browser_page.reload()
        # await self.completed_event.wait()
        # Wait for the copy button to appear
        copy_button = await self.browser_page.wait_for_selector(
            "//button[@id='generateInteractiveReportButton']/preceding-sibling::div[1]/button", timeout=600000)
        # Click the copy button
        await copy_button.click()
        # Read the clipboard
        self.ai_answer.answer = pyperclip.paste()
        logger.debug(f'AI answer: {self.ai_answer}')
        # Collect the source references
        try:
            await self.browser_page.wait_for_selector(
                "//div[contains(@class, 'meta-ordered-list_list-item')]/span", timeout=60000)
            search_items = self.browser_page.locator("//div[contains(@class, 'meta-ordered-list_list-item')]/span")
            search_item_count = await search_items.count()
            logger.debug(f'source count: {search_item_count}')
            await asyncio.sleep(5)
            search_results = []
            for i in range(search_item_count):
                search_result = AiSearchResult()
                search_item = search_items.nth(i)
                # Extract the link and title
                a = search_item.locator("xpath=./a")
                # Extract the publish date
                publish_date_element = search_item.locator("xpath=./span")
                if await a.is_visible():
                    search_result.title = await a.text_content()
                    search_result.url = await a.get_attribute('href')
                if await publish_date_element.count() > 0:
                    publish_date_element = search_item.locator("xpath=./span").nth(-1)
                    publish_str = await publish_date_element.text_content()
                    search_result.publish_time = publish_str.replace('[', '').replace(']', '')
                search_results.append(search_result)
            self.ai_answer.search_result = search_results
        except PlaywrightTimeoutError:
            # The built-in TimeoutError would not catch Playwright's timeout
            logger.error('no search results')
        # Check for recorded failures
        if self.fail_status:
            raise self.fail_exception
        # Locate the answer element
        answer_element = self.browser_page.locator("//div[contains(@class, 'Search_search-result-container')]")
        box = await answer_element.bounding_box()
        logger.debug(f'answer_element: {box}')
        view_port_height = box['height'] + 300
        # Resize the viewport
        await self.browser_page.set_viewport_size({
            'width': 1920,
            'height': int(view_port_height)
        })
        # Take the screenshot
        screenshot_path = self._get_screenshot_path()
        await self.browser_page.screenshot(path=screenshot_path)
        self.ai_answer.screenshot_file = screenshot_path
        return self.ai_answer

    def get_platform_id(self) -> int:
        return 13

    def get_platform_name(self) -> str:
        return 'Metaso'

    async def __listen_response(self, response):
        url = response.url
        if 'searchV2' in url:
            answer = ''
            results = []
            search_results = list()
            response_text = await response.text()
            event_lines = response_text.split('\n\n')
            self.completed_event.set()
            for line in event_lines:
                if line.startswith('data:'):
                    line = line[5:]
                    try:
                        event_json = json.loads(line)
                    except JSONDecodeError:
                        continue
                    event_type = event_json.get('type')
                    # Search results arrived
                    if event_type == 'set-reference':
                        search_results = event_json.get('list', [])
                        # for search_result in search_results:
                        #     result = AiSearchResult(title=search_result.get('title', ''),
                        #                             url=search_result.get('url', ''),
                        #                             host_name=search_result.get('author', ''),
                        #                             body=search_result.get('displaySource'),
                        #                             publish_time=search_result.get('publish_time', ''))
                        #     results.append(result)
                        # self.ai_answer.search_result = results
                    # A chunk of the answer arrived
                    if event_type == 'append-text':
                        answer = answer + event_json.get('text', '')
            # Mark which sources the answer actually cites, e.g. "[3]"
            pattern = r'\[(\d+)\]'
            index_data = list(set(re.findall(pattern, answer)))
            for index, search_result in enumerate(search_results):
                is_referenced = "1" if str(index + 1) in index_data else "0"
                result = AiSearchResult(title=search_result.get('title', ''),
                                        url=search_result.get('url', ''),
                                        host_name=search_result.get('author', ''),
                                        body=search_result.get('displaySource'),
                                        publish_time=search_result.get('publish_time', ''),
                                        is_referenced=is_referenced)
                results.append(result)
            self.ai_answer.search_result = results
            self.ai_answer.answer = answer
            self.completed_event.set()

    def handle_listen_response_error(self, func):
        """
        Decorator that handles exceptions raised inside response callbacks.
        :param func:
        :return:
        """

        @wraps(func)
        async def wrapper(*args, **kwargs):
            try:
                return await func(*args, **kwargs)
            except Exception as e:
                logger.error(f"{self.get_platform_name()} response error: {e}", exc_info=True)
                # Mark the failure and record the exception
                self.fail_status = True
                self.fail_exception = e
                self.completed_event.set()

        return wrapper


async def run():
    # playwright = await async_playwright().start()
    # browser = await playwright.chromium.launch(headless=False,
    #                                            chromium_sandbox=config.BROWSER_ENABLE_SANDBOX,
    #                                            ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS,
    #                                            channel="chrome",
    #                                            args=config.BROWSER_ARGS)
    playwright = await async_playwright().start()
    browser = await playwright.firefox.launch(
        headless=False,
        ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS,
        args=config.BROWSER_ARGS
    )
    spider = MetasoSpider(browser, '2025前端工具库top5', '')
    await spider.run()


if __name__ == '__main__':
    asyncio.run(run())
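One caveat with this file's pyperclip approach: it reads the operating-system clipboard, which only works when the browser and the script share a desktop session, so a headless or remote browser never hands the copied answer to the Python process. A sketch of reading the page's own clipboard through Playwright instead (Chromium-specific, and the selectors and flow are illustrative assumptions):

import asyncio
from playwright.async_api import async_playwright

async def read_page_clipboard():
    playwright = await async_playwright().start()
    browser = await playwright.chromium.launch(headless=True)
    context = await browser.new_context()
    # Grant clipboard access so navigator.clipboard.readText() works
    await context.grant_permissions(['clipboard-read', 'clipboard-write'])
    page = await context.new_page()
    await page.goto('https://example.com')
    # ... click the site's copy button here ...
    text = await page.evaluate('navigator.clipboard.readText()')
    await browser.close()
    return text

print(asyncio.run(read_page_clipboard()))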
spiders/ai_seo/nanometer.py
@@ -0,0 +1,174 @@
# coding=utf-8
import asyncio
import json
import re
from datetime import datetime
from functools import partial, wraps
from json import JSONDecodeError

from glom import glom
from playwright.async_api import Playwright, Browser, async_playwright

from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger, parse_nested_json


logger = create_logger(__name__)


class NanometerSpider(AbstractAiSeoSpider):

    def __init__(self, browser: Browser, prompt: str, keyword: str):
        super().__init__(browser, prompt, keyword)
        self.load_session = False
        self.__listen_response = self.handle_listen_response_error(self.__listen_response)

    def get_home_url(self) -> str:
        return 'https://www.n.cn/'

    async def _do_spider(self) -> AiAnswer:
        # Initialize state
        self._init_data()
        # Start interacting
        await self.browser_page.goto(self.get_home_url(), timeout=600000)
        chat_input_element = self.browser_page.locator("//textarea[@id='composition-input']")
        # Type the prompt
        await chat_input_element.fill(self.prompt)
        await self.browser_page.keyboard.press('Enter')
        # Listen for responses
        self.browser_page.on('response', partial(self.__listen_response))
        await asyncio.sleep(2)
        await self.completed_event.wait()

        # Check for recorded failures
        if self.fail_status:
            raise self.fail_exception

        # Locate the answer element
        answer_element = self.browser_page.locator("//div[@class='js-article-content']").nth(-1)
        box = await answer_element.bounding_box()
        logger.debug(f'answer_element: {box}')
        view_port_height = box['height'] + 500
        # Resize the viewport
        await self.browser_page.set_viewport_size({
            'width': 1920,
            'height': int(view_port_height)
        })
        # Take the screenshot
        screenshot_path = self._get_screenshot_path()
        await self.browser_page.screenshot(path=screenshot_path, full_page=True)
        self.ai_answer.screenshot_file = screenshot_path
        return self.ai_answer

    def __parse_event_data(self, data_str):
        # Split on 'id:' and drop the leading empty chunk
        parts = data_str.strip().split('id:')[1:]

        # Collected events
        result = []

        # Extract the fields of each event into a dict
        for part in parts:
            lines = part.strip().split('\n')
            item = {}
            for line in lines:
                if ':' not in line:
                    key = 'id'
                    value = line
                else:
                    key, value = line.split(':', 1)
                key = key.strip()
                value = value.strip()
                if key == 'data':
                    try:
                        # Try to decode the data field as JSON
                        value = json.loads(value)
                    except JSONDecodeError:
                        pass
                item[key] = value
            result.append(item)
        return result

    async def __listen_response(self, response):
        if '/api/common/chat/v2' not in response.url:
            return
        # Read the streamed body
        stream = await response.body()
        response_text = stream.decode('utf-8')
        datas = self.__parse_event_data(response_text)
        answer = ''
        search_result_list = list()
        # Walk the parsed events
        for data in datas:
            event = data.get('event', '')
            if event == '200':
                answer = answer + str(data.get('data', ''))
            elif event == '102':
                # JSON-formatted payload: parse it
                data = data.get('data', {})
                if isinstance(data, str):
                    data = parse_nested_json(data)
                data_type = data.get('type', '')
                if data_type == 'search_result':
                    search_result_list = glom(data, 'message.list', default=[])
                    # # Save the search results
                    # ai_search_result_list = []
                    # for search_result in search_result_list:
                    #     title = search_result.get('title', '')
                    #     url = search_result.get('url', '')
                    #     body = search_result.get('summary', '')
                    #     host_name = search_result.get('site', '未知')
                    #     publish_time = search_result.get('date', 0)
                    #     ai_search_result_list.append(
                    #         AiSearchResult(title, url, host_name, body, publish_time)
                    #     )
                    #     logger.debug(f"AI reference: [{host_name}]{title}({url})")
                    # self.ai_answer.search_result = ai_search_result_list
        # Mark which sources the answer actually cites, e.g. "[3]"
        pattern = r'\[(\d+)\]'
        index_data = list(set(re.findall(pattern, answer)))
        ai_search_result_list = []
        for index, search_result in enumerate(search_result_list):
            title = search_result.get('title', '')
            url = search_result.get('url', '')
            body = search_result.get('summary', '')
            host_name = search_result.get('site', '未知')
            publish_time = search_result.get('date', 0)
            if str(index + 1) in index_data:
                is_referenced = "1"
            else:
                is_referenced = "0"
            ai_search_result_list.append(
                AiSearchResult(title, url, host_name, body, publish_time, is_referenced)
            )
            logger.debug(f"AI reference: [{host_name}]{title}({url})")
        self.ai_answer.search_result = ai_search_result_list
        self.ai_answer.answer = answer
        logger.debug(f'AI answer: {answer}')
        self.completed_event.set()

    def get_platform_id(self) -> int:
        return 7

    def get_platform_name(self) -> str:
        return 'Nano'

    def handle_listen_response_error(self, func):
        """
        Decorator that handles exceptions raised inside response callbacks.
        :param func:
        :return:
        """

        @wraps(func)
        async def wrapper(*args, **kwargs):
            try:
                return await func(*args, **kwargs)
            except Exception as e:
                logger.error(f"{self.get_platform_name()} response error: {e}", exc_info=True)
                # Mark the failure and record the exception
                self.fail_status = True
                self.fail_exception = e
                self.completed_event.set()

        return wrapper
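The /api/common/chat/v2 stream that this spider consumes is a server-sent-events-style payload of id:/event:/data: lines, which __parse_event_data splits on 'id:'. A quick standalone demo of that parsing logic on a made-up payload (the real field values come from the n.cn stream; note the split would also fire on a literal 'id:' inside a data field):

import json

def parse_event_data(data_str):
    """Mirror of NanometerSpider's private parser, for illustration."""
    events = []
    for part in data_str.strip().split('id:')[1:]:
        item = {}
        for line in part.strip().split('\n'):
            key, sep, value = line.partition(':')
            if not sep:
                key, value = 'id', line
            key, value = key.strip(), value.strip()
            if key == 'data':
                try:
                    value = json.loads(value)
                except json.JSONDecodeError:
                    pass
            item[key] = value
        events.append(item)
    return events

sample = ('id:1\nevent:102\ndata:{"type": "search_result"}\n\n'
          'id:2\nevent:200\ndata:Hello')
print(parse_event_data(sample))
# [{'id': '1', 'event': '102', 'data': {'type': 'search_result'}},
#  {'id': '2', 'event': '200', 'data': 'Hello'}]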
spiders/ai_seo/tongyi.py
@@ -0,0 +1,176 @@
# coding=utf-8
import asyncio
import re
from functools import partial, wraps
from json import JSONDecodeError

from glom import glom
from playwright.async_api import Browser

from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger, parse_nested_json
from utils.image_utils import crop_image_left

logger = create_logger(__name__)


class TongyiSpider(AbstractAiSeoSpider):

    def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False):
        super().__init__(browser, prompt, keyword, think)
        self.__listen_response = self.handle_listen_response_error(self.__listen_response)

    def get_home_url(self) -> str:
        return 'https://tongyi.aliyun.com'

    async def _do_spider(self) -> AiAnswer:
        # Initialize state
        self._init_data()
        await self.browser_page.goto(self.get_home_url(), timeout=600000)
        if self.think:
            search_btn = self.browser_page.locator("div:text('深度思考')")
            if await search_btn.is_visible():
                await search_btn.click()
                await asyncio.sleep(1)
        else:
            search_btn = self.browser_page.locator("div:text('联网搜索')")
            if await search_btn.is_visible():
                await search_btn.click()
                await asyncio.sleep(1)

        # Start interacting
        # chat_input_element = self.browser_page.locator("//textarea[@placeholder='千事不决问通义']")
        chat_input_element = self.browser_page.locator("//textarea[contains(@class, 'ant-input')]")
        await chat_input_element.click()
        # Type the prompt
        await self.browser_page.keyboard.type(self.prompt)
        await asyncio.sleep(2)
        await self.browser_page.keyboard.press('Enter')
        # Listen for responses
        self.browser_page.on('response', partial(self.__listen_response))
        await asyncio.sleep(2)
        await self.completed_event.wait()

        # Check for recorded failures
        if self.fail_status:
            raise self.fail_exception

        # Locate the answer element
        answer_element = self.browser_page.locator("//div[contains(@class, 'answerItem')]").nth(-1)
        box = await answer_element.bounding_box()
        logger.debug(f'answer_element: {box}')
        view_port_height = box['height'] + 500
        # Resize the viewport
        await self.browser_page.set_viewport_size({
            'width': 1920,
            'height': int(view_port_height)
        })
        # Open the search results panel
        search_list_element = self.browser_page.locator("//div[contains(@class, 'linkTitle')]").nth(-1)
        if await search_list_element.is_visible():
            await search_list_element.click()
            await asyncio.sleep(2)
        # Collapse the sidebar
        side_console_element = self.browser_page.locator("//span[contains(@class, 'sc-frniUE')]")
        if await side_console_element.is_visible():
            await side_console_element.click()
        # Take the screenshot
        screenshot_path = self._get_screenshot_path()
        await self.browser_page.screenshot(path=screenshot_path)
        # Crop the sidebar off the image
        crop_image_left(screenshot_path, 340)

        self.ai_answer.screenshot_file = screenshot_path
        return self.ai_answer

    async def __listen_response(self, response):
        if '/dialog/conversation' not in response.url:
            return
        # Read the streamed body
        data = {}
        stream = await response.body()
        response_text = stream.decode('utf-8')
        datas = response_text.split("\n")
        # Keep the last well-formed chunk as a dict
        for data_str in datas:
            if not data_str or data_str == 'data: [DONE]':
                continue
            data_str = data_str.replace('data: ', '')
            try:
                data = parse_nested_json(data_str)
            except JSONDecodeError:
                continue
        logger.debug(f"result: {data}")
        # Read the contents
        contents = data.get('contents', [])
        # Save the search results
        ai_search_result_list = []
        search_result_list = list()
        for content in contents:
            content_type = content.get('contentType', '')
            if content_type == 'plugin':
                logger.debug("got web search results")
                if self.think:
                    search_result_list = glom(content, 'content.pluginResult', default=[])
                else:
                    search_result_list = glom(content, 'content.pluginResult.-1.search_results', default=[])
                # for search_result in search_result_list:
                #     url = search_result.get('url', '')
                #     title = search_result.get('title', '')
                #     body = search_result.get('body', '')
                #     host_name = search_result.get('host_name', '未知')
                #     publish_time = search_result.get('time', 0)
                #     logger.debug(f"AI reference: [{host_name}]{title}({url})")
                #     ai_search_result_list.append(
                #         AiSearchResult(title=title, url=url, body=body, host_name=host_name, publish_time=publish_time)
                #     )
            if content_type == 'text':
                logger.debug('got the AI answer')
                answer = content.get('content', '')
                logger.debug(f"AI answer: {answer}")
                self.ai_answer.answer = answer
        # Mark which sources the answer actually cites, e.g. "[ty-reference](3)"
        pattern = r'ty-reference]\((\d+)\)'
        index_data = list(set(re.findall(pattern, self.ai_answer.answer)))
        for index, search_result in enumerate(search_result_list):
            url = search_result.get('url', '')
            title = search_result.get('title', '')
            body = search_result.get('body', '')
            host_name = search_result.get('host_name', '未知')
            publish_time = search_result.get('time', 0)
            if str(index + 1) in index_data:
                is_referenced = "1"
            else:
                is_referenced = "0"
            logger.debug(f"AI reference: [{host_name}]{title}({url})")
            ai_search_result_list.append(
                AiSearchResult(title=title, url=url, body=body, host_name=host_name,
                               publish_time=publish_time, is_referenced=is_referenced)
            )
        if ai_search_result_list:
            self.ai_answer.search_result = ai_search_result_list
        self.completed_event.set()

    def handle_listen_response_error(self, func):
        """
        Decorator that handles exceptions raised inside response callbacks.
        :param func:
        :return:
        """

        @wraps(func)
        async def wrapper(*args, **kwargs):
            try:
                return await func(*args, **kwargs)
            except Exception as e:
                logger.error(f"{self.get_platform_name()} response error: {e}", exc_info=True)
                # Mark the failure and record the exception
                self.fail_status = True
                self.fail_exception = e
                self.completed_event.set()

        return wrapper

    def get_platform_id(self) -> int:
        return 2

    def get_platform_name(self) -> str:
        return 'TongYi'
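Several spiders share this citation-marking step: pull the citation indices out of the answer text with a regex, then flag each search result whose 1-based position appears in that set. A standalone sketch with made-up data (the marker syntax varies per platform: "[3]" on Metaso and n.cn, "[ty-reference](3)" on TongYi, "[^3]" on YuanBao):

import re

answer = "评测详见[ty-reference](1), 对比数据见[ty-reference](3)。"
sources = ['site-a', 'site-b', 'site-c']

cited = set(re.findall(r'ty-reference]\((\d+)\)', answer))   # {'1', '3'}
flags = ["1" if str(i + 1) in cited else "0" for i, _ in enumerate(sources)]
print(list(zip(sources, flags)))   # [('site-a', '1'), ('site-b', '0'), ('site-c', '1')]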
spiders/ai_seo/yiyan.py
@@ -0,0 +1,213 @@
# coding=utf-8
import asyncio
import json
from functools import partial, wraps

from glom import glom
from playwright.async_api import async_playwright, Browser, TimeoutError as PlaywrightTimeoutError

import config
from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger
from utils.ai_seo_api_utils import AiSeoApis
from utils.image_utils import crop_image_left
from utils.session_utils import get_spider_session

logger = create_logger(__name__)


class YiYanSpider(AbstractAiSeoSpider):

    def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False):
        super().__init__(browser, prompt, keyword, think)
        self.__listen_response = self.handle_listen_response_error(self.__listen_response)

    def get_home_url(self) -> str:
        return 'https://yiyan.baidu.com/'

    async def _do_spider(self) -> AiAnswer:
        # Initialize state
        self._init_data()
        await self.browser_page.goto(self.get_home_url(), timeout=600000)
        await asyncio.sleep(2)
        # Verify the login state
        await self.check_login()
        if self.think:
            think_btn = self.browser_page.locator("span:text('深度思考(X1 Turbo)')").locator('..')
            clazz = (await think_btn.get_attribute('class')).split(' ')
            if await think_btn.is_visible():
                if len(clazz) == 1:
                    await think_btn.click()
                    await asyncio.sleep(2)
        # Start interacting
        chat_input_element = self.browser_page.locator("//div[@class='yc-editor']")
        await chat_input_element.click()
        await asyncio.sleep(2)
        # Type the prompt
        await self.browser_page.keyboard.insert_text(self.prompt)
        await asyncio.sleep(2)
        await self.browser_page.keyboard.press('Enter')
        # Listen for responses
        self.browser_page.on('response', partial(self.__listen_response))
        if self.think:
            self.browser_page.on('response', partial(self.__listen_response_thinking))
        await asyncio.sleep(2)
        try:
            await self.browser_page.wait_for_selector("//div[@data-auto-test='anew_response']",
                                                      state='attached', timeout=600000)
            logger.debug('AI answer finished')
        except PlaywrightTimeoutError:
            # The built-in TimeoutError would not catch Playwright's timeout
            logger.error('AI answer timed out')

        # Check for recorded failures
        if self.fail_status:
            raise self.fail_exception

        # Locate the answer element
        answer_element = self.browser_page.locator("//div[contains(@class, 'dialog-card-wrapper')]").nth(-1)
        box = await answer_element.bounding_box()
        logger.debug(f'answer_element: {box}')
        view_port_height = box['height'] + 1000
        # Resize the viewport
        await self.browser_page.set_viewport_size({
            'width': 1920,
            'height': int(view_port_height)
        })
        # Open the search results panel
        open_search_btn_element = self.browser_page.locator("div:text('条网页信息源')")
        if await open_search_btn_element.count() > 0:
            await open_search_btn_element.click()
        # Take the screenshot
        screenshot_path = self._get_screenshot_path()
        await self.browser_page.screenshot(path=screenshot_path)
        # Crop the sidebar off the image
        crop_image_left(screenshot_path, 260)

        self.ai_answer.screenshot_file = screenshot_path
        return self.ai_answer

    async def __listen_response(self, response):
        if '/chat/history' not in response.url:
            return
        answer = ''
        chat = {}
        json_data = await response.json()
        chats = list(dict.values(glom(json_data, 'data.chats', default={})))
        # Pick the target chat: the first robot message that has content
        for _chat in chats:
            if not _chat.get('role', '') == 'robot':
                continue
            content = glom(_chat, 'message.0.content', default='')
            if not content:
                continue
            chat = _chat
            break
        if not chat:
            return
        answer = glom(chat, 'message.0.content', default="")
        # Search results
        if not self.think:
            search_result_list = glom(chat, 'searchCitations.list', default=[])
            # Save the search results
            ai_search_result_list = []
            for search_result in search_result_list:
                url = search_result.get('url', '')
                title = search_result.get('title', '')
                desc = search_result.get('wild_abstract', '')
                host_name = search_result.get('site', '')
                date = search_result.get('date', '')
                logger.debug(f"AI reference: [{host_name}][{date}]{title}({url})")
                ai_search_result_list.append(AiSearchResult(
                    url=url,
                    title=title,
                    host_name=host_name,
                    body=desc,
                    publish_time=date
                ))
            self.ai_answer.search_result = ai_search_result_list
        self.ai_answer.answer = answer

    async def __listen_response_thinking(self, response):
        if '/chat/conversation/v2' not in response.url:
            return
        # Read the streamed body
        data = {}
        search_list = []
        stream = await response.body()
        response_text = stream.decode('utf-8')
        response_lines = response_text.split("\n\n")
        for line in response_lines:
            sub_lines = line.split("\n")
            for sub_line in sub_lines:
                if sub_line.startswith('data:'):
                    json_data = json.loads(sub_line[5:])
                    history_need = json_data.get('historyNeed', None)
                    search_list = json_data.get('contents', [])
                    if history_need == 1 and search_list:
                        break
            if search_list:
                break
        ai_search_result_list = []
        for search_result in search_list:
            url = search_result.get('url', '')
            title = search_result.get('title', '')
            desc = search_result.get('siteAbstract', '')
            host_name = search_result.get('name', '')
            date = search_result.get('publishTime', '')
            logger.debug(f"AI reference: [{host_name}][{date}]{title}({url})")
            ai_search_result_list.append(AiSearchResult(
                url=url,
                title=title,
                host_name=host_name,
                body=desc,
                publish_time=date
            ))
        self.ai_answer.search_result = ai_search_result_list

    async def check_login(self):
        # Look for the sidebar that only appears after login
        try:
            await self.browser_page.locator("//div[@id='eb_sidebar']").wait_for(state='attached', timeout=20000)
        except Exception:
            # Mark the session as invalid
            await AiSeoApis.update_spider_session(self.session_info['id'], 2)
            raise Exception(f"{self.get_platform_name()} login failed, session_id: {self.session_info['id']}")

    def get_platform_id(self) -> int:
        return 6

    def get_platform_name(self) -> str:
        return 'YiYan'

    def handle_listen_response_error(self, func):
        """
        Decorator that handles exceptions raised inside response callbacks.
        :param func:
        :return:
        """

        @wraps(func)
        async def wrapper(*args, **kwargs):
            try:
                return await func(*args, **kwargs)
            except Exception as e:
                logger.error(f"{self.get_platform_name()} response error: {e}", exc_info=True)
                # Mark the failure and record the exception
                self.fail_status = True
                self.fail_exception = e
                self.completed_event.set()

        return wrapper


async def run():
    playwright = await async_playwright().start()
    browser = await playwright.chromium.launch(headless=config.BROWSER_HANDLESS,
                                               chromium_sandbox=config.BROWSER_ENABLE_SANDBOX,
                                               ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS,
                                               channel="chrome",
                                               args=config.BROWSER_ARGS)
    spider = YiYanSpider(browser, '你好', '')
    await spider.run()


if __name__ == '__main__':
    asyncio.run(run())
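The YiYan listener leans on glom to walk Baidu's nested JSON without chained .get() calls and KeyError guards. A quick standalone illustration of the access pattern used above (the data shape here is made up):

from glom import glom

payload = {'data': {'chats': {'c1': {'role': 'robot',
                                     'message': [{'content': '答案'}]}}}}

chats = list(dict.values(glom(payload, 'data.chats', default={})))
# Integer-looking path segments index into lists: 'message.0.content'
print(glom(chats[0], 'message.0.content', default=''))     # -> 答案
print(glom(chats[0], 'searchCitations.list', default=[]))  # missing path -> []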
spiders/ai_seo/yuanbao.py
@@ -0,0 +1,174 @@
# coding=utf-8
import asyncio
import re
from datetime import datetime
from functools import partial, wraps

from playwright.async_api import Playwright, Browser, async_playwright

from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger
from glom import glom, Coalesce

from utils.image_utils import crop_image_left

logger = create_logger(__name__)


class YuanBaoSpider(AbstractAiSeoSpider):

    def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False):
        super().__init__(browser, prompt, keyword, think)
        self.__listen_response = self.handle_listen_response_error(self.__listen_response)

    def get_home_url(self) -> str:
        return 'https://yuanbao.tencent.com/'

    async def _do_spider(self) -> AiAnswer:
        # Initialize state
        self._init_data()
        self.is_get_detail = False
        await self.browser_page.goto(self.get_home_url(), timeout=600000)
        await asyncio.sleep(2)
        # Enable deep thinking
        if self.think:
            think_button = self.browser_page.locator("//button[@dt-button-id='deep_think']")
            if await think_button.is_visible():
                model_id = await think_button.get_attribute('dt-model-id')
                if not model_id == 'deep_seek':
                    await think_button.click()
                    await asyncio.sleep(2)
        # Enable web search
        search_button = self.browser_page.locator("//button[@dt-button-id='online_search']")
        if await search_button.is_visible():
            class_str = await search_button.get_attribute('class')
            clazz = class_str.split(' ')
            if 'checked' not in clazz:
                logger.debug('web search is off, enabling it')
                await search_button.click()
                await asyncio.sleep(1)

        # Start interacting
        chat_input_element = self.browser_page.locator("//div[contains(@class, 'chat-input-editor')]")
        await chat_input_element.click()
        # Type the prompt
        await self.browser_page.keyboard.type(self.prompt)
        await asyncio.sleep(2)
        await self.browser_page.keyboard.press('Enter')
        # Listen for responses
        self.browser_page.on('response', partial(self.__listen_response))
        await self.completed_event.wait()

        # Check for recorded failures
        if self.fail_status:
            raise self.fail_exception

        # Locate the answer element
        answer_element = self.browser_page.locator("//div[@class='agent-chat__list__item__content']").nth(-1)
        box = await answer_element.bounding_box()
        logger.debug(f'answer_element: {box}')
        view_port_height = box['height'] + 500
        # Resize the viewport
        await self.browser_page.set_viewport_size({
            'width': 1920,
            'height': int(view_port_height)
        })
        # Collapse the sidebar
        # await self.browser_page.locator("//div[@data-desc='fold']").click()
        # Open the web search results panel
        search_list_element = self.browser_page.locator("(//div[contains(@data-title, '资料作为参考')])[1]/span")
        if await search_list_element.is_visible():
            await search_list_element.click()
        # Take the screenshot
        screenshot_path = self._get_screenshot_path()
        await self.browser_page.screenshot(path=screenshot_path)
        # Crop the sidebar off the image
        crop_image_left(screenshot_path, 260)
        self.ai_answer.screenshot_file = screenshot_path
        return self.ai_answer

    async def __listen_response(self, response):
        if '/agent/conversation/v1/detail' not in response.url or self.is_get_detail:
            return
        json_data = await response.json()
        if not json_data['convs']:
            return
        convs = json_data['convs']
        content = {}
        # Find the conversation entry that actually carries the answer
        for conv in convs:
            key = 'speechesV2.0.content'
            content = glom(conv, key, default=[])
            if len(content) > 1:
                break
        # Walk the content items
        search_list = None
        think = None
        text = None
        for item in content:
            if item['type'] == 'text':
                text = item.get('msg', '')
            elif item['type'] == 'searchGuid':
                search_list = item.get('docs', [])
            elif item['type'] == 'think':
                think = item.get('content', '')
        logger.debug(f'AI answer: {text}')
        ai_search_result_list = []
        self.ai_answer.answer = text
        if search_list:
            # Mark which sources the answer actually cites, e.g. "[^3]"
            pattern = r'\[\^(\d+)\]'
            index_data = list(set(re.findall(pattern, self.ai_answer.answer)))
            for index, search_result in enumerate(search_list):
                is_referenced = "1" if str(index + 1) in index_data else "0"
                ai_search_result_list.append(
                    AiSearchResult(
                        title=search_result.get('title', ''),
                        url=search_result.get('url', ''),
                        host_name=search_result.get('web_site_name', ''),
                        body=search_result.get('quote', ''),
                        publish_time=search_result.get('publish_time', 0),
                        is_referenced=is_referenced
                    )
                )
            logger.debug(f'AI references: {search_list}')
        self.ai_answer.search_result = ai_search_result_list
        self.is_get_detail = True
        self.completed_event.set()

    def handle_listen_response_error(self, func):
        """
        Decorator that handles exceptions raised inside response callbacks.
        :param func:
        :return:
        """

        @wraps(func)
        async def wrapper(*args, **kwargs):
            try:
                return await func(*args, **kwargs)
            except Exception as e:
                logger.error(f"{self.get_platform_name()} response error: {e}", exc_info=True)
                # Mark the failure and record the exception
                self.fail_status = True
                self.fail_exception = e
                self.completed_event.set()

        return wrapper

    def get_platform_id(self) -> int:
        return 3

    def get_platform_name(self) -> str:
        return 'YuanBao'
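Every spider finishes with the same screenshot trick: measure the answer node with bounding_box(), stretch the viewport to that height, then take a normal (non-full-page) screenshot. This sidesteps chat UIs that virtualize off-screen messages, where full_page=True can capture blank space. A condensed sketch of that shared step (the selector and padding are illustrative):

from playwright.async_api import Page

async def screenshot_fit_to_element(page: Page, xpath: str, path: str, padding: int = 500):
    """Grow the viewport to the target element's height, then screenshot."""
    element = page.locator(xpath).nth(-1)
    box = await element.bounding_box()   # None if the element is not rendered
    if box is None:
        raise RuntimeError('answer element not visible')
    await page.set_viewport_size({'width': 1920, 'height': int(box['height'] + padding)})
    await page.screenshot(path=path)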
static/stealth.min.js
File diff suppressed because it is too large
utils/__init__.py
@@ -0,0 +1,111 @@
# coding=utf-8
import hashlib
import os
from pathlib import Path
import re
from .logger_utils import create_logger
import json
from typing import Any, Union
from datetime import datetime


def parse_nested_json(
        json_str: str,
        default: Any = None,
        recursive: bool = True
) -> Union[dict, list, Any]:
    """
    Parse a JSON string whose values may themselves be JSON-encoded strings.

    :param json_str: the JSON string to parse
    :param default: value returned when parsing fails
    :param recursive: whether to recursively parse nested JSON strings
    :return: the parsed Python object
    """

    def _parse(obj: Any) -> Any:
        # Recursively parse nested structures
        if isinstance(obj, dict):
            return {k: _parse(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [_parse(elem) for elem in obj]
        elif recursive and isinstance(obj, str):
            try:
                parsed = json.loads(obj)
                return _parse(parsed)  # recurse into the newly decoded object
            except json.JSONDecodeError:
                return obj
        else:
            return obj

    # Handle empty input
    if not json_str:
        return default if default is not None else {}

    try:
        # First pass: decode the outer JSON
        parsed = json.loads(json_str)
        # Then recursively decode nested structures
        return _parse(parsed)
    except (TypeError, json.JSONDecodeError):
        # Decoding and type errors fall back to the default
        return default if default is not None else {}
    except Exception:
        # Any other error also falls back to the default
        return default if default is not None else {}


def convert_timestamp(timestamp):
    """
    Detect whether a timestamp is in seconds or milliseconds and
    convert it to a datetime object.

    Args:
        timestamp: the timestamp (int or float)

    Returns:
        a datetime object
    """
    # Anything above 1e10 is treated as a millisecond timestamp
    if timestamp > 1e10:
        timestamp /= 1000.0  # convert to seconds
    return datetime.fromtimestamp(timestamp)


def make_sha256_hash(file_path: str) -> str:
    """
    Compute the SHA-256 hash of a file.
    :param file_path: path of the file
    :return: the hex digest
    """
    hash_obj = hashlib.new("sha256")
    with open(file_path, "rb") as f:
        while chunk := f.read(8192):  # read in chunks to avoid loading large files into memory
            hash_obj.update(chunk)
    return hash_obj.hexdigest()


def css_to_dict(css_line):
    """
    Convert a single line of CSS variable declarations into a Python dict.

    Args:
        css_line (str): a string of CSS declarations, e.g.:
            '--ds-button-color: #fff; -button-text-color: #4c4c4c; button-border-color: rgba(0, 0, 0, 0.12);'

    Returns:
        dict: a mapping of CSS variable names to values
    """
    # Match every declaration: a name with 0-2 leading dashes, a colon, then the value
    pattern = r'(-{0,2}[a-zA-Z0-9-]+)\s*:\s*([^;]+);?'
    matches = re.findall(pattern, css_line)

    # Build the dict, preserving the original dashes in the names
    result = {}
    for match in matches:
        var_name = match[0]
        var_value = match[1].strip()
        result[var_name] = var_value

    return result
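The spiders reach for parse_nested_json because several platforms double-encode their stream payloads: the SSE data field is a JSON string whose own fields contain more JSON strings. A quick usage example (the input here is made up, and it assumes the utils package is importable):

from utils import parse_nested_json

raw = '{"type": "search_result", "message": "{\\"list\\": [{\\"title\\": \\"t1\\"}]}"}'
data = parse_nested_json(raw)
# The inner JSON string has been decoded too:
print(data['message']['list'][0]['title'])   # -> t1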
utils/ai.py
@@ -0,0 +1,114 @@
# coding=utf-8
import asyncio
import json

from openai import OpenAI

from utils import create_logger
from utils.ai_seo_api_utils import AiSeoApis

import config


logger = create_logger(platform="ai")
client = OpenAI(api_key=config.OPENAI_API_KEY, base_url="https://api.deepseek.com")


async def main():
    results = await AiSeoApis.get_task_result_list(project_id=3)
    for result in results:
        if result['read_rank_status'] == 1:
            logger.info(f"[{result['id']}] rank already read")
            continue
        prompt = f"""
任务: 请在以下文本中, 按出现的顺序提取出品牌词, 多次出现的品牌词仅提取一次, 返回json数组
返回格式: json中包含brands字段, 字段的值为数组, 数组内容是按顺序提取的品牌词

文本正文:
{result['content']}
"""
        response = client.chat.completions.create(
            model="deepseek-chat",
            response_format={
                'type': 'json_object'
            },
            messages=[
                {"role": "system", "content": "You are a helpful assistant"},
                {"role": "user", "content": prompt},
            ],
            stream=False
        )
        # JSON returned by the model
        ai_json_result = {}
        # Rank derived from the model output
        rank = 0
        # Status of the rank-reading step
        read_rank_status = 2
        try:
            ai_json_result = json.loads(response.choices[0].message.content)
            read_rank_status = 1
        except Exception as e:
            logger.error(f"[{result['id']}] failed to parse the model's JSON: {e}")
            logger.error(f"prompt: {prompt}")
            continue
        # Determine the rank
        brands = ai_json_result.get('brands', [])
        index = 1
        for brand in brands:
            if '沙利文' in brand:
                rank = index
                logger.info(f"[{result['id']}] brand mentioned, rank: {rank}")
                break
            index = index + 1
        # Update the rank
        update_result = await AiSeoApis.update_result_rank(result['id'], rank, read_rank_status)
        logger.info(f"[{result['id']}] rank update result: {update_result}")


async def read_rank(content, brand_word):
    # Split the comma-separated brand words
    brand_words = brand_word.split(',')
    prompt = f"""
任务: 请在以下文本中, 按出现的顺序提取出品牌词, 多次出现的品牌词仅提取一次, 返回json数组
返回格式: json中包含brands字段, 字段的值为数组, 数组内容是按顺序提取的品牌词

文本正文:
{content}
"""
    response = client.chat.completions.create(
        model="deepseek-chat",
        response_format={
            'type': 'json_object'
        },
        messages=[
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": prompt},
        ],
        stream=False
    )
    # JSON returned by the model
    ai_json_result = {}
    # Rank derived from the model output
    rank = 0
    # Status of the rank-reading step
    read_rank_status = 2
    try:
        ai_json_result = json.loads(response.choices[0].message.content)
        read_rank_status = 1
    except Exception as e:
        logger.error(f"failed to parse the model's JSON: {e}")
        logger.error(f"prompt: {prompt}")
        return [], 0
    # Determine the rank
    brands = ai_json_result.get('brands', [])
    index = 1
    for brand in brands:
        found = any(sub in brand for sub in brand_words)
        if found:
            rank = index
            logger.info(f"brand mentioned, rank: {rank}")
            break
        index = index + 1
    return brands, rank


if __name__ == '__main__':
    asyncio.run(main())
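One caveat in this module: client.chat.completions.create is a blocking call, so inside an async def it stalls the event loop for the entire model round-trip. A minimal sketch of offloading it with asyncio.to_thread (functionally equivalent, just non-blocking; switching to openai's AsyncOpenAI client would be the other option):

import asyncio

async def create_completion(client, **kwargs):
    # Run the blocking SDK call in a worker thread so other
    # coroutines (heartbeats, spiders) keep running meanwhile.
    return await asyncio.to_thread(client.chat.completions.create, **kwargs)

# usage inside read_rank():
#   response = await create_completion(client, model="deepseek-chat", messages=[...])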
utils/ai_seo_api_utils.py
@@ -0,0 +1,275 @@
# coding=utf-8
import datetime
import json
import os

import httpx

import config
from utils import create_logger

logger = create_logger(__name__)


class AiSeoApis:
    @staticmethod
    def build_full_url(uri):
        return f"{config.AI_SEO_BASE_URL}{uri}"

    @staticmethod
    async def get_one_task(date='', platform_ids=''):
        """
        Fetch one task.
        :return:
        """
        uri = '/api/third/getTask'

        url = AiSeoApis.build_full_url(uri)
        params = {**config.AI_SEO_API_AUTH}
        if date:
            params['date'] = date
        if platform_ids:
            params['platform_ids'] = platform_ids
        async with httpx.AsyncClient() as client:
            response = await client.get(url, params=params, timeout=60)
            json_result = response.json()
            if not json_result['code'] == 0:
                logger.error(f"failed to fetch task: {json_result['msg']}")
            return json_result['data']

    @staticmethod
    async def get_urgent_task_count():
        """
        Fetch the number of urgent tasks.
        :return:
        """
        uri = '/api/frontend/thirdParty/getUrgentTaskCount'
        url = AiSeoApis.build_full_url(uri)
        params = {**config.AI_SEO_API_AUTH}
        async with httpx.AsyncClient() as client:
            response = await client.get(url, params=params, timeout=60)
            json_result = response.json()
            if not json_result['code'] == 0:
                logger.error(f"failed to fetch urgent task count: {json_result['msg']}")
            return json_result['data']

    @staticmethod
    async def upload_screenshot_file(file_path):
        """
        Upload a screenshot file.
        :param file_path:
        :return:
        """
        uri = '/api/third/oss/upload'
        url = AiSeoApis.build_full_url(uri)
        params = {
            **config.AI_SEO_API_AUTH,
            'oss_path': 'ai_seo/screenshot'
        }

        with open(file_path, 'rb') as file:
            async with httpx.AsyncClient() as client:
                files = {'file': (file_path, file, 'image/jpeg')}
                response = await client.post(url, params=params, files=files, timeout=60)
                json_result = response.json()
                if not json_result['code'] == 0:
                    logger.error(f"failed to upload screenshot: {json_result['msg']}")
                return json_result['data']

    @staticmethod
    async def submit_task(json_data):
        """
        Submit a task result.
        :param json_data:
        :return:
        """
        uri = '/api/third/submitProjectTask'
        url = AiSeoApis.build_full_url(uri)
        async with httpx.AsyncClient() as client:
            logger.debug(f"json_data: {json.dumps(json_data)}")
            response = await client.post(url, json=json_data, timeout=120)
            json_result = response.json()
            if not json_result['code'] == 0:
                logger.error(f"failed to submit task: {json_result['msg']}")
            return json_result['data']

    @staticmethod
    async def get_task_result_list(project_id):
        """
        Fetch the task result list.
        :return:
        """
        uri = '/api/frontend/thirdParty/projectResult/list'
        url = AiSeoApis.build_full_url(uri)
        params = {**config.AI_SEO_API_AUTH, 'project_id': project_id}
        async with httpx.AsyncClient() as client:
            response = await client.get(url, params=params)
            json_result = response.json()
            if not json_result['code'] == 0:
                logger.error(f"failed to fetch task results: {json_result['msg']}")
            return json_result['data']

    @staticmethod
    async def update_result_rank(result_id, rank, read_rank_status):
        """
        Update a task result's rank.
        :return:
        """
        uri = '/api/frontend/thirdParty/projectResult/updateRank'
        url = AiSeoApis.build_full_url(uri)
        json_data = {**config.AI_SEO_API_AUTH, 'id': result_id, 'rank': rank, 'read_rank_status': read_rank_status}
        async with httpx.AsyncClient() as client:
            response = await client.post(url, json=json_data)
            json_result = response.json()
            if not json_result['code'] == 0:
                logger.error(f"failed to update rank: {json_result['msg']}")
            return json_result['data']

    @staticmethod
    async def update_task_status(task_id, status):
        """
        Update a task's status.
        :param task_id:
        :param status:
        :return:
        """
        uri = '/api/third/updateTask'
        url = AiSeoApis.build_full_url(uri)
        json_data = {**config.AI_SEO_API_AUTH, 'task_id': task_id, 'status': status}
        async with httpx.AsyncClient() as client:
            response = await client.post(url, json=json_data, timeout=60)
            json_result = response.json()
            if not json_result['code'] == 0:
                logger.error(f"failed to update task: {json_result['msg']}")
                return None
            return json_result['data']

    @staticmethod
    async def heartbeat(dc_id, load_count=0):
        """
        Send a heartbeat.
        :param dc_id:
        :param load_count:
        :return:
        """
        uri = '/api/frontend/thirdParty/spider/heartbeat'
        url = AiSeoApis.build_full_url(uri)
        send_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        json_data = {
            **config.AI_SEO_API_AUTH,
            'dc_id': dc_id,
            'load_count': load_count,
            'send_time': send_time
        }
        async with httpx.AsyncClient() as client:
            response = await client.post(url, json=json_data, timeout=60)
            json_result = response.json()
            if not json_result['code'] == 0:
                logger.error(f"heartbeat failed: {json_result['msg']}")
                return None
            return json_result['data']

    @staticmethod
    async def get_spider_session(platform_id):
        """
        Fetch a spider session.
        :param platform_id:
        :return:
        """
        uri = '/api/third/getOneSpiderSession'
        url = AiSeoApis.build_full_url(uri)
        json_data = {**config.AI_SEO_API_AUTH, 'platform_id': platform_id}
        async with httpx.AsyncClient() as client:
            response = await client.get(url, params=json_data, timeout=60)
            json_result = response.json()
            if not json_result['code'] == 0:
                logger.error("failed to fetch spider session")
                return None
            return json_result['data']

    @staticmethod
    async def download_spider_session_file(url, path):
        """
        Download a spider session file.
        :param url:
        :param path:
        :return:
        """
        # Make sure the target directory exists
        dir_path = os.path.dirname(path)

        os.makedirs(dir_path, exist_ok=True)

        async with httpx.AsyncClient(verify=False) as client:
            response = await client.get(url, follow_redirects=True)
            with open(path, 'wb') as file:
                file.write(response.content)

    @staticmethod
    async def update_spider_session(session_id, status=1):
        """
        Update a spider session's status.
        :param session_id:
        :param status:
        :return:
        """
        uri = '/api/frontend/thirdParty/spider/session/update'
        url = AiSeoApis.build_full_url(uri)
        json_data = {**config.AI_SEO_API_AUTH, 'id': session_id, 'status': status}
        async with httpx.AsyncClient() as client:
            response = await client.post(url, json=json_data, timeout=60)
            json_result = response.json()
            if not json_result['code'] == 0:
                logger.error("failed to update spider session")
                return None
            return json_result['data']

    @staticmethod
    async def upload_session_file(file_path):
        """
        Upload a session file.
        :param file_path:
        :return:
        """
        uri = '/api/frontend/thirdParty/oss/upload'
        url = AiSeoApis.build_full_url(uri)
        params = {
            **config.AI_SEO_API_AUTH,
            'oss_path': 'ai_seo/session'
        }
        with open(file_path, 'rb') as file:
            async with httpx.AsyncClient() as client:
                files = {'file': (file_path, file, 'application/json')}
                response = await client.post(url, params=params, files=files, timeout=60)
                json_result = response.json()
                if not json_result['code'] == 0:
                    logger.error(f"failed to upload session file: {json_result['msg']}")
                return json_result['data']

    @staticmethod
    async def save_spider_session(platform_id, file_url, file_hash, account=''):
        """
        Register a new spider session.
        :param platform_id:
        :param file_url:
        :param file_hash:
        :param account:
        :return:
        """
        uri = '/api/frontend/thirdParty/spider/session/save'
        url = AiSeoApis.build_full_url(uri)
        json_data = {
            **config.AI_SEO_API_AUTH,
            'platform_id': platform_id,
            'account': account,
            'url': file_url,
            'hash': file_hash
        }
        async with httpx.AsyncClient() as client:
            response = await client.post(url, json=json_data, timeout=120)
            json_result = response.json()
            if not json_result['code'] == 0:
                logger.error(f"failed to save session: {json_result['msg']}")
            return json_result['data']
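Nearly every method here repeats the same sequence: build the URL, merge config.AI_SEO_API_AUTH, send the request, check code == 0, log msg on failure, return data. If the class grows, a shared helper would keep that policy in one place. A sketch of such a helper (not in the current code, purely illustrative):

import httpx

async def _call(method: str, url: str, *, ok_code: int = 0, **kwargs):
    """One-stop request wrapper: send, decode, check the API's code field."""
    async with httpx.AsyncClient() as client:
        response = await client.request(method, url, timeout=60, **kwargs)
        body = response.json()
        if body.get('code') != ok_code:
            raise RuntimeError(f"API error: {body.get('msg')}")
        return body.get('data')

# e.g. get_one_task would reduce to:
#   return await _call('GET', AiSeoApis.build_full_url('/api/third/getTask'), params=params)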
utils/image_utils.py
@@ -0,0 +1,43 @@
# coding=utf-8
from PIL import Image
import os

from utils import create_logger

logger = create_logger(__name__)


def crop_image_left(image_path, crop_width):
    """
    Cut the given width off the left side of an image and overwrite the original file.

    Args:
        image_path (str): path of the image file
        crop_width (int): width to cut off, in pixels
    """
    try:
        # Open the original image
        with Image.open(image_path) as img:
            # Read the image size
            width, height = img.size

            # Validate the crop width
            if crop_width <= 0 or crop_width >= width:
                raise ValueError(f"crop width must be greater than 0 and smaller than the image width ({width}px)")

            # Compute the crop box (left, upper, right, lower)
            crop_box = (crop_width, 0, width, height)

            # Crop
            cropped_img = img.crop(crop_box)

            # Save the cropped image to a temporary file first
            temp_path = image_path + ".png"
            cropped_img.save(temp_path, quality=95)
            # Then replace the original file
            os.replace(temp_path, image_path)
            logger.info(f"cropped {crop_width}px off the left side and overwrote the original image")
    except Exception as e:
        logger.error(f"failed to process image: {e}")
        if 'temp_path' in locals() and os.path.exists(temp_path):
            os.remove(temp_path)
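Usage mirrors how the spiders call it right after page.screenshot(): cut the fixed-width navigation column off so only the conversation remains. For example (the path is illustrative):

from utils.image_utils import crop_image_left

# TongYi's sidebar is about 340px wide; YiYan's and YuanBao's about 260px.
crop_image_left('data/screenshot/tongyi_demo.png', 340)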
utils/logger_utils.py
@@ -0,0 +1,48 @@
# coding=utf-8
import sys

from loguru import logger
import os
from datetime import datetime

import config


def create_logger(platform: str):
    """
    Configure and return a logger.
    :param platform: platform name shown in every log line
    """
    # Pick the log directory, grouped by month
    current_month = datetime.now().strftime("%Y-%m")
    log_dir = os.path.join("logs", current_month)
    os.makedirs(log_dir, exist_ok=True)
    # Name the log file after the current day of the month
    current_day = datetime.now().strftime("%d")
    log_file = os.path.join(log_dir, f"{current_day}.log")

    logger.remove()
    # File sink
    logger.add(
        log_file,
        level=config.LOG_LEVEL,
        rotation="1 day",      # rotate the log file daily
        retention="3 months",  # keep logs for 3 months
        encoding="utf-8",
        enqueue=True,          # asynchronous writes
        backtrace=True,        # include backtraces
        diagnose=True,         # include diagnostic info
        format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level}</level> | "
               "<cyan>{extra[platform]}</cyan> | <level>{message}</level>"
    )
    # Console sink
    logger.add(
        sys.stderr,
        level=config.LOG_LEVEL,
        format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level}</level> | "
               "<cyan>{extra[platform]}</cyan> | <level>{message}</level>",
        colorize=True
    )
    # Bind the default context
    return logger.bind(platform=platform)
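Worth noting: loguru's logger is a process-wide singleton, so every create_logger() call tears down all existing sinks with logger.remove() and re-adds them. With many spiders each calling it, the sinks are simply rebuilt with the same settings, but any sink added elsewhere is lost. A common guard is to configure sinks only once (a sketch, not the current code):

import sys
from loguru import logger

_configured = False

def create_logger(platform: str):
    """Configure sinks once, then just bind per-caller context."""
    global _configured
    if not _configured:
        logger.remove()
        logger.add(sys.stderr, format="{time:YYYY-MM-DD HH:mm:ss} | {level} | "
                                      "{extra[platform]} | {message}")
        _configured = True
    return logger.bind(platform=platform)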
utils/session_utils.py
@@ -0,0 +1,42 @@
# coding=utf-8
from pathlib import Path

import config
from utils.ai_seo_api_utils import AiSeoApis


async def get_spider_session(platform_id):
    """
    Fetch a usable spider session.
    :param platform_id:
    :return:
    """
    base_path = f'{config.ROOT_PATH}/data/session_data'
    # Session metadata from the API
    session_info = await AiSeoApis.get_spider_session(platform_id)
    if not session_info:
        raise Exception(f"platform id {platform_id}: no usable spider session")
    # Look for the session file locally by its id
    target = search_session_file(session_info['id'], base_path)
    # If it is not there yet, download and save it
    if not target:
        await AiSeoApis.download_spider_session_file(session_info['url'], f"{base_path}/{session_info['id']}.json")
        target = f"{session_info['id']}.json"
    else:
        target = target[0]
    session_info['session_path'] = f"{base_path}/{target}"
    return session_info


def search_session_file(session_id, path):
    folder_path = Path(path)
    file_filter = f"{session_id}.json"
    return [file.name for file in folder_path.glob(file_filter)]


async def main():
    path = await get_spider_session(1)
    print(path)


if __name__ == '__main__':
    import asyncio
    asyncio.run(main())