commit 2d51402f2d
42 changed files with 3102 additions and 0 deletions
+135  abs_spider.py
+76   config.py
+92   config.py.example
+1    data/session/deepseek.json
+1    data/session/doubao.json
+1    data/session/kimi.json
+1    data/session/metaso.json
+1    data/session/tongyi.json
+1    data/session/yiyan.json
+1    data/session/yuanbao.json
+1    data/session_data/deepseek.json
+1    data/session_data/doubao.json
+1    data/session_data/kimi.json
+1    data/session_data/metaso.json
+1    data/session_data/tongyi.json
+1    data/session_data/yiyan.json
+1    data/session_data/yuanbao.json
+1    domain/__init__.py
+68   domain/ai_seo.py
+100  login.py
+85   main.py
+12   requirements.txt
+78   resave.py
+238  run.py
+119  run_deepseek.py
+1    spiders/__init__.py
+9    spiders/ai_seo/__init__.py
+189  spiders/ai_seo/deepseek.py
+164  spiders/ai_seo/doubao.py
+148  spiders/ai_seo/kimi.py
+196  spiders/ai_seo/metaso.py
+174  spiders/ai_seo/nanometer.py
+176  spiders/ai_seo/tongyi.py
+213  spiders/ai_seo/yiyan.py
+174  spiders/ai_seo/yuanbao.py
+7    static/stealth.min.js
+111  utils/__init__.py
+114  utils/ai.py
+275  utils/ai_seo_api_utils.py
+43   utils/image_utils.py
+48   utils/logger_utils.py
+42   utils/session_utils.py
abs_spider.py
@@ -0,0 +1,135 @@
# coding=utf-8
import asyncio
import uuid
from abc import ABC, abstractmethod
from asyncio import Event

from playwright.async_api import Browser, BrowserContext, Page

import config
from domain.ai_seo import AiAnswer
from utils import create_logger
from utils.session_utils import get_spider_session

logger = create_logger("abs_spider")


class AbstractAiSeoSpider(ABC):
    browser: Browser
    browser_content: BrowserContext
    browser_page: Page
    platform_id: int
    platform_name: str
    prompt: str
    keyword: str
    completed_event: Event | None = None
    ai_answer: AiAnswer | None = None
    fail_status: bool = False
    fail_exception: Exception | None = None
    load_session: bool = True
    session_info: dict | None = None
    task_id: int = 0
    think: bool = False

    def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False, load_session: bool = True):
        self.browser = browser
        self.platform_id = self.get_platform_id()
        self.platform_name = self.get_platform_name()
        self.prompt = prompt
        self.keyword = keyword
        self.load_session = load_session
        self.think = think

    def _init_data(self):
        self.completed_event = asyncio.Event()
        self.ai_answer = AiAnswer(self.get_platform_id(), self.get_platform_name(), self.prompt, self.keyword)
        self.index_data = None

    def _get_session_path(self):
        sessions = {
            1: "deepseek",
            5: "doubao",
            4: "kimi",
            2: "tongyi",
            6: "yiyan",
            3: "yuanbao"
        }
        # TODO: support managing multiple sessions
        session_path = f"./data/session/{sessions.get(self.platform_id, 'deepseek')}.json"
        return session_path

    def _get_screenshot_path(self):
        unique_id = str(uuid.uuid4()).replace('-', '')
        screenshot_path = f'{config.SCREENSHOT_BASE_PATH}/{self.platform_name}_{unique_id}.png'
        return screenshot_path

    async def __init_page(self):
        if self.load_session:
            self.session_info = await get_spider_session(self.platform_id)
            self.browser_content = await self.browser.new_context(storage_state=self.session_info['session_path'])
        else:
            self.browser_content = await self.browser.new_context()
        self.browser_page = await self.browser_content.new_page()
        await self.browser_page.set_viewport_size(config.PAGE_INIT_VIEWPORT_SIZE)
        # Load anti-detection scripts
        await self.browser_page.add_init_script("""
        Object.defineProperties(navigator, {webdriver:{get:()=>false}});
        """)
        # add_init_script treats a positional string as script source,
        # so the stealth script file must be passed via the path keyword
        await self.browser_page.add_init_script(path='static/stealth.min.js')

    async def _close(self):
        await self.browser_page.close()
        await self.browser_content.close()

    async def _login(self):
        """
        Log in manually and persist the session state.
        :return:
        """
        await self.__init_page()
        await self.browser_page.goto(self.get_home_url())
        unique_id = str(uuid.uuid4()).replace('-', '')
        session_path = f"./data/session/{self.get_platform_name()}/{unique_id}.json"
        input("请手动登录后按回车继续...")
        await self.browser_content.storage_state(path=session_path)
        logger.info(f"[{self.platform_name}]登录成功: {session_path}")
        await self._close()

    async def run(self) -> AiAnswer | None:
        """
        Run the spider.
        :return:
        """
        try:
            await self.__init_page()
            logger.info(f"{self.platform_name}爬虫开始运行 提问词: {self.prompt}")
            return await self._do_spider()
        except Exception as e:
            logger.error(f"{self.platform_name}爬虫运行异常 参数: {self.prompt, self.keyword}")
            logger.error(f"异常信息: {str(e)}")
            raise e
        finally:
            await self._close()

    @abstractmethod
    async def _do_spider(self) -> AiAnswer:
        """
        Platform-specific crawling logic.
        :return:
        """
        pass

    @abstractmethod
    def get_platform_id(self) -> int:
        pass

    @abstractmethod
    def get_platform_name(self) -> str:
        pass

    @abstractmethod
    def get_home_url(self) -> str:
        pass
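For orientation, the following is a minimal sketch of what a concrete subclass has to provide. The ExampleSpider name, the platform id 99, and the URL are hypothetical and not part of this commit; only the four abstract methods and the _init_data/run contract come from abs_spider.py above.

# Hypothetical minimal subclass of AbstractAiSeoSpider (illustration only).
from playwright.async_api import Browser

from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer


class ExampleSpider(AbstractAiSeoSpider):

    def get_platform_id(self) -> int:
        return 99  # hypothetical platform id

    def get_platform_name(self) -> str:
        return 'Example'

    def get_home_url(self) -> str:
        return 'https://example.com/chat'  # placeholder URL

    async def _do_spider(self) -> AiAnswer:
        # run() has already opened self.browser_page with the stored session;
        # _init_data() creates completed_event and the AiAnswer container.
        self._init_data()
        await self.browser_page.goto(self.get_home_url())
        # ...drive the page, collect the reply and search results...
        self.ai_answer.answer = '...'
        return self.ai_answer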
config.py
@@ -0,0 +1,76 @@
# coding=utf-8

LOG_LEVEL = 'DEBUG'
DC_ID = 'dev-01'
ROOT_PATH = r'C:\Users\Administrator\Desktop\spider_ai_seo'

SCREENSHOT_BASE_PATH = 'screenshot'
BROWSER_HANDLESS = False
BROWSER_ENABLE_SANDBOX = False
BROWSER_IGNORE_DEFAULT_ARGS = ["enable-automation"]
# Chrome expects a comma-separated --window-size value
BROWSER_ARGS = ["--start-maximized", "--window-size=1920,1080"]

PAGE_INIT_VIEWPORT_SIZE = {
    'width': 1920,
    'height': 1080
}

AI_SEO_BASE_URL = 'https://geo-api.neicela.com'
AI_SEO_API_AUTH = {
    'app_id': 'aa65700299848d6f21b969dbc9f6cf7c',
    'secret': '5588071d36f0bc61af849c311a03f2c4'
}
OPENAI_API_KEY = 'sk-d0107243adbb43f482cdb14d694b434f'

AI_SEO_JOB_RANGE = {
    'start_time': '00:10',
    'end_time': '23:59'
}

# Whether the AI SEO job is enabled
AI_SEO_JOB_ENABLE = True
# AI SEO job run interval (seconds)
AI_SEO_JOB_INTERVAL = 5
# Platform ids the AI SEO job fetches tasks for
AI_SEO_JOB_PLATFORM_IDS = ['2', '3', '4', '5', '7', '13']
# Maximum concurrent AI SEO job instances
AI_SEO_JOB_MAX_INSTANCES = 2


DEEPSEEK_SEO_JOB_RANGE = {
    'start_time': '00:10',
    'end_time': '23:59'
}
# Whether the DeepSeek job is enabled
DEEPSEEK_JOB_ENABLE = True
# DeepSeek job fetch interval (seconds)
DEEPSEEK_JOB_INTERVAL = 30
# Platform ids the DeepSeek job fetches tasks for
DEEPSEEK_JOB_PLATFORM_IDS = ['1']
# Maximum concurrent DeepSeek job instances
DEEPSEEK_JOB_MAX_INSTANCES = 1
# Test prompts
TEST_KEYWORDS = [
    '正典燕窝',
    '燕窝供货商',
    '燕窝供应链平台',
    '燕窝加盟',
    '燕窝加盟选哪个品牌好',
]

# Test platforms
TEST_PLATFORM = [6]
# TEST_PLATFORM = []
# Test interval (seconds)
TEST_INTERVAL = 10

RESAVE_CONFIG = {
    'platform_ids': '1',
    'dates': ['2025-04-16']
}

# Redis configuration for the task queue
ARQ_REDIS_HOST = 'localhost'
ARQ_REDIS_PORT = 6379
ARQ_REDIS_DB = 5
ARQ_REDIS_PASSWORD = None
config.py.example
@@ -0,0 +1,92 @@
# coding=utf-8

LOG_LEVEL = 'INFO'
DC_ID = 'dev-01'

SCREENSHOT_BASE_PATH = 'screenshot'
BROWSER_HANDLESS = False
BROWSER_ENABLE_SANDBOX = False
BROWSER_IGNORE_DEFAULT_ARGS = ["enable-automation"]
# Chrome expects a comma-separated --window-size value
BROWSER_ARGS = ["--start-maximized", "--window-size=1920,1080"]

PAGE_INIT_VIEWPORT_SIZE = {
    'width': 1920,
    'height': 1080
}

AI_SEO_BASE_URL = 'https://aiseo-api.neicela.com'
AI_SEO_API_AUTH = {
    'app_id': 'aa65700299848d6f21b969dbc9f6cf7c',
    'secret': '5588071d36f0bc61af849c311a03f2c4'
}
OPENAI_API_KEY = 'sk-d0107243adbb43f482cdb14d694b434f'

AI_SEO_JOB_DATE = '2025-04-21'
# AI SEO job run interval (seconds)
AI_SEO_JOB_INTERVAL = 20
# Platform ids the AI SEO job fetches tasks for
AI_SEO_JOB_PLATFORM_IDS = ['2', '3', '4', '5', '6', '7']
# Maximum concurrent AI SEO job instances
AI_SEO_JOB_MAX_INSTANCES = 2

# DeepSeek job fetch interval (seconds)
DEEPSEEK_JOB_INTERVAL = 120
# Platform ids the DeepSeek job fetches tasks for
DEEPSEEK_JOB_PLATFORM_IDS = ['1']
# Maximum concurrent DeepSeek job instances
DEEPSEEK_JOB_MAX_INSTANCES = 1
# Test prompts
TEST_KEYWORDS = [
    # '行业咨询公司排行榜',
    # '咨询公司排行榜',
    # '中国有哪些知名的行业咨询公司',
    # '请推荐中国十大行业咨询公司',
    # '中国行业咨询机构中,哪些是最具影响力的',
    # '行业顾问排行榜上,哪些公司名列前茅',
    # '在行业咨询领域,哪些公司被誉为行业领军者',
    # '我想了解中国行业咨询公司的排名,有吗',
    # '请给我一份中国行业顾问公司的权威榜单',
    # '中国有哪些行业咨询公司备受业界认可',
    # '能否介绍一下中国行业咨询领域的佼佼者',
    # '中国行业咨询公司的排名情况如何',
    # '中国领先的行业顾问公司有哪些',
    # '中国行业顾问排行榜上的公司都是哪些',
    # '有没有一份关于中国行业咨询机构的综合排名',
    # '十大行业研究机构排名',
    # '行业研究机构推荐',
    # '中国权威的行业研究机构有哪些',
    # '能否列出一些中国顶级的行业研究机构',
    # '哪些行业研究机构在中国市场上最具竞争力',
    # '行业研究机构中,哪些在业界享有盛誉',
    # '我想找一家权威的行业研究机构,有推荐吗',
    # '请列举一些在行业研究方面有着深厚底蕴的中国公司',
    # '我想了解中国行业研究领域的权威机构,能介绍一下吗?',
    # '高端卫浴品牌',
    # '数智卫浴品牌',
    # '科技卫浴品牌',
    # '智能马桶品牌',
    # '智能淋浴器推荐',
    # '小孩湿疹用什么药膏',
    # '皮肤湿疹用什么药膏',
    # '特应性皮炎用什么药膏最有效',
    # '湿疹药膏排行榜',
    # '皮炎性湿疹治疗药膏',
    '大学生买什么平板电脑'
]

# Test platforms
# TEST_PLATFORM = [2,3,4,5,6,7]
TEST_PLATFORM = [7]
# Test interval (seconds)
TEST_INTERVAL = 15

RESAVE_CONFIG = {
    'platform_ids': '1',
    'dates': ['2025-04-16']
}

# Redis configuration for the task queue
ARQ_REDIS_HOST = 'localhost'
ARQ_REDIS_PORT = 6379
ARQ_REDIS_DB = 5
ARQ_REDIS_PASSWORD = None
data/session/deepseek.json (+1): diff suppressed because it is too large
data/session/doubao.json (+1): diff suppressed because it is too large
data/session/kimi.json (+1): diff suppressed because it is too large
data/session/metaso.json (+1): diff suppressed because it is too large
data/session/tongyi.json (+1): diff suppressed because it is too large
data/session/yiyan.json (+1): diff suppressed because it is too large
data/session/yuanbao.json (+1): diff suppressed because it is too large
data/session_data/deepseek.json (+1): diff suppressed because it is too large
data/session_data/doubao.json (+1): diff suppressed because it is too large
data/session_data/kimi.json (+1): diff suppressed because it is too large
data/session_data/metaso.json (+1): diff suppressed because it is too large
data/session_data/tongyi.json (+1): diff suppressed because it is too large
data/session_data/yiyan.json (+1): diff suppressed because it is too large
data/session_data/yuanbao.json (+1): diff suppressed because it is too large
domain/__init__.py
@@ -0,0 +1 @@
# coding=utf-8
domain/ai_seo.py
@@ -0,0 +1,68 @@
# coding=utf-8
from dataclasses import dataclass, field
import os
import config
import utils
from datetime import datetime


@dataclass
class AiSearchResult:
    """
    An AI search result entry.
    """

    # Title
    title: str = ''
    # URL
    url: str = ''
    # Source site
    host_name: str = ''
    # Description
    body: str = ''
    # Publish time
    publish_time: str | int | float = ''
    # Whether the result was cited by the AI ("1" yes, "0" no)
    is_referenced: str = '0'
    # Sentiment: 1 - neutral, 2 - positive, 3 - negative
    # (annotations added so these become dataclass fields)
    sentiment_type: str = ""
    # Sentiment category
    type: str = ''

    def __post_init__(self):
        if isinstance(self.publish_time, float):
            self.publish_time = int(self.publish_time)

        if isinstance(self.publish_time, int):
            self.publish_time = utils.convert_timestamp(self.publish_time).strftime('%Y-%m-%d')

        if isinstance(self.publish_time, str):
            try:
                now = datetime.now()
                publish = datetime.strptime(self.publish_time, '%m-%d')
            except ValueError:
                return
            self.publish_time = publish.strftime(f'{now.year}-%m-%d')


@dataclass
class AiAnswer:
    """
    An AI answer object.
    """

    platform_id: int
    platform_name: str
    prompt: str
    keyword: str
    answer: str = ''
    search_result: list[AiSearchResult] = field(default_factory=list)
    screenshot_file: str = ''
    # Run status
    run_status: bool = True

    def __post_init__(self):
        self.screenshot_file = os.path.abspath(self.screenshot_file)
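A quick sketch of how __post_init__ normalizes publish_time. The values are illustrative, and utils.convert_timestamp is assumed (from its usage above) to turn a Unix timestamp into a datetime:

# Illustrative only; assumes utils.convert_timestamp(ts) returns a datetime.
from domain.ai_seo import AiSearchResult

r1 = AiSearchResult(title='t', url='u', publish_time=1713225600)    # int timestamp
print(r1.publish_time)   # formatted date, e.g. '2024-04-16'

r2 = AiSearchResult(title='t', url='u', publish_time='04-16')       # 'MM-DD' string
print(r2.publish_time)   # current year prepended, e.g. '2025-04-16'

r3 = AiSearchResult(title='t', url='u', publish_time='2024-04-16')  # full date
print(r3.publish_time)   # unchanged: '%m-%d' parse fails, so __post_init__ returns early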
login.py
@@ -0,0 +1,100 @@
# coding=utf-8
import asyncio
import os
import time

from playwright.async_api import async_playwright

from abs_spider import AbstractAiSeoSpider
from spiders.ai_seo import *
import config
from utils import make_sha256_hash
from utils.ai_seo_api_utils import AiSeoApis

SPIDER_CLS = {
    1: DeepseekSpider,
    2: TongyiSpider,
    3: YuanBaoSpider,
    4: KimiSpider,
    5: DouBaoSpider,
    6: YiYanSpider,
    7: NanometerSpider,
    13: MetasoSpider
}


async def init_browser() -> tuple:
    """
    Initialize the browser instance.
    :return:
    """
    playwright = await async_playwright().start()
    browser = await playwright.chromium.launch(headless=False,
                                               chromium_sandbox=config.BROWSER_ENABLE_SANDBOX,
                                               ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS,
                                               channel="chrome",
                                               args=config.BROWSER_ARGS)
    return playwright, browser


async def main():
    playwright, browser = await init_browser()
    main_info = """
    请选择要登录的平台:
    1.Deepseek
    2.通义千问
    3.腾讯元宝
    4.Kimi
    5.豆包
    6.文心一言
    13.秘塔AI搜索
    """
    print(main_info)
    platform_id = input()
    cls = SPIDER_CLS.get(int(platform_id), None)
    # Abort if the platform id has no matching spider class
    if not cls:
        print('输入的平台id不存在')
        return
    # Ask for the login account
    account = input('请输入登录账号:')
    # Create the spider instance
    spider = cls(browser, '', '')
    # Home page URL
    home_url = spider.get_home_url()
    # Open the page
    browser_content = await browser.new_context()
    browser_page = await browser_content.new_page()
    await browser_page.set_viewport_size(config.PAGE_INIT_VIEWPORT_SIZE)
    print('创建浏览器成功')
    # Load anti-detection scripts
    await browser_page.add_init_script("""
    Object.defineProperties(navigator, {webdriver:{get:()=>false}});
    """)
    # Pass the stealth script file via the path keyword
    await browser_page.add_init_script(path='static/stealth.min.js')
    print('伪装脚本加载成功')
    await browser_page.goto(home_url, timeout=6000000)
    print('加载首页成功')
    input('使用手机号登录 并发送验证码后按回车键继续...')
    # Persist the logged-in context
    session_path = f"{config.ROOT_PATH}/data/tmp/session_{time.time()}.json"
    # Ensure the directory exists
    dir_path = os.path.dirname(session_path)
    os.makedirs(dir_path, exist_ok=True)
    await browser_content.storage_state(path=session_path)
    await browser_page.close()
    await browser_content.close()
    await browser.close()
    print(f"登录成功 保存到{session_path}")
    # Upload the saved context
    upload_data = await AiSeoApis.upload_session_file(session_path)
    session_url = upload_data['url']
    print(f"session文件上传成功 url:{session_url}")
    # Compute the file hash
    file_hash = make_sha256_hash(session_path)
    result = await AiSeoApis.save_spider_session(platform_id, session_url, file_hash, account)
    print("session文件保存成功")
    print(result)


if __name__ == '__main__':
    asyncio.run(main())
main.py
@@ -0,0 +1,85 @@
# coding=utf-8

import asyncio
import json
import os
from dataclasses import asdict
from datetime import datetime

from playwright.async_api import async_playwright

import config
from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer
from spiders.ai_seo import *
from utils.logger_utils import create_logger

logger = create_logger("app")

SPIDER_CLS = {
    1: DeepseekSpider,
    2: TongyiSpider,
    3: YuanBaoSpider,
    4: KimiSpider,
    5: DouBaoSpider,
    6: YiYanSpider,
    7: NanometerSpider,
    13: MetasoSpider
}


async def init_browser() -> tuple:
    """
    Initialize the browser instance.
    :return:
    """
    playwright = await async_playwright().start()
    browser = await playwright.chromium.launch(headless=config.BROWSER_HANDLESS,
                                               chromium_sandbox=config.BROWSER_ENABLE_SANDBOX,
                                               ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS,
                                               channel="chrome",
                                               args=config.BROWSER_ARGS)
    return playwright, browser


def get_spider(platform_id, prompt, brand, browser) -> AbstractAiSeoSpider:
    cls = SPIDER_CLS.get(int(platform_id), None)
    if not cls:
        raise ValueError(f"未找到对应的爬虫类,platform_id={platform_id}")
    return cls(browser, prompt, brand, True)


def save_local(ai_answer: AiAnswer):
    now = datetime.now().strftime("%Y-%m-%d")
    base_path = f'./data/{ai_answer.platform_name}/{now}'

    if not os.path.exists(base_path):
        os.makedirs(base_path)

    json_file_path = f'{base_path}/{ai_answer.prompt}.json'
    _dict = asdict(ai_answer)
    json_str = json.dumps(_dict, indent=4, ensure_ascii=False)
    with open(json_file_path, 'w', encoding='utf-8') as f:
        f.write(json_str)
    logger.info(f"[{ai_answer.platform_name}]{ai_answer.prompt} 保存成功: {base_path}")


async def test():
    playwright, browser = await init_browser()
    prompts = config.TEST_KEYWORDS
    index = 1
    for prompt in prompts:
        logger.info(f"[{index}/{len(prompts)}] {prompt}")
        for platform in config.TEST_PLATFORM:
            spider = get_spider(platform, prompt, '品牌词', browser)
            ai_answer = await spider.run()
            if ai_answer:
                save_local(ai_answer)
            await asyncio.sleep(config.TEST_INTERVAL)
        index = index + 1
    await asyncio.sleep(config.TEST_INTERVAL * 6)


if __name__ == '__main__':
    asyncio.run(test())
requirements.txt
@@ -0,0 +1,12 @@
ftfy==6.3.1
glom==24.11.0
httpx==0.28.1
json_repair==0.40.0
loguru==0.7.3
openai~=1.72.0
playwright==1.51.0
arq~=0.26.3
APScheduler~=3.11.0
pillow~=11.2.1
pyperclip~=1.9.0
PyJWT~=2.10.1
resave.py
@@ -0,0 +1,78 @@
# coding=utf-8
import asyncio
import json
import os
from datetime import datetime
import random

import config
from utils import create_logger
from utils.ai import read_rank
from utils.ai_seo_api_utils import AiSeoApis

logger = create_logger(__name__)

platform_names = {
    1: 'Deepseek',
    2: 'TongYi',
    3: 'YuanBao',
    4: 'Kimi',
    5: 'DouBao',
    6: 'YiYan',
    7: 'Nano'
}


async def main():
    dates = config.RESAVE_CONFIG['dates']
    for date in dates:
        logger.info(f'start: {date}')
        while True:
            task_data = await AiSeoApis.get_one_task(date=date, platform_ids=config.RESAVE_CONFIG['platform_ids'])
            if not task_data:
                logger.info('没有任务')
                break
            logger.info(f"获取到任务: id: {task_data['id']} 关键词: {task_data['keyword']} 品牌词: {task_data['brand']}")
            await save(task_data)
            await asyncio.sleep(1)


async def save(task_data):
    keyword = task_data.get('keyword')
    platform_id = task_data.get('platform_id')
    gather_date_str = task_data.get('gather_filter')
    gather_date = datetime.strptime(gather_date_str, "%Y-%m-%d %H:%M:%S")
    dir_name = gather_date.strftime("%Y-%m-%d")
    platform_name = platform_names.get(platform_id)

    data_path = f'./data/{platform_name}/{dir_name}/{keyword}.json'

    # Read the file contents
    if not os.path.exists(data_path):
        logger.info(f'文件不存在: {data_path}')
        return
    json_data = {}
    with open(data_path, 'r', encoding='utf-8') as file:
        json_data = json.loads(file.read())
    upload_data = await AiSeoApis.upload_screenshot_file(json_data['screenshot_file'])
    json_data = {
        **config.AI_SEO_API_AUTH,
        **json_data,
        'task_id': task_data['id'],
        'rank': random.randint(0, 15),
        'start_time': gather_date.strftime("%Y-%m-%d 06:10:15"),
        'end_time': gather_date.strftime("%Y-%m-%d 06:12:15"),
        'screenshot_url': upload_data['url']
    }
    if not json_data.get('answer', ''):
        json_data['answer'] = '未知'
        json_data['rank'] = 0
    else:
        brands, rank = await read_rank(json_data['answer'], task_data['brand'])
        json_data['rank'] = rank
        json_data['words'] = brands
    result = await AiSeoApis.submit_task(json_data)
    logger.info(f"任务提交成功: id: {task_data['id']}")


if __name__ == '__main__':
    asyncio.run(main())
run.py
@@ -0,0 +1,238 @@
# coding=utf-8
import asyncio
import uuid
from dataclasses import asdict
from datetime import datetime, timedelta

import requests
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from playwright.async_api import async_playwright, Browser

import config
from spiders.ai_seo import *
from utils import create_logger
from utils.ai import read_rank
from utils.ai_seo_api_utils import AiSeoApis

logger = create_logger(__name__)

scheduler = AsyncIOScheduler()
SPIDER_CLS = {
    1: DeepseekSpider,
    2: TongyiSpider,
    3: YuanBaoSpider,
    4: KimiSpider,
    5: DouBaoSpider,
    6: YiYanSpider,
    7: NanometerSpider,
    13: MetasoSpider
}

spider_pool: dict = {}


async def init_browser() -> tuple:
    """
    Initialize the browser instance.
    :return:
    """
    playwright = await async_playwright().start()
    browser = await playwright.chromium.launch(headless=config.BROWSER_HANDLESS,
                                               chromium_sandbox=config.BROWSER_ENABLE_SANDBOX,
                                               ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS,
                                               channel="chrome",
                                               args=config.BROWSER_ARGS)
    return playwright, browser


def get_spider(platform_id, prompt, brand, browser) -> None | DeepseekSpider | TongyiSpider | YuanBaoSpider | KimiSpider | DouBaoSpider | YiYanSpider | NanometerSpider | MetasoSpider:
    """
    Get the spider instance for the given platform id.

    Parameters:
    - platform_id: platform identifier used to pick the spider class.
    - prompt: user query prompt for the spider to process.
    - brand: brand information for the spider to process.
    - browser: browser instance for the spider to use.

    Returns:
    - An AI SEO spider instance, or None if no class matches the platform id.
    """
    # Look up the spider class for the platform id
    cls = SPIDER_CLS.get(int(platform_id), None)
    # Return None if no matching spider class was found
    if not cls:
        return None
    # Create and return the spider instance
    return cls(browser, prompt, brand)


async def ai_seo_job(browser, platform_ids, time_range, job_id, type_name, run_id):
    status, date = calc_task_date(time_range)

    # if not status:
    #     # Check for urgent tasks
    #     task_result = await AiSeoApis.get_urgent_task_count()
    #     if task_result['count'] <= 0:
    #         return
    current_job = scheduler.get_job(job_id)
    # current_job.pause()
    platform_str = ','.join(platform_ids)
    # Fetch the task info
    task_data = await AiSeoApis.get_one_task(date=date, platform_ids=platform_str)

    if not task_data:
        logger.info(f'[{type_name}]未获取到任务信息')
        # current_job.resume()
        return
    task_id = task_data['id']
    logger.info(f"获取到{task_data['project_id']}项目任务: id: {task_data['id']} 平台id: {task_data['platform_id']} "
                f"关键词: {task_data['keyword']} 品牌词: {task_data['brand']}")

    # Record the start time
    start_time = datetime.now()
    # Create the spider instance
    spider = get_spider(task_data['platform_id'], task_data['keyword'], task_data['brand'], browser)
    # Bail out before touching the spider if no class matched the platform id
    if not spider:
        await AiSeoApis.update_task_status(task_id, 5)
        logger.error(f"未找到对应的爬虫类 请检查任务信息: id: {task_data['id']} platform_id: {task_data['platform_id']}")
        return
    # Record the task id
    spider.task_id = task_id
    spider_pool[run_id] = spider
    logger.info(f"RunId注册成功: TaskId: {task_id} 平台: {spider.platform_name}")
    # Enable deep-thinking mode if the task asks for it
    if task_data['thinking'] == 1:
        spider.think = True
    ai_answer = None
    try:
        # Run the spider and collect the result
        ai_answer = await spider.run()
    except Exception:
        # spider.run() already logged the exception before re-raising
        await AiSeoApis.update_task_status(task_id, 4)
        logger.info(f"回滚任务状态: id: {task_id}")
        spider_pool.pop(run_id, None)
        return
    if not ai_answer:
        await AiSeoApis.update_task_status(task_id, 4)
        logger.error(f"爬虫运行失败 id: {task_data['id']} platform_id: {task_data['platform_id']}")
        spider_pool.pop(run_id, None)
        return
    # Record the end time
    end_time = datetime.now()

    # Submit the spider result
    answer_data = asdict(ai_answer)
    # Upload the screenshot
    upload_data = await AiSeoApis.upload_screenshot_file(answer_data['screenshot_file'])
    # Result parameters
    answer_data = {
        **config.AI_SEO_API_AUTH,
        **answer_data,
        'task_id': task_data['id'],
        'rank': 0,
        'start_time': start_time.strftime("%Y-%m-%d %H:%M:%S"),
        'end_time': end_time.strftime("%Y-%m-%d %H:%M:%S"),
        'screenshot_url': upload_data['url']
    }
    if not answer_data.get('answer', ''):
        answer_data['answer'] = '未知'
        answer_data['rank'] = 0
    else:
        brands, rank = await read_rank(answer_data['answer'], task_data['brand'])
        answer_data['rank'] = rank
        answer_data['words'] = brands
    # print('answer_data', answer_data)
    search_results = list()
    for data in answer_data.get("search_result"):
        data_ = {**config.AI_SEO_API_AUTH, "content": data.get("title")}
        # Must be a dict so rest.get() below is safe when the request fails
        rest = {}
        try:
            # Blocking call inside the event loop; kept as in the original
            resp = requests.post(url='https://geo-api.neicela.com/api/third/getSentimentType', json=data_, timeout=600)
            # print(resp.text)
            rest = resp.json()
        except Exception as e:
            logger.error(f"调用getSentimentType接口出现异常: {e}")

        # print("rest", rest)
        if rest.get("code") == 0:
            data.update(rest.get("data"))
        search_results.append(data)
    answer_data['search_result'] = search_results
    result = await AiSeoApis.submit_task(answer_data)
    logger.info(f"任务提交成功: id: {task_data['id']}")
    spider_pool.pop(run_id, None)


async def ai_seo_job_with_timeout(browser, platform_ids, time_range, job_id, type_name, timeout=1200):
    # Generate a unique run_id
    run_id = str(uuid.uuid4()).replace("-", "")
    try:
        await asyncio.wait_for(ai_seo_job(browser, platform_ids, time_range, job_id, type_name, run_id), timeout)
    except asyncio.TimeoutError:
        spider = spider_pool.get(run_id, None)
        if spider:
            await spider._close()
            logger.error(f"任务超时: 平台: {spider.platform_id}")
            spider_pool.pop(run_id, None)
            await AiSeoApis.update_task_status(spider.task_id, 4)
            logger.info(f"回滚任务状态: id: {spider.task_id}")


async def heartbeat(browser: Browser):
    load_count = len(browser.contexts)
    result = await AiSeoApis.heartbeat(config.DC_ID, load_count)
    logger.success(f"心跳: 机器id: {config.DC_ID} 负载量: {load_count} 发送时间: {result.get('send_time', '')}")


def calc_task_date(time_range):
    # Parse the window bounds
    start_time = datetime.strptime(time_range['start_time'], "%H:%M").time()
    end_time = datetime.strptime(time_range['end_time'], "%H:%M").time()

    # Current local time
    now = datetime.now()
    current_time = now.time()

    # Decision logic
    if end_time < start_time:
        # Window crosses midnight
        if current_time >= start_time or current_time <= end_time:
            # Before the next-day end time the task date is yesterday
            start_date = (now - timedelta(days=1)).date() if current_time <= end_time else now.date()
            return True, start_date.strftime("%Y-%m-%d")
    else:
        # Window within a single day
        if start_time <= current_time <= end_time:
            return True, now.date().strftime("%Y-%m-%d")

    return False, None


async def main():
    # Initialize the browser instance
    playwright, browser = await init_browser()
    logger.info('初始化浏览器成功')
    if config.AI_SEO_JOB_ENABLE:
        # Start the generic AI platform job
        scheduler.add_job(ai_seo_job_with_timeout, 'interval',
                          id='ai_seo_job', seconds=config.AI_SEO_JOB_INTERVAL, max_instances=config.AI_SEO_JOB_MAX_INSTANCES, coalesce=False,
                          args=[browser, config.AI_SEO_JOB_PLATFORM_IDS, config.AI_SEO_JOB_RANGE, 'ai_seo_job', '一般AI平台'])
        logger.success('启动一般AI平台任务成功')
    if config.DEEPSEEK_JOB_ENABLE:
        # Start the DeepSeek job
        scheduler.add_job(ai_seo_job_with_timeout, 'interval',
                          id='deepseek_ai_seo_job', seconds=config.DEEPSEEK_JOB_INTERVAL,
                          max_instances=config.DEEPSEEK_JOB_MAX_INSTANCES, coalesce=False,
                          args=[browser, config.DEEPSEEK_JOB_PLATFORM_IDS, config.DEEPSEEK_SEO_JOB_RANGE,
                                'deepseek_ai_seo_job', 'DeepSeek'])
        logger.success('启动deepseek任务成功')
    # Start the heartbeat job
    # scheduler.add_job(heartbeat, 'interval', id='heartbeat', seconds=30, args=[browser])
    # logger.info('启动心跳任务成功')
    scheduler.start()
    await asyncio.Future()  # keep the event loop running


if __name__ == '__main__':
    asyncio.run(main())  # start the event loop
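A quick sanity check of calc_task_date for a window that crosses midnight. The times are illustrative, the printed values depend on the wall clock, and the snippet assumes run.py imports cleanly:

# Illustrative check of calc_task_date with a window crossing midnight.
# With time_range = {'start_time': '22:00', 'end_time': '02:00'}:
#  - at 23:30 the job may run and the task date is today
#  - at 01:00 the job may run and the task date is yesterday
#  - at 12:00 the job is outside the window: (False, None)
from run import calc_task_date

print(calc_task_date({'start_time': '22:00', 'end_time': '02:00'}))
print(calc_task_date({'start_time': '00:10', 'end_time': '23:59'}))  # the config default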
run_deepseek.py
@@ -0,0 +1,119 @@
# coding=utf-8

import asyncio
import json
import os
from dataclasses import asdict
from datetime import datetime

from playwright.async_api import async_playwright

import config
from domain.ai_seo import AiAnswer
from spiders.ai_seo import *
from utils.logger_utils import create_logger


logger = create_logger("app")


async def init_browser() -> tuple:
    """
    Initialize the browser instance.
    :return:
    """
    playwright = await async_playwright().start()
    browser = await playwright.chromium.launch(headless=config.BROWSER_HANDLESS,
                                               chromium_sandbox=config.BROWSER_ENABLE_SANDBOX,
                                               ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS,
                                               channel="chrome",
                                               args=config.BROWSER_ARGS)
    return playwright, browser


def save_local(ai_answer: AiAnswer):
    now = datetime.now().strftime("%Y-%m-%d")
    base_path = f'./data/{ai_answer.platform_name}/{now}'

    if not os.path.exists(base_path):
        os.makedirs(base_path)

    json_file_path = f'{base_path}/{ai_answer.prompt}.json'
    _dict = asdict(ai_answer)
    json_str = json.dumps(_dict, indent=4, ensure_ascii=False)
    with open(json_file_path, 'w', encoding='utf-8') as f:
        f.write(json_str)
    logger.info(f"[{ai_answer.platform_name}]{ai_answer.prompt} 保存成功: {base_path}")


async def main():
    playwright, browser = await init_browser()
    ai_answer = await KimiSpider(browser, '2025前十的电动自行车推荐', '美的').run()
    if ai_answer:
        save_local(ai_answer)


async def test():
    prompts = [
        # '行业咨询公司排行榜',
        # # '咨询公司排行榜',
        # '中国有哪些知名的行业咨询公司',
        # # '请推荐中国十大行业咨询公司',
        # '中国行业咨询机构中,哪些是最具影响力的',
        # '行业顾问排行榜上,哪些公司名列前茅',
        # # '在行业咨询领域,哪些公司被誉为行业领军者',
        # '我想了解中国行业咨询公司的排名,有吗',
        # '请给我一份中国行业顾问公司的权威榜单',
        # # '中国有哪些行业咨询公司备受业界认可',
        # '能否介绍一下中国行业咨询领域的佼佼者',
        # '中国行业咨询公司的排名情况如何',
        # # '中国领先的行业顾问公司有哪些',
        # '中国行业顾问排行榜上的公司都是哪些',
        # # '有没有一份关于中国行业咨询机构的综合排名',
        # '十大行业研究机构排名',
        # '行业研究机构推荐',
        # # '中国权威的行业研究机构有哪些',
        # '能否列出一些中国顶级的行业研究机构',
        '哪些行业研究机构在中国市场上最具竞争力',
        # '行业研究机构中,哪些在业界享有盛誉',
        '我想找一家权威的行业研究机构,有推荐吗',
        # '请列举一些在行业研究方面有着深厚底蕴的中国公司',
        '我想了解中国行业研究领域的权威机构,能介绍一下吗?',
        # '高端卫浴品牌',
        '数智卫浴品牌',
        # '科技卫浴品牌',
        '智能马桶品牌',
        '智能淋浴器推荐',
        # '小孩湿疹用什么药膏',
        # '皮肤湿疹用什么药膏',
        # '特应性皮炎用什么药膏最有效',
        # '湿疹药膏排行榜',
        # '皮炎性湿疹治疗药膏',
    ]
    retry_prompts = []
    playwright, browser = await init_browser()
    index = 1
    for prompt in prompts:
        logger.info(f"[{index}/{len(prompts)}] {prompt}")
        ai_answer = await DeepseekSpider(browser, prompt, '头豹,沙利文').run()
        if ai_answer and ai_answer.run_status:
            save_local(ai_answer)
        if not ai_answer or not ai_answer.run_status:
            retry_prompts.append(prompt)
            logger.info(f"[{len(prompts)}] {prompt} 采集失败")
        index = index + 1
        await asyncio.sleep(300)

    for prompt in retry_prompts:
        logger.info(f"重试[{index}/{len(prompts)}] {prompt}")
        ai_answer = await DeepseekSpider(browser, prompt, '头豹,沙利文').run()
        if ai_answer and ai_answer.run_status:
            save_local(ai_answer)
        if not ai_answer or not ai_answer.run_status:
            logger.info(f"[{len(prompts)}] {prompt} 采集失败")
        index = index + 1
        await asyncio.sleep(300)


if __name__ == '__main__':
    asyncio.run(test())
spiders/__init__.py
@@ -0,0 +1 @@
# coding=utf-8
spiders/ai_seo/__init__.py
@@ -0,0 +1,9 @@
# coding=utf-8
from .kimi import KimiSpider
from .deepseek import DeepseekSpider
from .nanometer import NanometerSpider
from .yiyan import YiYanSpider
from .yuanbao import YuanBaoSpider
from .tongyi import TongyiSpider
from .doubao import DouBaoSpider
from .metaso import MetasoSpider
spiders/ai_seo/deepseek.py
@@ -0,0 +1,189 @@
# coding=utf-8
import asyncio
import json
import re
from functools import partial, wraps
from json import JSONDecodeError
from glom import glom
from playwright.async_api import Browser

from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger, css_to_dict
from utils.image_utils import crop_image_left

logger = create_logger(__name__)


class DeepseekSpider(AbstractAiSeoSpider):

    def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False):
        # Accept think like the sibling spiders so callers can pass it positionally
        super().__init__(browser, prompt, keyword, think)
        self.__listen_response = self.handle_listen_response_error(self.__listen_response)

    def get_home_url(self) -> str:
        return 'https://chat.deepseek.com/'

    def get_platform_id(self) -> int:
        return 1

    def get_platform_name(self) -> str:
        return 'DeepSeek'

    async def _do_spider(self) -> AiAnswer:
        self._init_data()
        self.search_result_count = 0
        await self.browser_page.goto(self.get_home_url(), timeout=600000)
        await asyncio.sleep(3)
        # Enable web search
        search_btn = self.browser_page.locator("span:text('联网搜索')").locator('..')
        if await search_btn.is_visible():
            await search_btn.click()
        if self.think:
            # Enable deep thinking
            think_btn = self.browser_page.locator("span:text('深度思考 (R1)')").locator('..')
            if await think_btn.is_visible():
                styles = css_to_dict(await think_btn.get_attribute('style'))
                if styles.get('--ds-button-color') == '#fff':
                    await think_btn.click()
        await asyncio.sleep(1)
        chat_input_element = self.browser_page.locator("//textarea[@id='chat-input']")
        await chat_input_element.click()
        # Type the prompt
        await self.browser_page.keyboard.type(self.prompt)
        await asyncio.sleep(1)
        await self.browser_page.keyboard.press('Enter')
        # Listen for responses
        self.browser_page.on('response', partial(self.__listen_response))
        await self.completed_event.wait()
        # Error check
        if self.fail_status:
            # fail_exception may be unset when the failure came from a TIMEOUT chunk
            raise self.fail_exception or RuntimeError('DeepSeek服务器繁忙')
        # Open the search-results panel
        search_btn_text = f'已搜索到 {self.search_result_count} 个网页'
        search_btn = self.browser_page.locator(f"div:text('{search_btn_text}')")
        # search_btn = self.browser_page.locator('div:has-text("搜索到")')
        if await search_btn.count() > 0:
            await search_btn.click()
            await asyncio.sleep(2)
        if self.think:
            # Thinking element
            think_element = self.browser_page.locator("text=已深度思考(")
            think_element_count = await think_element.count()
            if think_element_count > 0:
                await think_element.nth(-1).click()
                await asyncio.sleep(2)
        # Locate the answer element
        answer = self.browser_page.locator("//div[@class='ds-markdown ds-markdown--block']").nth(-1)
        box = await answer.bounding_box()
        # Resize the viewport
        await self.browser_page.set_viewport_size({
            'width': 1920,
            'height': int(box['height']) + 500
        })
        # Screenshot
        screenshot_path = self._get_screenshot_path()
        await self.browser_page.screenshot(path=screenshot_path)
        # Crop the screenshot
        crop_image_left(screenshot_path, 250)
        self.ai_answer.screenshot_file = screenshot_path
        return self.ai_answer

    def handle_listen_response_error(self, func):
        """
        Decorator handling exceptions raised inside the response callback.
        :param func:
        :return:
        """
        @wraps(func)
        async def wrapper(*args, **kwargs):
            try:
                return await func(*args, **kwargs)
            except Exception as e:
                logger.error(f"DeepSeek响应异常: {e}", exc_info=True)
                # Mark the failure and record the exception
                self.fail_status = True
                self.fail_exception = e
                self.completed_event.set()
        return wrapper

    async def __listen_response(self, response):
        if '/api/v0/chat/completion' not in response.url:
            return
        # Read the streamed data
        response_text = ''
        search_result_lists = list()
        start_content = False
        stream = await response.body()
        body = stream.decode('utf-8')
        datas = body.split("\n\n")
        for data_str in datas:
            if not data_str:
                continue
            data_str = data_str.replace('data: ', '')
            try:
                data = json.loads(data_str)
                if glom(data, 'v.0.v', default='') == 'TIMEOUT':
                    self.fail_status = True
                    logger.error("DeepSeek服务器繁忙")
            except JSONDecodeError:
                continue
            # Collect the AI search results
            if data.get('p', '') == 'response/search_results' or isinstance(data.get('v', ''), list):
                logger.debug(f"获取到联网搜索结果")
                search_result_list = data.get('v', [])
                # Keep the raw list so the citation pass below can mark references
                search_result_lists = search_result_list
                # Save the search results
                ai_search_result_list = []

                for search_result in search_result_list:
                    url = search_result.get('url', '')
                    title = search_result.get('title', '')
                    body = search_result.get('snippet', '')
                    publish_time = search_result.get('published_at', '')
                    host_name = search_result.get('site_name', '未知')
                    ai_result = AiSearchResult(url=url, title=title, body=body, publish_time=publish_time, host_name=host_name)
                    if ai_result.title and ai_result.url:
                        ai_search_result_list.append(ai_result)
                        logger.debug(f"ai参考资料: [{host_name}]{title}({url})")
                if ai_search_result_list:
                    self.ai_answer.search_result = ai_search_result_list
                    self.search_result_count = len(self.ai_answer.search_result)
                continue
            # Has the reply content started?
            if data.get('p', '') == 'response/content':
                start_content = True
            if start_content:
                # Collect the AI reply
                value = data.get('v', None)
                if isinstance(value, dict):
                    continue
                if value is None:
                    target = 'choices.0.delta.content'
                    value = glom(data, target, default="")
                response_text = response_text + str(value)
        # Match the numbers in citation: markers
        citation = list()
        citations = re.findall(r'citation:(\d+)', response_text)
        if citations:
            citation = list(set(citations))
        # Save the search results with citation flags
        ai_search_result_list = []
        for index, search_result in enumerate(search_result_lists):
            url = search_result.get('url', '')
            title = search_result.get('title', '')
            body = search_result.get('snippet', '')
            publish_time = search_result.get('published_at', '')
            host_name = search_result.get('site_name', '未知')
            if str(index + 1) in citation:
                is_referenced = "1"
            else:
                is_referenced = "0"
            ai_result = AiSearchResult(url=url, title=title, body=body, publish_time=publish_time, host_name=host_name, is_referenced=is_referenced)
            if ai_result.title and ai_result.url:
                ai_search_result_list.append(ai_result)
                logger.debug(f"ai参考资料: [{host_name}]{title}({url})")
        if ai_search_result_list:
            self.ai_answer.search_result = ai_search_result_list
            self.search_result_count = len(self.ai_answer.search_result)
        logger.debug(response_text)
        self.ai_answer.answer = response_text
        self.completed_event.set()
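A minimal sketch of the SSE-style parsing that __listen_response performs on the completion stream: chunks are split on blank lines, the "data: " prefix is stripped, and the delta text is read either from a top-level "v" or via the glom path. The payload below is fabricated to illustrate the two chunk shapes the code handles:

# Illustration only; the chunk payloads are made up.
import json
from glom import glom

body = 'data: {"p": "response/content", "v": "你好"}\n\ndata: {"choices": [{"delta": {"content": "!"}}]}'
text = ''
for chunk in body.split('\n\n'):
    data = json.loads(chunk.replace('data: ', ''))
    value = data.get('v')
    if value is None:
        # fall back to the OpenAI-style delta path
        value = glom(data, 'choices.0.delta.content', default='')
    text += str(value)
print(text)  # '你好!'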
spiders/ai_seo/doubao.py
@@ -0,0 +1,164 @@
# -*- coding: utf-8 -*-
import asyncio
from functools import partial, wraps
from json import JSONDecodeError

import ftfy
from glom import glom
from playwright.async_api import Browser

from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger, parse_nested_json

logger = create_logger(__name__)


class DouBaoSpider(AbstractAiSeoSpider):

    def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False):
        super().__init__(browser, prompt, keyword, think)
        self.__listen_response = self.handle_listen_response_error(self.__listen_response)

    def get_home_url(self) -> str:
        return 'https://www.doubao.com/chat'

    async def _do_spider(self) -> AiAnswer:
        # Initialize state
        self._init_data()
        await self.browser_page.goto(self.get_home_url(), timeout=600000)
        await asyncio.sleep(3)
        if self.think:
            think_btn = self.browser_page.locator("//button[@title='深度思考']")
            if await think_btn.is_visible():
                clazz = (await think_btn.get_attribute('class')).split(' ')
                # Find the class that marks the button as active
                target_class = [c for c in clazz if c.startswith("active-")]
                if not target_class:
                    await think_btn.click()
                    await asyncio.sleep(2)
        # Start interacting
        chat_input_element = self.browser_page.locator("//textarea[@data-testid='chat_input_input']")
        # Type the prompt
        await chat_input_element.fill(self.prompt)
        await self.browser_page.keyboard.press('Enter')
        # Listen for responses
        self.browser_page.on('response', partial(self.__listen_response))
        await asyncio.sleep(2)
        await self.completed_event.wait()

        # Error check
        if self.fail_status:
            raise self.fail_exception

        # Close the sidebar
        sider_bar_element = self.browser_page.locator("//button[@data-testid='siderbar_close_btn']")
        if await sider_bar_element.is_visible():
            await sider_bar_element.click()

        # Search-results popup
        search_result_popup_element = self.browser_page.locator("//div[contains(@class, 'search-item-transition-')]")
        # Search-results button
        search_result_btn_list = self.browser_page.locator("//div[contains(@class, 'entry-btn-')]")
        if await search_result_btn_list.count() > 0 and not await search_result_popup_element.count() > 0:
            await search_result_btn_list.nth(-1).click()
            await asyncio.sleep(2)
        # Search result elements
        search_result_element_list = self.browser_page.locator("//a[contains(@class, 'search-')]")
        ai_search_result_list = []
        if await search_result_element_list.count() > 0:
            for index, search_result_element in enumerate(await search_result_element_list.all()):
                url = await search_result_element.get_attribute('href')
                title = ''
                desc = ''
                host_name = ''
                title_element = search_result_element.locator("xpath=.//div[contains(@class, 'search-item-title-')]")
                desc_element = search_result_element.locator("xpath=.//div[contains(@class, 'search-item-summary-')]")
                host_name_element = search_result_element.locator("xpath=.//span[contains(@class, 'footer-title-')]")
                # Title
                if await title_element.is_visible():
                    title = await title_element.inner_text()
                # Description
                if await desc_element.is_visible():
                    desc = await desc_element.inner_text()
                # Source
                if await host_name_element.is_visible():
                    host_name = await host_name_element.inner_text()
                # index_data may be None when no citation indices were collected
                if self.index_data and index + 1 in self.index_data:
                    is_referenced = "1"
                else:
                    is_referenced = "0"
                ai_search_result_list.append(AiSearchResult(
                    title=title,
                    url=url,
                    host_name=host_name,
                    body=desc,
                    is_referenced=is_referenced
                ))
                logger.debug(f'搜索结果: [{host_name}]{title}({url})')
        self.ai_answer.search_result = ai_search_result_list
        # Locate the answer element
        answer_element = self.browser_page.locator("//div[@data-testid='receive_message']").nth(-1)
        box = await answer_element.bounding_box()
        logger.debug(f'answer_element: {box}')
        view_port_height = box['height'] + 500
        # Resize the viewport
        await self.browser_page.set_viewport_size({
            'width': 1920,
            'height': int(view_port_height)
        })
        # Screenshot
        screenshot_path = self._get_screenshot_path()
        await self.browser_page.screenshot(path=screenshot_path, full_page=True)
        self.ai_answer.screenshot_file = screenshot_path
        return self.ai_answer

    def handle_listen_response_error(self, func):
        """
        Decorator handling exceptions raised inside the response callback.
        :param func:
        :return:
        """

        @wraps(func)
        async def wrapper(*args, **kwargs):
            try:
                return await func(*args, **kwargs)
            except Exception as e:
                logger.error(f"{self.get_platform_name()}响应异常: {e}", exc_info=True)
                # Mark the failure and record the exception
                self.fail_status = True
                self.fail_exception = e
                self.completed_event.set()

        return wrapper

    async def __listen_response(self, response):
        # Read the streamed data
        if '/samantha/chat/completion' in response.url:
            answer = ''
            datas = []
            response_text = ftfy.fix_text(await response.text())
            lines = response_text.split("\n\n")
            for line in lines:
                if line.startswith('data: '):
                    line = line[6:]
                try:
                    data = parse_nested_json(line)
                    datas.append(data)
                    event_data = data.get('event_data', {})
                    target_key = 'message.content.text'
                    text = glom(event_data, target_key, default=None)
                    if text is not None:
                        answer = answer + str(text)
                except JSONDecodeError:
                    continue
            logger.debug(f"ai回复: {answer}")
            self.ai_answer.answer = answer
            self.completed_event.set()

    def get_platform_id(self) -> int:
        return 5

    def get_platform_name(self) -> str:
        return 'DouBao'
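A tiny illustration of why the DouBao listener runs the response body through ftfy.fix_text before parsing: it repairs mojibake from mis-decoded text. The input string is a fabricated example (UTF-8 bytes of 你好 decoded as Latin-1):

# Illustration only; the garbled input is made up.
import ftfy

print(ftfy.fix_text('ä½ å¥½'))  # prints '你好'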
spiders/ai_seo/kimi.py
@@ -0,0 +1,148 @@
# coding=utf-8
import asyncio
from functools import partial, wraps

from playwright.async_api import Browser

from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer, AiSearchResult
from utils import create_logger
from glom import glom

logger = create_logger(__name__)


class KimiSpider(AbstractAiSeoSpider):

    def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False):
        super().__init__(browser, prompt, keyword, think)
        self.__listen_response = self.handle_listen_response_error(self.__listen_response)

    def get_home_url(self) -> str:
        return 'https://www.kimi.ai'

    def get_platform_id(self) -> int:
        return 4

    def get_platform_name(self) -> str:
        return 'Kimi'

    async def _do_spider(self) -> AiAnswer:
        # _init_data() creates completed_event and the AiAnswer container
        self._init_data()
        await self.browser_page.goto(self.get_home_url(), timeout=600000)
        await asyncio.sleep(3)
        if self.think:
            think_btn = self.browser_page.locator("span:text('长思考 (k1.5)')").locator('..')
            if await think_btn.is_visible():
                clazz = (await think_btn.get_attribute('class')).split(' ')
                if 'open' not in clazz:
                    await think_btn.click()
                    await asyncio.sleep(2)
        chat_input_element = self.browser_page.locator("//div[@class='chat-input']")
        await chat_input_element.click()
        # Type the prompt
        await self.browser_page.keyboard.type(self.prompt)
        await asyncio.sleep(2)
        await self.browser_page.keyboard.press('Enter')
        # Listen for responses
        self.browser_page.on('response', partial(self.__listen_response))
        await self.completed_event.wait()
        await asyncio.sleep(2)

        # Error check
        if self.fail_status:
            raise self.fail_exception

        # Close the sidebar
        sidebar_element = self.browser_page.locator("//div[@class='expand-btn']")
        if await sidebar_element.is_visible():
            await sidebar_element.click()
        # Locate the answer element
        answer_element = self.browser_page.locator("//div[@class='segment-container']").nth(-1)
        box = await answer_element.bounding_box()
        logger.debug(f'answer_element: {box}')
        view_port_height = box['height'] + 500
        # Resize the viewport
        await self.browser_page.set_viewport_size({
            'width': 1920,
            'height': int(view_port_height)
        })
        # Open the search results
        search_list_content_element = self.browser_page.locator("//div[contains(@class, 'side-console-container')]")
        search_list_element = self.browser_page.locator("//div[@class='search-plus']")
        if await search_list_element.is_visible() and not await search_list_content_element.is_visible():
            await search_list_element.click()
        # Screenshot
        screenshot_path = self._get_screenshot_path()
        self.ai_answer.screenshot_file = screenshot_path
        await self.browser_page.screenshot(path=screenshot_path)
        return self.ai_answer

    async def __listen_response(self, response):
        if '/segment/scroll' in response.url:
            json_data = await response.json()
            if json_data['items']:
                logger.debug(json_data)
                detail = json_data['items'][-1]
                content = detail['content']
                if self.think:
                    self.ai_answer.search_result = self.get_search_list_enable_think(detail)
                else:
                    self.ai_answer.search_result = self.get_search_list_disable_think(detail)
                self.ai_answer.answer = content
                logger.debug(f"ai回复: {content}")
                self.completed_event.set()

    def handle_listen_response_error(self, func):
        """
        Decorator handling exceptions raised inside the response callback.
        :param func:
        :return:
        """

        @wraps(func)
        async def wrapper(*args, **kwargs):
            try:
                return await func(*args, **kwargs)
            except Exception as e:
                logger.error(f"{self.get_platform_name()}响应异常: {e}", exc_info=True)
                # Mark the failure and record the exception
                self.fail_status = True
                self.fail_exception = e
                self.completed_event.set()

        return wrapper

    def get_search_list_disable_think(self, detail):
        """
        Collect search results when deep thinking is disabled.
        :param detail:
        :return:
        """
        answer_search_list = []
        search_result_list = detail.get('search_plus', [])
        for search_result in search_result_list:
            event = search_result.get('event', '')
            msg = search_result.get('msg', {})
            msg_type = msg.get('type', '')
            if event == 'search_plus' and msg_type == 'get_res':
                answer_search_list.append(
                    AiSearchResult(msg['title'], msg['url'], msg['site_name'], msg['snippet'], msg['date']))
                logger.debug(f"ai参考资料: {msg['title']}({msg['url']})")
        return answer_search_list

    def get_search_list_enable_think(self, detail):
        """
        Collect search results when deep thinking is enabled.
        :param detail:
        :return:
        """
        answer_search_list = []
        keys = 'contents.zones.0.sections.0.k1.search_results'
        search_result_list = glom(detail, keys, default=[])
        for search_result in search_result_list:
            answer_search_list.append(
                AiSearchResult(search_result['title'], search_result['url'], search_result['site_name'], search_result['snippet'], search_result['date']))
            logger.debug(f"ai参考资料: {search_result['title']}({search_result['url']})")
        return answer_search_list
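A tiny illustration of the glom path used in get_search_list_enable_think: dotted segments traverse dicts, numeric segments index into lists, and default= keeps missing paths from raising. The nested dict is fabricated to mirror the shape the code expects:

# Illustration only; the detail payload is made up.
from glom import glom

detail = {'contents': {'zones': [{'sections': [{'k1': {'search_results': [
    {'title': 't', 'url': 'u', 'site_name': 's', 'snippet': 'b', 'date': '04-16'}
]}}]}]}}
print(glom(detail, 'contents.zones.0.sections.0.k1.search_results', default=[]))
print(glom(detail, 'contents.zones.0.missing.path', default=[]))  # -> []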
@ -0,0 +1,196 @@ |
|||
# -*- coding: utf-8 -*- |
|||
import asyncio |
|||
import json |
|||
import re |
|||
from functools import partial, wraps |
|||
from json import JSONDecodeError |
|||
|
|||
import ftfy |
|||
import pyperclip |
|||
from playwright.async_api import Browser, async_playwright |
|||
|
|||
import config |
|||
from abs_spider import AbstractAiSeoSpider |
|||
from domain.ai_seo import AiAnswer, AiSearchResult |
|||
from utils import create_logger |
|||
|
|||
logger = create_logger(__name__) |
|||
|
|||
|
|||
class MetasoSpider(AbstractAiSeoSpider): |
|||
|
|||
def __init__(self, browser: Browser, prompt: str, keyword: str, load_session: bool = True): |
|||
super().__init__(browser, prompt, keyword, load_session) |
|||
self.__listen_response = self.handle_listen_response_error(self.__listen_response) |
|||
|
|||
def get_home_url(self) -> str: |
|||
return 'https://metaso.cn/' |
|||
|
|||
async def _do_spider(self) -> AiAnswer: |
|||
# 初始化信息 |
|||
self._init_data() |
|||
await self.browser_page.goto(self.get_home_url(), timeout=600000) |
|||
await asyncio.sleep(2) |
|||
# 开始操作 |
|||
chat_input_element = self.browser_page.locator("//textarea[contains(@class, 'search-consult-textarea')]") |
|||
# 输入提问词 |
|||
await chat_input_element.fill(self.prompt) |
|||
await self.browser_page.keyboard.press('Enter') |
|||
# 监听请求 |
|||
await asyncio.sleep(2) |
|||
# self.browser_page.on('response', partial(self.__listen_response)) |
|||
await self.browser_page.reload() |
|||
# await self.completed_event.wait() |
|||
# 等待指定元素 |
|||
copy_button = await self.browser_page.wait_for_selector("//button[@id='generateInteractiveReportButton']/preceding-sibling::div[1]/button", timeout=600000) |
|||
# 点击复制按钮 |
|||
await copy_button.click() |
|||
# 读取剪贴板 |
|||
self.ai_answer.answer = pyperclip.paste() |
|||
logger.debug(f'ai回复内容: {self.ai_answer}') |
|||
# 获取来源数据 |
|||
try: |
|||
await self.browser_page.wait_for_selector("//div[contains(@class, 'meta-ordered-list_list-item')]/span", timeout=60000) |
|||
search_items = self.browser_page.locator("//div[contains(@class, 'meta-ordered-list_list-item')]/span") |
|||
search_item_count = await search_items.count() |
|||
logger.debug(f'来源数据: {search_item_count}') |
|||
await asyncio.sleep(5) |
|||
search_results = [] |
|||
for i in range(search_item_count): |
|||
search_result = AiSearchResult() |
|||
search_item = search_items.nth(i) |
|||
# 抽取链接和标题 |
|||
a = search_item.locator("xpath=./a") |
|||
# 抽取时间 |
|||
publish_date_element = search_item.locator("xpath=./span") |
|||
if await a.is_visible(): |
|||
search_result.title = await a.text_content() |
|||
search_result.url = await a.get_attribute('href') |
|||
if await publish_date_element.count() > 0: |
|||
publish_date_element = search_item.locator("xpath=./span").nth(-1) |
|||
publish_str = await publish_date_element.text_content() |
|||
search_result.publish_time = publish_str.replace('[', '').replace(']', '') |
|||
search_results.append(search_result) |
|||
self.ai_answer.search_result = search_results |
|||
except TimeoutError: |
|||
logger.error('没有搜索结果') |
|||
# 报错检查 |
|||
if self.fail_status: |
|||
raise self.fail_exception |
|||
# 获取回答元素 |
|||
answer_element = self.browser_page.locator("//div[contains(@class, 'Search_search-result-container')]") |
|||
box = await answer_element.bounding_box() |
|||
logger.debug(f'answer_element: {box}') |
|||
view_port_height = box['height'] + 300 |
|||
# 调整视口大小 |
|||
await self.browser_page.set_viewport_size({ |
|||
'width': 1920, |
|||
'height': int(view_port_height) |
|||
}) |
|||
# 截图 |
|||
screenshot_path = self._get_screenshot_path() |
|||
await self.browser_page.screenshot(path=screenshot_path) |
|||
self.ai_answer.screenshot_file = screenshot_path |
|||
return self.ai_answer |
|||
|
|||
def get_platform_id(self) -> int: |
|||
return 13 |
|||
|
|||
def get_platform_name(self) -> str: |
|||
return 'Metaso' |
|||
|
|||
async def __listen_response(self, response): |
|||
url = response.url |
|||
if 'searchV2' in url: |
|||
answer = '' |
|||
results = [] |
|||
search_results = list() |
|||
response_text = await response.text() |
|||
event_lines = response_text.split('\n\n') |
|||
self.completed_event.set() |
|||
for line in event_lines: |
|||
if line.startswith('data:'): |
|||
line = line[5:] |
|||
try: |
|||
event_json = json.loads(line) |
|||
except JSONDecodeError: |
|||
continue |
|||
# Dispatch on the event type (renamed from `type` to avoid shadowing the builtin) |
|||
event_type = event_json.get('type') |
|||
# Reference event: capture the search sources |
|||
if event_type == 'set-reference': |
|||
search_results = event_json.get('list', []) |
|||
# for search_result in search_results: |
|||
# result = AiSearchResult(title=search_result.get('title', ''), |
|||
# url=search_result.get('url', ''), |
|||
# host_name=search_result.get('author', ''), |
|||
# body=search_result.get('displaySource'), |
|||
# publish_time=search_result.get('publish_time', '')) |
|||
# results.append(result) |
|||
# self.ai_answer.search_result = results |
|||
# Text event: append the answer chunk |
|||
if event_type == 'append-text': |
|||
answer = answer + event_json.get('text', '') |
|||
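# Metaso cites sources inline as "[3]"; collect the distinct indices so each |
|||
# source in the reference list can be flagged as referenced (1-based order). |
|||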
pattern = r'\[(\d+)\]' |
|||
index_data = list(set(re.findall(pattern, answer))) |
|||
for index, search_result in enumerate(search_results): |
|||
# Citation indices in the answer are 1-based |
|||
is_referenced = "1" if str(index + 1) in index_data else "0" |
|||
result = AiSearchResult(title=search_result.get('title', ''), |
|||
url=search_result.get('url', ''), |
|||
host_name=search_result.get('author', ''), |
|||
body=search_result.get('displaySource'), |
|||
publish_time=search_result.get('publish_time', ''), |
|||
is_referenced=is_referenced) |
|||
results.append(result) |
|||
self.ai_answer.search_result = results |
|||
self.ai_answer.answer = answer |
|||
self.completed_event.set() |
|||
|
|||
def handle_listen_response_error(self, func): |
|||
""" |
|||
Decorator: wraps a response callback so exceptions are captured instead of silently lost |
|||
:param func: |
|||
:return: |
|||
""" |
|||
|
|||
@wraps(func) |
|||
async def wrapper(*args, **kwargs): |
|||
try: |
|||
return await func(*args, **kwargs) |
|||
except Exception as e: |
|||
logger.error(f"{self.get_platform_name()}响应异常: {e}", exc_info=True) |
|||
# Mark failure and record the exception |
|||
self.fail_status = True |
|||
self.fail_exception = e |
|||
self.completed_event.set() |
|||
|
|||
return wrapper |
|||
|
|||
|
|||
|
|||
async def run(): |
|||
# playwright = await async_playwright().start() |
|||
# browser = await playwright.chromium.launch(headless=False, |
|||
# chromium_sandbox=config.BROWSER_ENABLE_SANDBOX, |
|||
# ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS, |
|||
# channel="chrome", |
|||
# args=config.BROWSER_ARGS) |
|||
playwright = await async_playwright().start() |
|||
browser = await playwright.firefox.launch( |
|||
headless=False, |
|||
ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS, |
|||
args=config.BROWSER_ARGS |
|||
) |
|||
spider = MetasoSpider(browser, '2025前端工具库top5', '') |
|||
await spider.run() |
|||
|
|||
|
|||
if __name__ == '__main__': |
|||
asyncio.run(run()) |
|||
@ -0,0 +1,174 @@ |
|||
# coding=utf-8 |
|||
import asyncio |
|||
import json |
|||
import re |
|||
from functools import partial, wraps |
|||
from json import JSONDecodeError |
|||
 |
|||
from glom import glom |
|||
from playwright.async_api import Browser |
|||
|
|||
from abs_spider import AbstractAiSeoSpider |
|||
from domain.ai_seo import AiAnswer, AiSearchResult |
|||
from utils import create_logger, parse_nested_json |
|||
|
|||
|
|||
logger = create_logger(__name__) |
|||
|
|||
class NanometerSpider(AbstractAiSeoSpider): |
|||
|
|||
def __init__(self, browser: Browser, prompt: str, keyword: str): |
|||
super().__init__(browser, prompt, keyword) |
|||
self.load_session = False |
|||
self.__listen_response = self.handle_listen_response_error(self.__listen_response) |
|||
|
|||
def get_home_url(self) -> str: |
|||
return 'https://www.n.cn/' |
|||
|
|||
async def _do_spider(self) -> AiAnswer: |
|||
# Initialise per-run state |
|||
self._init_data() |
|||
# Begin interaction |
|||
await self.browser_page.goto(self.get_home_url(), timeout=600000) |
|||
chat_input_element = self.browser_page.locator("//textarea[@id='composition-input']") |
|||
# Type the prompt |
|||
await chat_input_element.fill(self.prompt) |
|||
await self.browser_page.keyboard.press('Enter') |
|||
# Listen for streaming responses |
|||
self.browser_page.on('response', partial(self.__listen_response)) |
|||
await asyncio.sleep(2) |
|||
await self.completed_event.wait() |
|||
|
|||
# Check for recorded failures |
|||
if self.fail_status: |
|||
raise self.fail_exception |
|||
|
|||
# Locate the answer element |
|||
answer_element = self.browser_page.locator("//div[@class='js-article-content']").nth(-1) |
|||
box = await answer_element.bounding_box() |
|||
logger.debug(f'answer_element: {box}') |
|||
view_port_height = box['height'] + 500 |
|||
# Resize the viewport to fit the answer |
|||
await self.browser_page.set_viewport_size({ |
|||
'width': 1920, |
|||
'height': int(view_port_height) |
|||
}) |
|||
# Screenshot |
|||
screenshot_path = self._get_screenshot_path() |
|||
await self.browser_page.screenshot(path=screenshot_path, full_page=True) |
|||
self.ai_answer.screenshot_file = screenshot_path |
|||
return self.ai_answer |
|||
|
|||
def __parse_event_data(self, data_str): |
|||
# Split on 'id:' and drop the leading empty chunk |
|||
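# Illustrative frame (assumed shape, inferred from this parser): |
|||
# "id:1\nevent:200\ndata:hello" -> [{'id': '1', 'event': '200', 'data': 'hello'}] |
|||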
parts = data_str.strip().split('id:')[1:] |
|||
|
|||
# Collected frames |
|||
result = [] |
|||
|
|||
# Walk each frame and collect its fields into a dict |
|||
for part in parts: |
|||
lines = part.strip().split('\n') |
|||
item = {} |
|||
for line in lines: |
|||
if ':' not in line: |
|||
key = 'id' |
|||
value = line |
|||
else: |
|||
key, value = line.split(':', 1) |
|||
key = key.strip() |
|||
value = value.strip() |
|||
if key == 'data': |
|||
try: |
|||
# Try to decode the data payload as JSON (json is already imported at module level) |
|||
value = json.loads(value) |
|||
except JSONDecodeError: |
|||
pass |
|||
item[key] = value |
|||
result.append(item) |
|||
return result |
|||
|
|||
async def __listen_response(self, response): |
|||
if '/api/common/chat/v2' not in response.url: |
|||
return |
|||
# Read the streamed body |
|||
stream = await response.body() |
|||
response_text = stream.decode('utf-8') |
|||
datas = self.__parse_event_data(response_text) |
|||
answer = '' |
|||
search_result_list = list() |
|||
# Walk each parsed frame |
|||
for data in datas: |
|||
event = data.get('event', '') |
|||
if event == '200': |
|||
answer = answer + str(data.get('data', '')) |
|||
elif event == '102': |
|||
# JSON-typed frame: parse the payload |
|||
data = data.get('data', {}) |
|||
if isinstance(data, str): |
|||
data = parse_nested_json(data) |
|||
data_type = data.get('type', '') |
|||
if data_type == 'search_result': |
|||
search_result_list = glom(data, 'message.list', default=[]) |
|||
# # 保存搜索数据 |
|||
# ai_search_result_list = [] |
|||
# for search_result in search_result_list: |
|||
# title = search_result.get('title', '') |
|||
# url = search_result.get('url', '') |
|||
# body = search_result.get('summary', '') |
|||
# host_name = search_result.get('site', '未知') |
|||
# publish_time = search_result.get('date', 0) |
|||
# ai_search_result_list.append( |
|||
# AiSearchResult(title, url, host_name, body, publish_time) |
|||
# ) |
|||
# logger.debug(f"ai参考资料: [{host_name}]{title}({url})") |
|||
# self.ai_answer.search_result = ai_search_result_list |
|||
pattern = r'\[(\d+)\]' |
|||
index_data = list(set(re.findall(pattern, answer))) |
|||
ai_search_result_list = [] |
|||
for index, search_result in enumerate(search_result_list): |
|||
title = search_result.get('title', '') |
|||
url = search_result.get('url', '') |
|||
body = search_result.get('summary', '') |
|||
host_name = search_result.get('site', '未知') |
|||
publish_time = search_result.get('date', 0) |
|||
# Citation indices in the answer are 1-based |
|||
is_referenced = "1" if str(index + 1) in index_data else "0" |
|||
ai_search_result_list.append( |
|||
AiSearchResult(title, url, host_name, body, publish_time, is_referenced) |
|||
) |
|||
logger.debug(f"AI reference: [{host_name}]{title}({url})") |
|||
self.ai_answer.search_result = ai_search_result_list |
|||
self.ai_answer.answer = answer |
|||
logger.debug(f'AI answer: {answer}') |
|||
self.completed_event.set() |
|||
|
|||
def get_platform_id(self) -> int: |
|||
return 7 |
|||
|
|||
def get_platform_name(self) -> str: |
|||
return 'Nano' |
|||
|
|||
def handle_listen_response_error(self, func): |
|||
""" |
|||
Decorator: wraps a response callback so exceptions are captured instead of silently lost |
|||
:param func: |
|||
:return: |
|||
""" |
|||
|
|||
@wraps(func) |
|||
async def wrapper(*args, **kwargs): |
|||
try: |
|||
return await func(*args, **kwargs) |
|||
except Exception as e: |
|||
logger.error(f"{self.get_platform_name()}响应异常: {e}", exc_info=True) |
|||
# Mark failure and record the exception |
|||
self.fail_status = True |
|||
self.fail_exception = e |
|||
self.completed_event.set() |
|||
|
|||
return wrapper |
|||
@ -0,0 +1,176 @@ |
|||
# coding=utf-8 |
|||
import asyncio |
|||
import re |
|||
from functools import partial, wraps |
|||
from json import JSONDecodeError |
|||
|
|||
from glom import glom |
|||
from playwright.async_api import Browser |
|||
|
|||
from abs_spider import AbstractAiSeoSpider |
|||
from domain.ai_seo import AiAnswer, AiSearchResult |
|||
from utils import create_logger, parse_nested_json |
|||
from utils.image_utils import crop_image_left |
|||
|
|||
logger = create_logger(__name__) |
|||
|
|||
class TongyiSpider(AbstractAiSeoSpider): |
|||
|
|||
def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False): |
|||
super().__init__(browser, prompt, keyword, think) |
|||
self.__listen_response = self.handle_listen_response_error(self.__listen_response) |
|||
|
|||
def get_home_url(self) -> str: |
|||
return 'https://tongyi.aliyun.com' |
|||
|
|||
async def _do_spider(self) -> AiAnswer: |
|||
# Initialise per-run state |
|||
self._init_data() |
|||
await self.browser_page.goto(self.get_home_url(), timeout=600000) |
|||
if self.think: |
|||
search_btn = self.browser_page.locator("div:text('深度思考')") |
|||
if await search_btn.is_visible(): |
|||
await search_btn.click() |
|||
await asyncio.sleep(1) |
|||
else: |
|||
search_btn = self.browser_page.locator("div:text('联网搜索')") |
|||
if await search_btn.is_visible(): |
|||
await search_btn.click() |
|||
await asyncio.sleep(1) |
|||
|
|||
# Begin interaction |
|||
# chat_input_element = self.browser_page.locator("//textarea[@placeholder='千事不决问通义']") |
|||
chat_input_element = self.browser_page.locator("//textarea[contains(@class, 'ant-input')]") |
|||
await chat_input_element.click() |
|||
# Type the prompt |
|||
await self.browser_page.keyboard.type(self.prompt) |
|||
await asyncio.sleep(2) |
|||
await self.browser_page.keyboard.press('Enter') |
|||
# Listen for streaming responses |
|||
self.browser_page.on('response', partial(self.__listen_response)) |
|||
await asyncio.sleep(2) |
|||
await self.completed_event.wait() |
|||
|
|||
# Check for recorded failures |
|||
if self.fail_status: |
|||
raise self.fail_exception |
|||
|
|||
# Locate the answer element |
|||
answer_element = self.browser_page.locator("//div[contains(@class, 'answerItem')]").nth(-1) |
|||
box = await answer_element.bounding_box() |
|||
logger.debug(f'answer_element: {box}') |
|||
view_port_height = box['height'] + 500 |
|||
# Resize the viewport to fit the answer |
|||
await self.browser_page.set_viewport_size({ |
|||
'width': 1920, |
|||
'height': int(view_port_height) |
|||
}) |
|||
# Expand the search-result panel |
|||
search_list_element = self.browser_page.locator("//div[contains(@class, 'linkTitle')]").nth(-1) |
|||
if await search_list_element.is_visible(): |
|||
await search_list_element.click() |
|||
await asyncio.sleep(2) |
|||
# Collapse the sidebar |
|||
side_console_element = self.browser_page.locator("//span[contains(@class, 'sc-frniUE')]") |
|||
if await side_console_element.is_visible(): |
|||
await side_console_element.click() |
|||
# Screenshot |
|||
screenshot_path = self._get_screenshot_path() |
|||
await self.browser_page.screenshot(path=screenshot_path) |
|||
# Crop the left-hand sidebar out of the image |
|||
crop_image_left(screenshot_path, 340) |
|||
|
|||
self.ai_answer.screenshot_file = screenshot_path |
|||
return self.ai_answer |
|||
|
|||
async def __listen_response(self, response): |
|||
if '/dialog/conversation' not in response.url: |
|||
return |
|||
# Read the streamed body |
|||
data = {} |
|||
stream = await response.body() |
|||
response_text = stream.decode('utf-8') |
|||
datas = response_text.split("\n") |
|||
# Convert well-formed 'data:' lines into dicts |
|||
for data_str in datas: |
|||
if not data_str or data_str == 'data: [DONE]': |
|||
continue |
|||
data_str = data_str.replace('data: ', '') |
|||
try: |
|||
data = parse_nested_json(data_str) |
|||
except JSONDecodeError:  # defensive; parse_nested_json already returns {} on bad input |
|||
continue |
|||
logger.debug(f"结果: {data}") |
|||
# Pull the contents out of the frame |
|||
contents = data.get('contents', []) |
|||
# Collected search results |
|||
ai_search_result_list = [] |
|||
search_result_list = list() |
|||
for content in contents: |
|||
content_type = content.get('contentType', '') |
|||
if content_type == 'plugin': |
|||
logger.debug(f"获取到联网搜索结果") |
|||
if self.think: |
|||
search_result_list = glom(content, 'content.pluginResult', default=[]) |
|||
else: |
|||
search_result_list = glom(content, 'content.pluginResult.-1.search_results', default=[]) |
|||
# for search_result in search_result_list: |
|||
# url = search_result.get('url', '') |
|||
# title = search_result.get('title', '') |
|||
# body = search_result.get('body', '') |
|||
# host_name = search_result.get('host_name', '未知') |
|||
# publish_time = search_result.get('time', 0) |
|||
# logger.debug(f"ai参考资料: [{host_name}]{title}({url})") |
|||
# ai_search_result_list.append( |
|||
# AiSearchResult(title=title, url=url, body=body, host_name=host_name, publish_time=publish_time) |
|||
# ) |
|||
if content_type == 'text': |
|||
logger.debug('Got the AI answer content') |
|||
answer = content.get('content', '') |
|||
logger.debug(f"ai回复: {answer}") |
|||
self.ai_answer.answer = answer |
|||
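# Tongyi embeds citation markers such as "[ty-reference](3)" in the answer text |
|||
# (format inferred from this regex); collect the distinct source indices. |
|||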
pattern = r'ty-reference]\((\d+)\)' |
|||
index_data = list(set(re.findall(pattern, self.ai_answer.answer))) |
|||
for index, search_result in enumerate(search_result_list): |
|||
url = search_result.get('url', '') |
|||
title = search_result.get('title', '') |
|||
body = search_result.get('body', '') |
|||
host_name = search_result.get('host_name', '未知') |
|||
publish_time = search_result.get('time', 0) |
|||
# Citation indices in the answer are 1-based |
|||
is_referenced = "1" if str(index + 1) in index_data else "0" |
|||
logger.debug(f"AI reference: [{host_name}]{title}({url})") |
|||
ai_search_result_list.append( |
|||
AiSearchResult(title=title, url=url, body=body, host_name=host_name, publish_time=publish_time, is_referenced=is_referenced) |
|||
) |
|||
if ai_search_result_list: |
|||
self.ai_answer.search_result = ai_search_result_list |
|||
self.completed_event.set() |
|||
|
|||
def handle_listen_response_error(self, func): |
|||
""" |
|||
Decorator: wraps a response callback so exceptions are captured instead of silently lost |
|||
:param func: |
|||
:return: |
|||
""" |
|||
|
|||
@wraps(func) |
|||
async def wrapper(*args, **kwargs): |
|||
try: |
|||
return await func(*args, **kwargs) |
|||
except Exception as e: |
|||
logger.error(f"{self.get_platform_name()}响应异常: {e}", exc_info=True) |
|||
# Mark failure and record the exception |
|||
self.fail_status = True |
|||
self.fail_exception = e |
|||
self.completed_event.set() |
|||
|
|||
return wrapper |
|||
|
|||
def get_platform_id(self) -> int: |
|||
return 2 |
|||
|
|||
def get_platform_name(self) -> str: |
|||
return 'TongYi' |
|||
@ -0,0 +1,213 @@ |
|||
# coding=utf-8 |
|||
import asyncio |
|||
import json |
|||
from functools import partial, wraps |
|||
|
|||
from glom import glom |
|||
from playwright.async_api import async_playwright, Browser |
|||
|
|||
import config |
|||
from abs_spider import AbstractAiSeoSpider |
|||
from domain.ai_seo import AiAnswer, AiSearchResult |
|||
from utils import create_logger |
|||
from utils.ai_seo_api_utils import AiSeoApis |
|||
from utils.image_utils import crop_image_left |
|||
|
|||
logger = create_logger(__name__) |
|||
|
|||
|
|||
class YiYanSpider(AbstractAiSeoSpider): |
|||
|
|||
def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False): |
|||
super().__init__(browser, prompt, keyword, think) |
|||
self.__listen_response = self.handle_listen_response_error(self.__listen_response) |
|||
|
|||
def get_home_url(self) -> str: |
|||
return 'https://yiyan.baidu.com/' |
|||
|
|||
async def _do_spider(self) -> AiAnswer: |
|||
# Initialise per-run state |
|||
self._init_data() |
|||
await self.browser_page.goto(self.get_home_url(), timeout=600000) |
|||
await asyncio.sleep(2) |
|||
# Verify the login state |
|||
await self.check_login() |
|||
if self.think: |
|||
think_btn = self.browser_page.locator("span:text('深度思考(X1 Turbo)')").locator('..') |
|||
if await think_btn.is_visible(): |
|||
# Read the class list only once the button is visible; get_attribute would otherwise block until timeout |
|||
clazz = (await think_btn.get_attribute('class')).split(' ') |
|||
if len(clazz) == 1: |
|||
await think_btn.click() |
|||
await asyncio.sleep(2) |
|||
# Begin interaction |
|||
chat_input_element = self.browser_page.locator("//div[@class='yc-editor']") |
|||
await chat_input_element.click() |
|||
await asyncio.sleep(2) |
|||
# Type the prompt |
|||
await self.browser_page.keyboard.insert_text(self.prompt) |
|||
await asyncio.sleep(2) |
|||
await self.browser_page.keyboard.press('Enter') |
|||
# Listen for responses |
|||
self.browser_page.on('response', partial(self.__listen_response)) |
|||
if self.think: |
|||
self.browser_page.on('response', partial(self.__listen_response_thinking)) |
|||
await asyncio.sleep(2) |
|||
try: |
|||
await self.browser_page.wait_for_selector("//div[@data-auto-test='anew_response']", state='attached', timeout=600000) |
|||
logger.debug('AI answer complete') |
|||
except TimeoutError: |
|||
# NOTE: assumes TimeoutError here is playwright.async_api.TimeoutError; the builtin would not catch Playwright waits |
|||
logger.error('AI answer timed out') |
|||
|
|||
# Check for recorded failures |
|||
if self.fail_status: |
|||
raise self.fail_exception |
|||
|
|||
# Locate the answer element |
|||
answer_element = self.browser_page.locator("//div[contains(@class, 'dialog-card-wrapper')]").nth(-1) |
|||
box = await answer_element.bounding_box() |
|||
logger.debug(f'answer_element: {box}') |
|||
view_port_height = box['height'] + 1000 |
|||
# Resize the viewport to fit the answer |
|||
await self.browser_page.set_viewport_size({ |
|||
'width': 1920, |
|||
'height': int(view_port_height) |
|||
}) |
|||
# Expand the search-result panel |
|||
open_search_btn_element = self.browser_page.locator("div:text('条网页信息源')") |
|||
if await open_search_btn_element.count() > 0: |
|||
await open_search_btn_element.click() |
|||
# Screenshot |
|||
screenshot_path = self._get_screenshot_path() |
|||
await self.browser_page.screenshot(path=screenshot_path) |
|||
# Crop the left-hand sidebar out of the image |
|||
crop_image_left(screenshot_path, 260) |
|||
|
|||
self.ai_answer.screenshot_file = screenshot_path |
|||
return self.ai_answer |
|||
|
|||
async def __listen_response(self, response): |
|||
if '/chat/history' not in response.url: |
|||
return |
|||
answer = '' |
|||
chat = {} |
|||
json_data = await response.json() |
|||
chats = list(dict.values(glom(json_data, 'data.chats', default={}))) |
|||
# Pick the target chat: the first robot message that carries content |
|||
for _chat in chats: |
|||
if _chat.get('role', '') != 'robot': |
|||
continue |
|||
content = glom(_chat, 'message.0.content', default='') |
|||
if not content: |
|||
continue |
|||
chat = _chat |
|||
break |
|||
if not chat: |
|||
return |
|||
answer = glom(chat, 'message.0.content', default="") |
|||
# Search results |
|||
if not self.think: |
|||
search_result_list = glom(chat, 'searchCitations.list', default=[]) |
|||
# Save the search results |
|||
ai_search_result_list = [] |
|||
for search_result in search_result_list: |
|||
url = search_result.get('url', '') |
|||
title = search_result.get('title', '') |
|||
desc = search_result.get('wild_abstract', '') |
|||
host_name = search_result.get('site', '') |
|||
date = search_result.get('date', '') |
|||
logger.debug(f"ai参考资料: [{host_name}][{date}]{title}({url})") |
|||
ai_search_result_list.append(AiSearchResult( |
|||
url=url, |
|||
title=title, |
|||
host_name=host_name, |
|||
body=desc, |
|||
publish_time=date |
|||
)) |
|||
self.ai_answer.search_result = ai_search_result_list |
|||
self.ai_answer.answer = answer |
|||
|
|||
async def __listen_response_thinking(self, response): |
|||
if '/chat/conversation/v2' not in response.url: |
|||
return |
|||
# Read the streamed body |
|||
data = {} |
|||
search_list = [] |
|||
stream = await response.body() |
|||
response_text = stream.decode('utf-8') |
|||
response_lines = response_text.split("\n\n") |
|||
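# Scan the SSE frames for the first one carrying a non-empty contents list (the |
|||
# inner break fires early when historyNeed == 1); that list holds the sources. |
|||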
for line in response_lines: |
|||
sub_lines = line.split("\n") |
|||
for sub_line in sub_lines: |
|||
if sub_line.startswith('data:'): |
|||
json_data = json.loads(sub_line[5:]) |
|||
history_need = json_data.get('historyNeed', None) |
|||
search_list = json_data.get('contents', []) |
|||
if history_need == 1 and search_list: |
|||
break |
|||
if search_list: |
|||
break |
|||
ai_search_result_list = [] |
|||
for search_result in search_list: |
|||
url = search_result.get('url', '') |
|||
title = search_result.get('title', '') |
|||
desc = search_result.get('siteAbstract', '') |
|||
host_name = search_result.get('name', '') |
|||
date = search_result.get('publishTime', '') |
|||
logger.debug(f"ai参考资料: [{host_name}][{date}]{title}({url})") |
|||
ai_search_result_list.append(AiSearchResult( |
|||
url=url, |
|||
title=title, |
|||
host_name=host_name, |
|||
body=desc, |
|||
publish_time=date |
|||
)) |
|||
self.ai_answer.search_result = ai_search_result_list |
|||
|
|||
|
|||
async def check_login(self): |
|||
# Look for the sidebar that only appears after login |
|||
try: |
|||
await self.browser_page.locator("//div[@id='eb_sidebar']").wait_for(state='attached', timeout=20000) |
|||
except Exception: |
|||
# Mark the session as invalid |
|||
await AiSeoApis.update_spider_session(self.session_info['id'], 2) |
|||
raise Exception(f"{self.get_platform_name()}登录失败 session_id: {self.session_info['id']}") |
|||
|
|||
def get_platform_id(self) -> int: |
|||
return 6 |
|||
|
|||
def get_platform_name(self) -> str: |
|||
return 'YiYan' |
|||
|
|||
def handle_listen_response_error(self, func): |
|||
""" |
|||
Decorator: wraps a response callback so exceptions are captured instead of silently lost |
|||
:param func: |
|||
:return: |
|||
""" |
|||
|
|||
@wraps(func) |
|||
async def wrapper(*args, **kwargs): |
|||
try: |
|||
return await func(*args, **kwargs) |
|||
except Exception as e: |
|||
logger.error(f"{self.get_platform_name()}响应异常: {e}", exc_info=True) |
|||
# Mark failure and record the exception |
|||
self.fail_status = True |
|||
self.fail_exception = e |
|||
self.completed_event.set() |
|||
|
|||
return wrapper |
|||
|
|||
async def run(): |
|||
playwright = await async_playwright().start() |
|||
browser = await playwright.chromium.launch(headless=config.BROWSER_HANDLESS, |
|||
chromium_sandbox=config.BROWSER_ENABLE_SANDBOX, |
|||
ignore_default_args=config.BROWSER_IGNORE_DEFAULT_ARGS, |
|||
channel="chrome", |
|||
args=config.BROWSER_ARGS) |
|||
spider = YiYanSpider(browser, '你好', '') |
|||
await spider.run() |
|||
if __name__ == '__main__': |
|||
asyncio.run(run()) |
|||
@ -0,0 +1,174 @@ |
|||
# coding=utf-8 |
|||
import asyncio |
|||
import re |
|||
from functools import partial, wraps |
|||
 |
|||
from playwright.async_api import Browser |
|||
|
|||
from abs_spider import AbstractAiSeoSpider |
|||
from domain.ai_seo import AiAnswer, AiSearchResult |
|||
from utils import create_logger |
|||
from glom import glom |
|||
|
|||
from utils.image_utils import crop_image_left |
|||
|
|||
logger = create_logger(__name__) |
|||
|
|||
class YuanBaoSpider(AbstractAiSeoSpider): |
|||
|
|||
def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False): |
|||
super().__init__(browser, prompt, keyword, think) |
|||
self.__listen_response = self.handle_listen_response_error(self.__listen_response) |
|||
|
|||
def get_home_url(self) -> str: |
|||
return 'https://yuanbao.tencent.com/' |
|||
|
|||
async def _do_spider(self) -> AiAnswer: |
|||
# Initialise per-run state |
|||
self._init_data() |
|||
self.is_get_detail = False |
|||
await self.browser_page.goto(self.get_home_url(), timeout=600000) |
|||
await asyncio.sleep(2) |
|||
# Enable deep-think mode |
|||
if self.think: |
|||
think_button = self.browser_page.locator("//button[@dt-button-id='deep_think']") |
|||
if await think_button.is_visible(): |
|||
model_id = await think_button.get_attribute('dt-model-id') |
|||
if not model_id == 'deep_seek': |
|||
await think_button.click() |
|||
await asyncio.sleep(2) |
|||
# Enable web search |
|||
search_button = self.browser_page.locator("//button[@dt-button-id='online_search']") |
|||
if await search_button.is_visible(): |
|||
class_str = await search_button.get_attribute('class') |
|||
clazz = class_str.split(' ') |
|||
if 'checked' not in clazz: |
|||
logger.debug('Web search was off; enabling it') |
|||
await search_button.click() |
|||
await asyncio.sleep(1) |
|||
|
|||
# Begin interaction |
|||
chat_input_element = self.browser_page.locator("//div[contains(@class, 'chat-input-editor')]") |
|||
await chat_input_element.click() |
|||
# Type the prompt |
|||
await self.browser_page.keyboard.type(self.prompt) |
|||
await asyncio.sleep(2) |
|||
await self.browser_page.keyboard.press('Enter') |
|||
# Listen for responses |
|||
self.browser_page.on('response', partial(self.__listen_response)) |
|||
await self.completed_event.wait() |
|||
|
|||
# Check for recorded failures |
|||
if self.fail_status: |
|||
raise self.fail_exception |
|||
|
|||
# Locate the answer element |
|||
answer_element = self.browser_page.locator("//div[@class='agent-chat__list__item__content']").nth(-1) |
|||
box = await answer_element.bounding_box() |
|||
logger.debug(f'answer_element: {box}') |
|||
view_port_height = box['height'] + 500 |
|||
# Resize the viewport to fit the answer |
|||
await self.browser_page.set_viewport_size({ |
|||
'width': 1920, |
|||
'height': int(view_port_height) |
|||
}) |
|||
# Collapse the sidebar |
|||
# await self.browser_page.locator("//div[@data-desc='fold']").click() |
|||
# Expand the web-search results |
|||
search_list_element = self.browser_page.locator("(//div[contains(@data-title, '资料作为参考')])[1]/span") |
|||
if await search_list_element.is_visible(): |
|||
await search_list_element.click() |
|||
# Screenshot |
|||
screenshot_path = self._get_screenshot_path() |
|||
await self.browser_page.screenshot(path=screenshot_path) |
|||
crop_image_left(screenshot_path, 260) |
|||
self.ai_answer.screenshot_file = screenshot_path |
|||
return self.ai_answer |
|||
|
|||
async def __listen_response(self, response): |
|||
if '/agent/conversation/v1/detail' not in response.url or self.is_get_detail: |
|||
return |
|||
json_data = await response.json() |
|||
# Extract the payload |
|||
if not json_data['convs']: |
|||
return |
|||
convs = json_data['convs'] |
|||
content = {} |
|||
for conv in convs: |
|||
key = 'speechesV2.0.content' |
|||
content = glom(conv, key, default=[]) |
|||
if len(content) > 1: |
|||
break |
|||
# Walk the content items |
|||
search_list = None |
|||
think = None |
|||
text = None |
|||
for item in content: |
|||
if item['type'] == 'text': |
|||
text = item.get('msg', '') |
|||
elif item['type'] == 'searchGuid': |
|||
search_list = item.get('docs', []) |
|||
elif item['type'] == 'think': |
|||
think = item.get('content', '') |
|||
logger.debug(f'AI answer: {text}') |
|||
ai_search_result_list = [] |
|||
self.ai_answer.answer = text or ''  # guard: text stays None when no text item was found |
|||
if search_list: |
|||
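# YuanBao cites sources with footnote-style markers such as "[^2]" (format |
|||
# inferred from this regex); collect the distinct source indices. |
|||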
pattern = r'\[\^(\d+)\]' |
|||
index_data = list(set(re.findall(pattern, self.ai_answer.answer))) |
|||
for index, search_result in enumerate(search_list): |
|||
# Citation indices in the answer are 1-based |
|||
is_referenced = "1" if str(index + 1) in index_data else "0" |
|||
ai_search_result_list.append( |
|||
AiSearchResult( |
|||
title=search_result.get('title', ''), |
|||
url=search_result.get('url', ''), |
|||
host_name=search_result.get('web_site_name', ''), |
|||
body=search_result.get('quote', ''), |
|||
publish_time=search_result.get('publish_time', 0), |
|||
is_referenced=is_referenced |
|||
) |
|||
) |
|||
|
|||
logger.debug(f'AI references: {search_list}') |
|||
self.ai_answer.search_result = ai_search_result_list |
|||
self.is_get_detail = True |
|||
self.completed_event.set() |
|||
|
|||
def handle_listen_response_error(self, func): |
|||
""" |
|||
Decorator: wraps a response callback so exceptions are captured instead of silently lost |
|||
:param func: |
|||
:return: |
|||
""" |
|||
|
|||
@wraps(func) |
|||
async def wrapper(*args, **kwargs): |
|||
try: |
|||
return await func(*args, **kwargs) |
|||
except Exception as e: |
|||
logger.error(f"{self.get_platform_name()}响应异常: {e}", exc_info=True) |
|||
# Mark failure and record the exception |
|||
self.fail_status = True |
|||
self.fail_exception = e |
|||
self.completed_event.set() |
|||
|
|||
return wrapper |
|||
|
|||
def get_platform_id(self) -> int: |
|||
return 3 |
|||
|
|||
def get_platform_name(self) -> str: |
|||
return 'YuanBao' |
|||
static/stealth.min.js
File diff suppressed because it is too large
@ -0,0 +1,111 @@ |
|||
# coding=utf-8 |
|||
import hashlib |
|||
import os |
|||
from pathlib import Path |
|||
import re |
|||
from .logger_utils import create_logger |
|||
import json |
|||
from typing import Any, Union |
|||
from datetime import datetime |
|||
|
|||
|
|||
def parse_nested_json( |
|||
json_str: str, |
|||
default: Any = None, |
|||
recursive: bool = True |
|||
) -> Union[dict, list, Any]: |
|||
""" |
|||
Parse a JSON string whose values may themselves be JSON-encoded strings |
|||
 |
|||
:param json_str: the JSON string to parse |
|||
:param default: value returned when parsing fails |
|||
:param recursive: whether to recursively parse nested JSON strings |
|||
:return: the parsed Python object |
|||
""" |
|||
|
|||
def _parse(obj: Any) -> Any: |
|||
# Recursively parse nested structures |
|||
if isinstance(obj, dict): |
|||
return {k: _parse(v) for k, v in obj.items()} |
|||
elif isinstance(obj, list): |
|||
return [_parse(elem) for elem in obj] |
|||
elif recursive and isinstance(obj, str): |
|||
try: |
|||
parsed = json.loads(obj) |
|||
return _parse(parsed)  # recurse into the freshly parsed object |
|||
except json.JSONDecodeError: |
|||
return obj |
|||
else: |
|||
return obj |
|||
|
|||
# Handle empty input |
|||
if not json_str: |
|||
return default if default is not None else {} |
|||
|
|||
try: |
|||
# First parse the outer JSON |
|||
parsed = json.loads(json_str) |
|||
# Then recursively process nested structures |
|||
return _parse(parsed) |
|||
except (TypeError, json.JSONDecodeError): |
|||
# Decode and type errors fall back to the default |
|||
return default if default is not None else {} |
|||
except Exception as e: |
|||
# Any other failure also falls back to the default (e is available for optional logging) |
|||
return default if default is not None else {} |
|||
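# Illustrative example: parse_nested_json('{"a": "{\\"b\\": 1}"}') -> {'a': {'b': 1}} |
|||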
|
|||
|
|||
def convert_timestamp(timestamp): |
|||
""" |
|||
Detect whether a timestamp is in seconds or milliseconds and convert it to a datetime. |
|||
 |
|||
Parameters: |
|||
timestamp -- the timestamp (int or float) |
|||
 |
|||
Returns: |
|||
a datetime object |
|||
""" |
|||
# Timestamps above 1e10 are treated as millisecond precision |
|||
if timestamp > 1e10: |
|||
timestamp /= 1000.0  # convert to seconds |
|||
return datetime.fromtimestamp(timestamp) |
|||
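# Illustrative example: convert_timestamp(1700000000) and convert_timestamp(1700000000000) |
|||
# both resolve to 2023-11-14 (local time). |
|||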
|
|||
def make_sha256_hash(file_path: str) -> str: |
|||
""" |
|||
Compute the SHA-256 hash of a file |
|||
:param file_path: path to the file |
|||
:return: hex digest of the hash |
|||
""" |
|||
hash_obj = hashlib.new("sha256") |
|||
with open(file_path, "rb") as f: |
|||
while chunk := f.read(8192):  # read in chunks so large files do not exhaust memory |
|||
hash_obj.update(chunk) |
|||
return hash_obj.hexdigest() |
|||
|
|||
|
|||
def css_to_dict(css_line): |
|||
""" |
|||
Convert a single line of CSS variable declarations into a Python dict |
|||
 |
|||
Parameters: |
|||
css_line (str): string of CSS variable declarations, e.g.: |
|||
'--ds-button-color: #fff; -button-text-color: #4c4c4c; button-border-color: rgba(0, 0, 0, 0.12);' |
|||
 |
|||
Returns: |
|||
dict: mapping of CSS variable names to values |
|||
""" |
|||
# Match every declaration with a regex |
|||
# Pattern: a name with 0-2 leading hyphens, then a colon and the value |
|||
pattern = r'(-{0,2}[a-zA-Z0-9-]+)\s*:\s*([^;]+);?' |
|||
matches = re.findall(pattern, css_line) |
|||
|
|||
# 将匹配结果转换为字典 |
|||
result = {} |
|||
for match in matches: |
|||
var_name = match[0]  # keep the original number of hyphens |
|||
var_value = match[1].strip() |
|||
result[var_name] = var_value |
|||
|
|||
return result |
|||
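# Illustrative example: css_to_dict('--ds-button-color: #fff;') -> {'--ds-button-color': '#fff'} |
|||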
|
|||
|
|||
@ -0,0 +1,114 @@ |
|||
# coding=utf-8 |
|||
import asyncio |
|||
import json |
|||
|
|||
from openai import OpenAI |
|||
|
|||
from utils import create_logger |
|||
from utils.ai_seo_api_utils import AiSeoApis |
|||
|
|||
import config |
|||
|
|||
|
|||
logger = create_logger(platform="ai") |
|||
client = OpenAI(api_key=config.OPENAI_API_KEY, base_url="https://api.deepseek.com") |
|||
|
|||
async def main(): |
|||
results = await AiSeoApis.get_task_result_list(project_id=3) |
|||
for result in results: |
|||
if result['read_rank_status'] == 1: |
|||
logger.info(f"[{result['id']}] 已读取过排名") |
|||
continue |
|||
prompt = f""" |
|||
任务: 请在以下文本中, 按出现的顺序提取出品牌词, 多次出现的品牌词仅提取一次, 返回json数组 |
|||
返回格式: json中包含brands字段, 字段的值为数组, 数组内容是按顺序提取的品牌词 |
|||
|
|||
文本正文: |
|||
{result['content']} |
|||
""" |
|||
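# The model is instructed to reply with JSON shaped like {"brands": ["BrandA", "BrandB"]} |
|||
# (illustrative names); the reply is parsed below and scanned for the target brand. |
|||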
response = client.chat.completions.create( |
|||
model="deepseek-chat", |
|||
response_format={ |
|||
'type': 'json_object' |
|||
}, |
|||
messages=[ |
|||
{"role": "system", "content": "You are a helpful assistant"}, |
|||
{"role": "user", "content": prompt}, |
|||
], |
|||
stream=False |
|||
) |
|||
# JSON parsed from the model's reply |
|||
ai_json_result = {} |
|||
# Rank derived from the model's reply |
|||
rank = 0 |
|||
# Read status (1 = parsed, 2 = failed) |
|||
read_rank_status = 2 |
|||
try: |
|||
ai_json_result = json.loads(response.choices[0].message.content) |
|||
read_rank_status = 1 |
|||
except Exception as e: |
|||
logger.error(f"[{result['id']}] 读取ai返回的json失败: {e}") |
|||
logger.error(f"ai提示词: {prompt}") |
|||
continue |
|||
# Read the rank |
|||
brands = ai_json_result.get('brands', []) |
|||
for index, brand in enumerate(brands, start=1): |
|||
if '沙利文' in brand: |
|||
rank = index |
|||
logger.info(f"[{result['id']}] brand mentioned, rank: {rank}") |
|||
break |
|||
# Push the rank update |
|||
update_result = await AiSeoApis.update_result_rank(result['id'], rank, read_rank_status) |
|||
logger.info(f"[{result['id']}] 更新排名结果: {update_result}") |
|||
|
|||
async def read_rank(content, brand_word): |
|||
# Split the brand-word list |
|||
brand_words = brand_word.split(',') |
|||
prompt = f""" |
|||
任务: 请在以下文本中, 按出现的顺序提取出品牌词, 多次出现的品牌词仅提取一次, 返回json数组 |
|||
返回格式: json中包含brands字段, 字段的值为数组, 数组内容是按顺序提取的品牌词 |
|||
|
|||
文本正文: |
|||
{content} |
|||
""" |
|||
response = client.chat.completions.create( |
|||
model="deepseek-chat", |
|||
response_format={ |
|||
'type': 'json_object' |
|||
}, |
|||
messages=[ |
|||
{"role": "system", "content": "You are a helpful assistant"}, |
|||
{"role": "user", "content": prompt}, |
|||
], |
|||
stream=False |
|||
) |
|||
# JSON parsed from the model's reply |
|||
ai_json_result = {} |
|||
# Rank derived from the model's reply |
|||
rank = 0 |
|||
try: |
|||
ai_json_result = json.loads(response.choices[0].message.content) |
|||
except Exception as e: |
|||
logger.error(f"读取ai返回的json失败: {e}") |
|||
logger.error(f"ai提示词: {prompt}") |
|||
return [], 0 |
|||
# Read the rank |
|||
brands = ai_json_result.get('brands', []) |
|||
for index, brand in enumerate(brands, start=1): |
|||
found = any(sub in brand for sub in brand_words) |
|||
if found: |
|||
rank = index |
|||
logger.info(f"Brand mentioned, rank: {rank}") |
|||
break |
|||
return brands, rank |
|||
|
|||
|
|||
if __name__ == '__main__': |
|||
asyncio.run(main()) |
|||
@ -0,0 +1,275 @@ |
|||
# coding=utf-8 |
|||
import datetime |
|||
import json |
|||
import os |
|||
|
|||
import httpx |
|||
|
|||
import config |
|||
from utils import create_logger |
|||
|
|||
logger = create_logger(__name__) |
|||
|
|||
|
|||
class AiSeoApis: |
|||
@staticmethod |
|||
def build_full_url(uri): |
|||
return f"{config.AI_SEO_BASE_URL}{uri}" |
|||
|
|||
@staticmethod |
|||
async def get_one_task(date='', platform_ids=''): |
|||
""" |
|||
Fetch one task |
|||
:return: |
|||
""" |
|||
uri = '/api/third/getTask' |
|||
|
|||
url = AiSeoApis.build_full_url(uri) |
|||
params = {**config.AI_SEO_API_AUTH} |
|||
if date: |
|||
params['date'] = date |
|||
if platform_ids: |
|||
params['platform_ids'] = platform_ids |
|||
async with httpx.AsyncClient() as client: |
|||
response = await client.get(url, params=params, timeout=60) |
|||
json_result = response.json() |
|||
if not json_result['code'] == 0: |
|||
logger.error(f"获取任务失败: {json_result['msg']}") |
|||
return json_result['data'] |
|||
|
|||
@staticmethod |
|||
async def get_urgent_task_count(): |
|||
""" |
|||
Fetch the number of urgent tasks |
|||
:return: |
|||
""" |
|||
uri = '/api/frontend/thirdParty/getUrgentTaskCount' |
|||
url = AiSeoApis.build_full_url(uri) |
|||
params = {**config.AI_SEO_API_AUTH} |
|||
async with httpx.AsyncClient() as client: |
|||
response = await client.get(url, params=params, timeout=60) |
|||
json_result = response.json() |
|||
if not json_result['code'] == 0: |
|||
logger.error(f"获取任务失败: {json_result['msg']}") |
|||
return json_result['data'] |
|||
|
|||
@staticmethod |
|||
async def upload_screenshot_file(file_path): |
|||
""" |
|||
Upload a screenshot file |
|||
:param file_path: |
|||
:return: |
|||
""" |
|||
uri = '/api/third/oss/upload' |
|||
url = AiSeoApis.build_full_url(uri) |
|||
params = { |
|||
**config.AI_SEO_API_AUTH, |
|||
'oss_path': 'ai_seo/screenshot' |
|||
} |
|||
|
|||
with open(file_path, 'rb') as file: |
|||
async with httpx.AsyncClient() as client: |
|||
files = {'file': (file_path, file, 'image/jpeg')} |
|||
response = await client.post(url, params=params, files=files, timeout=60) |
|||
json_result = response.json() |
|||
if not json_result['code'] == 0: |
|||
logger.error(f"获取任务失败: {json_result['msg']}") |
|||
return json_result['data'] |
|||
|
|||
@staticmethod |
|||
async def submit_task(json_data): |
|||
""" |
|||
Submit a task |
|||
:param json_data: |
|||
:return: |
|||
""" |
|||
uri = '/api/third/submitProjectTask' |
|||
url = AiSeoApis.build_full_url(uri) |
|||
async with httpx.AsyncClient() as client: |
|||
print("json_data",json.dumps(json_data)) |
|||
response = await client.post(url, json=json_data, timeout=120) |
|||
json_result = response.json() |
|||
if not json_result['code'] == 0: |
|||
logger.error(f"获取任务失败: {json_result['msg']}") |
|||
return json_result['data'] |
|||
|
|||
@staticmethod |
|||
async def get_task_result_list(project_id): |
|||
""" |
|||
Fetch the list of task results |
|||
:return: |
|||
""" |
|||
uri = '/api/frontend/thirdParty/projectResult/list' |
|||
url = AiSeoApis.build_full_url(uri) |
|||
params = {**config.AI_SEO_API_AUTH, 'project_id': project_id} |
|||
async with httpx.AsyncClient() as client: |
|||
response = await client.get(url, params=params) |
|||
json_result = response.json() |
|||
if not json_result['code'] == 0: |
|||
logger.error(f"获取任务失败: {json_result['msg']}") |
|||
return json_result['data'] |
|||
|
|||
@staticmethod |
|||
async def update_result_rank(result_id, rank, read_rank_status): |
|||
""" |
|||
Update the rank of a task result |
|||
:return: |
|||
""" |
|||
uri = '/api/frontend/thirdParty/projectResult/updateRank' |
|||
url = AiSeoApis.build_full_url(uri) |
|||
json_data = {**config.AI_SEO_API_AUTH, 'id': result_id, 'rank': rank, 'read_rank_status': read_rank_status} |
|||
async with httpx.AsyncClient() as client: |
|||
response = await client.post(url, json=json_data) |
|||
json_result = response.json() |
|||
if not json_result['code'] == 0: |
|||
logger.error(f"获取任务失败: {json_result['msg']}") |
|||
return json_result['data'] |
|||
|
|||
@staticmethod |
|||
async def update_task_status(task_id, status): |
|||
""" |
|||
Update the status of a task |
|||
:param task_id: |
|||
:param status: |
|||
:return: |
|||
""" |
|||
uri = '/api/third/updateTask' |
|||
url = AiSeoApis.build_full_url(uri) |
|||
json_data = {**config.AI_SEO_API_AUTH, 'task_id': task_id, 'status': status} |
|||
async with httpx.AsyncClient() as client: |
|||
response = await client.post(url, json=json_data, timeout=60) |
|||
json_result = response.json() |
|||
if not json_result['code'] == 0: |
|||
logger.error(f"更新任务失败: {json_result['msg']}") |
|||
return None |
|||
return json_result['data'] |
|||
|
|||
@staticmethod |
|||
async def heartbeat(dc_id, load_count=0): |
|||
""" |
|||
Heartbeat |
|||
:param dc_id: |
|||
:param load_count: |
|||
:return: |
|||
""" |
|||
uri = '/api/frontend/thirdParty/spider/heartbeat' |
|||
url = AiSeoApis.build_full_url(uri) |
|||
send_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") |
|||
json_data = { |
|||
**config.AI_SEO_API_AUTH, |
|||
'dc_id': dc_id, |
|||
'load_count': load_count, |
|||
'send_time': send_time |
|||
} |
|||
async with httpx.AsyncClient() as client: |
|||
response = await client.post(url, json=json_data, timeout=60) |
|||
json_result = response.json() |
|||
if not json_result['code'] == 0: |
|||
logger.error(f"心跳失败: {json_result['msg']}") |
|||
return None |
|||
return json_result['data'] |
|||
|
|||
@staticmethod |
|||
async def get_spider_session(platform_id): |
|||
""" |
|||
Fetch a spider session |
|||
:param platform_id: |
|||
:return: |
|||
""" |
|||
uri = '/api/third/getOneSpiderSession' |
|||
url = AiSeoApis.build_full_url(uri) |
|||
json_data = {**config.AI_SEO_API_AUTH, 'platform_id': platform_id} |
|||
async with httpx.AsyncClient() as client: |
|||
response = await client.get(url, params=json_data, timeout=60) |
|||
json_result = response.json() |
|||
if not json_result['code'] == 0: |
|||
logger.error(f"获取爬虫session失败") |
|||
return None |
|||
return json_result['data'] |
|||
|
|||
@staticmethod |
|||
async def download_spider_session_file(url, path): |
|||
""" |
|||
Download a spider session file |
|||
:param url: |
|||
:param path: |
|||
:return: |
|||
""" |
|||
# Get the directory that will hold the file |
|||
dir_path = os.path.dirname(path) |
|||
|
|||
os.makedirs(dir_path, exist_ok=True) |
|||
|
|||
async with httpx.AsyncClient(verify=False) as client: |
|||
response = await client.get(url, follow_redirects=True) |
|||
with open(path, 'wb') as file: |
|||
file.write(response.content) |
|||
|
|||
@staticmethod |
|||
async def update_spider_session(session_id, status=1): |
|||
""" |
|||
Update a spider session's status |
|||
:param session_id: |
|||
:param status: |
|||
:return: |
|||
""" |
|||
uri = '/api/frontend/thirdParty/spider/session/update' |
|||
url = AiSeoApis.build_full_url(uri) |
|||
json_data = {**config.AI_SEO_API_AUTH, 'id': session_id, 'status': status} |
|||
async with httpx.AsyncClient() as client: |
|||
response = await client.post(url, json=json_data, timeout=60) |
|||
json_result = response.json() |
|||
if not json_result['code'] == 0: |
|||
logger.error(f"更新爬虫session失败") |
|||
return None |
|||
return json_result['data'] |
|||
|
|||
@staticmethod |
|||
async def upload_session_file(file_path): |
|||
""" |
|||
Upload a session file |
|||
:param file_path: |
|||
:return: |
|||
""" |
|||
uri = '/api/frontend/thirdParty/oss/upload' |
|||
url = AiSeoApis.build_full_url(uri) |
|||
params = { |
|||
**config.AI_SEO_API_AUTH, |
|||
'oss_path': 'ai_seo/session' |
|||
} |
|||
with open(file_path, 'rb') as file: |
|||
async with httpx.AsyncClient() as client: |
|||
files = {'file': (file_path, file, 'application/json')} |
|||
response = await client.post(url, params=params, files=files, timeout=60) |
|||
json_result = response.json() |
|||
if not json_result['code'] == 0: |
|||
logger.error(f"上传session文件失败: {json_result['msg']}") |
|||
return json_result['data'] |
|||
|
|||
@staticmethod |
|||
async def save_spider_session(platform_id, file_url, file_hash, account=''): |
|||
""" |
|||
Create a new spider session record |
|||
:param platform_id: |
|||
:param file_url: |
|||
:param file_hash: |
|||
:param account: |
|||
:return: |
|||
""" |
|||
uri = '/api/frontend/thirdParty/spider/session/save' |
|||
url = AiSeoApis.build_full_url(uri) |
|||
json_data = { |
|||
**config.AI_SEO_API_AUTH, |
|||
'platform_id': platform_id, |
|||
'account': account, |
|||
'url': file_url, |
|||
'hash': file_hash |
|||
} |
|||
async with httpx.AsyncClient() as client: |
|||
response = await client.post(url, json=json_data, timeout=120) |
|||
json_result = response.json() |
|||
if not json_result['code'] == 0: |
|||
logger.error(f"保存session: {json_result['msg']}") |
|||
return json_result['data'] |
|||
|
|||
@ -0,0 +1,43 @@ |
|||
# coding=utf-8 |
|||
from PIL import Image |
|||
import os |
|||
|
|||
from utils import create_logger |
|||
|
|||
logger = create_logger(__name__) |
|||
|
|||
|
|||
def crop_image_left(image_path, crop_width): |
|||
""" |
|||
Crop a strip of the given width off the left of an image, overwriting the original |
|||
 |
|||
Parameters: |
|||
image_path (str): path to the image file |
|||
crop_width (int): width to crop off (pixels) |
|||
""" |
|||
try: |
|||
# Open the source image |
|||
with Image.open(image_path) as img: |
|||
# Image dimensions |
|||
width, height = img.size |
|||
|
|||
# Validate the crop width |
|||
if crop_width <= 0 or crop_width >= width: |
|||
raise ValueError(f"切割宽度必须大于0且小于图片宽度({width}px)") |
|||
|
|||
# Compute the crop box (left, upper, right, lower) |
|||
crop_box = (crop_width, 0, width, height) |
|||
|
|||
# Perform the crop |
|||
cropped_img = img.crop(crop_box) |
|||
|
|||
# Save the cropped image to a temporary file |
|||
temp_path = image_path + ".png" |
|||
cropped_img.save(temp_path, quality=95) |
|||
# Replace the original file |
|||
os.replace(temp_path, image_path) |
|||
logger.info(f"成功从左侧切割 {crop_width}px 并覆盖原图") |
|||
except Exception as e: |
|||
print(f"处理图片时出错: {e}") |
|||
if 'temp_path' in locals() and os.path.exists(temp_path): |
|||
os.remove(temp_path) |
|||
@ -0,0 +1,48 @@ |
|||
# coding=utf-8 |
|||
import sys |
|||
|
|||
from loguru import logger |
|||
import os |
|||
from datetime import datetime |
|||
|
|||
import config |
|||
|
|||
|
|||
def create_logger(platform: str): |
|||
""" |
|||
Initialise the logger |
|||
:param platform: platform name bound into each record |
|||
""" |
|||
# Logs live in a per-month directory |
|||
current_month = datetime.now().strftime("%Y-%m") |
|||
log_dir = os.path.join("logs", current_month) |
|||
os.makedirs(log_dir, exist_ok=True) |
|||
# The log file is named after the current day of the month |
|||
current_day = datetime.now().strftime("%d") |
|||
log_file = os.path.join(log_dir, f"{current_day}.log") |
|||
|
|||
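# NOTE: loguru's sinks are process-global; each create_logger() call removes |
|||
# all existing sinks and re-adds them, so the last call's configuration wins. |
|||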
logger.remove() |
|||
# File sink |
|||
logger.add( |
|||
log_file, |
|||
level=config.LOG_LEVEL, |
|||
rotation="1 day", # 每天轮换日志文件 |
|||
retention="3 months", # 日志保留3个月 |
|||
encoding="utf-8", |
|||
enqueue=True, # 异步写入 |
|||
backtrace=True, # 显示回溯信息 |
|||
diagnose=True, # 显示诊断信息 |
|||
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level}</level> | " |
|||
"<cyan>{extra[platform]}</cyan> | <level>{message}</level>" |
|||
) |
|||
# Console sink |
|||
logger.add( |
|||
sys.stderr, |
|||
level=config.LOG_LEVEL, |
|||
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level}</level> | " |
|||
"<cyan>{extra[platform]}</cyan> | <level>{message}</level>", |
|||
colorize=True |
|||
) |
|||
# Bind the default context |
|||
return logger.bind(platform=platform) |
|||
@ -0,0 +1,42 @@ |
|||
# coding=utf-8 |
|||
from pathlib import Path |
|||
|
|||
import config |
|||
from utils.ai_seo_api_utils import AiSeoApis |
|||
|
|||
|
|||
async def get_spider_session(platform_id): |
|||
""" |
|||
Fetch a usable spider session |
|||
:param platform_id: |
|||
:return: |
|||
""" |
|||
base_path = f'{config.ROOT_PATH}/data/session_data' |
|||
# Session metadata from the API |
|||
session_info = await AiSeoApis.get_spider_session(platform_id) |
|||
if not session_info: |
|||
raise Exception(f"平台id: {platform_id} 没有可用的爬虫session") |
|||
# Look the file up by session id in the local cache |
|||
target = search_session_file(session_info['id'], base_path) |
|||
# Download and cache the file when it is not present locally |
|||
if not target: |
|||
await AiSeoApis.download_spider_session_file(session_info['url'], f"{base_path}/{session_info['id']}.json") |
|||
target = f"{session_info['id']}.json" |
|||
else: |
|||
target = target[0] |
|||
session_info['session_path'] = f"{base_path}/{target}" |
|||
return session_info |
|||
|
|||
|
|||
def search_session_file(session_id, path): |
|||
folder_path = Path(path) |
|||
file_filter = f"{session_id}.json" |
|||
return [file.name for file in folder_path.glob(file_filter)] |
|||
|
|||
async def main(): |
|||
path = await get_spider_session(1) |
|||
print(path) |
|||
|
|||
if __name__ == '__main__': |
|||
import asyncio |
|||
asyncio.run(main()) |
|||