Browse Source
feat(ai_seo): 支持 datetime 类型的 publish_time 字段并优化工作流爬虫集成
feat(ai_seo): 支持 datetime 类型的 publish_time 字段并优化工作流爬虫集成
- 在 `AiAnswer` 数据类中扩展 `publish_time` 类型,支持传入 `datetime` 实例,并自动格式化为字符串
- 新增 `WorkFlowApiSpider` 爬虫类,用于通过 API 方式获取工作流平台的 AI 回复结果
- 更新 `main.py` 和 `run.py`,注册并调用工作流爬虫逻辑
- 添加定时任务调度逻辑以支持工作流平台的任务执行
- 支持根据平台 ID 动态判断使用传统爬虫或工作流 API 爬虫
- 增加对短结果、空结果的异常处理及任务状态回滚机制
- 浏览器初始化逻辑优化,仅在需要时启动浏览器实例

master
4 changed files with 215 additions and 5 deletions
-
6domain/ai_seo.py
-
7main.py
-
102run.py
-
105spiders/ai_seo/workflow.py
@ -0,0 +1,105 @@ |
|||||
|
# coding=utf-8 |
||||
|
from datetime import datetime |
||||
|
|
||||
|
import httpx |
||||
|
|
||||
|
import config |
||||
|
from domain.ai_seo import AiAnswer, AiSearchResult |
||||
|
from utils import create_logger |
||||
|
|
||||
|
# Platform registry, keyed by numeric platform id.
# Each entry carries the platform's id, display name and the API key used for
# its workflow; an empty api_key means no key is configured for that platform.
# NOTE(review): the API keys below are committed in source — consider moving
# them to configuration/secrets management and rotating them.
PLATFORMS = {
    platform_id: {'id': platform_id, 'name': name, 'api_key': api_key}
    for platform_id, name, api_key in (
        (1, 'Deepseek', ''),
        (2, '通义千问', 'app-mQE0lOxB0G49r4tQSv2LgIOV'),
        (3, '腾讯元宝', ''),
        (4, 'Kimi', ''),
        (5, '豆包', 'app-lD5HbD03EW7pamzIV2VEIyR6'),
        (6, '文心一言', ''),
    )
}
||||
|
|
||||
|
|
||||
|
|
||||
|
# Module-level logger obtained from the project's create_logger helper,
# named after this module.
logger = create_logger(__name__)
||||
|
|
||||
|
class WorkFlowApiSpider: |
||||
|
platform_id: int |
||||
|
platform_name: str |
||||
|
prompt: str |
||||
|
keyword: str |
||||
|
ai_answer: AiAnswer | None = None |
||||
|
fail_status: bool = False |
||||
|
fail_exception: Exception | None = None |
||||
|
load_session: bool = True |
||||
|
task_id: int = 0 |
||||
|
think: bool = False, |
||||
|
api_key: str = '' |
||||
|
|
||||
|
def __init__(self, prompt: str, keyword: str, platform_id: int): |
||||
|
self.platform_id = platform_id |
||||
|
self.prompt = prompt |
||||
|
self.keyword = keyword |
||||
|
|
||||
|
platform = PLATFORMS.get(int(platform_id), {}) |
||||
|
if not platform: |
||||
|
raise Exception('平台不存在') |
||||
|
self.platform_name = platform.get('name', '') |
||||
|
self.api_key = platform.get('api_key', '') |
||||
|
if not self.api_key: |
||||
|
raise Exception('平台未配置api_key') |
||||
|
|
||||
|
async def run(self): |
||||
|
logger.info(f"{self.platform_name}Api开始获取数据 提问词: {self.prompt}") |
||||
|
# 构建参数 |
||||
|
params = { |
||||
|
"response_mode": "blocking", |
||||
|
"user": config.DIFY_USER, |
||||
|
"inputs": { |
||||
|
"prompt": self.prompt |
||||
|
} |
||||
|
} |
||||
|
headers = { |
||||
|
"Authorization": f"Bearer {self.api_key}" |
||||
|
} |
||||
|
# 发送请求 |
||||
|
async with httpx.AsyncClient() as client: |
||||
|
response = await client.post(f"{config.DIFY_BASE_URL}/workflows/run", json=params, headers=headers, timeout=300) |
||||
|
json_result = response.json() |
||||
|
result = json_result.get('data', []) |
||||
|
if not result or not result['status'] == 'succeeded': |
||||
|
logger.error(f"{self.platform_name}Api获取数据失败: {json_result}") |
||||
|
raise Exception(f"{self.platform_name}Api获取数据失败") |
||||
|
# 获取工作流返回的数据 |
||||
|
workflow_result = result['outputs']['work_flow_result'] |
||||
|
# 用量数据 |
||||
|
usage = workflow_result['usage'] |
||||
|
# ai回复内容 带标签 |
||||
|
answer = workflow_result['answer'] |
||||
|
# ai回复内容 不带标签 |
||||
|
pure_answer = workflow_result['pure_answer'] |
||||
|
logger.debug(f"ai回复: {pure_answer}") |
||||
|
# 联网搜索结果 |
||||
|
web_searches = workflow_result.get('web_search', []) |
||||
|
# 转换后的结果 |
||||
|
search_items = [] |
||||
|
for item in web_searches: |
||||
|
# 提取publish_time |
||||
|
if not item['datePublished']: |
||||
|
publish_time = None |
||||
|
else: |
||||
|
publish_time = datetime.strptime(item['datePublished'], "%Y-%m-%dT%H:%M:%S%z") |
||||
|
search_item = AiSearchResult( |
||||
|
title=item['name'], |
||||
|
url=item['url'], |
||||
|
host_name=item['siteName'], |
||||
|
body=item['summary'], |
||||
|
publish_time=publish_time, |
||||
|
is_referenced=item['is_ref'], |
||||
|
) |
||||
|
logger.debug(f"ai参考资料: [{search_item.host_name}]{search_item.title}({search_item.url})") |
||||
|
search_items.append(search_item) |
||||
|
|
||||
|
# 组合结果 |
||||
|
self.ai_answer = AiAnswer(self.platform_id, self.platform_name, self.prompt, self.keyword, answer, search_items, '', True) |
||||
|
logger.info(f"本次用量:\n总Token: {usage['total_tokens']}\n总资费: {round(float(usage['total_price']), 3)}") |
||||
|
return self.ai_answer |
||||
|
|
||||
Write
Preview
Loading…
Cancel
Save
Reference in new issue