Browse Source
			
			
			feat(ai_seo): 支持 datetime 类型的 publish_time 字段并优化工作流爬虫集成
			
				
		feat(ai_seo): 支持 datetime 类型的 publish_time 字段并优化工作流爬虫集成
	
		
	
			
				- 在 `AiAnswer` 数据类中扩展 `publish_time` 类型,支持传入 `datetime` 实例,并自动格式化为字符串
				- 新增 `WorkFlowApiSpider` 爬虫类,用于通过 API 方式获取工作流平台的 AI 回复结果
				- 更新 `main.py` 和 `run.py`,注册并调用工作流爬虫逻辑
				- 添加定时任务调度逻辑以支持工作流平台的任务执行
				- 支持根据平台 ID 动态判断使用传统爬虫或工作流 API 爬虫
				- 增加对短结果、空结果的异常处理及任务状态回滚机制
				- 浏览器初始化逻辑优化,仅在需要时启动浏览器实例
				master
				 4 changed files with 215 additions and 5 deletions
			
			
		- 
					6domain/ai_seo.py
- 
					7main.py
- 
					102run.py
- 
					105spiders/ai_seo/workflow.py
| @ -0,0 +1,105 @@ | |||||
|  | # coding=utf-8 | ||||
|  | from datetime import datetime | ||||
|  | 
 | ||||
|  | import httpx | ||||
|  | 
 | ||||
|  | import config | ||||
|  | from domain.ai_seo import AiAnswer, AiSearchResult | ||||
|  | from utils import create_logger | ||||
|  | 
 | ||||
# Platform information, keyed by platform id.
# Each entry carries the platform id, its display name, and the API key that
# WorkFlowApiSpider sends as the Bearer token; an empty key means the platform
# is not configured and the spider refuses to start for it.
# NOTE(review): API keys are hard-coded in source here — consider moving them
# to config / environment; confirm with the owners before changing.
PLATFORMS = {
    platform_id: {'id': platform_id, 'name': name, 'api_key': api_key}
    for platform_id, name, api_key in (
        (1, 'Deepseek', ''),
        (2, '通义千问', 'app-mQE0lOxB0G49r4tQSv2LgIOV'),
        (3, '腾讯元宝', ''),
        (4, 'Kimi', ''),
        (5, '豆包', 'app-lD5HbD03EW7pamzIV2VEIyR6'),
        (6, '文心一言', ''),
    )
}
|  | 
 | ||||
|  | 
 | ||||
|  | 
 | ||||
# Module-level logger obtained from the project's create_logger helper,
# named after this module.
logger = create_logger(__name__)
|  | 
 | ||||
class WorkFlowApiSpider:
    """Fetch an AI answer for one platform through the workflow HTTP API.

    The spider posts the prompt to ``{config.DIFY_BASE_URL}/workflows/run`` in
    blocking mode, authenticating with the platform's API key from
    ``PLATFORMS``, and converts the workflow output into an ``AiAnswer`` with
    its list of ``AiSearchResult`` references.

    Attributes:
        platform_id: Platform id as passed to the constructor.
        platform_name: Display name looked up in ``PLATFORMS``.
        prompt: Prompt text sent to the workflow.
        keyword: Keyword forwarded into the resulting ``AiAnswer``.
        ai_answer: Result of the last successful ``run()``, else ``None``.
        api_key: Bearer token for the workflow API.
        fail_status, fail_exception, load_session, task_id, think: not read by
            this class; presumably set/consumed by the task runner for parity
            with the other spiders — TODO confirm.
    """

    # Timestamp format expected for ``datePublished`` in web search items.
    _PUBLISH_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S%z"

    platform_id: int
    platform_name: str
    prompt: str
    keyword: str
    ai_answer: AiAnswer | None = None
    fail_status: bool = False
    fail_exception: Exception | None = None
    load_session: bool = True
    task_id: int = 0
    # Fixed: a stray trailing comma used to make this default the truthy
    # tuple ``(False,)`` instead of the boolean ``False``.
    think: bool = False
    api_key: str = ''

    def __init__(self, prompt: str, keyword: str, platform_id: int):
        """Bind the spider to a platform and validate its configuration.

        Args:
            prompt: Prompt text to send to the workflow.
            keyword: Keyword associated with the prompt.
            platform_id: Key into ``PLATFORMS`` (coerced with ``int`` for the
                lookup only; the attribute keeps the value as passed).

        Raises:
            Exception: If the platform id is unknown or has no API key.
        """
        self.platform_id = platform_id
        self.prompt = prompt
        self.keyword = keyword

        platform = PLATFORMS.get(int(platform_id), {})
        if not platform:
            raise Exception('平台不存在')
        self.platform_name = platform.get('name', '')
        self.api_key = platform.get('api_key', '')
        if not self.api_key:
            raise Exception('平台未配置api_key')

    @classmethod
    def _parse_publish_time(cls, raw) -> datetime | None:
        """Parse a ``datePublished`` value; return ``None`` if empty or malformed."""
        if not raw:
            return None
        try:
            return datetime.strptime(raw, cls._PUBLISH_TIME_FORMAT)
        except (TypeError, ValueError):
            # One bad timestamp on a reference must not discard the whole answer.
            logger.warning(f"publish_time解析失败, 已忽略: {raw!r}")
            return None

    async def run(self) -> AiAnswer:
        """Run the workflow once and return the parsed answer.

        Returns:
            The ``AiAnswer`` built from the workflow output; it is also stored
            on ``self.ai_answer``.

        Raises:
            Exception: If the response is not JSON, carries no ``data``, or
                the workflow status is not ``'succeeded'``.
            KeyError: If a required field is missing from the workflow output.
        """
        logger.info(f"{self.platform_name}Api开始获取数据 提问词: {self.prompt}")
        # Build the request payload
        params = {
            "response_mode": "blocking",
            "user": config.DIFY_USER,
            "inputs": {
                "prompt": self.prompt
            }
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}"
        }
        # Send the request; the (non-streaming) body is fully read by post(),
        # so the client can be closed before parsing.
        async with httpx.AsyncClient() as client:
            response = await client.post(
                f"{config.DIFY_BASE_URL}/workflows/run",
                json=params,
                headers=headers,
                timeout=300,
            )

        try:
            json_result = response.json()
        except ValueError as err:
            # e.g. an HTML error page from a gateway instead of the API's JSON
            logger.error(
                f"{self.platform_name}Api获取数据失败: "
                f"HTTP {response.status_code} {response.text}"
            )
            raise Exception(f"{self.platform_name}Api获取数据失败") from err

        result = json_result.get('data') if isinstance(json_result, dict) else None
        if not isinstance(result, dict) or result.get('status') != 'succeeded':
            logger.error(f"{self.platform_name}Api获取数据失败: {json_result}")
            raise Exception(f"{self.platform_name}Api获取数据失败")

        # Data returned by the workflow
        workflow_result = result['outputs']['work_flow_result']
        # Usage data
        usage = workflow_result['usage']
        # AI reply, with tags
        answer = workflow_result['answer']
        # AI reply, without tags
        pure_answer = workflow_result['pure_answer']
        logger.debug(f"ai回复: {pure_answer}")

        # Web search results converted to AiSearchResult; tolerate a null list
        search_items = []
        for item in workflow_result.get('web_search') or []:
            search_item = AiSearchResult(
                title=item['name'],
                url=item['url'],
                host_name=item['siteName'],
                body=item['summary'],
                publish_time=self._parse_publish_time(item.get('datePublished')),
                is_referenced=item['is_ref'],
            )
            logger.debug(f"ai参考资料: [{search_item.host_name}]{search_item.title}({search_item.url})")
            search_items.append(search_item)

        # Assemble the result. NOTE(review): the trailing '' and True are
        # positional AiAnswer fields whose meaning is not visible here.
        self.ai_answer = AiAnswer(self.platform_id, self.platform_name, self.prompt, self.keyword, answer, search_items, '',  True)
        logger.info(f"本次用量:\n总Token: {usage['total_tokens']}\n总资费: {round(float(usage['total_price']), 3)}")
        return self.ai_answer
|  | 
 | ||||
						Write
						Preview
					
					
					Loading…
					
					Cancel
						Save
					
		Reference in new issue