Browse Source

refactor(ai_seo): 重构通义灵码爬虫

- 点击提示框确认按钮
- 优化搜索结果处理逻辑
- 调整 AI 回答处理方式
- 移除不必要的注释和代码
master
zzx 1 month ago
parent
commit
184c7ad851
  1. 42
      spiders/ai_seo/tongyi.py

42
spiders/ai_seo/tongyi.py

@ -30,6 +30,10 @@ class TongyiSpider(AbstractAiSeoSpider):
# 初始化信息 # 初始化信息
self._init_data() self._init_data()
await self.browser_page.goto(self.get_home_url(), timeout=600000) await self.browser_page.goto(self.get_home_url(), timeout=600000)
# 点掉提示框
confirm_btn = self.browser_page.locator('//button[.//span[text()="我知道了"]]')
if await confirm_btn.is_visible():
await confirm_btn.click()
if self.think: if self.think:
search_btn = self.browser_page.locator("div:text('深度思考')") search_btn = self.browser_page.locator("div:text('深度思考')")
if await search_btn.is_visible(): if await search_btn.is_visible():
@ -94,7 +98,6 @@ class TongyiSpider(AbstractAiSeoSpider):
stream = await response.body() stream = await response.body()
response_text = stream.decode('utf-8') response_text = stream.decode('utf-8')
datas = response_text.split("\n") datas = response_text.split("\n")
# print("datas:",datas)
# 合规数据转成字典 # 合规数据转成字典
for data_str in datas: for data_str in datas:
if not data_str or data_str == 'data: [DONE]': if not data_str or data_str == 'data: [DONE]':
@ -109,46 +112,29 @@ class TongyiSpider(AbstractAiSeoSpider):
contents = data.get('contents', []) contents = data.get('contents', [])
# 保存搜索内容 # 保存搜索内容
ai_search_result_list = [] ai_search_result_list = []
search_result_list = list()
for content in contents: for content in contents:
content_type = content.get('contentType', '') content_type = content.get('contentType', '')
if content_type == 'plugin':
if content_type == 'referenceLink':
logger.debug(f"获取到联网搜索结果") logger.debug(f"获取到联网搜索结果")
if self.think: if self.think:
search_result_list = glom(content, 'content.pluginResult.links', default=[]) search_result_list = glom(content, 'content.pluginResult.links', default=[])
else: else:
search_result_list = glom(content, 'content.pluginResult.links.-1.search_results', default=[])
# for search_result in search_result_list:
# url = search_result.get('url', '')
# title = search_result.get('title', '')
# body = search_result.get('body', '')
# host_name = search_result.get('host_name', '未知')
# publish_time = search_result.get('time', 0)
# logger.debug(f"ai参考资料: [{host_name}]{title}({url})")
# ai_search_result_list.append(
# AiSearchResult(title=title, url=url, body=body, host_name=host_name, publish_time=publish_time)
# )
if content_type == 'think':
logger.debug(f'获取到ai回复结果')
answer = content.get('content', '').get('content', '')
logger.debug(f"ai回复: {answer}")
self.ai_answer.answer = answer
pattern = r'ty-reference]\((\d+)\)'
index_data = list(set(re.findall(pattern, self.ai_answer.answer)))
for index, search_result in enumerate(search_result_list):
search_result_list = glom(content, 'content.links', default=[])
for search_result in search_result_list:
url = search_result.get('url', '') url = search_result.get('url', '')
title = search_result.get('title', '') title = search_result.get('title', '')
body = search_result.get('body', '') body = search_result.get('body', '')
host_name = search_result.get('host_name', '未知')
host_name =title.rsplit('-', 1)[1] if '-' in title else '未知'
publish_time = search_result.get('time', 0) publish_time = search_result.get('time', 0)
if str(index+1) in index_data:
is_referenced = "1"
else:
is_referenced = "0"
logger.debug(f"ai参考资料: [{host_name}]{title}({url})") logger.debug(f"ai参考资料: [{host_name}]{title}({url})")
ai_search_result_list.append( ai_search_result_list.append(
AiSearchResult(title=title, url=url, body=body, host_name=host_name, publish_time=publish_time , is_referenced=is_referenced)
AiSearchResult(title=title, url=url, body=body, host_name=host_name, publish_time=publish_time, is_referenced='1')
) )
if content_type == 'text':
logger.debug(f'获取到ai回复结果')
answer = content.get('content', '')
logger.debug(f"ai回复: {answer}")
self.ai_answer.answer = answer
if ai_search_result_list: if ai_search_result_list:
self.ai_answer.search_result = ai_search_result_list self.ai_answer.search_result = ai_search_result_list
self.completed_event.set() self.completed_event.set()

Loading…
Cancel
Save