
refactor(ai_seo): rework the DeepSeek spider

- Updated page element locators to follow changes in the site structure
- Reworked the logic for retrieving the deep-thinking and answer content
- Adjusted how web-search results are handled
- Updated parsing of the returned data structure
master · zzx, 1 month ago
parent commit 90870aaf5d
1 changed file, 50 changes

spiders/ai_seo/deepseek.py

@@ -37,15 +37,16 @@ class DeepseekSpider(AbstractAiSeoSpider):
         search_btn = self.browser_page.locator("span:text('联网搜索')").locator('..')
         if await search_btn.is_visible():
             await search_btn.click()
+        self.think = True
         if self.think:
             # Turn on deep thinking
-            think_btn = self.browser_page.locator("span:text('深度思考 (R1)')").locator('..')
+            think_btn = self.browser_page.locator("span:text('深度思考')").locator('..')
             if await think_btn.is_visible():
-                styles = css_to_dict(await think_btn.get_attribute('style'))
-                if styles.get('--ds-button-color') == '#fff':
+                # styles = css_to_dict(await think_btn.get_attribute('style'))
+                # if styles.get('--ds-button-color') == '#fff':
                 await think_btn.click()
                 await asyncio.sleep(1)
-        chat_input_element = self.browser_page.locator("//textarea[@id='chat-input']")
+        chat_input_element = self.browser_page.locator("//textarea[@placeholder='给 DeepSeek 发送消息 ']")
         await chat_input_element.click()
         # Type the prompt
         await self.browser_page.keyboard.type(self.prompt)
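
Note on the hunk above: css_to_dict is referenced (now only in the commented-out style check) but is not defined in this diff. A minimal sketch of what such a helper presumably does, assuming it turns an inline style attribute string into a property dict; the body below is illustrative, not the repository's implementation:

def css_to_dict(style):
    """Parse an inline style string like '--ds-button-color: #fff; color: rgb(0,0,0)' into a dict."""
    result = {}
    if not style:
        return result
    for declaration in style.split(';'):
        prop, sep, value = declaration.partition(':')
        if sep:
            result[prop.strip()] = value.strip()
    return result

# css_to_dict("--ds-button-color: #fff") -> {'--ds-button-color': '#fff'}
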
@@ -72,7 +73,8 @@ class DeepseekSpider(AbstractAiSeoSpider):
             await think_element.nth(-1).click()
             await asyncio.sleep(2)
         # Get the answer element
-        answer = self.browser_page.locator("//div[@class='ds-markdown ds-markdown--block']").nth(-1)
+        # answer = self.browser_page.locator("//div[@class='ds-markdown ds-markdown--block']").nth(-1)
+        answer = self.browser_page.locator("//div[contains(@class, 'ds-message')]").nth(-1)
         box = await answer.bounding_box()
         # Set the viewport size
         await self.browser_page.set_viewport_size({
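
Note on the hunk above: the bounding box of the last answer element feeds a viewport resize, presumably so the whole reply fits into a later screenshot. A standalone sketch of that Playwright pattern under that assumption; the URL, selector, and output path are illustrative only:

import asyncio
from playwright.async_api import async_playwright

async def capture_last_answer(url, selector):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto(url)
        answer = page.locator(selector).nth(-1)
        box = await answer.bounding_box()
        if box:
            # Grow the viewport so the whole element is laid out before capturing it
            await page.set_viewport_size({
                "width": max(int(box["width"]), 1280),
                "height": max(int(box["y"] + box["height"]), 720),
            })
        await answer.screenshot(path="answer.png")
        await browser.close()

# asyncio.run(capture_last_answer("https://chat.deepseek.com", "//div[contains(@class, 'ds-message')]"))
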
@@ -130,9 +132,11 @@ class DeepseekSpider(AbstractAiSeoSpider):
         body = stream.decode('utf-8')
         datas = body.split("\n\n")
         for data_str in datas:
+            # Skip chunks with no data
             if not data_str:
                 continue
             data_str = data_str.replace('data: ', '')
+            # Skip when the server is busy
             try:
                 data = json.loads(data_str)
                 if glom(data, 'v.0.v', default='') == 'TIMEOUT':
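
Note on the hunk above: each SSE chunk is stripped of its "data: " prefix, parsed as JSON, and glom walks the payload with the path 'v.0.v' (key "v", list index 0, key "v") to spot the server-busy marker. A tiny illustration of that access pattern; the chunk shape is inferred from the path, not taken from any DeepSeek documentation:

from glom import glom

chunk = {"v": [{"v": "TIMEOUT"}]}          # assumed shape of a "server busy" chunk

print(glom(chunk, "v.0.v", default=""))    # -> 'TIMEOUT'
print(glom({}, "v.0.v", default=""))       # -> '' (default when the path is missing)
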
@@ -145,27 +149,27 @@ class DeepseekSpider(AbstractAiSeoSpider):
                     logger.debug(f"获取到联网搜索结果")
                     search_result_list = data.get('v', [])
                     search_result_lists.extend(search_result_list)
-                    # # Save the search results
-                    # ai_search_result_list = []
-                    # for search_result in search_result_list:
-                    #     url = search_result.get('url', '')
-                    #     title = search_result.get('title', '')
-                    #     body = search_result.get('snippet', '')
-                    #     publish_time = search_result.get('published_at', '')
-                    #     host_name = search_result.get('site_name', '未知')
-                    #     ai_result = AiSearchResult(url=url, title=title, body=body, publish_time=publish_time, host_name=host_name)
-                    #     if ai_result.title and ai_result.url:
-                    #         ai_search_result_list.append(ai_result)
-                    #         logger.debug(f"ai参考资料: [{host_name}]{title}({url})")
-                    # if ai_search_result_list:
-                    #     self.ai_answer.search_result = ai_search_result_list
-                    #     self.search_result_count = len(self.ai_answer.search_result)
+                    # Save the search results
+                    ai_search_result_list = []
+                    for search_result in search_result_list:
+                        url = search_result.get('url', '')
+                        title = search_result.get('title', '')
+                        body = search_result.get('snippet', '')
+                        publish_time = search_result.get('published_at', '')
+                        host_name = search_result.get('site_name', '未知')
+                        ai_result = AiSearchResult(url=url, title=title, body=body, publish_time=publish_time, host_name=host_name)
+                        if ai_result.title and ai_result.url:
+                            ai_search_result_list.append(ai_result)
+                            logger.debug(f"ai参考资料: [{host_name}]{title}({url})")
+                    if ai_search_result_list:
+                        self.ai_answer.search_result = ai_search_result_list
+                        self.search_result_count = len(self.ai_answer.search_result)
                     continue
                 # Has the deep-thinking stream started?
-                if data.get('p', '') == 'response/thinking_content':
+                if data.get('p', '') == 'response/fragments/1/content':
                     start_thinking = True
-                if data.get('p', '') == 'response/thinking_elapsed_secs':
+                if data.get('p', '') == 'response/fragments/1/elapsed_secs':
                     start_thinking = False
                 if start_thinking:
                     # Collect the deep-thinking reply
@@ -177,7 +181,7 @@ class DeepseekSpider(AbstractAiSeoSpider):
                     value = glom(data, target, default="")
                     thinking_text = thinking_text + str(value)
                 # Has the answer stream started?
-                if data.get('p', '') == 'response/content':
+                if data.get('p', '') == 'response/fragments/2/content':
                     start_content = True
                 if start_content:
                     # Collect the AI reply
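
Note on the two hunks above: the stream paths move from 'response/thinking_content' / 'response/thinking_elapsed_secs' / 'response/content' to fragment-indexed paths, with fragment 1 carrying the deep-thinking text and fragment 2 the answer. A minimal sketch of that state machine over already-decoded chunks, assuming (as the diff suggests) that each chunk exposes a path under 'p' and a value under 'v':

def split_thinking_and_answer(chunks):
    """Accumulate deep-thinking text and answer text from streamed chunks."""
    thinking, answer = [], []
    in_thinking = in_answer = False
    for chunk in chunks:
        path = chunk.get('p', '')
        if path == 'response/fragments/1/content':
            in_thinking = True                      # deep-thinking fragment starts
        if path == 'response/fragments/1/elapsed_secs':
            in_thinking = False                     # deep-thinking fragment finished
        if path == 'response/fragments/2/content':
            in_thinking, in_answer = False, True    # answer fragment starts
        value = chunk.get('v', '')
        if isinstance(value, str):
            if in_answer:
                answer.append(value)
            elif in_thinking:
                thinking.append(value)
    return ''.join(thinking), ''.join(answer)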
