
refactor(ai_seo): rework the DeepSeek spider

- Updated page element locators to follow changes in the site structure
- Reworked the logic for retrieving the deep-thinking and answer content
- Adjusted how web-search results are handled
- Updated parsing of the returned data structure
master · zzx, 1 month ago
parent commit 90870aaf5d
1 changed file, 50 changes

spiders/ai_seo/deepseek.py

@@ -37,15 +37,16 @@ class DeepseekSpider(AbstractAiSeoSpider):
         search_btn = self.browser_page.locator("span:text('联网搜索')").locator('..')
         if await search_btn.is_visible():
             await search_btn.click()
+        self.think = True
         if self.think:
             # Turn on deep thinking
-            think_btn = self.browser_page.locator("span:text('深度思考 (R1)')").locator('..')
+            think_btn = self.browser_page.locator("span:text('深度思考')").locator('..')
             if await think_btn.is_visible():
-                styles = css_to_dict(await think_btn.get_attribute('style'))
-                if styles.get('--ds-button-color') == '#fff':
+                # styles = css_to_dict(await think_btn.get_attribute('style'))
+                # if styles.get('--ds-button-color') == '#fff':
                 await think_btn.click()
                 await asyncio.sleep(1)
-        chat_input_element = self.browser_page.locator("//textarea[@id='chat-input']")
+        chat_input_element = self.browser_page.locator("//textarea[@placeholder='给 DeepSeek 发送消息 ']")
         await chat_input_element.click()
         # Type the prompt
         await self.browser_page.keyboard.type(self.prompt)
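
Note on the hunk above: css_to_dict is referenced (now only in the commented-out style check) but is not defined in this diff. A minimal sketch of what such a helper presumably does, assuming it turns an inline style attribute string into a property dict; the body below is illustrative, not the repository's implementation:

def css_to_dict(style):
    """Parse an inline style string like '--ds-button-color: #fff; color: rgb(0,0,0)' into a dict."""
    result = {}
    if not style:
        return result
    for declaration in style.split(';'):
        prop, sep, value = declaration.partition(':')
        if sep:
            result[prop.strip()] = value.strip()
    return result

# css_to_dict("--ds-button-color: #fff") -> {'--ds-button-color': '#fff'}
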
@@ -72,7 +73,8 @@ class DeepseekSpider(AbstractAiSeoSpider):
             await think_element.nth(-1).click()
             await asyncio.sleep(2)
         # Get the answer element
-        answer = self.browser_page.locator("//div[@class='ds-markdown ds-markdown--block']").nth(-1)
+        # answer = self.browser_page.locator("//div[@class='ds-markdown ds-markdown--block']").nth(-1)
+        answer = self.browser_page.locator("//div[contains(@class, 'ds-message')]").nth(-1)
         box = await answer.bounding_box()
         # Set the viewport size
         await self.browser_page.set_viewport_size({
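
Note on the hunk above: the bounding box of the last answer element feeds a viewport resize, presumably so the whole reply fits into a later screenshot. A standalone sketch of that Playwright pattern under that assumption; the URL, selector, and output path are illustrative only:

import asyncio
from playwright.async_api import async_playwright

async def capture_last_answer(url, selector):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto(url)
        answer = page.locator(selector).nth(-1)
        box = await answer.bounding_box()
        if box:
            # Grow the viewport so the whole element is laid out before capturing it
            await page.set_viewport_size({
                "width": max(int(box["width"]), 1280),
                "height": max(int(box["y"] + box["height"]), 720),
            })
        await answer.screenshot(path="answer.png")
        await browser.close()

# asyncio.run(capture_last_answer("https://chat.deepseek.com", "//div[contains(@class, 'ds-message')]"))
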
@@ -130,9 +132,11 @@ class DeepseekSpider(AbstractAiSeoSpider):
         body = stream.decode('utf-8')
         datas = body.split("\n\n")
         for data_str in datas:
+            # Skip chunks with no data
             if not data_str:
                 continue
             data_str = data_str.replace('data: ', '')
+            # Skip when the server is busy
             try:
                 data = json.loads(data_str)
                 if glom(data, 'v.0.v', default='') == 'TIMEOUT':
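
Note on the hunk above: each SSE chunk is stripped of its "data: " prefix, parsed as JSON, and glom walks the payload with the path 'v.0.v' (key "v", list index 0, key "v") to spot the server-busy marker. A tiny illustration of that access pattern; the chunk shape is inferred from the path, not taken from any DeepSeek documentation:

from glom import glom

chunk = {"v": [{"v": "TIMEOUT"}]}          # assumed shape of a "server busy" chunk

print(glom(chunk, "v.0.v", default=""))    # -> 'TIMEOUT'
print(glom({}, "v.0.v", default=""))       # -> '' (default when the path is missing)
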
@@ -145,27 +149,27 @@ class DeepseekSpider(AbstractAiSeoSpider):
                     logger.debug(f"获取到联网搜索结果")
                     search_result_list = data.get('v', [])
                     search_result_lists.extend(search_result_list)
-                    # # Save the search results
-                    # ai_search_result_list = []
-                    # for search_result in search_result_list:
-                    #     url = search_result.get('url', '')
-                    #     title = search_result.get('title', '')
-                    #     body = search_result.get('snippet', '')
-                    #     publish_time = search_result.get('published_at', '')
-                    #     host_name = search_result.get('site_name', '未知')
-                    #     ai_result = AiSearchResult(url=url, title=title, body=body, publish_time=publish_time, host_name=host_name)
-                    #     if ai_result.title and ai_result.url:
-                    #         ai_search_result_list.append(ai_result)
-                    #         logger.debug(f"ai参考资料: [{host_name}]{title}({url})")
-                    # if ai_search_result_list:
-                    #     self.ai_answer.search_result = ai_search_result_list
-                    #     self.search_result_count = len(self.ai_answer.search_result)
+                    # Save the search results
+                    ai_search_result_list = []
+                    for search_result in search_result_list:
+                        url = search_result.get('url', '')
+                        title = search_result.get('title', '')
+                        body = search_result.get('snippet', '')
+                        publish_time = search_result.get('published_at', '')
+                        host_name = search_result.get('site_name', '未知')
+                        ai_result = AiSearchResult(url=url, title=title, body=body, publish_time=publish_time, host_name=host_name)
+                        if ai_result.title and ai_result.url:
+                            ai_search_result_list.append(ai_result)
+                            logger.debug(f"ai参考资料: [{host_name}]{title}({url})")
+                    if ai_search_result_list:
+                        self.ai_answer.search_result = ai_search_result_list
+                        self.search_result_count = len(self.ai_answer.search_result)
                     continue
                 # Has the deep-thinking stream started?
-                if data.get('p', '') == 'response/thinking_content':
+                if data.get('p', '') == 'response/fragments/1/content':
                     start_thinking = True
-                if data.get('p', '') == 'response/thinking_elapsed_secs':
+                if data.get('p', '') == 'response/fragments/1/elapsed_secs':
                     start_thinking = False
                 if start_thinking:
                     # Collect the deep-thinking reply
@@ -177,7 +181,7 @@ class DeepseekSpider(AbstractAiSeoSpider):
                     value = glom(data, target, default="")
                     thinking_text = thinking_text + str(value)
                 # Has the answer stream started?
-                if data.get('p', '') == 'response/content':
+                if data.get('p', '') == 'response/fragments/2/content':
                     start_content = True
                 if start_content:
                     # Collect the AI reply
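
Note on the two hunks above: the stream paths move from 'response/thinking_content' / 'response/thinking_elapsed_secs' / 'response/content' to fragment-indexed paths, with fragment 1 carrying the deep-thinking text and fragment 2 the answer. A minimal sketch of that state machine over already-decoded chunks, assuming (as the diff suggests) that each chunk exposes a path under 'p' and a value under 'v':

def split_thinking_and_answer(chunks):
    """Accumulate deep-thinking text and answer text from streamed chunks."""
    thinking, answer = [], []
    in_thinking = in_answer = False
    for chunk in chunks:
        path = chunk.get('p', '')
        if path == 'response/fragments/1/content':
            in_thinking = True                      # deep-thinking fragment starts
        if path == 'response/fragments/1/elapsed_secs':
            in_thinking = False                     # deep-thinking fragment finished
        if path == 'response/fragments/2/content':
            in_thinking, in_answer = False, True    # answer fragment starts
        value = chunk.get('v', '')
        if isinstance(value, str):
            if in_answer:
                answer.append(value)
            elif in_thinking:
                thinking.append(value)
    return ''.join(thinking), ''.join(answer)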
