feat(ai): 更新品牌词提取任务要求

- 不再对提取的品牌词进行去重处理
- 提取文本中提到的所有品牌，包括文本总结中的品牌
- 调整提示词以适应新的提取逻辑
- 修改测试入口函数名
feat(spider): 更新ai_seo爬虫选择器以适配新页面结构
3 changed files with 22 additions and 15 deletions
--- a/main.py
+++ b/main.py
@@ -102,4 +102,4 @@ async def check_session(platform_ids=None):


 if __name__ == '__main__':
-    asyncio.get_event_loop().run_until_complete(check_session_by_platform_id(6))
+    asyncio.get_event_loop().run_until_complete(test())
--- a/spiders/ai_seo/metaso.py
+++ b/spiders/ai_seo/metaso.py
@@ -55,7 +55,7 @@ class MetasoSpider(AbstractAiSeoSpider):
        await self.browser_page.reload()
        # await self.completed_event.wait()
        # 等待指定元素
-        copy_button = await self.browser_page.wait_for_selector('//*[starts-with(@id, "search-content-container-")]/div[2]/div[3]/button', timeout=600000)
+        copy_button = await self.browser_page.wait_for_selector("//div[@class='relative']/following-sibling::div[1]//button[1]", timeout=600000)
        # 点击复制按钮
        await copy_button.click()
        # 读取剪贴板
@@ -63,26 +63,31 @@ class MetasoSpider(AbstractAiSeoSpider):
        logger.debug(f'ai回复内容: {self.ai_answer}')
        # 获取来源数据
        try:
-            await self.browser_page.wait_for_selector("//div[contains(@class, 'meta-ordered-list_list-item')]/span", timeout=60000)
-            search_items = self.browser_page.locator("//div[contains(@class, 'meta-ordered-list_list-item')]/span")
+            await self.browser_page.wait_for_selector("//div[contains(@aria-label, '来源')]", timeout=60000)
+            show_search_item_btn = self.browser_page.locator("//div[contains(@aria-label, '来源')]")
+            await show_search_item_btn.click()
+            await asyncio.sleep(2)
+            # logger.debug(f'来源数据: {search_item_count}')
+            # 获取来源数据
+            search_items =  self.browser_page.locator("//ul[contains(@class, 'meta-ordered-list_meta-list')]/li")
            search_item_count = await search_items.count()
-            logger.debug(f'来源数据: {search_item_count}')
-            await asyncio.sleep(5)
            search_results = []
            for i in range(search_item_count):
                search_result = AiSearchResult()
                search_item = search_items.nth(i)
                # 抽取链接和标题
-                a = search_item.locator("xpath=./a")
+                a = search_item.locator("xpath=./div[1]/a")
                # 抽取时间
-                publish_date_element = search_item.locator("xpath=./span")
+                publish_date_element = search_item.locator("xpath=./div[2]/div")
+                publish_str = await publish_date_element.text_content()
+                search_result.publish_time = publish_str.replace('[', '').replace(']', '')
                if await a.is_visible():
                    search_result.title = await a.text_content()
                    search_result.url = await a.get_attribute('href')
-                if await publish_date_element.count() > 0:
-                    publish_date_element = search_item.locator("xpath=./span").nth(-1)
-                    publish_str = await publish_date_element.text_content()
-                    search_result.publish_time = publish_str.replace('[', '').replace(']', '')
+                # if await publish_date_element.count() > 0:
+                #     publish_date_element = search_item.locator("xpath=./span").nth(-1)
+                #     publish_str = await publish_date_element.text_content()
+                #     search_result.publish_time = publish_str.replace('[', '').replace(']', '')
                search_results.append(search_result)
            self.ai_answer.search_result = search_results
        except TimeoutError:
@@ -91,7 +96,7 @@ class MetasoSpider(AbstractAiSeoSpider):
        if self.fail_status:
            raise self.fail_exception
        # 获取回答元素
-        answer_element = self.browser_page.locator("//div[contains(@class, 'Search_search-result-container')]")
+        answer_element = self.browser_page.locator("//div[contains(@class, 'result-responsive-layer')]")
        box = await answer_element.bounding_box()
        logger.debug(f'answer_element: {box}')
        view_port_height = box['height'] + 300
@@ -100,6 +105,7 @@ class MetasoSpider(AbstractAiSeoSpider):
            'width': 1920,
            'height': int(view_port_height)
        })
+        await self.browser_page.locator("//div[contains(@class, 'MetaDialog_meta-dialog-mask')]//button").click()
        # 截图
        screenshot_path = self._get_screenshot_path()
        await self.browser_page.screenshot(path=screenshot_path)
--- a/utils/ai.py
+++ b/utils/ai.py
@@ -20,8 +20,9 @@ async def main():
            logger.info(f"[{result['id']}] 已读取过排名")
            continue
        prompt = f"""
-        任务: 请在以下文本中, 按出现的顺序提取出品牌词, 多次出现的品牌词仅提取一次, 返回json数组
-        返回格式: json中包含brands字段, 字段的值为数组, 数组内容是按顺序提取的品牌词
+            任务: 根据用户输入的文本，按照提取出品牌名称以及提取的依据，并且不要去重；
+            只要文本中提到了的品牌都需要提取出来，文本总结里面的品牌也需要。
+            以json格式返回，json中包含brands字段, 字段的值为数组

        文本正文:
        {result['content']}