From 6dfda1086ed6f26bb55ccd7c49b76b7f2fdc9b01 Mon Sep 17 00:00:00 2001
From: zzx
Date: Sat, 20 Sep 2025 22:16:30 +0800
Subject: [PATCH] =?UTF-8?q?feat(spider):=20=E6=9B=B4=E6=96=B0ai=5Fseo?=
 =?UTF-8?q?=E7=88=AC=E8=99=AB=E9=80=89=E6=8B=A9=E5=99=A8=E4=BB=A5=E9=80=82?=
 =?UTF-8?q?=E9=85=8D=E6=96=B0=E9=A1=B5=E9=9D=A2=E7=BB=93=E6=9E=84?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 修改复制按钮的XPath选择器,使用更稳定的定位方式
- 调整来源数据获取逻辑,点击展示按钮后延时加载
- 更新来源列表项的选择器路径,并修复发布时间提取逻辑
- 修改回答容器元素的选择器,确保正确获取截图区域
- 添加遮罩层按钮点击操作,优化截图前的页面状态
---
 spiders/ai_seo/metaso.py | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/spiders/ai_seo/metaso.py b/spiders/ai_seo/metaso.py
index 3cfb552..b153bc6 100644
--- a/spiders/ai_seo/metaso.py
+++ b/spiders/ai_seo/metaso.py
@@ -55,7 +55,7 @@ class MetasoSpider(AbstractAiSeoSpider):
         await self.browser_page.reload()
         # await self.completed_event.wait()
         # 等待指定元素
-        copy_button = await self.browser_page.wait_for_selector('//*[starts-with(@id, "search-content-container-")]/div[2]/div[3]/button', timeout=600000)
+        copy_button = await self.browser_page.wait_for_selector("//div[@class='relative']/following-sibling::div[1]//button[1]", timeout=600000)
         # 点击复制按钮
         await copy_button.click()
         # 读取剪贴板
@@ -63,26 +63,31 @@ class MetasoSpider(AbstractAiSeoSpider):
         logger.debug(f'ai回复内容: {self.ai_answer}')
         # 获取来源数据
         try:
-            await self.browser_page.wait_for_selector("//div[contains(@class, 'meta-ordered-list_list-item')]/span", timeout=60000)
-            search_items = self.browser_page.locator("//div[contains(@class, 'meta-ordered-list_list-item')]/span")
+            await self.browser_page.wait_for_selector("//div[contains(@aria-label, '来源')]", timeout=60000)
+            show_search_item_btn = self.browser_page.locator("//div[contains(@aria-label, '来源')]")
+            await show_search_item_btn.click()
+            await asyncio.sleep(2)
+            # logger.debug(f'来源数据: {search_item_count}')
+            # 获取来源数据
+            search_items = self.browser_page.locator("//ul[contains(@class, 'meta-ordered-list_meta-list')]/li")
             search_item_count = await search_items.count()
-            logger.debug(f'来源数据: {search_item_count}')
-            await asyncio.sleep(5)
             search_results = []
             for i in range(search_item_count):
                 search_result = AiSearchResult()
                 search_item = search_items.nth(i)
                 # 抽取链接和标题
-                a = search_item.locator("xpath=./a")
+                a = search_item.locator("xpath=./div[1]/a")
                 # 抽取时间
-                publish_date_element = search_item.locator("xpath=./span")
+                publish_date_element = search_item.locator("xpath=./div[2]/div")
+                publish_str = await publish_date_element.text_content()
+                search_result.publish_time = publish_str.replace('[', '').replace(']', '')
                 if await a.is_visible():
                     search_result.title = await a.text_content()
                     search_result.url = await a.get_attribute('href')
-                if await publish_date_element.count() > 0:
-                    publish_date_element = search_item.locator("xpath=./span").nth(-1)
-                    publish_str = await publish_date_element.text_content()
-                    search_result.publish_time = publish_str.replace('[', '').replace(']', '')
+                # if await publish_date_element.count() > 0:
+                #     publish_date_element = search_item.locator("xpath=./span").nth(-1)
+                #     publish_str = await publish_date_element.text_content()
+                #     search_result.publish_time = publish_str.replace('[', '').replace(']', '')
                 search_results.append(search_result)
             self.ai_answer.search_result = search_results
         except TimeoutError:
@@ -91,7 +96,7 @@ class MetasoSpider(AbstractAiSeoSpider):
         if self.fail_status:
             raise self.fail_exception
         # 获取回答元素
-        answer_element = self.browser_page.locator("//div[contains(@class, 'Search_search-result-container')]")
+        answer_element = self.browser_page.locator("//div[contains(@class, 'result-responsive-layer')]")
         box = await answer_element.bounding_box()
         logger.debug(f'answer_element: {box}')
         view_port_height = box['height'] + 300
@@ -100,6 +105,7 @@ class MetasoSpider(AbstractAiSeoSpider):
             'width': 1920,
             'height': int(view_port_height)
         })
+        await self.browser_page.locator("//div[contains(@class, 'MetaDialog_meta-dialog-mask')]//button").click()
         # 截图
         screenshot_path = self._get_screenshot_path()
         await self.browser_page.screenshot(path=screenshot_path)