Browse Source

feat(spider): 更新ai_seo爬虫选择器以适配新页面结构

- 修改复制按钮的XPath选择器,使用更稳定的定位方式
- 调整来源数据获取逻辑,点击展示按钮后延时加载
- 更新来源列表项的选择器路径,并修复发布时间提取逻辑
- 修改回答容器元素的选择器,确保正确获取截图区域
- 添加遮罩层按钮点击操作,优化截图前的页面状态
master
zzx 1 month ago
parent
commit
6dfda1086e
  1. 30
      spiders/ai_seo/metaso.py

30
spiders/ai_seo/metaso.py

@@ -55,7 +55,7 @@ class MetasoSpider(AbstractAiSeoSpider):
await self.browser_page.reload() await self.browser_page.reload()
# await self.completed_event.wait() # await self.completed_event.wait()
# 等待指定元素 # 等待指定元素
copy_button = await self.browser_page.wait_for_selector('//*[starts-with(@id, "search-content-container-")]/div[2]/div[3]/button', timeout=600000)
copy_button = await self.browser_page.wait_for_selector("//div[@class='relative']/following-sibling::div[1]//button[1]", timeout=600000)
# 点击复制按钮 # 点击复制按钮
await copy_button.click() await copy_button.click()
# 读取剪贴板 # 读取剪贴板
@@ -63,26 +63,31 @@ class MetasoSpider(AbstractAiSeoSpider):
logger.debug(f'ai回复内容: {self.ai_answer}') logger.debug(f'ai回复内容: {self.ai_answer}')
# 获取来源数据 # 获取来源数据
try: try:
await self.browser_page.wait_for_selector("//div[contains(@class, 'meta-ordered-list_list-item')]/span", timeout=60000)
search_items = self.browser_page.locator("//div[contains(@class, 'meta-ordered-list_list-item')]/span")
await self.browser_page.wait_for_selector("//div[contains(@aria-label, '来源')]", timeout=60000)
show_search_item_btn = self.browser_page.locator("//div[contains(@aria-label, '来源')]")
await show_search_item_btn.click()
await asyncio.sleep(2)
# logger.debug(f'来源数据: {search_item_count}')
# 获取来源数据
search_items = self.browser_page.locator("//ul[contains(@class, 'meta-ordered-list_meta-list')]/li")
search_item_count = await search_items.count() search_item_count = await search_items.count()
logger.debug(f'来源数据: {search_item_count}')
await asyncio.sleep(5)
search_results = [] search_results = []
for i in range(search_item_count): for i in range(search_item_count):
search_result = AiSearchResult() search_result = AiSearchResult()
search_item = search_items.nth(i) search_item = search_items.nth(i)
# 抽取链接和标题 # 抽取链接和标题
a = search_item.locator("xpath=./a")
a = search_item.locator("xpath=./div[1]/a")
# 抽取时间 # 抽取时间
publish_date_element = search_item.locator("xpath=./span")
publish_date_element = search_item.locator("xpath=./div[2]/div")
publish_str = await publish_date_element.text_content()
search_result.publish_time = publish_str.replace('[', '').replace(']', '')
if await a.is_visible(): if await a.is_visible():
search_result.title = await a.text_content() search_result.title = await a.text_content()
search_result.url = await a.get_attribute('href') search_result.url = await a.get_attribute('href')
if await publish_date_element.count() > 0:
publish_date_element = search_item.locator("xpath=./span").nth(-1)
publish_str = await publish_date_element.text_content()
search_result.publish_time = publish_str.replace('[', '').replace(']', '')
# if await publish_date_element.count() > 0:
# publish_date_element = search_item.locator("xpath=./span").nth(-1)
# publish_str = await publish_date_element.text_content()
# search_result.publish_time = publish_str.replace('[', '').replace(']', '')
search_results.append(search_result) search_results.append(search_result)
self.ai_answer.search_result = search_results self.ai_answer.search_result = search_results
except TimeoutError: except TimeoutError:
@@ -91,7 +96,7 @@ class MetasoSpider(AbstractAiSeoSpider):
if self.fail_status: if self.fail_status:
raise self.fail_exception raise self.fail_exception
# 获取回答元素 # 获取回答元素
answer_element = self.browser_page.locator("//div[contains(@class, 'Search_search-result-container')]")
answer_element = self.browser_page.locator("//div[contains(@class, 'result-responsive-layer')]")
box = await answer_element.bounding_box() box = await answer_element.bounding_box()
logger.debug(f'answer_element: {box}') logger.debug(f'answer_element: {box}')
view_port_height = box['height'] + 300 view_port_height = box['height'] + 300
@@ -100,6 +105,7 @@ class MetasoSpider(AbstractAiSeoSpider):
'width': 1920, 'width': 1920,
'height': int(view_port_height) 'height': int(view_port_height)
}) })
await self.browser_page.locator("//div[contains(@class, 'MetaDialog_meta-dialog-mask')]//button").click()
# 截图 # 截图
screenshot_path = self._get_screenshot_path() screenshot_path = self._get_screenshot_path()
await self.browser_page.screenshot(path=screenshot_path) await self.browser_page.screenshot(path=screenshot_path)

Loading…
Cancel
Save