2 Commits

Author SHA1 Message Date
zzx f91e065d27 feat(ai): 更新品牌词提取任务要求 1 month ago
zzx 6dfda1086e feat(spider): 更新ai_seo爬虫选择器以适配新页面结构 1 month ago
  1. main.py (2 changed lines)
  2. spiders/ai_seo/metaso.py (30 changed lines)
  3. utils/ai.py (5 changed lines)

2
main.py

@@ -102,4 +102,4 @@ async def check_session(platform_ids=None):
 if __name__ == '__main__':
-    asyncio.get_event_loop().run_until_complete(check_session_by_platform_id(6))
+    asyncio.get_event_loop().run_until_complete(test())

30
spiders/ai_seo/metaso.py

@@ -55,7 +55,7 @@ class MetasoSpider(AbstractAiSeoSpider):
 await self.browser_page.reload()
 # await self.completed_event.wait()
 # 等待指定元素
-copy_button = await self.browser_page.wait_for_selector('//*[starts-with(@id, "search-content-container-")]/div[2]/div[3]/button', timeout=600000)
+copy_button = await self.browser_page.wait_for_selector("//div[@class='relative']/following-sibling::div[1]//button[1]", timeout=600000)
 # 点击复制按钮
 await copy_button.click()
 # 读取剪贴板
@@ -63,26 +63,31 @@ class MetasoSpider(AbstractAiSeoSpider):
logger.debug(f'ai回复内容: {self.ai_answer}')
# 获取来源数据
try:
await self.browser_page.wait_for_selector("//div[contains(@class, 'meta-ordered-list_list-item')]/span", timeout=60000)
search_items = self.browser_page.locator("//div[contains(@class, 'meta-ordered-list_list-item')]/span")
await self.browser_page.wait_for_selector("//div[contains(@aria-label, '来源')]", timeout=60000)
show_search_item_btn = self.browser_page.locator("//div[contains(@aria-label, '来源')]")
await show_search_item_btn.click()
await asyncio.sleep(2)
# logger.debug(f'来源数据: {search_item_count}')
# 获取来源数据
search_items = self.browser_page.locator("//ul[contains(@class, 'meta-ordered-list_meta-list')]/li")
search_item_count = await search_items.count()
logger.debug(f'来源数据: {search_item_count}')
await asyncio.sleep(5)
search_results = []
for i in range(search_item_count):
search_result = AiSearchResult()
search_item = search_items.nth(i)
# 抽取链接和标题
a = search_item.locator("xpath=./a")
a = search_item.locator("xpath=./div[1]/a")
# 抽取时间
publish_date_element = search_item.locator("xpath=./span")
publish_date_element = search_item.locator("xpath=./div[2]/div")
publish_str = await publish_date_element.text_content()
search_result.publish_time = publish_str.replace('[', '').replace(']', '')
if await a.is_visible():
search_result.title = await a.text_content()
search_result.url = await a.get_attribute('href')
if await publish_date_element.count() > 0:
publish_date_element = search_item.locator("xpath=./span").nth(-1)
publish_str = await publish_date_element.text_content()
search_result.publish_time = publish_str.replace('[', '').replace(']', '')
# if await publish_date_element.count() > 0:
# publish_date_element = search_item.locator("xpath=./span").nth(-1)
# publish_str = await publish_date_element.text_content()
# search_result.publish_time = publish_str.replace('[', '').replace(']', '')
search_results.append(search_result)
self.ai_answer.search_result = search_results
except TimeoutError:
@@ -91,7 +96,7 @@ class MetasoSpider(AbstractAiSeoSpider):
 if self.fail_status:
 raise self.fail_exception
 # 获取回答元素
-answer_element = self.browser_page.locator("//div[contains(@class, 'Search_search-result-container')]")
+answer_element = self.browser_page.locator("//div[contains(@class, 'result-responsive-layer')]")
 box = await answer_element.bounding_box()
 logger.debug(f'answer_element: {box}')
 view_port_height = box['height'] + 300
@@ -100,6 +105,7 @@ class MetasoSpider(AbstractAiSeoSpider):
 'width': 1920,
 'height': int(view_port_height)
 })
+await self.browser_page.locator("//div[contains(@class, 'MetaDialog_meta-dialog-mask')]//button").click()
 # 截图
 screenshot_path = self._get_screenshot_path()
 await self.browser_page.screenshot(path=screenshot_path)

5
utils/ai.py

@@ -20,8 +20,9 @@ async def main():
logger.info(f"[{result['id']}] 已读取过排名")
continue
prompt = f"""
: , , , json数组
: json中包含brands字段, ,
:
json格式返回json中包含brands字段,
:
{result['content']}

Loading…
Cancel
Save