You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
119 lines
4.6 KiB
119 lines
4.6 KiB
# coding=utf-8
|
|
|
|
import asyncio
|
|
import json
|
|
import os
|
|
from dataclasses import asdict
|
|
from datetime import datetime
|
|
|
|
from playwright.async_api import async_playwright
|
|
|
|
import config
|
|
from domain.ai_seo import AiAnswer
|
|
from spiders.ai_seo import *
|
|
from utils.logger_utils import create_logger
|
|
|
|
|
|
logger = create_logger("app")
|
|
|
|
|
|
async def init_browser() -> tuple:
    """
    Start the Playwright driver and launch a browser through the "chrome" channel.

    Every launch option except the channel is read from the ``config`` module.

    :return: a ``(playwright, browser)`` tuple: the started Playwright object
             and the launched browser.
    """
    pw = await async_playwright().start()
    launch_options = {
        "headless": config.BROWSER_HANDLESS,
        "chromium_sandbox": config.BROWSER_ENABLE_SANDBOX,
        "ignore_default_args": config.BROWSER_IGNORE_DEFAULT_ARGS,
        "channel": "chrome",
        "args": config.BROWSER_ARGS,
    }
    chrome = await pw.chromium.launch(**launch_options)
    # NOTE(review): this context is created but neither used nor returned here.
    # Presumably callers reach it through the browser object - confirm before removing.
    await chrome.new_context()
    return pw, chrome
|
|
|
|
|
|
def save_local(ai_answer: AiAnswer):
    """
    Write an answer as pretty-printed JSON under ``./data/<platform_name>/<YYYY-MM-DD>/``.

    The file is named after the prompt. Characters that cannot appear in a file
    name on the current platform are replaced with ``_`` so that a free-text
    prompt cannot break the path.

    :param ai_answer: dataclass instance to serialize; its ``platform_name`` and
                      ``prompt`` attributes are used to build the target path.
    """
    now = datetime.now().strftime("%Y-%m-%d")
    base_path = f'./data/{ai_answer.platform_name}/{now}'

    # exist_ok=True removes the check-then-create race of exists() + makedirs().
    os.makedirs(base_path, exist_ok=True)

    # '/' is a path separator everywhere; on Windows the whole reserved set is
    # rejected by the file system. Other platforms keep every other character,
    # so file names that already worked are unchanged.
    invalid_chars = '\\/:*?"<>|' if os.name == 'nt' else '/'
    safe_name = ''.join('_' if ch in invalid_chars else ch for ch in ai_answer.prompt)
    json_file_path = f'{base_path}/{safe_name}.json'

    # Serialize before opening so a serialization error never leaves an empty file behind.
    json_str = json.dumps(asdict(ai_answer), indent=4, ensure_ascii=False)
    with open(json_file_path, 'w', encoding='utf-8') as f:
        f.write(json_str)
    logger.info(f"[{ai_answer.platform_name}]{ai_answer.prompt} 保存成功: {base_path}")
|
|
|
|
|
|
async def main():
    """
    Run ``KimiSpider`` once with a hard-coded prompt and save the answer locally.

    Nothing is saved when ``run()`` returns a falsy value. The browser and the
    Playwright driver are always shut down, even when the spider raises.
    """
    playwright, browser = await init_browser()
    try:
        ai_answer = await KimiSpider(browser, '2025前十的电动自行车推荐', '美的').run()
        if ai_answer:
            save_local(ai_answer)
    finally:
        # Release the browser first, then the driver that owns it.
        await browser.close()
        await playwright.stop()
|
|
|
|
|
|
async def test():
    """
    Run ``DeepseekSpider`` over a hard-coded prompt list and save every successful answer.

    A run counts as failed when ``run()`` returns a falsy value or an answer whose
    ``run_status`` is falsy. Failed prompts are attempted once more after the
    first pass. Runs are spaced 300 seconds apart; no pause is made after the
    last run. The browser and the Playwright driver are always shut down.
    """
    # Lines starting with '#' inside the list are disabled prompts. They are kept
    # verbatim because each one becomes runtime data as soon as it is re-enabled.
    prompts = [
        # '行业咨询公司排行榜',
        # # '咨询公司排行榜',
        # '中国有哪些知名的行业咨询公司',
        # # '请推荐中国十大行业咨询公司',
        # '中国行业咨询机构中,哪些是最具影响力的',
        # '行业顾问排行榜上,哪些公司名列前茅',
        # # '在行业咨询领域,哪些公司被誉为行业领军者',
        # '我想了解中国行业咨询公司的排名,有吗',
        # '请给我一份中国行业顾问公司的权威榜单',
        # # '中国有哪些行业咨询公司备受业界认可',
        # '能否介绍一下中国行业咨询领域的佼佼者',
        # '中国行业咨询公司的排名情况如何',
        # # '中国领先的行业顾问公司有哪些',
        # '中国行业顾问排行榜上的公司都是哪些',
        # # '有没有一份关于中国行业咨询机构的综合排名',
        # '十大行业研究机构排名',
        # '行业研究机构推荐',
        # # '中国权威的行业研究机构有哪些',
        # '能否列出一些中国顶级的行业研究机构',
        '哪些行业研究机构在中国市场上最具竞争力',
        # '行业研究机构中,哪些在业界享有盛誉',
        '我想找一家权威的行业研究机构,有推荐吗',
        # '请列举一些在行业研究方面有着深厚底蕴的中国公司',
        '我想了解中国行业研究领域的权威机构,能介绍一下吗?',
        # '高端卫浴品牌',
        '数智卫浴品牌',
        # '科技卫浴品牌',
        '智能马桶品牌',
        '智能淋浴器推荐',
        # '小孩湿疹用什么药膏',
        # '皮肤湿疹用什么药膏',
        # '特应性皮炎用什么药膏最有效',
        # '湿疹药膏排行榜',
        # '皮炎性湿疹治疗药膏',
    ]
    retry_prompts = []
    pause_seconds = 300
    total = len(prompts)

    playwright, browser = await init_browser()
    try:
        for index, prompt in enumerate(prompts, start=1):
            logger.info(f"[{index}/{total}] {prompt}")
            ai_answer = await DeepseekSpider(browser, prompt, '头豹,沙利文').run()
            if ai_answer and ai_answer.run_status:
                save_local(ai_answer)
            else:
                # Also covers run() returning None, which previously raised
                # AttributeError on the run_status lookup.
                retry_prompts.append(prompt)
                logger.info(f"[{len(prompts)}] {prompt} 采集失败")
            # Pause only when another run follows (next prompt or a pending retry).
            if index < total or retry_prompts:
                await asyncio.sleep(pause_seconds)

        # The retry pass has its own counter so progress reads "i/n" of the retries
        # instead of continuing past the size of the first pass.
        retry_total = len(retry_prompts)
        for index, prompt in enumerate(retry_prompts, start=1):
            logger.info(f"重试[{index}/{retry_total}] {prompt}")
            ai_answer = await DeepseekSpider(browser, prompt, '头豹,沙利文').run()
            if ai_answer and ai_answer.run_status:
                save_local(ai_answer)
            else:
                logger.info(f"[{len(prompts)}] {prompt} 采集失败")
            if index < retry_total:
                await asyncio.sleep(pause_seconds)
    finally:
        # Release the browser first, then the driver that owns it.
        await browser.close()
        await playwright.stop()
|
|
|
|
if __name__ == '__main__':
    # asyncio.run() creates a fresh event loop, runs the coroutine to completion
    # and closes the loop; get_event_loop() without a running loop is deprecated.
    asyncio.run(test())
|