NOTE(review): this patch was rebuilt from a whitespace-collapsed copy. Line breaks and the
indentation of context lines are reconstructed assumptions - verify them against the target
files before applying. Hunk line counts were recomputed; the "index" blob hashes are stale.

diff --git a/abs_spider.py b/abs_spider.py
index 80e8e58..45c5026 100644
--- a/abs_spider.py
+++ b/abs_spider.py
@@ -54,7 +54,8 @@ class AbstractAiSeoSpider(ABC):
             4: "kimi",
             2: "tongyi",
             6: "yiyan",
-            3: "yuanbao"
+            3: "yuanbao",
+            8: "wenxiaoyan"
         }
 
         # todo 支持多session管理
@@ -69,7 +70,11 @@ class AbstractAiSeoSpider(ABC):
     async def __init_page(self):
         if self.load_session:
             self.session_info = await get_spider_session(self.platform_id)
-            self.browser_content = await self.browser.new_context(storage_state=self.session_info['session_path'])
+            # Platform 8 (wenxiaoyan) uses the session cookie directly, so no storage-state file is loaded
+            if self.platform_id != 8:
+                self.browser_content = await self.browser.new_context(storage_state=self.session_info['session_path'])
+            else:
+                self.browser_content = await self.browser.new_context()
         else:
             self.browser_content = await self.browser.new_context()
         self.browser_page = await self.browser_content.new_page()
diff --git a/config.py b/config.py
index 8b16ce3..33c6b9f 100644
--- a/config.py
+++ b/config.py
@@ -30,7 +30,7 @@ AI_SEO_JOB_RANGE = {
 # aiseo任务是否启用
 AI_SEO_JOB_ENABLE = True
 # aiseo任务运行间隔
-AI_SEO_JOB_INTERVAL = 5
+AI_SEO_JOB_INTERVAL = 20
 # aiseo任务获取平台
 AI_SEO_JOB_PLATFORM_IDS = [ '2', '3', '4', '5', '7', '13']
 # aiseo任务最大并发量
@@ -42,7 +42,7 @@ DEEPSEEK_SEO_JOB_RANGE = {
     'end_time': '23:59'
 }
 # deepseek任务是否启用
-DEEPSEEK_JOB_ENABLE = True
+DEEPSEEK_JOB_ENABLE = False
 # deepseek任务获取间隔
 DEEPSEEK_JOB_INTERVAL = 30
 # deepseek任务获取平台
diff --git a/domain/ai_seo.py b/domain/ai_seo.py
index 77cec2f..3c3db4b 100644
--- a/domain/ai_seo.py
+++ b/domain/ai_seo.py
@@ -26,7 +26,6 @@ class AiSearchResult:
     is_referenced: str = '0'
     #情感倾向" 1- 中立 2- 正面 3- 负面
     sentiment_type = 0
-    #情感类型
     type = 0
 
     def __post_init__(self):
diff --git a/spiders/ai_seo/nanometer.py b/spiders/ai_seo/nanometer.py
index add339a..b31d3ae 100644
--- a/spiders/ai_seo/nanometer.py
+++ b/spiders/ai_seo/nanometer.py
@@ -31,6 +31,8 @@ class NanometerSpider(AbstractAiSeoSpider):
         self._init_data()
         # 开始操作
         await self.browser_page.goto(self.get_home_url(), timeout=600000)
+        # Enable deep-thinking mode
+        await self.browser_page.locator('//*[@id="nworld-app-container"]/div/div[1]/div[1]/div/div/div/div/div[2]/div[1]/div[1]/div[2]/div[1]/section/div').click()
         chat_input_element = self.browser_page.locator("//textarea[@id='composition-input']")
         # 输入提问词
         await chat_input_element.fill(self.prompt)
diff --git a/spiders/ai_seo/wenxiaoyan.py b/spiders/ai_seo/wenxiaoyan.py
new file mode 100644
index 0000000..59fa928
--- /dev/null
+++ b/spiders/ai_seo/wenxiaoyan.py
@@ -0,0 +1,271 @@
+# coding=utf-8
+import asyncio
+import json
+import re
+import time
+from functools import partial, wraps
+from json import JSONDecodeError
+
+import requests
+from glom import glom
+from playwright.async_api import Browser
+
+from abs_spider import AbstractAiSeoSpider
+from domain.ai_seo import AiAnswer, AiSearchResult
+from utils import create_logger, css_to_dict
+from utils.image_utils import crop_image_left, remove_bottom_part
+
+logger = create_logger(__name__)
+
+# Timeout in seconds applied to every HTTP call made to the yiyanapp API.
+REQUEST_TIMEOUT = 30
+
+
+class WenxiaoyanSpider(AbstractAiSeoSpider):
+    """Spider for Wenxiaoyan (platform id 8), driven through the yiyanapp HTTP API.
+
+    The answer is requested from the streaming chat endpoint using the cookie
+    stored in ``self.session_info``; the browser is only used to open the
+    generated share page and take a screenshot of it.
+    """
+
+    def __init__(self, browser: Browser, prompt: str, keyword: str):
+        super().__init__(browser, prompt, keyword)
+
+    def get_home_url(self) -> str:
+        return ''
+
+    def get_platform_id(self) -> int:
+        return 8
+
+    def get_platform_name(self) -> str:
+        return 'wenxiaoyan'
+
+    @staticmethod
+    def _parse_event(raw_line):
+        """Return the nested ``data.data`` dict of one stream line, or None."""
+        if not raw_line:
+            return None
+        text = raw_line.decode("utf-8", errors="replace")
+        if not text.startswith("data:"):
+            return None
+        try:
+            event = json.loads(text[5:])
+        except JSONDecodeError:
+            logger.debug(f"skipping stream line that is not JSON: {text[:200]}")
+            return None
+        logger.debug(event)
+        outer = event.get("data") if isinstance(event, dict) else None
+        inner = outer.get("data") if isinstance(outer, dict) else None
+        return inner if isinstance(inner, dict) else None
+
+    def get_answer(self, question):
+        """Send ``question`` to the streaming chat endpoint and collect the reply.
+
+        Returns a dict with the keys ``messageId``, ``parentId``, ``reference``
+        and ``answer``, or an empty dict when the HTTP request fails.
+        """
+        # NOTE: ``jt`` is left empty here; replace it with the value from your own capture.
+        url = "https://yiyanapp.baidu.com/chat/completions?appmode=nor&appname=newapp&cfrom=1027594s&currentLLMModel=auto&ds_lv=0&ds_stc=0.0000&from=1027594s&llmMode=auto&matrixstyle=0&network=1_0&sid=148615_1-999987_2-999992_1-159581_2-999991_1-164345_3-160429_1-999989_1-165121_1-999990_1-162727_1-999999_71-999988_1-163324_4-999994_2-999993_1-160037_5&st=2&ua=1170_2532_iphone_4.20.0.10_0&uid=808BA951B6493F89F2D28A62C7438A9332566F6AAOHBGTHSHBE&ut=iPhone13%2C2_17.6.1&zid=U8HLkcXuwbe2-gJK1hw1Goz5QuMxZdMQJSqN2Fs15rnzb0Vss7pxP-zc-wBRYuJI8qmsMDWWRwkcd-EivBG0P8Q&jt="
+        headers = {
+            "Host": "yiyanapp.baidu.com",
+            "Accept": "text/event-stream",
+            "Sec-Fetch-Site": "same-origin",
+            # The captured Acs-Token / X-Bd-Alive headers are not sent; never commit captured tokens.
+            "Accept-Language": "zh-CN,zh-Hans;q=0.9",
+            "Sec-Fetch-Mode": "cors",
+            "Accept-Encoding": "gzip, deflate, br",
+            "Origin": "https://yiyanapp.baidu.com",
+            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 light/1.0 newapp/4.20.0.10 (Baidu; P2 17.6.1)",
+            "Referer": "https://yiyanapp.baidu.com/talk/chat",
+            "Connection": "keep-alive",
+            "Content-Type": "application/json",
+            "Cookie": self.session_info.get("cookie"),
+        }
+        # NOTE(review): parentId, context and sessionId are fixed values taken from one
+        # captured conversation, so every question is posted into that conversation —
+        # confirm this is intended.
+        payload = {
+            "parentId": "7955151",
+            "inputSource": "user_input",
+            "querySource": "user_input",
+            "context": [
+                "7955151", "7955150", "7955113", "7955112",
+                "7955075", "7955074", "7955051", "7955050",
+                "7955045", "7955044", "7955015", "7955014",
+                "7954965", "7954964", "7954865", "7954864",
+                "7954863", "7954862", "7954831", "7954830",
+            ],
+            "contentType": "text",
+            "from": "app",
+            "plugins": {},
+            "models": {
+                "dispatch": "",
+                "force": "",
+                "select": "auto",
+                "replyMode": "normal",
+            },
+            "sessionId": "1159258",
+            "chooses": None,
+            "promptId": "",
+            "botId": "144038",
+            "characterId": "1034",
+            "environment": {
+                "isNewAB": 1,
+                "depthAnalysis": 0,
+                "hostTag": "main_chat",
+                "composition": None,
+                "sceneFrom": "",
+                "individual": "0",
+                "live": {
+                    "visOnline": {
+                        "per": "4189",
+                        "pdt": "10170",
+                        "audio_ctrl": "{\"mid\":\"\",\"sampling_rate\":24000}",
+                        "spd": "5",
+                    },
+                },
+                "isReplyModeChange": 0,
+                "memory": {
+                    "noMemoryExtraction": 0,
+                },
+                "depthMsgId": "",
+                "clarify": {
+                    "enable": 1,
+                    "stage": 1,
+                },
+                "interveneId": "",
+            },
+            "content": question,
+        }
+        answer_parts = []
+        reference = None
+        message_id = ''
+        parent_id = ''
+        try:
+            with requests.post(url, headers=headers, json=payload, stream=True,
+                               timeout=REQUEST_TIMEOUT) as resp:
+                resp.raise_for_status()
+                for raw_line in resp.iter_lines():
+                    data = self._parse_event(raw_line)
+                    if data is None:
+                        continue
+                    if data.get("reference"):
+                        # NOTE(review): the ids are read from the event that carries the
+                        # reference, as the original statement grouping suggests — confirm.
+                        reference = data.get("reference")
+                        message_id = data.get("messageId")
+                        parent_id = data.get("parentId")
+                    content = data.get("content")
+                    if isinstance(content, str):
+                        answer_parts.append(content)
+        except requests.RequestException as e:
+            logger.error(f"❌ 请求失败:{e}")
+            return {}
+        answer = "".join(answer_parts)
+        logger.info(f"✅ 回答:{answer}...\n")
+        return {
+            'messageId': message_id,
+            'parentId': parent_id,
+            'reference': reference,
+            'answer': answer,
+        }
+
+    def get_url(self, idList):
+        """Create a share page for the given messages and return its URL.
+
+        ``idList`` is the comma-separated "parentId,messageId" string. Returns
+        None when the response contains no share URL.
+        """
+        url = "https://yiyanapp.baidu.com/api/share/create?appmode=nor&appname=newapp&cfrom=1027594s&currentLLMModel=auto&ds_lv=0&ds_stc=0.0000&from=1027594s&llmMode=auto&matrixstyle=0&network=1_0&sid=148615_1-999987_2-999992_1-159581_2-999991_1-164345_3-160429_1-999989_1-165121_1-999990_1-162727_1-999999_71-999988_1-163324_4-999994_2-999993_1-160037_5&st=2&ua=1170_2532_iphone_4.20.0.10_0&uid=808BA951B6493F89F2D28A62C7438A9332566F6AAOHBGTHSHBE&ut=iPhone13%252C2_17.6.1&zid=U8HLkcXuwbe2-gJK1hw1Goz5QuMxZdMQJSqN2Fs15rnzb0Vss7pxP-zc-wBRYuJI8qmsMDWWRwkcd-EivBG0P8Q"
+        headers = {
+            "Host": "yiyanapp.baidu.com",
+            "Accept": "application/json, text/plain, */*",
+            "Sec-Fetch-Site": "same-origin",
+            # The captured Acs-Token header is not sent; never commit captured tokens.
+            "Accept-Language": "zh-CN,zh-Hans;q=0.9",
+            "Accept-Encoding": "gzip, deflate, br",
+            "Sec-Fetch-Mode": "cors",
+            "Content-Type": "application/json",
+            "Origin": "https://yiyanapp.baidu.com",
+            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 17_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 light/1.0 newapp/4.20.0.10 (Baidu; P2 17.6.1)",
+            "Referer": "https://yiyanapp.baidu.com/talk/chat",
+            "Connection": "keep-alive",
+            "Sec-Fetch-Dest": "empty",
+            "Cookie": self.session_info.get("cookie"),
+        }
+        payload = {
+            "idList": idList,
+            "watermark": "qYbvOVmx",
+            "vp": 0,
+        }
+        resp = requests.post(url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT)
+        resp.raise_for_status()
+        body = resp.json()
+        logger.debug(body)
+        share_url = (body.get("data") or {}).get("url")
+        logger.info(f"返回内容: {share_url}")
+        return share_url
+
+    async def _do_spider(self) -> AiAnswer:
+        """Fetch the answer, screenshot its share page and collect the cited sources.
+
+        Raises RuntimeError when the answer lacks the reference or message ids
+        needed to request the share page, or when no share URL is returned.
+        """
+        self._init_data()
+        self.search_result_count = 0
+        logger.debug(f"self.prompt {self.prompt}")
+        loop = asyncio.get_running_loop()
+        # requests is blocking: run it in the default executor so the event loop
+        # is not stalled while the answer streams in.
+        result = await loop.run_in_executor(None, self.get_answer, self.prompt)
+        reference = result.get("reference")
+        parent_id = result.get("parentId")
+        message_id = result.get("messageId")
+        if not (reference and parent_id and message_id):
+            # No share page can be requested without these ids, so stop with a clear error.
+            # NOTE(review): relax this if answers without references should be kept.
+            raise RuntimeError("wenxiaoyan answer has no reference/message ids; cannot create the share page")
+        share_url = await loop.run_in_executor(None, self.get_url, f"{parent_id},{message_id}")
+        if not share_url:
+            raise RuntimeError("wenxiaoyan share API returned no url")
+        await self.browser_page.goto(share_url, timeout=600000)
+        await asyncio.sleep(3)
+        # NOTE(review): the purpose of this click is not evident from the code — confirm.
+        chat_input_element = self.browser_page.locator('//*[@id="35"]/div[1]/div/div/div[2]')
+        await chat_input_element.click()
+        # Locate the answer element
+        answer = self.browser_page.locator('//*[@id="app"]/div/div[1]').nth(-1)
+        box = await answer.bounding_box()
+        if box:
+            # Resize the viewport to the answer height plus 500px before the screenshot
+            await self.browser_page.set_viewport_size({
+                'width': 1920,
+                'height': int(box['height']) + 500
+            })
+        # Take the screenshot
+        screenshot_path = self._get_screenshot_path()
+        await self.browser_page.screenshot(path=screenshot_path)
+        # Collect the numbers that follow "citation:" in the answer text
+        answer_text = result.get("answer") or ""
+        cited = set(re.findall(r'citation:(\d+)', answer_text))
+        ai_search_result_list = []
+        for index, search_result in enumerate(reference.get("searchInfo") or [], start=1):
+            url = search_result.get('url', '')
+            title = search_result.get('title', '')
+            body = search_result.get('snippet', '')
+            publish_time = search_result.get('published_at', '')
+            host_name = search_result.get('site_name', '未知')
+            is_referenced = "1" if str(index) in cited else "0"
+            ai_result = AiSearchResult(url=url, title=title, body=body, publish_time=publish_time, host_name=host_name, is_referenced=is_referenced)
+            if ai_result.title and ai_result.url:
+                ai_search_result_list.append(ai_result)
+            logger.debug(f"ai参考资料: [{host_name}]{title}({url})")
+        self.ai_answer.search_result = ai_search_result_list
+        self.search_result_count = len(self.ai_answer.search_result)
+        # Trim 82px off the bottom of the screenshot
+        remove_bottom_part(screenshot_path, 82)
+        self.ai_answer.answer = answer_text
+        self.ai_answer.screenshot_file = screenshot_path
+        return self.ai_answer
diff --git a/utils/image_utils.py b/utils/image_utils.py
index 17271ab..ba904cc 100644
--- a/utils/image_utils.py
+++ b/utils/image_utils.py
@@ -41,3 +41,33 @@ def crop_image_left(image_path, crop_width):
         print(f"处理图片时出错: {e}")
         if 'temp_path' in locals() and os.path.exists(temp_path):
             os.remove(temp_path)
+
+from PIL import Image
+
+
+def remove_bottom_part(image_path, crop_height):
+    """Crop ``crop_height`` pixels off the bottom of the image and overwrite the file.
+
+    The image is left untouched when ``crop_height`` is not positive or is not
+    smaller than the image height.
+    """
+    temp_path = image_path + ".tmp"
+    with Image.open(image_path) as img:
+        width, height = img.size
+        if crop_height <= 0 or crop_height >= height:
+            logger.warning(f"skip cropping {image_path}: crop_height={crop_height}, image height={height}")
+            return
+        # Keep the source format so the file content still matches its extension
+        image_format = img.format or "PNG"
+        # Keep everything from the top down to height - crop_height
+        cropped_img = img.crop((0, 0, width, height - crop_height))
+        # Make sure the pixel data is loaded before the source file is closed
+        cropped_img.load()
+    try:
+        # Write to a temporary file first, then swap it in over the original
+        cropped_img.save(temp_path, format=image_format, quality=95)
+        os.replace(temp_path, image_path)
+    finally:
+        if os.path.exists(temp_path):
+            os.remove(temp_path)
+    logger.info(f"成功从底部切割 {crop_height}px 并覆盖原图")
diff --git a/utils/session_utils.py b/utils/session_utils.py
index b95ce17..fbc71dc 100644
--- a/utils/session_utils.py
+++ b/utils/session_utils.py
@@ -16,6 +16,9 @@ async def get_spider_session(platform_id):
     session_info = await AiSeoApis.get_spider_session(platform_id)
     if not session_info:
         raise Exception(f"平台id: {platform_id} 没有可用的爬虫session")
+    # Platform 8 (wenxiaoyan) uses the session record itself, so no session file is looked up
+    if platform_id == 8:
+        return session_info
     # 根据id去爬虫文件夹中找
     target = search_session_file(session_info['id'], base_path)
     # 如果没有找到 下载这个文件并保存