diff --git a/spiders/ai_seo/doubao.py b/spiders/ai_seo/doubao.py index 3921fe1..6d299a6 100644 --- a/spiders/ai_seo/doubao.py +++ b/spiders/ai_seo/doubao.py @@ -139,19 +139,35 @@ class DouBaoSpider(AbstractAiSeoSpider): answer = '' datas = [] index_data = list() + logger.debug(f"await data: {await response.text()}") response_text = ftfy.fix_text(await response.text()) + logger.debug(f"response_text: {response_text}") lines = response_text.split("\n\n") for line in lines: if line.startswith('data: '): line = line[6:] + logger.debug(f"line_1: {line}") + import re + pattern = r'"[\u4e00-\u9fa5]{1,10}"' + result = re.findall(pattern, line) + for i in result: + line = line.replace(i,i[1:-1]) try: data = parse_nested_json(line) + logger.debug(f"data: {data}") datas.append(data) event_data = data.get('event_data', {}) target_key = 'message.content.text' text = glom(event_data, target_key, default=None) if not text is None: answer = answer + str(text) + index_key = 'message.content.meta_infos' + index = glom(event_data, index_key, default=None) + if index: + if str(index[0].get("info").get("insert_text")).isdigit(): + # logger.debug(f"index: {index}") + logger.debug(f"index: {index[0].get("info").get("insert_text")}") + index_data.append(index[0].get("info").get("insert_text")) except JSONDecodeError: continue logger.debug(f"ai回复: {answer}") diff --git a/spiders/ai_seo/yuanbao.py b/spiders/ai_seo/yuanbao.py index cef0ec9..9fd146a 100644 --- a/spiders/ai_seo/yuanbao.py +++ b/spiders/ai_seo/yuanbao.py @@ -39,7 +39,7 @@ class YuanBaoSpider(AbstractAiSeoSpider): await think_button.click() await asyncio.sleep(2) # 开启联网搜索 - search_button = self.browser_page.locator("//button[@dt-button-id='online_search']") + search_button = self.browser_page.locator("//div[@dt-button-id='online_search']") if await search_button.is_visible(): class_str = await search_button.get_attribute('class') clazz = class_str.split(' ') @@ -115,7 +115,8 @@ class YuanBaoSpider(AbstractAiSeoSpider): ai_search_result_list = [] self.ai_answer.answer = text if search_list: - pattern = r'\[\^(\d+)\]' + # pattern = r'\[\^(\d+)\]' + pattern = r'citation:(\d+)' index_data = list(set(re.findall(pattern, self.ai_answer.answer))) for index,search_result in enumerate(search_list): if str(index+1) in index_data: