Browse Source

Merge remote-tracking branch 'origin/master'

master
zhurunlin 2 months ago
parent
commit
e8350ef7e7
  1. 4
      .gitignore
  2. 28
      abs_spider.py
  3. 17
      main.py
  4. 12
      spiders/ai_seo/deepseek.py
  5. 3
      spiders/ai_seo/doubao.py
  6. 3
      spiders/ai_seo/kimi.py
  7. 25
      spiders/ai_seo/metaso.py
  8. 3
      spiders/ai_seo/nanometer.py
  9. 3
      spiders/ai_seo/tongyi.py
  10. 3
      spiders/ai_seo/wenxiaoyan.py
  11. 17
      spiders/ai_seo/yiyan.py
  12. 3
      spiders/ai_seo/yuanbao.py
  13. 26
      utils/ai_seo_api_utils.py
  14. 4
      utils/session_utils.py

4
.gitignore

@ -7,4 +7,6 @@ config.py
screenshot screenshot
Pipfile Pipfile
Pipfile.lock Pipfile.lock
logs
logs
__pycache__/
**/__pycache__/

28
abs_spider.py

@ -9,6 +9,7 @@ from playwright.async_api import Browser, BrowserContext, Page
import config import config
from domain.ai_seo import AiAnswer from domain.ai_seo import AiAnswer
from utils import create_logger from utils import create_logger
from utils.ai_seo_api_utils import AiSeoApis
from utils.session_utils import get_spider_session from utils.session_utils import get_spider_session
logger = create_logger("abs_spider") logger = create_logger("abs_spider")
@ -67,10 +68,11 @@ class AbstractAiSeoSpider(ABC):
screenshot_path = f'{config.SCREENSHOT_BASE_PATH}/{self.platform_name}_{unique_id}.png' screenshot_path = f'{config.SCREENSHOT_BASE_PATH}/{self.platform_name}_{unique_id}.png'
return screenshot_path return screenshot_path
async def __init_page(self):
async def __init_page(self, id=''):
if self.load_session: if self.load_session:
self.session_info = await get_spider_session(self.platform_id)
self.session_info = await get_spider_session(self.platform_id, id)
if self.platform_id != 8: if self.platform_id != 8:
print(self.session_info['session_path'])
self.browser_content = await self.browser.new_context(storage_state=self.session_info['session_path']) self.browser_content = await self.browser.new_context(storage_state=self.session_info['session_path'])
else: else:
self.browser_content = await self.browser.new_context() self.browser_content = await self.browser.new_context()
@ -118,6 +120,25 @@ class AbstractAiSeoSpider(ABC):
finally: finally:
await self._close() await self._close()
async def check_session(self, session_id) -> bool:
    """Validate that a stored login session is still usable on this platform.

    Initialises a browser page with the session identified by ``session_id``,
    runs the platform-specific ``do_check_session`` probe, logs the outcome,
    and reports the new status back to the API.

    :param session_id: id of the spider session record to check
    :return: True if the session is still valid, False otherwise
    """
    await self.__init_page(session_id)
    try:
        result = await self.do_check_session()
    finally:
        # Always release the browser context, even if the probe raises.
        await self._close()
    if result:
        logger.success(f"[{self.get_platform_name()}]session状态有效! ✅ id: {session_id}")
    else:
        logger.error(f"[{self.get_platform_name()}]session状态无效! ❌ id: {session_id}")
    # 更新session状态: 1 = valid; invalid is 2 for most platforms,
    # but 3 for platform id 13.
    if self.get_platform_id() != 13:
        status = 1 if result else 2
    else:
        status = 1 if result else 3
    await AiSeoApis.update_spider_session(session_id, status)
    return result
@abstractmethod @abstractmethod
async def _do_spider(self) -> AiAnswer: async def _do_spider(self) -> AiAnswer:
""" """
@ -137,3 +158,6 @@ class AbstractAiSeoSpider(ABC):
@abstractmethod @abstractmethod
def get_home_url(self) -> str: def get_home_url(self) -> str:
pass pass
@abstractmethod
async def do_check_session(self) -> bool:
    """Platform-specific session probe.

    Called by ``check_session`` after the page has been initialised with
    the session under test; must return True when the session is still
    logged in / usable, False otherwise.
    """
    pass

17
main.py

import config
from abs_spider import AbstractAiSeoSpider
from domain.ai_seo import AiAnswer
from spiders.ai_seo import *
from spiders.ai_seo.wenxiaoyan import WenxiaoyanSpider
from utils.logger_utils import create_logger
# Fixed import path: the helper class lives in utils/ai_seo_api_utils.py
# (the path abs_spider.py also imports from); `utils.ai` does not exist.
from utils.ai_seo_api_utils import AiSeoApis

logger = create_logger("app")
@ -24,7 +26,8 @@ SPIDER_CLS = {
5: DouBaoSpider, 5: DouBaoSpider,
6: YiYanSpider, 6: YiYanSpider,
7: NanometerSpider, 7: NanometerSpider,
13: MetasoSpider
13: MetasoSpider,
8: WenxiaoyanSpider
} }
@ -47,7 +50,7 @@ def get_spider(platform_id, prompt, brand, browser) -> AbstractAiSeoSpider:
cls = SPIDER_CLS.get(int(platform_id), None) cls = SPIDER_CLS.get(int(platform_id), None)
if not cls: if not cls:
raise ValueError(f"未找到对应的爬虫类,platform_id={platform_id}") raise ValueError(f"未找到对应的爬虫类,platform_id={platform_id}")
return cls(browser, prompt, brand, True)
return cls(browser, prompt, brand)
def save_local(ai_answer: AiAnswer): def save_local(ai_answer: AiAnswer):
@ -80,6 +83,14 @@ async def test():
index = index + 1 index = index + 1
await asyncio.sleep(config.TEST_INTERVAL * 6) await asyncio.sleep(config.TEST_INTERVAL * 6)
async def test_check_session():
    """Check every stored session for platform 6 and log whether it is valid."""
    sessions = await AiSeoApis.list_spider_session(6)
    playwright, browser = await init_browser()
    try:
        for session in sessions:
            spider = get_spider(session['platform_id'], '你好', '品牌词', browser)
            result = await spider.check_session(session['id'])
            logger.info(f"session {session['id']} valid: {result}")
    finally:
        # Release browser resources even if one of the checks raises.
        await browser.close()
        await playwright.stop()
if __name__ == '__main__':
    # asyncio.run replaces the deprecated get_event_loop().run_until_complete
    # pattern and guarantees the loop is closed on exit.
    asyncio.run(test_check_session())

12
spiders/ai_seo/deepseek.py

@ -87,6 +87,18 @@ class DeepseekSpider(AbstractAiSeoSpider):
self.ai_answer.screenshot_file = screenshot_path self.ai_answer.screenshot_file = screenshot_path
return self.ai_answer return self.ai_answer
async def do_check_session(self) -> bool:
    """Session probe for Deepseek: open the home page, focus the chat
    textarea and type the prompt. Any failure is treated as an invalid
    session (best-effort bool, no exception escapes)."""
    page = self.browser_page
    try:
        await page.goto(self.get_home_url(), timeout=600000)
        await asyncio.sleep(3)
        # Focus the chat input, then type the question via the keyboard.
        await page.locator("//textarea[@id='chat-input']").click()
        await page.keyboard.type(self.prompt)
    except Exception:
        return False
    return True
def handle_listen_response_error(self, func): def handle_listen_response_error(self, func):
""" """

3
spiders/ai_seo/doubao.py

@ -16,6 +16,9 @@ logger = create_logger(__name__)
class DouBaoSpider(AbstractAiSeoSpider): class DouBaoSpider(AbstractAiSeoSpider):
async def do_check_session(self) -> bool:
    # TODO(review): not implemented yet — `pass` returns None (falsy), so a
    # session check on this platform would always be reported as invalid.
    pass
def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False): def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False):
super().__init__(browser, prompt, keyword, think) super().__init__(browser, prompt, keyword, think)
self.__listen_response = self.handle_listen_response_error(self.__listen_response) self.__listen_response = self.handle_listen_response_error(self.__listen_response)

3
spiders/ai_seo/kimi.py

@ -16,6 +16,9 @@ logger = create_logger(__name__)
class KimiSpider(AbstractAiSeoSpider): class KimiSpider(AbstractAiSeoSpider):
async def do_check_session(self) -> bool:
    # TODO(review): not implemented yet — `pass` returns None (falsy), so a
    # session check on this platform would always be reported as invalid.
    pass
def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False): def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False):
super().__init__(browser, prompt, keyword, think) super().__init__(browser, prompt, keyword, think)
self.__listen_response = self.handle_listen_response_error(self.__listen_response) self.__listen_response = self.handle_listen_response_error(self.__listen_response)

25
spiders/ai_seo/metaso.py

@ -112,10 +112,6 @@ class MetasoSpider(AbstractAiSeoSpider):
def get_platform_name(self) -> str: def get_platform_name(self) -> str:
return 'Metaso' return 'Metaso'
async def __listen_response(self, response): async def __listen_response(self, response):
url = response.url url = response.url
if response.status == 200: if response.status == 200:
@ -194,6 +190,7 @@ class MetasoSpider(AbstractAiSeoSpider):
else: else:
pass pass
def handle_listen_response_error(self, func): def handle_listen_response_error(self, func):
""" """
@ -214,6 +211,26 @@ class MetasoSpider(AbstractAiSeoSpider):
return wrapper return wrapper
async def do_check_session(self) -> bool:
    """Session probe for Metaso: open the home page, read the remaining
    daily search quota from the footer menu, and fill the search box with
    the prompt. Returns False when the quota is exhausted or any step
    fails."""
    page = self.browser_page
    try:
        await page.goto(self.get_home_url(), timeout=200000)
        await asyncio.sleep(2)
        # Open the footer menu that reveals the quota display.
        menu_button = await page.wait_for_selector(
            '#left-menu > div > div.LeftMenu_footer__qsJdJ > div > div > div > button', timeout=600000)
        await menu_button.click()
        quota = await page.locator(
            '//div[@aria-label="每天有100搜索额度"]/following-sibling::div[1]//span[contains(@class, "MuiTypography-root")]'
        ).text_content()
        if quota == '0':
            # Daily quota used up — the session is unusable today.
            return False
        # Type the prompt into the search textarea to confirm the UI works.
        await page.locator("//textarea[contains(@class, 'search-consult-textarea')]").fill(self.prompt)
        logger.info(f"[{self.get_platform_name()}]查询还剩{quota} 次")
        return True
    except Exception:
        return False
async def run(): async def run():

3
spiders/ai_seo/nanometer.py

@ -18,6 +18,9 @@ logger = create_logger(__name__)
class NanometerSpider(AbstractAiSeoSpider): class NanometerSpider(AbstractAiSeoSpider):
async def do_check_session(self) -> bool:
    # TODO(review): not implemented yet — `pass` returns None (falsy), so a
    # session check on this platform would always be reported as invalid.
    pass
def __init__(self, browser: Browser, prompt: str, keyword: str): def __init__(self, browser: Browser, prompt: str, keyword: str):
super().__init__(browser, prompt, keyword) super().__init__(browser, prompt, keyword)
self.load_session = False self.load_session = False

3
spiders/ai_seo/tongyi.py

@ -16,6 +16,9 @@ logger = create_logger(__name__)
class TongyiSpider(AbstractAiSeoSpider): class TongyiSpider(AbstractAiSeoSpider):
async def do_check_session(self) -> bool:
    # TODO(review): not implemented yet — `pass` returns None (falsy), so a
    # session check on this platform would always be reported as invalid.
    pass
def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False): def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False):
super().__init__(browser, prompt, keyword, think) super().__init__(browser, prompt, keyword, think)
self.__listen_response = self.handle_listen_response_error(self.__listen_response) self.__listen_response = self.handle_listen_response_error(self.__listen_response)

3
spiders/ai_seo/wenxiaoyan.py

@ -18,6 +18,9 @@ logger = create_logger(__name__)
class WenxiaoyanSpider(AbstractAiSeoSpider): class WenxiaoyanSpider(AbstractAiSeoSpider):
async def do_check_session(self) -> bool:
    # TODO(review): not implemented yet — `pass` returns None (falsy), so a
    # session check on this platform would always be reported as invalid.
    pass
def __init__(self, browser: Browser, prompt: str, keyword: str): def __init__(self, browser: Browser, prompt: str, keyword: str):
super().__init__(browser, prompt, keyword) super().__init__(browser, prompt, keyword)
def get_home_url(self) -> str: def get_home_url(self) -> str:

17
spiders/ai_seo/yiyan.py

@ -168,7 +168,6 @@ class YiYanSpider(AbstractAiSeoSpider):
)) ))
self.ai_answer.search_result = ai_search_result_list self.ai_answer.search_result = ai_search_result_list
async def check_login(self): async def check_login(self):
# 找登录后才会出现的侧边栏 # 找登录后才会出现的侧边栏
try: try:
@ -182,6 +181,22 @@ class YiYanSpider(AbstractAiSeoSpider):
await AiSeoApis.update_spider_session(self.session_info['id'], 2) await AiSeoApis.update_spider_session(self.session_info['id'], 2)
raise Exception(f"{self.get_platform_name()}登录失败 session_id: {self.session_info['id']}") raise Exception(f"{self.get_platform_name()}登录失败 session_id: {self.session_info['id']}")
async def do_check_session(self) -> bool:
    """Session probe for YiYan: load the home page, verify the logged-in
    state via ``check_login``, then focus the editor and insert the
    prompt. Any exception means the session is invalid."""
    page = self.browser_page
    try:
        await page.goto(self.get_home_url(), timeout=200000)
        await asyncio.sleep(2)
        # check_login raises when the logged-in-only sidebar is missing.
        await self.check_login()
        # Focus the rich-text editor and insert the question.
        await page.locator("//div[@class='yc-editor']").click()
        await asyncio.sleep(2)
        await page.keyboard.insert_text(self.prompt)
    except Exception:
        return False
    return True
def get_platform_id(self) -> int:
    # Platform id 6 = YiYan (see the SPIDER_CLS mapping in main.py).
    return 6

3
spiders/ai_seo/yuanbao.py

@ -17,6 +17,9 @@ logger = create_logger(__name__)
class YuanBaoSpider(AbstractAiSeoSpider): class YuanBaoSpider(AbstractAiSeoSpider):
async def do_check_session(self) -> bool:
    # TODO(review): not implemented yet — `pass` returns None (falsy), so a
    # session check on this platform would always be reported as invalid.
    pass
def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False): def __init__(self, browser: Browser, prompt: str, keyword: str, think: bool = False):
super().__init__(browser, prompt, keyword, think) super().__init__(browser, prompt, keyword, think)
self.__listen_response = self.handle_listen_response_error(self.__listen_response) self.__listen_response = self.handle_listen_response_error(self.__listen_response)

26
utils/ai_seo_api_utils.py

@ -170,15 +170,39 @@ class AiSeoApis:
return json_result['data'] return json_result['data']
@staticmethod
async def get_spider_session(platform_id, id=''):
    """Fetch one usable spider session from the API.

    :param platform_id: platform to fetch a session for
    :param id: optional specific session id; when set, that exact session
        is requested instead of letting the server pick one.
        NOTE: the name shadows the builtin ``id``; kept for caller
        compatibility.
    :return: the session dict on success, None on an API error
    """
    uri = '/api/third/getOneSpiderSession'
    url = AiSeoApis.build_full_url(uri)
    json_data = {**config.AI_SEO_API_AUTH, 'platform_id': platform_id}
    if id:
        json_data['id'] = id
    async with httpx.AsyncClient() as client:
        response = await client.get(url, params=json_data, timeout=60)
        json_result = response.json()
        if json_result['code'] != 0:
            # Log-and-return-None matches the other API helpers' error style.
            logger.error("获取爬虫session失败")
            return None
        return json_result['data']
@staticmethod
async def list_spider_session(platform_id='', status=''):
    """List spider sessions, optionally filtered by platform and status.

    :param platform_id: platform filter; '' means all platforms
    :param status: status filter; '' means any status
    :return: presumably the API ``data`` payload (a list of session dicts)
    """
    # Dropped the copy-pasted `if not id: json_data['id'] = id` guard: this
    # function has no `id` parameter, so the name resolved to the *builtin*
    # `id` — the branch was dead, and had it ever run it would have sent a
    # function object as a query parameter.
    uri = '/api/third/getSpiderSessionList'
    url = AiSeoApis.build_full_url(uri)
    json_data = {**config.AI_SEO_API_AUTH, 'platform_id': platform_id, 'status': status}
async with httpx.AsyncClient() as client: async with httpx.AsyncClient() as client:
response = await client.get(url, params=json_data, timeout=60) response = await client.get(url, params=json_data, timeout=60)
json_result = response.json() json_result = response.json()

4
utils/session_utils.py

@ -5,7 +5,7 @@ import config
from utils.ai_seo_api_utils import AiSeoApis from utils.ai_seo_api_utils import AiSeoApis
async def get_spider_session(platform_id):
async def get_spider_session(platform_id, id=''):
""" """
session session
:param platform_id: :param platform_id:
@ -13,7 +13,7 @@ async def get_spider_session(platform_id):
""" """
base_path = f'{config.ROOT_PATH}/data/session_data' base_path = f'{config.ROOT_PATH}/data/session_data'
# 爬虫信息 # 爬虫信息
session_info = await AiSeoApis.get_spider_session(platform_id)
session_info = await AiSeoApis.get_spider_session(platform_id, id)
if not session_info: if not session_info:
raise Exception(f"平台id: {platform_id} 没有可用的爬虫session") raise Exception(f"平台id: {platform_id} 没有可用的爬虫session")
if platform_id == 8: if platform_id == 8:

Loading…
Cancel
Save