commit
a38f378e16
32 changed files with 1897 additions and 0 deletions
  10  .gitignore
   1  base/__init__.py
  21  base/base_spider.py
  26  base/enums.py
  31  db.py
   7  lib/stealth.min.js
  88  main.py
   1  models/__init__.py
  39  models/monitor_result_db.py
  50  models/monitor_result_model.py
  36  models/monitor_task_db.py
  46  models/monitor_task_model.py
  10  requirements.txt
   1  spiders/__init__.py
   1  spiders/renmin/__init__.py
 151  spiders/renmin/client.py
  19  spiders/renmin/exception.py
 188  spiders/renmin/spider.py
   4  spiders/xinhua/__init__.py
 114  spiders/xinhua/client.py
  19  spiders/xinhua/exception.py
 201  spiders/xinhua/spider.py
   1  spiders/yang_shi/__init__.py
 160  spiders/yang_shi/client.py
  19  spiders/yang_shi/exception.py
 185  spiders/yang_shi/spider.py
   1  utils/__init__.py
 102  utils/date_format.py
  83  utils/mail.py
 113  utils/proxy.py
 109  utils/scheduler.py
  60  utils/utils.py
.gitignore
@@ -0,0 +1,10 @@
data
venv
**/log
.idea
**/__pycache__/
test.py
config.py
Pipfile
Pipfile.lock
run.bat
base/__init__.py
@@ -0,0 +1 @@
# -*- coding: utf-8 -*-
base/base_spider.py
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-

from abc import ABC, abstractmethod


class AbstractSpider(ABC):
    """
    Abstract base class for spiders.
    """
    def init_config(self):
        """
        Initialise configuration.
        :return:
        """
        pass

    @abstractmethod
    def start(self, task_id):
        """
        Run the spider.
        :return:
        """
base/enums.py
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-

from enum import Enum


class TaskStatus(Enum):
    WAITING = 1
    RUNNING = 2
    COMPLETED = 3
    FAIL = 4

    def __eq__(self, other):
        return self.value == other


class Platform(Enum):
    XIN_HUA = "xin_hua"
    REN_MIN = "ren_min"
    YANG_SHI = "yang_shi"

    def __eq__(self, other):
        return self.value == other


if __name__ == '__main__':
    print(Platform.REN_MIN == "ren_min")
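Note (not part of the commit): defining __eq__ on these Enum classes without a matching __hash__ leaves the members unhashable, because Python sets __hash__ to None whenever a class defines only __eq__. A minimal sketch of keeping the value comparison while restoring hashability, assuming that behaviour is wanted:

# -*- coding: utf-8 -*-
from enum import Enum


class Platform(Enum):
    XIN_HUA = "xin_hua"
    REN_MIN = "ren_min"
    YANG_SHI = "yang_shi"

    def __eq__(self, other):
        # Compare against the raw string value as well as other members.
        return self.value == other

    def __hash__(self):
        # Restore the hashability removed by defining __eq__.
        return hash(self.value)


if __name__ == '__main__':
    print(Platform.REN_MIN == "ren_min")        # True
    print(Platform.REN_MIN in {Platform.REN_MIN})  # usable as a set member again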
db.py
@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-

from tortoise import Tortoise, run_async
from config import *
from utils.utils import logger

"""
Database helpers
"""


def get_db_url():
    """
    Build the database URL.
    :return:
    """
    return F"mysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_DATABASE}"


async def init():
    """
    Initialise the database connection.
    :return:
    """
    await Tortoise.init(
        db_url=get_db_url(),
        modules={"models": ['models.monitor_result_db', 'models.monitor_task_db']}
    )
    await Tortoise.generate_schemas()
    logger.info("[DB] Database connection initialised")
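db.py imports run_async but never uses it; a quick connectivity check could look like the following sketch (assumes config.py provides DB_USER, DB_PASSWORD, DB_HOST, DB_PORT and DB_DATABASE):

# -*- coding: utf-8 -*-
from tortoise import run_async

import db

if __name__ == '__main__':
    # Opens the connection, creates any missing tables, then closes it again.
    run_async(db.init())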
lib/stealth.min.js (7 lines): file diff suppressed because it is too large
main.py
@@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-


import argparse
import asyncio
import os
import sys
import config
import db
import utils.date_format as date_format
from base.enums import Platform
from models import monitor_task_model
from utils.scheduler import SchedulerManager
from utils.utils import logger


def task_group(tasks):
    groups = {}
    for name, enum in Platform.__members__.items():
        groups[enum.value] = []
    for task in tasks:
        if task.platform in groups:
            groups[task.platform].append(task)
    return list(groups.values())


async def do_get_task_job():
    """
    Fetch today's task list and hand it to the scheduler.
    :return:
    """
    await db.init()
    tasks = await monitor_task_model.get_today_task()
    if not tasks:
        logger.info("No tasks fetched")
        return
    # Grouping by platform (currently disabled)
    # groups = task_group(tasks)
    # random.shuffle(groups)
    scheduler_manager = SchedulerManager()
    logger.info(F"============================== Fetched {len(tasks)} tasks ==============================")
    scheduler_manager.add_tasks(tasks, True)


def restart():
    os.execl(sys.executable, sys.executable, *sys.argv)


def load_arg_parse():
    """
    Parse command-line arguments.
    :return:
    """
    parse = argparse.ArgumentParser(description="Crawl social media news data")
    parse.add_argument("-a", "--active", help="run a task fetch immediately on startup", default='false')
    args = parse.parse_args()
    logger.info(F"Startup arguments: {args}")
    return args


def clear_system_proxy():
    # Remove proxy-related environment variables inherited from the system
    os.environ.pop('http_proxy', None)
    os.environ.pop('https_proxy', None)
    os.environ.pop('ftp_proxy', None)
    os.environ.pop('no_proxy', None)


if __name__ == '__main__':
    try:
        clear_system_proxy()
        logger.info(F"Started; tasks will be fetched every day at {config.GET_TASK_TIME}")
        get_task_time = date_format.gen_job_datetime(config.GET_TASK_TIME)
        manager = SchedulerManager()
        # Start the scheduler
        manager.start()
        # Add the daily task-fetch job
        manager.scheduler.add_job(do_get_task_job, 'cron', hour=get_task_time.hour, minute=get_task_time.minute)
        manager.scheduler.add_job(restart, 'cron', hour=get_task_time.hour, minute=0)
        # Check startup arguments
        args = load_arg_parse()
        if args.active and args.active.lower() == 'true':
            logger.info("Running a task fetch immediately...")
            asyncio.get_event_loop().run_until_complete(do_get_task_job())
        # Keep the event loop running
        asyncio.get_event_loop().run_forever()
    except KeyboardInterrupt:
        sys.exit()
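utils/scheduler.py is not shown in this section, but from main.py the SchedulerManager appears to wrap an APScheduler scheduler. A stripped-down sketch of the same cron wiring using APScheduler's AsyncIOScheduler directly (the 08:30 time is made up; the real value comes from config.GET_TASK_TIME):

# -*- coding: utf-8 -*-
import asyncio

from apscheduler.schedulers.asyncio import AsyncIOScheduler


async def fetch_tasks():
    print("fetching today's tasks...")


if __name__ == '__main__':
    scheduler = AsyncIOScheduler()
    # Equivalent of manager.scheduler.add_job(do_get_task_job, 'cron', hour=..., minute=...)
    scheduler.add_job(fetch_tasks, 'cron', hour=8, minute=30)
    scheduler.start()
    asyncio.get_event_loop().run_forever()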
models/__init__.py
@@ -0,0 +1 @@
# -*- coding: utf-8 -*-
@ -0,0 +1,39 @@ |
|||||
|
# -*- coding: utf-8 -*- |
||||
|
|
||||
|
from typing import Optional, Iterable |
||||
|
|
||||
|
from tortoise import fields, BaseDBAsyncClient |
||||
|
from tortoise.models import Model |
||||
|
import utils.date_format as date_format |
||||
|
|
||||
|
|
||||
|
class MonitorResult(Model): |
||||
|
""" |
||||
|
结果实体 |
||||
|
""" |
||||
|
id = fields.IntField(pk=True, autoincrement=True, description="ID") |
||||
|
keyword = fields.CharField(null=True, max_length=120, description="关键词") |
||||
|
title = fields.CharField(null=True, max_length=255, description="文章标题") |
||||
|
url = fields.CharField(null=True, max_length=500, description="文章地址") |
||||
|
publish_time = fields.BigIntField(null=True, max_length=20, description="发布时间") |
||||
|
platform = fields.CharField(null=True, max_length=20, description="平台") |
||||
|
gather_time = fields.CharField(null=True, description="设定采集时间", max_length=30) |
||||
|
content = fields.TextField(null=True, description="文章内容") |
||||
|
image = fields.CharField(null=True, max_length=255, description="结果截图") |
||||
|
is_del = fields.IntField(null=True, max_length=1, description="删除状态") |
||||
|
create_time = fields.BigIntField(null=True, max_length=20, description="创建时间") |
||||
|
update_time = fields.BigIntField(null=True, max_length=20, description="更新时间") |
||||
|
delete_time = fields.BigIntField(null=True, max_length=20, description="删除时间") |
||||
|
|
||||
|
class Meta: |
||||
|
table = "aux_monitor_result" |
||||
|
|
||||
|
def _pre_save( |
||||
|
self, |
||||
|
using_db: Optional[BaseDBAsyncClient] = None, |
||||
|
update_fields: Optional[Iterable[str]] = None, |
||||
|
) -> None: |
||||
|
if not self.id: |
||||
|
self.create_time = date_format.timestamp() |
||||
|
self.update_time = date_format.timestamp() |
||||
|
|
||||
@ -0,0 +1,50 @@ |
|||||
|
# -*- coding: utf-8 -*- |
||||
|
import config |
||||
|
from .monitor_result_db import MonitorResult |
||||
|
import utils.date_format as date_format |
||||
|
from .monitor_task_db import MonitorTask |
||||
|
|
||||
|
|
||||
|
def gen_result(task: MonitorTask, title, url, publish_time): |
||||
|
""" |
||||
|
构建任务结果对象 |
||||
|
:param task: 任务对象 |
||||
|
:param title: 标题 |
||||
|
:param url: 地址 |
||||
|
:param publish_time: 发布时间 |
||||
|
:return: |
||||
|
""" |
||||
|
if isinstance(publish_time, str): |
||||
|
timestamp = date_format.timestamp() |
||||
|
publish_time = int(date_format.parse(publish_time).timestamp()) |
||||
|
module = MonitorResult(title=title, url=url, publish_time=publish_time, |
||||
|
is_del=1, |
||||
|
keyword=task.keyword, platform=task.platform, |
||||
|
gather_time=F"{task.gather_date} {task.setting_time}") |
||||
|
return module |
||||
|
|
||||
|
|
||||
|
async def save(results): |
||||
|
if config.RESULT_UNIQUE: |
||||
|
await save_unique(results) |
||||
|
else: |
||||
|
model = MonitorResult() |
||||
|
await model.bulk_create(results) |
||||
|
|
||||
|
|
||||
|
async def save_unique(results): |
||||
|
# 过滤列表中重复的结果 |
||||
|
unique_results = {} |
||||
|
for result in results: |
||||
|
key = (result.platform, result.keyword, result.title) |
||||
|
if key not in unique_results: |
||||
|
unique_results[key] = result |
||||
|
unique_results = list(unique_results.values()) |
||||
|
# 过滤数据库中重复的结果 |
||||
|
save_results = [] |
||||
|
model = MonitorResult() |
||||
|
for result in unique_results: |
||||
|
exist = await model.filter(platform=result.platform, keyword=result.keyword, title=result.title).exists() |
||||
|
if not exist: |
||||
|
save_results.append(result) |
||||
|
await model.bulk_create(save_results) |
||||
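A usage sketch tying gen_result and save together the way the spiders do (the task id and field values are made up; save() only de-duplicates when config.RESULT_UNIQUE is truthy):

# -*- coding: utf-8 -*-
from tortoise import run_async

import db
from models import monitor_result_model, monitor_task_model


async def demo(task_id):
    await db.init()
    task = await monitor_task_model.get_task(task_id)
    results = [
        monitor_result_model.gen_result(task, "Some headline", "http://example.com/a", "2024-03-20 10:00:00"),
    ]
    # Filters duplicates within the batch and against the table before bulk insert.
    await monitor_result_model.save(results)


if __name__ == '__main__':
    run_async(demo(1))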
@ -0,0 +1,36 @@ |
|||||
|
# -*- coding: utf-8 -*- |
||||
|
|
||||
|
from typing import Optional, Iterable |
||||
|
|
||||
|
from tortoise import fields, BaseDBAsyncClient |
||||
|
from tortoise.models import Model |
||||
|
import utils.date_format as date_format |
||||
|
|
||||
|
|
||||
|
class MonitorTask(Model): |
||||
|
""" |
||||
|
任务实体 |
||||
|
""" |
||||
|
id = fields.IntField(pk=True, autoincrement=True, description="ID") |
||||
|
keyword = fields.CharField(null=True, max_length=255, description="关键词") |
||||
|
lang = fields.CharField(null=True, max_length=50, description="语言") |
||||
|
platform = fields.CharField(null=True, max_length=30, description="媒体平台") |
||||
|
gather_date = fields.CharField(null=True, max_length=30, description="采集日期") |
||||
|
gather_time = fields.CharField(null=True, max_length=30, description="采集时间") |
||||
|
setting_time = fields.CharField(null=True, max_length=30, description="设定时间") |
||||
|
status = fields.IntField(null=True, max_length=1, description="任务状态 1 待执行 2 进行中 3 已完成") |
||||
|
create_time = fields.BigIntField(null=True, max_length=16, description="创建时间") |
||||
|
update_time = fields.BigIntField(null=True, max_length=16, description="更新时间") |
||||
|
|
||||
|
class Meta: |
||||
|
table = "aux_monitor_task" |
||||
|
|
||||
|
async def _pre_save( |
||||
|
self, |
||||
|
using_db: Optional[BaseDBAsyncClient] = None, |
||||
|
update_fields: Optional[Iterable[str]] = None, |
||||
|
) -> None: |
||||
|
if not self.id: |
||||
|
self.create_time = date_format.timestamp() |
||||
|
self.update_time = date_format.timestamp() |
||||
|
|
||||
@ -0,0 +1,46 @@ |
|||||
|
# -*- coding: utf-8 -*- |
||||
|
|
||||
|
from .monitor_task_db import MonitorTask |
||||
|
import utils.date_format as date_format |
||||
|
from base.enums import TaskStatus |
||||
|
|
||||
|
|
||||
|
async def get_today_task(): |
||||
|
""" |
||||
|
获取当天的任务信息 |
||||
|
:return: |
||||
|
""" |
||||
|
# 当天日期 |
||||
|
today = date_format.gen_today_str() |
||||
|
task_model = MonitorTask() |
||||
|
result = await task_model.filter(gather_date=today, status=TaskStatus.WAITING.value).all() |
||||
|
return result |
||||
|
|
||||
|
|
||||
|
async def get_task(task_id): |
||||
|
""" |
||||
|
获取指定id的任务信息 |
||||
|
:param task_id: |
||||
|
:return: |
||||
|
""" |
||||
|
task_model = MonitorTask() |
||||
|
return await task_model.get_or_none(id=task_id) |
||||
|
|
||||
|
|
||||
|
async def complete(task_id): |
||||
|
task_model = MonitorTask() |
||||
|
await task_model.filter(id=task_id).update(status=TaskStatus.COMPLETED.value, update_time=date_format.timestamp()) |
||||
|
|
||||
|
|
||||
|
async def running(task_id): |
||||
|
task_model = MonitorTask() |
||||
|
await task_model.filter(id=task_id).update(status=TaskStatus.RUNNING.value, update_time=date_format.timestamp()) |
||||
|
|
||||
|
|
||||
|
async def fail(task_id): |
||||
|
task_model = MonitorTask() |
||||
|
await task_model.filter(id=task_id).update(status=TaskStatus.FAIL.value, update_time=date_format.timestamp()) |
||||
|
|
||||
|
|
||||
|
if __name__ == '__main__': |
||||
|
get_today_task() |
||||
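The __main__ guard above only creates a coroutine object: get_today_task() is async, so nothing is awaited and no database connection exists yet. An ad-hoc check would have to look roughly like this sketch:

# -*- coding: utf-8 -*-
from tortoise import run_async

import db
from models.monitor_task_model import get_today_task


async def main():
    await db.init()
    print(await get_today_task())


if __name__ == '__main__':
    run_async(main())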
requirements.txt
@@ -0,0 +1,10 @@
tortoise-orm~=0.19.0
playwright~=1.42.0
httpx~=0.27.0
aiomysql~=0.2.0
pymysql~=1.1.0
python-dateutil~=2.9.0.post0
APScheduler~=3.10.4
yagmail~=0.15.293
retry~=0.9.2
spiders/__init__.py
@@ -0,0 +1 @@
# -*- coding: utf-8 -*-

spiders/renmin/__init__.py
@@ -0,0 +1 @@
# -*- coding: utf-8 -*-
@ -0,0 +1,151 @@ |
|||||
|
# -*- coding: utf-8 -*- |
||||
|
|
||||
|
import json |
||||
|
from typing import Dict |
||||
|
from urllib.parse import urlencode |
||||
|
import httpx |
||||
|
from httpx._exceptions import HTTPError, RequestError |
||||
|
from playwright.async_api import Page |
||||
|
from .exception import DataFetchError |
||||
|
import asyncio |
||||
|
import json |
||||
|
import utils.date_format as date_format |
||||
|
from utils.utils import count_characters |
||||
|
from utils.utils import logger |
||||
|
import utils.proxy as proxy |
||||
|
import config |
||||
|
|
||||
|
|
||||
|
class RenMinClient: |
||||
|
def __init__(self, |
||||
|
timeout=60, |
||||
|
*, |
||||
|
playwright_page: Page, |
||||
|
cookie_dict: Dict[str, str]): |
||||
|
self.timeout = timeout |
||||
|
self.headers = { |
||||
|
"Accept": "application/json, text/plain, */*", |
||||
|
"Accept-Encoding": "gzip, deflate", |
||||
|
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", |
||||
|
"Cache-Control": "no-cache", |
||||
|
"Connection": "keep-alive", |
||||
|
"Cookie": "__jsluid_h=103d2323e283c476b59b2fdd3b9a5371; sso_c=0; sfr=1", |
||||
|
"Host": "search.people.cn", |
||||
|
"Content-Length": "163", |
||||
|
"Content-Type": "application/json", |
||||
|
"Origin": "http://search.people.cn", |
||||
|
"Pragma": "no-cache", |
||||
|
"Referer": "http://search.people.cn/s?keyword=%E4%B9%A1%E6%9D%91%E6%8C%AF%E5%85%B4&st=0&_=1710919073824", |
||||
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" |
||||
|
} |
||||
|
self._host = "http://search.people.cn" |
||||
|
self.playwright_page = playwright_page |
||||
|
self.cookie_dict = cookie_dict |
||||
|
|
||||
|
async def request(self, method, url, **kwargs): |
||||
|
""" |
||||
|
请求方法 |
||||
|
:param method: 请求方法 |
||||
|
:param url: 地址 |
||||
|
:param kwargs: 参数 |
||||
|
:return: 返回结果 |
||||
|
""" |
||||
|
# api代理 |
||||
|
proxies = proxy.get_ip().to_httpx_proxies() if config.API_PROXY else None |
||||
|
try: |
||||
|
async with httpx.AsyncClient(proxies=proxies) as client: |
||||
|
response = await client.request( |
||||
|
method, url, timeout=self.timeout, |
||||
|
**kwargs |
||||
|
) |
||||
|
# 人民网504 是没有数据 |
||||
|
if response.status_code == 504: |
||||
|
# logger.error(F"[人民网]黑名单异常: [{method}]{url} 参数: {kwargs}") |
||||
|
# raise DataFetchError("黑名单异常", url, method, kwargs) |
||||
|
return {} |
||||
|
if not response.status_code == 200: |
||||
|
logger.error(F"[人民网]httpx异常[{response.status_code}]: [{method}]{url} 参数: {kwargs}") |
||||
|
raise DataFetchError("httpx异常", url, method, kwargs) |
||||
|
data: Dict = response.json() |
||||
|
if data.get("code") != "0": |
||||
|
raise DataFetchError(data.get("message", "未知错误"), url) |
||||
|
else: |
||||
|
return data.get("data", {}) |
||||
|
except HTTPError as e: |
||||
|
logger.error(F"[人民网]httpx异常: [{e.request.method}]{e.request.url} 参数: {kwargs}") |
||||
|
logger.error(F"[人民网]错误信息{str(e)}") |
||||
|
raise DataFetchError(str(e), url) |
||||
|
except Exception as e: |
||||
|
logger.error(F"[人民网]未知的请求方法异常: [{method}]{url} 参数: {kwargs}") |
||||
|
logger.error(F"[人民网]错误信息{str(e)}") |
||||
|
raise Exception(str(e)) |
||||
|
|
||||
|
async def get(self, uri: str, params=None) -> Dict: |
||||
|
""" |
||||
|
GET 请求方法 |
||||
|
:param uri: 请求地址 |
||||
|
:param params: 参数 |
||||
|
:return: 返回结果 |
||||
|
""" |
||||
|
final_uri = uri |
||||
|
if isinstance(params, dict): |
||||
|
final_uri = (f"{uri}?" |
||||
|
f"{urlencode(params)}") |
||||
|
return await self.request(method="GET", url=F"{self._host}{final_uri}", headers=self.headers) |
||||
|
|
||||
|
async def post(self, uri: str, data: dict) -> Dict: |
||||
|
""" |
||||
|
POST 请求方法 |
||||
|
:param uri: 请求地址 |
||||
|
:param data: 参数 |
||||
|
:return: 返回结果 |
||||
|
""" |
||||
|
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False) |
||||
|
return await self.request(method="POST", url=F"{self._host}{uri}", |
||||
|
data=json_str, headers=self.headers) |
||||
|
|
||||
|
async def search(self, keyword, cur_page): |
||||
|
""" |
||||
|
搜索 |
||||
|
:param end: |
||||
|
:param start: |
||||
|
:param keyword: 关键词 |
||||
|
:param cur_page: 页码 |
||||
|
:param sort_field: 排序 # 0 时间倒序 1 时间正序 |
||||
|
:return: |
||||
|
""" |
||||
|
# 接口地址 |
||||
|
uri = '/search-platform/front/search' |
||||
|
get_param = { |
||||
|
'key': keyword, |
||||
|
'startTime': 0, |
||||
|
'endTime': 0, |
||||
|
'hasContent': True, |
||||
|
'hasTitle': True, |
||||
|
'isFuzzy': False, # 精准匹配 |
||||
|
'limit': 10, |
||||
|
'page': cur_page, |
||||
|
'sortType': 0, |
||||
|
'type': 0 |
||||
|
} |
||||
|
chinese, not_chinese = count_characters(keyword) |
||||
|
# 长度 = 127+ 汉字*3 + 其他*1 |
||||
|
# 关键字部分 |
||||
|
content_length = 126 + (chinese * 3) + not_chinese + 1 # 如果精准匹配是False 加一字节 |
||||
|
# 页码部分 |
||||
|
chinese, not_chinese = count_characters(cur_page) |
||||
|
content_length = content_length + not_chinese |
||||
|
|
||||
|
logger.info(F"[人民网]请求长度: {content_length}") |
||||
|
logger.info(F"[人民网]参数: {get_param}") |
||||
|
self.headers['Content-Length'] = str(content_length) |
||||
|
content = await self.post(uri, get_param) |
||||
|
if not content or not content.get('records'): |
||||
|
return [] |
||||
|
return content.get('records', []) |
||||
|
|
||||
|
|
||||
|
if __name__ == '__main__': |
||||
|
client = RenMinClient(playwright_page=None, cookie_dict={}) |
||||
|
start, end = date_format.today_timestamp_long() |
||||
|
asyncio.run(client.search('乡村发展', 1)) |
||||
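The Content-Length arithmetic above (126, plus 3 per Chinese character and 1 per other character of the keyword, plus one byte when isFuzzy is False, plus one per page digit) appears to reproduce the UTF-8 byte length of the serialised JSON body, since CJK characters encode to 3 bytes each. Measuring the body directly avoids the hand counting; a sketch:

# -*- coding: utf-8 -*-
import json


def body_and_length(keyword, cur_page):
    payload = {
        'key': keyword, 'startTime': 0, 'endTime': 0,
        'hasContent': True, 'hasTitle': True, 'isFuzzy': False,
        'limit': 10, 'page': cur_page, 'sortType': 0, 'type': 0,
    }
    body = json.dumps(payload, separators=(',', ':'), ensure_ascii=False)
    return body, len(body.encode('utf-8'))


if __name__ == '__main__':
    body, length = body_and_length('乡村振兴', 1)
    print(length)  # matches the value the client computes for typical keywords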
spiders/renmin/exception.py
@@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-

from httpx import RequestError


class DataFetchError(RequestError):
    """Generic data-fetch failure."""
    def __init__(self, message, url, method="GET", params=None):
        self.message = message
        self.url = url
        self.method = method
        self.params = params

    def __str__(self):
        return self.message


class IPBlockError(RequestError):
    """Raised when the client IP is blocked."""
@ -0,0 +1,188 @@ |
|||||
|
# -*- coding: utf-8 -*- |
||||
|
|
||||
|
from playwright.async_api import async_playwright, Page, BrowserType, BrowserContext |
||||
|
|
||||
|
from base.base_spider import AbstractSpider |
||||
|
from typing import Dict, List, Optional, Tuple |
||||
|
from .client import RenMinClient |
||||
|
from utils.utils import logger, is_blank |
||||
|
from models.monitor_task_model import get_task, running, complete, fail |
||||
|
from models.monitor_result_model import gen_result, save |
||||
|
from base.enums import Platform |
||||
|
import utils.date_format as date_format |
||||
|
import os |
||||
|
import config |
||||
|
import uuid |
||||
|
from .exception import DataFetchError |
||||
|
import utils.mail as mail |
||||
|
import asyncio |
||||
|
from tortoise.transactions import in_transaction |
||||
|
|
||||
|
|
||||
|
class RenMinSpider(AbstractSpider): |
||||
|
""" |
||||
|
人民网爬虫 |
||||
|
""" |
||||
|
client: RenMinClient # 请求对象 |
||||
|
context_page: Page # 浏览器页面上下文 |
||||
|
browser_context: BrowserContext # 浏览器上下文 |
||||
|
image_path: str |
||||
|
|
||||
|
def __init__(self): |
||||
|
self.index_url = "http://www.people.com.cn/" |
||||
|
self.platform = Platform.REN_MIN |
||||
|
self.image_path = None |
||||
|
self.retry = 0 # 自旋次数 |
||||
|
|
||||
|
def init_config(self): |
||||
|
super().init_config() |
||||
|
|
||||
|
async def start(self, task_id): |
||||
|
try: |
||||
|
async with in_transaction(): |
||||
|
await self.do_spider(task_id) |
||||
|
except DataFetchError as e: |
||||
|
logger.error(F"[人民网]任务ID: {task_id} 获取数据异常") |
||||
|
logger.error(F"[人民网]任务ID: {task_id} 异常信息: {str(e)}") |
||||
|
# 尝试自旋 |
||||
|
self.retry = self.retry + 1 |
||||
|
if self.retry > 3: |
||||
|
await fail(task_id) |
||||
|
logger.error(F"[人民网]任务ID: {task_id} 重试达到最大次数 即将发送告警邮件") |
||||
|
await mail.send_post_mail(task_id, "人民网", str(e)) |
||||
|
else: |
||||
|
logger.info(F"[人民网]任务ID: {task_id} 20秒后进行第{self.retry}次重试") |
||||
|
await asyncio.sleep(20) |
||||
|
await self.do_spider(task_id) |
||||
|
except Exception as e: |
||||
|
logger.error(F"[人民网]任务ID: {task_id} 爬虫异常") |
||||
|
logger.error(F"[人民网]任务ID: {task_id} 异常信息: {str(e)}") |
||||
|
# 切换代理ip并自旋 |
||||
|
# 尝试自旋 |
||||
|
self.retry = self.retry + 1 |
||||
|
if self.retry > 3: |
||||
|
await fail(task_id) |
||||
|
logger.error(F"[人民网]任务ID: {task_id} 重试达到最大次数 即将发送告警邮件") |
||||
|
await mail.send_post_mail(task_id, "人民网", str(e)) |
||||
|
else: |
||||
|
logger.info(F"[人民网]任务ID: {task_id} 20秒后进行第{self.retry}次重试") |
||||
|
await asyncio.sleep(20) |
||||
|
await self.do_spider(task_id) |
||||
|
|
||||
|
async def create_client(self) -> RenMinClient: |
||||
|
return RenMinClient(playwright_page=None, cookie_dict={}) |
||||
|
|
||||
|
async def launch_browser(self, |
||||
|
chromium: BrowserType, |
||||
|
playwright_proxy: Optional[Dict], |
||||
|
user_agent: Optional[Dict], |
||||
|
headless: bool = True): |
||||
|
""" |
||||
|
启动一个浏览器上下文 |
||||
|
:param chromium: |
||||
|
:param headless: |
||||
|
:param self: 类型 |
||||
|
:param playwright_proxy: 代理 |
||||
|
:param user_agent: 用户标识 |
||||
|
:return: |
||||
|
""" |
||||
|
# 浏览器对象 |
||||
|
browser = await chromium.launch(proxy=playwright_proxy, headless=headless) |
||||
|
|
||||
|
# 浏览器上下文 |
||||
|
browser_context = await browser.new_context( |
||||
|
viewport={"width": 1920, "height": 1080}, |
||||
|
user_agent=user_agent |
||||
|
) |
||||
|
return browser_context |
||||
|
|
||||
|
async def do_search(self, task): |
||||
|
""" |
||||
|
获取任务信息 |
||||
|
:return: |
||||
|
""" |
||||
|
start, end = date_format.today_timestamp_long() # 开始结束时间 |
||||
|
results = [] |
||||
|
cur_page = 1 |
||||
|
logger.info(F"[人民网]开始执行任务 ID: {task.id} 关键词: {task.keyword} 语言: {task.lang}") |
||||
|
self.client = await self.create_client() |
||||
|
while True: |
||||
|
logger.info(F"[人民网]开始获取搜索结果 关键词: {task.keyword} 页码: {cur_page}") |
||||
|
search_datas = await self.client.search(task.keyword, cur_page) |
||||
|
logger.info(F"[人民网]获取到{len(search_datas)}条搜索结果") |
||||
|
if not search_datas: |
||||
|
logger.info(F"[人民网]关键词: {task.keyword} 页码: {cur_page}没有搜索到数据") |
||||
|
break |
||||
|
index = -1 |
||||
|
for i, data in enumerate(search_datas): |
||||
|
# 找到一个不是今天的数据就结束 |
||||
|
if not date_format.is_today(date_format.timestamp2date(data.get("displayTime")).strftime("%Y-%m-%d")): |
||||
|
index = i |
||||
|
break |
||||
|
# 切割 |
||||
|
if index == -1: |
||||
|
# 搜索结果的最后一个依然是今天的 整个添加 |
||||
|
results = results + search_datas |
||||
|
# 翻到下一页 继续找 |
||||
|
cur_page = cur_page + 1 |
||||
|
else: |
||||
|
# 搜索结果中有不是今天的 切割一部分添加 |
||||
|
results = results + search_datas[:index] |
||||
|
# 结束本次搜索 |
||||
|
break |
||||
|
logger.info(F"[人民网]关键词:{task.keyword} 搜索结束 总页码: {cur_page} 总条数: {len(results)}") |
||||
|
return results |
||||
|
|
||||
|
async def cut_screen(self, url): |
||||
|
""" |
||||
|
网页截图 |
||||
|
:param url: 地址 |
||||
|
:return: |
||||
|
""" |
||||
|
if not self.image_path: |
||||
|
image_path = config.IMAGE_PATH |
||||
|
if is_blank(image_path): |
||||
|
self.image_path = "./data" |
||||
|
if not os.path.exists(self.image_path): |
||||
|
os.makedirs(self.image_path) |
||||
|
save_path = F"{self.image_path}/{uuid.uuid4()}.png" |
||||
|
# 开始截图 |
||||
|
await self.context_page.goto(url) |
||||
|
await self.context_page.screenshot(path=save_path, full_page=True) |
||||
|
return save_path |
||||
|
|
||||
|
async def do_spider(self, task_id): |
||||
|
# 获取任务信息 |
||||
|
task = await get_task(task_id) |
||||
|
if not task: |
||||
|
logger.error(F"[人民网]任务ID: {task_id}不存在 任务结束") |
||||
|
return |
||||
|
logger.info(F"[人民网]任务ID: {task_id} 任务开始") |
||||
|
await running(task_id) |
||||
|
# 从api中获取数据 |
||||
|
search_datas = await self.do_search(task) |
||||
|
if not search_datas: |
||||
|
logger.info(F"[人民网]任务ID: {task_id} 关键词:{task.keyword} 未搜索到结果 任务结束") |
||||
|
await complete(task_id) |
||||
|
return |
||||
|
# 保存result实体 |
||||
|
results = [] |
||||
|
# 启动浏览器 |
||||
|
async with async_playwright() as playwright: |
||||
|
chromium = playwright.chromium |
||||
|
self.browser_context = await self.launch_browser(chromium, None, None, headless=True) |
||||
|
# 反反爬脚本 |
||||
|
await self.browser_context.add_init_script(path="lib/stealth.min.js") |
||||
|
self.context_page: Page = await self.browser_context.new_page() |
||||
|
# 构建结果实体 截图 |
||||
|
for data in search_datas: |
||||
|
result = gen_result(task, data.get("title"), data.get("url"), int(data.get("displayTime") / 1000)) |
||||
|
# img_path = await self.cut_screen(data.get("url")) |
||||
|
# result.image = img_path |
||||
|
results.append(result) |
||||
|
# logger.info(F"[人民网]标题: {data.get('title')} 截图文件名: {img_path}") |
||||
|
|
||||
|
# 结果落库 |
||||
|
await save(results) |
||||
|
logger.info(F"[人民网]任务ID: {task_id} 关键词: {task.keyword} 保存{len(results)}条数据 任务结束") |
||||
|
await complete(task_id) |
||||
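The retry handling in start() re-enters do_spider from inside the except blocks, so a failure during the retry itself escapes start() uncaught, and the counter is only reset when a new spider instance is created. A bounded retry loop is one alternative shape; a sketch, not what this commit does:

# -*- coding: utf-8 -*-
import asyncio


async def run_with_retries(spider, task_id, attempts=3, delay=20):
    for attempt in range(1, attempts + 1):
        try:
            await spider.do_spider(task_id)
            return True
        except Exception:  # broad on purpose, mirroring the spiders' handlers
            if attempt == attempts:
                # Caller can mark the task failed and send the alert mail here.
                return False
            await asyncio.sleep(delay)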
spiders/xinhua/__init__.py
@@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
@ -0,0 +1,114 @@ |
|||||
|
# -*- coding: utf-8 -*- |
||||
|
|
||||
|
import json |
||||
|
from typing import Dict |
||||
|
from urllib.parse import urlencode |
||||
|
from .exception import DataFetchError |
||||
|
import httpx |
||||
|
from playwright.async_api import Page |
||||
|
from httpx._exceptions import HTTPError |
||||
|
from utils.utils import logger |
||||
|
import asyncio |
||||
|
import utils.proxy as proxy |
||||
|
import config |
||||
|
|
||||
|
|
||||
|
class XinHuaClient: |
||||
|
def __init__(self, |
||||
|
timeout=10, |
||||
|
*, |
||||
|
headers: Dict[str, str], |
||||
|
playwright_page: Page, |
||||
|
cookie_dict: Dict[str, str]): |
||||
|
self.timeout = timeout |
||||
|
self.headers = headers |
||||
|
self._host = "https://so.news.cn/" |
||||
|
self.playwright_page = playwright_page |
||||
|
self.cookie_dict = cookie_dict |
||||
|
|
||||
|
async def request(self, method, url, **kwargs): |
||||
|
""" |
||||
|
请求方法 |
||||
|
:param method: 请求方法 |
||||
|
:param url: 地址 |
||||
|
:param kwargs: 参数 |
||||
|
:return: 返回结果 |
||||
|
""" |
||||
|
# api代理 |
||||
|
proxies = proxy.get_ip().to_httpx_proxies() if config.API_PROXY else None |
||||
|
try: |
||||
|
async with httpx.AsyncClient(proxies=proxies) as client: |
||||
|
response = await client.request( |
||||
|
method, url, timeout=self.timeout, |
||||
|
**kwargs |
||||
|
) |
||||
|
# 返回不正确的状态码 |
||||
|
if not response.status_code == 200: |
||||
|
logger.error(F"[新华网]httpx异常[{response.status_code}]: [{method}]{url} 参数: {kwargs}") |
||||
|
raise DataFetchError("httpx异常", url, method, kwargs) |
||||
|
# 返回正确的状态码 |
||||
|
data: Dict = response.json() |
||||
|
if data.get("code") != 200: |
||||
|
# 有特殊情况 敏感词会直接把content返回为没有找到相关稿件 |
||||
|
if data.get("content") == '没有找到相关稿件': |
||||
|
logger.warning(F"[新华网]触发敏感词 跳过请求 参数: {kwargs}") |
||||
|
return {} |
||||
|
raise DataFetchError(data.get("content", "API未知错误"), url, method, kwargs) |
||||
|
else: |
||||
|
return data.get("content", {}) |
||||
|
except HTTPError as e: |
||||
|
logger.error(F"[新华网]httpx异常: [{method}]{url} 参数: {kwargs}") |
||||
|
logger.error(F"[新华网]错误信息{str(e)}") |
||||
|
raise DataFetchError(str(e), url) |
||||
|
except Exception as e: |
||||
|
logger.error(F"[新华网]未知的请求方法异常: [{method}]{url} 参数: {kwargs}") |
||||
|
logger.error(F"[新华网]错误信息{str(e)}") |
||||
|
raise Exception(str(e)) |
||||
|
|
||||
|
async def get(self, uri: str, params=None) -> Dict: |
||||
|
""" |
||||
|
GET 请求方法 |
||||
|
:param uri: 请求地址 |
||||
|
:param params: 参数 |
||||
|
:return: 返回结果 |
||||
|
""" |
||||
|
final_uri = uri |
||||
|
if isinstance(params, dict): |
||||
|
final_uri = (f"{uri}?" |
||||
|
f"{urlencode(params)}") |
||||
|
return await self.request(method="GET", url=F"{self._host}{final_uri}", headers=self.headers) |
||||
|
|
||||
|
async def post(self, uri: str, data: dict) -> Dict: |
||||
|
""" |
||||
|
POST 请求方法 |
||||
|
:param uri: 请求地址 |
||||
|
:param data: 参数 |
||||
|
:return: 返回结果 |
||||
|
""" |
||||
|
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False) |
||||
|
return await self.request(method="POST", url=F"{self._host}{uri}", |
||||
|
data=json_str, headers=self.headers) |
||||
|
|
||||
|
async def search(self, keyword, cur_page, lang='cn', sort_field=0, search_fields=0): |
||||
|
""" |
||||
|
搜索 |
||||
|
:param lang: |
||||
|
:param keyword: 关键词 |
||||
|
:param cur_page: 页码 |
||||
|
:param sort_field: 排序 0: 相关度 1: 时间 |
||||
|
:param search_fields: 搜索类型: 0: 全文 1: 标题 |
||||
|
:return: |
||||
|
""" |
||||
|
# 接口地址 |
||||
|
uri = '/getNews' |
||||
|
get_param = { |
||||
|
'keyword': keyword, |
||||
|
'curPage': cur_page, |
||||
|
'sortField': sort_field, |
||||
|
'searchFields': search_fields, |
||||
|
'lang': lang |
||||
|
} |
||||
|
content = await self.get(uri, get_param) |
||||
|
if not content or not content.get('results'): |
||||
|
return [] |
||||
|
return content.get('results', []) |
||||
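An ad-hoc call to the Xinhua client, for reference (sketch only: the header set is trimmed, the spider passes a fuller browser-like set, and config.API_PROXY controls whether a proxy is fetched first):

# -*- coding: utf-8 -*-
import asyncio

from spiders.xinhua.client import XinHuaClient


async def main():
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Referer": "https://so.news.cn/",
    }
    client = XinHuaClient(headers=headers, playwright_page=None, cookie_dict={})
    # sort_field=1 sorts by time, search_fields=0 searches the full text
    records = await client.search("乡村振兴", cur_page=1, lang='cn', sort_field=1)
    print(len(records))


if __name__ == '__main__':
    asyncio.run(main())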
spiders/xinhua/exception.py
@@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-

from httpx import RequestError


class DataFetchError(RequestError):
    """Generic data-fetch failure."""
    def __init__(self, message, url, method="GET", params=None):
        self.message = message
        self.url = url
        self.method = method
        self.params = params

    def __str__(self):
        return self.message


class IPBlockError(RequestError):
    """Raised when the client IP is blocked."""
@ -0,0 +1,201 @@ |
|||||
|
# -*- coding: utf-8 -*- |
||||
|
|
||||
|
from playwright.async_api import async_playwright, Page, BrowserType, BrowserContext |
||||
|
|
||||
|
from base.base_spider import AbstractSpider |
||||
|
from typing import Dict, List, Optional, Tuple |
||||
|
from .client import XinHuaClient |
||||
|
from utils.utils import logger, is_blank |
||||
|
from models.monitor_task_model import get_task, running, complete, fail |
||||
|
from models.monitor_result_model import gen_result, save |
||||
|
from base.enums import Platform |
||||
|
import utils.date_format as date_format |
||||
|
import os |
||||
|
import config |
||||
|
import uuid |
||||
|
from .exception import DataFetchError |
||||
|
import asyncio |
||||
|
import utils.mail as mail |
||||
|
from tortoise.transactions import in_transaction |
||||
|
|
||||
|
|
||||
|
class XinHuaSpider(AbstractSpider): |
||||
|
""" |
||||
|
新华网爬虫 |
||||
|
""" |
||||
|
client: XinHuaClient # 请求对象 |
||||
|
context_page: Page # 浏览器页面上下文 |
||||
|
browser_context: BrowserContext # 浏览器上下文 |
||||
|
image_path: str |
||||
|
|
||||
|
def __init__(self): |
||||
|
self.index_url = "http://www.xinhuanet.com/" |
||||
|
self.platform = Platform.XIN_HUA |
||||
|
self.image_path = None |
||||
|
self.retry = 0 # 自旋次数 |
||||
|
self.context_page = None |
||||
|
|
||||
|
def init_config(self): |
||||
|
super().init_config() |
||||
|
|
||||
|
async def start(self, task_id): |
||||
|
try: |
||||
|
async with in_transaction(): |
||||
|
await self.do_spider(task_id) |
||||
|
except DataFetchError as e: |
||||
|
logger.error(F"[新华网]任务ID: {task_id} 获取数据异常") |
||||
|
logger.error(F"[新华网]任务ID: {task_id} 异常信息: {str(e)}") |
||||
|
# 尝试自旋 |
||||
|
self.retry = self.retry + 1 |
||||
|
if self.retry > 3: |
||||
|
await fail(task_id) |
||||
|
logger.error(F"[新华网]任务ID: {task_id} 重试达到最大次数 即将发送告警邮件") |
||||
|
await mail.send_post_mail(task_id, "新华网", str(e)) |
||||
|
else: |
||||
|
logger.info(F"[新华网]任务ID: {task_id} 20秒后进行第{self.retry}次重试") |
||||
|
await asyncio.sleep(20) |
||||
|
await self.do_spider(task_id) |
||||
|
except Exception as e: |
||||
|
logger.error(F"[新华网]任务ID: {task_id} 爬虫异常") |
||||
|
logger.error(F"[新华网]任务ID: {task_id} 异常信息: {str(e)}") |
||||
|
# 尝试自旋 |
||||
|
self.retry = self.retry + 1 |
||||
|
await fail(task_id) |
||||
|
if self.retry > 3: |
||||
|
logger.error(F"[新华网]任务ID: {task_id} 重试达到最大次数 即将发送告警邮件") |
||||
|
await mail.send_post_mail(task_id, "新华网", str(e)) |
||||
|
else: |
||||
|
logger.info(F"[新华网]任务ID: {task_id} 20秒后进行第{self.retry}次重试") |
||||
|
await asyncio.sleep(20) |
||||
|
await self.do_spider(task_id) |
||||
|
|
||||
|
async def create_xinhua_client(self, httpx_proxy: Optional[str]) -> XinHuaClient: |
||||
|
# 请求头 |
||||
|
headers = { |
||||
|
"Accept": "application/json, text/javascript, */*; q=0.01", "Accept-Encoding": "gzip, deflate, br, zstd", |
||||
|
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", "Cache-Control": "no-cache", "Connection": "keep-alive", |
||||
|
"Cookie": "org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=zh_CN; wdcid=7af5eba7b2f8b44b; arialoadData=false; acw_tc=2760778017108394678246790e1403779a009cc2c5fe412f126407bf171637", |
||||
|
"Host": "so.news.cn", "Pragma": "no-cache", "Referer": "https://so.news.cn/", "Sec-Fetch-Dest": "empty", |
||||
|
"Sec-Fetch-Mode": "cors", "Sec-Fetch-Site": "same-origin", |
||||
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", |
||||
|
"X-Requested-With": "XMLHttpRequest", |
||||
|
"sec-ch-ua": "\"Chromium\";v=\"122\", \"Not(A:Brand\";v=\"24\", \"Google Chrome\";v=\"122\"", |
||||
|
"sec-ch-ua-mobile": "?0", "sec-ch-ua-platform": "\"Windows\""} |
||||
|
client = XinHuaClient(headers=headers, cookie_dict=None, playwright_page=self.context_page) |
||||
|
return client |
||||
|
|
||||
|
async def launch_browser(self, |
||||
|
chromium: BrowserType, |
||||
|
playwright_proxy: Optional[Dict], |
||||
|
user_agent: Optional[Dict], |
||||
|
headless: bool = True): |
||||
|
""" |
||||
|
启动一个浏览器上下文 |
||||
|
:param chromium: |
||||
|
:param headless: |
||||
|
:param self: 类型 |
||||
|
:param playwright_proxy: 代理 |
||||
|
:param user_agent: 用户标识 |
||||
|
:return: |
||||
|
""" |
||||
|
# 浏览器对象 |
||||
|
browser = await chromium.launch(proxy=playwright_proxy, headless=headless) |
||||
|
|
||||
|
# 浏览器上下文 |
||||
|
browser_context = await browser.new_context( |
||||
|
viewport={"width": 1920, "height": 1080}, |
||||
|
user_agent=user_agent |
||||
|
) |
||||
|
return browser_context |
||||
|
|
||||
|
async def do_search(self, task): |
||||
|
""" |
||||
|
获取任务信息 |
||||
|
:return: |
||||
|
""" |
||||
|
results = [] |
||||
|
cur_page = 1 |
||||
|
logger.info(F"[新华网]开始执行任务 ID: {task.id} 关键词: {task.keyword} 语言: {task.lang}") |
||||
|
self.client = await self.create_xinhua_client(None) |
||||
|
while True: |
||||
|
logger.info(F"[新华网]开始获取搜索结果 关键词: {task.keyword} 页码: {cur_page}") |
||||
|
search_datas = await self.client.search(keyword=task.keyword, cur_page=cur_page, lang=task.lang) |
||||
|
logger.info(F"[新华网]获取到{len(search_datas)}条搜索结果") |
||||
|
if not search_datas: |
||||
|
logger.info(F"[新华网]关键词: {task.keyword} 页码: {cur_page}没有搜索到数据") |
||||
|
break |
||||
|
index = -1 |
||||
|
for i, data in enumerate(search_datas): |
||||
|
# 找到一个不是今天的数据就结束 |
||||
|
if not date_format.is_today(data.get("pubtime")): |
||||
|
index = i |
||||
|
break |
||||
|
# 如果全都是今天的 就翻页 |
||||
|
if index == -1: |
||||
|
# 搜索结果的最后一个依然是今天的 整个添加 |
||||
|
results = results + search_datas |
||||
|
# 翻到下一页 继续找 |
||||
|
cur_page = cur_page + 1 |
||||
|
else: |
||||
|
# 搜索结果中有不是今天的 切割一部分添加 |
||||
|
results = results + search_datas[:index] |
||||
|
# 结束本次搜索 |
||||
|
break |
||||
|
logger.info(F"[新华网]关键词: {task.keyword} 搜索结束 总页码: {cur_page} 总条数: {len(results)}") |
||||
|
return results |
||||
|
|
||||
|
async def cut_screen(self, url): |
||||
|
""" |
||||
|
网页截图 |
||||
|
:param url: 地址 |
||||
|
:return: |
||||
|
""" |
||||
|
if not self.image_path: |
||||
|
image_path = config.IMAGE_PATH |
||||
|
if is_blank(image_path): |
||||
|
self.image_path = "./data" |
||||
|
if not os.path.exists(self.image_path): |
||||
|
os.makedirs(self.image_path) |
||||
|
save_path = F"{self.image_path}/{uuid.uuid4()}.png" |
||||
|
# 开始截图 |
||||
|
await self.context_page.goto(url) |
||||
|
await self.context_page.screenshot(path=save_path, full_page=True) |
||||
|
return save_path |
||||
|
|
||||
|
async def do_spider(self, task_id): |
||||
|
# 获取任务信息 |
||||
|
task = await get_task(task_id) |
||||
|
if not task: |
||||
|
logger.error(F"[新华网]任务ID: {task_id}不存在 任务结束") |
||||
|
return |
||||
|
logger.info(F"[新华网]任务ID: {task_id} 任务开始") |
||||
|
await running(task_id) |
||||
|
# 从api中获取数据 |
||||
|
search_datas = await self.do_search(task) |
||||
|
if not search_datas: |
||||
|
logger.info(F"[新华网]任务ID: {task_id} 关键词:{task.keyword} 未搜索到结果 任务结束") |
||||
|
await complete(task_id) |
||||
|
return |
||||
|
# 保存result实体 |
||||
|
results = [] |
||||
|
# 启动浏览器 |
||||
|
async with async_playwright() as playwright: |
||||
|
chromium = playwright.chromium |
||||
|
self.browser_context = await self.launch_browser(chromium, None, None, headless=True) |
||||
|
# 反反爬脚本 |
||||
|
await self.browser_context.add_init_script(path="lib/stealth.min.js") |
||||
|
self.context_page: Page = await self.browser_context.new_page() |
||||
|
|
||||
|
# 构建结果实体 截图 |
||||
|
for data in search_datas: |
||||
|
result = gen_result(task, data.get("title"), data.get("url"), data.get("pubtime")) |
||||
|
# img_path = await self.cut_screen(data.get("url")) |
||||
|
# result.image = img_path |
||||
|
results.append(result) |
||||
|
# logger.info(F"[新华网]标题: {data.get('title')} 截图文件名: {img_path}") |
||||
|
|
||||
|
# 结果落库 |
||||
|
await save(results) |
||||
|
logger.info(F"[新华网]任务ID: {task_id} 关键词: {task.keyword} 保存{len(results)}条数据 任务结束") |
||||
|
await complete(task_id) |
||||
|
|
||||
spiders/yang_shi/__init__.py
@@ -0,0 +1 @@
# -*- coding: utf-8 -*-
@ -0,0 +1,160 @@ |
|||||
|
# -*- coding: utf-8 -*- |
||||
|
|
||||
|
import json |
||||
|
from typing import Dict |
||||
|
from urllib.parse import urlencode |
||||
|
import httpx |
||||
|
from playwright.async_api import Page |
||||
|
from .exception import DataFetchError |
||||
|
import asyncio |
||||
|
import json |
||||
|
import utils.date_format as date_format |
||||
|
from utils.utils import count_characters |
||||
|
from playwright.async_api import async_playwright |
||||
|
import asyncio |
||||
|
from utils.utils import logger |
||||
|
|
||||
|
|
||||
|
class YangShiClient: |
||||
|
def __init__(self, |
||||
|
timeout=60, |
||||
|
proxies=None, |
||||
|
*, |
||||
|
playwright_page: Page, |
||||
|
cookie_dict: Dict[str, str]): |
||||
|
self.proxies = proxies |
||||
|
self.timeout = timeout |
||||
|
self.headers = { |
||||
|
"Accept": "application/json, text/plain, */*", |
||||
|
"Accept-Encoding": "gzip, deflate", |
||||
|
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", |
||||
|
"Cache-Control": "no-cache", |
||||
|
"Connection": "keep-alive", |
||||
|
"Cookie": "__jsluid_h=103d2323e283c476b59b2fdd3b9a5371; sso_c=0; sfr=1", |
||||
|
"Host": "search.people.cn", |
||||
|
"Content-Length": "163", |
||||
|
"Content-Type": "application/json", |
||||
|
"Origin": "http://search.people.cn", |
||||
|
"Pragma": "no-cache", |
||||
|
"Referer": "http://search.people.cn/s?keyword=%E4%B9%A1%E6%9D%91%E6%8C%AF%E5%85%B4&st=0&_=1710919073824", |
||||
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" |
||||
|
} |
||||
|
self._host = "https://search.cctv.com/" |
||||
|
self.playwright_page = playwright_page |
||||
|
self.cookie_dict = cookie_dict |
||||
|
|
||||
|
async def request(self, method, url, **kwargs): |
||||
|
""" |
||||
|
请求方法 |
||||
|
:param method: 请求方法 |
||||
|
:param url: 地址 |
||||
|
:param kwargs: 参数 |
||||
|
:return: 返回结果 |
||||
|
""" |
||||
|
async with httpx.AsyncClient(proxies=self.proxies) as client: |
||||
|
response = await client.request( |
||||
|
method, url, timeout=self.timeout, |
||||
|
**kwargs |
||||
|
) |
||||
|
data: Dict = response.json() |
||||
|
if data.get("code") != "0": |
||||
|
raise DataFetchError(data.get("message", "未知错误")) |
||||
|
else: |
||||
|
return data.get("data", {}) |
||||
|
|
||||
|
async def get(self, uri: str, params=None) -> Dict: |
||||
|
""" |
||||
|
GET 请求方法 |
||||
|
:param uri: 请求地址 |
||||
|
:param params: 参数 |
||||
|
:return: 返回结果 |
||||
|
""" |
||||
|
final_uri = uri |
||||
|
if isinstance(params, dict): |
||||
|
final_uri = (f"{uri}?" |
||||
|
f"{urlencode(params)}") |
||||
|
return await self.request(method="GET", url=F"{self._host}{final_uri}", headers=self.headers) |
||||
|
|
||||
|
async def post(self, uri: str, data: dict) -> Dict: |
||||
|
""" |
||||
|
POST 请求方法 |
||||
|
:param uri: 请求地址 |
||||
|
:param data: 参数 |
||||
|
:return: 返回结果 |
||||
|
""" |
||||
|
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False) |
||||
|
return await self.request(method="POST", url=F"{self._host}{uri}", |
||||
|
data=json_str, headers=self.headers) |
||||
|
|
||||
|
async def search(self, keyword, cur_page): |
||||
|
""" |
||||
|
搜索 |
||||
|
:param keyword: 关键词 |
||||
|
:param cur_page: 页码 |
||||
|
:return: |
||||
|
""" |
||||
|
# 接口地址 |
||||
|
uri = F"/search.php?qtext={keyword}&page={cur_page}&type=web&sort=date&datepid=1&channel=&vtime=-1&is_search=1" |
||||
|
full_url = F"{self._host}{uri}" |
||||
|
try: |
||||
|
await self.playwright_page.goto(full_url) |
||||
|
results = [] |
||||
|
# 选择每一个结果元素 |
||||
|
elements = await self.playwright_page.query_selector_all("div.tright") |
||||
|
for element in elements: |
||||
|
title = "" |
||||
|
url = "" |
||||
|
publish_time = "" |
||||
|
# 标题元素 |
||||
|
tit = await element.query_selector(".tit") |
||||
|
if tit: |
||||
|
# 标题下面的链接 |
||||
|
span = await tit.query_selector("span") |
||||
|
url = await span.get_attribute("lanmu1") |
||||
|
|
||||
|
# 存放标题的a标签 |
||||
|
tit_a = await span.query_selector("a") |
||||
|
if tit_a: |
||||
|
title = await tit_a.inner_text() |
||||
|
# 发布时间元素 |
||||
|
tim = await element.query_selector(".src-tim .tim") |
||||
|
if tim: |
||||
|
tim_text = await tim.inner_text() |
||||
|
publish_time = tim_text.split(":")[1] |
||||
|
# 保存数据 |
||||
|
results.append({ |
||||
|
"keyword": keyword, |
||||
|
"title": title, |
||||
|
"url": url, |
||||
|
"publish_time": publish_time |
||||
|
}) |
||||
|
return results |
||||
|
except Exception as e: |
||||
|
logger.error(F"[央视网]搜索方法异常: 关键词: {keyword} 页码: {cur_page} {full_url}") |
||||
|
logger.error(F"[央视网]错误信息: {str(e)}") |
||||
|
raise DataFetchError(str(e), full_url) |
||||
|
|
||||
|
|
||||
|
async def run(): |
||||
|
async with async_playwright() as playwright: |
||||
|
# 启动浏览器 |
||||
|
async with async_playwright() as playwright: |
||||
|
chromium = playwright.chromium |
||||
|
browser = await chromium.launch(headless=False) |
||||
|
# 浏览器上下文 |
||||
|
browser_context = await browser.new_context( |
||||
|
viewport={"width": 1920, "height": 1080}, |
||||
|
user_agent="" |
||||
|
) |
||||
|
# 反反爬脚本 |
||||
|
await browser_context.add_init_script(path="../../lib/stealth.min.js") |
||||
|
context_page: Page = await browser_context.new_page() |
||||
|
|
||||
|
# 创建对象 |
||||
|
client = YangShiClient(playwright_page=context_page, cookie_dict={}) |
||||
|
result = await client.search("医保", 1) |
||||
|
print(result) |
||||
|
|
||||
|
|
||||
|
if __name__ == '__main__': |
||||
|
asyncio.get_event_loop().run_until_complete(run()) |
||||
spiders/yang_shi/exception.py
@@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-

from httpx import RequestError


class DataFetchError(RequestError):
    """Generic data-fetch failure."""
    def __init__(self, message, url, method="GET", params=None):
        self.message = message
        self.url = url
        self.method = method
        self.params = params

    def __str__(self):
        return self.message


class IPBlockError(RequestError):
    """Raised when the client IP is blocked."""
@ -0,0 +1,185 @@ |
|||||
|
# -*- coding: utf-8 -*- |
||||
|
|
||||
|
from playwright.async_api import async_playwright, Page, BrowserType, BrowserContext |
||||
|
|
||||
|
from base.base_spider import AbstractSpider |
||||
|
from typing import Dict, List, Optional, Tuple |
||||
|
from .client import YangShiClient |
||||
|
from utils.utils import logger, is_blank |
||||
|
from models.monitor_task_model import get_task, running, complete, fail |
||||
|
from models.monitor_result_model import gen_result, save |
||||
|
from base.enums import Platform |
||||
|
import utils.date_format as date_format |
||||
|
import os |
||||
|
import config |
||||
|
import uuid |
||||
|
from .exception import DataFetchError |
||||
|
import utils.mail as mail |
||||
|
import asyncio |
||||
|
from tortoise.transactions import in_transaction |
||||
|
|
||||
|
|
||||
|
class YangShiSpider(AbstractSpider): |
||||
|
""" |
||||
|
央视网爬虫 |
||||
|
""" |
||||
|
client: YangShiClient # 请求对象 |
||||
|
context_page: Page # 浏览器页面上下文 |
||||
|
browser_context: BrowserContext # 浏览器上下文 |
||||
|
image_path: str |
||||
|
|
||||
|
def __init__(self): |
||||
|
self.index_url = "https://tv.cctv.com/" |
||||
|
self.platform = Platform.YANG_SHI |
||||
|
self.image_path = None |
||||
|
self.retry = 0 # 自旋次数 |
||||
|
|
||||
|
def init_config(self): |
||||
|
super().init_config() |
||||
|
|
||||
|
async def start(self, task_id): |
||||
|
try: |
||||
|
async with in_transaction(): |
||||
|
await self.do_spider(task_id) |
||||
|
except DataFetchError as e: |
||||
|
logger.error(F"[央视网]任务ID: {task_id} 获取数据异常") |
||||
|
logger.error(F"[央视网]任务ID: {task_id} 异常信息: {str(e)}") |
||||
|
# 尝试自旋 |
||||
|
self.retry = self.retry + 1 |
||||
|
if self.retry > 3: |
||||
|
await fail(task_id) |
||||
|
logger.error(F"[央视网]任务ID: {task_id} 重试达到最大次数 即将发送告警邮件") |
||||
|
await mail.send_post_mail(task_id, "央视网", str(e)) |
||||
|
else: |
||||
|
logger.info(F"[央视网]任务ID: {task_id} 20秒后进行第{self.retry}次重试") |
||||
|
await asyncio.sleep(20) |
||||
|
await self.do_spider(task_id) |
||||
|
except Exception as e: |
||||
|
logger.error(F"[央视网]任务ID: {task_id} 爬虫异常") |
||||
|
logger.error(F"[央视网]任务ID: {task_id} 异常信息: {str(e)}") |
||||
|
# 切换代理ip并自旋 |
||||
|
logger.error(F"[央视网]任务ID: {task_id} 获取数据异常") |
||||
|
logger.error(F"[央视网]任务ID: {task_id} 异常信息: {str(e)}") |
||||
|
# 尝试自旋 |
||||
|
self.retry = self.retry + 1 |
||||
|
if self.retry > 3: |
||||
|
await fail(task_id) |
||||
|
logger.error(F"[央视网]任务ID: {task_id} 重试达到最大次数 即将发送告警邮件") |
||||
|
await mail.send_post_mail(task_id, "央视网", str(e)) |
||||
|
else: |
||||
|
logger.info(F"[央视网]任务ID: {task_id} 20秒后进行第{self.retry}次重试") |
||||
|
await asyncio.sleep(20) |
||||
|
await self.do_spider(task_id) |
||||
|
|
||||
|
async def create_client(self, httpx_proxy: Optional[str]) -> YangShiClient: |
||||
|
# 请求头 |
||||
|
client = YangShiClient(proxies=httpx_proxy, cookie_dict={}, playwright_page=self.context_page) |
||||
|
return client |
||||
|
|
||||
|
async def launch_browser(self, |
||||
|
chromium: BrowserType, |
||||
|
playwright_proxy: Optional[Dict], |
||||
|
user_agent: Optional[Dict], |
||||
|
headless: bool = True): |
||||
|
""" |
||||
|
启动一个浏览器上下文 |
||||
|
:param chromium: |
||||
|
:param headless: |
||||
|
:param self: 类型 |
||||
|
:param playwright_proxy: 代理 |
||||
|
:param user_agent: 用户标识 |
||||
|
:return: |
||||
|
""" |
||||
|
# 浏览器对象 |
||||
|
browser = await chromium.launch(proxy=playwright_proxy, headless=headless) |
||||
|
|
||||
|
# 浏览器上下文 |
||||
|
browser_context = await browser.new_context( |
||||
|
viewport={"width": 1920, "height": 1080}, |
||||
|
user_agent=user_agent |
||||
|
) |
||||
|
# 反反爬脚本 |
||||
|
await browser_context.add_init_script(path="lib/stealth.min.js") |
||||
|
context_page = await browser_context.new_page() |
||||
|
return browser_context, context_page |
||||
|
|
||||
|
async def do_search(self, task): |
||||
|
""" |
||||
|
获取任务信息 |
||||
|
:return: |
||||
|
""" |
||||
|
results = [] |
||||
|
cur_page = 1 |
||||
|
logger.info(F"[央视网]开始执行任务 ID: {task.id} 关键词: {task.keyword} 语言: {task.lang}") |
||||
|
self.client = await self.create_client(None) |
||||
|
while True: |
||||
|
logger.info(F"[央视网]任务ID: {task.id} 开始获取搜索结果 关键词: {task.keyword} 页码: {cur_page}") |
||||
|
search_datas = await self.client.search(keyword=task.keyword, cur_page=cur_page) |
||||
|
logger.info(F"[央视网]任务ID: {task.id} 获取到{len(search_datas)}条搜索结果") |
||||
|
if not search_datas: |
||||
|
logger.info(F"[央视网]任务ID: {task.id} 关键词: {task.keyword} 页码: {cur_page}没有搜索到数据") |
||||
|
break |
||||
|
index = -1 |
||||
|
for i, data in enumerate(search_datas): |
||||
|
# 找到一个不是今天的数据就结束 |
||||
|
if not date_format.is_today(data.get("publish_time")): |
||||
|
index = i |
||||
|
break |
||||
|
# 切割 |
||||
|
if index == -1: |
||||
|
# 搜索结果的最后一个依然是今天的 整个添加 |
||||
|
results = results + search_datas |
||||
|
# 翻到下一页 继续找 |
||||
|
cur_page = cur_page + 1 |
||||
|
else: |
||||
|
# 搜索结果中有不是今天的 切割一部分添加 |
||||
|
results = results + search_datas[:index] |
||||
|
# 结束本次搜索 |
||||
|
break |
||||
|
logger.info(F"[央视网]任务ID: {task.id} 关键词: {task.keyword} 搜索结束 总页码: {cur_page} 总条数: {len(results)}") |
||||
|
return results |
||||
|
|
||||
|
async def cut_screen(self, url): |
||||
|
""" |
||||
|
网页截图 |
||||
|
:param url: 地址 |
||||
|
:return: |
||||
|
""" |
||||
|
if not self.image_path: |
||||
|
image_path = config.IMAGE_PATH |
||||
|
if is_blank(image_path): |
||||
|
self.image_path = "./data" |
||||
|
if not os.path.exists(self.image_path): |
||||
|
os.makedirs(self.image_path) |
||||
|
save_path = F"{self.image_path}/{uuid.uuid4()}.png" |
||||
|
# 开始截图 |
||||
|
await self.context_page.goto(url) |
||||
|
await self.context_page.screenshot(path=save_path, full_page=True) |
||||
|
return save_path |
||||
|
|
||||
|
async def do_spider(self, task_id): |
||||
|
# 获取任务信息 |
||||
|
task = await get_task(task_id) |
||||
|
if not task: |
||||
|
logger.error(F"[央视网]任务ID: {task_id}不存在 任务结束") |
||||
|
return |
||||
|
logger.info(F"[央视网]任务ID: {task_id} 任务开始") |
||||
|
await running(task_id) |
||||
|
results = [] |
||||
|
# 启动浏览器 |
||||
|
async with async_playwright() as playwright: |
||||
|
chromium = playwright.chromium |
||||
|
self.browser_context, self.context_page = await self.launch_browser(chromium, None, None, headless=True) |
||||
|
# 创建请求客户端 |
||||
|
search_datas = await self.do_search(task) |
||||
|
# 构建结果实体 截图 |
||||
|
for data in search_datas: |
||||
|
result = gen_result(task, data.get("title"), data.get("url"), data.get("publish_time")) |
||||
|
# img_path = await self.cut_screen(data.get("url")) |
||||
|
# result.image = img_path |
||||
|
results.append(result) |
||||
|
# logger.info(F"[央视网] 任务ID: {task_id} 标题: {data.get('title')} 截图文件名: {img_path}") |
||||
|
# 结果落库 |
||||
|
await save(results) |
||||
|
logger.info(F"[央视网] 任务ID: {task_id} 关键词: {task.keyword} 保存{len(results)}条数据 任务结束") |
||||
|
await complete(task_id) |
||||
utils/__init__.py
@@ -0,0 +1 @@
# -*- coding: utf-8 -*-
@ -0,0 +1,102 @@ |
|||||
|
# -*- coding: utf-8 -*- |
||||
|
|
||||
|
from dateutil.parser import parse |
||||
|
from dateutil.relativedelta import relativedelta |
||||
|
from datetime import datetime, timedelta |
||||
|
import time |
||||
|
|
||||
|
|
||||
|
def gen_job_datetime(time_str, date_str=''): |
||||
|
""" |
||||
|
生成任务的时间和日期 |
||||
|
:param time_str: 时间 |
||||
|
:param date_str: 日期 |
||||
|
:return: |
||||
|
""" |
||||
|
if not time_str: |
||||
|
return None |
||||
|
return parse(date_str + time_str) |
||||
|
|
||||
|
|
||||
|
def gen_today_str(): |
||||
|
today = datetime.today() |
||||
|
return today.strftime("%Y-%m-%d") |
||||
|
|
||||
|
|
||||
|
def timestamp(): |
||||
|
return int(time.time()) |
||||
|
|
||||
|
|
||||
|
def is_today(date_str): |
||||
|
publish_date = parse(date_str).date() # 获取日期部分,忽略时间部分 |
||||
|
today = datetime.today().date() # 获取今天的日期,忽略时间部分 |
||||
|
# 检查日期是否相等 |
||||
|
return publish_date == today |
||||
|
|
||||
|
|
||||
|
def timestamp2date(timestamp_long): |
||||
|
d = datetime.utcfromtimestamp(float(timestamp_long / 1000)) |
||||
|
return d |
||||
|
|
||||
|
|
||||
|
def today_timestamp_long(): |
||||
|
""" |
||||
|
获取今天开始和结束的毫秒时间戳 |
||||
|
:return: |
||||
|
""" |
||||
|
start = parse("00:00") |
||||
|
end = start + timedelta(days=1) |
||||
|
return start.timestamp() * 1000, end.timestamp() * 1000 |
||||
|
|
||||
|
|
||||
|
def parse_time(time_str): |
||||
|
return datetime.strptime(time_str, '%H:%M').time() |
||||
|
|
||||
|
|
||||
|
def eq_time(time1: str, time2: str): |
||||
|
time1 = datetime.strptime(time1, '%H:%M').time() |
||||
|
time2 = datetime.strptime(time2, '%H:%M').time() |
||||
|
|
||||
|
today = datetime.today().date() |
||||
|
time1 = datetime.combine(today, time1) |
||||
|
time2 = datetime.combine(today, time2) |
||||
|
|
||||
|
return time1 == time2 |
||||
|
|
||||
|
|
||||
|
def ge_time(time1: str, time2: str): |
||||
|
""" |
||||
|
比较time1是否大于等于time2 |
||||
|
:param time1: |
||||
|
:param time2: |
||||
|
:return: |
||||
|
""" |
||||
|
time1 = datetime.strptime(time1, '%H:%M').time() |
||||
|
time2 = datetime.strptime(time2, '%H:%M').time() |
||||
|
|
||||
|
today = datetime.today().date() |
||||
|
time1 = datetime.combine(today, time1) |
||||
|
time2 = datetime.combine(today, time2) |
||||
|
|
||||
|
return time1 >= time2 |
||||
|
|
||||
|
|
||||
|
def lt_time(time1: str, time2: str): |
||||
|
""" |
||||
|
比较time1是否小于time2 |
||||
|
:param time1: |
||||
|
:param time2: |
||||
|
:return: |
||||
|
""" |
||||
|
time1 = datetime.strptime(time1, '%H:%M').time() |
||||
|
time2 = datetime.strptime(time2, '%H:%M').time() |
||||
|
|
||||
|
today = datetime.today().date() |
||||
|
time1 = datetime.combine(today, time1) |
||||
|
time2 = datetime.combine(today, time2) |
||||
|
|
||||
|
return time1 < time2 |
||||
|
|
||||
|
|
||||
|
if __name__ == '__main__': |
||||
|
print(lt_time("18:52", "23:55")) |
||||
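For reference, this is roughly how the renmin spider combines these helpers to keep only today's articles (sketch; displayTime values from the API are millisecond timestamps). Note that timestamp2date converts with utcfromtimestamp while is_today compares against the local date, so the check can shift around midnight on hosts not running in UTC:

# -*- coding: utf-8 -*-
import utils.date_format as date_format

display_time_ms = date_format.timestamp() * 1000  # millisecond timestamp, as the API returns
published = date_format.timestamp2date(display_time_ms).strftime("%Y-%m-%d")
print(date_format.is_today(published))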
utils/mail.py
@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-

import inspect
from datetime import datetime

import yagmail

import config
from models.monitor_task_model import get_task, complete
from utils.utils import logger


async def send_post_mail(task_id, name, message="出现异常"):
    keyword = ""
    task = await get_task(task_id)
    if not task:
        message = F"不存在的任务ID: {task_id}"
    else:
        keyword = task.keyword

    mail_server = None
    try:
        mail_server = yagmail.SMTP(user=config.SMTP_USER, password=config.SMTP_PASSWORD, host=config.SMTP_HOST)
    except Exception as e:
        logger.error("[邮件]初始化失败 请检查邮件配置")
        return
    to = []
    if not config.POST_EMAIL:
        logger.warning("[邮件]未配置用于接收邮件上报的邮箱地址 邮件上报被取消")
        return
    if isinstance(config.POST_EMAIL, str):
        to.append(config.POST_EMAIL)
    elif isinstance(config.POST_EMAIL, list):
        to = config.POST_EMAIL
    else:
        logger.warning("[邮件]未配置用于接收邮件上报的邮箱地址 邮件上报被取消")
        return

    title = F"HuoSpider上报邮件: {name}数据获取异常"
    content = F"""
    异常任务ID: {task_id}
    异常站点: {name}
    关键词: {keyword}
    异常信息: {message}
    上报时间: {datetime.today().strftime("%Y-%m-%d %H:%M:%S")}
    """
    # inspect.cleandoc returns a new string, so keep the cleaned result
    content = inspect.cleandoc(content)
    mail_server.send(to, title, content)
    mail_server.close()
    logger.info(f"[邮件]任务ID: {task_id} 异常上报邮件发送成功 收件人: {to}")


async def test(task_id, name, message="出现异常"):
    keyword = "测试"

    mail_server = None
    try:
        mail_server = yagmail.SMTP(user=config.SMTP_USER, password=config.SMTP_PASSWORD, host=config.SMTP_HOST)
    except Exception as e:
        logger.error("[邮件]初始化失败 请检查邮件配置")
        return
    to = []
    if not config.POST_EMAIL:
        logger.warning("[邮件]未配置用于接收邮件上报的邮箱地址 邮件上报被取消")
        return
    if isinstance(config.POST_EMAIL, str):
        to.append(config.POST_EMAIL)
    elif isinstance(config.POST_EMAIL, list):
        to = config.POST_EMAIL
    else:
        logger.warning("[邮件]未配置用于接收邮件上报的邮箱地址 邮件上报被取消")
        return

    title = F"HuoSpider上报邮件: {name}数据获取异常"
    content = F"""
    异常任务ID: {task_id}
    异常站点: {name}
    关键词: {keyword}
    异常信息: {message}
    上报时间: {datetime.today().strftime("%Y-%m-%d %H:%M:%S")}
    """
    content = inspect.cleandoc(content)
    mail_server.send(to, title, content)
    mail_server.close()
    logger.info(f"[邮件]任务ID: {task_id} 异常上报邮件发送成功 收件人: {to}")
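A minimal usage sketch for the mail reporter above (not part of the diff): it assumes config.py defines SMTP_USER, SMTP_PASSWORD, SMTP_HOST and POST_EMAIL, and the task id and site name below are placeholders.

# Hedged example: send an exception report for an existing monitor task.
import asyncio

import db
from utils.mail import send_post_mail


async def demo():
    await db.init()  # open the Tortoise connection first, as db.py does
    # task id 1 and the site name are placeholders for illustration only
    await send_post_mail(1, "新华网", message="failed to parse the article list")


if __name__ == '__main__':
    asyncio.run(demo())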
utils/proxy.py
@ -0,0 +1,113 @@
# -*- coding: utf-8 -*-

from datetime import datetime, timedelta

import httpx
import retry
from dateutil.parser import parse

import config
from utils.utils import logger

# Proxy IP pool
ip_pool = []


class ProxyIp:
    ip: str
    port: int
    expire: datetime
    city: str
    isp: str

    def __init__(self, ip, port, expire, city='未知', isp='未知'):
        self.ip = ip
        self.port = port
        self.expire = parse(expire)
        self.city = city
        self.isp = isp

    def __str__(self):
        return F"({self.city}-{self.isp}){self.ip}:{self.port} 过期时间:{self.expire}"

    def is_expire(self):
        # Treat the IP as expired 20 seconds early to leave a safety margin
        now = datetime.now()
        expire = self.expire - timedelta(seconds=20)
        return expire < now

    def to_httpx_proxies(self):
        return {"http://": F"http://{self.ip}:{self.port}"}


class ProxyError(Exception):
    def __init__(self, message, code=-1000):
        self.code = code
        self.message = message

    def __str__(self):
        return F"错误码: {self.code} 错误消息: {self.message}"


@retry.retry(exceptions=ProxyError, tries=3, delay=2, backoff=2)
def add_ip(count=1) -> ProxyIp:
    """
    Request proxy IPs from the provider API, add them to the pool and return the first one.
    :param count: how many IPs to request, default 1
    :return:
    """
    url = "http://api.tianqiip.com/getip"
    params = {
        "secret": config.PROXY_SECRET,  # API secret
        "sign": config.PROXY_SIGN,  # signature
        "num": count,  # number of IPs
        "type": "json",  # response format
        "port": 1,  # protocol
        "time": 3,  # lifetime: three minutes
        "ts": 1,  # include the expiry time
        "mr": 1,  # deduplicate
        "cs": 1,  # include the location
        "ys": 1  # include the ISP
    }
    ips = []
    result: dict = httpx.get(url, params=params, proxies={}).json()
    if not result['code'] == 1000:
        logger.error("[IP池]API获取代理IP失败")
        # ProxyError expects (message, code)
        raise ProxyError(result['msg'], result['code'])
    for data in result["data"]:
        ip = ProxyIp(data['ip'], data['port'], data['expire'], city=data['city'], isp=data['isp'])
        ip_pool.append(ip)
        ips.append(ip)
        logger.info(F"[IP池]新增代理IP {str(ip)}")
    return ips[0]


def del_ip(index):
    if index > len(ip_pool) - 1:
        return
    logger.error(f"[IP池]代理IP被删除: {ip_pool[index]}")
    del ip_pool[index]


def get_ip(cache=True) -> ProxyIp:
    """
    Get a proxy IP object.
    :param cache: reuse a non-expired IP from the pool when True
    :return:
    """
    if not cache:
        # Skip the cache: request a new IP, add it to the pool and use it
        return add_ip()
    # Fall back to requesting a new IP when the pool is empty
    if not ip_pool:
        return add_ip()
    cur_ip = None
    for index, ip in enumerate(ip_pool):
        if not ip.is_expire():
            # Not expired yet, use it
            cur_ip = ip
            break
    if not cur_ip:
        return add_ip()
    logger.info(f"[IP池]从IP池中获取到代理IP: {cur_ip}")
    return cur_ip
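A short, hedged sketch of using the pool above with httpx (not part of the diff): the target URL is a placeholder, and PROXY_SECRET / PROXY_SIGN must be set in config.py for add_ip to succeed.

# Hedged example: fetch a page through a pooled proxy IP.
import httpx

from utils.proxy import ProxyError, get_ip


def fetch_with_proxy(url: str) -> str:
    try:
        proxy = get_ip()  # reuses a non-expired IP from the pool when possible
    except ProxyError as e:
        raise RuntimeError(f"no proxy available: {e}")
    # to_httpx_proxies() returns {"http://": "http://ip:port"}, matching the proxies= style used in add_ip
    resp = httpx.get(url, proxies=proxy.to_httpx_proxies(), timeout=10)
    resp.raise_for_status()
    return resp.text


if __name__ == '__main__':
    print(fetch_with_proxy("http://example.com")[:200])  # placeholder URL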
utils/scheduler.py
@ -0,0 +1,109 @@
# -*- coding: utf-8 -*-
import copy
import datetime
import logging
import random
from datetime import timedelta
from typing import Dict, List

from apscheduler.schedulers.asyncio import AsyncIOScheduler

import config
import utils.date_format as date_format
from base.base_spider import AbstractSpider
from base.enums import Platform
from models.monitor_task_db import MonitorTask
from spiders.renmin.spider import RenMinSpider
from spiders.xinhua.spider import XinHuaSpider
from spiders.yang_shi.spider import YangShiSpider
from utils.utils import logger


def singleton(cls):
    instances = {}

    def getinstance():
        if cls not in instances:
            instances[cls] = cls()
        return instances[cls]

    return getinstance


@singleton
class SchedulerManager:
    scheduler: AsyncIOScheduler

    def __init__(self):
        # Lower the apscheduler log level to keep the output quiet
        ap_logger = logging.getLogger('apscheduler')
        ap_logger.setLevel(logging.WARNING)
        self.scheduler = AsyncIOScheduler()

    def get_scheduler(self):
        """
        Get the scheduler object, creating it if necessary.
        :return:
        """
        if not self.scheduler:
            self.scheduler = AsyncIOScheduler()
        return self.scheduler

    def start(self, paused=False):
        self.scheduler.start(paused)

    def add_task(self, task: MonitorTask, offset=0, is_random=False):
        """
        Register a single monitoring task with the scheduler.
        :param is_random: whether to apply a random offset
        :param offset: run this many seconds later
        :param task:
        :return:
        """
        scheduler = self.get_scheduler()
        spider: AbstractSpider = None
        if task.platform == Platform.XIN_HUA:
            spider = XinHuaSpider()
        elif task.platform == Platform.REN_MIN:
            spider = RenMinSpider()
        elif task.platform == Platform.YANG_SHI:
            spider = YangShiSpider()
        if not spider:
            # logger.error(F"未知的平台: {task.platform} 任务id: {task.id}")
            return
        if not task.gather_time:
            logger.error(F"[调度器]采集时间不存在 任务id: {task.id}")
            return
        if is_random:
            offset = offset + random.randint(1, 29)
        # Shift the run time later by the offset
        task_date_time = date_format.gen_job_datetime(task.gather_time)
        task_date_time = task_date_time + timedelta(seconds=offset)

        if task_date_time < datetime.datetime.now():
            task_date_time = datetime.datetime.now() + datetime.timedelta(seconds=60)
        # Register the one-shot job
        scheduler.add_job(spider.start, "date", run_date=task_date_time, kwargs={"task_id": task.id})
        logger.info(
            F"[调度器]注册定时任务 ID: {task.id} 执行时间: {task_date_time} {F'偏移{offset}秒后执行' if offset > 0 else ''}")

    def add_tasks(self, tasks: List[MonitorTask], is_random=False):
        # Group tasks by platform and keyword
        group = {}
        for task in tasks:
            if task.platform not in group:
                group[task.platform] = {}
            if task.keyword not in group[task.platform]:
                group[task.platform][task.keyword] = []
            group[task.platform][task.keyword].append(task)
        # Walk every keyword group
        for platform, platform_group in group.items():
            for keyword, task_list in platform_group.items():
                # Sort each group by its own gather_time
                sorted_task_list = sorted(task_list, key=lambda e: date_format.parse_time(e.gather_time))
                # If the last task runs before the cut-off time, append a compensating task at the cut-off
                if date_format.lt_time(sorted_task_list[-1].gather_time, config.MAX_GATHER_TIME):
                    new_task = copy.deepcopy(sorted_task_list[-1])
                    new_task.gather_time = config.MAX_GATHER_TIME
                    sorted_task_list.append(new_task)
                for sorted_task in sorted_task_list:
                    self.add_task(sorted_task, 0, is_random)
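A hedged sketch of driving the scheduler from an asyncio program (not part of the diff): it assumes the Tortoise connection from db.py and that monitor tasks already exist in the table.

# Hedged example: load all stored tasks, register them and keep the loop alive.
import asyncio

import db
from models.monitor_task_db import MonitorTask
from utils.scheduler import SchedulerManager


async def run():
    await db.init()
    tasks = await MonitorTask.all()  # Tortoise queryset of the model used above
    manager = SchedulerManager()  # @singleton: always the same shared instance
    manager.start()
    manager.add_tasks(tasks, is_random=True)


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(run())
    loop.run_forever()  # keep running so the scheduled jobs can fire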
utils/utils.py
@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-

import logging
import os
import re
from logging.handlers import TimedRotatingFileHandler


def init_loging_config():
    # Make sure the log directory exists
    if not os.path.exists("./log"):
        os.mkdir("./log")
    # Create a handler that writes log files rotated by date.
    # when: 'W0' rotates weekly, 'D' daily, 'H' hourly, 'M' every minute;
    # 'midnight' rotates at midnight, 'h:m' rotates at the given hour and minute.
    # backupCount is how many rotated files to keep; the oldest beyond that are deleted.
    # when='D', interval=1, backupCount=7 would rotate daily and keep the last 7 days.
    file_handler = TimedRotatingFileHandler('./log/huo_spider.log', when='D', interval=1, encoding='utf-8')
    file_handler.setLevel(logging.DEBUG)

    # Define the handler's output format
    formatter = logging.Formatter('%(asctime)s [%(name)s] %(levelname)s %(message)s ')
    file_handler.setFormatter(formatter)

    level = logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s [%(name)s] %(levelname)s %(message)s ",
        datefmt='[%Y-%m-%d %H:%M:%S]'
    )
    _logger = logging.getLogger("HuoSpider")
    _logger.setLevel(level)
    _logger.addHandler(file_handler)
    return _logger


logger = init_loging_config()
def is_blank(val: str):
    # True when the value is None or contains only whitespace
    if val is None:
        return True
    if not val.strip():
        return True
    return False


def count_characters(val):
    """
    Count the numbers of Chinese and non-Chinese characters.
    :param val:
    :return:
    """
    if not isinstance(val, str):
        val = str(val)
    chinese_pattern = re.compile(r'[\u4e00-\u9fa5]')
    not_chinese_pattern = re.compile(r'[^\u4e00-\u9fa5]')
    chinese = re.findall(chinese_pattern, val)
    not_chinese = re.findall(not_chinese_pattern, val)
    return len(chinese), len(not_chinese)
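A brief, hedged example of the two helpers above (not part of the diff), with is_blank behaving as written here.

# Hedged example: expected behaviour of the helpers above.
from utils.utils import count_characters, is_blank, logger

logger.info("utils demo")
print(is_blank(None))        # True: None counts as blank
print(is_blank("   "))       # True: whitespace only
print(is_blank("监测"))       # False
print(count_characters("监测abc123"))  # (2, 6): 2 Chinese characters, 6 others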