commit a38f378e16
32 changed files with 1897 additions and 0 deletions

.gitignore                        +10
base/__init__.py                   +1
base/base_spider.py               +21
base/enums.py                     +26
db.py                             +31
lib/stealth.min.js                 +7
main.py                           +88
models/__init__.py                 +1
models/monitor_result_db.py       +39
models/monitor_result_model.py    +50
models/monitor_task_db.py         +36
models/monitor_task_model.py      +46
requirements.txt                  +10
spiders/__init__.py                +1
spiders/renmin/__init__.py         +1
spiders/renmin/client.py         +151
spiders/renmin/exception.py       +19
spiders/renmin/spider.py         +188
spiders/xinhua/__init__.py         +4
spiders/xinhua/client.py         +114
spiders/xinhua/exception.py       +19
spiders/xinhua/spider.py         +201
spiders/yang_shi/__init__.py       +1
spiders/yang_shi/client.py       +160
spiders/yang_shi/exception.py     +19
spiders/yang_shi/spider.py       +185
utils/__init__.py                  +1
utils/date_format.py             +102
utils/mail.py                     +83
utils/proxy.py                   +113
utils/scheduler.py               +109
utils/utils.py                    +60
.gitignore
@@ -0,0 +1,10 @@
data
venv
**/log
.idea
**/__pycache__/
test.py
config.py
Pipfile
Pipfile.lock
run.bat

base/__init__.py
@@ -0,0 +1 @@
# -*- coding: utf-8 -*-
base/base_spider.py
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-

from abc import ABC, abstractmethod


class AbstractSpider(ABC):
    """
    爬虫抽象类
    """
    def init_config(self):
        """
        初始化配置
        :return:
        """
        pass

    @abstractmethod
    def start(self, task_id):
        """
        运行爬虫
        :return:
        """
base/enums.py
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-

from enum import Enum


class TaskStatus(Enum):
    WAITING = 1
    RUNNING = 2
    COMPLETED = 3
    FAIL = 4

    def __eq__(self, other):
        return self.value == other


class Platform(Enum):
    XIN_HUA = "xin_hua"
    REN_MIN = "ren_min"
    YANG_SHI = "yang_shi"

    def __eq__(self, other):
        return self.value == other


if __name__ == '__main__':
    print(Platform.REN_MIN == "ren_min")
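One side effect of the value-based __eq__ above: defining __eq__ without __hash__ makes Python set __hash__ to None, so members such as Platform.REN_MIN can no longer be used as dict keys or set elements. Nothing in this commit does that (grouping keys use .value), but a minimal sketch of the usual fix, should hashing ever be needed:

from enum import Enum


class Platform(Enum):
    XIN_HUA = "xin_hua"
    REN_MIN = "ren_min"
    YANG_SHI = "yang_shi"

    # 仍然允许与原始字符串直接比较
    def __eq__(self, other):
        return self.value == other

    # 显式恢复可哈希性 (定义 __eq__ 会把 __hash__ 置为 None)
    __hash__ = Enum.__hash__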
db.py
@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-

from tortoise import Tortoise, run_async
from config import *
from utils.utils import logger

"""
数据库操作
"""


def get_db_url():
    """
    拼接数据库url
    :return:
    """
    return F"mysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_DATABASE}"


async def init():
    """
    初始化数据库连接
    :return:
    """
    await Tortoise.init(
        db_url=get_db_url(),
        modules={"models": ['models.monitor_result_db', 'models.monitor_task_db']}
    )
    await Tortoise.generate_schemas()
    logger.info("[数据库]初始化数据库连接成功")

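config.py is listed in .gitignore above, so its contents are not part of this diff. A minimal sketch of the settings the committed modules actually read (the names are taken from the code; every value below is a placeholder, not the project's real configuration):

# config.py -- hypothetical example; all values are placeholders.

# db.get_db_url() 使用的数据库连接信息
DB_USER = "root"
DB_PASSWORD = "secret"
DB_HOST = "127.0.0.1"
DB_PORT = 3306
DB_DATABASE = "huo_spider"

# main.py 每天拉取任务的时间 (HH:MM)
GET_TASK_TIME = "06:30"
# utils/scheduler.py 中补偿任务允许的最晚采集时间
MAX_GATHER_TIME = "23:00"

# 是否通过代理池发起 API 请求 (utils/proxy.py)
API_PROXY = False
PROXY_SECRET = ""
PROXY_SIGN = ""

# 截图目录与结果去重开关
IMAGE_PATH = "./data"
RESULT_UNIQUE = True

# utils/mail.py 告警邮件配置
SMTP_HOST = "smtp.example.com"
SMTP_USER = "alerts@example.com"
SMTP_PASSWORD = "app-password"
POST_EMAIL = ["ops@example.com"]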
lib/stealth.min.js    +7
File diff suppressed because it is too large
@ -0,0 +1,88 @@ |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
|
|||
import argparse |
|||
import asyncio |
|||
import os |
|||
import sys |
|||
import config |
|||
import db |
|||
import utils.date_format as date_format |
|||
from base.enums import Platform |
|||
from models import monitor_task_model |
|||
from utils.scheduler import SchedulerManager |
|||
from utils.utils import logger |
|||
|
|||
|
|||
def task_group(tasks): |
|||
groups = {} |
|||
for name, enum in Platform.__members__.items(): |
|||
groups[enum.value] = [] |
|||
for task in tasks: |
|||
if task.platform in groups: |
|||
groups[task.platform].append(task) |
|||
return list(groups.values()) |
|||
|
|||
|
|||
async def do_get_task_job(): |
|||
""" |
|||
获取任务信息 |
|||
:return: |
|||
""" |
|||
await db.init() |
|||
tasks = await monitor_task_model.get_today_task() |
|||
if not tasks: |
|||
logger.info(F"没有获取到任务信息") |
|||
return |
|||
# 分组 |
|||
# groups = task_group(tasks) |
|||
# random.shuffle(groups) |
|||
schedular_manager = SchedulerManager() |
|||
logger.info(F"============================== 获取到{len(tasks)}条任务信息 ==============================") |
|||
schedular_manager.add_tasks(tasks, True) |
|||
|
|||
|
|||
def restart(): |
|||
os.execl(sys.executable, sys.executable, *sys.argv) |
|||
|
|||
|
|||
def load_arg_parse(): |
|||
""" |
|||
解析启动参数 |
|||
:return: |
|||
""" |
|||
parse = argparse.ArgumentParser(description="抓取社媒新闻数据") |
|||
parse.add_argument("-a", "--active", help="启动脚本时 立即进行一次任务拉取", default='false') |
|||
args = parse.parse_args() |
|||
logger.info(F"启动参数: {args}") |
|||
return args |
|||
|
|||
|
|||
def clear_system_proxy(): |
|||
# 清除系统代理相关的环境变量 |
|||
os.environ.pop('http_proxy', None) |
|||
os.environ.pop('https_proxy', None) |
|||
os.environ.pop('ftp_proxy', None) |
|||
os.environ.pop('no_proxy', None) |
|||
|
|||
|
|||
if __name__ == '__main__': |
|||
try: |
|||
clear_system_proxy() |
|||
logger.info(F'启动成功 将在每天的{config.GET_TASK_TIME}拉取任务信息') |
|||
get_task_time = date_format.gen_job_datetime(config.GET_TASK_TIME) |
|||
manager = SchedulerManager() |
|||
# 启动定时任务 |
|||
manager.start() |
|||
# 添加拉取任务信息的任务 |
|||
manager.scheduler.add_job(do_get_task_job, 'cron', hour=get_task_time.hour, minute=get_task_time.minute) |
|||
manager.scheduler.add_job(restart, 'cron', hour=get_task_time.hour, minute=0) |
|||
# 参数检查 |
|||
args = load_arg_parse() |
|||
if args.active and args.active.lower() == 'true': |
|||
logger.info(F"立即执行一次任务拉取...") |
|||
asyncio.get_event_loop().run_until_complete(do_get_task_job()) |
|||
# 开启事件循环 |
|||
asyncio.get_event_loop().run_forever() |
|||
except KeyboardInterrupt: |
|||
sys.exit() |
|||
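For reference, the -a/--active switch added above is parsed as a plain string: only "true" (case-insensitive), e.g. python main.py -a true, triggers an immediate task pull at startup; any other value, including the default "false", leaves just the daily cron trigger at config.GET_TASK_TIME.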
@ -0,0 +1 @@ |
|||
# -*- coding: utf-8 -*- |
|||
@ -0,0 +1,39 @@ |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
from typing import Optional, Iterable |
|||
|
|||
from tortoise import fields, BaseDBAsyncClient |
|||
from tortoise.models import Model |
|||
import utils.date_format as date_format |
|||
|
|||
|
|||
class MonitorResult(Model): |
|||
""" |
|||
结果实体 |
|||
""" |
|||
id = fields.IntField(pk=True, autoincrement=True, description="ID") |
|||
keyword = fields.CharField(null=True, max_length=120, description="关键词") |
|||
title = fields.CharField(null=True, max_length=255, description="文章标题") |
|||
url = fields.CharField(null=True, max_length=500, description="文章地址") |
|||
publish_time = fields.BigIntField(null=True, max_length=20, description="发布时间") |
|||
platform = fields.CharField(null=True, max_length=20, description="平台") |
|||
gather_time = fields.CharField(null=True, description="设定采集时间", max_length=30) |
|||
content = fields.TextField(null=True, description="文章内容") |
|||
image = fields.CharField(null=True, max_length=255, description="结果截图") |
|||
is_del = fields.IntField(null=True, max_length=1, description="删除状态") |
|||
create_time = fields.BigIntField(null=True, max_length=20, description="创建时间") |
|||
update_time = fields.BigIntField(null=True, max_length=20, description="更新时间") |
|||
delete_time = fields.BigIntField(null=True, max_length=20, description="删除时间") |
|||
|
|||
class Meta: |
|||
table = "aux_monitor_result" |
|||
|
|||
async def _pre_save(
        self,
        using_db: Optional[BaseDBAsyncClient] = None,
        update_fields: Optional[Iterable[str]] = None,
) -> None:
|||
if not self.id: |
|||
self.create_time = date_format.timestamp() |
|||
self.update_time = date_format.timestamp() |
|||
|
|||
@ -0,0 +1,50 @@ |
|||
# -*- coding: utf-8 -*- |
|||
import config |
|||
from .monitor_result_db import MonitorResult |
|||
import utils.date_format as date_format |
|||
from .monitor_task_db import MonitorTask |
|||
|
|||
|
|||
def gen_result(task: MonitorTask, title, url, publish_time): |
|||
""" |
|||
构建任务结果对象 |
|||
:param task: 任务对象 |
|||
:param title: 标题 |
|||
:param url: 地址 |
|||
:param publish_time: 发布时间 |
|||
:return: |
|||
""" |
|||
if isinstance(publish_time, str):
    publish_time = int(date_format.parse(publish_time).timestamp())
|||
module = MonitorResult(title=title, url=url, publish_time=publish_time, |
|||
is_del=1, |
|||
keyword=task.keyword, platform=task.platform, |
|||
gather_time=F"{task.gather_date} {task.setting_time}") |
|||
return module |
|||
|
|||
|
|||
async def save(results): |
|||
if config.RESULT_UNIQUE: |
|||
await save_unique(results) |
|||
else: |
|||
model = MonitorResult() |
|||
await model.bulk_create(results) |
|||
|
|||
|
|||
async def save_unique(results): |
|||
# 过滤列表中重复的结果 |
|||
unique_results = {} |
|||
for result in results: |
|||
key = (result.platform, result.keyword, result.title) |
|||
if key not in unique_results: |
|||
unique_results[key] = result |
|||
unique_results = list(unique_results.values()) |
|||
# 过滤数据库中重复的结果 |
|||
save_results = [] |
|||
model = MonitorResult() |
|||
for result in unique_results: |
|||
exist = await model.filter(platform=result.platform, keyword=result.keyword, title=result.title).exists() |
|||
if not exist: |
|||
save_results.append(result) |
|||
await model.bulk_create(save_results) |
|||
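A minimal sketch of how the spiders use the two helpers above (values are placeholders; the Tortoise connection must already have been initialised via db.init(), and a task with the given ID is assumed to exist):

import asyncio

import db
from models.monitor_task_model import get_task
from models.monitor_result_model import gen_result, save


async def demo():
    await db.init()
    task = await get_task(1)  # 假设存在 ID 为 1 的任务
    result = gen_result(task, "示例标题", "http://example.com/a.html", 1710900000)
    await save([result])      # config.RESULT_UNIQUE 决定是否去重后入库


if __name__ == '__main__':
    asyncio.run(demo())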
@ -0,0 +1,36 @@ |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
from typing import Optional, Iterable |
|||
|
|||
from tortoise import fields, BaseDBAsyncClient |
|||
from tortoise.models import Model |
|||
import utils.date_format as date_format |
|||
|
|||
|
|||
class MonitorTask(Model): |
|||
""" |
|||
任务实体 |
|||
""" |
|||
id = fields.IntField(pk=True, autoincrement=True, description="ID") |
|||
keyword = fields.CharField(null=True, max_length=255, description="关键词") |
|||
lang = fields.CharField(null=True, max_length=50, description="语言") |
|||
platform = fields.CharField(null=True, max_length=30, description="媒体平台") |
|||
gather_date = fields.CharField(null=True, max_length=30, description="采集日期") |
|||
gather_time = fields.CharField(null=True, max_length=30, description="采集时间") |
|||
setting_time = fields.CharField(null=True, max_length=30, description="设定时间") |
|||
status = fields.IntField(null=True, max_length=1, description="任务状态 1 待执行 2 进行中 3 已完成") |
|||
create_time = fields.BigIntField(null=True, max_length=16, description="创建时间") |
|||
update_time = fields.BigIntField(null=True, max_length=16, description="更新时间") |
|||
|
|||
class Meta: |
|||
table = "aux_monitor_task" |
|||
|
|||
async def _pre_save( |
|||
self, |
|||
using_db: Optional[BaseDBAsyncClient] = None, |
|||
update_fields: Optional[Iterable[str]] = None, |
|||
) -> None: |
|||
if not self.id: |
|||
self.create_time = date_format.timestamp() |
|||
self.update_time = date_format.timestamp() |
|||
|
|||
@ -0,0 +1,46 @@ |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
from .monitor_task_db import MonitorTask |
|||
import utils.date_format as date_format |
|||
from base.enums import TaskStatus |
|||
|
|||
|
|||
async def get_today_task(): |
|||
""" |
|||
获取当天的任务信息 |
|||
:return: |
|||
""" |
|||
# 当天日期 |
|||
today = date_format.gen_today_str() |
|||
task_model = MonitorTask() |
|||
result = await task_model.filter(gather_date=today, status=TaskStatus.WAITING.value).all() |
|||
return result |
|||
|
|||
|
|||
async def get_task(task_id): |
|||
""" |
|||
获取指定id的任务信息 |
|||
:param task_id: |
|||
:return: |
|||
""" |
|||
task_model = MonitorTask() |
|||
return await task_model.get_or_none(id=task_id) |
|||
|
|||
|
|||
async def complete(task_id): |
|||
task_model = MonitorTask() |
|||
await task_model.filter(id=task_id).update(status=TaskStatus.COMPLETED.value, update_time=date_format.timestamp()) |
|||
|
|||
|
|||
async def running(task_id): |
|||
task_model = MonitorTask() |
|||
await task_model.filter(id=task_id).update(status=TaskStatus.RUNNING.value, update_time=date_format.timestamp()) |
|||
|
|||
|
|||
async def fail(task_id): |
|||
task_model = MonitorTask() |
|||
await task_model.filter(id=task_id).update(status=TaskStatus.FAIL.value, update_time=date_format.timestamp()) |
|||
|
|||
|
|||
if __name__ == '__main__': |
|||
get_today_task() |
|||
requirements.txt
@@ -0,0 +1,10 @@
tortoise-orm~=0.19.0
playwright~=1.42.0
httpx~=0.27.0
aiomysql~=0.2.0
pymysql~=1.1.0
python-dateutil~=2.9.0.post0
APScheduler~=3.10.4
yagmail~=0.15.293
retry~=0.9.2

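Note that installing the playwright package alone is not enough for the screenshot/stealth flow used by the spiders; the Chromium browser binary still has to be fetched once, typically with "playwright install chromium" after "pip install -r requirements.txt".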
@ -0,0 +1 @@ |
|||
# -*- coding: utf-8 -*- |
|||
@ -0,0 +1 @@ |
|||
# -*- coding: utf-8 -*- |
|||
@ -0,0 +1,151 @@ |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
import json |
|||
from typing import Dict |
|||
from urllib.parse import urlencode |
|||
import httpx |
|||
from httpx._exceptions import HTTPError, RequestError |
|||
from playwright.async_api import Page |
|||
from .exception import DataFetchError |
|||
import asyncio |
|||
import json |
|||
import utils.date_format as date_format |
|||
from utils.utils import count_characters |
|||
from utils.utils import logger |
|||
import utils.proxy as proxy |
|||
import config |
|||
|
|||
|
|||
class RenMinClient: |
|||
def __init__(self, |
|||
timeout=60, |
|||
*, |
|||
playwright_page: Page, |
|||
cookie_dict: Dict[str, str]): |
|||
self.timeout = timeout |
|||
self.headers = { |
|||
"Accept": "application/json, text/plain, */*", |
|||
"Accept-Encoding": "gzip, deflate", |
|||
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", |
|||
"Cache-Control": "no-cache", |
|||
"Connection": "keep-alive", |
|||
"Cookie": "__jsluid_h=103d2323e283c476b59b2fdd3b9a5371; sso_c=0; sfr=1", |
|||
"Host": "search.people.cn", |
|||
"Content-Length": "163", |
|||
"Content-Type": "application/json", |
|||
"Origin": "http://search.people.cn", |
|||
"Pragma": "no-cache", |
|||
"Referer": "http://search.people.cn/s?keyword=%E4%B9%A1%E6%9D%91%E6%8C%AF%E5%85%B4&st=0&_=1710919073824", |
|||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" |
|||
} |
|||
self._host = "http://search.people.cn" |
|||
self.playwright_page = playwright_page |
|||
self.cookie_dict = cookie_dict |
|||
|
|||
async def request(self, method, url, **kwargs): |
|||
""" |
|||
请求方法 |
|||
:param method: 请求方法 |
|||
:param url: 地址 |
|||
:param kwargs: 参数 |
|||
:return: 返回结果 |
|||
""" |
|||
# api代理 |
|||
proxies = proxy.get_ip().to_httpx_proxies() if config.API_PROXY else None |
|||
try: |
|||
async with httpx.AsyncClient(proxies=proxies) as client: |
|||
response = await client.request( |
|||
method, url, timeout=self.timeout, |
|||
**kwargs |
|||
) |
|||
# 人民网504 是没有数据 |
|||
if response.status_code == 504: |
|||
# logger.error(F"[人民网]黑名单异常: [{method}]{url} 参数: {kwargs}") |
|||
# raise DataFetchError("黑名单异常", url, method, kwargs) |
|||
return {} |
|||
if not response.status_code == 200: |
|||
logger.error(F"[人民网]httpx异常[{response.status_code}]: [{method}]{url} 参数: {kwargs}") |
|||
raise DataFetchError("httpx异常", url, method, kwargs) |
|||
data: Dict = response.json() |
|||
if data.get("code") != "0": |
|||
raise DataFetchError(data.get("message", "未知错误"), url) |
|||
else: |
|||
return data.get("data", {}) |
|||
except HTTPError as e: |
|||
logger.error(F"[人民网]httpx异常: [{e.request.method}]{e.request.url} 参数: {kwargs}") |
|||
logger.error(F"[人民网]错误信息{str(e)}") |
|||
raise DataFetchError(str(e), url) |
|||
except Exception as e: |
|||
logger.error(F"[人民网]未知的请求方法异常: [{method}]{url} 参数: {kwargs}") |
|||
logger.error(F"[人民网]错误信息{str(e)}") |
|||
raise Exception(str(e)) |
|||
|
|||
async def get(self, uri: str, params=None) -> Dict: |
|||
""" |
|||
GET 请求方法 |
|||
:param uri: 请求地址 |
|||
:param params: 参数 |
|||
:return: 返回结果 |
|||
""" |
|||
final_uri = uri |
|||
if isinstance(params, dict): |
|||
final_uri = (f"{uri}?" |
|||
f"{urlencode(params)}") |
|||
return await self.request(method="GET", url=F"{self._host}{final_uri}", headers=self.headers) |
|||
|
|||
async def post(self, uri: str, data: dict) -> Dict: |
|||
""" |
|||
POST 请求方法 |
|||
:param uri: 请求地址 |
|||
:param data: 参数 |
|||
:return: 返回结果 |
|||
""" |
|||
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False) |
|||
return await self.request(method="POST", url=F"{self._host}{uri}", |
|||
data=json_str, headers=self.headers) |
|||
|
|||
async def search(self, keyword, cur_page):
    """
    搜索
    :param keyword: 关键词
    :param cur_page: 页码
    :return:
    """
|||
# 接口地址 |
|||
uri = '/search-platform/front/search' |
|||
get_param = { |
|||
'key': keyword, |
|||
'startTime': 0, |
|||
'endTime': 0, |
|||
'hasContent': True, |
|||
'hasTitle': True, |
|||
'isFuzzy': False, # 精准匹配 |
|||
'limit': 10, |
|||
'page': cur_page, |
|||
'sortType': 0, |
|||
'type': 0 |
|||
} |
|||
chinese, not_chinese = count_characters(keyword) |
|||
# 长度 = 127+ 汉字*3 + 其他*1 |
|||
# 关键字部分 |
|||
content_length = 126 + (chinese * 3) + not_chinese + 1 # 如果精准匹配是False 加一字节 |
|||
# 页码部分 |
|||
chinese, not_chinese = count_characters(cur_page) |
|||
content_length = content_length + not_chinese |
|||
|
|||
logger.info(F"[人民网]请求长度: {content_length}") |
|||
logger.info(F"[人民网]参数: {get_param}") |
|||
self.headers['Content-Length'] = str(content_length) |
|||
content = await self.post(uri, get_param) |
|||
if not content or not content.get('records'): |
|||
return [] |
|||
return content.get('records', []) |
|||
|
|||
|
|||
if __name__ == '__main__': |
|||
client = RenMinClient(playwright_page=None, cookie_dict={}) |
|||
start, end = date_format.today_timestamp_long() |
|||
asyncio.run(client.search('乡村发展', 1)) |
|||
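The Content-Length arithmetic in search() above (126 plus 3 bytes per Chinese character, 1 byte per other character, one extra byte because isFuzzy is false, plus the digits of the page number) is intended to match the UTF-8 byte length of the JSON body that post() actually serializes. A small standalone sketch to sanity-check that assumption:

import json

from utils.utils import count_characters


def expected_length(keyword, cur_page):
    # RenMinClient.search() 中的手工计算
    chinese, other = count_characters(keyword)
    length = 126 + chinese * 3 + other + 1  # isFuzzy 为 False 时加一字节
    _, page_digits = count_characters(cur_page)
    return length + page_digits


def actual_length(keyword, cur_page):
    # RenMinClient.post() 的序列化方式: 紧凑分隔符, 不转义非 ASCII
    params = {'key': keyword, 'startTime': 0, 'endTime': 0,
              'hasContent': True, 'hasTitle': True, 'isFuzzy': False,
              'limit': 10, 'page': cur_page, 'sortType': 0, 'type': 0}
    body = json.dumps(params, separators=(',', ':'), ensure_ascii=False)
    return len(body.encode('utf-8'))


if __name__ == '__main__':
    print(expected_length('乡村振兴', 1), actual_length('乡村振兴', 1))  # 两者应一致

In practice httpx computes a Content-Length from the body it is given, so the hand-set header mainly matters if the upstream server compares it against its own expectation.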
@ -0,0 +1,19 @@ |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
from httpx import RequestError |
|||
|
|||
|
|||
class DataFetchError(RequestError): |
|||
"""未知异常""" |
|||
def __init__(self, message, url, method="GET", params=None): |
|||
self.message = message |
|||
self.url = url |
|||
self.method = method |
|||
self.params = params |
|||
|
|||
def __str__(self): |
|||
return self.message |
|||
|
|||
|
|||
class IPBlockError(RequestError): |
|||
"""ip被封禁异常""" |
|||
@ -0,0 +1,188 @@ |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
from playwright.async_api import async_playwright, Page, BrowserType, BrowserContext |
|||
|
|||
from base.base_spider import AbstractSpider |
|||
from typing import Dict, List, Optional, Tuple |
|||
from .client import RenMinClient |
|||
from utils.utils import logger, is_blank |
|||
from models.monitor_task_model import get_task, running, complete, fail |
|||
from models.monitor_result_model import gen_result, save |
|||
from base.enums import Platform |
|||
import utils.date_format as date_format |
|||
import os |
|||
import config |
|||
import uuid |
|||
from .exception import DataFetchError |
|||
import utils.mail as mail |
|||
import asyncio |
|||
from tortoise.transactions import in_transaction |
|||
|
|||
|
|||
class RenMinSpider(AbstractSpider): |
|||
""" |
|||
人民网爬虫 |
|||
""" |
|||
client: RenMinClient # 请求对象 |
|||
context_page: Page # 浏览器页面上下文 |
|||
browser_context: BrowserContext # 浏览器上下文 |
|||
image_path: str |
|||
|
|||
def __init__(self): |
|||
self.index_url = "http://www.people.com.cn/" |
|||
self.platform = Platform.REN_MIN |
|||
self.image_path = None |
|||
self.retry = 0 # 自旋次数 |
|||
|
|||
def init_config(self): |
|||
super().init_config() |
|||
|
|||
async def start(self, task_id): |
|||
try: |
|||
async with in_transaction(): |
|||
await self.do_spider(task_id) |
|||
except DataFetchError as e: |
|||
logger.error(F"[人民网]任务ID: {task_id} 获取数据异常") |
|||
logger.error(F"[人民网]任务ID: {task_id} 异常信息: {str(e)}") |
|||
# 尝试自旋 |
|||
self.retry = self.retry + 1 |
|||
if self.retry > 3: |
|||
await fail(task_id) |
|||
logger.error(F"[人民网]任务ID: {task_id} 重试达到最大次数 即将发送告警邮件") |
|||
await mail.send_post_mail(task_id, "人民网", str(e)) |
|||
else: |
|||
logger.info(F"[人民网]任务ID: {task_id} 20秒后进行第{self.retry}次重试") |
|||
await asyncio.sleep(20) |
|||
await self.do_spider(task_id) |
|||
except Exception as e: |
|||
logger.error(F"[人民网]任务ID: {task_id} 爬虫异常") |
|||
logger.error(F"[人民网]任务ID: {task_id} 异常信息: {str(e)}") |
|||
# 切换代理ip并自旋 |
|||
# 尝试自旋 |
|||
self.retry = self.retry + 1 |
|||
if self.retry > 3: |
|||
await fail(task_id) |
|||
logger.error(F"[人民网]任务ID: {task_id} 重试达到最大次数 即将发送告警邮件") |
|||
await mail.send_post_mail(task_id, "人民网", str(e)) |
|||
else: |
|||
logger.info(F"[人民网]任务ID: {task_id} 20秒后进行第{self.retry}次重试") |
|||
await asyncio.sleep(20) |
|||
await self.do_spider(task_id) |
|||
|
|||
async def create_client(self) -> RenMinClient: |
|||
return RenMinClient(playwright_page=None, cookie_dict={}) |
|||
|
|||
async def launch_browser(self, |
|||
chromium: BrowserType, |
|||
playwright_proxy: Optional[Dict], |
|||
user_agent: Optional[Dict], |
|||
headless: bool = True): |
|||
""" |
|||
启动一个浏览器上下文 |
|||
:param chromium: |
|||
:param headless: |
|||
:param self: 类型 |
|||
:param playwright_proxy: 代理 |
|||
:param user_agent: 用户标识 |
|||
:return: |
|||
""" |
|||
# 浏览器对象 |
|||
browser = await chromium.launch(proxy=playwright_proxy, headless=headless) |
|||
|
|||
# 浏览器上下文 |
|||
browser_context = await browser.new_context( |
|||
viewport={"width": 1920, "height": 1080}, |
|||
user_agent=user_agent |
|||
) |
|||
return browser_context |
|||
|
|||
async def do_search(self, task): |
|||
""" |
|||
获取任务信息 |
|||
:return: |
|||
""" |
|||
start, end = date_format.today_timestamp_long() # 开始结束时间 |
|||
results = [] |
|||
cur_page = 1 |
|||
logger.info(F"[人民网]开始执行任务 ID: {task.id} 关键词: {task.keyword} 语言: {task.lang}") |
|||
self.client = await self.create_client() |
|||
while True: |
|||
logger.info(F"[人民网]开始获取搜索结果 关键词: {task.keyword} 页码: {cur_page}") |
|||
search_datas = await self.client.search(task.keyword, cur_page) |
|||
logger.info(F"[人民网]获取到{len(search_datas)}条搜索结果") |
|||
if not search_datas: |
|||
logger.info(F"[人民网]关键词: {task.keyword} 页码: {cur_page}没有搜索到数据") |
|||
break |
|||
index = -1 |
|||
for i, data in enumerate(search_datas): |
|||
# 找到一个不是今天的数据就结束 |
|||
if not date_format.is_today(date_format.timestamp2date(data.get("displayTime")).strftime("%Y-%m-%d")): |
|||
index = i |
|||
break |
|||
# 切割 |
|||
if index == -1: |
|||
# 搜索结果的最后一个依然是今天的 整个添加 |
|||
results = results + search_datas |
|||
# 翻到下一页 继续找 |
|||
cur_page = cur_page + 1 |
|||
else: |
|||
# 搜索结果中有不是今天的 切割一部分添加 |
|||
results = results + search_datas[:index] |
|||
# 结束本次搜索 |
|||
break |
|||
logger.info(F"[人民网]关键词:{task.keyword} 搜索结束 总页码: {cur_page} 总条数: {len(results)}") |
|||
return results |
|||
|
|||
async def cut_screen(self, url): |
|||
""" |
|||
网页截图 |
|||
:param url: 地址 |
|||
:return: |
|||
""" |
|||
if not self.image_path:
    image_path = config.IMAGE_PATH
    # 配置为空时使用默认目录, 否则使用配置的目录
    self.image_path = "./data" if is_blank(image_path) else image_path
|||
if not os.path.exists(self.image_path): |
|||
os.makedirs(self.image_path) |
|||
save_path = F"{self.image_path}/{uuid.uuid4()}.png" |
|||
# 开始截图 |
|||
await self.context_page.goto(url) |
|||
await self.context_page.screenshot(path=save_path, full_page=True) |
|||
return save_path |
|||
|
|||
async def do_spider(self, task_id): |
|||
# 获取任务信息 |
|||
task = await get_task(task_id) |
|||
if not task: |
|||
logger.error(F"[人民网]任务ID: {task_id}不存在 任务结束") |
|||
return |
|||
logger.info(F"[人民网]任务ID: {task_id} 任务开始") |
|||
await running(task_id) |
|||
# 从api中获取数据 |
|||
search_datas = await self.do_search(task) |
|||
if not search_datas: |
|||
logger.info(F"[人民网]任务ID: {task_id} 关键词:{task.keyword} 未搜索到结果 任务结束") |
|||
await complete(task_id) |
|||
return |
|||
# 保存result实体 |
|||
results = [] |
|||
# 启动浏览器 |
|||
async with async_playwright() as playwright: |
|||
chromium = playwright.chromium |
|||
self.browser_context = await self.launch_browser(chromium, None, None, headless=True) |
|||
# 反反爬脚本 |
|||
await self.browser_context.add_init_script(path="lib/stealth.min.js") |
|||
self.context_page: Page = await self.browser_context.new_page() |
|||
# 构建结果实体 截图 |
|||
for data in search_datas: |
|||
result = gen_result(task, data.get("title"), data.get("url"), int(data.get("displayTime") / 1000)) |
|||
# img_path = await self.cut_screen(data.get("url")) |
|||
# result.image = img_path |
|||
results.append(result) |
|||
# logger.info(F"[人民网]标题: {data.get('title')} 截图文件名: {img_path}") |
|||
|
|||
# 结果落库 |
|||
await save(results) |
|||
logger.info(F"[人民网]任务ID: {task_id} 关键词: {task.keyword} 保存{len(results)}条数据 任务结束") |
|||
await complete(task_id) |
|||
@ -0,0 +1,4 @@ |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
|
|||
|
|||
@ -0,0 +1,114 @@ |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
import json |
|||
from typing import Dict |
|||
from urllib.parse import urlencode |
|||
from .exception import DataFetchError |
|||
import httpx |
|||
from playwright.async_api import Page |
|||
from httpx._exceptions import HTTPError |
|||
from utils.utils import logger |
|||
import asyncio |
|||
import utils.proxy as proxy |
|||
import config |
|||
|
|||
|
|||
class XinHuaClient: |
|||
def __init__(self, |
|||
timeout=10, |
|||
*, |
|||
headers: Dict[str, str], |
|||
playwright_page: Page, |
|||
cookie_dict: Dict[str, str]): |
|||
self.timeout = timeout |
|||
self.headers = headers |
|||
self._host = "https://so.news.cn/" |
|||
self.playwright_page = playwright_page |
|||
self.cookie_dict = cookie_dict |
|||
|
|||
async def request(self, method, url, **kwargs): |
|||
""" |
|||
请求方法 |
|||
:param method: 请求方法 |
|||
:param url: 地址 |
|||
:param kwargs: 参数 |
|||
:return: 返回结果 |
|||
""" |
|||
# api代理 |
|||
proxies = proxy.get_ip().to_httpx_proxies() if config.API_PROXY else None |
|||
try: |
|||
async with httpx.AsyncClient(proxies=proxies) as client: |
|||
response = await client.request( |
|||
method, url, timeout=self.timeout, |
|||
**kwargs |
|||
) |
|||
# 返回不正确的状态码 |
|||
if not response.status_code == 200: |
|||
logger.error(F"[新华网]httpx异常[{response.status_code}]: [{method}]{url} 参数: {kwargs}") |
|||
raise DataFetchError("httpx异常", url, method, kwargs) |
|||
# 返回正确的状态码 |
|||
data: Dict = response.json() |
|||
if data.get("code") != 200: |
|||
# 有特殊情况 敏感词会直接把content返回为没有找到相关稿件 |
|||
if data.get("content") == '没有找到相关稿件': |
|||
logger.warning(F"[新华网]触发敏感词 跳过请求 参数: {kwargs}") |
|||
return {} |
|||
raise DataFetchError(data.get("content", "API未知错误"), url, method, kwargs) |
|||
else: |
|||
return data.get("content", {}) |
|||
except HTTPError as e: |
|||
logger.error(F"[新华网]httpx异常: [{method}]{url} 参数: {kwargs}") |
|||
logger.error(F"[新华网]错误信息{str(e)}") |
|||
raise DataFetchError(str(e), url) |
|||
except Exception as e: |
|||
logger.error(F"[新华网]未知的请求方法异常: [{method}]{url} 参数: {kwargs}") |
|||
logger.error(F"[新华网]错误信息{str(e)}") |
|||
raise Exception(str(e)) |
|||
|
|||
async def get(self, uri: str, params=None) -> Dict: |
|||
""" |
|||
GET 请求方法 |
|||
:param uri: 请求地址 |
|||
:param params: 参数 |
|||
:return: 返回结果 |
|||
""" |
|||
final_uri = uri |
|||
if isinstance(params, dict): |
|||
final_uri = (f"{uri}?" |
|||
f"{urlencode(params)}") |
|||
return await self.request(method="GET", url=F"{self._host}{final_uri}", headers=self.headers) |
|||
|
|||
async def post(self, uri: str, data: dict) -> Dict: |
|||
""" |
|||
POST 请求方法 |
|||
:param uri: 请求地址 |
|||
:param data: 参数 |
|||
:return: 返回结果 |
|||
""" |
|||
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False) |
|||
return await self.request(method="POST", url=F"{self._host}{uri}", |
|||
data=json_str, headers=self.headers) |
|||
|
|||
async def search(self, keyword, cur_page, lang='cn', sort_field=0, search_fields=0): |
|||
""" |
|||
搜索 |
|||
:param lang: |
|||
:param keyword: 关键词 |
|||
:param cur_page: 页码 |
|||
:param sort_field: 排序 0: 相关度 1: 时间 |
|||
:param search_fields: 搜索类型: 0: 全文 1: 标题 |
|||
:return: |
|||
""" |
|||
# 接口地址 |
|||
uri = '/getNews' |
|||
get_param = { |
|||
'keyword': keyword, |
|||
'curPage': cur_page, |
|||
'sortField': sort_field, |
|||
'searchFields': search_fields, |
|||
'lang': lang |
|||
} |
|||
content = await self.get(uri, get_param) |
|||
if not content or not content.get('results'): |
|||
return [] |
|||
return content.get('results', []) |
|||
@ -0,0 +1,19 @@ |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
from httpx import RequestError |
|||
|
|||
|
|||
class DataFetchError(RequestError): |
|||
"""未知异常""" |
|||
def __init__(self, message, url, method="GET", params=None): |
|||
self.message = message |
|||
self.url = url |
|||
self.method = method |
|||
self.params = params |
|||
|
|||
def __str__(self): |
|||
return self.message |
|||
|
|||
|
|||
class IPBlockError(RequestError): |
|||
"""ip被封禁异常""" |
|||
@ -0,0 +1,201 @@ |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
from playwright.async_api import async_playwright, Page, BrowserType, BrowserContext |
|||
|
|||
from base.base_spider import AbstractSpider |
|||
from typing import Dict, List, Optional, Tuple |
|||
from .client import XinHuaClient |
|||
from utils.utils import logger, is_blank |
|||
from models.monitor_task_model import get_task, running, complete, fail |
|||
from models.monitor_result_model import gen_result, save |
|||
from base.enums import Platform |
|||
import utils.date_format as date_format |
|||
import os |
|||
import config |
|||
import uuid |
|||
from .exception import DataFetchError |
|||
import asyncio |
|||
import utils.mail as mail |
|||
from tortoise.transactions import in_transaction |
|||
|
|||
|
|||
class XinHuaSpider(AbstractSpider): |
|||
""" |
|||
新华网爬虫 |
|||
""" |
|||
client: XinHuaClient # 请求对象 |
|||
context_page: Page # 浏览器页面上下文 |
|||
browser_context: BrowserContext # 浏览器上下文 |
|||
image_path: str |
|||
|
|||
def __init__(self): |
|||
self.index_url = "http://www.xinhuanet.com/" |
|||
self.platform = Platform.XIN_HUA |
|||
self.image_path = None |
|||
self.retry = 0 # 自旋次数 |
|||
self.context_page = None |
|||
|
|||
def init_config(self): |
|||
super().init_config() |
|||
|
|||
async def start(self, task_id): |
|||
try: |
|||
async with in_transaction(): |
|||
await self.do_spider(task_id) |
|||
except DataFetchError as e: |
|||
logger.error(F"[新华网]任务ID: {task_id} 获取数据异常") |
|||
logger.error(F"[新华网]任务ID: {task_id} 异常信息: {str(e)}") |
|||
# 尝试自旋 |
|||
self.retry = self.retry + 1 |
|||
if self.retry > 3: |
|||
await fail(task_id) |
|||
logger.error(F"[新华网]任务ID: {task_id} 重试达到最大次数 即将发送告警邮件") |
|||
await mail.send_post_mail(task_id, "新华网", str(e)) |
|||
else: |
|||
logger.info(F"[新华网]任务ID: {task_id} 20秒后进行第{self.retry}次重试") |
|||
await asyncio.sleep(20) |
|||
await self.do_spider(task_id) |
|||
except Exception as e: |
|||
logger.error(F"[新华网]任务ID: {task_id} 爬虫异常") |
|||
logger.error(F"[新华网]任务ID: {task_id} 异常信息: {str(e)}") |
|||
# 尝试自旋
self.retry = self.retry + 1
if self.retry > 3:
    await fail(task_id)
    logger.error(F"[新华网]任务ID: {task_id} 重试达到最大次数 即将发送告警邮件")
    await mail.send_post_mail(task_id, "新华网", str(e))
|||
else: |
|||
logger.info(F"[新华网]任务ID: {task_id} 20秒后进行第{self.retry}次重试") |
|||
await asyncio.sleep(20) |
|||
await self.do_spider(task_id) |
|||
|
|||
async def create_xinhua_client(self, httpx_proxy: Optional[str]) -> XinHuaClient: |
|||
# 请求头 |
|||
headers = { |
|||
"Accept": "application/json, text/javascript, */*; q=0.01", "Accept-Encoding": "gzip, deflate, br, zstd", |
|||
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", "Cache-Control": "no-cache", "Connection": "keep-alive", |
|||
"Cookie": "org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=zh_CN; wdcid=7af5eba7b2f8b44b; arialoadData=false; acw_tc=2760778017108394678246790e1403779a009cc2c5fe412f126407bf171637", |
|||
"Host": "so.news.cn", "Pragma": "no-cache", "Referer": "https://so.news.cn/", "Sec-Fetch-Dest": "empty", |
|||
"Sec-Fetch-Mode": "cors", "Sec-Fetch-Site": "same-origin", |
|||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", |
|||
"X-Requested-With": "XMLHttpRequest", |
|||
"sec-ch-ua": "\"Chromium\";v=\"122\", \"Not(A:Brand\";v=\"24\", \"Google Chrome\";v=\"122\"", |
|||
"sec-ch-ua-mobile": "?0", "sec-ch-ua-platform": "\"Windows\""} |
|||
client = XinHuaClient(headers=headers, cookie_dict=None, playwright_page=self.context_page) |
|||
return client |
|||
|
|||
async def launch_browser(self, |
|||
chromium: BrowserType, |
|||
playwright_proxy: Optional[Dict], |
|||
user_agent: Optional[Dict], |
|||
headless: bool = True): |
|||
""" |
|||
启动一个浏览器上下文 |
|||
:param chromium: |
|||
:param headless: |
|||
:param self: 类型 |
|||
:param playwright_proxy: 代理 |
|||
:param user_agent: 用户标识 |
|||
:return: |
|||
""" |
|||
# 浏览器对象 |
|||
browser = await chromium.launch(proxy=playwright_proxy, headless=headless) |
|||
|
|||
# 浏览器上下文 |
|||
browser_context = await browser.new_context( |
|||
viewport={"width": 1920, "height": 1080}, |
|||
user_agent=user_agent |
|||
) |
|||
return browser_context |
|||
|
|||
async def do_search(self, task): |
|||
""" |
|||
获取任务信息 |
|||
:return: |
|||
""" |
|||
results = [] |
|||
cur_page = 1 |
|||
logger.info(F"[新华网]开始执行任务 ID: {task.id} 关键词: {task.keyword} 语言: {task.lang}") |
|||
self.client = await self.create_xinhua_client(None) |
|||
while True: |
|||
logger.info(F"[新华网]开始获取搜索结果 关键词: {task.keyword} 页码: {cur_page}") |
|||
search_datas = await self.client.search(keyword=task.keyword, cur_page=cur_page, lang=task.lang) |
|||
logger.info(F"[新华网]获取到{len(search_datas)}条搜索结果") |
|||
if not search_datas: |
|||
logger.info(F"[新华网]关键词: {task.keyword} 页码: {cur_page}没有搜索到数据") |
|||
break |
|||
index = -1 |
|||
for i, data in enumerate(search_datas): |
|||
# 找到一个不是今天的数据就结束 |
|||
if not date_format.is_today(data.get("pubtime")): |
|||
index = i |
|||
break |
|||
# 如果全都是今天的 就翻页 |
|||
if index == -1: |
|||
# 搜索结果的最后一个依然是今天的 整个添加 |
|||
results = results + search_datas |
|||
# 翻到下一页 继续找 |
|||
cur_page = cur_page + 1 |
|||
else: |
|||
# 搜索结果中有不是今天的 切割一部分添加 |
|||
results = results + search_datas[:index] |
|||
# 结束本次搜索 |
|||
break |
|||
logger.info(F"[新华网]关键词: {task.keyword} 搜索结束 总页码: {cur_page} 总条数: {len(results)}") |
|||
return results |
|||
|
|||
async def cut_screen(self, url): |
|||
""" |
|||
网页截图 |
|||
:param url: 地址 |
|||
:return: |
|||
""" |
|||
if not self.image_path:
    image_path = config.IMAGE_PATH
    # 配置为空时使用默认目录, 否则使用配置的目录
    self.image_path = "./data" if is_blank(image_path) else image_path
|||
if not os.path.exists(self.image_path): |
|||
os.makedirs(self.image_path) |
|||
save_path = F"{self.image_path}/{uuid.uuid4()}.png" |
|||
# 开始截图 |
|||
await self.context_page.goto(url) |
|||
await self.context_page.screenshot(path=save_path, full_page=True) |
|||
return save_path |
|||
|
|||
async def do_spider(self, task_id): |
|||
# 获取任务信息 |
|||
task = await get_task(task_id) |
|||
if not task: |
|||
logger.error(F"[新华网]任务ID: {task_id}不存在 任务结束") |
|||
return |
|||
logger.info(F"[新华网]任务ID: {task_id} 任务开始") |
|||
await running(task_id) |
|||
# 从api中获取数据 |
|||
search_datas = await self.do_search(task) |
|||
if not search_datas: |
|||
logger.info(F"[新华网]任务ID: {task_id} 关键词:{task.keyword} 未搜索到结果 任务结束") |
|||
await complete(task_id) |
|||
return |
|||
# 保存result实体 |
|||
results = [] |
|||
# 启动浏览器 |
|||
async with async_playwright() as playwright: |
|||
chromium = playwright.chromium |
|||
self.browser_context = await self.launch_browser(chromium, None, None, headless=True) |
|||
# 反反爬脚本 |
|||
await self.browser_context.add_init_script(path="lib/stealth.min.js") |
|||
self.context_page: Page = await self.browser_context.new_page() |
|||
|
|||
# 构建结果实体 截图 |
|||
for data in search_datas: |
|||
result = gen_result(task, data.get("title"), data.get("url"), data.get("pubtime")) |
|||
# img_path = await self.cut_screen(data.get("url")) |
|||
# result.image = img_path |
|||
results.append(result) |
|||
# logger.info(F"[新华网]标题: {data.get('title')} 截图文件名: {img_path}") |
|||
|
|||
# 结果落库 |
|||
await save(results) |
|||
logger.info(F"[新华网]任务ID: {task_id} 关键词: {task.keyword} 保存{len(results)}条数据 任务结束") |
|||
await complete(task_id) |
|||
|
|||
@ -0,0 +1 @@ |
|||
# -*- coding: utf-8 -*- |
|||
@ -0,0 +1,160 @@ |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
import json |
|||
from typing import Dict |
|||
from urllib.parse import urlencode |
|||
import httpx |
|||
from playwright.async_api import Page |
|||
from .exception import DataFetchError |
|||
import asyncio |
|||
import json |
|||
import utils.date_format as date_format |
|||
from utils.utils import count_characters |
|||
from playwright.async_api import async_playwright |
|||
import asyncio |
|||
from utils.utils import logger |
|||
|
|||
|
|||
class YangShiClient: |
|||
def __init__(self, |
|||
timeout=60, |
|||
proxies=None, |
|||
*, |
|||
playwright_page: Page, |
|||
cookie_dict: Dict[str, str]): |
|||
self.proxies = proxies |
|||
self.timeout = timeout |
|||
self.headers = { |
|||
"Accept": "application/json, text/plain, */*", |
|||
"Accept-Encoding": "gzip, deflate", |
|||
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", |
|||
"Cache-Control": "no-cache", |
|||
"Connection": "keep-alive", |
|||
"Cookie": "__jsluid_h=103d2323e283c476b59b2fdd3b9a5371; sso_c=0; sfr=1", |
|||
"Host": "search.people.cn", |
|||
"Content-Length": "163", |
|||
"Content-Type": "application/json", |
|||
"Origin": "http://search.people.cn", |
|||
"Pragma": "no-cache", |
|||
"Referer": "http://search.people.cn/s?keyword=%E4%B9%A1%E6%9D%91%E6%8C%AF%E5%85%B4&st=0&_=1710919073824", |
|||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" |
|||
} |
|||
self._host = "https://search.cctv.com/" |
|||
self.playwright_page = playwright_page |
|||
self.cookie_dict = cookie_dict |
|||
|
|||
async def request(self, method, url, **kwargs): |
|||
""" |
|||
请求方法 |
|||
:param method: 请求方法 |
|||
:param url: 地址 |
|||
:param kwargs: 参数 |
|||
:return: 返回结果 |
|||
""" |
|||
async with httpx.AsyncClient(proxies=self.proxies) as client: |
|||
response = await client.request( |
|||
method, url, timeout=self.timeout, |
|||
**kwargs |
|||
) |
|||
data: Dict = response.json() |
|||
if data.get("code") != "0": |
|||
raise DataFetchError(data.get("message", "未知错误"), url)
|||
else: |
|||
return data.get("data", {}) |
|||
|
|||
async def get(self, uri: str, params=None) -> Dict: |
|||
""" |
|||
GET 请求方法 |
|||
:param uri: 请求地址 |
|||
:param params: 参数 |
|||
:return: 返回结果 |
|||
""" |
|||
final_uri = uri |
|||
if isinstance(params, dict): |
|||
final_uri = (f"{uri}?" |
|||
f"{urlencode(params)}") |
|||
return await self.request(method="GET", url=F"{self._host}{final_uri}", headers=self.headers) |
|||
|
|||
async def post(self, uri: str, data: dict) -> Dict: |
|||
""" |
|||
POST 请求方法 |
|||
:param uri: 请求地址 |
|||
:param data: 参数 |
|||
:return: 返回结果 |
|||
""" |
|||
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False) |
|||
return await self.request(method="POST", url=F"{self._host}{uri}", |
|||
data=json_str, headers=self.headers) |
|||
|
|||
async def search(self, keyword, cur_page): |
|||
""" |
|||
搜索 |
|||
:param keyword: 关键词 |
|||
:param cur_page: 页码 |
|||
:return: |
|||
""" |
|||
# 接口地址 |
|||
uri = F"/search.php?qtext={keyword}&page={cur_page}&type=web&sort=date&datepid=1&channel=&vtime=-1&is_search=1" |
|||
full_url = F"{self._host}{uri}" |
|||
try: |
|||
await self.playwright_page.goto(full_url) |
|||
results = [] |
|||
# 选择每一个结果元素 |
|||
elements = await self.playwright_page.query_selector_all("div.tright") |
|||
for element in elements: |
|||
title = "" |
|||
url = "" |
|||
publish_time = "" |
|||
# 标题元素 |
|||
tit = await element.query_selector(".tit") |
|||
if tit: |
|||
# 标题下面的链接 |
|||
span = await tit.query_selector("span") |
|||
url = await span.get_attribute("lanmu1") |
|||
|
|||
# 存放标题的a标签 |
|||
tit_a = await span.query_selector("a") |
|||
if tit_a: |
|||
title = await tit_a.inner_text() |
|||
# 发布时间元素 |
|||
tim = await element.query_selector(".src-tim .tim") |
|||
if tim: |
|||
tim_text = await tim.inner_text() |
|||
publish_time = tim_text.split(":")[1] |
|||
# 保存数据 |
|||
results.append({ |
|||
"keyword": keyword, |
|||
"title": title, |
|||
"url": url, |
|||
"publish_time": publish_time |
|||
}) |
|||
return results |
|||
except Exception as e: |
|||
logger.error(F"[央视网]搜索方法异常: 关键词: {keyword} 页码: {cur_page} {full_url}") |
|||
logger.error(F"[央视网]错误信息: {str(e)}") |
|||
raise DataFetchError(str(e), full_url) |
|||
|
|||
|
|||
async def run():
    # 启动浏览器
    async with async_playwright() as playwright:
        chromium = playwright.chromium
|||
browser = await chromium.launch(headless=False) |
|||
# 浏览器上下文 |
|||
browser_context = await browser.new_context( |
|||
viewport={"width": 1920, "height": 1080}, |
|||
user_agent="" |
|||
) |
|||
# 反反爬脚本 |
|||
await browser_context.add_init_script(path="../../lib/stealth.min.js") |
|||
context_page: Page = await browser_context.new_page() |
|||
|
|||
# 创建对象 |
|||
client = YangShiClient(playwright_page=context_page, cookie_dict={}) |
|||
result = await client.search("医保", 1) |
|||
print(result) |
|||
|
|||
|
|||
if __name__ == '__main__': |
|||
asyncio.get_event_loop().run_until_complete(run()) |
|||
@ -0,0 +1,19 @@ |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
from httpx import RequestError |
|||
|
|||
|
|||
class DataFetchError(RequestError): |
|||
"""未知异常""" |
|||
def __init__(self, message, url, method="GET", params=None): |
|||
self.message = message |
|||
self.url = url |
|||
self.method = method |
|||
self.params = params |
|||
|
|||
def __str__(self): |
|||
return self.message |
|||
|
|||
|
|||
class IPBlockError(RequestError): |
|||
"""ip被封禁异常""" |
|||
@ -0,0 +1,185 @@ |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
from playwright.async_api import async_playwright, Page, BrowserType, BrowserContext |
|||
|
|||
from base.base_spider import AbstractSpider |
|||
from typing import Dict, List, Optional, Tuple |
|||
from .client import YangShiClient |
|||
from utils.utils import logger, is_blank |
|||
from models.monitor_task_model import get_task, running, complete, fail |
|||
from models.monitor_result_model import gen_result, save |
|||
from base.enums import Platform |
|||
import utils.date_format as date_format |
|||
import os |
|||
import config |
|||
import uuid |
|||
from .exception import DataFetchError |
|||
import utils.mail as mail |
|||
import asyncio |
|||
from tortoise.transactions import in_transaction |
|||
|
|||
|
|||
class YangShiSpider(AbstractSpider): |
|||
""" |
|||
央视网爬虫 |
|||
""" |
|||
client: YangShiClient # 请求对象 |
|||
context_page: Page # 浏览器页面上下文 |
|||
browser_context: BrowserContext # 浏览器上下文 |
|||
image_path: str |
|||
|
|||
def __init__(self): |
|||
self.index_url = "https://tv.cctv.com/" |
|||
self.platform = Platform.YANG_SHI |
|||
self.image_path = None |
|||
self.retry = 0 # 自旋次数 |
|||
|
|||
def init_config(self): |
|||
super().init_config() |
|||
|
|||
async def start(self, task_id): |
|||
try: |
|||
async with in_transaction(): |
|||
await self.do_spider(task_id) |
|||
except DataFetchError as e: |
|||
logger.error(F"[央视网]任务ID: {task_id} 获取数据异常") |
|||
logger.error(F"[央视网]任务ID: {task_id} 异常信息: {str(e)}") |
|||
# 尝试自旋 |
|||
self.retry = self.retry + 1 |
|||
if self.retry > 3: |
|||
await fail(task_id) |
|||
logger.error(F"[央视网]任务ID: {task_id} 重试达到最大次数 即将发送告警邮件") |
|||
await mail.send_post_mail(task_id, "央视网", str(e)) |
|||
else: |
|||
logger.info(F"[央视网]任务ID: {task_id} 20秒后进行第{self.retry}次重试") |
|||
await asyncio.sleep(20) |
|||
await self.do_spider(task_id) |
|||
except Exception as e: |
|||
logger.error(F"[央视网]任务ID: {task_id} 爬虫异常") |
|||
logger.error(F"[央视网]任务ID: {task_id} 异常信息: {str(e)}") |
|||
# 切换代理ip并自旋
|||
# 尝试自旋 |
|||
self.retry = self.retry + 1 |
|||
if self.retry > 3: |
|||
await fail(task_id) |
|||
logger.error(F"[央视网]任务ID: {task_id} 重试达到最大次数 即将发送告警邮件") |
|||
await mail.send_post_mail(task_id, "央视网", str(e)) |
|||
else: |
|||
logger.info(F"[央视网]任务ID: {task_id} 20秒后进行第{self.retry}次重试") |
|||
await asyncio.sleep(20) |
|||
await self.do_spider(task_id) |
|||
|
|||
async def create_client(self, httpx_proxy: Optional[str]) -> YangShiClient: |
|||
# 请求头 |
|||
client = YangShiClient(proxies=httpx_proxy, cookie_dict={}, playwright_page=self.context_page) |
|||
return client |
|||
|
|||
async def launch_browser(self, |
|||
chromium: BrowserType, |
|||
playwright_proxy: Optional[Dict], |
|||
user_agent: Optional[Dict], |
|||
headless: bool = True): |
|||
""" |
|||
启动一个浏览器上下文 |
|||
:param chromium: |
|||
:param headless: |
|||
:param self: 类型 |
|||
:param playwright_proxy: 代理 |
|||
:param user_agent: 用户标识 |
|||
:return: |
|||
""" |
|||
# 浏览器对象 |
|||
browser = await chromium.launch(proxy=playwright_proxy, headless=headless) |
|||
|
|||
# 浏览器上下文 |
|||
browser_context = await browser.new_context( |
|||
viewport={"width": 1920, "height": 1080}, |
|||
user_agent=user_agent |
|||
) |
|||
# 反反爬脚本 |
|||
await browser_context.add_init_script(path="lib/stealth.min.js") |
|||
context_page = await browser_context.new_page() |
|||
return browser_context, context_page |
|||
|
|||
async def do_search(self, task): |
|||
""" |
|||
获取任务信息 |
|||
:return: |
|||
""" |
|||
results = [] |
|||
cur_page = 1 |
|||
logger.info(F"[央视网]开始执行任务 ID: {task.id} 关键词: {task.keyword} 语言: {task.lang}") |
|||
self.client = await self.create_client(None) |
|||
while True: |
|||
logger.info(F"[央视网]任务ID: {task.id} 开始获取搜索结果 关键词: {task.keyword} 页码: {cur_page}") |
|||
search_datas = await self.client.search(keyword=task.keyword, cur_page=cur_page) |
|||
logger.info(F"[央视网]任务ID: {task.id} 获取到{len(search_datas)}条搜索结果") |
|||
if not search_datas: |
|||
logger.info(F"[央视网]任务ID: {task.id} 关键词: {task.keyword} 页码: {cur_page}没有搜索到数据") |
|||
break |
|||
index = -1 |
|||
for i, data in enumerate(search_datas): |
|||
# 找到一个不是今天的数据就结束 |
|||
if not date_format.is_today(data.get("publish_time")): |
|||
index = i |
|||
break |
|||
# 切割 |
|||
if index == -1: |
|||
# 搜索结果的最后一个依然是今天的 整个添加 |
|||
results = results + search_datas |
|||
# 翻到下一页 继续找 |
|||
cur_page = cur_page + 1 |
|||
else: |
|||
# 搜索结果中有不是今天的 切割一部分添加 |
|||
results = results + search_datas[:index] |
|||
# 结束本次搜索 |
|||
break |
|||
logger.info(F"[央视网]任务ID: {task.id} 关键词: {task.keyword} 搜索结束 总页码: {cur_page} 总条数: {len(results)}") |
|||
return results |
|||
|
|||
async def cut_screen(self, url): |
|||
""" |
|||
网页截图 |
|||
:param url: 地址 |
|||
:return: |
|||
""" |
|||
if not self.image_path:
    image_path = config.IMAGE_PATH
    # 配置为空时使用默认目录, 否则使用配置的目录
    self.image_path = "./data" if is_blank(image_path) else image_path
|||
if not os.path.exists(self.image_path): |
|||
os.makedirs(self.image_path) |
|||
save_path = F"{self.image_path}/{uuid.uuid4()}.png" |
|||
# 开始截图 |
|||
await self.context_page.goto(url) |
|||
await self.context_page.screenshot(path=save_path, full_page=True) |
|||
return save_path |
|||
|
|||
async def do_spider(self, task_id): |
|||
# 获取任务信息 |
|||
task = await get_task(task_id) |
|||
if not task: |
|||
logger.error(F"[央视网]任务ID: {task_id}不存在 任务结束") |
|||
return |
|||
logger.info(F"[央视网]任务ID: {task_id} 任务开始") |
|||
await running(task_id) |
|||
results = [] |
|||
# 启动浏览器 |
|||
async with async_playwright() as playwright: |
|||
chromium = playwright.chromium |
|||
self.browser_context, self.context_page = await self.launch_browser(chromium, None, None, headless=True) |
|||
# 创建请求客户端 |
|||
search_datas = await self.do_search(task) |
|||
# 构建结果实体 截图 |
|||
for data in search_datas: |
|||
result = gen_result(task, data.get("title"), data.get("url"), data.get("publish_time")) |
|||
# img_path = await self.cut_screen(data.get("url")) |
|||
# result.image = img_path |
|||
results.append(result) |
|||
# logger.info(F"[央视网] 任务ID: {task_id} 标题: {data.get('title')} 截图文件名: {img_path}") |
|||
# 结果落库 |
|||
await save(results) |
|||
logger.info(F"[央视网] 任务ID: {task_id} 关键词: {task.keyword} 保存{len(results)}条数据 任务结束") |
|||
await complete(task_id) |
|||
@ -0,0 +1 @@ |
|||
# -*- coding: utf-8 -*- |
|||
@ -0,0 +1,102 @@ |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
from dateutil.parser import parse |
|||
from dateutil.relativedelta import relativedelta |
|||
from datetime import datetime, timedelta |
|||
import time |
|||
|
|||
|
|||
def gen_job_datetime(time_str, date_str=''): |
|||
""" |
|||
生成任务的时间和日期 |
|||
:param time_str: 时间 |
|||
:param date_str: 日期 |
|||
:return: |
|||
""" |
|||
if not time_str: |
|||
return None |
|||
return parse(date_str + time_str) |
|||
|
|||
|
|||
def gen_today_str(): |
|||
today = datetime.today() |
|||
return today.strftime("%Y-%m-%d") |
|||
|
|||
|
|||
def timestamp(): |
|||
return int(time.time()) |
|||
|
|||
|
|||
def is_today(date_str): |
|||
publish_date = parse(date_str).date() # 获取日期部分,忽略时间部分 |
|||
today = datetime.today().date() # 获取今天的日期,忽略时间部分 |
|||
# 检查日期是否相等 |
|||
return publish_date == today |
|||
|
|||
|
|||
def timestamp2date(timestamp_long):
    # 按本地时区转换毫秒时间戳, 与 is_today() 的本地日期比较保持一致
    d = datetime.fromtimestamp(timestamp_long / 1000)
    return d
|||
|
|||
|
|||
def today_timestamp_long(): |
|||
""" |
|||
获取今天开始和结束的毫秒时间戳 |
|||
:return: |
|||
""" |
|||
start = parse("00:00") |
|||
end = start + timedelta(days=1) |
|||
return start.timestamp() * 1000, end.timestamp() * 1000 |
|||
|
|||
|
|||
def parse_time(time_str): |
|||
return datetime.strptime(time_str, '%H:%M').time() |
|||
|
|||
|
|||
def eq_time(time1: str, time2: str): |
|||
time1 = datetime.strptime(time1, '%H:%M').time() |
|||
time2 = datetime.strptime(time2, '%H:%M').time() |
|||
|
|||
today = datetime.today().date() |
|||
time1 = datetime.combine(today, time1) |
|||
time2 = datetime.combine(today, time2) |
|||
|
|||
return time1 == time2 |
|||
|
|||
|
|||
def ge_time(time1: str, time2: str): |
|||
""" |
|||
比较time1是否大于等于time2 |
|||
:param time1: |
|||
:param time2: |
|||
:return: |
|||
""" |
|||
time1 = datetime.strptime(time1, '%H:%M').time() |
|||
time2 = datetime.strptime(time2, '%H:%M').time() |
|||
|
|||
today = datetime.today().date() |
|||
time1 = datetime.combine(today, time1) |
|||
time2 = datetime.combine(today, time2) |
|||
|
|||
return time1 >= time2 |
|||
|
|||
|
|||
def lt_time(time1: str, time2: str): |
|||
""" |
|||
比较time1是否小于time2 |
|||
:param time1: |
|||
:param time2: |
|||
:return: |
|||
""" |
|||
time1 = datetime.strptime(time1, '%H:%M').time() |
|||
time2 = datetime.strptime(time2, '%H:%M').time() |
|||
|
|||
today = datetime.today().date() |
|||
time1 = datetime.combine(today, time1) |
|||
time2 = datetime.combine(today, time2) |
|||
|
|||
return time1 < time2 |
|||
|
|||
|
|||
if __name__ == '__main__': |
|||
print(lt_time("18:52", "23:55")) |
|||
@ -0,0 +1,83 @@ |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
import yagmail |
|||
import config |
|||
from utils.utils import logger |
|||
import inspect |
|||
from datetime import datetime |
|||
from models.monitor_task_model import get_task, complete |
|||
|
|||
|
|||
async def send_post_mail(task_id, name, message="出现异常"): |
|||
keyword = "" |
|||
task = await get_task(task_id) |
|||
if not task: |
|||
message = F"不存在的任务ID: {task_id}" |
|||
else: |
|||
keyword = task.keyword |
|||
|
|||
mail_server = None |
|||
try: |
|||
mail_server = yagmail.SMTP(user=config.SMTP_USER, password=config.SMTP_PASSWORD, host=config.SMTP_HOST) |
|||
except Exception as e: |
|||
logger.error("[邮件]初始化失败 请检查邮件配置") |
|||
return |
|||
to = [] |
|||
if not config.POST_EMAIL: |
|||
logger.warn("[邮件]未配置用于接收邮件上报的邮箱地址 邮件上报被取消") |
|||
return |
|||
if isinstance(config.POST_EMAIL, str): |
|||
to.append(config.POST_EMAIL) |
|||
elif isinstance(config.POST_EMAIL, list): |
|||
to = config.POST_EMAIL |
|||
else: |
|||
logger.warn("[邮件]未配置用于接收邮件上报的邮箱地址 邮件上报被取消") |
|||
return |
|||
|
|||
title = F"HuoSpider上报邮件: {name}数据获取异常" |
|||
content = F""" |
|||
异常任务ID: {task_id} |
|||
异常站点: {name} |
|||
关键词: {keyword} |
|||
异常信息: {message} |
|||
上报时间: {datetime.today().strftime("%Y-%m-%d %H:%M:%S")} |
|||
""" |
|||
content = inspect.cleandoc(content)
|||
mail_server.send(to, title, content) |
|||
mail_server.close() |
|||
logger.info(f"[邮件]任务ID: {task_id} 异常上报邮件发送成功 收件人: {to}") |
|||
|
|||
|
|||
async def test(task_id, name, message="出现异常"): |
|||
keyword = "测试" |
|||
|
|||
mail_server = None |
|||
try: |
|||
mail_server = yagmail.SMTP(user=config.SMTP_USER, password=config.SMTP_PASSWORD, host=config.SMTP_HOST) |
|||
except Exception as e: |
|||
logger.error("[邮件]初始化失败 请检查邮件配置") |
|||
return |
|||
to = [] |
|||
if not config.POST_EMAIL: |
|||
logger.warn("[邮件]未配置用于接收邮件上报的邮箱地址 邮件上报被取消") |
|||
return |
|||
if isinstance(config.POST_EMAIL, str): |
|||
to.append(config.POST_EMAIL) |
|||
elif isinstance(config.POST_EMAIL, list): |
|||
to = config.POST_EMAIL |
|||
else: |
|||
logger.warn("[邮件]未配置用于接收邮件上报的邮箱地址 邮件上报被取消") |
|||
return |
|||
|
|||
title = F"HuoSpider上报邮件: {name}数据获取异常" |
|||
content = F""" |
|||
异常任务ID: {task_id} |
|||
异常站点: {name} |
|||
关键词: {keyword} |
|||
异常信息: {message} |
|||
上报时间: {datetime.today().strftime("%Y-%m-%d %H:%M:%S")} |
|||
""" |
|||
content = inspect.cleandoc(content)
|||
mail_server.send(to, title, content) |
|||
mail_server.close() |
|||
logger.info(f"[邮件]任务ID: {task_id} 异常上报邮件发送成功 收件人: {to}") |
|||
@ -0,0 +1,113 @@ |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
from datetime import datetime, timedelta |
|||
|
|||
import httpx |
|||
import retry |
|||
from dateutil.parser import parse |
|||
|
|||
import config |
|||
from utils.utils import logger |
|||
|
|||
ip_pool = [] |
|||
|
|||
|
|||
# 代理池 |
|||
|
|||
class ProxyIp: |
|||
ip: str |
|||
port: int |
|||
expire: datetime |
|||
city: str |
|||
isp: str |
|||
|
|||
def __init__(self, ip, port, expire, city='未知', isp='未知'): |
|||
self.ip = ip |
|||
self.port = port |
|||
self.expire = parse(expire) |
|||
self.city = city |
|||
self.isp = isp |
|||
|
|||
def __str__(self): |
|||
return F"({self.city}-{self.isp}){self.ip}:{self.port} 过期时间:{self.expire}" |
|||
|
|||
def is_expire(self): |
|||
now = datetime.now() |
|||
expire = self.expire - timedelta(seconds=20) |
|||
return expire < now |
|||
|
|||
def to_httpx_proxies(self): |
|||
return {"http://": F"http://{self.ip}:{self.port}"} |
|||
|
|||
|
|||
class ProxyError(Exception): |
|||
def __init__(self, message, code=-1000): |
|||
self.code = code |
|||
self.message = message |
|||
|
|||
def __str__(self): |
|||
return F"错误码: {self.code} 错误消息: {self.message}" |
|||
|
|||
|
|||
@retry.retry(exceptions=ProxyError, tries=3, delay=2, backoff=2) |
|||
def add_ip(count=1) -> ProxyIp: |
|||
""" |
|||
向ip池中添加一个代理ip对象 |
|||
:param count: 添加的数量 默认为1 |
|||
:return: |
|||
""" |
|||
url = "http://api.tianqiip.com/getip" |
|||
params = { |
|||
"secret": config.PROXY_SECRET, # 密钥 |
|||
"sign": config.PROXY_SIGN, # 签名 |
|||
"num": count, # 数量 |
|||
"type": "json", # 返回类型 |
|||
"port": 1, # 协议 |
|||
"time": 3, # 时长三分钟 |
|||
"ts": 1, # 显示过期时间 |
|||
"mr": 1, # 去重 |
|||
"cs": 1, # 显示位置 |
|||
"ys": 1 # 显示运营商 |
|||
} |
|||
ips = [] |
|||
result: dict = httpx.get(url, params=params, proxies={}).json() |
|||
if not result['code'] == 1000: |
|||
logger.error("[IP池]API获取代理IP失败") |
|||
raise ProxyError(result['msg'], result['code'])
|||
for data in result["data"]: |
|||
ip = ProxyIp(data['ip'], data['port'], data['expire'], city=data['city'], isp=data['isp']) |
|||
ip_pool.append(ip) |
|||
ips.append(ip) |
|||
logger.info(F"[IP池]新增代理IP {str(ip)}") |
|||
return ips[0] |
|||
|
|||
|
|||
def del_ip(index): |
|||
if index > len(ip_pool) - 1: |
|||
return |
|||
logger.error(f"[IP池]代理IP被删除: {ip_pool[index]}") |
|||
del ip_pool[index] |
|||
|
|||
|
|||
def get_ip(cache=True) -> ProxyIp: |
|||
""" |
|||
获取一个代理ip对象 |
|||
:param cache: 使用缓存 |
|||
:return: |
|||
""" |
|||
if not cache: |
|||
# 不使用缓存时 请求一个新的ip并放入池中 然后获取该ip |
|||
return add_ip() |
|||
# 从缓存中获取一个有效的ip |
|||
if not ip_pool: |
|||
return add_ip() |
|||
cur_ip = None |
|||
for index, ip in enumerate(ip_pool): |
|||
if not ip.is_expire(): |
|||
# 没过期 返回 |
|||
cur_ip = ip |
|||
break |
|||
if not cur_ip: |
|||
return add_ip() |
|||
logger.info(f"[IP池]从IP池中获取到代理IP: {cur_ip}") |
|||
return cur_ip |
|||
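One detail of to_httpx_proxies() above: the dict it returns only maps the "http://" scheme, so any https request sent through httpx with that mapping goes out directly rather than through the purchased proxy (the xinhua host is https). If proxying of https traffic is actually wanted, a sketch of the extended mapping, assuming the provider's HTTP proxy also tunnels CONNECT requests:

def to_httpx_proxies(self):
    proxy_url = F"http://{self.ip}:{self.port}"
    # 同时代理 http 与 https 请求; https 通过 CONNECT 隧道转发
    return {"http://": proxy_url, "https://": proxy_url}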
@ -0,0 +1,109 @@ |
|||
# -*- coding: utf-8 -*- |
|||
import datetime |
|||
import random |
|||
|
|||
from apscheduler.schedulers.asyncio import AsyncIOScheduler |
|||
from typing import Dict, List |
|||
|
|||
import config |
|||
from models.monitor_task_db import MonitorTask |
|||
from spiders.xinhua.spider import XinHuaSpider |
|||
from spiders.renmin.spider import RenMinSpider |
|||
from spiders.yang_shi.spider import YangShiSpider |
|||
from utils.utils import logger |
|||
from base.base_spider import AbstractSpider |
|||
import utils.date_format as date_format |
|||
from base.enums import Platform |
|||
import logging |
|||
from datetime import timedelta |
|||
import copy |
|||
|
|||
|
|||
def singleton(cls): |
|||
instances = {} |
|||
|
|||
def getinstance(): |
|||
if cls not in instances: |
|||
instances[cls] = cls() |
|||
return instances[cls] |
|||
|
|||
return getinstance |
|||
|
|||
|
|||
@singleton |
|||
class SchedulerManager: |
|||
scheduler: AsyncIOScheduler |
|||
|
|||
def __init__(self): |
|||
# 调整调度器日志等级 |
|||
ap_logger = logging.getLogger('apscheduler') |
|||
ap_logger.setLevel(logging.WARNING) |
|||
self.scheduler = AsyncIOScheduler() |
|||
|
|||
def get_scheduler(self): |
|||
""" |
|||
获取调度器对象 |
|||
:return: |
|||
""" |
|||
if not self.scheduler: |
|||
self.scheduler = AsyncIOScheduler() |
|||
return self.scheduler |
|||
|
|||
def start(self, paused=False): |
|||
self.scheduler.start(paused) |
|||
|
|||
def add_task(self, task: MonitorTask, offset=0, is_random=False): |
|||
""" |
|||
添加任务 |
|||
:param is_random: 是否随机偏移 |
|||
:param offset: 偏移多少秒后执行 |
|||
:param task: |
|||
:return: |
|||
""" |
|||
scheduler = self.get_scheduler() |
|||
spider: AbstractSpider = None |
|||
if task.platform == Platform.XIN_HUA: |
|||
spider = XinHuaSpider() |
|||
elif task.platform == Platform.REN_MIN: |
|||
spider = RenMinSpider() |
|||
elif task.platform == Platform.YANG_SHI: |
|||
spider = YangShiSpider() |
|||
if not spider: |
|||
# logger.error(F"未知的平台: {task.platform} 任务id: {task.id}") |
|||
return |
|||
if not task.gather_time:
    logger.error(F"[调度器]采集时间不存在 任务id: {task.id}")
    return
|||
if is_random: |
|||
offset = offset + random.randint(1, 29) |
|||
# 时间向后偏移 |
|||
task_date_time = date_format.gen_job_datetime(task.gather_time) |
|||
task_date_time = task_date_time + timedelta(seconds=offset) |
|||
|
|||
if task_date_time < datetime.datetime.now(): |
|||
task_date_time = datetime.datetime.now() + datetime.timedelta(seconds=60) |
|||
# 添加定时任务 |
|||
scheduler.add_job(spider.start, "date", run_date=task_date_time, kwargs={"task_id": task.id}) |
|||
logger.info( |
|||
F"[调度器]注册定时任务 ID: {task.id} 执行时间: {task_date_time} {F'偏移{offset}秒后执行' if offset > 0 else ''}") |
|||
|
|||
def add_tasks(self, tasks: List[MonitorTask], is_random=False): |
|||
# 按平台和关键词分组 |
|||
group = {} |
|||
for task in tasks: |
|||
if task.platform not in group: |
|||
group[task.platform] = {} |
|||
if task.keyword not in group[task.platform]: |
|||
group[task.platform][task.keyword] = [] |
|||
group[task.platform][task.keyword].append(task) |
|||
# 遍历每个关键词组 |
|||
for platform, platform_group in group.items(): |
|||
for keyword, task_list in platform_group.items(): |
|||
sorted_task_list = sorted(task_list, key=lambda e: date_format.parse_time(e.gather_time))
|||
# 判断最后一个任务是否在极限时间之前 |
|||
if date_format.lt_time(sorted_task_list[-1].gather_time, config.MAX_GATHER_TIME): |
|||
# 创建一个补偿任务 |
|||
new_task = copy.deepcopy(sorted_task_list[-1]) |
|||
new_task.gather_time = config.MAX_GATHER_TIME |
|||
sorted_task_list.append(new_task) |
|||
for sorted_task in sorted_task_list: |
|||
self.add_task(sorted_task, 0, is_random) |
|||
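A small usage note on the @singleton decorator above: every module that calls SchedulerManager() receives the same instance, which is what lets main.py and do_get_task_job() share one AsyncIOScheduler. A minimal check, assuming the project's config.py is present so the imports resolve:

from utils.scheduler import SchedulerManager

a = SchedulerManager()
b = SchedulerManager()
assert a is b                      # 同一个管理器实例
assert a.scheduler is b.scheduler  # 共享同一个 AsyncIOScheduler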
@ -0,0 +1,60 @@ |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
import logging |
|||
from logging.handlers import TimedRotatingFileHandler |
|||
import re |
|||
import os |
|||
|
|||
|
|||
def init_loging_config(): |
|||
# 检查log文件夹是否存在 |
|||
if not os.path.exists("./log"): |
|||
os.mkdir("./log") |
|||
# 创建一个handler,用于按日期写入日志文件 |
|||
# 'W0' 表示每周滚动一次,'D' 表示每天滚动一次,'H' 表示每小时滚动一次,'M' 表示每分钟滚动一次 |
|||
# 'midnight' 表示在午夜滚动,'h:m' 表示在指定的小时和分钟滚动 |
|||
# backupCount 表示保留的日志文件的个数,超过后会删除最旧的日志文件 |
|||
# when='D', interval=1, backupCount=7 表示每天滚动一次,并保留最近7天的日志文件 |
|||
file_handler = TimedRotatingFileHandler('./log/huo_spider.log', when='D', interval=1, encoding='utf-8') |
|||
file_handler.setLevel(logging.DEBUG) |
|||
|
|||
# 定义handler的输出格式 |
|||
formatter = logging.Formatter('%(asctime)s [%(name)s] %(levelname)s %(message)s ') |
|||
file_handler.setFormatter(formatter) |
|||
|
|||
level = logging.INFO |
|||
logging.basicConfig( |
|||
level=level, |
|||
format="%(asctime)s [%(name)s] %(levelname)s %(message)s ", |
|||
datefmt='[%Y-%m-%d %H:%M:%S]' |
|||
) |
|||
_logger = logging.getLogger("HuoSpider") |
|||
_logger.setLevel(level) |
|||
_logger.addHandler(file_handler) |
|||
return _logger |
|||
|
|||
|
|||
logger = init_loging_config() |
|||
|
|||
|
|||
def is_blank(val: str):
    # 为 None 或仅包含空白字符时返回 True
    if val is None:
        return True
    if not val.strip():
        return True
    return False
|||
|
|||
|
|||
def count_characters(val): |
|||
""" |
|||
统计中文和非中文字符个数 |
|||
:param val: |
|||
:return: |
|||
""" |
|||
if not isinstance(val, str): |
|||
val = str(val) |
|||
chinese_pattern = re.compile(r'[\u4e00-\u9fa5]') |
|||
not_chinese_pattern = re.compile(r'[^\u4e00-\u9fa5]') |
|||
chinese = re.findall(chinese_pattern, val) |
|||
not_chinese = re.findall(not_chinese_pattern, val) |
|||
return len(chinese), len(not_chinese) |
|||