
代码初始化 (Code initialization)

Branch: master · shenfan committed 7 months ago · commit a38f378e16
  1. .gitignore (+10)
  2. base/__init__.py (+1)
  3. base/base_spider.py (+21)
  4. base/enums.py (+26)
  5. db.py (+31)
  6. lib/stealth.min.js (+7)
  7. main.py (+88)
  8. models/__init__.py (+1)
  9. models/monitor_result_db.py (+39)
  10. models/monitor_result_model.py (+50)
  11. models/monitor_task_db.py (+36)
  12. models/monitor_task_model.py (+46)
  13. requirements.txt (+10)
  14. spiders/__init__.py (+1)
  15. spiders/renmin/__init__.py (+1)
  16. spiders/renmin/client.py (+151)
  17. spiders/renmin/exception.py (+19)
  18. spiders/renmin/spider.py (+188)
  19. spiders/xinhua/__init__.py (+4)
  20. spiders/xinhua/client.py (+114)
  21. spiders/xinhua/exception.py (+19)
  22. spiders/xinhua/spider.py (+201)
  23. spiders/yang_shi/__init__.py (+1)
  24. spiders/yang_shi/client.py (+160)
  25. spiders/yang_shi/exception.py (+19)
  26. spiders/yang_shi/spider.py (+185)
  27. utils/__init__.py (+1)
  28. utils/date_format.py (+102)
  29. utils/mail.py (+83)
  30. utils/proxy.py (+113)
  31. utils/scheduler.py (+109)
  32. utils/utils.py (+60)

10
.gitignore

@ -0,0 +1,10 @@
data
venv
**/log
.idea
**/__pycache__/
test.py
config.py
Pipfile
Pipfile.lock
run.bat
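
config.py is ignored above, but it is imported throughout the project (db.py, main.py, the spider clients, utils/mail.py, utils/proxy.py, utils/scheduler.py). A minimal sketch of the settings those modules read; the names come from their usages, the values here are placeholders, not the real configuration:

# config.py (not part of the commit; hypothetical values)
DB_HOST = "127.0.0.1"
DB_PORT = 3306
DB_USER = "root"
DB_PASSWORD = "change-me"
DB_DATABASE = "huo_spider"

GET_TASK_TIME = "08:00"         # time of the daily task pull in main.py, "HH:MM"
MAX_GATHER_TIME = "23:00"       # latest gather time; utils/scheduler.py appends a make-up task up to this point
RESULT_UNIQUE = True            # de-duplicate results before saving (models/monitor_result_model.py)
IMAGE_PATH = "./data"           # screenshot directory used by the spiders' cut_screen

API_PROXY = False               # route client API requests through the proxy pool (utils/proxy.py)
PROXY_SECRET = ""               # tianqiip.com credentials used by utils/proxy.py
PROXY_SIGN = ""

SMTP_HOST = "smtp.example.com"  # alert mail settings used by utils/mail.py
SMTP_USER = "alert@example.com"
SMTP_PASSWORD = "change-me"
POST_EMAIL = ["ops@example.com"]  # a single address string or a list of recipients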

1
base/__init__.py

@ -0,0 +1 @@
# -*- coding: utf-8 -*-

21
base/base_spider.py

@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
from abc import ABC, abstractmethod


class AbstractSpider(ABC):
    """Base class that every platform spider (renmin, xinhua, yang_shi) implements."""

    def init_config(self):
        """
        Load spider configuration.
        :return:
        """
        pass

    @abstractmethod
    async def start(self, task_id):
        """
        Run the spider for the given task id.
        :return:
        """

26
base/enums.py

@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
from enum import Enum


class TaskStatus(Enum):
    WAITING = 1
    RUNNING = 2
    COMPLETED = 3
    FAIL = 4

    def __eq__(self, other):
        return self.value == other


class Platform(Enum):
    XIN_HUA = "xin_hua"
    REN_MIN = "ren_min"
    YANG_SHI = "yang_shi"

    def __eq__(self, other):
        return self.value == other


if __name__ == '__main__':
    print(Platform.REN_MIN == "ren_min")
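
Note that defining __eq__ on an Enum clears the inherited __hash__, so TaskStatus and Platform members are no longer hashable (main.py avoids this by keying its groups on enum.value rather than on the members). If the members ever need to be used as dict keys or in sets, a one-line addition restores hashing; a sketch for Platform:

from enum import Enum

class Platform(Enum):
    XIN_HUA = "xin_hua"
    REN_MIN = "ren_min"
    YANG_SHI = "yang_shi"

    def __eq__(self, other):
        return self.value == other

    # restore the hash implementation that the __eq__ override removed
    __hash__ = Enum.__hash__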

31
db.py

@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
from tortoise import Tortoise, run_async

from config import *
from utils.utils import logger

"""
Database bootstrap: build the MySQL connection URL from config and initialise Tortoise ORM.
"""


def get_db_url():
    """
    Build the database connection url from config.
    :return:
    """
    return F"mysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_DATABASE}"


async def init():
    """
    Initialise the ORM connection and create any missing tables.
    :return:
    """
    await Tortoise.init(
        db_url=get_db_url(),
        modules={"models": ['models.monitor_result_db', 'models.monitor_task_db']}
    )
    await Tortoise.generate_schemas()
    logger.info("[数据库]初始化数据库连接成功")

7
lib/stealth.min.js
File diff suppressed because it is too large

88
main.py

@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
import argparse
import asyncio
import os
import sys
import config
import db
import utils.date_format as date_format
from base.enums import Platform
from models import monitor_task_model
from utils.scheduler import SchedulerManager
from utils.utils import logger
def task_group(tasks):
groups = {}
for name, enum in Platform.__members__.items():
groups[enum.value] = []
for task in tasks:
if task.platform in groups:
groups[task.platform].append(task)
return list(groups.values())
async def do_get_task_job():
"""
:return:
"""
await db.init()
tasks = await monitor_task_model.get_today_task()
if not tasks:
logger.info(F"没有获取到任务信息")
return
# 分组
# groups = task_group(tasks)
# random.shuffle(groups)
schedular_manager = SchedulerManager()
logger.info(F"============================== 获取到{len(tasks)}条任务信息 ==============================")
schedular_manager.add_tasks(tasks, True)
def restart():
os.execl(sys.executable, sys.executable, *sys.argv)
def load_arg_parse():
"""
:return:
"""
parse = argparse.ArgumentParser(description="抓取社媒新闻数据")
parse.add_argument("-a", "--active", help="启动脚本时 立即进行一次任务拉取", default='false')
args = parse.parse_args()
logger.info(F"启动参数: {args}")
return args
def clear_system_proxy():
# 清除系统代理相关的环境变量
os.environ.pop('http_proxy', None)
os.environ.pop('https_proxy', None)
os.environ.pop('ftp_proxy', None)
os.environ.pop('no_proxy', None)
if __name__ == '__main__':
try:
clear_system_proxy()
logger.info(F'启动成功 将在每天的{config.GET_TASK_TIME}拉取任务信息')
get_task_time = date_format.gen_job_datetime(config.GET_TASK_TIME)
manager = SchedulerManager()
# 启动定时任务
manager.start()
# 添加拉取任务信息的任务
manager.scheduler.add_job(do_get_task_job, 'cron', hour=get_task_time.hour, minute=get_task_time.minute)
manager.scheduler.add_job(restart, 'cron', hour=get_task_time.hour, minute=0)
# 参数检查
args = load_arg_parse()
if args.active and args.active.lower() == 'true':
logger.info(F"立即执行一次任务拉取...")
asyncio.get_event_loop().run_until_complete(do_get_task_job())
# 开启事件循环
asyncio.get_event_loop().run_forever()
except KeyboardInterrupt:
sys.exit()
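
main.py is the entry point: it clears any system proxy environment variables, registers do_get_task_job (plus a process restart at minute 0 of the same hour) with APScheduler at config.GET_TASK_TIME, and then keeps the asyncio event loop running. Starting it with the --active flag, e.g. python main.py -a true, additionally performs one task pull immediately instead of waiting for the daily cron.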

1
models/__init__.py

@ -0,0 +1 @@
# -*- coding: utf-8 -*-

39
models/monitor_result_db.py

@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-
from typing import Optional, Iterable
from tortoise import fields, BaseDBAsyncClient
from tortoise.models import Model
import utils.date_format as date_format
class MonitorResult(Model):
"""
"""
id = fields.IntField(pk=True, autoincrement=True, description="ID")
keyword = fields.CharField(null=True, max_length=120, description="关键词")
title = fields.CharField(null=True, max_length=255, description="文章标题")
url = fields.CharField(null=True, max_length=500, description="文章地址")
publish_time = fields.BigIntField(null=True, max_length=20, description="发布时间")
platform = fields.CharField(null=True, max_length=20, description="平台")
gather_time = fields.CharField(null=True, description="设定采集时间", max_length=30)
content = fields.TextField(null=True, description="文章内容")
image = fields.CharField(null=True, max_length=255, description="结果截图")
is_del = fields.IntField(null=True, max_length=1, description="删除状态")
create_time = fields.BigIntField(null=True, max_length=20, description="创建时间")
update_time = fields.BigIntField(null=True, max_length=20, description="更新时间")
delete_time = fields.BigIntField(null=True, max_length=20, description="删除时间")
class Meta:
table = "aux_monitor_result"
async def _pre_save(
self,
using_db: Optional[BaseDBAsyncClient] = None,
update_fields: Optional[Iterable[str]] = None,
) -> None:
if not self.id:
self.create_time = date_format.timestamp()
self.update_time = date_format.timestamp()

50
models/monitor_result_model.py

@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
import config
from .monitor_result_db import MonitorResult
import utils.date_format as date_format
from .monitor_task_db import MonitorTask
def gen_result(task: MonitorTask, title, url, publish_time):
"""
:param task:
:param title:
:param url:
:param publish_time:
:return:
"""
if isinstance(publish_time, str):
    publish_time = int(date_format.parse(publish_time).timestamp())
module = MonitorResult(title=title, url=url, publish_time=publish_time,
is_del=1,
keyword=task.keyword, platform=task.platform,
gather_time=F"{task.gather_date} {task.setting_time}")
return module
async def save(results):
if config.RESULT_UNIQUE:
await save_unique(results)
else:
model = MonitorResult()
await model.bulk_create(results)
async def save_unique(results):
# 过滤列表中重复的结果
unique_results = {}
for result in results:
key = (result.platform, result.keyword, result.title)
if key not in unique_results:
unique_results[key] = result
unique_results = list(unique_results.values())
# 过滤数据库中重复的结果
save_results = []
model = MonitorResult()
for result in unique_results:
exist = await model.filter(platform=result.platform, keyword=result.keyword, title=result.title).exists()
if not exist:
save_results.append(result)
await model.bulk_create(save_results)

36
models/monitor_task_db.py

@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
from typing import Optional, Iterable
from tortoise import fields, BaseDBAsyncClient
from tortoise.models import Model
import utils.date_format as date_format
class MonitorTask(Model):
"""
"""
id = fields.IntField(pk=True, autoincrement=True, description="ID")
keyword = fields.CharField(null=True, max_length=255, description="关键词")
lang = fields.CharField(null=True, max_length=50, description="语言")
platform = fields.CharField(null=True, max_length=30, description="媒体平台")
gather_date = fields.CharField(null=True, max_length=30, description="采集日期")
gather_time = fields.CharField(null=True, max_length=30, description="采集时间")
setting_time = fields.CharField(null=True, max_length=30, description="设定时间")
status = fields.IntField(null=True, max_length=1, description="任务状态 1 待执行 2 进行中 3 已完成")
create_time = fields.BigIntField(null=True, max_length=16, description="创建时间")
update_time = fields.BigIntField(null=True, max_length=16, description="更新时间")
class Meta:
table = "aux_monitor_task"
async def _pre_save(
self,
using_db: Optional[BaseDBAsyncClient] = None,
update_fields: Optional[Iterable[str]] = None,
) -> None:
if not self.id:
self.create_time = date_format.timestamp()
self.update_time = date_format.timestamp()

46
models/monitor_task_model.py

@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
from .monitor_task_db import MonitorTask
import utils.date_format as date_format
from base.enums import TaskStatus
async def get_today_task():
"""
:return:
"""
# 当天日期
today = date_format.gen_today_str()
task_model = MonitorTask()
result = await task_model.filter(gather_date=today, status=TaskStatus.WAITING.value).all()
return result
async def get_task(task_id):
"""
id的任务信息
:param task_id:
:return:
"""
task_model = MonitorTask()
return await task_model.get_or_none(id=task_id)
async def complete(task_id):
task_model = MonitorTask()
await task_model.filter(id=task_id).update(status=TaskStatus.COMPLETED.value, update_time=date_format.timestamp())
async def running(task_id):
task_model = MonitorTask()
await task_model.filter(id=task_id).update(status=TaskStatus.RUNNING.value, update_time=date_format.timestamp())
async def fail(task_id):
task_model = MonitorTask()
await task_model.filter(id=task_id).update(status=TaskStatus.FAIL.value, update_time=date_format.timestamp())
if __name__ == '__main__':
get_today_task()

10
requirements.txt

@ -0,0 +1,10 @@
tortoise-orm~=0.19.0
playwright~=1.42.0
httpx~=0.27.0
aiomysql~=0.2.0
pymysql~=1.1.0
python-dateutil~=2.9.0.post0
APScheduler~=3.10.4
yagmail~=0.15.293
retry~=0.9.2
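
Besides pip install -r requirements.txt, Playwright needs its browser binaries installed once with playwright install chromium, since the spiders launch playwright.chromium; a config.py (deliberately git-ignored) also has to be supplied before main.py will start.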

1
spiders/__init__.py

@ -0,0 +1 @@
# -*- coding: utf-8 -*-

1
spiders/renmin/__init__.py

@ -0,0 +1 @@
# -*- coding: utf-8 -*-

151
spiders/renmin/client.py

@ -0,0 +1,151 @@
# -*- coding: utf-8 -*-
import asyncio
import json
from typing import Dict
from urllib.parse import urlencode

import httpx
from httpx import HTTPError
from playwright.async_api import Page

import config
import utils.date_format as date_format
import utils.proxy as proxy
from utils.utils import count_characters, logger
from .exception import DataFetchError
class RenMinClient:
def __init__(self,
timeout=60,
*,
playwright_page: Page,
cookie_dict: Dict[str, str]):
self.timeout = timeout
self.headers = {
"Accept": "application/json, text/plain, */*",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Cookie": "__jsluid_h=103d2323e283c476b59b2fdd3b9a5371; sso_c=0; sfr=1",
"Host": "search.people.cn",
"Content-Length": "163",
"Content-Type": "application/json",
"Origin": "http://search.people.cn",
"Pragma": "no-cache",
"Referer": "http://search.people.cn/s?keyword=%E4%B9%A1%E6%9D%91%E6%8C%AF%E5%85%B4&st=0&_=1710919073824",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
}
self._host = "http://search.people.cn"
self.playwright_page = playwright_page
self.cookie_dict = cookie_dict
async def request(self, method, url, **kwargs):
"""
:param method:
:param url:
:param kwargs:
:return:
"""
# api代理
proxies = proxy.get_ip().to_httpx_proxies() if config.API_PROXY else None
try:
async with httpx.AsyncClient(proxies=proxies) as client:
response = await client.request(
method, url, timeout=self.timeout,
**kwargs
)
# 人民网504 是没有数据
if response.status_code == 504:
# logger.error(F"[人民网]黑名单异常: [{method}]{url} 参数: {kwargs}")
# raise DataFetchError("黑名单异常", url, method, kwargs)
return {}
if not response.status_code == 200:
logger.error(F"[人民网]httpx异常[{response.status_code}]: [{method}]{url} 参数: {kwargs}")
raise DataFetchError("httpx异常", url, method, kwargs)
data: Dict = response.json()
if data.get("code") != "0":
raise DataFetchError(data.get("message", "未知错误"), url)
else:
return data.get("data", {})
except HTTPError as e:
logger.error(F"[人民网]httpx异常: [{e.request.method}]{e.request.url} 参数: {kwargs}")
logger.error(F"[人民网]错误信息{str(e)}")
raise DataFetchError(str(e), url)
except Exception as e:
logger.error(F"[人民网]未知的请求方法异常: [{method}]{url} 参数: {kwargs}")
logger.error(F"[人民网]错误信息{str(e)}")
raise Exception(str(e))
async def get(self, uri: str, params=None) -> Dict:
"""
GET
:param uri:
:param params:
:return:
"""
final_uri = uri
if isinstance(params, dict):
final_uri = (f"{uri}?"
f"{urlencode(params)}")
return await self.request(method="GET", url=F"{self._host}{final_uri}", headers=self.headers)
async def post(self, uri: str, data: dict) -> Dict:
"""
POST
:param uri:
:param data:
:return:
"""
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
return await self.request(method="POST", url=F"{self._host}{uri}",
data=json_str, headers=self.headers)
async def search(self, keyword, cur_page):
"""
:param end:
:param start:
:param keyword:
:param cur_page:
:param sort_field: # 0 时间倒序 1 时间正序
:return:
"""
# 接口地址
uri = '/search-platform/front/search'
get_param = {
'key': keyword,
'startTime': 0,
'endTime': 0,
'hasContent': True,
'hasTitle': True,
'isFuzzy': False, # 精准匹配
'limit': 10,
'page': cur_page,
'sortType': 0,
'type': 0
}
chinese, not_chinese = count_characters(keyword)
# 长度 = 127+ 汉字*3 + 其他*1
# 关键字部分
content_length = 126 + (chinese * 3) + not_chinese + 1 # 如果精准匹配是False 加一字节
# 页码部分
chinese, not_chinese = count_characters(cur_page)
content_length = content_length + not_chinese
logger.info(F"[人民网]请求长度: {content_length}")
logger.info(F"[人民网]参数: {get_param}")
self.headers['Content-Length'] = str(content_length)
content = await self.post(uri, get_param)
if not content or not content.get('records'):
return []
return content.get('records', [])
if __name__ == '__main__':
client = RenMinClient(playwright_page=None, cookie_dict={})
start, end = date_format.today_timestamp_long()
asyncio.run(client.search('乡村发展', 1))
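
The Content-Length arithmetic in search() models the compact JSON body produced by post(): roughly 126 fixed bytes (plus one because "false" is a byte longer than "true"), 3 UTF-8 bytes per Chinese character of the keyword, 1 byte per other keyword character, and 1 byte per page-number digit. A hypothetical standalone check of that assumption, reusing count_characters from utils:

import json

from utils.utils import count_characters


def expected_content_length(keyword, cur_page):
    # same formula as RenMinClient.search()
    chinese, other = count_characters(keyword)
    length = 126 + chinese * 3 + other + 1
    _, page_digits = count_characters(cur_page)
    return length + page_digits


def actual_body_length(keyword, cur_page):
    # byte length of the body that post() would actually send
    body = {'key': keyword, 'startTime': 0, 'endTime': 0, 'hasContent': True,
            'hasTitle': True, 'isFuzzy': False, 'limit': 10, 'page': cur_page,
            'sortType': 0, 'type': 0}
    return len(json.dumps(body, separators=(',', ':'), ensure_ascii=False).encode('utf-8'))


print(expected_content_length('乡村振兴', 1))  # 140
print(actual_body_length('乡村振兴', 1))       # 140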

19
spiders/renmin/exception.py

@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
from httpx import RequestError
class DataFetchError(RequestError):
"""未知异常"""
def __init__(self, message, url, method="GET", params=None):
self.message = message
self.url = url
self.method = method
self.params = params
def __str__(self):
return self.message
class IPBlockError(RequestError):
"""ip被封禁异常"""

188
spiders/renmin/spider.py

@ -0,0 +1,188 @@
# -*- coding: utf-8 -*-
from playwright.async_api import async_playwright, Page, BrowserType, BrowserContext
from base.base_spider import AbstractSpider
from typing import Dict, List, Optional, Tuple
from .client import RenMinClient
from utils.utils import logger, is_blank
from models.monitor_task_model import get_task, running, complete, fail
from models.monitor_result_model import gen_result, save
from base.enums import Platform
import utils.date_format as date_format
import os
import config
import uuid
from .exception import DataFetchError
import utils.mail as mail
import asyncio
from tortoise.transactions import in_transaction
class RenMinSpider(AbstractSpider):
"""
"""
client: RenMinClient # 请求对象
context_page: Page # 浏览器页面上下文
browser_context: BrowserContext # 浏览器上下文
image_path: str
def __init__(self):
self.index_url = "http://www.people.com.cn/"
self.platform = Platform.REN_MIN
self.image_path = None
self.retry = 0 # 自旋次数
def init_config(self):
super().init_config()
async def start(self, task_id):
try:
async with in_transaction():
await self.do_spider(task_id)
except DataFetchError as e:
logger.error(F"[人民网]任务ID: {task_id} 获取数据异常")
logger.error(F"[人民网]任务ID: {task_id} 异常信息: {str(e)}")
# 尝试自旋
self.retry = self.retry + 1
if self.retry > 3:
await fail(task_id)
logger.error(F"[人民网]任务ID: {task_id} 重试达到最大次数 即将发送告警邮件")
await mail.send_post_mail(task_id, "人民网", str(e))
else:
logger.info(F"[人民网]任务ID: {task_id} 20秒后进行第{self.retry}次重试")
await asyncio.sleep(20)
await self.do_spider(task_id)
except Exception as e:
logger.error(F"[人民网]任务ID: {task_id} 爬虫异常")
logger.error(F"[人民网]任务ID: {task_id} 异常信息: {str(e)}")
# 切换代理ip并自旋
# 尝试自旋
self.retry = self.retry + 1
if self.retry > 3:
await fail(task_id)
logger.error(F"[人民网]任务ID: {task_id} 重试达到最大次数 即将发送告警邮件")
await mail.send_post_mail(task_id, "人民网", str(e))
else:
logger.info(F"[人民网]任务ID: {task_id} 20秒后进行第{self.retry}次重试")
await asyncio.sleep(20)
await self.do_spider(task_id)
async def create_client(self) -> RenMinClient:
return RenMinClient(playwright_page=None, cookie_dict={})
async def launch_browser(self,
chromium: BrowserType,
playwright_proxy: Optional[Dict],
user_agent: Optional[Dict],
headless: bool = True):
"""
:param chromium:
:param headless:
:param self:
:param playwright_proxy:
:param user_agent:
:return:
"""
# 浏览器对象
browser = await chromium.launch(proxy=playwright_proxy, headless=headless)
# 浏览器上下文
browser_context = await browser.new_context(
viewport={"width": 1920, "height": 1080},
user_agent=user_agent
)
return browser_context
async def do_search(self, task):
"""
:return:
"""
start, end = date_format.today_timestamp_long() # 开始结束时间
results = []
cur_page = 1
logger.info(F"[人民网]开始执行任务 ID: {task.id} 关键词: {task.keyword} 语言: {task.lang}")
self.client = await self.create_client()
while True:
logger.info(F"[人民网]开始获取搜索结果 关键词: {task.keyword} 页码: {cur_page}")
search_datas = await self.client.search(task.keyword, cur_page)
logger.info(F"[人民网]获取到{len(search_datas)}条搜索结果")
if not search_datas:
logger.info(F"[人民网]关键词: {task.keyword} 页码: {cur_page}没有搜索到数据")
break
index = -1
for i, data in enumerate(search_datas):
# 找到一个不是今天的数据就结束
if not date_format.is_today(date_format.timestamp2date(data.get("displayTime")).strftime("%Y-%m-%d")):
index = i
break
# 切割
if index == -1:
# 搜索结果的最后一个依然是今天的 整个添加
results = results + search_datas
# 翻到下一页 继续找
cur_page = cur_page + 1
else:
# 搜索结果中有不是今天的 切割一部分添加
results = results + search_datas[:index]
# 结束本次搜索
break
logger.info(F"[人民网]关键词:{task.keyword} 搜索结束 总页码: {cur_page} 总条数: {len(results)}")
return results
async def cut_screen(self, url):
"""
:param url:
:return:
"""
if not self.image_path:
    image_path = config.IMAGE_PATH
    # use the configured directory; fall back to ./data only when config.IMAGE_PATH is blank
    self.image_path = "./data" if is_blank(image_path) else image_path
if not os.path.exists(self.image_path):
os.makedirs(self.image_path)
save_path = F"{self.image_path}/{uuid.uuid4()}.png"
# 开始截图
await self.context_page.goto(url)
await self.context_page.screenshot(path=save_path, full_page=True)
return save_path
async def do_spider(self, task_id):
# 获取任务信息
task = await get_task(task_id)
if not task:
logger.error(F"[人民网]任务ID: {task_id}不存在 任务结束")
return
logger.info(F"[人民网]任务ID: {task_id} 任务开始")
await running(task_id)
# 从api中获取数据
search_datas = await self.do_search(task)
if not search_datas:
logger.info(F"[人民网]任务ID: {task_id} 关键词:{task.keyword} 未搜索到结果 任务结束")
await complete(task_id)
return
# 保存result实体
results = []
# 启动浏览器
async with async_playwright() as playwright:
chromium = playwright.chromium
self.browser_context = await self.launch_browser(chromium, None, None, headless=True)
# 反反爬脚本
await self.browser_context.add_init_script(path="lib/stealth.min.js")
self.context_page: Page = await self.browser_context.new_page()
# 构建结果实体 截图
for data in search_datas:
result = gen_result(task, data.get("title"), data.get("url"), int(data.get("displayTime") / 1000))
# img_path = await self.cut_screen(data.get("url"))
# result.image = img_path
results.append(result)
# logger.info(F"[人民网]标题: {data.get('title')} 截图文件名: {img_path}")
# 结果落库
await save(results)
logger.info(F"[人民网]任务ID: {task_id} 关键词: {task.keyword} 保存{len(results)}条数据 任务结束")
await complete(task_id)
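
do_search pages through the API until it meets the first result that is not from today, then keeps only the slice before that result; if a whole page is still from today it moves on to the next page. A tiny illustration of the slicing rule with made-up records:

search_datas = ["today-1", "today-2", "yesterday-1"]   # pretend publish dates
index = -1
for i, data in enumerate(search_datas):
    if not data.startswith("today"):
        index = i          # first non-today item found at position 2
        break
results = search_datas if index == -1 else search_datas[:index]
print(results)             # ['today-1', 'today-2'], and the paging loop stops here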

4
spiders/xinhua/__init__.py

@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-

114
spiders/xinhua/client.py

@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-
import json
from typing import Dict
from urllib.parse import urlencode
from .exception import DataFetchError
import httpx
from playwright.async_api import Page
from httpx._exceptions import HTTPError
from utils.utils import logger
import asyncio
import utils.proxy as proxy
import config
class XinHuaClient:
def __init__(self,
timeout=10,
*,
headers: Dict[str, str],
playwright_page: Page,
cookie_dict: Dict[str, str]):
self.timeout = timeout
self.headers = headers
self._host = "https://so.news.cn/"
self.playwright_page = playwright_page
self.cookie_dict = cookie_dict
async def request(self, method, url, **kwargs):
"""
:param method:
:param url:
:param kwargs:
:return:
"""
# api代理
proxies = proxy.get_ip().to_httpx_proxies() if config.API_PROXY else None
try:
async with httpx.AsyncClient(proxies=proxies) as client:
response = await client.request(
method, url, timeout=self.timeout,
**kwargs
)
# 返回不正确的状态码
if not response.status_code == 200:
logger.error(F"[新华网]httpx异常[{response.status_code}]: [{method}]{url} 参数: {kwargs}")
raise DataFetchError("httpx异常", url, method, kwargs)
# 返回正确的状态码
data: Dict = response.json()
if data.get("code") != 200:
# 有特殊情况 敏感词会直接把content返回为没有找到相关稿件
if data.get("content") == '没有找到相关稿件':
logger.warning(F"[新华网]触发敏感词 跳过请求 参数: {kwargs}")
return {}
raise DataFetchError(data.get("content", "API未知错误"), url, method, kwargs)
else:
return data.get("content", {})
except HTTPError as e:
logger.error(F"[新华网]httpx异常: [{method}]{url} 参数: {kwargs}")
logger.error(F"[新华网]错误信息{str(e)}")
raise DataFetchError(str(e), url)
except Exception as e:
logger.error(F"[新华网]未知的请求方法异常: [{method}]{url} 参数: {kwargs}")
logger.error(F"[新华网]错误信息{str(e)}")
raise Exception(str(e))
async def get(self, uri: str, params=None) -> Dict:
"""
GET
:param uri:
:param params:
:return:
"""
final_uri = uri
if isinstance(params, dict):
final_uri = (f"{uri}?"
f"{urlencode(params)}")
return await self.request(method="GET", url=F"{self._host}{final_uri}", headers=self.headers)
async def post(self, uri: str, data: dict) -> Dict:
"""
POST
:param uri:
:param data:
:return:
"""
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
return await self.request(method="POST", url=F"{self._host}{uri}",
data=json_str, headers=self.headers)
async def search(self, keyword, cur_page, lang='cn', sort_field=0, search_fields=0):
"""
:param lang:
:param keyword:
:param cur_page:
:param sort_field: 0: 1:
:param search_fields: : 0: 1:
:return:
"""
# 接口地址
uri = '/getNews'
get_param = {
'keyword': keyword,
'curPage': cur_page,
'sortField': sort_field,
'searchFields': search_fields,
'lang': lang
}
content = await self.get(uri, get_param)
if not content or not content.get('results'):
return []
return content.get('results', [])

19
spiders/xinhua/exception.py

@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
from httpx import RequestError
class DataFetchError(RequestError):
"""未知异常"""
def __init__(self, message, url, method="GET", params=None):
self.message = message
self.url = url
self.method = method
self.params = params
def __str__(self):
return self.message
class IPBlockError(RequestError):
"""ip被封禁异常"""

201
spiders/xinhua/spider.py

@ -0,0 +1,201 @@
# -*- coding: utf-8 -*-
from playwright.async_api import async_playwright, Page, BrowserType, BrowserContext
from base.base_spider import AbstractSpider
from typing import Dict, List, Optional, Tuple
from .client import XinHuaClient
from utils.utils import logger, is_blank
from models.monitor_task_model import get_task, running, complete, fail
from models.monitor_result_model import gen_result, save
from base.enums import Platform
import utils.date_format as date_format
import os
import config
import uuid
from .exception import DataFetchError
import asyncio
import utils.mail as mail
from tortoise.transactions import in_transaction
class XinHuaSpider(AbstractSpider):
"""
"""
client: XinHuaClient # 请求对象
context_page: Page # 浏览器页面上下文
browser_context: BrowserContext # 浏览器上下文
image_path: str
def __init__(self):
self.index_url = "http://www.xinhuanet.com/"
self.platform = Platform.XIN_HUA
self.image_path = None
self.retry = 0 # 自旋次数
self.context_page = None
def init_config(self):
super().init_config()
async def start(self, task_id):
try:
async with in_transaction():
await self.do_spider(task_id)
except DataFetchError as e:
logger.error(F"[新华网]任务ID: {task_id} 获取数据异常")
logger.error(F"[新华网]任务ID: {task_id} 异常信息: {str(e)}")
# 尝试自旋
self.retry = self.retry + 1
if self.retry > 3:
await fail(task_id)
logger.error(F"[新华网]任务ID: {task_id} 重试达到最大次数 即将发送告警邮件")
await mail.send_post_mail(task_id, "新华网", str(e))
else:
logger.info(F"[新华网]任务ID: {task_id} 20秒后进行第{self.retry}次重试")
await asyncio.sleep(20)
await self.do_spider(task_id)
except Exception as e:
logger.error(F"[新华网]任务ID: {task_id} 爬虫异常")
logger.error(F"[新华网]任务ID: {task_id} 异常信息: {str(e)}")
# 尝试自旋
self.retry = self.retry + 1
if self.retry > 3:
    # only mark the task as failed once the retry limit is reached, matching the other spiders
    await fail(task_id)
    logger.error(F"[新华网]任务ID: {task_id} 重试达到最大次数 即将发送告警邮件")
    await mail.send_post_mail(task_id, "新华网", str(e))
else:
    logger.info(F"[新华网]任务ID: {task_id} 20秒后进行第{self.retry}次重试")
    await asyncio.sleep(20)
    await self.do_spider(task_id)
async def create_xinhua_client(self, httpx_proxy: Optional[str]) -> XinHuaClient:
# 请求头
headers = {
"Accept": "application/json, text/javascript, */*; q=0.01", "Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", "Cache-Control": "no-cache", "Connection": "keep-alive",
"Cookie": "org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE=zh_CN; wdcid=7af5eba7b2f8b44b; arialoadData=false; acw_tc=2760778017108394678246790e1403779a009cc2c5fe412f126407bf171637",
"Host": "so.news.cn", "Pragma": "no-cache", "Referer": "https://so.news.cn/", "Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors", "Sec-Fetch-Site": "same-origin",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
"X-Requested-With": "XMLHttpRequest",
"sec-ch-ua": "\"Chromium\";v=\"122\", \"Not(A:Brand\";v=\"24\", \"Google Chrome\";v=\"122\"",
"sec-ch-ua-mobile": "?0", "sec-ch-ua-platform": "\"Windows\""}
client = XinHuaClient(headers=headers, cookie_dict=None, playwright_page=self.context_page)
return client
async def launch_browser(self,
chromium: BrowserType,
playwright_proxy: Optional[Dict],
user_agent: Optional[Dict],
headless: bool = True):
"""
:param chromium:
:param headless:
:param self:
:param playwright_proxy:
:param user_agent:
:return:
"""
# 浏览器对象
browser = await chromium.launch(proxy=playwright_proxy, headless=headless)
# 浏览器上下文
browser_context = await browser.new_context(
viewport={"width": 1920, "height": 1080},
user_agent=user_agent
)
return browser_context
async def do_search(self, task):
"""
:return:
"""
results = []
cur_page = 1
logger.info(F"[新华网]开始执行任务 ID: {task.id} 关键词: {task.keyword} 语言: {task.lang}")
self.client = await self.create_xinhua_client(None)
while True:
logger.info(F"[新华网]开始获取搜索结果 关键词: {task.keyword} 页码: {cur_page}")
search_datas = await self.client.search(keyword=task.keyword, cur_page=cur_page, lang=task.lang)
logger.info(F"[新华网]获取到{len(search_datas)}条搜索结果")
if not search_datas:
logger.info(F"[新华网]关键词: {task.keyword} 页码: {cur_page}没有搜索到数据")
break
index = -1
for i, data in enumerate(search_datas):
# 找到一个不是今天的数据就结束
if not date_format.is_today(data.get("pubtime")):
index = i
break
# 如果全都是今天的 就翻页
if index == -1:
# 搜索结果的最后一个依然是今天的 整个添加
results = results + search_datas
# 翻到下一页 继续找
cur_page = cur_page + 1
else:
# 搜索结果中有不是今天的 切割一部分添加
results = results + search_datas[:index]
# 结束本次搜索
break
logger.info(F"[新华网]关键词: {task.keyword} 搜索结束 总页码: {cur_page} 总条数: {len(results)}")
return results
async def cut_screen(self, url):
"""
:param url:
:return:
"""
if not self.image_path:
    image_path = config.IMAGE_PATH
    # use the configured directory; fall back to ./data only when config.IMAGE_PATH is blank
    self.image_path = "./data" if is_blank(image_path) else image_path
if not os.path.exists(self.image_path):
os.makedirs(self.image_path)
save_path = F"{self.image_path}/{uuid.uuid4()}.png"
# 开始截图
await self.context_page.goto(url)
await self.context_page.screenshot(path=save_path, full_page=True)
return save_path
async def do_spider(self, task_id):
# 获取任务信息
task = await get_task(task_id)
if not task:
logger.error(F"[新华网]任务ID: {task_id}不存在 任务结束")
return
logger.info(F"[新华网]任务ID: {task_id} 任务开始")
await running(task_id)
# 从api中获取数据
search_datas = await self.do_search(task)
if not search_datas:
logger.info(F"[新华网]任务ID: {task_id} 关键词:{task.keyword} 未搜索到结果 任务结束")
await complete(task_id)
return
# 保存result实体
results = []
# 启动浏览器
async with async_playwright() as playwright:
chromium = playwright.chromium
self.browser_context = await self.launch_browser(chromium, None, None, headless=True)
# 反反爬脚本
await self.browser_context.add_init_script(path="lib/stealth.min.js")
self.context_page: Page = await self.browser_context.new_page()
# 构建结果实体 截图
for data in search_datas:
result = gen_result(task, data.get("title"), data.get("url"), data.get("pubtime"))
# img_path = await self.cut_screen(data.get("url"))
# result.image = img_path
results.append(result)
# logger.info(F"[新华网]标题: {data.get('title')} 截图文件名: {img_path}")
# 结果落库
await save(results)
logger.info(F"[新华网]任务ID: {task_id} 关键词: {task.keyword} 保存{len(results)}条数据 任务结束")
await complete(task_id)

1
spiders/yang_shi/__init__.py

@ -0,0 +1 @@
# -*- coding: utf-8 -*-

160
spiders/yang_shi/client.py

@ -0,0 +1,160 @@
# -*- coding: utf-8 -*-
import asyncio
import json
from typing import Dict
from urllib.parse import urlencode

import httpx
from playwright.async_api import Page, async_playwright

from utils.utils import logger
from .exception import DataFetchError
class YangShiClient:
def __init__(self,
timeout=60,
proxies=None,
*,
playwright_page: Page,
cookie_dict: Dict[str, str]):
self.proxies = proxies
self.timeout = timeout
self.headers = {
"Accept": "application/json, text/plain, */*",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Cookie": "__jsluid_h=103d2323e283c476b59b2fdd3b9a5371; sso_c=0; sfr=1",
"Host": "search.people.cn",
"Content-Length": "163",
"Content-Type": "application/json",
"Origin": "http://search.people.cn",
"Pragma": "no-cache",
"Referer": "http://search.people.cn/s?keyword=%E4%B9%A1%E6%9D%91%E6%8C%AF%E5%85%B4&st=0&_=1710919073824",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
}
self._host = "https://search.cctv.com/"
self.playwright_page = playwright_page
self.cookie_dict = cookie_dict
async def request(self, method, url, **kwargs):
"""
:param method:
:param url:
:param kwargs:
:return:
"""
async with httpx.AsyncClient(proxies=self.proxies) as client:
response = await client.request(
method, url, timeout=self.timeout,
**kwargs
)
data: Dict = response.json()
if data.get("code") != "0":
raise DataFetchError(data.get("message", "未知错误"))
else:
return data.get("data", {})
async def get(self, uri: str, params=None) -> Dict:
"""
GET
:param uri:
:param params:
:return:
"""
final_uri = uri
if isinstance(params, dict):
final_uri = (f"{uri}?"
f"{urlencode(params)}")
return await self.request(method="GET", url=F"{self._host}{final_uri}", headers=self.headers)
async def post(self, uri: str, data: dict) -> Dict:
"""
POST
:param uri:
:param data:
:return:
"""
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
return await self.request(method="POST", url=F"{self._host}{uri}",
data=json_str, headers=self.headers)
async def search(self, keyword, cur_page):
"""
:param keyword:
:param cur_page:
:return:
"""
# 接口地址
uri = F"/search.php?qtext={keyword}&page={cur_page}&type=web&sort=date&datepid=1&channel=&vtime=-1&is_search=1"
full_url = F"{self._host}{uri}"
try:
await self.playwright_page.goto(full_url)
results = []
# 选择每一个结果元素
elements = await self.playwright_page.query_selector_all("div.tright")
for element in elements:
title = ""
url = ""
publish_time = ""
# 标题元素
tit = await element.query_selector(".tit")
if tit:
# 标题下面的链接
span = await tit.query_selector("span")
url = await span.get_attribute("lanmu1")
# 存放标题的a标签
tit_a = await span.query_selector("a")
if tit_a:
title = await tit_a.inner_text()
# 发布时间元素
tim = await element.query_selector(".src-tim .tim")
if tim:
tim_text = await tim.inner_text()
# assumes the element text looks like "发布时间：2024-03-21 10:00:00"
publish_time = tim_text.split("：")[-1].strip()
# 保存数据
results.append({
"keyword": keyword,
"title": title,
"url": url,
"publish_time": publish_time
})
return results
except Exception as e:
logger.error(F"[央视网]搜索方法异常: 关键词: {keyword} 页码: {cur_page} {full_url}")
logger.error(F"[央视网]错误信息: {str(e)}")
raise DataFetchError(str(e), full_url)
async def run():
    # 启动浏览器
    async with async_playwright() as playwright:
        chromium = playwright.chromium
        browser = await chromium.launch(headless=False)
        # 浏览器上下文
        browser_context = await browser.new_context(
            viewport={"width": 1920, "height": 1080},
            user_agent=""
        )
        # 反反爬脚本
        await browser_context.add_init_script(path="../../lib/stealth.min.js")
        context_page: Page = await browser_context.new_page()
        # 创建对象
        client = YangShiClient(playwright_page=context_page, cookie_dict={})
        result = await client.search("医保", 1)
        print(result)


if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(run())

19
spiders/yang_shi/exception.py

@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
from httpx import RequestError
class DataFetchError(RequestError):
"""未知异常"""
def __init__(self, message, url, method="GET", params=None):
self.message = message
self.url = url
self.method = method
self.params = params
def __str__(self):
return self.message
class IPBlockError(RequestError):
"""ip被封禁异常"""

185
spiders/yang_shi/spider.py

@ -0,0 +1,185 @@
# -*- coding: utf-8 -*-
from playwright.async_api import async_playwright, Page, BrowserType, BrowserContext
from base.base_spider import AbstractSpider
from typing import Dict, List, Optional, Tuple
from .client import YangShiClient
from utils.utils import logger, is_blank
from models.monitor_task_model import get_task, running, complete, fail
from models.monitor_result_model import gen_result, save
from base.enums import Platform
import utils.date_format as date_format
import os
import config
import uuid
from .exception import DataFetchError
import utils.mail as mail
import asyncio
from tortoise.transactions import in_transaction
class YangShiSpider(AbstractSpider):
"""
"""
client: YangShiClient # 请求对象
context_page: Page # 浏览器页面上下文
browser_context: BrowserContext # 浏览器上下文
image_path: str
def __init__(self):
self.index_url = "https://tv.cctv.com/"
self.platform = Platform.YANG_SHI
self.image_path = None
self.retry = 0 # 自旋次数
def init_config(self):
super().init_config()
async def start(self, task_id):
try:
async with in_transaction():
await self.do_spider(task_id)
except DataFetchError as e:
logger.error(F"[央视网]任务ID: {task_id} 获取数据异常")
logger.error(F"[央视网]任务ID: {task_id} 异常信息: {str(e)}")
# 尝试自旋
self.retry = self.retry + 1
if self.retry > 3:
await fail(task_id)
logger.error(F"[央视网]任务ID: {task_id} 重试达到最大次数 即将发送告警邮件")
await mail.send_post_mail(task_id, "央视网", str(e))
else:
logger.info(F"[央视网]任务ID: {task_id} 20秒后进行第{self.retry}次重试")
await asyncio.sleep(20)
await self.do_spider(task_id)
except Exception as e:
logger.error(F"[央视网]任务ID: {task_id} 爬虫异常")
logger.error(F"[央视网]任务ID: {task_id} 异常信息: {str(e)}")
# 切换代理ip并自旋
logger.error(F"[央视网]任务ID: {task_id} 获取数据异常")
logger.error(F"[央视网]任务ID: {task_id} 异常信息: {str(e)}")
# 尝试自旋
self.retry = self.retry + 1
if self.retry > 3:
await fail(task_id)
logger.error(F"[央视网]任务ID: {task_id} 重试达到最大次数 即将发送告警邮件")
await mail.send_post_mail(task_id, "央视网", str(e))
else:
logger.info(F"[央视网]任务ID: {task_id} 20秒后进行第{self.retry}次重试")
await asyncio.sleep(20)
await self.do_spider(task_id)
async def create_client(self, httpx_proxy: Optional[str]) -> YangShiClient:
# 请求头
client = YangShiClient(proxies=httpx_proxy, cookie_dict={}, playwright_page=self.context_page)
return client
async def launch_browser(self,
chromium: BrowserType,
playwright_proxy: Optional[Dict],
user_agent: Optional[Dict],
headless: bool = True):
"""
:param chromium:
:param headless:
:param self:
:param playwright_proxy:
:param user_agent:
:return:
"""
# 浏览器对象
browser = await chromium.launch(proxy=playwright_proxy, headless=headless)
# 浏览器上下文
browser_context = await browser.new_context(
viewport={"width": 1920, "height": 1080},
user_agent=user_agent
)
# 反反爬脚本
await browser_context.add_init_script(path="lib/stealth.min.js")
context_page = await browser_context.new_page()
return browser_context, context_page
async def do_search(self, task):
"""
:return:
"""
results = []
cur_page = 1
logger.info(F"[央视网]开始执行任务 ID: {task.id} 关键词: {task.keyword} 语言: {task.lang}")
self.client = await self.create_client(None)
while True:
logger.info(F"[央视网]任务ID: {task.id} 开始获取搜索结果 关键词: {task.keyword} 页码: {cur_page}")
search_datas = await self.client.search(keyword=task.keyword, cur_page=cur_page)
logger.info(F"[央视网]任务ID: {task.id} 获取到{len(search_datas)}条搜索结果")
if not search_datas:
logger.info(F"[央视网]任务ID: {task.id} 关键词: {task.keyword} 页码: {cur_page}没有搜索到数据")
break
index = -1
for i, data in enumerate(search_datas):
# 找到一个不是今天的数据就结束
if not date_format.is_today(data.get("publish_time")):
index = i
break
# 切割
if index == -1:
# 搜索结果的最后一个依然是今天的 整个添加
results = results + search_datas
# 翻到下一页 继续找
cur_page = cur_page + 1
else:
# 搜索结果中有不是今天的 切割一部分添加
results = results + search_datas[:index]
# 结束本次搜索
break
logger.info(F"[央视网]任务ID: {task.id} 关键词: {task.keyword} 搜索结束 总页码: {cur_page} 总条数: {len(results)}")
return results
async def cut_screen(self, url):
"""
:param url:
:return:
"""
if not self.image_path:
    image_path = config.IMAGE_PATH
    # use the configured directory; fall back to ./data only when config.IMAGE_PATH is blank
    self.image_path = "./data" if is_blank(image_path) else image_path
if not os.path.exists(self.image_path):
os.makedirs(self.image_path)
save_path = F"{self.image_path}/{uuid.uuid4()}.png"
# 开始截图
await self.context_page.goto(url)
await self.context_page.screenshot(path=save_path, full_page=True)
return save_path
async def do_spider(self, task_id):
# 获取任务信息
task = await get_task(task_id)
if not task:
logger.error(F"[央视网]任务ID: {task_id}不存在 任务结束")
return
logger.info(F"[央视网]任务ID: {task_id} 任务开始")
await running(task_id)
results = []
# 启动浏览器
async with async_playwright() as playwright:
chromium = playwright.chromium
self.browser_context, self.context_page = await self.launch_browser(chromium, None, None, headless=True)
# 创建请求客户端
search_datas = await self.do_search(task)
# 构建结果实体 截图
for data in search_datas:
result = gen_result(task, data.get("title"), data.get("url"), data.get("publish_time"))
# img_path = await self.cut_screen(data.get("url"))
# result.image = img_path
results.append(result)
# logger.info(F"[央视网] 任务ID: {task_id} 标题: {data.get('title')} 截图文件名: {img_path}")
# 结果落库
await save(results)
logger.info(F"[央视网] 任务ID: {task_id} 关键词: {task.keyword} 保存{len(results)}条数据 任务结束")
await complete(task_id)

1
utils/__init__.py

@ -0,0 +1 @@
# -*- coding: utf-8 -*-

102
utils/date_format.py

@ -0,0 +1,102 @@
# -*- coding: utf-8 -*-
from dateutil.parser import parse
from dateutil.relativedelta import relativedelta
from datetime import datetime, timedelta
import time
def gen_job_datetime(time_str, date_str=''):
"""
:param time_str:
:param date_str:
:return:
"""
if not time_str:
return None
return parse(date_str + time_str)
def gen_today_str():
today = datetime.today()
return today.strftime("%Y-%m-%d")
def timestamp():
return int(time.time())
def is_today(date_str):
publish_date = parse(date_str).date() # 获取日期部分,忽略时间部分
today = datetime.today().date() # 获取今天的日期,忽略时间部分
# 检查日期是否相等
return publish_date == today
def timestamp2date(timestamp_long):
d = datetime.utcfromtimestamp(float(timestamp_long / 1000))
return d
def today_timestamp_long():
"""
:return:
"""
start = parse("00:00")
end = start + timedelta(days=1)
return start.timestamp() * 1000, end.timestamp() * 1000
def parse_time(time_str):
return datetime.strptime(time_str, '%H:%M').time()
def eq_time(time1: str, time2: str):
time1 = datetime.strptime(time1, '%H:%M').time()
time2 = datetime.strptime(time2, '%H:%M').time()
today = datetime.today().date()
time1 = datetime.combine(today, time1)
time2 = datetime.combine(today, time2)
return time1 == time2
def ge_time(time1: str, time2: str):
"""
time1是否大于等于time2
:param time1:
:param time2:
:return:
"""
time1 = datetime.strptime(time1, '%H:%M').time()
time2 = datetime.strptime(time2, '%H:%M').time()
today = datetime.today().date()
time1 = datetime.combine(today, time1)
time2 = datetime.combine(today, time2)
return time1 >= time2
def lt_time(time1: str, time2: str):
"""
time1是否小于time2
:param time1:
:param time2:
:return:
"""
time1 = datetime.strptime(time1, '%H:%M').time()
time2 = datetime.strptime(time2, '%H:%M').time()
today = datetime.today().date()
time1 = datetime.combine(today, time1)
time2 = datetime.combine(today, time2)
return time1 < time2
if __name__ == '__main__':
print(lt_time("18:52", "23:55"))

83
utils/mail.py

@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-
import yagmail
import config
from utils.utils import logger
import inspect
from datetime import datetime
from models.monitor_task_model import get_task, complete
async def send_post_mail(task_id, name, message="出现异常"):
keyword = ""
task = await get_task(task_id)
if not task:
message = F"不存在的任务ID: {task_id}"
else:
keyword = task.keyword
mail_server = None
try:
mail_server = yagmail.SMTP(user=config.SMTP_USER, password=config.SMTP_PASSWORD, host=config.SMTP_HOST)
except Exception as e:
logger.error("[邮件]初始化失败 请检查邮件配置")
return
to = []
if not config.POST_EMAIL:
logger.warn("[邮件]未配置用于接收邮件上报的邮箱地址 邮件上报被取消")
return
if isinstance(config.POST_EMAIL, str):
to.append(config.POST_EMAIL)
elif isinstance(config.POST_EMAIL, list):
to = config.POST_EMAIL
else:
logger.warn("[邮件]未配置用于接收邮件上报的邮箱地址 邮件上报被取消")
return
title = F"HuoSpider上报邮件: {name}数据获取异常"
content = F"""
ID: {task_id}
: {name}
: {keyword}
: {message}
: {datetime.today().strftime("%Y-%m-%d %H:%M:%S")}
"""
inspect.cleandoc(content)
mail_server.send(to, title, content)
mail_server.close()
logger.info(f"[邮件]任务ID: {task_id} 异常上报邮件发送成功 收件人: {to}")
async def test(task_id, name, message="出现异常"):
keyword = "测试"
mail_server = None
try:
mail_server = yagmail.SMTP(user=config.SMTP_USER, password=config.SMTP_PASSWORD, host=config.SMTP_HOST)
except Exception as e:
logger.error("[邮件]初始化失败 请检查邮件配置")
return
to = []
if not config.POST_EMAIL:
logger.warn("[邮件]未配置用于接收邮件上报的邮箱地址 邮件上报被取消")
return
if isinstance(config.POST_EMAIL, str):
to.append(config.POST_EMAIL)
elif isinstance(config.POST_EMAIL, list):
to = config.POST_EMAIL
else:
logger.warn("[邮件]未配置用于接收邮件上报的邮箱地址 邮件上报被取消")
return
title = F"HuoSpider上报邮件: {name}数据获取异常"
content = F"""
ID: {task_id}
: {name}
: {keyword}
: {message}
: {datetime.today().strftime("%Y-%m-%d %H:%M:%S")}
"""
inspect.cleandoc(content)
mail_server.send(to, title, content)
mail_server.close()
logger.info(f"[邮件]任务ID: {task_id} 异常上报邮件发送成功 收件人: {to}")

113
utils/proxy.py

@ -0,0 +1,113 @@
# -*- coding: utf-8 -*-
from datetime import datetime, timedelta
import httpx
import retry
from dateutil.parser import parse
import config
from utils.utils import logger
ip_pool = []
# 代理池
class ProxyIp:
ip: str
port: int
expire: datetime
city: str
isp: str
def __init__(self, ip, port, expire, city='未知', isp='未知'):
self.ip = ip
self.port = port
self.expire = parse(expire)
self.city = city
self.isp = isp
def __str__(self):
return F"({self.city}-{self.isp}){self.ip}:{self.port} 过期时间:{self.expire}"
def is_expire(self):
now = datetime.now()
expire = self.expire - timedelta(seconds=20)
return expire < now
def to_httpx_proxies(self):
return {"http://": F"http://{self.ip}:{self.port}"}
class ProxyError(Exception):
def __init__(self, message, code=-1000):
self.code = code
self.message = message
def __str__(self):
return F"错误码: {self.code} 错误消息: {self.message}"
@retry.retry(exceptions=ProxyError, tries=3, delay=2, backoff=2)
def add_ip(count=1) -> ProxyIp:
"""
ip池中添加一个代理ip对象
:param count: 1
:return:
"""
url = "http://api.tianqiip.com/getip"
params = {
"secret": config.PROXY_SECRET, # 密钥
"sign": config.PROXY_SIGN, # 签名
"num": count, # 数量
"type": "json", # 返回类型
"port": 1, # 协议
"time": 3, # 时长三分钟
"ts": 1, # 显示过期时间
"mr": 1, # 去重
"cs": 1, # 显示位置
"ys": 1 # 显示运营商
}
ips = []
result: dict = httpx.get(url, params=params, proxies={}).json()
if not result['code'] == 1000:
logger.error("[IP池]API获取代理IP失败")
raise ProxyError(result['msg'], result['code'])
for data in result["data"]:
ip = ProxyIp(data['ip'], data['port'], data['expire'], city=data['city'], isp=data['isp'])
ip_pool.append(ip)
ips.append(ip)
logger.info(F"[IP池]新增代理IP {str(ip)}")
return ips[0]
def del_ip(index):
if index > len(ip_pool) - 1:
return
logger.error(f"[IP池]代理IP被删除: {ip_pool[index]}")
del ip_pool[index]
def get_ip(cache=True) -> ProxyIp:
"""
ip对象
:param cache: 使
:return:
"""
if not cache:
# 不使用缓存时 请求一个新的ip并放入池中 然后获取该ip
return add_ip()
# 从缓存中获取一个有效的ip
if not ip_pool:
return add_ip()
cur_ip = None
for index, ip in enumerate(ip_pool):
if not ip.is_expire():
# 没过期 返回
cur_ip = ip
break
if not cur_ip:
return add_ip()
logger.info(f"[IP池]从IP池中获取到代理IP: {cur_ip}")
return cur_ip
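
to_httpx_proxies only maps the "http://" scheme, so with this dictionary httpx would send https requests (for example the xinhua client, whose host is https://so.news.cn) directly rather than through the proxy. If https traffic should also go through the pool, a sketch of the extended mapping:

    def to_httpx_proxies(self):
        # route both http and https requests through the same proxy endpoint
        proxy_url = F"http://{self.ip}:{self.port}"
        return {"http://": proxy_url, "https://": proxy_url}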

109
utils/scheduler.py

@ -0,0 +1,109 @@
# -*- coding: utf-8 -*-
import datetime
import random
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from typing import Dict, List
import config
from models.monitor_task_db import MonitorTask
from spiders.xinhua.spider import XinHuaSpider
from spiders.renmin.spider import RenMinSpider
from spiders.yang_shi.spider import YangShiSpider
from utils.utils import logger
from base.base_spider import AbstractSpider
import utils.date_format as date_format
from base.enums import Platform
import logging
from datetime import timedelta
import copy
def singleton(cls):
instances = {}
def getinstance():
if cls not in instances:
instances[cls] = cls()
return instances[cls]
return getinstance
@singleton
class SchedulerManager:
scheduler: AsyncIOScheduler
def __init__(self):
# 调整调度器日志等级
ap_logger = logging.getLogger('apscheduler')
ap_logger.setLevel(logging.WARNING)
self.scheduler = AsyncIOScheduler()
def get_scheduler(self):
"""
:return:
"""
if not self.scheduler:
self.scheduler = AsyncIOScheduler()
return self.scheduler
def start(self, paused=False):
self.scheduler.start(paused)
def add_task(self, task: MonitorTask, offset=0, is_random=False):
"""
:param is_random:
:param offset:
:param task:
:return:
"""
scheduler = self.get_scheduler()
spider: AbstractSpider = None
if task.platform == Platform.XIN_HUA:
spider = XinHuaSpider()
elif task.platform == Platform.REN_MIN:
spider = RenMinSpider()
elif task.platform == Platform.YANG_SHI:
spider = YangShiSpider()
if not spider:
# logger.error(F"未知的平台: {task.platform} 任务id: {task.id}")
return
if not task.gather_time:
    logger.error(F"[调度器]采集时间不存在 任务id: {task.id}")
    return
if is_random:
offset = offset + random.randint(1, 29)
# 时间向后偏移
task_date_time = date_format.gen_job_datetime(task.gather_time)
task_date_time = task_date_time + timedelta(seconds=offset)
if task_date_time < datetime.datetime.now():
task_date_time = datetime.datetime.now() + datetime.timedelta(seconds=60)
# 添加定时任务
scheduler.add_job(spider.start, "date", run_date=task_date_time, kwargs={"task_id": task.id})
logger.info(
F"[调度器]注册定时任务 ID: {task.id} 执行时间: {task_date_time} {F'偏移{offset}秒后执行' if offset > 0 else ''}")
def add_tasks(self, tasks: List[MonitorTask], is_random=False):
# 按平台和关键词分组
group = {}
for task in tasks:
if task.platform not in group:
group[task.platform] = {}
if task.keyword not in group[task.platform]:
group[task.platform][task.keyword] = []
group[task.platform][task.keyword].append(task)
# 遍历每个关键词组
for platform, platform_group in group.items():
for keyword, task_list in platform_group.items():
sorted_task_list = sorted(task_list, key=lambda e: date_format.parse_time(e.gather_time))
# 判断最后一个任务是否在极限时间之前
if date_format.lt_time(sorted_task_list[-1].gather_time, config.MAX_GATHER_TIME):
# 创建一个补偿任务
new_task = copy.deepcopy(sorted_task_list[-1])
new_task.gather_time = config.MAX_GATHER_TIME
sorted_task_list.append(new_task)
for sorted_task in sorted_task_list:
self.add_task(sorted_task, 0, is_random)

60
utils/utils.py

@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
import logging
from logging.handlers import TimedRotatingFileHandler
import re
import os
def init_loging_config():
# 检查log文件夹是否存在
if not os.path.exists("./log"):
os.mkdir("./log")
# 创建一个handler,用于按日期写入日志文件
# 'W0' 表示每周滚动一次,'D' 表示每天滚动一次,'H' 表示每小时滚动一次,'M' 表示每分钟滚动一次
# 'midnight' 表示在午夜滚动,'h:m' 表示在指定的小时和分钟滚动
# backupCount 表示保留的日志文件的个数,超过后会删除最旧的日志文件
# when='D', interval=1, backupCount=7 表示每天滚动一次,并保留最近7天的日志文件
file_handler = TimedRotatingFileHandler('./log/huo_spider.log', when='D', interval=1, encoding='utf-8')
file_handler.setLevel(logging.DEBUG)
# 定义handler的输出格式
formatter = logging.Formatter('%(asctime)s [%(name)s] %(levelname)s %(message)s ')
file_handler.setFormatter(formatter)
level = logging.INFO
logging.basicConfig(
level=level,
format="%(asctime)s [%(name)s] %(levelname)s %(message)s ",
datefmt='[%Y-%m-%d %H:%M:%S]'
)
_logger = logging.getLogger("HuoSpider")
_logger.setLevel(level)
_logger.addHandler(file_handler)
return _logger
logger = init_loging_config()
def is_blank(val: str):
    """Return True when the value is None, empty, or whitespace-only."""
    if val is None:
        return True
    if not val.strip():
        return True
    return False
def count_characters(val):
"""
:param val:
:return:
"""
if not isinstance(val, str):
val = str(val)
chinese_pattern = re.compile(r'[\u4e00-\u9fa5]')
not_chinese_pattern = re.compile(r'[^\u4e00-\u9fa5]')
chinese = re.findall(chinese_pattern, val)
not_chinese = re.findall(not_chinese_pattern, val)
return len(chinese), len(not_chinese)