# -*- coding: utf-8 -*-
import json
from typing import Dict
from urllib.parse import urlencode
import httpx
from httpx._exceptions import HTTPError, RequestError
from playwright.async_api import Page
from .exception import DataFetchError
import asyncio
import json
import utils.date_format as date_format
from utils.utils import count_characters
from utils.utils import logger
import utils.proxy as proxy
import config
class RenMinClient:
    """Async HTTP client for the People's Daily (search.people.cn) search API.

    Requests are sent with a fixed browser-like header set; an upstream HTTP
    proxy is used when ``config.API_PROXY`` is enabled.
    """

    def __init__(self,
                 timeout=60,
                 *,
                 playwright_page: Page,
                 cookie_dict: Dict[str, str]):
        """
        :param timeout: per-request timeout in seconds
        :param playwright_page: Playwright page (kept for interface parity; not
            used by the HTTP calls in this class)
        :param cookie_dict: cookies captured from the browser session
        """
        self.timeout = timeout
        # Browser-like headers; the hard-coded Content-Length is recomputed
        # per-request in search() before each POST.
        self.headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Cookie": "__jsluid_h=103d2323e283c476b59b2fdd3b9a5371; sso_c=0; sfr=1",
            "Host": "search.people.cn",
            "Content-Length": "163",
            "Content-Type": "application/json",
            "Origin": "http://search.people.cn",
            "Pragma": "no-cache",
            "Referer": "http://search.people.cn/s?keyword=%E4%B9%A1%E6%9D%91%E6%8C%AF%E5%85%B4&st=0&_=1710919073824",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        }
        self._host = "http://search.people.cn"
        self.playwright_page = playwright_page
        self.cookie_dict = cookie_dict

    async def request(self, method, url, **kwargs):
        """Send one HTTP request and unwrap the API envelope.

        :param method: HTTP method name (e.g. "GET", "POST")
        :param url: absolute URL
        :param kwargs: extra arguments forwarded to httpx (headers, content, ...)
        :return: the ``data`` field of the JSON envelope, or ``{}`` on HTTP 504
        :raises DataFetchError: on non-200 status or an API-level error code
        """
        # Route through an API proxy when configured.
        proxies = proxy.get_ip().to_httpx_proxies() if config.API_PROXY else None
        try:
            async with httpx.AsyncClient(proxies=proxies) as client:
                response = await client.request(
                    method, url, timeout=self.timeout,
                    **kwargs
                )
                # people.cn answers 504 when the query simply has no data,
                # so treat it as an empty result rather than an error.
                if response.status_code == 504:
                    return {}
                if response.status_code != 200:
                    logger.error(F"[人民网]httpx异常[{response.status_code}]: [{method}]{url} 参数: {kwargs}")
                    raise DataFetchError("httpx异常", url, method, kwargs)
                data: Dict = response.json()
                # API-level status: "0" means success.
                if data.get("code") != "0":
                    raise DataFetchError(data.get("message", "未知错误"), url)
                return data.get("data", {})
        except HTTPError as e:
            # BUGFIX: e.request raises RuntimeError when the error carries no
            # request object — log the arguments we already have instead.
            logger.error(F"[人民网]httpx异常: [{method}]{url} 参数: {kwargs}")
            logger.error(F"[人民网]错误信息{str(e)}")
            raise DataFetchError(str(e), url)
        except Exception as e:
            logger.error(F"[人民网]未知的请求方法异常: [{method}]{url} 参数: {kwargs}")
            logger.error(F"[人民网]错误信息{str(e)}")
            # Chain the original exception so the traceback is preserved.
            raise Exception(str(e)) from e

    async def get(self, uri: str, params=None) -> Dict:
        """GET request.

        :param uri: path relative to the host
        :param params: optional query parameters (dict)
        :return: unwrapped response data
        """
        final_uri = uri
        if isinstance(params, dict):
            final_uri = (f"{uri}?"
                         f"{urlencode(params)}")
        return await self.request(method="GET", url=F"{self._host}{final_uri}", headers=self.headers)

    async def post(self, uri: str, data: dict) -> Dict:
        """POST request with a compact JSON body.

        :param uri: path relative to the host
        :param data: payload serialized as minified JSON (non-ASCII preserved)
        :return: unwrapped response data
        """
        json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
        # BUGFIX: httpx deprecated passing a raw string via `data=`;
        # `content=` is the supported way to send a pre-serialized body.
        return await self.request(method="POST", url=F"{self._host}{uri}",
                                  content=json_str, headers=self.headers)

    async def search(self, keyword, cur_page):
        """Search articles by keyword.

        :param keyword: search keyword (exact match, isFuzzy=False)
        :param cur_page: 1-based page number
        :return: list of record dicts (empty when nothing was found)
        """
        uri = '/search-platform/front/search'
        get_param = {
            'key': keyword,
            'startTime': 0,
            'endTime': 0,
            'hasContent': True,
            'hasTitle': True,
            'isFuzzy': False,  # exact-match search
            'limit': 10,
            'page': cur_page,
            'sortType': 0,  # 0 = newest first
            'type': 0
        }
        # The endpoint appears to require a manually computed Content-Length:
        # 127 bytes of fixed payload + 3 bytes per CJK char + 1 per other char
        # of the keyword (126 + 1 extra byte because isFuzzy is False).
        chinese, not_chinese = count_characters(keyword)
        content_length = 126 + (chinese * 3) + not_chinese + 1
        # BUGFIX: count_characters expects a string; cur_page is an int.
        chinese, not_chinese = count_characters(str(cur_page))
        content_length = content_length + not_chinese
        logger.info(F"[人民网]请求长度: {content_length}")
        logger.info(F"[人民网]参数: {get_param}")
        self.headers['Content-Length'] = str(content_length)
        content = await self.post(uri, get_param)
        if not content or not content.get('records'):
            return []
        return content.get('records', [])
if __name__ == '__main__':
    # Ad-hoc smoke test: performs a real network request against people.cn.
    # BUGFIX: dropped the unused `start, end = date_format.today_timestamp_long()`
    # assignment — search() accepts no date range, so the values were dead.
    client = RenMinClient(playwright_page=None, cookie_dict={})
    asyncio.run(client.search('乡村发展', 1))