# -*- coding: utf-8 -*-
import asyncio
import json
from typing import Dict
from urllib.parse import urlencode

import httpx
from httpx import HTTPError
from playwright.async_api import Page

import config
import utils.date_format as date_format
import utils.proxy as proxy
from utils.utils import count_characters, logger

from .exception import DataFetchError


class RenMinClient:
    def __init__(self, timeout=60, *, playwright_page: Page, cookie_dict: Dict[str, str]):
        self.timeout = timeout
        self.headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Cookie": "__jsluid_h=103d2323e283c476b59b2fdd3b9a5371; sso_c=0; sfr=1",
            "Host": "search.people.cn",
            "Content-Length": "163",
            "Content-Type": "application/json",
            "Origin": "http://search.people.cn",
            "Pragma": "no-cache",
            "Referer": "http://search.people.cn/s?keyword=%E4%B9%A1%E6%9D%91%E6%8C%AF%E5%85%B4&st=0&_=1710919073824",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        }
        self._host = "http://search.people.cn"
        self.playwright_page = playwright_page
        self.cookie_dict = cookie_dict

    async def request(self, method, url, **kwargs):
        """
        Send an HTTP request.
        :param method: HTTP method
        :param url: request URL
        :param kwargs: extra request arguments
        :return: parsed response data
        """
        # Route the request through an API proxy if one is configured.
        proxies = proxy.get_ip().to_httpx_proxies() if config.API_PROXY else None
        try:
            async with httpx.AsyncClient(proxies=proxies) as client:
                response = await client.request(
                    method, url, timeout=self.timeout, **kwargs
                )

                # people.cn returns 504 when the query has no data.
                if response.status_code == 504:
                    # logger.error(f"[人民网] blacklist error: [{method}]{url} params: {kwargs}")
                    # raise DataFetchError("blacklist error", url, method, kwargs)
                    return {}

                if response.status_code != 200:
                    logger.error(f"[人民网] httpx error [{response.status_code}]: [{method}]{url} params: {kwargs}")
                    raise DataFetchError("httpx error", url, method, kwargs)

                data: Dict = response.json()
                if data.get("code") != "0":
                    raise DataFetchError(data.get("message", "unknown error"), url)
                return data.get("data", {})
        except HTTPError as e:
            logger.error(f"[人民网] httpx error: [{e.request.method}]{e.request.url} params: {kwargs}")
            logger.error(f"[人民网] error message: {str(e)}")
            raise DataFetchError(str(e), url)
        except Exception as e:
            logger.error(f"[人民网] unexpected request error: [{method}]{url} params: {kwargs}")
            logger.error(f"[人民网] error message: {str(e)}")
            raise Exception(str(e))

    async def get(self, uri: str, params=None) -> Dict:
        """
        GET request.
        :param uri: request path
        :param params: query parameters
        :return: parsed response data
        """
        final_uri = uri
        if isinstance(params, dict):
            final_uri = f"{uri}?{urlencode(params)}"
        return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=self.headers)

    async def post(self, uri: str, data: dict) -> Dict:
        """
        POST request.
        :param uri: request path
        :param data: request body
        :return: parsed response data
        """
        json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
        return await self.request(method="POST", url=f"{self._host}{uri}", data=json_str, headers=self.headers)

    async def search(self, keyword, cur_page):
        """
        Search people.cn.
        :param keyword: search keyword
        :param cur_page: page number
        :return: list of result records
        """
        # Search API endpoint
        uri = '/search-platform/front/search'
        get_param = {
            'key': keyword,
            'startTime': 0,
            'endTime': 0,
            'hasContent': True,
            'hasTitle': True,
            'isFuzzy': False,  # exact match
            'limit': 10,
            'page': cur_page,
            'sortType': 0,  # 0: newest first, 1: oldest first
            'type': 0
        }
        # Content-Length = 127 + Chinese chars * 3 + other chars * 1
        # Keyword part; the extra byte accounts for isFuzzy being False.
        chinese, not_chinese = count_characters(keyword)
        content_length = 126 + (chinese * 3) + not_chinese + 1
        # Page-number part
        chinese, not_chinese = count_characters(str(cur_page))
        content_length = content_length + not_chinese
        logger.info(f"[人民网] request length: {content_length}")
        logger.info(f"[人民网] params: {get_param}")
        self.headers['Content-Length'] = str(content_length)
        content = await self.post(uri, get_param)
        if not content or not content.get('records'):
            return []
        return content.get('records', [])


if __name__ == '__main__':
    client = RenMinClient(playwright_page=None, cookie_dict={})
    start, end = date_format.today_timestamp_long()
    asyncio.run(client.search('乡村发展', 1))