You cannot select more than 25 topics.
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							151 lines
						
					
					
						
							5.7 KiB
						
					
					
				
			
		
		
		
			
			
			
				
					
				
				
					
				
			
		
		
	
	
							151 lines
						
					
					
						
							5.7 KiB
						
					
					
				| # -*- coding: utf-8 -*- | |
| 
 | |
| import json | |
| from typing import Dict | |
| from urllib.parse import urlencode | |
| import httpx | |
| from httpx._exceptions import HTTPError, RequestError | |
| from playwright.async_api import Page | |
| from .exception import DataFetchError | |
| import asyncio | |
| import json | |
| import utils.date_format as date_format | |
| from utils.utils import count_characters | |
| from utils.utils import logger | |
| import utils.proxy as proxy | |
| import config | |
| 
 | |
| 
 | |
class RenMinClient:
    """Async HTTP client for the People's Daily (search.people.cn) search API.

    Requests are sent with ``httpx`` using a fixed browser-like header set.
    The ``Content-Length`` header is derived from the actual request body for
    every POST and omitted for every GET, so ``self.headers`` is never mutated
    while a request is being prepared.
    """

    def __init__(self,
                 timeout=60,
                 *,
                 playwright_page: "Page",
                 cookie_dict: Dict[str, str]):
        """
        :param timeout: per-request timeout passed to httpx
        :param playwright_page: Playwright page object, stored on the instance
        :param cookie_dict: cookie name -> value mapping, stored on the instance
        """
        self.timeout = timeout
        # Base header set. The "Content-Length" entry is only a placeholder:
        # _request_headers() replaces it (POST) or drops it (GET) per request.
        self.headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Cookie": "__jsluid_h=103d2323e283c476b59b2fdd3b9a5371; sso_c=0; sfr=1",
            "Host": "search.people.cn",
            "Content-Length": "163",
            "Content-Type": "application/json",
            "Origin": "http://search.people.cn",
            "Pragma": "no-cache",
            "Referer": "http://search.people.cn/s?keyword=%E4%B9%A1%E6%9D%91%E6%8C%AF%E5%85%B4&st=0&_=1710919073824",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        }
        self._host = "http://search.people.cn"
        self.playwright_page = playwright_page
        self.cookie_dict = cookie_dict

    @staticmethod
    def _dump_json(data: dict) -> str:
        """Serialize *data* as compact JSON, keeping non-ASCII characters unescaped."""
        return json.dumps(data, separators=(',', ':'), ensure_ascii=False)

    def _request_headers(self, body: str = None) -> Dict[str, str]:
        """Return a per-request copy of ``self.headers``.

        With a *body*, ``Content-Length`` is set to its UTF-8 byte length
        (kept at its original position in the header order). Without a body
        the ``Content-Length`` header is left out entirely.
        """
        length = None if body is None else str(len(body.encode('utf-8')))
        headers: Dict[str, str] = {}
        has_length = False
        for name, value in self.headers.items():
            if name.lower() == 'content-length':
                if length is not None:
                    headers[name] = length
                    has_length = True
                continue
            headers[name] = value
        if length is not None and not has_length:
            headers['Content-Length'] = length
        return headers

    async def request(self, method, url, **kwargs):
        """
        Send one HTTP request and unwrap the API envelope.

        :param method: HTTP method
        :param url: absolute URL
        :param kwargs: extra keyword arguments forwarded to httpx
        :return: the ``data`` field of the JSON response, or ``{}`` on HTTP 504
        :raises DataFetchError: on a non-200 status, a transport error, or an
            API response whose ``code`` is not ``"0"``
        :raises Exception: on any other failure (e.g. a non-JSON response)
        """
        # Route API traffic through a proxy only when enabled in config.
        proxies = proxy.get_ip().to_httpx_proxies() if config.API_PROXY else None
        try:
            async with httpx.AsyncClient(proxies=proxies) as client:
                response = await client.request(
                    method, url, timeout=self.timeout,
                    **kwargs
                )
                # This endpoint answers 504 when there is no data.
                if response.status_code == 504:
                    return {}
                if response.status_code != 200:
                    logger.error(F"[人民网]httpx异常[{response.status_code}]: [{method}]{url} 参数: {kwargs}")
                    raise DataFetchError("httpx异常", url, method, kwargs)
            data: Dict = response.json()
            if data.get("code") != "0":
                raise DataFetchError(data.get("message", "未知错误"), url)
            return data.get("data", {})
        except DataFetchError:
            # Raised deliberately above; let it through unchanged instead of
            # letting the generic handler below downgrade it to Exception.
            raise
        except HTTPError as e:
            # Log the method/url we were given: e.request is not guaranteed
            # to be populated on every httpx error.
            logger.error(F"[人民网]httpx异常: [{method}]{url} 参数: {kwargs}")
            logger.error(F"[人民网]错误信息{str(e)}")
            raise DataFetchError(str(e), url) from e
        except Exception as e:
            logger.error(F"[人民网]未知的请求方法异常: [{method}]{url} 参数: {kwargs}")
            logger.error(F"[人民网]错误信息{str(e)}")
            raise Exception(str(e)) from e

    async def get(self, uri: str, params=None) -> Dict:
        """
        Send a GET request.

        :param uri: path relative to the API host
        :param params: optional dict encoded into the query string
        :return: result of :meth:`request`
        """
        final_uri = uri
        if isinstance(params, dict):
            final_uri = f"{uri}?{urlencode(params)}"
        # A GET carries no body, so it must not announce a Content-Length.
        return await self.request(method="GET", url=F"{self._host}{final_uri}",
                                  headers=self._request_headers())

    async def post(self, uri: str, data: dict) -> Dict:
        """
        Send a POST request with a compact JSON body.

        :param uri: path relative to the API host
        :param data: payload serialized to JSON
        :return: result of :meth:`request`
        """
        json_str = self._dump_json(data)
        return await self.request(method="POST", url=F"{self._host}{uri}",
                                  data=json_str,
                                  headers=self._request_headers(json_str))

    async def search(self, keyword, cur_page):
        """
        Search articles by keyword.

        :param keyword: search keyword
        :param cur_page: page number
        :return: list of records; empty when the API returns nothing
        """
        uri = '/search-platform/front/search'
        get_param = {
            'key': keyword,
            'startTime': 0,
            'endTime': 0,
            'hasContent': True,
            'hasTitle': True,
            'isFuzzy': False,  # exact match
            'limit': 10,
            'page': cur_page,
            'sortType': 0,
            'type': 0
        }
        # Logged for diagnostics only; post() computes the header value from
        # the same serialization, so the two always agree.
        content_length = len(self._dump_json(get_param).encode('utf-8'))

        logger.info(F"[人民网]请求长度: {content_length}")
        logger.info(F"[人民网]参数: {get_param}")
        content = await self.post(uri, get_param)
        return (content or {}).get('records') or []
| 
 | |
| 
 | |
if __name__ == '__main__':
    # Manual smoke run: query the first result page for a sample keyword.
    # NOTE(review): start/end are computed but not used by the search below.
    start, end = date_format.today_timestamp_long()
    demo_client = RenMinClient(cookie_dict={}, playwright_page=None)
    asyncio.run(demo_client.search(keyword='乡村发展', cur_page=1))