You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
89 lines
3.1 KiB
89 lines
3.1 KiB
# _*_ coding: utf-8 _*_
|
|
|
|
import execjs
|
|
import requests
|
|
from loguru import logger
|
|
import os
|
|
from datetime import datetime
|
|
import config
|
|
import json
|
|
|
|
from utils import CookieUtils
|
|
|
|
|
|
# sign.js is the reconstructed signing-algorithm code referred to above; save it yourself next to this script.
|
|
def load_js(file_path="sign.js"):
    """Read a JavaScript source file and compile it with ``execjs``.

    Args:
        file_path: Path of the JavaScript file to load; defaults to
            ``sign.js`` in the current working directory.

    Returns:
        The object returned by ``execjs.compile`` for the file's contents.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        source = handle.read()
    # The file is closed before compilation; only the text is needed.
    compiled = execjs.compile(source)
    return compiled
|
|
|
|
|
|
def generate_signature(keyword, page, js_context):
    """Call the JavaScript ``get_signature`` function for a keyword and page.

    Args:
        keyword: Search keyword forwarded to the JS function.
        page: Page number forwarded to the JS function.
        js_context: Object exposing ``call(name, *args)``, e.g. the value
            returned by ``load_js``.

    Returns:
        Whatever ``js_context.call`` returns for ``get_signature``.
    """
    call_args = ('get_signature', keyword, page)
    signature = js_context.call(*call_args)
    return signature
|
|
|
|
|
|
def perform_search(cookies, keyword, page, js_context, *, page_size=10, timeout=15):
    """Fetch one page of search results from ``tousu.sina.com.cn/api/index/s``.

    A signature is generated for ``(keyword, page)`` via ``generate_signature``
    and sent together with the keyword and paging parameters.

    Args:
        cookies: Cookies passed unchanged to ``requests.get``.
        keyword: Search keyword, sent as the ``keywords`` query parameter.
        page: Page number to request; sent as a string.
        js_context: Compiled JS context handed to ``generate_signature``.
        page_size: Results requested per page. Defaults to 10, the value
            that used to be hard-coded.
        timeout: Seconds ``requests.get`` may wait before giving up, so a
            stalled connection cannot block the crawl forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: If the status code is not 200 or the body is not JSON.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    headers = {
        "accept": "*/*",
        "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "priority": "u=1, i",
        "referer": "https://cq.tousu.sina.com.cn/index/search/?keywords=%E5%92%B8%E9%B1%BC%E4%B9%8B%E7%8E%8B&t=1",
        "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
        # NOTE(review): the original author's note here says "obtain it
        # yourself" -- presumably these header values should be copied from
        # your own browser session; confirm.
        "x-requested-with": "XMLHttpRequest",
    }

    signature_data = generate_signature(keyword, page, js_context)
    logger.info(f'抓取第 {page} 页.')
    params = {
        'ts': signature_data['ts'],
        'rs': signature_data['rs'],
        'signature': signature_data['signature'],
        'keywords': keyword,
        'page_size': str(page_size),
        'page': str(page),
    }

    response = requests.get(
        'https://tousu.sina.com.cn/api/index/s',
        cookies=cookies,
        params=params,
        headers=headers,
        timeout=timeout,
    )
    if response.status_code != 200:
        raise RuntimeError(f"响应异常 状态码: {response.status_code}")
    try:
        return response.json()
    except ValueError as err:
        # JSON decode errors raised by requests are ValueError subclasses;
        # chain the cause so the original parse error stays visible.
        raise RuntimeError(f"Json解析异常 响应体: {response.text}") from err
|
|
|
|
|
|
def process_search_results(cookies, keyword, max_page):
    """Fetch pages 1 through ``max_page`` (inclusive) for one keyword.

    The JS signing context is loaded once with ``load_js`` and reused for
    every page. A page that fails is logged and skipped, so the crawl
    continues with the remaining pages.

    Args:
        cookies: Cookies forwarded to ``perform_search``.
        keyword: Search keyword to query.
        max_page: Number of the last page to fetch.

    Returns:
        A list with the ``perform_search`` result of every page that
        succeeded, in page order.
    """
    datas = []
    js_context = load_js()
    # range() excludes its stop value; +1 makes the last page match the
    # "[page/max_page]" progress shown in the log line below.
    for page in range(1, max_page + 1):
        try:
            result = perform_search(cookies, keyword, page, js_context)
        except Exception as e:
            # Deliberate best-effort: one bad page must not abort the crawl.
            logger.error(f"出现异常: 关键词: {keyword} 页码: {page}")
            logger.error(f"异常信息: {str(e)}")
        else:
            datas.append(result)
            logger.info(f'搜索结果({keyword}[{page}/{max_page}]): {result}')
    return datas
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: crawl every configured keyword and save each
    # keyword's results to its own timestamped JSON file under ./data.
    cookies = CookieUtils.cookie_str_to_dict(config.COOKIE)
    # exist_ok=True replaces the separate exists() check and its race window.
    os.makedirs('./data', exist_ok=True)
    for k in config.KEYWORDS:
        all_result = process_search_results(cookies, k, config.MAX_PAGE)
        json_string = json.dumps(all_result, indent=4, ensure_ascii=False)
        timestamp = datetime.now().strftime('%Y_%m_%d_%H%M%S')
        with open(f"./data/{k}_{timestamp}.json", 'w', encoding='utf-8') as file:
            # json_string already holds the whole result list, so write it
            # once. Writing it once per item duplicated the dump and left
            # the file as invalid JSON.
            file.write(f"{json_string}\n")
        logger.success(f"{k} 查询结果已保存")
|
|
|