From 556e082319868e9e9d3fdba339cd8f21fe0903e2 Mon Sep 17 00:00:00 2001 From: zzx Date: Mon, 24 Feb 2025 10:45:13 +0800 Subject: [PATCH] =?UTF-8?q?:tada:=20=E5=88=9D=E5=A7=8B=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 67 ++++++++++++++++++++++++++ config.py | 5 ++ main.py | 87 ++++++++++++++++++++++++++++++++++ requirements.txt | 11 +++++ sign.js | 38 +++++++++++++++ test2.py | 117 ++++++++++++++++++++++++++++++++++++++++++++++ utils/__init__.py | 2 + utils/cookie_utils.py | 34 ++++++++++++++ 9 files changed, 488 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 config.py create mode 100644 main.py create mode 100644 requirements.txt create mode 100644 sign.js create mode 100644 test2.py create mode 100644 utils/__init__.py create mode 100644 utils/cookie_utils.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7633447 --- /dev/null +++ b/.gitignore @@ -0,0 +1,127 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class +data +.idea +.vscode + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the executable, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +doc/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Poetry +# Uncomment if you are NOT using Poetry +#poetry.lock + +# End of file \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..0816a09 --- /dev/null +++ b/README.md @@ -0,0 +1,67 @@ +## 安装步骤 + +### 1. 安装 Python +确保你已安装 Python 3.9 或更高版本。你可以从 [Python 官方网站](https://www.python.org/downloads/) 下载并安装。 + +验证 Python 是否安装成功: +```bash +python --version +``` + +### 2. 安装 virtualenv +`virtualenv` 是一个用于创建隔离的 Python 环境的工具。如果尚未安装,请运行以下命令: +```bash +pip install virtualenv +``` + +### 3. 
创建虚拟环境 +在项目根目录下运行以下命令,创建一个虚拟环境: +```bash +virtualenv venv +``` + +激活虚拟环境: +- **Windows**: + ```bash + .\venv\Scripts\activate + ``` +- **Linux/macOS**: + ```bash + source venv/bin/activate + ``` + +### 4. 安装依赖 +在虚拟环境中,运行以下命令安装项目依赖: +```bash +pip install -r requirements.txt +``` + +`requirements.txt` 文件包含了项目所需的所有依赖及其版本。确保该文件位于项目根目录下。 + +### 5. 运行项目 +在虚拟环境中,运行以下命令启动项目: +```bash +python main.py +``` + +如果项目包含多个脚本,根据需要运行相应的脚本文件。 + +## 示例结构 +以下是项目的文件结构示例: +``` +project-root/ +├── README.md +├── requirements.txt +├── main.py +├── data/ +│ └── example.txt +└── venv/ +``` + +## 注意事项 +1. **虚拟环境**:始终在虚拟环境中运行项目,以避免全局环境的依赖冲突。 +2. **依赖更新**:如果需要更新依赖,请运行以下命令: + ```bash + pip install --upgrade -r requirements.txt + ``` +3. **退出虚拟环境**:运行 `deactivate` 命令退出虚拟环境。 diff --git a/config.py b/config.py new file mode 100644 index 0000000..ac1a63d --- /dev/null +++ b/config.py @@ -0,0 +1,5 @@ +# coding=utf-8 + +KEYWORDS = ['咸鱼之王'] +MAX_PAGE = 10 +COOKIE = 'UOR=www.baidu.com,tousu.sina.com.cn,; SINAGLOBAL=180.109.135.223_1734009009.184774; Apache=180.109.135.223_1734009009.184775; ULV=1737698014297:2:1:1:180.109.135.223_1734009009.184775:1734009008920; ALF=02_1742823137; SCF=AnbhzHKrnUQl7Hr1ketFkwfNrrNrnZoluPHwHKF6Cd5jepxd4jnBttaKovu1rtniHOjeKih3dtFzpJfX3fnoexE.; SUB=_2A25KvaGxDeRhGeVO6VQW9S7FzD2IHXVpsrt5rDV_PUJbkNAbLRGkkW9NTWl_zXi3K6i7_10g-b280K9gc5zMpYjq; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFTxn..WU0jomaP4CuWPAC85NHD95Q0ehzcS0-71KMpWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNMcxo1g-NIPLNd7tt; U_TRS1=000000d8.6b0c8bd59.67b9d1e3.a609672c; U_TRS2=000000d8.6b148bd59.67b9d1e3.70e955ed; HM-AMT=%7B%22amt%22%3A24089921%2C%22amt24h%22%3A15814%2C%22v%22%3A%222.3.168%22%2C%22vPcJs%22%3A%221.6.79%22%2C%22vPcCss%22%3A%221.2.393%22%7D' diff --git a/main.py b/main.py new file mode 100644 index 0000000..ac4adaa --- /dev/null +++ b/main.py @@ -0,0 +1,87 @@ +# _*_ coding: utf-8 _*_ + +import execjs +import requests +from loguru import logger +import os +from datetime import datetime +import config + +from utils 
import CookieUtils + + +# sign.js就是上面还原的算法代码,自行保存即可 +def load_js(file_path="sign.js"): + with open(file_path, "r", encoding="utf-8") as js_file: + js_code = js_file.read() + return execjs.compile(js_code) + + +def generate_signature(keyword, page, js_context): + return js_context.call('get_signature', keyword, page) + + +def perform_search(cookies, keyword, page, js_context): + headers = { + "accept": "*/*", + "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7", + "cache-control": "no-cache", + "pragma": "no-cache", + "priority": "u=1, i", + "referer": "https://cq.tousu.sina.com.cn/index/search/?keywords=%E5%92%B8%E9%B1%BC%E4%B9%8B%E7%8E%8B&t=1", + "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"", + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": "\"macOS\"", + "sec-fetch-dest": "empty", + "sec-fetch-mode": "cors", + "sec-fetch-site": "same-origin", + "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36", + # 自行获取 + "x-requested-with": "XMLHttpRequest" + } + + signature_data = generate_signature(keyword, page, js_context) + logger.info(f'抓取第 {page} 页.') + params = { + 'ts': signature_data['ts'], + 'rs': signature_data['rs'], + 'signature': signature_data['signature'], + 'keywords': keyword, + 'page_size': '10', + 'page': str(page), + } + + response = requests.get('https://tousu.sina.com.cn/api/index/s', cookies=cookies, params=params, headers=headers) + if not response.status_code == 200: + raise RuntimeError(f"响应异常 状态码: {response.status_code}") + try: + return response.json() + except Exception: + raise RuntimeError(f"Json解析异常 响应体: {response.text}") + + +def process_search_results(cookies, keyword, max_page): + datas = [] + js_context = load_js() + for page in range(1, max_page): + try: + result = perform_search(cookies, keyword, page, js_context) + datas.append(result) + logger.info(f'搜索结果({keyword}[{page}/{max_page}]): {result}') + 
except Exception as e: + logger.error(f"出现异常: 关键词: {keyword} 页码: {page}") + logger.error(f"异常信息: {str(e)}") + return datas + + +if __name__ == '__main__': + cookies = CookieUtils.cookie_str_to_dict(config.COOKIE) + if not os.path.exists('./data'): + os.mkdir('./data') + for k in config.KEYWORDS: + all_result = process_search_results(cookies, k, config.MAX_PAGE) + with open(f"./data/{k}_{datetime.now().strftime('%Y_%m_%d_%H%M%S')}.json", 'w', encoding='utf-8') as file: + for item in all_result: + file.write(f"{item}\n") + logger.success(f"{k} 查询结果已保存") + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..ceef0c0 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +certifi==2025.1.31 +charset-normalizer==3.4.1 +colorama==0.4.6 +idna==3.10 +loguru==0.7.3 +PyExecJS==1.5.1 +requests==2.32.3 +six==1.17.0 +urllib3==2.3.0 +wheel==0.41.2 +win32_setctime==1.2.0 \ No newline at end of file diff --git a/sign.js b/sign.js new file mode 100644 index 0000000..f099340 --- /dev/null +++ b/sign.js @@ -0,0 +1,38 @@ +const crypto = require('crypto'); + +function sha256Hash(input) { + return crypto.createHash('sha256').update(input).digest('hex'); +} + +function get_signature(keyword, page) { + // 获取当前时间戳 + const u = new Date().getTime(); + + // 生成随机字符串 + function generateRandomString(e, t, r) { + let n = ""; + let i = t; + const a = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; + if (e) { + i = Math.round(Math.random() * (r - t)) + t; + } + for (let o = 0; o < i; o++) { + n += a[Math.round(Math.random() * (a.length - 1))]; + } + return n; + } + + const l = generateRandomString(false, 16); + + // 将输入参数排序并连接成字符串 + const inputStr = [u, l, '$d6eb7ff91ee257475%', keyword, 10, page].sort().join(""); + + // 使用 SHA-256 算法生成签名 + const signature = sha256Hash(inputStr); + + return { + 'ts': u, + 'rs': l, + 'signature': signature + }; +} diff --git a/test2.py b/test2.py new file mode 100644 index 0000000..f8c3c0b --- /dev/null +++ b/test2.py 
@@ -0,0 +1,117 @@ +import requests +import random +import hashlib +import time +import json +from bs4 import BeautifulSoup + + +cookies = { + #自己复制 + "UOR": "www.baidu.com,tousu.sina.com.cn,", + "SINAGLOBAL": "180.109.135.223_1734009009.184774", + "Apache": "180.109.135.223_1734009009.184775", + "ULV": "1737698014297:2:1:1:180.109.135.223_1734009009.184775:1734009008920", + "ALF": "02_1742823137", + "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WFTxn..WU0jomaP4CuWPAC85NHD95Q0ehzcS0-71KMpWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNMcxo1g-NIPLNd7tt", + "U_TRS1": "000000d8.6b0c8bd59.67b9d1e3.a609672c", + "U_TRS2": "000000d8.6b148bd59.67b9d1e3.70e955ed" +} + +headers = { + #自己复制 + "accept": "*/*", + "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7", + "cache-control": "no-cache", + "pragma": "no-cache", + "priority": "u=1, i", + "referer": "https://cq.tousu.sina.com.cn/index/search/?keywords=%E5%92%B8%E9%B1%BC%E4%B9%8B%E7%8E%8B&t=1", + "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"", + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": "\"macOS\"", + "sec-fetch-dest": "empty", + "sec-fetch-mode": "cors", + "sec-fetch-site": "same-origin", + "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36", # 自行获取 + "x-requested-with":"XMLHttpRequest" +} + +#[l, p, b, h, c, d["type" + e]].sort().join("") +def generate_random_string(e=True, t=4, r=16): + chars = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + length = t if not e else random.randint(t, r) + return ''.join(random.choice(chars) for _ in range(length)) + +def get_sha256(value): + """ + sha256加密 + :param value: 加密字符串 + :return: 加密结果转换为16进制字符串,并大写 + """ + hsobj = hashlib.sha256() + hsobj.update(value.encode("utf-8")) + return hsobj.hexdigest() + +requests.packages.urllib3.disable_warnings() +sessions=requests.session() +data=[] +number=0 +for i in range(1,101):#1524 + print(i) + url_list=[] 
+ if len(data)%50==0 and len(data)!=0: + time.sleep(60) + while True: + ts=str(int(time.time() * 1000))#ts,时间戳 + l=ts + rs=generate_random_string(True, 4, 16) + p=rs#rs + b = '$d6eb7ff91ee257475%' + h='外卖 食品安全'#keywords + c='10'#page_size + d=str(i)#d["type" + e]=page + signature=''.join(sorted([l, p, b, h, c, d])) + signature=get_sha256(signature) + params = { + 'ts': ts, + 'rs': rs, + 'signature': signature, + 'keywords': h, + 'page_size': c, + 'page': d, + } + try: + response = sessions.get('https://tousu.sina.com.cn/api/index/s',cookies=cookies, + headers=headers,params=params,verify=False,allow_redirects=False) + response=json.loads(response.text)['result']['data']['lists'] + #print(response) + for n in range(len(response)): + if response[n]['main']['evaluate_u']==None: + number+=1 + continue + else: + url=response[n]['main']['url'] + url_list.append(url) + number+=1 + break + except Exception as e: + print(e,response.text,i) + time.sleep(300) + continue + for url in url_list: + while True: + try: + response = sessions.get('https:'+url,cookies=cookies,headers=headers,verify=False,allow_redirects=False) + soup = BeautifulSoup(response.text, 'html.parser') + u_date_elements = soup.find_all(class_='u-date') + u_list=soup.find('ul', class_='ts-q-list') + c_num=u_list.find_all('li')[0].text + endtime=u_date_elements[2].text + starttime=u_date_elements[6].text + data.append([starttime,endtime,c_num]) + break + except Exception as e: + print(e,response.text,i) + time.sleep(60) + continue +data=pd.DataFrame(data,columns=['starttime','endtime','c_num']) \ No newline at end of file diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..9649ba2 --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1,2 @@ +# coding=utf-8 +from .cookie_utils import CookieUtils diff --git a/utils/cookie_utils.py b/utils/cookie_utils.py new file mode 100644 index 0000000..ed233ed --- /dev/null +++ b/utils/cookie_utils.py @@ -0,0 +1,34 @@ +# coding=utf-8 + +import 
import urllib.parse
import os


class CookieUtils:
    """Helpers for converting a raw ``Cookie`` header string into a dict."""

    @staticmethod
    def cookie_str_to_dict(cookie_string: str) -> dict[str, str]:
        """Parse a ``name=value; name2=value2`` cookie string into a dict.

        The string is split on ``;``. Each segment is split on its first
        ``=`` only, so values that themselves contain ``=`` are preserved.
        Names and values are stripped of surrounding whitespace and values
        are URL-decoded. When a name occurs more than once, the last
        occurrence wins.

        Segments that are blank or contain no ``=`` are skipped instead of
        raising ``ValueError`` during tuple unpacking.

        Args:
            cookie_string: Raw cookie text, e.g. copied from a browser's
                request headers. A falsy value (``""`` or ``None``) yields
                an empty dict.

        Returns:
            Mapping of cookie name to decoded cookie value.
        """
        cookie_dict: dict[str, str] = {}
        if not cookie_string:
            return cookie_dict
        for pair in cookie_string.split(';'):
            # partition() never raises: ``sep`` is empty when the segment
            # has no '=' at all, which marks it as malformed.
            key, sep, value = pair.partition('=')
            if not sep:
                continue
            cookie_dict[key.strip()] = urllib.parse.unquote(value.strip())
        return cookie_dict

    @staticmethod
    def read_cookie_dict_from_file(path: str = './cookie') -> dict[str, str]:
        """Read a cookie string from a UTF-8 text file and parse it.

        Args:
            path: Location of the file holding the raw cookie string.

        Returns:
            The parsed cookies, as produced by ``cookie_str_to_dict``.

        Raises:
            RuntimeError: If no file exists at ``path``.
        """
        # Open directly rather than checking os.path.exists() first, so the
        # file cannot disappear between the check and the read.
        try:
            with open(path, 'r', encoding='utf-8') as file:
                cookie_str = file.read()
        except FileNotFoundError as err:
            raise RuntimeError(f"找不到cookie文件: {path}") from err
        return CookieUtils.cookie_str_to_dict(cookie_str)