commit 556e082319868e9e9d3fdba339cd8f21fe0903e2 Author: zzx Date: Mon Feb 24 10:45:13 2025 +0800 :tada: 初始化 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7633447 --- /dev/null +++ b/.gitignore @@ -0,0 +1,127 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class +data +.idea +.vscode + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the executable, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +doc/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. 
+#Pipfile.lock + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Poetry +# Uncomment if you are NOT using Poetry +#poetry.lock + +# End of file \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..0816a09 --- /dev/null +++ b/README.md @@ -0,0 +1,67 @@ +## 安装步骤 + +### 1. 安装 Python +确保你已安装 Python 3.9 或更高版本。你可以从 [Python 官方网站](https://www.python.org/downloads/) 下载并安装。 + +验证 Python 是否安装成功: +```bash +python --version +``` + +### 2. 安装 virtualenv +`virtualenv` 是一个用于创建隔离的 Python 环境的工具。如果尚未安装,请运行以下命令: +```bash +pip install virtualenv +``` + +### 3. 创建虚拟环境 +在项目根目录下运行以下命令,创建一个虚拟环境: +```bash +virtualenv venv +``` + +激活虚拟环境: +- **Windows**: + ```bash + .\venv\Scripts\activate + ``` +- **Linux/macOS**: + ```bash + source venv/bin/activate + ``` + +### 4. 安装依赖 +在虚拟环境中,运行以下命令安装项目依赖: +```bash +pip install -r requirements.txt +``` + +`requirements.txt` 文件包含了项目所需的所有依赖及其版本。确保该文件位于项目根目录下。 + +### 5. 运行项目 +在虚拟环境中,运行以下命令启动项目: +```bash +python main.py +``` + +如果项目包含多个脚本,根据需要运行相应的脚本文件。 + +## 示例结构 +以下是项目的文件结构示例: +``` +project-root/ +├── README.md +├── requirements.txt +├── main.py +├── data/ +│ └── example.txt +└── venv/ +``` + +## 注意事项 +1. **虚拟环境**:始终在虚拟环境中运行项目,以避免全局环境的依赖冲突。 +2. **依赖更新**:如果需要更新依赖,请运行以下命令: + ```bash + pip install --upgrade -r requirements.txt + ``` +3. 
# sign.js contains the restored signing algorithm; save it yourself before running.
def load_js(file_path="sign.js"):
    """Read a JavaScript source file and compile it with ``execjs``.

    :param file_path: path of the JavaScript file to load (UTF-8 text)
    :return: the object returned by ``execjs.compile`` for that source
    """
    with open(file_path, mode="r", encoding="utf-8") as source:
        script = source.read()
    return execjs.compile(script)
\"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"", + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": "\"macOS\"", + "sec-fetch-dest": "empty", + "sec-fetch-mode": "cors", + "sec-fetch-site": "same-origin", + "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36", + # 自行获取 + "x-requested-with": "XMLHttpRequest" + } + + signature_data = generate_signature(keyword, page, js_context) + logger.info(f'抓取第 {page} 页.') + params = { + 'ts': signature_data['ts'], + 'rs': signature_data['rs'], + 'signature': signature_data['signature'], + 'keywords': keyword, + 'page_size': '10', + 'page': str(page), + } + + response = requests.get('https://tousu.sina.com.cn/api/index/s', cookies=cookies, params=params, headers=headers) + if not response.status_code == 200: + raise RuntimeError(f"响应异常 状态码: {response.status_code}") + try: + return response.json() + except Exception: + raise RuntimeError(f"Json解析异常 响应体: {response.text}") + + +def process_search_results(cookies, keyword, max_page): + datas = [] + js_context = load_js() + for page in range(1, max_page): + try: + result = perform_search(cookies, keyword, page, js_context) + datas.append(result) + logger.info(f'搜索结果({keyword}[{page}/{max_page}]): {result}') + except Exception as e: + logger.error(f"出现异常: 关键词: {keyword} 页码: {page}") + logger.error(f"异常信息: {str(e)}") + return datas + + +if __name__ == '__main__': + cookies = CookieUtils.cookie_str_to_dict(config.COOKIE) + if not os.path.exists('./data'): + os.mkdir('./data') + for k in config.KEYWORDS: + all_result = process_search_results(cookies, k, config.MAX_PAGE) + with open(f"./data/{k}_{datetime.now().strftime('%Y_%m_%d_%H%M%S')}.json", 'w', encoding='utf-8') as file: + for item in all_result: + file.write(f"{item}\n") + logger.success(f"{k} 查询结果已保存") + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..ceef0c0 --- /dev/null +++ b/requirements.txt @@ -0,0 
// Fixed salt mixed into every signature.
const SIGN_SALT = '$d6eb7ff91ee257475%';
// Characters the random nonce is drawn from.
const NONCE_ALPHABET = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';

/**
 * Return the hex-encoded SHA-256 digest of `input`.
 */
function sha256Hash(input) {
    return crypto.createHash('sha256').update(input).digest('hex');
}

/**
 * Build a random string from NONCE_ALPHABET.
 *
 * @param {boolean} randomLength when true, the length is drawn from [min, max];
 *                               otherwise the length is exactly `min`
 * @param {number} min fixed length, or lower bound when randomLength is true
 * @param {number} max upper bound (inclusive), used only when randomLength is true
 * @returns {string} the generated string
 */
function generateRandomString(randomLength, min, max) {
    // Math.floor(random * n) is uniform over 0..n-1; Math.round(random * (n - 1))
    // would select the first and last values only half as often as the rest.
    const length = randomLength
        ? min + Math.floor(Math.random() * (max - min + 1))
        : min;
    let out = '';
    for (let i = 0; i < length; i++) {
        out += NONCE_ALPHABET[Math.floor(Math.random() * NONCE_ALPHABET.length)];
    }
    return out;
}

/**
 * Compute the request signature for one search page.
 *
 * The signature is the SHA-256 hex digest of the timestamp, nonce, salt,
 * keyword, page size and page number, each converted to a string, sorted
 * and concatenated.
 *
 * @param {string} keyword search keyword
 * @param {number|string} page page number
 * @param {number|string} [pageSize=10] page size; must equal the `page_size`
 *                                      query parameter sent with the request
 * @returns {{ts: number, rs: string, signature: string}} timestamp (ms),
 *          16-character nonce and signature
 */
function get_signature(keyword, page, pageSize = 10) {
    // Current timestamp in milliseconds.
    const ts = new Date().getTime();
    // 16-character random nonce.
    const rs = generateRandomString(false, 16);

    // Array.prototype.sort() without a comparator compares string forms;
    // converting up front makes that explicit and gives the same order.
    const inputStr = [ts, rs, SIGN_SALT, keyword, pageSize, page]
        .map(String)
        .sort()
        .join('');

    return {
        'ts': ts,
        'rs': rs,
        'signature': sha256Hash(inputStr)
    };
}
"accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7", + "cache-control": "no-cache", + "pragma": "no-cache", + "priority": "u=1, i", + "referer": "https://cq.tousu.sina.com.cn/index/search/?keywords=%E5%92%B8%E9%B1%BC%E4%B9%8B%E7%8E%8B&t=1", + "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"", + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": "\"macOS\"", + "sec-fetch-dest": "empty", + "sec-fetch-mode": "cors", + "sec-fetch-site": "same-origin", + "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36", # 自行获取 + "x-requested-with":"XMLHttpRequest" +} + +#[l, p, b, h, c, d["type" + e]].sort().join("") +def generate_random_string(e=True, t=4, r=16): + chars = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + length = t if not e else random.randint(t, r) + return ''.join(random.choice(chars) for _ in range(length)) + +def get_sha256(value): + """ + sha256加密 + :param value: 加密字符串 + :return: 加密结果转换为16进制字符串,并大写 + """ + hsobj = hashlib.sha256() + hsobj.update(value.encode("utf-8")) + return hsobj.hexdigest() + +requests.packages.urllib3.disable_warnings() +sessions=requests.session() +data=[] +number=0 +for i in range(1,101):#1524 + print(i) + url_list=[] + if len(data)%50==0 and len(data)!=0: + time.sleep(60) + while True: + ts=str(int(time.time() * 1000))#ts,时间戳 + l=ts + rs=generate_random_string(True, 4, 16) + p=rs#rs + b = '$d6eb7ff91ee257475%' + h='外卖 食品安全'#keywords + c='10'#page_size + d=str(i)#d["type" + e]=page + signature=''.join(sorted([l, p, b, h, c, d])) + signature=get_sha256(signature) + params = { + 'ts': ts, + 'rs': rs, + 'signature': signature, + 'keywords': h, + 'page_size': c, + 'page': d, + } + try: + response = sessions.get('https://tousu.sina.com.cn/api/index/s',cookies=cookies, + headers=headers,params=params,verify=False,allow_redirects=False) + response=json.loads(response.text)['result']['data']['lists'] 
class CookieUtils:
    """Helpers for turning raw ``Cookie`` header text into dictionaries."""

    @staticmethod
    def cookie_str_to_dict(cookie_string):
        """Parse a ``Cookie`` header string into a ``{name: value}`` dict.

        Segments are separated by ``;`` and each one is split on its first
        ``=``, so a value may itself contain ``=``. Names and values are
        stripped of surrounding whitespace and values are URL-decoded.
        Empty segments are ignored. A segment without ``=`` is stored as a
        name with an empty value rather than raising ``ValueError``. When a
        name repeats, the last occurrence wins.

        :param cookie_string: raw cookie text, e.g. ``"a=1; b=2"``
        :return: dict mapping cookie names to decoded values
        """
        cookie_dict = {}
        for pair in cookie_string.split(';'):
            pair = pair.strip()
            if not pair:
                continue
            # partition() never raises: with no '=' present the value is ''.
            key, _, value = pair.partition('=')
            cookie_dict[key.strip()] = urllib.parse.unquote(value.strip())
        return cookie_dict

    @staticmethod
    def read_cookie_dict_from_file(path='./cookie'):
        """Read a cookie string from a UTF-8 text file and parse it.

        :param path: path of the file holding the raw cookie string
        :return: dict produced by :meth:`cookie_str_to_dict`
        :raises RuntimeError: if ``path`` does not exist
        """
        if not os.path.exists(path):
            raise RuntimeError(f"找不到cookie文件: {path}")
        with open(path, 'r', encoding='utf-8') as file:
            cookie_str = file.read()
        return CookieUtils.cookie_str_to_dict(cookie_str)