Browse Source

🎉 初始化

master
zzx 8 months ago
commit
556e082319
  1. 127
      .gitignore
  2. 67
      README.md
  3. 5
      config.py
  4. 87
      main.py
  5. 11
      requirements.txt
  6. 38
      sign.js
  7. 117
      test2.py
  8. 2
      utils/__init__.py
  9. 34
      utils/cookie_utils.py

127
.gitignore

@ -0,0 +1,127 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
data
.idea
.vscode
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the executable, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
doc/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# Poetry
# As with Pipfile.lock above, committing poetry.lock is generally recommended;
# uncomment the next line only if you deliberately want it ignored.
#poetry.lock
# End of file

67
README.md

@ -0,0 +1,67 @@
## 安装步骤
### 1. 安装 Python
确保你已安装 Python 3.9 或更高版本。你可以从 [Python 官方网站](https://www.python.org/downloads/) 下载并安装。
验证 Python 是否安装成功:
```bash
python --version
```
### 2. 安装 virtualenv
`virtualenv` 是一个用于创建隔离的 Python 环境的工具。如果尚未安装,请运行以下命令:
```bash
pip install virtualenv
```
### 3. 创建虚拟环境
在项目根目录下运行以下命令,创建一个虚拟环境:
```bash
virtualenv venv
```
激活虚拟环境:
- **Windows**
```bash
.\venv\Scripts\activate
```
- **Linux/macOS**
```bash
source venv/bin/activate
```
### 4. 安装依赖
在虚拟环境中,运行以下命令安装项目依赖:
```bash
pip install -r requirements.txt
```
`requirements.txt` 文件包含了项目所需的所有依赖及其版本。确保该文件位于项目根目录下。
### 5. 运行项目
在虚拟环境中,运行以下命令启动项目:
```bash
python main.py
```
如果项目包含多个脚本,根据需要运行相应的脚本文件。
## 示例结构
以下是项目的文件结构示例:
```
project-root/
├── README.md
├── requirements.txt
├── main.py
├── data/
│   └── example.txt
└── venv/
```
## 注意事项
1. **虚拟环境**:始终在虚拟环境中运行项目,以避免全局环境的依赖冲突。
2. **依赖更新**:如果需要更新依赖,请运行以下命令:
```bash
pip install --upgrade -r requirements.txt
```
3. **退出虚拟环境**:运行 `deactivate` 命令退出虚拟环境。

5
config.py

@ -0,0 +1,5 @@
# coding=utf-8
# Search keywords to crawl on the Sina complaints site (tousu.sina.com.cn).
KEYWORDS = ['咸鱼之王']
# Number of result pages to request per keyword.
MAX_PAGE = 10
# Raw Cookie header copied from a logged-in browser session; parsed into a
# dict by CookieUtils.cookie_str_to_dict before each request.
COOKIE = 'UOR=www.baidu.com,tousu.sina.com.cn,; SINAGLOBAL=180.109.135.223_1734009009.184774; Apache=180.109.135.223_1734009009.184775; ULV=1737698014297:2:1:1:180.109.135.223_1734009009.184775:1734009008920; ALF=02_1742823137; SCF=AnbhzHKrnUQl7Hr1ketFkwfNrrNrnZoluPHwHKF6Cd5jepxd4jnBttaKovu1rtniHOjeKih3dtFzpJfX3fnoexE.; SUB=_2A25KvaGxDeRhGeVO6VQW9S7FzD2IHXVpsrt5rDV_PUJbkNAbLRGkkW9NTWl_zXi3K6i7_10g-b280K9gc5zMpYjq; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFTxn..WU0jomaP4CuWPAC85NHD95Q0ehzcS0-71KMpWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNMcxo1g-NIPLNd7tt; U_TRS1=000000d8.6b0c8bd59.67b9d1e3.a609672c; U_TRS2=000000d8.6b148bd59.67b9d1e3.70e955ed; HM-AMT=%7B%22amt%22%3A24089921%2C%22amt24h%22%3A15814%2C%22v%22%3A%222.3.168%22%2C%22vPcJs%22%3A%221.6.79%22%2C%22vPcCss%22%3A%221.2.393%22%7D'

87
main.py

@ -0,0 +1,87 @@
# _*_ coding: utf-8 _*_
import execjs
import requests
from loguru import logger
import os
from datetime import datetime
import config
from utils import CookieUtils
# sign.js is the restored signature algorithm; save it next to this script.
def load_js(file_path="sign.js"):
    """Read the signature JavaScript from *file_path* and compile it with execjs."""
    with open(file_path, "r", encoding="utf-8") as handle:
        return execjs.compile(handle.read())
def generate_signature(keyword, page, js_context):
    """Invoke ``get_signature(keyword, page)`` inside the compiled JS context.

    Returns the dict produced by sign.js: keys 'ts', 'rs' and 'signature'.
    """
    return js_context.call("get_signature", keyword, page)
def perform_search(cookies, keyword, page, js_context, timeout=30):
    """Fetch one page of complaint search results from the Sina tousu API.

    Args:
        cookies: cookie dict for the authenticated session.
        keyword: search keywords string.
        page: 1-based page number to fetch.
        js_context: compiled execjs context exposing ``get_signature``.
        timeout: per-request timeout in seconds (default 30). The original
            call had no timeout and could hang forever on a stalled socket.

    Returns:
        The parsed JSON response body (dict).

    Raises:
        RuntimeError: on a non-200 status code or a non-JSON response body.
    """
    headers = {
        "accept": "*/*",
        "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "priority": "u=1, i",
        "referer": "https://cq.tousu.sina.com.cn/index/search/?keywords=%E5%92%B8%E9%B1%BC%E4%B9%8B%E7%8E%8B&t=1",
        "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
        # obtain from your own browser session
        "x-requested-with": "XMLHttpRequest"
    }
    signature_data = generate_signature(keyword, page, js_context)
    logger.info(f'抓取第 {page} 页.')
    params = {
        'ts': signature_data['ts'],
        'rs': signature_data['rs'],
        'signature': signature_data['signature'],
        'keywords': keyword,
        'page_size': '10',
        'page': str(page),
    }
    # timeout added so a dead connection raises instead of blocking the crawl
    response = requests.get('https://tousu.sina.com.cn/api/index/s',
                            cookies=cookies, params=params, headers=headers,
                            timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(f"响应异常 状态码: {response.status_code}")
    try:
        return response.json()
    except Exception as exc:
        # chain the original decode error for easier debugging
        raise RuntimeError(f"Json解析异常 响应体: {response.text}") from exc
def process_search_results(cookies, keyword, max_page):
    """Crawl pages 1..max_page for *keyword* and collect the JSON results.

    Page-level failures are logged and skipped so one bad page does not
    abort the whole crawl.

    Args:
        cookies: cookie dict for the authenticated session.
        keyword: search keywords string.
        max_page: number of pages to fetch (inclusive).

    Returns:
        List of per-page JSON result dicts (failed pages are omitted).
    """
    datas = []
    js_context = load_js()
    # range(1, max_page + 1): the original used range(1, max_page) and
    # silently dropped the final page despite logging "[page/max_page]".
    for page in range(1, max_page + 1):
        try:
            result = perform_search(cookies, keyword, page, js_context)
            datas.append(result)
            logger.info(f'搜索结果({keyword}[{page}/{max_page}]): {result}')
        except Exception as e:
            logger.error(f"出现异常: 关键词: {keyword} 页码: {page}")
            logger.error(f"异常信息: {str(e)}")
    return datas
if __name__ == '__main__':
    # Local import: json is only needed by this entry point.
    import json

    cookies = CookieUtils.cookie_str_to_dict(config.COOKIE)
    # makedirs(exist_ok=True) is race-free, unlike the exists()+mkdir pair.
    os.makedirs('./data', exist_ok=True)
    for k in config.KEYWORDS:
        all_result = process_search_results(cookies, k, config.MAX_PAGE)
        out_path = f"./data/{k}_{datetime.now().strftime('%Y_%m_%d_%H%M%S')}.json"
        with open(out_path, 'w', encoding='utf-8') as file:
            # One JSON document per line (JSON Lines). The original wrote
            # Python repr() of each dict, which is not parseable as JSON.
            for item in all_result:
                file.write(json.dumps(item, ensure_ascii=False) + "\n")
        logger.success(f"{k} 查询结果已保存")

11
requirements.txt

@ -0,0 +1,11 @@
certifi==2025.1.31
charset-normalizer==3.4.1
colorama==0.4.6
idna==3.10
loguru==0.7.3
PyExecJS==1.5.1
requests==2.32.3
six==1.17.0
urllib3==2.3.0
wheel==0.41.2
win32_setctime==1.2.0

38
sign.js

@ -0,0 +1,38 @@
const crypto = require('crypto');
function sha256Hash(input) {
return crypto.createHash('sha256').update(input).digest('hex');
}
// Build the request signature expected by tousu.sina.com.cn.
// Returns { ts, rs, signature } where signature is the SHA-256 of the
// six request parts sorted (default lexicographic sort) and concatenated.
function get_signature(keyword, page) {
    const ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

    // Random string: fixed length `min` unless `randomize`, then a random
    // length in [min, max].
    const randomString = (randomize, min, max) => {
        const length = randomize ? Math.round(Math.random() * (max - min)) + min : min;
        let out = "";
        while (out.length < length) {
            out += ALPHABET[Math.round(Math.random() * (ALPHABET.length - 1))];
        }
        return out;
    };

    const ts = new Date().getTime();      // millisecond timestamp
    const rs = randomString(false, 16);   // 16-char nonce
    // NOTE: Array.prototype.sort with no comparator compares elements as
    // strings — this matches the site's own algorithm and must not change.
    const payload = [ts, rs, '$d6eb7ff91ee257475%', keyword, 10, page].sort().join("");
    return {
        'ts': ts,
        'rs': rs,
        'signature': sha256Hash(payload)
    };
}

117
test2.py

@ -0,0 +1,117 @@
import hashlib
import json
import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Session cookies for tousu.sina.com.cn — copy from your own logged-in browser.
cookies = {
    # copy these from your own browser session
    "UOR": "www.baidu.com,tousu.sina.com.cn,",
    "SINAGLOBAL": "180.109.135.223_1734009009.184774",
    "Apache": "180.109.135.223_1734009009.184775",
    "ULV": "1737698014297:2:1:1:180.109.135.223_1734009009.184775:1734009008920",
    "ALF": "02_1742823137",
    "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WFTxn..WU0jomaP4CuWPAC85NHD95Q0ehzcS0-71KMpWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNMcxo1g-NIPLNd7tt",
    "U_TRS1": "000000d8.6b0c8bd59.67b9d1e3.a609672c",
    "U_TRS2": "000000d8.6b148bd59.67b9d1e3.70e955ed"
}
# Browser-matching request headers — copy from your own browser session.
headers = {
    # copied from a real browser request
    "accept": "*/*",
    "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
    "cache-control": "no-cache",
    "pragma": "no-cache",
    "priority": "u=1, i",
    "referer": "https://cq.tousu.sina.com.cn/index/search/?keywords=%E5%92%B8%E9%B1%BC%E4%B9%8B%E7%8E%8B&t=1",
    "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"macOS\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36", # obtain from your own browser
    "x-requested-with":"XMLHttpRequest"
}
# Mirrors the site JS helper used in [l, p, b, h, c, d["type" + e]].sort().join("")
def generate_random_string(e=True, t=4, r=16):
    """Random alphanumeric string: exactly *t* chars when *e* is falsy,
    otherwise a random length in [t, r]."""
    alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    size = random.randint(t, r) if e else t
    return "".join(random.choice(alphabet) for _ in range(size))
def get_sha256(value):
    """Return the hex-encoded SHA-256 digest of *value* (a str).

    :param value: text to hash; encoded as UTF-8 before hashing.
    :return: 64-character lowercase hex digest string.
    """
    return hashlib.sha256(value.encode("utf-8")).hexdigest()
# -- Crawl loop --------------------------------------------------------------
# For each results page: build a signed API request (same scheme as sign.js),
# collect complaint detail URLs, then scrape each detail page with
# BeautifulSoup. Retry-with-sleep on failure; not restructured here because
# the retry/continue/break flow is order-sensitive.
requests.packages.urllib3.disable_warnings()  # verify=False below would spam warnings
sessions=requests.session()
data=[]    # collected [starttime, endtime, c_num] rows
number=0   # count of complaint entries seen (including skipped ones)
for i in range(1,101):  # 1524 — original note; presumably the total page count
    print(i)
    url_list=[]
    # Pause every 50 collected rows — presumably to avoid rate limiting.
    if len(data)%50==0 and len(data)!=0:
        time.sleep(60)
    # Retry the list-API request until it succeeds.
    while True:
        ts=str(int(time.time() * 1000))  # ts: millisecond timestamp
        l=ts
        rs=generate_random_string(True, 4, 16)
        p=rs  # rs: random nonce
        b = '$d6eb7ff91ee257475%'  # fixed salt, same as sign.js
        h='外卖 食品安全'  # keywords
        c='10'  # page_size
        d=str(i)  # d["type" + e] = page
        # Signature: the six parts sorted lexicographically, joined, SHA-256'd.
        signature=''.join(sorted([l, p, b, h, c, d]))
        signature=get_sha256(signature)
        params = {
            'ts': ts,
            'rs': rs,
            'signature': signature,
            'keywords': h,
            'page_size': c,
            'page': d,
        }
        try:
            response = sessions.get('https://tousu.sina.com.cn/api/index/s',cookies=cookies,
                headers=headers,params=params,verify=False,allow_redirects=False)
            response=json.loads(response.text)['result']['data']['lists']
            #print(response)
            for n in range(len(response)):
                # Entries without evaluate_u are counted but not scraped.
                if response[n]['main']['evaluate_u']==None:
                    number+=1
                    continue
                else:
                    url=response[n]['main']['url']
                    url_list.append(url)
                    number+=1
            break
        except Exception as e:
            # On failure (likely rate-limited), back off 5 minutes and retry.
            print(e,response.text,i)
            time.sleep(300)
            continue
    # Scrape each complaint detail page; retry each with a 60s back-off.
    for url in url_list:
        while True:
            try:
                response = sessions.get('https:'+url,cookies=cookies,headers=headers,verify=False,allow_redirects=False)
                soup = BeautifulSoup(response.text, 'html.parser')
                u_date_elements = soup.find_all(class_='u-date')
                u_list=soup.find('ul', class_='ts-q-list')
                # NOTE(review): fixed indices [0], [2], [6] depend on the page
                # layout staying exactly as observed — verify if scraping breaks.
                c_num=u_list.find_all('li')[0].text
                endtime=u_date_elements[2].text
                starttime=u_date_elements[6].text
                data.append([starttime,endtime,c_num])
                break
            except Exception as e:
                print(e,response.text,i)
                time.sleep(60)
                continue
# Collect the scraped rows into a DataFrame for later analysis.
data=pd.DataFrame(data,columns=['starttime','endtime','c_num'])

2
utils/__init__.py

@ -0,0 +1,2 @@
# coding=utf-8
from .cookie_utils import CookieUtils

34
utils/cookie_utils.py

@ -0,0 +1,34 @@
# coding=utf-8
import urllib.parse
import os
class CookieUtils:
    """Helpers for turning a raw browser 'Cookie' header string into a dict."""

    @staticmethod
    def cookie_str_to_dict(cookie_string):
        """Parse a raw Cookie header string into a ``{name: value}`` dict.

        Values are URL-decoded. Segments without an '=' separator (stray
        flags, empty parts) are skipped — the original raised ValueError
        on them via the unguarded ``pair.split('=', 1)`` unpacking.

        :param cookie_string: semicolon-separated cookie string.
        :return: dict mapping cookie names to decoded values.
        """
        cookie_dict = {}
        for pair in cookie_string.split(';'):
            pair = pair.strip()
            # Skip empty and malformed segments instead of crashing.
            if not pair or '=' not in pair:
                continue
            key, value = pair.split('=', 1)
            cookie_dict[key.strip()] = urllib.parse.unquote(value.strip())
        return cookie_dict

    @staticmethod
    def read_cookie_dict_from_file(path='./cookie'):
        """Read a cookie string from *path* and parse it into a dict.

        :param path: path to a text file containing the raw cookie string.
        :raises RuntimeError: when the file does not exist.
        """
        if not os.path.exists(path):
            raise RuntimeError(f"找不到cookie文件: {path}")
        with open(path, 'r', encoding='utf-8') as file:
            cookie_str = file.read()
        return CookieUtils.cookie_str_to_dict(cookie_str)
Loading…
Cancel
Save