commit
556e082319
9 changed files with 488 additions and 0 deletions
-
127.gitignore
-
67README.md
-
5config.py
-
87main.py
-
11requirements.txt
-
38sign.js
-
117test2.py
-
2utils/__init__.py
-
34utils/cookie_utils.py
@ -0,0 +1,127 @@ |
|||||
|
# Byte-compiled / optimized / DLL files |
||||
|
__pycache__/ |
||||
|
*.py[cod] |
||||
|
*$py.class |
||||
|
data |
||||
|
.idea |
||||
|
.vscode |
||||
|
|
||||
|
# Distribution / packaging |
||||
|
.Python |
||||
|
build/ |
||||
|
develop-eggs/ |
||||
|
dist/ |
||||
|
downloads/ |
||||
|
eggs/ |
||||
|
.eggs/ |
||||
|
lib/ |
||||
|
lib64/ |
||||
|
parts/ |
||||
|
sdist/ |
||||
|
var/ |
||||
|
*.egg-info/ |
||||
|
.installed.cfg |
||||
|
*.egg |
||||
|
|
||||
|
# PyInstaller |
||||
|
# Usually these files are written by a python script from a template |
||||
|
# before PyInstaller builds the executable, so as to inject date/other infos into it. |
||||
|
*.manifest |
||||
|
*.spec |
||||
|
|
||||
|
# Installer logs |
||||
|
pip-log.txt |
||||
|
pip-delete-this-directory.txt |
||||
|
|
||||
|
# Unit test / coverage reports |
||||
|
htmlcov/ |
||||
|
.tox/ |
||||
|
.nox/ |
||||
|
.coverage |
||||
|
.coverage.* |
||||
|
.cache |
||||
|
nosetests.xml |
||||
|
coverage.xml |
||||
|
*.cover |
||||
|
.hypothesis/ |
||||
|
.pytest_cache/ |
||||
|
|
||||
|
# Translations |
||||
|
*.mo |
||||
|
*.pot |
||||
|
|
||||
|
# Django stuff: |
||||
|
*.log |
||||
|
local_settings.py |
||||
|
db.sqlite3 |
||||
|
db.sqlite3-journal |
||||
|
|
||||
|
# Flask stuff: |
||||
|
instance/ |
||||
|
.webassets-cache |
||||
|
|
||||
|
# Scrapy stuff: |
||||
|
.scrapy |
||||
|
|
||||
|
# Sphinx documentation |
||||
|
docs/_build/ |
||||
|
doc/_build/ |
||||
|
|
||||
|
# PyBuilder |
||||
|
target/ |
||||
|
|
||||
|
# Jupyter Notebook |
||||
|
.ipynb_checkpoints |
||||
|
|
||||
|
# IPython |
||||
|
profile_default/ |
||||
|
ipython_config.py |
||||
|
|
||||
|
# pyenv |
||||
|
.python-version |
||||
|
|
||||
|
# pipenv |
||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. |
||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies |
||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not |
||||
|
# install all needed dependencies. |
||||
|
#Pipfile.lock |
||||
|
|
||||
|
# celery beat schedule file |
||||
|
celerybeat-schedule |
||||
|
|
||||
|
# SageMath parsed files |
||||
|
*.sage.py |
||||
|
|
||||
|
# Environments |
||||
|
.env |
||||
|
.venv |
||||
|
env/ |
||||
|
venv/ |
||||
|
ENV/ |
||||
|
env.bak/ |
||||
|
venv.bak/ |
||||
|
|
||||
|
# Spyder project settings |
||||
|
.spyderproject |
||||
|
.spyproject |
||||
|
|
||||
|
# Rope project settings |
||||
|
.ropeproject |
||||
|
|
||||
|
# mkdocs documentation |
||||
|
/site |
||||
|
|
||||
|
# mypy |
||||
|
.mypy_cache/ |
||||
|
.dmypy.json |
||||
|
dmypy.json |
||||
|
|
||||
|
# Pyre type checker |
||||
|
.pyre/ |
||||
|
|
||||
|
# Poetry |
||||
|
# Uncomment if you are NOT using Poetry |
||||
|
#poetry.lock |
||||
|
|
||||
|
# End of file |
||||
@ -0,0 +1,67 @@ |
|||||
|
## 安装步骤 |
||||
|
|
||||
|
### 1. 安装 Python |
||||
|
确保你已安装 Python 3.9 或更高版本。你可以从 [Python 官方网站](https://www.python.org/downloads/) 下载并安装。 |
||||
|
|
||||
|
验证 Python 是否安装成功: |
||||
|
```bash |
||||
|
python --version |
||||
|
``` |
||||
|
|
||||
|
### 2. 安装 virtualenv |
||||
|
`virtualenv` 是一个用于创建隔离的 Python 环境的工具。如果尚未安装,请运行以下命令: |
||||
|
```bash |
||||
|
pip install virtualenv |
||||
|
``` |
||||
|
|
||||
|
### 3. 创建虚拟环境 |
||||
|
在项目根目录下运行以下命令,创建一个虚拟环境: |
||||
|
```bash |
||||
|
virtualenv venv |
||||
|
``` |
||||
|
|
||||
|
激活虚拟环境: |
||||
|
- **Windows**: |
||||
|
```bash |
||||
|
.\venv\Scripts\activate |
||||
|
``` |
||||
|
- **Linux/macOS**: |
||||
|
```bash |
||||
|
source venv/bin/activate |
||||
|
``` |
||||
|
|
||||
|
### 4. 安装依赖 |
||||
|
在虚拟环境中,运行以下命令安装项目依赖: |
||||
|
```bash |
||||
|
pip install -r requirements.txt |
||||
|
``` |
||||
|
|
||||
|
`requirements.txt` 文件包含了项目所需的所有依赖及其版本。确保该文件位于项目根目录下。 |
||||
|
|
||||
|
### 5. 运行项目 |
||||
|
在虚拟环境中,运行以下命令启动项目: |
||||
|
```bash |
||||
|
python main.py |
||||
|
``` |
||||
|
|
||||
|
如果项目包含多个脚本,根据需要运行相应的脚本文件。 |
||||
|
|
||||
|
## 示例结构 |
||||
|
以下是项目的文件结构示例: |
||||
|
``` |
||||
|
project-root/ |
||||
|
├── README.md |
||||
|
├── requirements.txt |
||||
|
├── main.py |
||||
|
├── data/ |
||||
|
│ └── example.txt |
||||
|
└── venv/ |
||||
|
``` |
||||
|
|
||||
|
## 注意事项 |
||||
|
1. **虚拟环境**:始终在虚拟环境中运行项目,以避免全局环境的依赖冲突。 |
||||
|
2. **依赖更新**:如果需要更新依赖,请运行以下命令: |
||||
|
```bash |
||||
|
pip install --upgrade -r requirements.txt |
||||
|
``` |
||||
|
3. **退出虚拟环境**:运行 `deactivate` 命令退出虚拟环境。 |
||||
@ -0,0 +1,5 @@ |
|||||
|
# coding=utf-8

# Search keywords to crawl; main.py writes one output file per keyword.
KEYWORDS = ['咸鱼之王']

# Upper bound used for the per-keyword page loop in main.py.
MAX_PAGE = 10

# Raw Cookie header string copied from a logged-in browser session;
# parsed into a dict by CookieUtils.cookie_str_to_dict.
# NOTE(review): this looks like a live personal session cookie — replace it
# with your own and avoid committing real credentials to version control.
COOKIE = 'UOR=www.baidu.com,tousu.sina.com.cn,; SINAGLOBAL=180.109.135.223_1734009009.184774; Apache=180.109.135.223_1734009009.184775; ULV=1737698014297:2:1:1:180.109.135.223_1734009009.184775:1734009008920; ALF=02_1742823137; SCF=AnbhzHKrnUQl7Hr1ketFkwfNrrNrnZoluPHwHKF6Cd5jepxd4jnBttaKovu1rtniHOjeKih3dtFzpJfX3fnoexE.; SUB=_2A25KvaGxDeRhGeVO6VQW9S7FzD2IHXVpsrt5rDV_PUJbkNAbLRGkkW9NTWl_zXi3K6i7_10g-b280K9gc5zMpYjq; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFTxn..WU0jomaP4CuWPAC85NHD95Q0ehzcS0-71KMpWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNMcxo1g-NIPLNd7tt; U_TRS1=000000d8.6b0c8bd59.67b9d1e3.a609672c; U_TRS2=000000d8.6b148bd59.67b9d1e3.70e955ed; HM-AMT=%7B%22amt%22%3A24089921%2C%22amt24h%22%3A15814%2C%22v%22%3A%222.3.168%22%2C%22vPcJs%22%3A%221.6.79%22%2C%22vPcCss%22%3A%221.2.393%22%7D'
||||
@ -0,0 +1,87 @@ |
|||||
|
# _*_ coding: utf-8 _*_ |
||||
|
|
||||
|
import execjs |
||||
|
import requests |
||||
|
from loguru import logger |
||||
|
import os |
||||
|
from datetime import datetime |
||||
|
import config |
||||
|
|
||||
|
from utils import CookieUtils |
||||
|
|
||||
|
|
||||
|
# sign.js holds the reverse-engineered signing algorithm; keep the file
# alongside this script.
def load_js(file_path="sign.js"):
    """Read *file_path* and return an execjs context compiled from it."""
    with open(file_path, "r", encoding="utf-8") as js_file:
        source = js_file.read()
    return execjs.compile(source)
||||
|
|
||||
|
|
||||
|
def generate_signature(keyword, page, js_context):
    """Invoke the JS `get_signature(keyword, page)` routine and return its result.

    The context is the one produced by load_js(); the JS side returns a dict
    with 'ts', 'rs' and 'signature' entries.
    """
    signature = js_context.call('get_signature', keyword, page)
    return signature
||||
|
|
||||
|
|
||||
|
def perform_search(cookies, keyword, page, js_context):
    """Fetch one page of Sina complaint-search results for *keyword*.

    Parameters
    ----------
    cookies : dict
        Session cookies for tousu.sina.com.cn.
    keyword : str
        Search keyword.
    page : int
        1-based page number to request.
    js_context : execjs context
        Compiled sign.js context used to produce ts / rs / signature.

    Returns the decoded JSON payload. Raises RuntimeError on a non-200
    response or an unparseable body.
    """
    headers = {
        "accept": "*/*",
        "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "priority": "u=1, i",
        "referer": "https://cq.tousu.sina.com.cn/index/search/?keywords=%E5%92%B8%E9%B1%BC%E4%B9%8B%E7%8E%8B&t=1",
        "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
        # obtain from your own browser session
        "x-requested-with": "XMLHttpRequest"
    }

    signature_data = generate_signature(keyword, page, js_context)
    logger.info(f'抓取第 {page} 页.')
    params = {
        'ts': signature_data['ts'],
        'rs': signature_data['rs'],
        'signature': signature_data['signature'],
        'keywords': keyword,
        'page_size': '10',
        'page': str(page),
    }

    # Fix: requests has no default timeout, so a stalled connection would
    # hang the crawler forever.
    response = requests.get('https://tousu.sina.com.cn/api/index/s',
                            cookies=cookies, params=params, headers=headers,
                            timeout=30)
    if response.status_code != 200:
        raise RuntimeError(f"响应异常 状态码: {response.status_code}")
    try:
        return response.json()
    except Exception as e:
        # Fix: chain the decode error so the original cause is not lost.
        raise RuntimeError(f"Json解析异常 响应体: {response.text}") from e
||||
|
|
||||
|
|
||||
|
def process_search_results(cookies, keyword, max_page):
    """Fetch pages 1..max_page for *keyword* and return the raw page payloads.

    A page that raises is logged and skipped, so a single bad page never
    aborts the whole crawl.
    """
    datas = []
    js_context = load_js()
    # Fix: range(1, max_page) silently dropped the final page; +1 makes
    # max_page inclusive, matching the "[page/max_page]" progress log below.
    for page in range(1, max_page + 1):
        try:
            result = perform_search(cookies, keyword, page, js_context)
            datas.append(result)
            logger.info(f'搜索结果({keyword}[{page}/{max_page}]): {result}')
        except Exception as e:
            logger.error(f"出现异常: 关键词: {keyword} 页码: {page}")
            logger.error(f"异常信息: {str(e)}")
    return datas
||||
|
|
||||
|
|
||||
|
if __name__ == '__main__':
    import json  # local import: only the script entry point needs it

    cookies = CookieUtils.cookie_str_to_dict(config.COOKIE)
    # Fix: exists()+mkdir() is a check-then-act race; makedirs is atomic
    # for this purpose and a no-op when the directory already exists.
    os.makedirs('./data', exist_ok=True)
    for k in config.KEYWORDS:
        all_result = process_search_results(cookies, k, config.MAX_PAGE)
        out_path = f"./data/{k}_{datetime.now().strftime('%Y_%m_%d_%H%M%S')}.json"
        with open(out_path, 'w', encoding='utf-8') as file:
            # Fix: the old f"{item}\n" wrote Python repr (single quotes),
            # which is not valid JSON despite the .json extension; emit one
            # JSON document per line (JSON Lines) instead.
            for item in all_result:
                file.write(json.dumps(item, ensure_ascii=False))
                file.write('\n')
        logger.success(f"{k} 查询结果已保存")
||||
|
|
||||
@ -0,0 +1,11 @@ |
|||||
|
certifi==2025.1.31 |
||||
|
charset-normalizer==3.4.1 |
||||
|
colorama==0.4.6 |
||||
|
idna==3.10 |
||||
|
loguru==0.7.3 |
||||
|
PyExecJS==1.5.1 |
||||
|
requests==2.32.3 |
||||
|
six==1.17.0 |
||||
|
urllib3==2.3.0 |
||||
|
wheel==0.41.2 |
||||
|
win32_setctime==1.2.0 |
||||
@ -0,0 +1,38 @@ |
|||||
|
const crypto = require('crypto'); |
||||
|
|
||||
|
function sha256Hash(input) { |
||||
|
return crypto.createHash('sha256').update(input).digest('hex'); |
||||
|
} |
||||
|
|
||||
|
// Build the ts / rs / signature triple the search API expects.
// NOTE: the exact mixed-type sort below is part of the protocol — do not
// "clean it up"; the server computes the same string-coerced ordering.
function get_signature(keyword, page) {
    // Current timestamp in milliseconds.
    const u = new Date().getTime();

    // Random alphanumeric string of length i; when e is truthy the length
    // is drawn from [t, r], otherwise it is exactly t.
    function generateRandomString(e, t, r) {
        let n = "";
        let i = t;
        const a = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
        if (e) {
            i = Math.round(Math.random() * (r - t)) + t;
        }
        for (let o = 0; o < i; o++) {
            n += a[Math.round(Math.random() * (a.length - 1))];
        }
        return n;
    }

    // e=false, so the nonce is always exactly 16 characters.
    const l = generateRandomString(false, 16);

    // Sort the parameters and concatenate. Array.prototype.sort with no
    // comparator coerces every element (including the numbers u, 10 and
    // page) to a string and orders lexicographically — deliberate here.
    const inputStr = [u, l, '$d6eb7ff91ee257475%', keyword, 10, page].sort().join("");

    // SHA-256 over the concatenated string is the request signature.
    const signature = sha256Hash(inputStr);

    return {
        'ts': u,
        'rs': l,
        'signature': signature
    };
}
||||
@ -0,0 +1,117 @@ |
|||||
|
import hashlib
import json
import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
||||
|
|
||||
|
|
||||
|
# Session cookies copied from a logged-in browser session.
# NOTE(review): these look like live personal credentials — replace them
# with your own and avoid committing real cookie values to version control.
cookies = {
    # copy these from your own browser
    "UOR": "www.baidu.com,tousu.sina.com.cn,",
    "SINAGLOBAL": "180.109.135.223_1734009009.184774",
    "Apache": "180.109.135.223_1734009009.184775",
    "ULV": "1737698014297:2:1:1:180.109.135.223_1734009009.184775:1734009008920",
    "ALF": "02_1742823137",
    "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WFTxn..WU0jomaP4CuWPAC85NHD95Q0ehzcS0-71KMpWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNMcxo1g-NIPLNd7tt",
    "U_TRS1": "000000d8.6b0c8bd59.67b9d1e3.a609672c",
    "U_TRS2": "000000d8.6b148bd59.67b9d1e3.70e955ed"
}

# Request headers mirroring a real browser session on the complaint site.
headers = {
    # copy these from your own browser
    "accept": "*/*",
    "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
    "cache-control": "no-cache",
    "pragma": "no-cache",
    "priority": "u=1, i",
    "referer": "https://cq.tousu.sina.com.cn/index/search/?keywords=%E5%92%B8%E9%B1%BC%E4%B9%8B%E7%8E%8B&t=1",
    "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"macOS\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36", # obtain from your own browser
    "x-requested-with":"XMLHttpRequest"
}
||||
|
|
||||
|
# Python port of the nonce helper used by the site's signer:
# [l, p, b, h, c, d["type" + e]].sort().join("")
def generate_random_string(e=True, t=4, r=16):
    """Return a random alphanumeric string.

    When *e* is truthy the length is uniform in [t, r]; otherwise the
    length is exactly *t*.
    """
    alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    size = random.randint(t, r) if e else t
    return "".join(random.choices(alphabet, k=size))
||||
|
|
||||
|
def get_sha256(value):
    """SHA-256 digest of *value* (UTF-8 encoded) as a lowercase hex string.

    Lowercase hex matches the output of the site's JS `digest('hex')`,
    which is what the server validates against.
    """
    return hashlib.sha256(value.encode("utf-8")).hexdigest()
||||
|
|
||||
|
# -- Crawl loop -------------------------------------------------------------
# Flat script: fetch pages 1..100 of the complaint search API, collect the
# detail-page URL of every complaint whose 'evaluate_u' field is set, then
# scrape each detail page for its start/end dates and complaint number.
requests.packages.urllib3.disable_warnings()
sessions=requests.session()
data=[]    # scraped rows: [starttime, endtime, c_num]
number=0   # running count of list entries examined
for i in range(1,101):# 1524
    print(i)
    url_list=[]
    # Sleep a minute whenever the collected row count is a nonzero
    # multiple of 50 — crude rate limiting.
    if len(data)%50==0 and len(data)!=0:
        time.sleep(60)
    # Retry the search-API call until it succeeds.
    while True:
        ts=str(int(time.time() * 1000))# ts: millisecond timestamp
        l=ts
        rs=generate_random_string(True, 4, 16)
        p=rs# rs: random nonce
        b = '$d6eb7ff91ee257475%'  # constant mixed into the signature (same literal as sign.js)
        h='外卖 食品安全'# keywords
        c='10'# page_size
        d=str(i)# d["type" + e] = page
        # Signature = SHA-256 over the lexicographically sorted parameters,
        # mirroring sign.js's [l, p, b, h, c, d].sort().join("").
        signature=''.join(sorted([l, p, b, h, c, d]))
        signature=get_sha256(signature)
        params = {
            'ts': ts,
            'rs': rs,
            'signature': signature,
            'keywords': h,
            'page_size': c,
            'page': d,
        }
        try:
            response = sessions.get('https://tousu.sina.com.cn/api/index/s',cookies=cookies,
                                    headers=headers,params=params,verify=False,allow_redirects=False)
            response=json.loads(response.text)['result']['data']['lists']
            #print(response)
            for n in range(len(response)):
                # Entries without an evaluation are counted but not crawled.
                # NOTE(review): `== None` works but `is None` is idiomatic.
                if response[n]['main']['evaluate_u']==None:
                    number+=1
                    continue
                else:
                    url=response[n]['main']['url']
                    url_list.append(url)
                    number+=1
            break
        except Exception as e:
            # Any failure (HTTP, JSON shape): wait five minutes and retry.
            print(e,response.text,i)
            time.sleep(300)
            continue
    # Scrape each collected detail page, retrying each until it parses.
    for url in url_list:
        while True:
            try:
                # API returns protocol-relative URLs, hence the 'https:' prefix.
                response = sessions.get('https:'+url,cookies=cookies,headers=headers,verify=False,allow_redirects=False)
                soup = BeautifulSoup(response.text, 'html.parser')
                u_date_elements = soup.find_all(class_='u-date')
                u_list=soup.find('ul', class_='ts-q-list')
                c_num=u_list.find_all('li')[0].text
                # Assumes .u-date elements 2 and 6 hold the end and start
                # dates on the detail page — TODO confirm against the live
                # page layout before relying on these indices.
                endtime=u_date_elements[2].text
                starttime=u_date_elements[6].text
                data.append([starttime,endtime,c_num])
                break
            except Exception as e:
                print(e,response.text,i)
                time.sleep(60)
                continue
# NOTE(review): requires pandas available as `pd`; the DataFrame is built
# but never written to disk here — presumably exported interactively.
data=pd.DataFrame(data,columns=['starttime','endtime','c_num'])
||||
@ -0,0 +1,2 @@ |
|||||
|
# coding=utf-8 |
||||
|
from .cookie_utils import CookieUtils |
||||
@ -0,0 +1,34 @@ |
|||||
|
# coding=utf-8 |
||||
|
|
||||
|
import urllib.parse |
||||
|
import os |
||||
|
|
||||
|
|
||||
|
class CookieUtils:
    """Helpers for turning raw browser cookie strings into requests-style dicts."""

    @staticmethod
    def cookie_str_to_dict(cookie_string):
        """Parse a raw 'k1=v1; k2=v2' Cookie header into a dict.

        Keys and values are stripped of surrounding whitespace and values
        are URL-decoded. Empty segments and malformed segments lacking an
        '=' separator are skipped.
        """
        cookie_dict = {}
        # Cookies are separated by semicolons.
        for pair in cookie_string.split(';'):
            pair = pair.strip()
            # Fix: a segment without '=' used to raise ValueError when the
            # single-element split result was unpacked; skip it instead.
            if not pair or '=' not in pair:
                continue
            # Split on the first '=' only, so values may themselves contain '='.
            key, value = pair.split('=', 1)
            # URL-decode the value (cookie values are often percent-encoded).
            cookie_dict[key.strip()] = urllib.parse.unquote(value.strip())
        return cookie_dict

    @staticmethod
    def read_cookie_dict_from_file(path='./cookie'):
        """Read a cookie string from *path* and parse it.

        Raises RuntimeError when the file does not exist.
        """
        if not os.path.exists(path):
            raise RuntimeError(f"找不到cookie文件: {path}")
        with open(path, 'r', encoding='utf-8') as file:
            cookie_str = file.read()
        return CookieUtils.cookie_str_to_dict(cookie_str)
||||
Write
Preview
Loading…
Cancel
Save
Reference in new issue