commit
556e082319
9 changed files with 488 additions and 0 deletions
-
127.gitignore
-
67README.md
-
5config.py
-
87main.py
-
11requirements.txt
-
38sign.js
-
117test2.py
-
2utils/__init__.py
-
34utils/cookie_utils.py
@ -0,0 +1,127 @@ |
|||
# Byte-compiled / optimized / DLL files |
|||
__pycache__/ |
|||
*.py[cod] |
|||
*$py.class |
|||
data |
|||
.idea |
|||
.vscode |
|||
|
|||
# Distribution / packaging |
|||
.Python |
|||
build/ |
|||
develop-eggs/ |
|||
dist/ |
|||
downloads/ |
|||
eggs/ |
|||
.eggs/ |
|||
lib/ |
|||
lib64/ |
|||
parts/ |
|||
sdist/ |
|||
var/ |
|||
*.egg-info/ |
|||
.installed.cfg |
|||
*.egg |
|||
|
|||
# PyInstaller |
|||
# Usually these files are written by a python script from a template |
|||
# before PyInstaller builds the executable, so as to inject date/other infos into it. |
|||
*.manifest |
|||
*.spec |
|||
|
|||
# Installer logs |
|||
pip-log.txt |
|||
pip-delete-this-directory.txt |
|||
|
|||
# Unit test / coverage reports |
|||
htmlcov/ |
|||
.tox/ |
|||
.nox/ |
|||
.coverage |
|||
.coverage.* |
|||
.cache |
|||
nosetests.xml |
|||
coverage.xml |
|||
*.cover |
|||
.hypothesis/ |
|||
.pytest_cache/ |
|||
|
|||
# Translations |
|||
*.mo |
|||
*.pot |
|||
|
|||
# Django stuff: |
|||
*.log |
|||
local_settings.py |
|||
db.sqlite3 |
|||
db.sqlite3-journal |
|||
|
|||
# Flask stuff: |
|||
instance/ |
|||
.webassets-cache |
|||
|
|||
# Scrapy stuff: |
|||
.scrapy |
|||
|
|||
# Sphinx documentation |
|||
docs/_build/ |
|||
doc/_build/ |
|||
|
|||
# PyBuilder |
|||
target/ |
|||
|
|||
# Jupyter Notebook |
|||
.ipynb_checkpoints |
|||
|
|||
# IPython |
|||
profile_default/ |
|||
ipython_config.py |
|||
|
|||
# pyenv |
|||
.python-version |
|||
|
|||
# pipenv |
|||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. |
|||
# However, in case of collaboration, if having platform-specific dependencies or dependencies |
|||
# having no cross-platform support, pipenv may install dependencies that don't work, or not |
|||
# install all needed dependencies. |
|||
#Pipfile.lock |
|||
|
|||
# celery beat schedule file |
|||
celerybeat-schedule |
|||
|
|||
# SageMath parsed files |
|||
*.sage.py |
|||
|
|||
# Environments |
|||
.env |
|||
.venv |
|||
env/ |
|||
venv/ |
|||
ENV/ |
|||
env.bak/ |
|||
venv.bak/ |
|||
|
|||
# Spyder project settings |
|||
.spyderproject |
|||
.spyproject |
|||
|
|||
# Rope project settings |
|||
.ropeproject |
|||
|
|||
# mkdocs documentation |
|||
/site |
|||
|
|||
# mypy |
|||
.mypy_cache/ |
|||
.dmypy.json |
|||
dmypy.json |
|||
|
|||
# Pyre type checker |
|||
.pyre/ |
|||
|
|||
# Poetry |
|||
# Uncomment if you are NOT using Poetry |
|||
#poetry.lock |
|||
|
|||
# End of file |
|||
@ -0,0 +1,67 @@ |
|||
## 安装步骤 |
|||
|
|||
### 1. 安装 Python |
|||
确保你已安装 Python 3.9 或更高版本。你可以从 [Python 官方网站](https://www.python.org/downloads/) 下载并安装。 |
|||
|
|||
验证 Python 是否安装成功: |
|||
```bash |
|||
python --version |
|||
``` |
|||
|
|||
### 2. 安装 virtualenv |
|||
`virtualenv` 是一个用于创建隔离的 Python 环境的工具。如果尚未安装,请运行以下命令: |
|||
```bash |
|||
pip install virtualenv |
|||
``` |
|||
|
|||
### 3. 创建虚拟环境 |
|||
在项目根目录下运行以下命令,创建一个虚拟环境: |
|||
```bash |
|||
virtualenv venv |
|||
``` |
|||
|
|||
激活虚拟环境: |
|||
- **Windows**: |
|||
```bash |
|||
.\venv\Scripts\activate |
|||
``` |
|||
- **Linux/macOS**: |
|||
```bash |
|||
source venv/bin/activate |
|||
``` |
|||
|
|||
### 4. 安装依赖 |
|||
在虚拟环境中,运行以下命令安装项目依赖: |
|||
```bash |
|||
pip install -r requirements.txt |
|||
``` |
|||
|
|||
`requirements.txt` 文件包含了项目所需的所有依赖及其版本。确保该文件位于项目根目录下。 |
|||
|
|||
### 5. 运行项目 |
|||
在虚拟环境中,运行以下命令启动项目: |
|||
```bash |
|||
python main.py |
|||
``` |
|||
|
|||
如果项目包含多个脚本,根据需要运行相应的脚本文件。 |
|||
|
|||
## 示例结构 |
|||
以下是项目的文件结构示例: |
|||
``` |
|||
project-root/ |
|||
├── README.md |
|||
├── requirements.txt |
|||
├── main.py |
|||
├── data/ |
|||
│ └── example.txt |
|||
└── venv/ |
|||
``` |
|||
|
|||
## 注意事项 |
|||
1. **虚拟环境**:始终在虚拟环境中运行项目,以避免全局环境的依赖冲突。 |
|||
2. **依赖更新**:如果需要更新依赖,请运行以下命令: |
|||
```bash |
|||
pip install --upgrade -r requirements.txt |
|||
``` |
|||
3. **退出虚拟环境**:运行 `deactivate` 命令退出虚拟环境。 |
|||
@ -0,0 +1,5 @@ |
|||
# coding=utf-8

# Search keywords to crawl on the Sina complaint site (tousu.sina.com.cn).
KEYWORDS = ['咸鱼之王']

# Highest page number to request per keyword (consumed by main.process_search_results).
MAX_PAGE = 10

# Raw Cookie header copied from a logged-in browser session.
# NOTE(review): contains live session secrets (SUB/SCF) — this should not be
# committed to version control; prefer CookieUtils.read_cookie_dict_from_file.
COOKIE = 'UOR=www.baidu.com,tousu.sina.com.cn,; SINAGLOBAL=180.109.135.223_1734009009.184774; Apache=180.109.135.223_1734009009.184775; ULV=1737698014297:2:1:1:180.109.135.223_1734009009.184775:1734009008920; ALF=02_1742823137; SCF=AnbhzHKrnUQl7Hr1ketFkwfNrrNrnZoluPHwHKF6Cd5jepxd4jnBttaKovu1rtniHOjeKih3dtFzpJfX3fnoexE.; SUB=_2A25KvaGxDeRhGeVO6VQW9S7FzD2IHXVpsrt5rDV_PUJbkNAbLRGkkW9NTWl_zXi3K6i7_10g-b280K9gc5zMpYjq; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFTxn..WU0jomaP4CuWPAC85NHD95Q0ehzcS0-71KMpWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNMcxo1g-NIPLNd7tt; U_TRS1=000000d8.6b0c8bd59.67b9d1e3.a609672c; U_TRS2=000000d8.6b148bd59.67b9d1e3.70e955ed'
|||
@ -0,0 +1,87 @@ |
|||
# _*_ coding: utf-8 _*_ |
|||
|
|||
import execjs |
|||
import requests |
|||
from loguru import logger |
|||
import os |
|||
from datetime import datetime |
|||
import config |
|||
|
|||
from utils import CookieUtils |
|||
|
|||
|
|||
# sign.js holds the reverse-engineered signing algorithm; save it alongside this script.
def load_js(file_path="sign.js"):
    """Read the JS signing script at *file_path* and return a compiled execjs context."""
    with open(file_path, encoding="utf-8") as source:
        return execjs.compile(source.read())
|||
|
|||
|
|||
def generate_signature(keyword, page, js_context):
    """Invoke the JS `get_signature` helper; returns its ts/rs/signature mapping."""
    return js_context.call('get_signature', keyword, page)
|||
|
|||
|
|||
def perform_search(cookies, keyword, page, js_context):
    """Fetch one page of search results from the Sina complaint API.

    Raises RuntimeError on a non-200 status or when the body is not valid JSON;
    otherwise returns the decoded JSON payload.
    """
    sig = generate_signature(keyword, page, js_context)
    logger.info(f'抓取第 {page} 页.')

    headers = {
        "accept": "*/*",
        "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "priority": "u=1, i",
        "referer": "https://cq.tousu.sina.com.cn/index/search/?keywords=%E5%92%B8%E9%B1%BC%E4%B9%8B%E7%8E%8B&t=1",
        "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
        # copy from your own browser session
        "x-requested-with": "XMLHttpRequest"
    }

    query = {
        'ts': sig['ts'],
        'rs': sig['rs'],
        'signature': sig['signature'],
        'keywords': keyword,
        'page_size': '10',
        'page': str(page),
    }

    resp = requests.get('https://tousu.sina.com.cn/api/index/s', cookies=cookies, params=query, headers=headers)
    if resp.status_code != 200:
        raise RuntimeError(f"响应异常 状态码: {resp.status_code}")
    try:
        return resp.json()
    except Exception:
        raise RuntimeError(f"Json解析异常 响应体: {resp.text}")
|||
|
|||
|
|||
def process_search_results(cookies, keyword, max_page):
    """Fetch pages 1..max_page of results for *keyword* and collect each page's JSON.

    Pages that raise are logged and skipped; the list of successful results is
    returned (possibly shorter than max_page).
    """
    datas = []
    js_context = load_js()
    # Bug fix: the original `range(1, max_page)` stopped at max_page - 1, so the
    # final page was never fetched; the `[{page}/{max_page}]` log format shows
    # pages are meant to run up to and including max_page.
    for page in range(1, max_page + 1):
        try:
            result = perform_search(cookies, keyword, page, js_context)
            datas.append(result)
            logger.info(f'搜索结果({keyword}[{page}/{max_page}]): {result}')
        except Exception as e:
            logger.error(f"出现异常: 关键词: {keyword} 页码: {page}")
            logger.error(f"异常信息: {str(e)}")
    return datas
|||
|
|||
|
|||
if __name__ == '__main__':
    cookies = CookieUtils.cookie_str_to_dict(config.COOKIE)
    # makedirs(exist_ok=True) replaces the check-then-create pair, which was
    # racy and would still crash if './data' existed as a file.
    os.makedirs('./data', exist_ok=True)
    for k in config.KEYWORDS:
        all_result = process_search_results(cookies, k, config.MAX_PAGE)
        # NOTE(review): each line is the Python repr of a response dict, so the
        # .json file is not strictly valid JSON — confirm whether downstream
        # consumers expect json.dumps output instead.
        with open(f"./data/{k}_{datetime.now().strftime('%Y_%m_%d_%H%M%S')}.json", 'w', encoding='utf-8') as file:
            for item in all_result:
                file.write(f"{item}\n")
        logger.success(f"{k} 查询结果已保存")
|||
|
|||
@ -0,0 +1,11 @@ |
|||
certifi==2025.1.31 |
|||
charset-normalizer==3.4.1 |
|||
colorama==0.4.6 |
|||
idna==3.10 |
|||
loguru==0.7.3 |
|||
PyExecJS==1.5.1 |
|||
requests==2.32.3 |
|||
six==1.17.0 |
|||
urllib3==2.3.0 |
|||
wheel==0.41.2 |
|||
win32_setctime==1.2.0 |
|||
@ -0,0 +1,38 @@ |
|||
const crypto = require('crypto'); |
|||
|
|||
function sha256Hash(input) { |
|||
return crypto.createHash('sha256').update(input).digest('hex'); |
|||
} |
|||
|
|||
// Build the { ts, rs, signature } triple the complaint-search API expects.
function get_signature(keyword, page) {
    const ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

    // Random alphanumeric string. When `variable` is true the length is drawn
    // from [minLen, maxLen]; otherwise it is exactly minLen.
    function randomString(variable, minLen, maxLen) {
        let length = minLen;
        if (variable) {
            length = Math.round(Math.random() * (maxLen - minLen)) + minLen;
        }
        let out = "";
        for (let k = 0; k < length; k++) {
            out += ALPHABET[Math.round(Math.random() * (ALPHABET.length - 1))];
        }
        return out;
    }

    const ts = new Date().getTime();    // millisecond timestamp
    const rs = randomString(false, 16); // fixed-length 16-char nonce

    // The site sorts the parts (default lexicographic sort after string
    // coercion) and hashes the concatenation: timestamp, nonce, hard-coded
    // salt, keyword, page size (10) and page number.
    const parts = [ts, rs, '$d6eb7ff91ee257475%', keyword, 10, page];
    const signature = sha256Hash(parts.sort().join(""));

    return {
        'ts': ts,
        'rs': rs,
        'signature': signature
    };
}
|||
@ -0,0 +1,117 @@ |
|||
import requests |
|||
import random |
|||
import hashlib |
|||
import time |
|||
import json |
|||
from bs4 import BeautifulSoup |
|||
|
|||
|
|||
# Session cookies copied from a logged-in browser — replace with your own.
cookies = {
    "UOR": "www.baidu.com,tousu.sina.com.cn,",
    "SINAGLOBAL": "180.109.135.223_1734009009.184774",
    "Apache": "180.109.135.223_1734009009.184775",
    "ULV": "1737698014297:2:1:1:180.109.135.223_1734009009.184775:1734009008920",
    "ALF": "02_1742823137",
    "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WFTxn..WU0jomaP4CuWPAC85NHD95Q0ehzcS0-71KMpWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNMcxo1g-NIPLNd7tt",
    "U_TRS1": "000000d8.6b0c8bd59.67b9d1e3.a609672c",
    "U_TRS2": "000000d8.6b148bd59.67b9d1e3.70e955ed"
}
|||
|
|||
# Request headers copied from the browser's DevTools — replace with your own.
headers = {
    "accept": "*/*",
    "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
    "cache-control": "no-cache",
    "pragma": "no-cache",
    "priority": "u=1, i",
    "referer": "https://cq.tousu.sina.com.cn/index/search/?keywords=%E5%92%B8%E9%B1%BC%E4%B9%8B%E7%8E%8B&t=1",
    "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"macOS\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",  # obtain from your own session
    "x-requested-with":"XMLHttpRequest"
}
|||
|
|||
#[l, p, b, h, c, d["type" + e]].sort().join("") |
|||
def generate_random_string(e=True, t=4, r=16):
    """Random alphanumeric string.

    Length is exactly *t* when *e* is falsy, otherwise uniform in [t, r].
    Mirrors generateRandomString in sign.js.
    """
    alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    size = random.randint(t, r) if e else t
    return ''.join(random.choice(alphabet) for _ in range(size))
|||
|
|||
def get_sha256(value):
    """Return the SHA-256 digest of *value* as a lowercase hex string.

    :param value: text to hash; encoded as UTF-8 before digesting
    :return: 64-character lowercase hexadecimal digest
    """
    return hashlib.sha256(value.encode("utf-8")).hexdigest()
|||
|
|||
# Bug fix: pandas is used at the very end (pd.DataFrame) but was never imported,
# so the original script raised NameError after the entire crawl finished.
import pandas as pd

requests.packages.urllib3.disable_warnings()
sessions = requests.session()
data = []    # one [starttime, endtime, c_num] row per complaint detail page
number = 0   # running count of inspected search hits

for i in range(1, 101):  # 1524
    print(i)
    url_list = []
    # Pause every 50 collected rows to stay under the site's rate limit.
    if len(data) % 50 == 0 and len(data) != 0:
        time.sleep(60)
    while True:
        # Rebuild the signed query on every retry — the timestamp must be fresh.
        ts = str(int(time.time() * 1000))  # ts: millisecond timestamp
        l = ts
        rs = generate_random_string(True, 4, 16)
        p = rs  # rs: random nonce
        b = '$d6eb7ff91ee257475%'  # fixed salt used by the site's signer
        h = '外卖 食品安全'  # keywords
        c = '10'  # page_size
        d = str(i)  # page number (d["type" + e] in the original JS)
        # The site sorts the parts lexicographically before hashing.
        signature = ''.join(sorted([l, p, b, h, c, d]))
        signature = get_sha256(signature)
        params = {
            'ts': ts,
            'rs': rs,
            'signature': signature,
            'keywords': h,
            'page_size': c,
            'page': d,
        }
        try:
            response = sessions.get('https://tousu.sina.com.cn/api/index/s', cookies=cookies,
                                    headers=headers, params=params, verify=False, allow_redirects=False)
            response = json.loads(response.text)['result']['data']['lists']
            for n in range(len(response)):
                # Bug fix: compare to None with `is`, not `==` (PEP 8; the
                # equality form can be hijacked by __eq__ overrides).
                if response[n]['main']['evaluate_u'] is None:
                    number += 1
                    continue
                else:
                    url = response[n]['main']['url']
                    url_list.append(url)
                    number += 1
            break
        except Exception as e:
            # NOTE(review): if json.loads succeeded, `response` is a list here
            # and .text will itself raise — pre-existing quirk, left as-is.
            print(e, response.text, i)
            time.sleep(300)
            continue
    for url in url_list:
        while True:
            try:
                response = sessions.get('https:' + url, cookies=cookies, headers=headers, verify=False, allow_redirects=False)
                soup = BeautifulSoup(response.text, 'html.parser')
                u_date_elements = soup.find_all(class_='u-date')
                u_list = soup.find('ul', class_='ts-q-list')
                c_num = u_list.find_all('li')[0].text
                # Indices into the .u-date elements: position 2 is the complaint
                # end time, position 6 the start time — presumably fixed by the
                # page layout; TODO confirm against a live detail page.
                endtime = u_date_elements[2].text
                starttime = u_date_elements[6].text
                data.append([starttime, endtime, c_num])
                break
            except Exception as e:
                print(e, response.text, i)
                time.sleep(60)
                continue

data = pd.DataFrame(data, columns=['starttime', 'endtime', 'c_num'])
|||
@ -0,0 +1,2 @@ |
|||
# coding=utf-8 |
|||
from .cookie_utils import CookieUtils |
|||
@ -0,0 +1,34 @@ |
|||
# coding=utf-8 |
|||
|
|||
import urllib.parse |
|||
import os |
|||
|
|||
|
|||
class CookieUtils:
    """Helpers for turning raw browser cookie strings into requests-style dicts."""

    @staticmethod
    def cookie_str_to_dict(cookie_string):
        """Parse a 'k1=v1; k2=v2' cookie header string into a {key: value} dict.

        Keys and values are stripped of surrounding whitespace and values are
        URL-decoded. Empty fragments and fragments without an '=' are skipped
        (the original unpack of pair.split('=', 1) raised ValueError on them).
        """
        cookie_dict = {}
        # Cookies are separated by semicolons.
        for pair in cookie_string.split(';'):
            pair = pair.strip()
            # Robustness fix: ignore malformed fragments instead of crashing.
            if not pair or '=' not in pair:
                continue
            # Key and value are joined by the first '='; the value may itself
            # contain '=' characters, hence maxsplit=1.
            key, value = pair.split('=', 1)
            cookie_dict[key.strip()] = urllib.parse.unquote(value.strip())
        return cookie_dict

    @staticmethod
    def read_cookie_dict_from_file(path='./cookie'):
        """Read a cookie string from *path* and parse it.

        Raises RuntimeError when the file does not exist.
        """
        if not os.path.exists(path):
            raise RuntimeError(f"找不到cookie文件: {path}")
        with open(path, 'r', encoding='utf-8') as file:
            cookie_str = file.read()
        return CookieUtils.cookie_str_to_dict(cookie_str)
|||
Write
Preview
Loading…
Cancel
Save
Reference in new issue