import requests
import random
import hashlib
import time
import json
import pandas as pd
from bs4 import BeautifulSoup
cookies = {
    # Copy your own cookies from the browser
    "UOR": "www.baidu.com,tousu.sina.com.cn,",
    "SINAGLOBAL": "180.109.135.223_1734009009.184774",
    "Apache": "180.109.135.223_1734009009.184775",
    "ULV": "1737698014297:2:1:1:180.109.135.223_1734009009.184775:1734009008920",
    "ALF": "02_1742823137",
    "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WFTxn..WU0jomaP4CuWPAC85NHD95Q0ehzcS0-71KMpWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNMcxo1g-NIPLNd7tt",
    "U_TRS1": "000000d8.6b0c8bd59.67b9d1e3.a609672c",
    "U_TRS2": "000000d8.6b148bd59.67b9d1e3.70e955ed"
}
headers = {
    # Copy your own request headers from the browser
    "accept": "*/*",
    "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
    "cache-control": "no-cache",
    "pragma": "no-cache",
    "priority": "u=1, i",
    "referer": "https://cq.tousu.sina.com.cn/index/search/?keywords=%E5%92%B8%E9%B1%BC%E4%B9%8B%E7%8E%8B&t=1",
    "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"macOS\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",  # use your own user agent
    "x-requested-with": "XMLHttpRequest"
}
# Signature recipe taken from the site's JS: [l, p, b, h, c, d["type" + e]].sort().join("")
def generate_random_string(e=True, t=4, r=16):
    """Return a random alphanumeric string of length t, or of random length in [t, r] when e is True."""
    chars = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    length = t if not e else random.randint(t, r)
    return ''.join(random.choice(chars) for _ in range(length))
def get_sha256(value):
    """
    SHA-256 hash.
    :param value: string to hash
    :return: lowercase hex digest of the hash
    """
    hsobj = hashlib.sha256()
    hsobj.update(value.encode("utf-8"))
    return hsobj.hexdigest()
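
# Illustrative sketch (not called below): the request signature appears to be built by
# lexicographically sorting the six string fields (timestamp, random string, the fixed
# salt '$d6eb7ff91ee257475%', keywords, page_size, page), concatenating them, and
# hashing the result with SHA-256, mirroring the JS expression noted above. The helper
# name and its keyword-argument form are assumptions added for illustration only.
def build_signature(ts, rs, keywords, page_size, page, salt='$d6eb7ff91ee257475%'):
    """Sketch of the signing scheme used inline in the request loop below."""
    return get_sha256(''.join(sorted([ts, rs, salt, keywords, page_size, page])))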
requests.packages.urllib3.disable_warnings()
sessions = requests.session()
data = []
number = 0
for i in range(1, 101):  # 1524
    print(i)
    url_list = []
    # Pause after every 50 collected records to ease the request rate
    if len(data) % 50 == 0 and len(data) != 0:
        time.sleep(60)
    while True:
        ts = str(int(time.time() * 1000))  # ts: millisecond timestamp
        l = ts
        rs = generate_random_string(True, 4, 16)
        p = rs  # rs: random string
        b = '$d6eb7ff91ee257475%'
        h = '外卖 食品安全'  # keywords
        c = '10'  # page_size
        d = str(i)  # d["type" + e] = page
        signature = ''.join(sorted([l, p, b, h, c, d]))
        signature = get_sha256(signature)
        params = {
            'ts': ts,
            'rs': rs,
            'signature': signature,
            'keywords': h,
            'page_size': c,
            'page': d,
        }
        try:
            response = sessions.get('https://tousu.sina.com.cn/api/index/s', cookies=cookies,
                                    headers=headers, params=params, verify=False, allow_redirects=False)
            lists = json.loads(response.text)['result']['data']['lists']
            # Keep only complaints that already carry an evaluation; collect their detail-page URLs
            for n in range(len(lists)):
                if lists[n]['main']['evaluate_u'] is None:
                    number += 1
                    continue
                else:
                    url = lists[n]['main']['url']
                    url_list.append(url)
                    number += 1
            break
        except Exception as e:
            print(e, response.text, i)
            time.sleep(300)
            continue
    for url in url_list:
        while True:
            try:
                response = sessions.get('https:' + url, cookies=cookies, headers=headers,
                                        verify=False, allow_redirects=False)
                soup = BeautifulSoup(response.text, 'html.parser')
                u_date_elements = soup.find_all(class_='u-date')
                u_list = soup.find('ul', class_='ts-q-list')
                c_num = u_list.find_all('li')[0].text
                endtime = u_date_elements[2].text
                starttime = u_date_elements[6].text
                data.append([starttime, endtime, c_num])
                break
            except Exception as e:
                print(e, response.text, i)
                time.sleep(60)
                continue
data = pd.DataFrame(data, columns=['starttime', 'endtime', 'c_num'])
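
# A minimal way to persist the result, assuming CSV output is acceptable (hypothetical
# filename; the original script stops at building the DataFrame):
# data.to_csv('tousu_data.csv', index=False, encoding='utf-8-sig')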