You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
117 lines
4.2 KiB
117 lines
4.2 KiB
import hashlib
import json
import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
|
|
|
|
|
|
# Cookies sent with every request made by this script.
# Copy your own values (the original note says to copy them yourself);
# NOTE(review): these look like live session values captured from a browser —
# confirm they should be kept in source control.
cookies = {
    'UOR': 'www.baidu.com,tousu.sina.com.cn,',
    'SINAGLOBAL': '180.109.135.223_1734009009.184774',
    'Apache': '180.109.135.223_1734009009.184775',
    'ULV': '1737698014297:2:1:1:180.109.135.223_1734009009.184775:1734009008920',
    'ALF': '02_1742823137',
    'SUBP': (
        '0033WrSXqPxfM725Ws9jqgMF55529P9D9WFTxn..WU0jomaP4CuWPAC85NHD95Q0ehzcS0'
        '-71KMpWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNMcxo1g-NIPLNd7tt'
    ),
    'U_TRS1': '000000d8.6b0c8bd59.67b9d1e3.a609672c',
    'U_TRS2': '000000d8.6b148bd59.67b9d1e3.70e955ed',
}
|
|
|
|
# HTTP headers sent with every request made by this script.
# Copy your own values (the original note says to copy them yourself).
# NOTE(review): "sec-ch-ua" advertises Chrome 128 while "user-agent"
# advertises Chrome 133 — confirm whether that mismatch matters.
headers = {
    'accept': '*/*',
    'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
    'cache-control': 'no-cache',
    'pragma': 'no-cache',
    'priority': 'u=1, i',
    'referer': 'https://cq.tousu.sina.com.cn/index/search/?keywords=%E5%92%B8%E9%B1%BC%E4%B9%8B%E7%8E%8B&t=1',
    'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    # Obtain your own user-agent string.
    'user-agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36'
    ),
    'x-requested-with': 'XMLHttpRequest',
}
|
|
|
|
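# Signature input (this snippet appears to be copied from the site's JavaScript):
#   [l, p, b, h, c, d["type" + e]].sort().join("")
# i.e. the six strings are sorted and concatenated before being hashed.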
|
|
def generate_random_string(e=True, t=4, r=16):
    """Return a random string of ASCII digits and letters.

    :param e: when truthy, the length is drawn with ``random.randint(t, r)``
        (both ends inclusive); when falsy, the length is exactly ``t``
    :param t: fixed length, or lower bound of the random length
    :param r: upper bound of the random length (only used when ``e`` is truthy)
    :return: the generated string
    """
    alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    if e:
        size = random.randint(t, r)
    else:
        size = t
    picked = []
    for _ in range(size):
        picked.append(random.choice(alphabet))
    return ''.join(picked)
|
|
|
|
def get_sha256(value):
    """
    Compute the SHA-256 digest of a string.

    :param value: string to hash; it is encoded as UTF-8 before hashing
    :return: the digest as a hexadecimal string, exactly as produced by
        ``hexdigest()`` (lowercase hex digits)
    """
    encoded = value.encode("utf-8")
    return hashlib.sha256(encoded).hexdigest()
|
|
|
|
# ---------------------------------------------------------------------------
# Scraper driver.
#
# For each search-result page: call the signed search API, collect the detail
# URLs of the listed items, fetch every detail page, and pull three text
# fields out of it.  The rows are finally wrapped in a pandas DataFrame.
#
# NOTE(review): the original indentation was lost; the nesting below (detail
# pages fetched inside the per-page loop, `break` after the item loop) is the
# reading that matches `url_list` being reset on every page — confirm.
# ---------------------------------------------------------------------------

SEARCH_API_URL = 'https://tousu.sina.com.cn/api/index/s'
SIGNING_SALT = '$d6eb7ff91ee257475%'   # constant mixed into the signature
SEARCH_KEYWORDS = '外卖 食品安全'        # sent as the `keywords` parameter
PAGE_SIZE = '10'                       # sent as the `page_size` parameter
FIRST_PAGE = 1
LAST_PAGE = 100                        # original carried a "#1524" note — presumably the total page count; confirm
REQUEST_TIMEOUT = 30                   # seconds; without a timeout a stalled connection blocks forever
LISTING_RETRY_DELAY = 300              # seconds to wait after a failed search request
DETAIL_RETRY_DELAY = 60                # seconds to wait after a failed detail-page request
MAX_ATTEMPTS = 5                       # give up on a page after this many failures instead of looping forever

# Certificate verification is disabled on every request below (verify=False),
# so the matching urllib3 warnings are silenced.
# NOTE(review): verify=False accepts any TLS certificate — confirm this is intended.
requests.packages.urllib3.disable_warnings()
sessions = requests.session()
data = []      # one [starttime, endtime, c_num] row per successfully parsed detail page
number = 0     # running count of items seen in search results (with or without a detail URL)


def _body_of(http_response):
    """Return the response body for logging, or a placeholder if no response was received."""
    if http_response is None:
        return '<no response received>'
    return http_response.text


def _signed_search_params(page):
    """Build the query parameters, including the signature, for one search page.

    The signature is the SHA-256 hex digest of the six request strings
    (timestamp, random string, salt, keywords, page size, page) sorted and
    concatenated.
    """
    ts = str(int(time.time() * 1000))          # millisecond timestamp
    rs = generate_random_string(True, 4, 16)   # random string
    page_text = str(page)
    signature = get_sha256(
        ''.join(sorted([ts, rs, SIGNING_SALT, SEARCH_KEYWORDS, PAGE_SIZE, page_text]))
    )
    return {
        'ts': ts,
        'rs': rs,
        'signature': signature,
        'keywords': SEARCH_KEYWORDS,
        'page_size': PAGE_SIZE,
        'page': page_text,
    }


def _fetch_listing(page):
    """Fetch one search page.

    :param page: 1-based page number
    :return: ``(urls, item_count)`` where ``urls`` are the ``main.url`` values
        of items whose ``main.evaluate_u`` is not null and ``item_count`` is
        the number of items on the page; ``([], 0)`` if every attempt failed
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        http_response = None
        try:
            # Fresh timestamp/random string/signature on every attempt.
            http_response = sessions.get(
                SEARCH_API_URL,
                cookies=cookies,
                headers=headers,
                params=_signed_search_params(page),
                verify=False,
                allow_redirects=False,
                timeout=REQUEST_TIMEOUT,
            )
            items = json.loads(http_response.text)['result']['data']['lists']
            # Built in one go so a failure half-way through never leaves
            # partial results behind for the retry to duplicate.
            urls = [
                item['main']['url']
                for item in items
                if item['main']['evaluate_u'] is not None
            ]
            return urls, len(items)
        except Exception as exc:  # retry boundary: log whatever went wrong and try again
            print(exc, _body_of(http_response), page)
            if attempt < MAX_ATTEMPTS:
                time.sleep(LISTING_RETRY_DELAY)
    print('giving up on search page', page, 'after', MAX_ATTEMPTS, 'attempts')
    return [], 0


def _fetch_complaint_row(url, page):
    """Fetch one detail page and extract its row.

    :param url: detail URL as returned by the search API; ``'https:'`` is
        prepended, so it is presumably protocol-relative — confirm
    :param page: search page the URL came from (used for log output only)
    :return: ``[starttime, endtime, c_num]`` or ``None`` if every attempt failed
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        http_response = None
        try:
            http_response = sessions.get(
                'https:' + url,
                cookies=cookies,
                headers=headers,
                verify=False,
                allow_redirects=False,
                timeout=REQUEST_TIMEOUT,
            )
            soup = BeautifulSoup(http_response.text, 'html.parser')
            date_nodes = soup.find_all(class_='u-date')
            summary_list = soup.find('ul', class_='ts-q-list')
            c_num = summary_list.find_all('li')[0].text   # text of the first <li> in ul.ts-q-list
            endtime = date_nodes[2].text                  # third element with class "u-date"
            starttime = date_nodes[6].text                # seventh element with class "u-date"
            return [starttime, endtime, c_num]
        except Exception as exc:  # retry boundary: log whatever went wrong and try again
            print(exc, _body_of(http_response), page)
            if attempt < MAX_ATTEMPTS:
                time.sleep(DETAIL_RETRY_DELAY)
    print('giving up on detail page', url, 'after', MAX_ATTEMPTS, 'attempts')
    return None


for i in range(FIRST_PAGE, LAST_PAGE + 1):
    print(i)
    # Pause for a minute whenever the collected row count sits on a non-zero
    # multiple of 50 at the start of a page.
    if data and len(data) % 50 == 0:
        time.sleep(60)

    url_list, listed = _fetch_listing(i)
    number += listed

    for url in url_list:
        row = _fetch_complaint_row(url, i)
        if row is not None:
            data.append(row)

data = pd.DataFrame(data, columns=['starttime', 'endtime', 'c_num'])
|