import requests
import random
import hashlib
import time
import json
import pandas as pd
from bs4 import BeautifulSoup

cookies = {  # copy these from your own logged-in browser session
    "UOR": "www.baidu.com,tousu.sina.com.cn,",
    "SINAGLOBAL": "180.109.135.223_1734009009.184774",
    "Apache": "180.109.135.223_1734009009.184775",
    "ULV": "1737698014297:2:1:1:180.109.135.223_1734009009.184775:1734009008920",
    "ALF": "02_1742823137",
    "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WFTxn..WU0jomaP4CuWPAC85NHD95Q0ehzcS0-71KMpWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNMcxo1g-NIPLNd7tt",
    "U_TRS1": "000000d8.6b0c8bd59.67b9d1e3.a609672c",
    "U_TRS2": "000000d8.6b148bd59.67b9d1e3.70e955ed"
}

headers = {  # copy these from your own browser session
    "accept": "*/*",
    "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
    "cache-control": "no-cache",
    "pragma": "no-cache",
    "priority": "u=1, i",
    "referer": "https://cq.tousu.sina.com.cn/index/search/?keywords=%E5%92%B8%E9%B1%BC%E4%B9%8B%E7%8E%8B&t=1",
    "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"macOS\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",  # use your own user agent
    "x-requested-with": "XMLHttpRequest"
}


# The site's JS builds the signature from [l, p, b, h, c, d["type" + e]].sort().join("")
def generate_random_string(e=True, t=4, r=16):
    """Mimic the site's random-string helper: an alphanumeric string of random length t..r (or fixed length t if e is False)."""
    chars = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    length = t if not e else random.randint(t, r)
    return ''.join(random.choice(chars) for _ in range(length))


def get_sha256(value):
    """
    SHA-256 hash.
    :param value: string to hash
    :return: hex digest of the hash
    """
    hsobj = hashlib.sha256()
    hsobj.update(value.encode("utf-8"))
    return hsobj.hexdigest()


requests.packages.urllib3.disable_warnings()
sessions = requests.Session()
data = []
number = 0

for i in range(1, 101):  # 1524 pages in total; only the first 100 are crawled here
    print(i)
    url_list = []
    # Pause for a minute after every 50 collected records to avoid rate limiting
    if len(data) % 50 == 0 and len(data) != 0:
        time.sleep(60)

    # Query the search API for one page of results, retrying on failure
    while True:
        ts = str(int(time.time() * 1000))  # millisecond timestamp
        l = ts
        rs = generate_random_string(True, 4, 16)
        p = rs
        b = '$d6eb7ff91ee257475%'  # fixed salt taken from the site's JS
        h = '外卖 食品安全'  # keywords: "food delivery" / "food safety"
        c = '10'  # page_size
        d = str(i)  # page number (d["type" + e] in the JS)
        # Sort the six fields lexicographically, concatenate, then hash
        signature = ''.join(sorted([l, p, b, h, c, d]))
        signature = get_sha256(signature)
        params = {
            'ts': ts,
            'rs': rs,
            'signature': signature,
            'keywords': h,
            'page_size': c,
            'page': d,
        }
        try:
            resp = sessions.get('https://tousu.sina.com.cn/api/index/s',
                                cookies=cookies, headers=headers, params=params,
                                verify=False, allow_redirects=False)
            lists = json.loads(resp.text)['result']['data']['lists']
            # print(lists)
            for item in lists:
                if item['main']['evaluate_u'] is None:  # skip complaints without an evaluation
                    number += 1
                    continue
                url_list.append(item['main']['url'])
                number += 1
            break
        except Exception as e:
            print(e, i)
            time.sleep(300)  # back off for five minutes, then retry this page
            continue

    # Visit each complaint detail page and extract the start time, end time, and the first info-list field
    for url in url_list:
        while True:
            try:
                response = sessions.get('https:' + url, cookies=cookies, headers=headers,
                                        verify=False, allow_redirects=False)
                soup = BeautifulSoup(response.text, 'html.parser')
                u_date_elements = soup.find_all(class_='u-date')
                u_list = soup.find('ul', class_='ts-q-list')
                c_num = u_list.find_all('li')[0].text    # first <li> of the complaint info list
                endtime = u_date_elements[2].text        # end time (3rd 'u-date' element)
                starttime = u_date_elements[6].text      # start time (7th 'u-date' element)
                data.append([starttime, endtime, c_num])
                break
            except Exception as e:
                print(e, i)
                time.sleep(60)  # wait a minute before retrying this detail page
                continue

data = pd.DataFrame(data, columns=['starttime', 'endtime', 'c_num'])