Crawler-related
import json
import random
import time
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

from tools.class_int import RequestsInt
from tools.deal_html import deal_html, extract_chinese_and_english

class Start(object):
    def __init__(self, url):
        self.url = url
        self.requests = RequestsInt(url)
        self.res = []
        # ❌ Resource extensions to filter out (not HTML pages)
        self.exclude_ext = {
            ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg",
            ".mp4", ".avi", ".mov", ".wmv", ".flv", ".mkv",
            ".mp3", ".wav",
            ".pdf", ".zip", ".rar", ".7z",
            ".css", ".js", ".ico", ".woff", ".woff2", ".ttf",
        }

    # ✅ Check whether the URL looks like a web page rather than a static asset
    def is_valid_page(self, url):
        path = urlparse(url).path.lower()
        for ext in self.exclude_ext:
            if path.endswith(ext):
                return False
        return True

    def get_internal_links(self, html):
        """Collect same-domain page links from the given HTML."""
        soup = BeautifulSoup(html, "lxml")
        base_domain = urlparse(self.url).netloc
        links = set()
        for a in soup.find_all("a", href=True):
            href = a["href"].strip()
            # ❌ Non-navigable schemes
            if href.startswith(("javascript:", "#", "mailto:", "tel:")):
                continue
            # ✅ Resolve to an absolute URL
            full_url = urljoin(self.url, href)
            # ✅ Same domain only
            if urlparse(full_url).netloc != base_domain:
                continue
            # ✅ Filter out static resource files
            if not self.is_valid_page(full_url):
                continue
            links.add(full_url)
        return list(links)
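    # Walkthrough of the resolution/filtering above (hypothetical hrefs,
    # assuming self.url = "https://example.com/"):
    #   "/about"                  -> "https://example.com/about"          kept
    #   "https://cdn.other.net/x" -> different netloc                     dropped
    #   "img/logo.png"            -> "https://example.com/img/logo.png"   dropped (asset)
    #   "javascript:void(0)"      -> non-navigable scheme                 dropped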

    def deal(self, url, html=None):
        """Fetch (if not supplied) and parse a static page; append the result to self.res."""
        if not html:
            resp = self.requests.get(url)
            if resp.status_code == 200:
                html = resp.text
            else:
                print(f'Unexpected response status, skipping: {resp.status_code}')
                return {}
        data = deal_html(html, url)
        self.res.append(data)
        return data

    def dealpage(self, url, html):
        """Parse an already-rendered (dynamic) page; append the result to self.res."""
        data = extract_chinese_and_english(html, url)
        self.res.append(data)
        return data

    def _time_sleep(self, k=None):
        # Random jitter between requests; `is None` so an explicit k=0 is honoured
        if k is None:
            k = random.uniform(0, 1)
        print('Sleeping:', k)
        time.sleep(k)

    def fetch_all_links(self, html):
        all_links = self.get_internal_links(html)
        print('Second-level page links:', all_links)
        print('Number of second-level links:', len(all_links))
        return all_links

    def run(self):
        html = self.requests.get(self.url).text
        all_links = self.fetch_all_links(html)
        if len(all_links) > 1:
            # The plain GET already yields internal links, so the site is
            # treated as server-rendered.
            print('Static page ---')
            res = self.deal(url=self.url, html=html)
            print(f'Home page done: {res.get("title")}')
            for index, link in enumerate(all_links[:50]):
                try:
                    res = self.deal(url=link)
                    print(f'{link}: {res.get("title")} {index + 1}')
                    self._time_sleep()
                except Exception as e:
                    print(f'Error: {link} {e}')
            print(f'All done: {self.url} {len(self.res)}')
            return self.res
        else:
            # Almost no links in the raw HTML: assume client-side rendering
            # and re-fetch with get_page (the rendered-page fetcher).
            print('Dynamic page ---')
            html = self.requests.get_page(self.url)
            res = self.dealpage(url=self.url, html=html)
            print(f'Home page done: {res.get("title")}')
            all_links = self.fetch_all_links(html)
            for index, link in enumerate(all_links[:10]):
                try:
                    html = self.requests.get_page(link)
                    res = self.dealpage(url=link, html=html)
                    print(f'{link}: {res.get("title")} {index + 1}')
                    self._time_sleep()
                except Exception as e:
                    print(f'Error: {link} {e}')
            print(f'All done: {self.url} {len(self.res)}')
            return self.res

class StartPage(Start):
    pass


if __name__ == '__main__':
    url = 'https://www.essilor.com/cn-zh/'
    d = Start(url).run()
    with open('res.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(d, ensure_ascii=False))
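
# The `tools` package is project-local and not shown in this file. The sketch
# below is only an inference from the calls above; names, signatures, and
# behaviour are assumptions, not the real implementation.
#
# class RequestsInt:                       # tools/class_int.py
#     def __init__(self, url): ...         # base URL; likely sets up a session/headers
#     def get(self, url): ...              # plain HTTP GET -> requests.Response
#     def get_page(self, url): ...         # rendered HTML text (likely a headless browser)
#
# def deal_html(html, url): ...                    # tools/deal_html.py -> dict with a "title" key
# def extract_chinese_and_english(html, url): ...  # same module -> dict with a "title" key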