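"""Small same-domain crawler (sketch).

Fetches a start URL, collects in-domain page links, and extracts page
content via helpers from the local ``tools`` package (not shown here).
Results accumulate in ``Start.res`` and are dumped to ``res.json``.
"""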
import json
import random
import time
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

from tools.class_int import RequestsInt
from tools.deal_html import deal_html, extract_chinese_and_english


class Start(object):
    def __init__(self, url):
        self.url = url
        self.requests = RequestsInt(url)
        self.res = []

        # ❌ resource extensions to filter out (not HTML pages)
        self.exclude_ext = {
            ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg",
            ".mp4", ".avi", ".mov", ".wmv", ".flv", ".mkv",
            ".mp3", ".wav",
            ".pdf", ".zip", ".rar", ".7z",
            ".css", ".js", ".ico", ".woff", ".woff2", ".ttf",
        }
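
    # NOTE (assumption): RequestsInt is a local wrapper not shown here.
    # From its usage below, .get() appears to return a requests.Response-like
    # object (.status_code / .text), and .get_page() appears to return
    # rendered HTML (a str) for JavaScript-heavy pages.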

    # ✅ decide whether a URL looks like an HTML page
    def is_valid_page(self, url):
        path = urlparse(url).path.lower()
        for ext in self.exclude_ext:
            if path.endswith(ext):
                return False
        return True
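
    # Illustrative behaviour (example, not from the original source):
    #   is_valid_page("https://example.com/about")    -> True
    #   is_valid_page("https://example.com/logo.png") -> False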

    def get_internal_links(self, html):
        soup = BeautifulSoup(html, "lxml")
        base_domain = urlparse(self.url).netloc
        links = set()

        for a in soup.find_all("a", href=True):
            href = a["href"].strip()

            # ❌ skip non-navigational schemes
            if href.startswith(("javascript:", "#", "mailto:", "tel:")):
                continue

            # ✅ resolve to an absolute URL
            full_url = urljoin(self.url, href)

            # ✅ keep same-domain links only
            if urlparse(full_url).netloc != base_domain:
                continue

            # ✅ drop resource files
            if not self.is_valid_page(full_url):
                continue

            links.add(full_url)

        return list(links)
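
    # Example (illustrative): with self.url = "https://example.com/" and
    # html = '<a href="/about">a</a><a href="https://other.com/">b</a>',
    # this returns ["https://example.com/about"]; the off-domain link is
    # dropped, as any resource URL would be.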

    def deal(self, url, html=None):
        if not html:
            resp = self.requests.get(url)
            if resp.status_code == 200:
                html = resp.text
            else:
                print(f'unexpected status, skipping: {resp.status_code}')
                return {}
        data = deal_html(html, url)
        self.res.append(data)
        return data

    def dealpage(self, url, html):
        data = extract_chinese_and_english(html, url)
        self.res.append(data)
        return data

    def _time_sleep(self, k=None):
        # random 0-1 s jitter between requests
        if k is None:
            k = random.uniform(0, 1)
        print('waiting:', k)
        time.sleep(k)

    def fetch_all_links(self, html):
        all_links = self.get_internal_links(html)
        print("second-level links:", all_links)
        print('second-level link count:', len(all_links))
        return all_links

    def run(self):
        html = self.requests.get(self.url).text

        # Heuristic: if the raw HTML already yields links, treat the site
        # as static; otherwise assume it is JS-rendered and re-fetch the
        # pages with get_page().
        all_links = self.fetch_all_links(html)
        if len(all_links) > 1:
            print('static page ---')
            res = self.deal(url=self.url, html=html)
            print(f'homepage fetched: {res.get("title")}')
            for index, link in enumerate(all_links[:50]):
                try:
                    res = self.deal(url=link)
                    print(f'{link}: {res.get("title")} {index + 1}')
                    self._time_sleep()
                except Exception as e:
                    print(f'error: {link} {e}')
            print(f'all fetched: {self.url} {len(self.res)}')
            return self.res
        else:
            print('dynamic page ---')
            html = self.requests.get_page(self.url)
            res = self.dealpage(url=self.url, html=html)
            print(f'homepage fetched: {res.get("title")}')
            all_links = self.fetch_all_links(html)
            for index, link in enumerate(all_links[:10]):
                try:
                    html = self.requests.get_page(link)
                    res = self.dealpage(url=link, html=html)
                    print(f'{link}: {res.get("title")} {index + 1}')
                    self._time_sleep()
                except Exception as e:
                    print(f'error: {link} {e}')
            print(f'all fetched: {self.url} {len(self.res)}')
            return self.res


class StartPage(Start):
    # placeholder subclass; currently identical to Start
    pass


if __name__ == '__main__':
    url = 'https://www.essilor.com/cn-zh/'
    d = Start(url).run()
    with open('res.json', 'w', encoding='utf-8') as f:
        json.dump(d, f, ensure_ascii=False)
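
# Example of consuming the dump (illustrative):
#   with open('res.json', encoding='utf-8') as f:
#       pages = json.load(f)
#   print(pages[0].get('title'))
# Each entry is a dict from deal_html / extract_chinese_and_english and,
# judging from the usage above, appears to carry at least a "title" key.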