import json
import random
import time
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

from tools.class_int import RequestsInt
from tools.deal_html import deal_html


class Start(object):
    """Crawl a site's homepage plus its same-domain second-level pages."""

    def __init__(self, url):
        self.url = url
        self.requests = RequestsInt(url)
        self.res = []
        # ❌ Resource extensions to filter out
        self.exclude_ext = {
            ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg",
            ".mp4", ".avi", ".mov", ".wmv", ".flv", ".mkv",
            ".mp3", ".wav", ".pdf", ".zip", ".rar", ".7z",
            ".css", ".js", ".ico", ".woff", ".woff2", ".ttf",
        }

    # ✅ Is this URL an HTML page (rather than a static asset)?
    def is_valid_page(self, url):
        path = urlparse(url).path.lower()
        return not any(path.endswith(ext) for ext in self.exclude_ext)

    def get_internal_links(self, html):
        """Extract same-domain page links from the homepage HTML."""
        soup = BeautifulSoup(html, "lxml")
        base_domain = urlparse(self.url).netloc
        links = set()
        for a in soup.find_all("a", href=True):
            href = a["href"].strip()
            # ❌ Skip non-navigable schemes and fragments
            if href.startswith(("javascript:", "#", "mailto:", "tel:")):
                continue
            # ✅ Resolve to an absolute URL
            full_url = urljoin(self.url, href)
            # ✅ Same domain only
            if urlparse(full_url).netloc != base_domain:
                continue
            # ✅ Filter out resource files
            if not self.is_valid_page(full_url):
                continue
            links.add(full_url)
        return list(links)

    def deal(self, url, html=None):
        """Fetch (if not supplied) and parse one page, appending the result."""
        if html is None:
            resp = self.requests.get(url)
            if resp.status_code != 200:
                print(f'Unexpected status code, skipping: {resp.status_code}')
                return {}
            html = resp.text
        data = deal_html(html, url)
        # print(data)
        self.res.append(data)
        return data

    def dealpage(self, url, html):
        """Parse already-fetched HTML (used for dynamically rendered pages)."""
        data = deal_html(html, url)
        self.res.append(data)
        return data

    def _time_sleep(self, k=None):
        # Random delay between requests to be polite to the server.
        if k is None:
            k = random.uniform(0, 1)
        print('Sleeping:', k)
        time.sleep(k)

    def fetch_all_links(self, html):
        all_links = self.get_internal_links(html)
        print("Second-level links:", all_links)
        print('Second-level link count:', len(all_links))
        return all_links

    def _crawl_links(self, links):
        # Visit each second-level link, parsing it and pausing in between.
        for index, link in enumerate(links):
            try:
                res = self.deal(url=link)
                print(f'{link}: {res.get("title")} {index + 1}')
                self._time_sleep()
            except Exception as e:
                print(f'Error: {link} {e}')

    def run(self):
        html = self.requests.get(self.url).text
        res = self.deal(url=self.url, html=html)
        print(f'Homepage fetched: {res.get("title")}')
        all_links = self.fetch_all_links(html)
        if all_links:
            print('Static page ---')
        else:
            # No links in the static HTML: the site is likely rendered
            # client-side, so re-fetch the homepage with the JS renderer.
            print('Dynamic page ---')
            html = self.requests.get_page(self.url)
            # print(html)
            res = self.dealpage(url=self.url, html=html)
            print(f'Homepage fetched: {res.get("title")}')
            all_links = self.fetch_all_links(html)
        self._crawl_links(all_links)
        # print(f'Fetched: {self.res}')
        print(f'All pages fetched: {self.url} {len(self.res)}')
        return self.res


class StartPage(Start):
    pass


if __name__ == '__main__':
    url = 'https://www.essilor.com/cn-zh/'
    d = Start(url).run()
    with open('res.json', 'w', encoding='utf-8') as f:
        json.dump(d, f, ensure_ascii=False)
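

# ----------------------------------------------------------------------
# The `tools` package is not included in this file. The sketch below is a
# hypothetical stand-in, inferred only from the call sites above:
# RequestsInt(url).get(url) returns a Response-like object exposing
# .status_code and .text, RequestsInt(url).get_page(url) returns rendered
# HTML as a string, and deal_html(html, url) returns a dict with at least
# a "title" key. The real implementations may differ; in particular,
# get_page() presumably drives a headless browser to render JavaScript.

import requests


class _RequestsIntSketch:
    """Minimal stand-in for tools.class_int.RequestsInt (assumption)."""

    def __init__(self, url):
        # Shared session so cookies and headers persist across requests.
        self.session = requests.Session()
        self.session.headers['User-Agent'] = 'Mozilla/5.0'

    def get(self, url):
        # Return the raw Response so callers can check status_code.
        return self.session.get(url, timeout=10)

    def get_page(self, url):
        # The real get_page() likely renders JS; this fallback only
        # returns the static HTML text.
        return self.session.get(url, timeout=10).text


def _deal_html_sketch(html, url):
    """Minimal stand-in for tools.deal_html.deal_html (assumption):
    keeps only the URL and the <title> text."""
    soup = BeautifulSoup(html, 'lxml')
    title = soup.title.get_text(strip=True) if soup.title else None
    return {'url': url, 'title': title}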