import requests
from bs4 import BeautifulSoup
import trafilatura


def fetch_html(url):
    """Fetch *url* and return its decoded HTML text.

    Raises requests.HTTPError on non-2xx responses and
    requests.RequestException on network failures/timeouts.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers, timeout=10)
    # Fail fast on HTTP errors instead of parsing an error page as content.
    resp.raise_for_status()
    # Many pages mis-declare their charset in headers; sniff it from the body.
    resp.encoding = resp.apparent_encoding
    return resp.text


def extract_meta(soup):
    """Return (title, keywords, description) from a parsed page.

    Falls back to OpenGraph tags (og:title / og:description) when the
    plain <title> or <meta name=...> values are missing or empty.
    """
    # NOTE: soup.title.string is None when <title> has nested tags,
    # so guard both the tag and its string before stripping.
    title = ""
    if soup.title and soup.title.string:
        title = soup.title.string.strip()

    def get_meta(name):
        # <meta name="..." content="...">
        tag = soup.find("meta", attrs={"name": name})
        if tag and tag.get("content"):
            return tag["content"].strip()
        return ""

    # OpenGraph fallback: <meta property="og:..." content="...">
    def get_og(prop):
        tag = soup.find("meta", attrs={"property": prop})
        if tag and tag.get("content"):
            return tag["content"].strip()
        return ""

    keywords = get_meta("keywords")
    description = get_meta("description")
    if not title:
        title = get_og("og:title")
    if not description:
        description = get_og("og:description")
    return title, keywords, description


def extract_text(html):
    """Extract the main article text from raw HTML.

    Prefers trafilatura's main-content extraction; falls back to the
    full page text (scripts/styles stripped) so extraction never fails.
    """
    # Preferred: main-content extraction algorithm.
    text = trafilatura.extract(html)
    if text:
        return text
    # Fallback (guards against extraction failure): strip non-content
    # tags and return the remaining visible text.
    soup = BeautifulSoup(html, "lxml")
    for tag in soup(["script", "style", "noscript"]):
        tag.extract()
    return soup.get_text(separator="\n")


def parse_page(url):
    """Fetch *url* and return a dict with its metadata and main text.

    Keys: url, title, keywords, description, content.
    """
    html = fetch_html(url)
    soup = BeautifulSoup(html, "lxml")
    title, keywords, description = extract_meta(soup)
    content = extract_text(html)
    return {
        "url": url,
        "title": title,
        "keywords": keywords,
        "description": description,
        "content": content,
    }


if __name__ == "__main__":
    url = "https://www.lheia.com/hzhb.html"
    data = parse_page(url)
    for k, v in data.items():
        # Truncate each field for a readable console preview.
        print(f"{k}:\n{str(v)[:300]}\n")