You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
75 lines
1.7 KiB
75 lines
1.7 KiB
import requests
|
|
from bs4 import BeautifulSoup
|
|
import trafilatura
|
|
|
|
def fetch_html(url, timeout=10):
    """Download *url* and return the response body decoded to text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Defaults to 10, the value that used to be hard-coded.

    Returns:
        The response body as a ``str``.

    Note:
        The HTTP status code is not checked, so the body of an error page
        is returned like any other body.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Decode with the encoding detected from the body bytes rather than the
    # one the response was initialised with.
    resp.encoding = resp.apparent_encoding
    return resp.text
|
|
|
|
|
|
def extract_meta(soup):
    """Return ``(title, keywords, description)`` read from a parsed page.

    Args:
        soup: Parsed document exposing ``.title`` and ``.find(...)``
            (a ``BeautifulSoup`` object in this file).

    Returns:
        A 3-tuple of stripped strings. Each element is ``""`` when the
        corresponding value is not present. ``title`` and ``description``
        fall back to the ``og:title`` / ``og:description`` meta properties
        when the primary source is empty.
    """

    def meta_content(attr, value):
        """Return the stripped ``content`` of the first matching <meta>, or ""."""
        tag = soup.find("meta", attrs={attr: value})
        if tag and tag.get("content"):
            return tag["content"].strip()
        return ""

    # ``.string`` is None when <title> is empty, so calling ``.strip()`` on it
    # would raise AttributeError; ``get_text()`` always returns a str.
    title_tag = soup.title
    title = title_tag.get_text().strip() if title_tag else ""

    keywords = meta_content("name", "keywords")
    description = meta_content("name", "description")

    # Fall back to Open Graph properties when the primary values are empty.
    if not title:
        title = meta_content("property", "og:title")
    if not description:
        description = meta_content("property", "og:description")

    return title, keywords, description
|
|
|
|
|
|
def extract_text(html):
    """Return the readable text of an HTML document.

    ``trafilatura.extract`` is tried first. If it yields nothing, the
    document is parsed with BeautifulSoup, ``script``/``style``/``noscript``
    elements are removed, and the remaining text is returned with newline
    separators.
    """
    # Preferred path: the main-content extraction algorithm.
    main_text = trafilatura.extract(html)
    if not main_text:
        # Fallback so that a failed extraction still produces some text.
        doc = BeautifulSoup(html, "lxml")
        for node in doc(["script", "style", "noscript"]):
            node.extract()
        return doc.get_text(separator="\n")
    return main_text
|
|
|
|
|
|
def parse_page(url):
    """Fetch *url* and return its metadata and text as a dict.

    The result has the keys ``url``, ``title``, ``keywords``,
    ``description`` and ``content``, in that order.
    """
    page_html = fetch_html(url)
    parsed = BeautifulSoup(page_html, "lxml")

    page_title, page_keywords, page_description = extract_meta(parsed)
    body_text = extract_text(page_html)

    return dict(
        url=url,
        title=page_title,
        keywords=page_keywords,
        description=page_description,
        content=body_text,
    )
|
|
|
|
|
|
if __name__ == "__main__":
    # Demo run: parse one sample page and print at most the first 300
    # characters of every field.
    url = "https://www.lheia.com/hzhb.html"
    for k, v in parse_page(url).items():
        print(f"{k}:\n{str(v)[:300]}\n")
|