# 爬虫相关 — crawler utilities: fetch a page, then extract its title,
# meta keywords/description, and main body text.
import requests
from bs4 import BeautifulSoup
import trafilatura
def fetch_html(url):
    """Download *url* and return the decoded HTML text.

    Sends a browser-like User-Agent with a 10-second timeout, then forces
    the response encoding to the apparent (content-sniffed) encoding to
    avoid mojibake on pages that mis-declare their charset.

    Raises:
        requests.HTTPError: on 4xx/5xx responses (fail fast instead of
            silently parsing an error page as content).
        requests.RequestException: on connection/timeout failures.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()  # bug fix: don't treat error pages as content
    resp.encoding = resp.apparent_encoding
    return resp.text
def extract_meta(soup):
    """Extract ``(title, keywords, description)`` from a parsed page.

    Reads the ``<title>`` tag and the ``keywords``/``description`` meta
    tags, falling back to Open Graph (``og:title`` / ``og:description``)
    when the standard ones are missing or empty.

    Args:
        soup: a BeautifulSoup-parsed document.

    Returns:
        A 3-tuple of stripped strings; each item is "" when absent.
    """
    # Bug fix: soup.title.string is None for an empty or nested <title>,
    # so guard it before calling .strip() (old code raised AttributeError).
    title = ""
    if soup.title and soup.title.string:
        title = soup.title.string.strip()

    def get_meta(name):
        # <meta name="..." content="...">
        tag = soup.find("meta", attrs={"name": name})
        if tag and tag.get("content"):
            return tag["content"].strip()
        return ""

    # Open Graph fallback: <meta property="og:..." content="...">
    def get_og(prop):
        tag = soup.find("meta", attrs={"property": prop})
        if tag and tag.get("content"):
            return tag["content"].strip()
        return ""

    keywords = get_meta("keywords")
    description = get_meta("description")
    if not title:
        title = get_og("og:title")
    if not description:
        description = get_og("og:description")
    return title, keywords, description
def extract_text(html):
    """Return the main readable text of *html*.

    Tries trafilatura's article-extraction algorithm first; if it yields
    nothing, falls back to a raw text dump with script/style/noscript
    tags removed.
    """
    # Preferred path: dedicated main-content extraction.
    extracted = trafilatura.extract(html)
    if extracted:
        return extracted
    # Fallback path (guards against extraction failure).
    fallback = BeautifulSoup(html, "lxml")
    for junk in fallback(["script", "style", "noscript"]):
        junk.extract()
    return fallback.get_text(separator="\n")
def parse_page(url):
    """Fetch *url* and return its parsed fields as a dict.

    Keys: ``url``, ``title``, ``keywords``, ``description``, ``content``.
    """
    html = fetch_html(url)
    page_soup = BeautifulSoup(html, "lxml")
    title, keywords, description = extract_meta(page_soup)
    body_text = extract_text(html)
    result = {
        "url": url,
        "title": title,
        "keywords": keywords,
        "description": description,
        "content": body_text,
    }
    return result
if __name__ == "__main__":
    # Smoke test: crawl one page and preview each extracted field.
    url = "https://www.lheia.com/hzhb.html"
    page = parse_page(url)
    for field, value in page.items():
        # Truncate long fields (notably content) for console readability.
        print(f"{field}:\n{str(value)[:300]}\n")