import requests
from bs4 import BeautifulSoup
import trafilatura


def fetch_html(url):
    """Fetch *url* and return its decoded HTML text.

    Raises requests.HTTPError on non-2xx responses and
    requests.RequestException on network failures/timeouts.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers, timeout=10)
    # Fail fast on HTTP errors instead of parsing an error page as content.
    resp.raise_for_status()
    # Many pages mis-declare their charset in headers; sniff it from the body.
    resp.encoding = resp.apparent_encoding
    return resp.text


def extract_meta(soup):
    """Return (title, keywords, description) from a parsed page.

    Falls back to OpenGraph tags (og:title / og:description) when the
    plain <title> or <meta name=...> values are missing or empty.
    """
    # NOTE: soup.title.string is None when <title> has nested tags,
    # so guard both the tag and its string before stripping.
    title = ""
    if soup.title and soup.title.string:
        title = soup.title.string.strip()

    def get_meta(name):
        # <meta name="..." content="...">
        tag = soup.find("meta", attrs={"name": name})
        if tag and tag.get("content"):
            return tag["content"].strip()
        return ""

    # OpenGraph fallback: <meta property="og:..." content="...">
    def get_og(prop):
        tag = soup.find("meta", attrs={"property": prop})
        if tag and tag.get("content"):
            return tag["content"].strip()
        return ""

    keywords = get_meta("keywords")
    description = get_meta("description")
    if not title:
        title = get_og("og:title")
    if not description:
        description = get_og("og:description")
    return title, keywords, description


def extract_text(html):
    """Extract the main article text from raw HTML.

    Prefers trafilatura's main-content extraction; falls back to the
    full page text (scripts/styles stripped) so extraction never fails.
    """
    # Preferred: main-content extraction algorithm.
    text = trafilatura.extract(html)
    if text:
        return text
    # Fallback (guards against extraction failure): strip non-content
    # tags and return the remaining visible text.
    soup = BeautifulSoup(html, "lxml")
    for tag in soup(["script", "style", "noscript"]):
        tag.extract()
    return soup.get_text(separator="\n")


def parse_page(url):
    """Fetch *url* and return a dict with its metadata and main text.

    Keys: url, title, keywords, description, content.
    """
    html = fetch_html(url)
    soup = BeautifulSoup(html, "lxml")
    title, keywords, description = extract_meta(soup)
    content = extract_text(html)
    return {
        "url": url,
        "title": title,
        "keywords": keywords,
        "description": description,
        "content": content,
    }


if __name__ == "__main__":
    url = "https://www.lheia.com/hzhb.html"
    data = parse_page(url)
    for k, v in data.items():
        # Truncate each field for a readable console preview.
        print(f"{k}:\n{str(v)[:300]}\n")