import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import trafilatura


def extract_meta(soup):
    """Extract (title, keywords, description) from a parsed page.

    Falls back to the OpenGraph ``og:title`` / ``og:description``
    properties when the standard ``<title>`` / meta description
    are missing or empty.
    """
    title = soup.title.string if soup.title else ""

    def get_meta(name):
        # <meta name="..."> with a non-empty content attribute
        tag = soup.find("meta", attrs={"name": name})
        if tag and tag.get("content", ''):
            return tag["content"].strip()
        return ""

    # Fallback: OpenGraph <meta property="og:...">
    def get_og(prop):
        tag = soup.find("meta", attrs={"property": prop})
        if tag and tag.get("content", ''):
            return tag["content"].strip()
        return ""

    keywords = get_meta("keywords")
    description = get_meta("description")
    if not title:
        title = get_og("og:title")
    if not description:
        description = get_og("og:description")
    return title, keywords, description


def extract_text(html):
    """Extract the main body text of an HTML document.

    Tries trafilatura's main-content extraction first; if that yields
    nothing, falls back to stripping scripts/styles and dumping all text.
    """
    # Preferred: dedicated main-content extraction algorithm
    text = trafilatura.extract(html)
    if text:
        return text
    # Fallback (in case extraction fails): remove non-content tags,
    # then return the raw visible text.
    soup = BeautifulSoup(html, "lxml")
    for tag in soup(["script", "style", "noscript"]):
        tag.extract()
    return soup.get_text(separator="\n")


def extract_news(html):
    """Collect headline entries: ``[{'title': ..., 'text': <time str>}, ...]``.

    A link qualifies when it wraps a heading tag (h1-h5); the timestamp
    is taken from a child div/span whose class list contains 'time' or
    'date'. Titles shorter than 5 characters are dropped as noise.
    """
    soup = BeautifulSoup(html, "lxml")
    results = []
    for a in soup.find_all("a", href=True):
        # Headline: heading element inside the anchor
        h = a.find(["h1", "h2", "h3", "h4", "h5"])
        if not h:
            continue
        title = h.get_text(strip=True)
        # Timestamp (common classes: time / date).
        # BUGFIX: the original predicate was
        #   name_ok and has_time or has_date
        # which, due to operator precedence, matched ANY tag whose class
        # contained 'date' regardless of tag name. Parenthesize the `or`.
        time_tag = a.find(
            lambda tag: tag.name in ["div", "span"]
            and ("time" in (tag.get("class") or [])
                 or "date" in (tag.get("class") or []))
        )
        time_text = time_tag.get_text(strip=True) if time_tag else ""
        # Drop implausibly short titles (noise)
        if len(title) < 5:
            continue
        results.append({
            "title": title,
            "text": time_text,
        })
    return results


def extract_blocks(html):
    """Return candidate content blocks as ``[{'text': ...}, ...]``.

    Scans every <a>/<div> container whose full text is at least 10
    characters and records the first sufficiently long child element.
    """
    soup = BeautifulSoup(html, "lxml")
    results = []
    # Candidate containers: every <a> and <div>
    candidates = soup.find_all(["a", "div"])
    for tag in candidates:
        text = tag.get_text(strip=True)
        # Filter out short fragments (noise)
        if len(text) < 10:
            continue
        # First sufficiently long child element wins
        for t in tag.find_all(["h1", "h2", "h3", "p", "span", "div"]):
            txt = t.get_text(strip=True)
            if len(txt) >= 10:
                # BUGFIX: removed leftover debug print(txt) that
                # polluted stdout on every match.
                results.append({
                    "text": txt,
                })
                break
    return results


def extract_chinese_and_english(html, url):
    """Return page metadata plus every distinct Chinese text run in *html*.

    The regex covers the CJK Unified Ideographs block
    (U+4E00-U+9FFF, common simplified/traditional characters) and
    Extension A (U+3400-U+4DBF, rare/archaic characters);
    ``re.UNICODE`` ensures encoding-compatible matching.
    """
    chinese_pattern = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf]+', re.UNICODE)
    # BUGFIX: de-duplicate with dict.fromkeys instead of set() so the
    # joined `content` keeps first-seen order (set iteration order is
    # unstable across runs, making output non-deterministic).
    chinese_list = dict.fromkeys(chinese_pattern.findall(html))
    soup = BeautifulSoup(html, "lxml")
    title = soup.title.string if soup.title else ""
    return {
        'url': url,
        "title": title,
        "keywords": '',
        "description": '',
        "content": ','.join(chinese_list)
    }


def deal_html(html, url):
    """Full extraction: meta fields (title/keywords/description) + body text."""
    soup = BeautifulSoup(html, "lxml")
    title, keywords, description = extract_meta(soup)
    content = extract_text(html)
    return {
        'url': url,
        "title": title,
        "keywords": keywords,
        "description": description,
        "content": content
    }


if __name__ == '__main__':
    with open('../t/1.html', 'r', encoding='utf-8') as f:
        h = f.read()
    # BUGFIX: the original called extract_chinese_and_english(h) with a
    # single argument; the function requires (html, url) and raised a
    # TypeError. Pass an empty url for this local-file smoke test.
    d = extract_chinese_and_english(h, '')
    print(d)