爬虫相关
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

148 lines
3.6 KiB

import re
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
import trafilatura
def extract_meta(soup):
    """Pull title, keywords and description out of a parsed page.

    Falls back to Open Graph tags (og:title / og:description) when
    the primary <title> / <meta name=...> values are missing.

    Returns a (title, keywords, description) tuple of strings.
    """
    page_title = soup.title.string if soup.title else ""

    def read_meta(attr, value):
        # Shared lookup for both <meta name=...> and <meta property=...>.
        node = soup.find("meta", attrs={attr: value})
        if node and node.get("content", ''):
            return node["content"].strip()
        return ""

    keywords = read_meta("name", "keywords")
    description = read_meta("name", "description")
    # Open Graph fallback for missing title/description.
    if not page_title:
        page_title = read_meta("property", "og:title")
    if not description:
        description = read_meta("property", "og:description")
    return page_title, keywords, description
def extract_text(html):
    """Extract readable body text from raw HTML.

    Tries trafilatura's article extraction first; if it yields nothing,
    falls back to BeautifulSoup with script/style/noscript stripped.
    """
    # Preferred: dedicated main-content extraction algorithm.
    extracted = trafilatura.extract(html)
    if extracted:
        return extracted
    # Fallback so extraction failures still produce plain text.
    dom = BeautifulSoup(html, "lxml")
    for junk in dom(["script", "style", "noscript"]):
        junk.extract()
    return dom.get_text(separator="\n")
def extract_news(html):
    """Scrape (title, time) pairs from anchor-based news listings.

    Looks for <a href=...> elements that contain a heading (h1-h5) and,
    optionally, a <div>/<span> whose class list includes "time" or "date".

    Returns a list of dicts shaped {"title": ..., "text": <time string>}
    (the key is "text" for backward compatibility with existing callers).
    """
    soup = BeautifulSoup(html, "lxml")
    results = []

    # BUG FIX: the original lambda read
    #   tag.name in [...] and "time" in classes or "date" in classes
    # which parses as (A and B) or C, so ANY element carrying a "date"
    # class matched regardless of its tag name. Parentheses restore the
    # intended "div/span AND (time OR date class)" semantics.
    def _is_time_tag(tag):
        classes = tag.get("class") or []
        return tag.name in ("div", "span") and ("time" in classes or "date" in classes)

    for a in soup.find_all("a", href=True):
        # Title must come from a heading element inside the link.
        h = a.find(["h1", "h2", "h3", "h4", "h5"])
        if not h:
            continue
        title = h.get_text(strip=True)
        # Timestamp (common class names: time / date).
        time_tag = a.find(_is_time_tag)
        time_text = time_tag.get_text(strip=True) if time_tag else ""
        # Drop too-short titles (navigation noise etc.).
        if len(title) < 5:
            continue
        results.append({
            "title": title,
            "text": time_text,
        })
    return results
def extract_blocks(html):
    """Collect candidate content blocks from a page.

    Scans every <a>/<div>, skips containers whose whole text is shorter
    than 10 characters, then records the first inner element (heading,
    paragraph, span or div) whose own text is at least 10 characters.

    Returns a list of dicts shaped [{"text": ...}, ...].
    """
    soup = BeautifulSoup(html, "lxml")
    results = []
    # Candidate containers that may hold a content block.
    candidates = soup.find_all(["a", "div"])
    for tag in candidates:
        text = tag.get_text(strip=True)
        # Filter out short containers (noise / navigation).
        if len(text) < 10:
            continue
        # First sufficiently long child text wins for this container.
        # FIX: removed a leftover debug print(txt) — library functions
        # should not write to stdout.
        for t in tag.find_all(["h1", "h2", "h3", "p", "span", "div"]):
            txt = t.get_text(strip=True)
            if len(txt) >= 10:
                results.append({
                    "text": txt,
                })
                break
    return results
def extract_chinese_and_english(html, url=""):
    """Extract the unique runs of Chinese characters from raw HTML.

    Character ranges:
      - \\u4e00-\\u9fff covers common simplified/traditional Han characters
      - \\u3400-\\u4dbf covers CJK Extension A (rare/archaic characters)

    `url` now defaults to "" so the function is callable with HTML only
    (the __main__ demo below did exactly that and used to raise TypeError);
    passing a url explicitly still works as before.

    NOTE(review): despite the name, only Chinese runs are extracted —
    English text is not captured by the pattern.

    Returns a dict with url/title/keywords/description/content keys,
    where content is a comma-joined string of the unique Chinese runs.
    """
    chinese_pattern = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf]+', re.UNICODE)
    chinese_runs = chinese_pattern.findall(html)
    # FIX: dict.fromkeys de-duplicates while keeping first-seen order —
    # the original set() produced a different content ordering each run.
    unique_runs = list(dict.fromkeys(chinese_runs))
    soup = BeautifulSoup(html, "lxml")
    title = soup.title.string if soup.title else ""
    return {
        'url': url,
        "title": title,
        "keywords": '',
        "description": '',
        "content": ','.join(unique_runs)
    }
def deal_html(html, url):
    """Build the standard crawl record for one fetched page.

    Combines meta extraction (title / keywords / description) with the
    main-body text extractor and returns a single result dict.
    """
    parsed = BeautifulSoup(html, "lxml")
    title, keywords, description = extract_meta(parsed)
    body = extract_text(html)
    record = {
        'url': url,
        "title": title,
        "keywords": keywords,
        "description": description,
        "content": body
    }
    return record
if __name__ == '__main__':
    # Demo: run the Chinese-text extractor over a local HTML fixture.
    with open('../t/1.html', 'r', encoding='utf-8') as f:
        h = f.read()
    # FIX: the original call passed only the HTML and raised TypeError
    # because extract_chinese_and_english requires a url argument;
    # use the fixture path as the url.
    d = extract_chinese_and_english(h, '../t/1.html')
    print(d)