import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import trafilatura


def extract_meta(soup):
    """Extract (title, keywords, description) from a parsed page.

    Falls back to the OpenGraph ``og:title`` / ``og:description``
    properties when the standard ``<title>`` / meta description
    are missing or empty.
    """
    title = soup.title.string if soup.title else ""

    def get_meta(name):
        # <meta name="..."> with a non-empty content attribute
        tag = soup.find("meta", attrs={"name": name})
        if tag and tag.get("content", ''):
            return tag["content"].strip()
        return ""

    # Fallback: OpenGraph <meta property="og:...">
    def get_og(prop):
        tag = soup.find("meta", attrs={"property": prop})
        if tag and tag.get("content", ''):
            return tag["content"].strip()
        return ""

    keywords = get_meta("keywords")
    description = get_meta("description")
    if not title:
        title = get_og("og:title")
    if not description:
        description = get_og("og:description")
    return title, keywords, description


def extract_text(html):
    """Extract the main body text of an HTML document.

    Tries trafilatura's main-content extraction first; if that yields
    nothing, falls back to stripping scripts/styles and dumping all text.
    """
    # Preferred: dedicated main-content extraction algorithm
    text = trafilatura.extract(html)
    if text:
        return text
    # Fallback (in case extraction fails): remove non-content tags,
    # then return the raw visible text.
    soup = BeautifulSoup(html, "lxml")
    for tag in soup(["script", "style", "noscript"]):
        tag.extract()
    return soup.get_text(separator="\n")


def extract_news(html):
    """Collect headline entries: ``[{'title': ..., 'text': <time str>}, ...]``.

    A link qualifies when it wraps a heading tag (h1-h5); the timestamp
    is taken from a child div/span whose class list contains 'time' or
    'date'. Titles shorter than 5 characters are dropped as noise.
    """
    soup = BeautifulSoup(html, "lxml")
    results = []
    for a in soup.find_all("a", href=True):
        # Headline: heading element inside the anchor
        h = a.find(["h1", "h2", "h3", "h4", "h5"])
        if not h:
            continue
        title = h.get_text(strip=True)
        # Timestamp (common classes: time / date).
        # BUGFIX: the original predicate was
        #   name_ok and has_time or has_date
        # which, due to operator precedence, matched ANY tag whose class
        # contained 'date' regardless of tag name. Parenthesize the `or`.
        time_tag = a.find(
            lambda tag: tag.name in ["div", "span"]
            and ("time" in (tag.get("class") or [])
                 or "date" in (tag.get("class") or []))
        )
        time_text = time_tag.get_text(strip=True) if time_tag else ""
        # Drop implausibly short titles (noise)
        if len(title) < 5:
            continue
        results.append({
            "title": title,
            "text": time_text,
        })
    return results


def extract_blocks(html):
    """Return candidate content blocks as ``[{'text': ...}, ...]``.

    Scans every <a>/<div> container whose full text is at least 10
    characters and records the first sufficiently long child element.
    """
    soup = BeautifulSoup(html, "lxml")
    results = []
    # Candidate containers: every <a> and <div>
    candidates = soup.find_all(["a", "div"])
    for tag in candidates:
        text = tag.get_text(strip=True)
        # Filter out short fragments (noise)
        if len(text) < 10:
            continue
        # First sufficiently long child element wins
        for t in tag.find_all(["h1", "h2", "h3", "p", "span", "div"]):
            txt = t.get_text(strip=True)
            if len(txt) >= 10:
                # BUGFIX: removed leftover debug print(txt) that
                # polluted stdout on every match.
                results.append({
                    "text": txt,
                })
                break
    return results


def extract_chinese_and_english(html, url):
    """Return page metadata plus every distinct Chinese text run in *html*.

    The regex covers the CJK Unified Ideographs block
    (U+4E00-U+9FFF, common simplified/traditional characters) and
    Extension A (U+3400-U+4DBF, rare/archaic characters);
    ``re.UNICODE`` ensures encoding-compatible matching.
    """
    chinese_pattern = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf]+', re.UNICODE)
    # BUGFIX: de-duplicate with dict.fromkeys instead of set() so the
    # joined `content` keeps first-seen order (set iteration order is
    # unstable across runs, making output non-deterministic).
    chinese_list = dict.fromkeys(chinese_pattern.findall(html))
    soup = BeautifulSoup(html, "lxml")
    title = soup.title.string if soup.title else ""
    return {
        'url': url,
        "title": title,
        "keywords": '',
        "description": '',
        "content": ','.join(chinese_list)
    }


def deal_html(html, url):
    """Full extraction: meta fields (title/keywords/description) + body text."""
    soup = BeautifulSoup(html, "lxml")
    title, keywords, description = extract_meta(soup)
    content = extract_text(html)
    return {
        'url': url,
        "title": title,
        "keywords": keywords,
        "description": description,
        "content": content
    }


if __name__ == '__main__':
    with open('../t/1.html', 'r', encoding='utf-8') as f:
        h = f.read()
    # BUGFIX: the original called extract_chinese_and_english(h) with a
    # single argument; the function requires (html, url) and raised a
    # TypeError. Pass an empty url for this local-file smoke test.
    d = extract_chinese_and_english(h, '')
    print(d)