# HTML extraction utilities: page metadata, readable text, news links,
# content blocks, and Chinese character runs.
import re
|
|
from urllib.parse import urljoin
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import trafilatura
|
|
|
|
|
|
def extract_meta(soup):
    """Extract page metadata from a parsed HTML document.

    Args:
        soup: a BeautifulSoup document.

    Returns:
        (title, keywords, description) tuple of stripped strings; any
        field missing from the page comes back as "".
    """
    def _meta_content(attr, value):
        # Shared lookup for both <meta name=...> and <meta property=...>
        # tags (the original duplicated this logic in get_meta/get_og).
        tag = soup.find("meta", attrs={attr: value})
        if tag and tag.get("content", ''):
            return tag["content"].strip()
        return ""

    # <title> may be absent, or present but empty (.string is None);
    # normalize to a stripped string like the meta values.
    title = (soup.title.string or "").strip() if soup.title else ""

    keywords = _meta_content("name", "keywords")
    description = _meta_content("name", "description")

    # Fall back to OpenGraph tags when the standard fields are missing.
    if not title:
        title = _meta_content("property", "og:title")
    if not description:
        description = _meta_content("property", "og:description")

    return title, keywords, description
|
|
|
|
|
|
def extract_text(html):
    """Return the readable text of *html*.

    Tries trafilatura's main-content extraction first; if that yields
    nothing, falls back to a crude whole-page text dump with
    script/style noise stripped out.
    """
    extracted = trafilatura.extract(html)
    if extracted:
        return extracted

    # Fallback path (keeps the function from returning None on pages
    # trafilatura cannot handle).
    document = BeautifulSoup(html, "lxml")
    for noise in document(["script", "style", "noscript"]):
        noise.extract()
    return document.get_text(separator="\n")
|
|
|
|
|
|
def extract_news(html):
    """Extract news-style links (headline + optional timestamp) from *html*.

    A "news link" is an <a href=...> wrapping an h1-h5 headline.
    Headlines shorter than 5 characters are discarded as noise.

    Returns:
        list of dicts: {"title": headline text,
                        "text": timestamp text or ""}.
    """
    soup = BeautifulSoup(html, "lxml")
    results = []

    def _is_time_tag(tag):
        # div/span whose class list contains "time" or "date".
        classes = tag.get("class") or []
        return tag.name in ("div", "span") and (
            "time" in classes or "date" in classes)

    for a in soup.find_all("a", href=True):
        heading = a.find(["h1", "h2", "h3", "h4", "h5"])
        if not heading:
            continue

        title = heading.get_text(strip=True)
        # Filter trivially short titles before doing more work.
        if len(title) < 5:
            continue

        # BUG FIX: the original lambda read
        #   tag.name in [...] and "time" in c or "date" in c
        # which, by operator precedence, is (A and B) or C — so ANY tag
        # carrying a "date" class matched, regardless of its name.
        time_tag = a.find(_is_time_tag)
        time_text = time_tag.get_text(strip=True) if time_tag else ""

        results.append({
            "title": title,
            "text": time_text,
        })

    return results
|
|
|
|
|
|
def extract_blocks(html):
    """Collect candidate content blocks from *html*.

    For each <a>/<div> whose total text is at least 10 characters,
    takes the first descendant (h1-h3/p/span/div) whose own text is
    also at least 10 characters.

    Returns:
        list of dicts: {"text": extracted text}.
    """
    soup = BeautifulSoup(html, "lxml")

    results = []

    # Containers that may plausibly hold a content block.
    candidates = soup.find_all(["a", "div"])

    for tag in candidates:
        text = tag.get_text(strip=True)

        # Skip blocks too short to be real content (noise filter).
        if len(text) < 10:
            continue

        # Use the first sufficiently long descendant as the block text.
        for t in tag.find_all(["h1", "h2", "h3", "p", "span", "div"]):
            txt = t.get_text(strip=True)
            if len(txt) >= 10:
                # BUG FIX: removed leftover debug print(txt).
                results.append({
                    "text": txt,
                })
                break

    return results
|
|
def extract_chinese_and_english(html):
    """Return every run of consecutive Chinese characters found in *html*.

    The pattern covers CJK Unified Ideographs (U+4E00-U+9FFF) plus
    Extension A (U+3400-U+4DBF) for rarer/archaic characters.

    NOTE(review): despite the name, only Chinese runs are extracted —
    no English extraction is implemented.

    Args:
        html: text to scan (any string; may be None or empty).

    Returns:
        list[str]: matched Chinese substrings in document order; [] for
        empty/None input.
    """
    # BUG FIX: the empty-input path used to return a 2-tuple ([], [])
    # while the normal path returns a single list; both now return a list.
    if not html:
        return []

    chinese_pattern = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf]+', re.UNICODE)
    return chinese_pattern.findall(html)
|
|
def deal_html(html, url):
    """Parse *html* and bundle its metadata and main text into one dict.

    Args:
        html: raw HTML string.
        url: source URL, echoed back in the result.

    Returns:
        dict with keys url / title / keywords / description / content.
    """
    document = BeautifulSoup(html, "lxml")
    title, keywords, description = extract_meta(document)

    return {
        'url': url,
        "title": title,
        "keywords": keywords,
        "description": description,
        "content": extract_text(html),
    }
|
|
|
|
|
|
if __name__ == '__main__':
    # Ad-hoc smoke test: pull Chinese text runs out of a saved page.
    with open('../t/1.html', 'r', encoding='utf-8') as f:
        page = f.read()
    print(extract_chinese_and_english(page))
|