爬虫相关
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

148 lines
3.6 KiB

import re
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
import trafilatura
def extract_meta(soup):
    """Pull title, keywords and description out of a parsed page.

    Falls back to Open Graph tags (og:title / og:description) when
    the primary <title> / <meta name=...> values are missing.

    Returns a (title, keywords, description) tuple of strings.
    """
    page_title = soup.title.string if soup.title else ""

    def read_meta(attr, value):
        # Shared lookup for both <meta name=...> and <meta property=...>.
        node = soup.find("meta", attrs={attr: value})
        if node and node.get("content", ''):
            return node["content"].strip()
        return ""

    keywords = read_meta("name", "keywords")
    description = read_meta("name", "description")
    # Open Graph fallback for missing title/description.
    if not page_title:
        page_title = read_meta("property", "og:title")
    if not description:
        description = read_meta("property", "og:description")
    return page_title, keywords, description
def extract_text(html):
    """Extract readable body text from raw HTML.

    Tries trafilatura's article extraction first; if it yields nothing,
    falls back to BeautifulSoup with script/style/noscript stripped.
    """
    # Preferred: dedicated main-content extraction algorithm.
    extracted = trafilatura.extract(html)
    if extracted:
        return extracted
    # Fallback so extraction failures still produce plain text.
    dom = BeautifulSoup(html, "lxml")
    for junk in dom(["script", "style", "noscript"]):
        junk.extract()
    return dom.get_text(separator="\n")
def extract_news(html):
    """Scrape (title, time) pairs from anchor-based news listings.

    Looks for <a href=...> elements that contain a heading (h1-h5) and,
    optionally, a <div>/<span> whose class list includes "time" or "date".

    Returns a list of dicts shaped {"title": ..., "text": <time string>}
    (the key is "text" for backward compatibility with existing callers).
    """
    soup = BeautifulSoup(html, "lxml")
    results = []

    # BUG FIX: the original lambda read
    #   tag.name in [...] and "time" in classes or "date" in classes
    # which parses as (A and B) or C, so ANY element carrying a "date"
    # class matched regardless of its tag name. Parentheses restore the
    # intended "div/span AND (time OR date class)" semantics.
    def _is_time_tag(tag):
        classes = tag.get("class") or []
        return tag.name in ("div", "span") and ("time" in classes or "date" in classes)

    for a in soup.find_all("a", href=True):
        # Title must come from a heading element inside the link.
        h = a.find(["h1", "h2", "h3", "h4", "h5"])
        if not h:
            continue
        title = h.get_text(strip=True)
        # Timestamp (common class names: time / date).
        time_tag = a.find(_is_time_tag)
        time_text = time_tag.get_text(strip=True) if time_tag else ""
        # Drop too-short titles (navigation noise etc.).
        if len(title) < 5:
            continue
        results.append({
            "title": title,
            "text": time_text,
        })
    return results
def extract_blocks(html):
    """Collect candidate content blocks from a page.

    Scans every <a>/<div>, skips containers whose whole text is shorter
    than 10 characters, then records the first inner element (heading,
    paragraph, span or div) whose own text is at least 10 characters.

    Returns a list of dicts shaped [{"text": ...}, ...].
    """
    soup = BeautifulSoup(html, "lxml")
    results = []
    # Candidate containers that may hold a content block.
    candidates = soup.find_all(["a", "div"])
    for tag in candidates:
        text = tag.get_text(strip=True)
        # Filter out short containers (noise / navigation).
        if len(text) < 10:
            continue
        # First sufficiently long child text wins for this container.
        # FIX: removed a leftover debug print(txt) — library functions
        # should not write to stdout.
        for t in tag.find_all(["h1", "h2", "h3", "p", "span", "div"]):
            txt = t.get_text(strip=True)
            if len(txt) >= 10:
                results.append({
                    "text": txt,
                })
                break
    return results
def extract_chinese_and_english(html, url=""):
    """Extract the unique runs of Chinese characters from raw HTML.

    Character ranges:
      - \\u4e00-\\u9fff covers common simplified/traditional Han characters
      - \\u3400-\\u4dbf covers CJK Extension A (rare/archaic characters)

    `url` now defaults to "" so the function is callable with HTML only
    (the __main__ demo below did exactly that and used to raise TypeError);
    passing a url explicitly still works as before.

    NOTE(review): despite the name, only Chinese runs are extracted —
    English text is not captured by the pattern.

    Returns a dict with url/title/keywords/description/content keys,
    where content is a comma-joined string of the unique Chinese runs.
    """
    chinese_pattern = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf]+', re.UNICODE)
    chinese_runs = chinese_pattern.findall(html)
    # FIX: dict.fromkeys de-duplicates while keeping first-seen order —
    # the original set() produced a different content ordering each run.
    unique_runs = list(dict.fromkeys(chinese_runs))
    soup = BeautifulSoup(html, "lxml")
    title = soup.title.string if soup.title else ""
    return {
        'url': url,
        "title": title,
        "keywords": '',
        "description": '',
        "content": ','.join(unique_runs)
    }
def deal_html(html, url):
    """Build the standard crawl record for one fetched page.

    Combines meta extraction (title / keywords / description) with the
    main-body text extractor and returns a single result dict.
    """
    parsed = BeautifulSoup(html, "lxml")
    title, keywords, description = extract_meta(parsed)
    body = extract_text(html)
    record = {
        'url': url,
        "title": title,
        "keywords": keywords,
        "description": description,
        "content": body
    }
    return record
if __name__ == '__main__':
    # Demo: run the Chinese-text extractor over a local HTML fixture.
    with open('../t/1.html', 'r', encoding='utf-8') as f:
        h = f.read()
    # FIX: the original call passed only the HTML and raised TypeError
    # because extract_chinese_and_english requires a url argument;
    # use the fixture path as the url.
    d = extract_chinese_and_english(h, '../t/1.html')
    print(d)