# 爬虫相关 — crawler utilities: fetch a page, then extract its title,
# meta keywords/description, and main body text.
import requests
from bs4 import BeautifulSoup
import trafilatura
def fetch_html(url):
    """Download *url* and return the decoded HTML text.

    Sends a browser-like User-Agent with a 10-second timeout, then forces
    the response encoding to the apparent (content-sniffed) encoding to
    avoid mojibake on pages that mis-declare their charset.

    Raises:
        requests.HTTPError: on 4xx/5xx responses (fail fast instead of
            silently parsing an error page as content).
        requests.RequestException: on connection/timeout failures.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()  # bug fix: don't treat error pages as content
    resp.encoding = resp.apparent_encoding
    return resp.text
def extract_meta(soup):
    """Extract ``(title, keywords, description)`` from a parsed page.

    Reads the ``<title>`` tag and the ``keywords``/``description`` meta
    tags, falling back to Open Graph (``og:title`` / ``og:description``)
    when the standard ones are missing or empty.

    Args:
        soup: a BeautifulSoup-parsed document.

    Returns:
        A 3-tuple of stripped strings; each item is "" when absent.
    """
    # Bug fix: soup.title.string is None for an empty or nested <title>,
    # so guard it before calling .strip() (old code raised AttributeError).
    title = ""
    if soup.title and soup.title.string:
        title = soup.title.string.strip()

    def get_meta(name):
        # <meta name="..." content="...">
        tag = soup.find("meta", attrs={"name": name})
        if tag and tag.get("content"):
            return tag["content"].strip()
        return ""

    # Open Graph fallback: <meta property="og:..." content="...">
    def get_og(prop):
        tag = soup.find("meta", attrs={"property": prop})
        if tag and tag.get("content"):
            return tag["content"].strip()
        return ""

    keywords = get_meta("keywords")
    description = get_meta("description")
    if not title:
        title = get_og("og:title")
    if not description:
        description = get_og("og:description")
    return title, keywords, description
def extract_text(html):
    """Return the main readable text of *html*.

    Tries trafilatura's article-extraction algorithm first; if it yields
    nothing, falls back to a raw text dump with script/style/noscript
    tags removed.
    """
    # Preferred path: dedicated main-content extraction.
    extracted = trafilatura.extract(html)
    if extracted:
        return extracted
    # Fallback path (guards against extraction failure).
    fallback = BeautifulSoup(html, "lxml")
    for junk in fallback(["script", "style", "noscript"]):
        junk.extract()
    return fallback.get_text(separator="\n")
def parse_page(url):
    """Fetch *url* and return its parsed fields as a dict.

    Keys: ``url``, ``title``, ``keywords``, ``description``, ``content``.
    """
    html = fetch_html(url)
    page_soup = BeautifulSoup(html, "lxml")
    title, keywords, description = extract_meta(page_soup)
    body_text = extract_text(html)
    result = {
        "url": url,
        "title": title,
        "keywords": keywords,
        "description": description,
        "content": body_text,
    }
    return result
if __name__ == "__main__":
    # Smoke test: crawl one page and preview each extracted field.
    url = "https://www.lheia.com/hzhb.html"
    page = parse_page(url)
    for field, value in page.items():
        # Truncate long fields (notably content) for console readability.
        print(f"{field}:\n{str(value)[:300]}\n")