12 changed files with 576 additions and 0 deletions
htmldata_get/.idea/htmldata_get.iml      +8
htmldata_get/api_main.py                 +118
htmldata_get/main.py                     +139
htmldata_get/res.json                    +1
htmldata_get/spider.db                   BIN
htmldata_get/sql.py                      +18
htmldata_get/t.py                        +75
htmldata_get/t/1.html                    +2
htmldata_get/t/t2.py                     +19
htmldata_get/tools/class_int.py          +39
htmldata_get/tools/deal_html.py          +138
htmldata_get/tools/retry.py              +19
@@ -0,0 +1,8 @@ htmldata_get/.idea/htmldata_get.iml
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
@@ -0,0 +1,118 @@ htmldata_get/api_main.py
from main import Start
from fastapi import FastAPI
from pydantic import BaseModel
from threading import Thread
import queue
import sqlite3
import json
import hashlib

app = FastAPI()

task_queue = queue.Queue()


def get_conn():
    return sqlite3.connect("spider.db", check_same_thread=False)


class Spider(BaseModel):
    url: str


# 🔁 worker: pulls (task_id, url) pairs off the queue and runs the crawl
def worker():
    while True:
        task_id, url = task_queue.get()
        conn = get_conn()
        cur = conn.cursor()

        try:
            data = Start(url).run()

            # UPDATE (not INSERT OR REPLACE) so the url and create_time
            # written at submission time are preserved
            cur.execute("""
                UPDATE tasks
                SET status=?,
                    result=?
                WHERE task_id = ?
            """, (0, json.dumps(data, ensure_ascii=False), task_id))

        except Exception as e:
            cur.execute("""
                UPDATE tasks
                SET status=?,
                    error=?
                WHERE task_id = ?
            """, ("error", str(e), task_id))

        conn.commit()
        conn.close()
        task_queue.task_done()


Thread(target=worker, daemon=True).start()


# ✅ Submit a task
@app.post("/crawler/put", summary='Submit a crawl task')
def put_task(req: Spider):
    """
    url: the URL to crawl
    """
    # task_id is the MD5 of the URL, so resubmitting the same URL reuses one row
    task_id = hashlib.md5(req.url.encode('utf-8')).hexdigest()
    print(task_id)

    conn = get_conn()
    cur = conn.cursor()

    cur.execute("""
        INSERT OR REPLACE INTO tasks (task_id, url, status)
        VALUES (?, ?, ?)
    """, (task_id, req.url, -2))

    conn.commit()
    conn.close()

    task_queue.put((task_id, req.url))

    return {'code': 0, 'data': {"task_id": task_id}, 'msg': 'OK'}


# ✅ Fetch the result
@app.get("/crawler/get/{task_id}", summary='Fetch crawl result')
def get_result(task_id: str):
    """
    task_id: the task_id returned on submission
    code: error: -1, still processing: -2, success: 0
    """
    conn = get_conn()
    cur = conn.cursor()

    cur.execute("SELECT status, result, error FROM tasks WHERE task_id=?", (task_id,))
    row = cur.fetchone()
    conn.close()

    if not row:
        return {"code": -1, 'data': task_id, 'msg': 'No such task_id'}

    status, result, error = row
    if status == "error":
        return {"code": -1, 'data': task_id, 'msg': 'Task failed'}
    if int(status) == 0:
        msg = 'OK'
    else:
        msg = 'Task is still processing'

    return {
        "code": int(status),
        "data": json.loads(result) if result else None,
        "msg": msg
    }


# uvicorn api_main:app --host 0.0.0.0 --port 8000
# uvicorn api_main:app --host 0.0.0.0 --port 32000 --log-level debug
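A minimal client sketch for the two endpoints above (not part of this commit; host/port assumed from the first uvicorn command, and the polling interval is illustrative):

# Hypothetical client for /crawler/put and /crawler/get.
import time
import requests

BASE = "http://127.0.0.1:8000"  # assumed host/port from the uvicorn command above

put = requests.post(f"{BASE}/crawler/put", json={"url": "https://www.essilor.com/cn-zh/"}).json()
task_id = put["data"]["task_id"]

while True:
    res = requests.get(f"{BASE}/crawler/get/{task_id}").json()
    if res["code"] != -2:  # -2 = still processing; 0 = success; -1 = error
        break
    time.sleep(2)

print(res["code"], res["msg"])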
@@ -0,0 +1,139 @@ htmldata_get/main.py
import random
import time
from urllib.parse import urljoin, urlparse
from tools.class_int import RequestsInt
from tools.deal_html import deal_html
from bs4 import BeautifulSoup
import json


class Start(object):
    def __init__(self, url):
        self.url = url
        self.requests = RequestsInt(url)
        self.res = []

        # ❌ Resource extensions to filter out
        self.exclude_ext = {
            ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg",
            ".mp4", ".avi", ".mov", ".wmv", ".flv", ".mkv",
            ".mp3", ".wav",
            ".pdf", ".zip", ".rar", ".7z",
            ".css", ".js", ".ico", ".woff", ".woff2", ".ttf"
        }

    # ✅ Is this URL a web page (rather than a static asset)?
    def is_valid_page(self, url):
        path = urlparse(url).path.lower()

        for ext in self.exclude_ext:
            if path.endswith(ext):
                return False

        return True

    def get_internal_links(self, html):
        soup = BeautifulSoup(html, "lxml")

        base_domain = urlparse(self.url).netloc
        links = set()

        for a in soup.find_all("a", href=True):
            href = a["href"].strip()

            # ❌ Skip non-navigable schemes
            if href.startswith(("javascript:", "#", "mailto:", "tel:")):
                continue

            # ✅ Resolve to an absolute URL
            full_url = urljoin(self.url, href)

            # ✅ Same domain only
            if urlparse(full_url).netloc != base_domain:
                continue

            # ✅ Filter out resource files
            if not self.is_valid_page(full_url):
                continue

            links.add(full_url)

        return list(links)

    def deal(self, url, html=None):
        if not html:
            html = self.requests.get(url)
            if html.status_code == 200:
                html = html.text
            else:
                print(f'Unexpected response status, skipping: {html.status_code}')
                return {}
        data = deal_html(html, url)
        self.res.append(data)
        return data

    def dealpage(self, url, html):
        data = deal_html(html, url)
        self.res.append(data)
        return data

    def _time_sleep(self, k=None):
        if not k:
            k = random.uniform(0, 1)
        print('Sleeping:', k)
        time.sleep(k)

    def fet_all_links(self, html):
        all_links = self.get_internal_links(html)
        print("Second-level page links:", all_links)
        print('Second-level page count:', len(all_links))
        return all_links

    def _crawl_links(self, all_links):
        for index, link in enumerate(all_links):
            try:
                res = self.deal(url=link)
                print(f'{link}: {res.get("title")} {index + 1}')
                self._time_sleep()
            except Exception as e:
                print(f'Error: {link} {e}')

    def run(self):
        html = self.requests.get(self.url).text
        res = self.deal(url=self.url, html=html)
        print(f'Home page fetched: {res.get("title")}')
        all_links = self.fet_all_links(html)
        if len(all_links) > 0:
            print('Static page ---')
        else:
            # No internal links in the raw HTML: treat it as a JS-rendered page
            # and re-fetch with DrissionPage
            print('Dynamic page ---')
            html = self.requests.get_page(self.url)
            res = self.dealpage(url=self.url, html=html)
            print(f'Home page fetched: {res.get("title")}')
            all_links = self.fet_all_links(html)
        self._crawl_links(all_links)
        print(f'All pages fetched: {self.url} {len(self.res)}')
        return self.res


class StartPage(Start):
    pass


if __name__ == '__main__':
    url = 'https://www.essilor.com/cn-zh/'
    d = Start(url).run()
    with open('res.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(d, ensure_ascii=False))
@@ -0,0 +1 @@ htmldata_get/res.json
[{"url": "https://www.essilor.com/cn-zh/", "title": "World leader in prescription lenses | Essilor", "keywords": "", "description": "Everyone everywhere should experience the life changing benefits of vision correction and vision protection. Choose your lenses from a committed brand.", "content": "World leader in prescription lenses | Essilor"}, {"url": "https://www.essilor.com/cn-zh/", "title": "World leader in prescription lenses | Essilor", "keywords": "", "description": "Everyone everywhere should experience the life changing benefits of vision correction and vision protection. Choose your lenses from a committed brand.", "content": "World leader in prescription lenses | Essilor"}]
@@ -0,0 +1,18 @@ htmldata_get/sql.py
import sqlite3

conn = sqlite3.connect("spider.db")
cursor = conn.cursor()

cursor.execute("""
CREATE TABLE IF NOT EXISTS tasks (
    task_id TEXT PRIMARY KEY,
    url TEXT,
    status TEXT,
    result TEXT,
    error TEXT,
    create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")

conn.commit()
conn.close()
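For debugging, a hypothetical snippet to inspect task states in this table (not part of the commit; assumes spider.db is in the working directory):

# List every task's status, newest first.
import sqlite3

with sqlite3.connect("spider.db") as conn:
    for task_id, status, create_time in conn.execute(
            "SELECT task_id, status, create_time FROM tasks ORDER BY create_time DESC"):
        print(task_id, status, create_time)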
@@ -0,0 +1,75 @@ htmldata_get/t.py
import requests
from bs4 import BeautifulSoup
import trafilatura


def fetch_html(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = resp.apparent_encoding
    return resp.text


def extract_meta(soup):
    # (soup.title.string or "") guards against a <title> containing nested
    # markup, where .string is None
    title = (soup.title.string or "").strip() if soup.title else ""

    def get_meta(name):
        tag = soup.find("meta", attrs={"name": name})
        if tag and tag.get("content"):
            return tag["content"].strip()
        return ""

    # Fall back to Open Graph tags
    def get_og(prop):
        tag = soup.find("meta", attrs={"property": prop})
        if tag and tag.get("content"):
            return tag["content"].strip()
        return ""

    keywords = get_meta("keywords")
    description = get_meta("description")

    if not title:
        title = get_og("og:title")
    if not description:
        description = get_og("og:description")

    return title, keywords, description


def extract_text(html):
    # ⭐ Prefer the article-extraction algorithm
    text = trafilatura.extract(html)
    if text:
        return text

    # 🔁 Fallback in case extraction fails
    soup = BeautifulSoup(html, "lxml")

    for tag in soup(["script", "style", "noscript"]):
        tag.extract()

    return soup.get_text(separator="\n")


def parse_page(url):
    html = fetch_html(url)
    soup = BeautifulSoup(html, "lxml")

    title, keywords, description = extract_meta(soup)
    content = extract_text(html)

    return {
        "url": url,
        "title": title,
        "keywords": keywords,
        "description": description,
        "content": content
    }


if __name__ == "__main__":
    url = "https://www.lheia.com/hzhb.html"
    data = parse_page(url)

    for k, v in data.items():
        print(f"{k}:\n{str(v)[:300]}\n")
htmldata_get/t/1.html (+2): file diff suppressed because it is too large
@@ -0,0 +1,19 @@ htmldata_get/t/t2.py
# # Import
# from DrissionPage import SessionPage
# # Create the page object
# page = SessionPage()
# # Visit the page
# page.get('https://www.essilor.com/cn-zh/')
# print(page.html)
# print(page)
# page.close()

# Import
from DrissionPage import ChromiumPage
# Create the page object (drives a real Chromium browser, so JS-rendered pages work)
page = ChromiumPage()
# Visit the page
page.get('https://www.essilor.com/cn-zh/')
print(page.html)
print(page)
page.close()
@@ -0,0 +1,39 @@ htmldata_get/tools/class_int.py
import requests
from tools.retry import retry
from DrissionPage import SessionPage


class RequestsInt(object):
    def __init__(self, url):
        self.ref = url
        self.session = requests.Session()

    def get_headers(self):
        return {
            'referer': self.ref,
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/147.0.0.0 Safari/537.36'
        }

    @retry('GET request', 3)
    def get(self, url, params=None, headers=None, timeout=5, **kwargs):
        if headers is None:
            headers = self.get_headers()
        r = self.session.get(url=url, params=params, headers=headers, timeout=timeout, **kwargs)
        r.encoding = 'utf-8'
        return r

    @retry('POST request', 3)
    def post(self, url, params=None, headers=None, timeout=5, **kwargs):
        if headers is None:
            headers = self.get_headers()
        r = self.session.post(url=url, params=params, headers=headers, timeout=timeout, **kwargs)
        r.encoding = 'utf-8'
        return r

    def get_page(self, url):
        # Fetch with DrissionPage's SessionPage; read the HTML before closing
        page = SessionPage()
        page.get(url)
        html = page.html
        page.close()
        return html
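A hypothetical usage sketch (not part of the commit). Note that the retry decorator below returns False once its attempts are exhausted, so a failed get() yields False rather than a Response and callers should check before touching .text:

from tools.class_int import RequestsInt

client = RequestsInt('https://www.essilor.com/cn-zh/')
resp = client.get('https://www.essilor.com/cn-zh/')
if resp and resp.status_code == 200:  # resp is False after three failed attempts
    print(resp.text[:200])
else:
    print('request failed after retries')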
@@ -0,0 +1,138 @@ htmldata_get/tools/deal_html.py
import re

from bs4 import BeautifulSoup
import trafilatura


def extract_meta(soup):
    title = soup.title.string if soup.title else ""

    def get_meta(name):
        tag = soup.find("meta", attrs={"name": name})
        if tag and tag.get("content", ''):
            return tag["content"].strip()
        return ""

    # Fall back to Open Graph tags
    def get_og(prop):
        tag = soup.find("meta", attrs={"property": prop})
        if tag and tag.get("content", ''):
            return tag["content"].strip()
        return ""

    keywords = get_meta("keywords")
    description = get_meta("description")

    if not title:
        title = get_og("og:title")
    if not description:
        description = get_og("og:description")

    return title, keywords, description


def extract_text(html):
    # ⭐ Prefer the article-extraction algorithm
    text = trafilatura.extract(html)
    if text:
        return text

    # 🔁 Fallback in case extraction fails
    soup = BeautifulSoup(html, "lxml")

    for tag in soup(["script", "style", "noscript"]):
        tag.extract()

    return soup.get_text(separator="\n")


def extract_news(html):
    soup = BeautifulSoup(html, "lxml")
    results = []

    for a in soup.find_all("a", href=True):
        # 👉 Title
        h = a.find(["h1", "h2", "h3", "h4", "h5"])
        if not h:
            continue

        title = h.get_text(strip=True)

        # 👉 Timestamp (commonly a div/span with class "time" or "date");
        # the parentheses matter: the name must match AND either class be present
        time_tag = a.find(lambda tag: tag.name in ["div", "span"] and (
            "time" in (tag.get("class") or []) or "date" in (tag.get("class") or [])))
        time_text = time_tag.get_text(strip=True) if time_tag else ""

        # ❗ Filter out junk titles
        if len(title) < 5:
            continue

        results.append({
            "title": title,
            "text": time_text,
        })

    return results


def extract_blocks(html):
    soup = BeautifulSoup(html, "lxml")

    results = []

    # 👉 Collect every a / div that might be a content block
    candidates = soup.find_all(["a", "div"])

    for tag in candidates:
        text = tag.get_text(strip=True)

        # ❗ Skip blocks that are too short (noise)
        if len(text) < 10:
            continue

        # 👉 Take the first child element with enough text
        for t in tag.find_all(["h1", "h2", "h3", "p", "span", "div"]):
            txt = t.get_text(strip=True)
            if len(txt) >= 10:
                results.append({
                    "text": txt,
                })
                break

    return results


def extract_chinese_and_english(html):
    # (Only the Chinese half is implemented so far; English extraction is a TODO.)
    if not html:
        return []

    # 1. Match runs of Chinese (general-purpose version)
    # [\u4e00-\u9fff] covers common simplified/traditional characters
    # \u3400-\u4dbf covers Extension A (rare and archaic characters)
    # re.UNICODE is passed for encoding compatibility
    chinese_pattern = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf]+', re.UNICODE)
    chinese_list = chinese_pattern.findall(html)

    return chinese_list


def deal_html(html, url):
    soup = BeautifulSoup(html, "lxml")

    title, keywords, description = extract_meta(soup)
    content = extract_text(html)

    return {
        'url': url,
        "title": title,
        "keywords": keywords,
        "description": description,
        "content": content
    }


if __name__ == '__main__':
    with open('../t/1.html', 'r', encoding='utf-8') as f:
        h = f.read()
    d = extract_chinese_and_english(h)
    print(d)
@@ -0,0 +1,19 @@ htmldata_get/tools/retry.py
from functools import wraps


def retry(name='name', for_word=3):
    """Retry decorator: attempts the wrapped call up to for_word times."""
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for i in range(for_word):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    print(f'[{name}]: {e}')
            # All attempts failed: return False instead of re-raising,
            # so callers must check the return value
            return False

        return wrapper

    return decorator
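A quick illustrative check of the decorator's semantics (hypothetical, not part of the commit): the call below fails twice and succeeds on the third, final attempt; a function that never succeeds would return False instead.

attempts = {'n': 0}

@retry('flaky call', 3)
def flaky():
    attempts['n'] += 1
    if attempts['n'] < 3:
        raise RuntimeError('transient failure')
    return 'ok'

print(flaky())  # prints two [flaky call] errors, then 'ok'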