# --- htmldata_get/api_main.py ---
"""FastAPI front-end for the crawler: submit URLs, poll for results.

Tasks are queued in-process and executed by a single daemon worker
thread; state is persisted in the `tasks` table of spider.db
(created by sql.py).
"""
from main import Start
from fastapi import FastAPI
from pydantic import BaseModel
from threading import Thread
import queue
import sqlite3
import json
import hashlib

app = FastAPI()

# In-process FIFO of (task_id, url) pairs consumed by the worker thread.
task_queue = queue.Queue()


def get_conn():
    """Open a connection to the task store.

    check_same_thread=False is required because connections are created
    in both the request handlers and the worker thread.
    """
    return sqlite3.connect("spider.db", check_same_thread=False)


class Spider(BaseModel):
    # URL to crawl.
    url: str


def worker():
    """Daemon loop: crawl queued URLs and persist the outcome.

    Status values in the `tasks` table: -2 queued, 0 done, "error"
    failed (kept as-is for compatibility with get_result()).
    """
    while True:
        task_id, url = task_queue.get()
        conn = get_conn()
        try:
            cur = conn.cursor()
            try:
                data = Start(url).run()
                # UPDATE rather than INSERT OR REPLACE: REPLACE deletes
                # and re-inserts the whole row, which nulled out the
                # url/create_time columns that put_task() stored.  The
                # row always exists -- put_task() inserts it before
                # queueing the task.
                cur.execute("""
                    UPDATE tasks
                    SET status=?, result=?
                    WHERE task_id=?
                """, (0, json.dumps(data, ensure_ascii=False), task_id))
            except Exception as e:
                cur.execute("""
                    UPDATE tasks
                    SET status=?, error=?
                    WHERE task_id=?
                """, ("error", str(e), task_id))
            conn.commit()
        finally:
            # Always release the connection and acknowledge the task,
            # even if the DB write itself failed -- otherwise a single
            # bad task would leak a connection and break queue.join().
            conn.close()
            task_queue.task_done()


Thread(target=worker, daemon=True).start()


@app.post("/crawler/put", summary='提交爬虫')
def put_task(req: Spider):
    """
    url: the URL to crawl
    """
    # md5 of the URL gives a stable task id: resubmitting the same URL
    # reuses (and resets) the same row.
    task_id = hashlib.md5(req.url.encode('utf-8')).hexdigest()

    conn = get_conn()
    try:
        cur = conn.cursor()
        cur.execute("""
            INSERT OR REPLACE INTO tasks (task_id, url, status)
            VALUES (?, ?, ?)
        """, (task_id, req.url, -2))
        conn.commit()
    finally:
        conn.close()

    task_queue.put((task_id, req.url))

    return {'code': 0, 'data': {"task_id": task_id}, 'msg': '操作成功'}


@app.get("/crawler/get/{task_id}", summary='获取数据')
def get_result(task_id: str):
    """
    task_id: the id returned by /crawler/put
    code: -1 failed / unknown id, -2 still processing, 0 done
    """
    conn = get_conn()
    try:
        cur = conn.cursor()
        cur.execute("SELECT status, result, error FROM tasks WHERE task_id=?",
                    (task_id,))
        row = cur.fetchone()
    finally:
        conn.close()

    if not row:
        return {"code": -1, 'data': task_id, 'msg': '无此task_id'}

    status, result, error = row
    if status == "error":
        return {"code": -1, 'data': task_id, 'msg': '任务处理失败'}

    msg = '操作成功' if int(status) == 0 else '任务正在处理'
    return {
        "code": int(status),
        "data": json.loads(result) if result else None,
        "msg": msg
    }

# uvicorn api_main:app --host 0.0.0.0 --port 8000
# uvicorn api_main:app --host 0.0.0.0 --port 32000 --log-level debug


# --- htmldata_get/main.py (head) ---
import random
import time
from urllib.parse import urljoin, urlparse
from tools.class_int import RequestsInt
from tools.deal_html import deal_html
from bs4 import BeautifulSoup
import json


class Start(object):
    """Crawl a start URL and its same-domain links; collect parsed records."""

    def __init__(self, url):
        self.url = url
        self.requests = RequestsInt(url)
        # Accumulated page records ({url, title, keywords, ...} dicts).
        self.res = []

        # Resource extensions never treated as crawlable pages.
        self.exclude_ext = {
            ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg",
            ".mp4", ".avi", ".mov", ".wmv", ".flv", ".mkv",
            ".mp3", ".wav",
            ".pdf", ".zip", ".rar", ".7z",
            ".css", ".js", ".ico", ".woff", ".woff2", ".ttf"
        }

    def is_valid_page(self, url):
        """True unless the URL path ends with a known asset extension."""
        path = urlparse(url).path.lower()
        return not any(path.endswith(ext) for ext in self.exclude_ext)

    def get_internal_links(self, html):
        """Collect absolute, same-domain, non-asset links from html."""
        soup = BeautifulSoup(html, "lxml")
        base_domain = urlparse(self.url).netloc
urlparse(self.url).netloc + links = set() + + for a in soup.find_all("a", href=True): + href = a["href"].strip() + + # ❌ 无效协议 + if href.startswith(("javascript:", "#", "mailto:", "tel:")): + continue + + # ✅ 转绝对路径 + full_url = urljoin(self.url, href) + + # ✅ 同域 + if urlparse(full_url).netloc != base_domain: + continue + + # ✅ 过滤资源文件 + if not self.is_valid_page(full_url): + continue + + links.add(full_url) + + return list(links) + + def deal(self, url, html=None): + if not html: + html = self.requests.get(url) + if html.status_code == 200: + html = html.text + else: + print(f'请求状态异常不处理:{html.status_code}') + return {} + data = deal_html(html, url) + # print(data) + self.res.append(data) + return data + + def dealpage(self, url, html): + data = deal_html(html, url) + self.res.append(data) + return data + + def _time_sleep(self, k=None): + if not k: + k = random.uniform(0, 1) + print('等待:', k) + time.sleep(k) + + def fet_all_links(self, html): + all_links = self.get_internal_links(html) + print("二级页面链接:", all_links) + print('二级页面长度:', len(all_links)) + return all_links + + def run(self): + html = self.requests.get(self.url).text + res = self.deal(url=self.url, html=html) + print(f'主页获取完成:{res.get("title")}') + all_links = self.fet_all_links(html) + if len(all_links) > 0: + print('静态页面---') + for index, link in enumerate(all_links): + try: + res = self.deal(url=link) + print(f'{link}:{res.get("title")} {index + 1}') + self._time_sleep() + except Exception as e: + print(e) + print(f'异常:{link} {e}') + # print(f'成功获取:{self.res}') + print(f'全部获取完成:{self.url} {len(self.res)}') + return self.res + else: + print('动态页面---') + html = self.requests.get_page(self.url) + print(html) + res = self.dealpage(url=self.url, html=html) + print(f'主页获取完成:{res.get("title")}') + all_links = self.fet_all_links(html) + for index, link in enumerate(all_links): + try: + res = self.deal(url=link) + print(f'{link}:{res.get("title")} {index + 1}') + self._time_sleep() + except Exception as e: + 
print(e) + print(f'异常:{link} {e}') + # print(f'成功获取:{self.res}') + print(f'全部获取完成:{self.url} {len(self.res)}') + return self.res + + +class StartPage(Start): + pass + + +if __name__ == '__main__': + url = 'https://www.essilor.com/cn-zh/' + d = Start(url).run() + with open('res.json', 'w', encoding='utf-8') as f: + f.write(json.dumps(d, ensure_ascii=False)) diff --git a/htmldata_get/res.json b/htmldata_get/res.json new file mode 100644 index 0000000..89c0ebb --- /dev/null +++ b/htmldata_get/res.json @@ -0,0 +1 @@ +[{"url": "https://www.essilor.com/cn-zh/", "title": "World leader in prescription lenses | Essilor", "keywords": "", "description": "Everyone everywhere should experience the life changing benefits of vision correction and vision protection. Choose your lenses from a committed brand.", "content": "World leader in prescription lenses | Essilor"}, {"url": "https://www.essilor.com/cn-zh/", "title": "World leader in prescription lenses | Essilor", "keywords": "", "description": "Everyone everywhere should experience the life changing benefits of vision correction and vision protection. 
# --- htmldata_get/sql.py ---
# One-shot schema setup for the task store used by api_main.py.
import sqlite3

conn = sqlite3.connect("spider.db")
cursor = conn.cursor()

# status: -2 queued, 0 done, "error" failed (written by api_main.py).
cursor.execute("""
CREATE TABLE IF NOT EXISTS tasks (
    task_id TEXT PRIMARY KEY,
    url TEXT,
    status TEXT,
    result TEXT,
    error TEXT,
    create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")

conn.commit()
conn.close()


# --- htmldata_get/t.py ---
# Scratch script: fetch one page and print its extracted metadata/content.
import requests
from bs4 import BeautifulSoup
import trafilatura


def fetch_html(url):
    """Download `url` and return its text decoded with the sniffed charset."""
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = resp.apparent_encoding
    return resp.text


def extract_meta(soup):
    """Return (title, keywords, description), falling back to og:* tags."""
    # soup.title.string is None for an empty or nested <title>; the old
    # code called .strip() on it unconditionally and crashed there.
    title = ""
    if soup.title and soup.title.string:
        title = soup.title.string.strip()

    def get_meta(name):
        # <meta name=...> with a non-empty content attribute.
        tag = soup.find("meta", attrs={"name": name})
        if tag and tag.get("content"):
            return tag["content"].strip()
        return ""

    def get_og(prop):
        # OpenGraph fallback: <meta property="og:...">.
        tag = soup.find("meta", attrs={"property": prop})
        if tag and tag.get("content"):
            return tag["content"].strip()
        return ""

    keywords = get_meta("keywords")
    description = get_meta("description")

    if not title:
        title = get_og("og:title")
    if not description:
        description = get_og("og:description")

    return title, keywords, description


def extract_text(html):
    """Main-content text via trafilatura, falling back to stripped soup text."""
    text = trafilatura.extract(html)
    if text:
        return text

    # Fallback: plain page text minus script/style/noscript.
    soup = BeautifulSoup(html, "lxml")
    for tag in soup(["script", "style", "noscript"]):
        tag.extract()
    return soup.get_text(separator="\n")


def parse_page(url):
    """Fetch `url` and return {url, title, keywords, description, content}."""
    html = fetch_html(url)
    soup = BeautifulSoup(html, "lxml")

    title, keywords, description = extract_meta(soup)
    content = extract_text(html)

    return {
        "url": url,
        "title": title,
        "keywords": keywords,
        "description": description,
        "content": content
    }


if __name__ == "__main__":
    url = "https://www.lheia.com/hzhb.html"
    data = parse_page(url)

    for k, v in data.items():
        print(f"{k}:\n{str(v)[:300]}\n")
wrapper-img
# --- htmldata_get/t/t2.py ---
# Scratch script: dump a page's HTML rendered by a real browser.
from DrissionPage import ChromiumPage

page = ChromiumPage()
page.get('https://www.essilor.com/cn-zh/')
print(page.html)
print(page)
page.close()


# --- htmldata_get/tools/class_int.py ---
import requests
from tools.retry import retry
from DrissionPage import SessionPage


class RequestsInt(object):
    """requests.Session wrapper with retries and a fixed referer/UA."""

    def __init__(self, url):
        # The start URL doubles as the referer for every request.
        self.ref = url
        self.session = requests.Session()

    def get_headers(self):
        """Default headers: desktop-Chrome UA plus the start URL as referer."""
        return {
            'referer': self.ref,
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/147.0.0.0 Safari/537.36'
        }

    @retry('get请求', 3)
    def get(self, url, params=None, headers=None, timeout=5, **kwargs):
        """GET `url`; returns the Response (False once retries are exhausted)."""
        if headers is None:
            headers = self.get_headers()
        r = self.session.get(url=url, params=params, headers=headers,
                             timeout=timeout, **kwargs)
        r.encoding = 'utf-8'
        return r

    @retry('post请求', 3)
    def post(self, url, params=None, headers=None, timeout=5, **kwargs):
        """POST to `url`; returns the Response (False once retries are exhausted).

        NOTE(review): `params` is sent as the query string, not the
        request body; pass data=/json= via **kwargs for a real payload.
        """
        if headers is None:
            headers = self.get_headers()
        r = self.session.post(url=url, params=params, headers=headers,
                              timeout=timeout, **kwargs)
        r.encoding = 'utf-8'
        return r

    def get_page(self, url):
        """Fetch `url` with DrissionPage's SessionPage and return its HTML."""
        page = SessionPage()
        page.get(url)
        # Read the HTML before closing -- the old code returned
        # page.html after close(), relying on unspecified behaviour.
        html = page.html
        page.close()
        return html
# --- htmldata_get/tools/deal_html.py ---
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import trafilatura


def extract_meta(soup):
    """Return (title, keywords, description), falling back to og:* tags."""
    # NOTE(review): soup.title.string may be None for an empty/nested
    # <title>; it is only assigned here (never .strip()ed), so that is
    # safe -- callers may receive None as the title.
    title = soup.title.string if soup.title else ""

    def get_meta(name):
        # <meta name=...> with a non-empty content attribute.
        tag = soup.find("meta", attrs={"name": name})
        if tag and tag.get("content", ''):
            return tag["content"].strip()
        return ""

    def get_og(prop):
        # OpenGraph fallback: <meta property="og:...">.
        tag = soup.find("meta", attrs={"property": prop})
        if tag and tag.get("content", ''):
            return tag["content"].strip()
        return ""

    keywords = get_meta("keywords")
    description = get_meta("description")

    if not title:
        title = get_og("og:title")
    if not description:
        description = get_og("og:description")

    return title, keywords, description


def extract_text(html):
    """Main-content text via trafilatura, falling back to stripped soup text."""
    text = trafilatura.extract(html)
    if text:
        return text

    # Fallback: plain page text minus script/style/noscript.
    soup = BeautifulSoup(html, "lxml")
    for tag in soup(["script", "style", "noscript"]):
        tag.extract()
    return soup.get_text(separator="\n")


def extract_news(html):
    """Best-effort scrape of <a> blocks that look like news entries."""
    soup = BeautifulSoup(html, "lxml")
    results = []

    for a in soup.find_all("a", href=True):
        # Title lives in a heading inside the link.
        h = a.find(["h1", "h2", "h3", "h4", "h5"])
        if not h:
            continue

        title = h.get_text(strip=True)

        def _is_time_tag(tag):
            # div/span whose class list mentions "time" or "date".  The
            # old lambda lacked parentheses, so `and` bound tighter than
            # `or` and ANY tag with class "date" matched regardless of
            # its name.
            classes = tag.get("class") or []
            return tag.name in ["div", "span"] and (
                "time" in classes or "date" in classes)

        time_tag = a.find(_is_time_tag)
        time_text = time_tag.get_text(strip=True) if time_tag else ""

        # Drop obviously-invalid titles.
        if len(title) < 5:
            continue

        results.append({
            "title": title,
            "text": time_text,
        })

    return results


def extract_blocks(html):
    """Collect the first sufficiently long heading/paragraph text of each
    candidate <a>/<div> content block."""
    soup = BeautifulSoup(html, "lxml")
    results = []

    # Any a/div might be a content card.
    candidates = soup.find_all(["a", "div"])
    for tag in candidates:
        text = tag.get_text(strip=True)

        # Skip short/noisy blocks.
        if len(text) < 10:
            continue

        for t in tag.find_all(["h1", "h2", "h3", "p", "span", "div"]):
            txt = t.get_text(strip=True)
            if len(txt) >= 10:
                print(txt)
                results.append({
                    "text": txt,
                })
                break

    return results
"span", "div"]): + txt = t.get_text(strip=True) + if len(txt) >=10: + print(txt) + results.append({ + "text": txt, + }) + break + + return results +def extract_chinese_and_english(html): + if not html: + return [], [] + + # 1. 匹配中文(通用版) + # [\u4e00-\u9fff] 覆盖常用简体/繁体汉字 + # \u3400-\u4dbf 覆盖扩展A区(生僻字、古汉字) + # 使用 re.UNICODE 确保编码兼容性 + chinese_pattern = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf]+', re.UNICODE) + chinese_list = chinese_pattern.findall(html) + + + return chinese_list +def deal_html(html, url): + soup = BeautifulSoup(html, "lxml") + + title, keywords, description = extract_meta(soup) + content = extract_text(html) + + return { + 'url': url, + "title": title, + "keywords": keywords, + "description": description, + "content": content + } + + +if __name__ == '__main__': + with open('../t/1.html', 'r', encoding='utf-8') as f: + h = f.read() + d = extract_chinese_and_english(h) + print(d) diff --git a/htmldata_get/tools/retry.py b/htmldata_get/tools/retry.py new file mode 100644 index 0000000..f54f35c --- /dev/null +++ b/htmldata_get/tools/retry.py @@ -0,0 +1,19 @@ +from functools import wraps + + +def retry(name='name', for_word=3): + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + for i in range(for_word): + try: + res = func(*args, **kwargs) + + return res + except Exception as e: + print(f'[{name}]:{e}') + return False + + return wrapper + + return decorator