12 changed files with 576 additions and 0 deletions
-
8htmldata_get/.idea/htmldata_get.iml
-
118htmldata_get/api_main.py
-
139htmldata_get/main.py
-
1htmldata_get/res.json
-
BINhtmldata_get/spider.db
-
18htmldata_get/sql.py
-
75htmldata_get/t.py
-
2htmldata_get/t/1.html
-
19htmldata_get/t/t2.py
-
39htmldata_get/tools/class_int.py
-
138htmldata_get/tools/deal_html.py
-
19htmldata_get/tools/retry.py
@ -0,0 +1,8 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<module type="PYTHON_MODULE" version="4"> |
||||
|
<component name="NewModuleRootManager"> |
||||
|
<content url="file://$MODULE_DIR$" /> |
||||
|
<orderEntry type="inheritedJdk" /> |
||||
|
<orderEntry type="sourceFolder" forTests="false" /> |
||||
|
</component> |
||||
|
</module> |
||||
@ -0,0 +1,118 @@ |
|||||
|
from main import Start |
||||
|
from fastapi import FastAPI |
||||
|
from pydantic import BaseModel |
||||
|
from threading import Thread |
||||
|
import queue |
||||
|
import uuid |
||||
|
import sqlite3 |
||||
|
import json |
||||
|
import hashlib |
||||
|
|
||||
|
app = FastAPI() |
||||
|
|
||||
|
task_queue = queue.Queue() |
||||
|
|
||||
|
|
||||
|
def get_conn():
    """Open a connection to the local task database.

    check_same_thread=False so connections can be used from the FastAPI
    handler threads as well as the background worker thread.
    """
    db_path = "spider.db"
    return sqlite3.connect(db_path, check_same_thread=False)
|
|
||||
|
|
||||
|
class Spider(BaseModel):
    """Request body for POST /crawler/put: the URL to crawl."""
    url: str
|
|
||||
|
|
||||
|
# 🔁 worker
def worker():
    """Background worker: pop (task_id, url) pairs off the queue, run the
    crawler, and persist the outcome into the `tasks` table.

    Status values: 0 = success, "error" = failure (what /crawler/get checks).
    """
    while True:
        task_id, url = task_queue.get()
        conn = get_conn()
        cur = conn.cursor()

        try:
            data = Start(url).run()

            # UPDATE (not INSERT OR REPLACE) so the `url` column written by
            # put_task is preserved instead of being wiped by a row replace.
            cur.execute("""
                UPDATE tasks
                SET status=?,
                    result=?
                WHERE task_id = ?
            """, (0, json.dumps(data, ensure_ascii=False), task_id))

        except Exception as e:
            cur.execute("""
                UPDATE tasks
                SET status=?,
                    error=?
                WHERE task_id = ?
            """, ("error", str(e), task_id))

        finally:
            # Always commit, release the connection and acknowledge the queue
            # item — even if the DB write above raised.
            conn.commit()
            conn.close()
            task_queue.task_done()


Thread(target=worker, daemon=True).start()
|
|
||||
|
|
||||
|
# ✅ 提交任务
@app.post("/crawler/put", summary='提交爬虫')
def put_task(req: Spider):
    """
    url:爬取的url
    """
    # Task id is derived from the URL, so resubmitting the same URL reuses
    # (and overwrites) the same row.
    task_id = str(hashlib.md5(req.url.encode('utf-8')).hexdigest())
    print(task_id)

    conn = get_conn()
    cur = conn.cursor()
    cur.execute(
        """
        INSERT OR REPLACE INTO tasks (task_id, url, status)
        VALUES (?, ?, ?)
        """,
        (task_id, req.url, -2),  # -2 == queued / in progress
    )
    conn.commit()
    conn.close()

    # Hand the task to the background worker thread.
    task_queue.put((task_id, req.url))

    return {'code': 0, 'data': {"task_id": task_id}, 'msg': '操作成功'}
|
|
||||
|
|
||||
|
# ✅ 获取结果
@app.get("/crawler/get/{task_id}", summary='获取数据')
def get_result(task_id: str):
    """
    task_id: 提交返回的的task_id
    code: 异常有问题:-1 正在处理:-2 成功:0
    """
    conn = get_conn()
    cur = conn.cursor()

    cur.execute("SELECT status, result, error FROM tasks WHERE task_id=?", (task_id,))
    row = cur.fetchone()
    conn.close()

    if not row:
        return {"code": -1, 'data': task_id, 'msg': '无此task_id'}

    status, result, error = row
    if status == "error":
        # Surface the stored error detail instead of discarding the `error`
        # column that was fetched above.
        return {"code": -1, 'data': task_id, 'msg': f'任务处理失败:{error}'}
    if int(status) == 0:
        msg = '操作成功'
    else:
        msg = '任务正在处理'

    return {
        "code": int(status),
        "data": json.loads(result) if result else None,
        "msg": msg
    }

# uvicorn api_main:app --host 0.0.0.0 --port 8000
# uvicorn api_main:app --host 0.0.0.0 --port 32000 --log-level debug
@ -0,0 +1,139 @@ |
|||||
|
import random |
||||
|
import time |
||||
|
from urllib.parse import urljoin, urlparse |
||||
|
from tools.class_int import RequestsInt |
||||
|
from tools.deal_html import deal_html |
||||
|
from bs4 import BeautifulSoup |
||||
|
import json |
||||
|
|
||||
|
|
||||
|
class Start(object):
    """Crawl a site: fetch the seed page, then every same-domain internal
    link found on it. Falls back to a browser fetch (DrissionPage) when the
    static HTML contains no links, i.e. the page appears to be JS-rendered.
    """

    def __init__(self, url):
        self.url = url
        self.requests = RequestsInt(url)
        self.res = []  # accumulated page records (dicts from deal_html)

        # ❌ resource extensions that are not HTML pages
        self.exclude_ext = {
            ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg",
            ".mp4", ".avi", ".mov", ".wmv", ".flv", ".mkv",
            ".mp3", ".wav",
            ".pdf", ".zip", ".rar", ".7z",
            ".css", ".js", ".ico", ".woff", ".woff2", ".ttf"
        }

    # ✅ True when the URL path does not end with a known resource extension
    def is_valid_page(self, url):
        path = urlparse(url).path.lower()
        for ext in self.exclude_ext:
            if path.endswith(ext):
                return False
        return True

    def get_internal_links(self, html):
        """Return deduplicated same-domain page links found in *html*."""
        soup = BeautifulSoup(html, "lxml")

        base_domain = urlparse(self.url).netloc
        links = set()

        for a in soup.find_all("a", href=True):
            href = a["href"].strip()

            # ❌ non-navigable schemes / fragments
            if href.startswith(("javascript:", "#", "mailto:", "tel:")):
                continue

            # ✅ resolve relative hrefs against the seed URL
            full_url = urljoin(self.url, href)

            # ✅ same domain only
            if urlparse(full_url).netloc != base_domain:
                continue

            # ✅ skip binary/static resources
            if not self.is_valid_page(full_url):
                continue

            links.add(full_url)

        return list(links)

    def deal(self, url, html=None):
        """Fetch *url* (unless *html* is given), parse it and record the result.

        Returns {} without recording anything on a non-200 response.
        """
        if not html:
            html = self.requests.get(url)
            if html.status_code == 200:
                html = html.text
            else:
                print(f'请求状态异常不处理:{html.status_code}')
                return {}
        data = deal_html(html, url)
        self.res.append(data)
        return data

    def dealpage(self, url, html):
        """Parse already-fetched *html* (browser-rendered) and record it."""
        data = deal_html(html, url)
        self.res.append(data)
        return data

    def _time_sleep(self, k=None):
        # Polite random delay between requests (0-1s by default).
        if not k:
            k = random.uniform(0, 1)
        print('等待:', k)
        time.sleep(k)

    def fet_all_links(self, html):
        """Extract and log the secondary page links from *html*."""
        all_links = self.get_internal_links(html)
        print("二级页面链接:", all_links)
        print('二级页面长度:', len(all_links))
        return all_links

    def _crawl_links(self, all_links):
        # Fetch each secondary link with a polite delay; a failure on one
        # link is logged and does not stop the crawl.
        for index, link in enumerate(all_links):
            try:
                res = self.deal(url=link)
                print(f'{link}:{res.get("title")} {index + 1}')
                self._time_sleep()
            except Exception as e:
                print(e)
                print(f'异常:{link} {e}')

    def run(self):
        """Crawl the seed URL and its internal links; return all page records."""
        html = self.requests.get(self.url).text
        res = self.deal(url=self.url, html=html)
        print(f'主页获取完成:{res.get("title")}')
        all_links = self.fet_all_links(html)
        if len(all_links) > 0:
            print('静态页面---')
        else:
            # No links in the static HTML — assume a JS-rendered page and
            # re-fetch through the browser, then re-extract the links.
            print('动态页面---')
            html = self.requests.get_page(self.url)
            print(html)
            res = self.dealpage(url=self.url, html=html)
            print(f'主页获取完成:{res.get("title")}')
            all_links = self.fet_all_links(html)
        self._crawl_links(all_links)
        print(f'全部获取完成:{self.url} {len(self.res)}')
        return self.res
|
|
||||
|
|
||||
|
class StartPage(Start):
    """Placeholder subclass of Start; currently adds no behavior."""
    pass
|
|
||||
|
|
||||
|
if __name__ == '__main__':
    # Crawl a sample site and dump all page records to res.json.
    url = 'https://www.essilor.com/cn-zh/'
    result = Start(url).run()
    with open('res.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False)
@ -0,0 +1 @@ |
|||||
|
[{"url": "https://www.essilor.com/cn-zh/", "title": "World leader in prescription lenses | Essilor", "keywords": "", "description": "Everyone everywhere should experience the life changing benefits of vision correction and vision protection. Choose your lenses from a committed brand.", "content": "World leader in prescription lenses | Essilor"}, {"url": "https://www.essilor.com/cn-zh/", "title": "World leader in prescription lenses | Essilor", "keywords": "", "description": "Everyone everywhere should experience the life changing benefits of vision correction and vision protection. Choose your lenses from a committed brand.", "content": "World leader in prescription lenses | Essilor"}] |
||||
@ -0,0 +1,18 @@ |
|||||
|
import sqlite3


# One-off schema bootstrap for the crawler's task store (spider.db).
# try/finally ensures the connection is closed even if the DDL fails.
conn = sqlite3.connect("spider.db")
try:
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS tasks (
            task_id TEXT PRIMARY KEY,
            url TEXT,
            status TEXT,
            result TEXT,
            error TEXT,
            create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)
    conn.commit()
finally:
    conn.close()
@ -0,0 +1,75 @@ |
|||||
|
import requests |
||||
|
from bs4 import BeautifulSoup |
||||
|
import trafilatura |
||||
|
|
||||
|
def fetch_html(url):
    """Download *url* and return the response body as text."""
    resp = requests.get(
        url,
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=10,
    )
    # Let requests sniff the charset from the body, not just the header.
    resp.encoding = resp.apparent_encoding
    return resp.text
|
|
||||
|
|
||||
|
def extract_meta(soup):
    """Extract (title, keywords, description) from a parsed page.

    Falls back to the OpenGraph tags when the standard <title> /
    description meta are missing.
    """
    # soup.title.string is None for an empty <title></title>, so guard it
    # before calling .strip() (the original raised AttributeError there).
    title = ""
    if soup.title and soup.title.string:
        title = soup.title.string.strip()

    def get_meta(name):
        tag = soup.find("meta", attrs={"name": name})
        if tag and tag.get("content"):
            return tag["content"].strip()
        return ""

    # 兜底 OG
    def get_og(prop):
        tag = soup.find("meta", attrs={"property": prop})
        if tag and tag.get("content"):
            return tag["content"].strip()
        return ""

    keywords = get_meta("keywords")
    description = get_meta("description")

    if not title:
        title = get_og("og:title")
    if not description:
        description = get_og("og:description")

    return title, keywords, description
|
|
||||
|
|
||||
|
def extract_text(html):
    """Extract readable text: trafilatura first, plain-text fallback."""
    # ⭐ main-content extraction algorithm first
    extracted = trafilatura.extract(html)
    if extracted:
        return extracted

    # 🔁 fallback when the extractor fails: drop scripts/styles and
    # flatten whatever text remains.
    soup = BeautifulSoup(html, "lxml")
    for junk in soup(["script", "style", "noscript"]):
        junk.extract()
    return soup.get_text(separator="\n")
|
|
||||
|
|
||||
|
def parse_page(url):
    """Fetch *url* and return its metadata plus extracted body text."""
    html = fetch_html(url)
    soup = BeautifulSoup(html, "lxml")

    title, keywords, description = extract_meta(soup)

    return {
        "url": url,
        "title": title,
        "keywords": keywords,
        "description": description,
        "content": extract_text(html),
    }
|
|
||||
|
|
||||
|
if __name__ == "__main__":
    # Quick manual check: parse one page and print truncated fields.
    url = "https://www.lheia.com/hzhb.html"
    data = parse_page(url)

    for k, v in data.items():
        print(f"{k}:\n{str(v)[:300]}\n")
2
htmldata_get/t/1.html
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
@ -0,0 +1,19 @@ |
|||||
|
# # Import (earlier attempt: plain HTTP session, no JS rendering)
# from DrissionPage import SessionPage
# # Create the page object
# page = SessionPage()
# # Visit the page
# page.get('https://www.essilor.com/cn-zh/')
# print(page.html)
# print(page)
# page.close()

# Import
from DrissionPage import ChromiumPage
# Create the page object (drives a real Chromium browser, so JS runs)
page = ChromiumPage()
# Visit the page
page.get('https://www.essilor.com/cn-zh/')
print(page.html)
print(page)
page.close()
@ -0,0 +1,39 @@ |
|||||
|
import requests |
||||
|
from tools.retry import retry |
||||
|
from DrissionPage import SessionPage |
||||
|
|
||||
|
|
||||
|
class RequestsInt(object):
    """Thin wrapper around requests.Session with retries and a
    browser-rendered fallback (DrissionPage) for JS-heavy pages.
    """

    def __init__(self, url):
        self.ref = url  # seed URL, sent as the Referer header
        self.session = requests.Session()

    def get_headers(self):
        """Default headers: desktop Chrome UA plus the seed URL as referer."""
        return {
            'referer': self.ref,
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/147.0.0.0 Safari/537.36'
        }

    @retry('get请求', 3)
    def get(self, url, params=None, headers=None, timeout=5, **kwargs):
        """GET *url*; response text is decoded as UTF-8."""
        if headers is None:
            headers = self.get_headers()
        r = self.session.get(url=url, params=params, headers=headers, timeout=timeout, **kwargs)
        r.encoding = 'utf-8'
        return r

    @retry('post请求', 3)
    def post(self, url, params=None, headers=None, timeout=5, **kwargs):
        """POST to *url*; response text is decoded as UTF-8."""
        if headers is None:
            headers = self.get_headers()
        r = self.session.post(url=url, params=params, headers=headers, timeout=timeout, **kwargs)
        r.encoding = 'utf-8'
        return r

    def get_page(self, url):
        """Fetch *url* via DrissionPage and return the page HTML."""
        page = SessionPage()
        # 访问网页
        page.get(url)
        # Read the HTML *before* closing the page: after close() the page
        # object may no longer hold the document (original closed first).
        html = page.html
        page.close()
        return html
@ -0,0 +1,138 @@ |
|||||
|
import re |
||||
|
from urllib.parse import urljoin |
||||
|
|
||||
|
import requests |
||||
|
from bs4 import BeautifulSoup |
||||
|
import trafilatura |
||||
|
|
||||
|
|
||||
|
def extract_meta(soup):
    """Return (title, keywords, description), using OG tags as fallback."""
    title = soup.title.string if soup.title else ""

    def _meta_content(attr, value):
        # Shared lookup for both name= and property= meta tags.
        tag = soup.find("meta", attrs={attr: value})
        if tag and tag.get("content", ''):
            return tag["content"].strip()
        return ""

    keywords = _meta_content("name", "keywords")
    description = _meta_content("name", "description")

    # 兜底 OG
    if not title:
        title = _meta_content("property", "og:title")
    if not description:
        description = _meta_content("property", "og:description")

    return title, keywords, description
|
|
||||
|
|
||||
|
def extract_text(html):
    """Main-content text via trafilatura, with a crude full-text fallback."""
    # ⭐ main-content extractor first
    text = trafilatura.extract(html)
    if not text:
        # 🔁 fallback: strip non-content tags and flatten the rest
        soup = BeautifulSoup(html, "lxml")
        for tag in soup(["script", "style", "noscript"]):
            tag.extract()
        text = soup.get_text(separator="\n")
    return text
|
|
||||
|
|
||||
|
def extract_news(html):
    """Scrape headline/date pairs from anchor blocks in *html*.

    Returns a list of {"title": ..., "text": ...} dicts; titles shorter
    than 5 characters are discarded as noise.
    """
    soup = BeautifulSoup(html, "lxml")
    results = []

    def _is_time_tag(tag):
        # A div/span whose class list contains "time" or "date".
        # The original lambda lacked parentheses: `A and B or C` let any
        # element with a "date" class match regardless of its tag name.
        classes = tag.get("class") or []
        return tag.name in ["div", "span"] and ("time" in classes or "date" in classes)

    for a in soup.find_all("a", href=True):
        # 👉 标题
        h = a.find(["h1", "h2", "h3", "h4", "h5"])
        if not h:
            continue

        title = h.get_text(strip=True)

        # 👉 时间(常见 class:time / date)
        time_tag = a.find(_is_time_tag)
        time_text = time_tag.get_text(strip=True) if time_tag else ""

        # ❗过滤无效
        if len(title) < 5:
            continue

        results.append({
            "title": title,
            "text": time_text,
        })

    return results
|
|
||||
|
|
||||
|
def extract_blocks(html):
    """Collect the first sufficiently-long text snippet from each
    candidate content block (any <a> or <div>) in *html*.
    """
    soup = BeautifulSoup(html, "lxml")
    results = []

    # 👉 every <a> / <div> is a potential content block
    for candidate in soup.find_all(["a", "div"]):
        block_text = candidate.get_text(strip=True)

        # ❗ skip short blocks (noise)
        if len(block_text) < 10:
            continue

        # 👉 take the first inner element whose own text is long enough
        for inner in candidate.find_all(["h1", "h2", "h3", "p", "span", "div"]):
            snippet = inner.get_text(strip=True)
            if len(snippet) >= 10:
                print(snippet)
                results.append({
                    "text": snippet,
                })
                break

    return results
|
def extract_chinese_and_english(html):
    """Return every run of Chinese characters found in *html* as a list.

    Always returns a single list: the original returned a ([], []) tuple
    for empty input but a plain list otherwise, which was an inconsistent
    return type for callers.
    """
    if not html:
        return []

    # [\u4e00-\u9fff] covers common simplified/traditional Han characters;
    # \u3400-\u4dbf covers CJK Extension A (rare / archaic characters).
    chinese_pattern = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf]+', re.UNICODE)
    return chinese_pattern.findall(html)
|
def deal_html(html, url):
    """Parse *html* into the standard page record used by the crawler."""
    soup = BeautifulSoup(html, "lxml")
    title, keywords, description = extract_meta(soup)
    return {
        'url': url,
        "title": title,
        "keywords": keywords,
        "description": description,
        "content": extract_text(html),
    }
|
|
||||
|
|
||||
|
if __name__ == '__main__':
    # Manual check: extract Chinese text runs from a saved sample page.
    with open('../t/1.html', 'r', encoding='utf-8') as f:
        page_html = f.read()
    matches = extract_chinese_and_english(page_html)
    print(matches)
@ -0,0 +1,19 @@ |
|||||
|
from functools import wraps |
||||
|
|
||||
|
|
||||
|
def retry(name='name', for_word=3):
    """Decorator factory: retry the wrapped callable up to *for_word* times.

    Each failure is logged with the label *name*; if every attempt raises,
    False is returned instead of propagating the last exception (callers
    check the result). As rendered, the original returned False inside the
    except branch, so it never actually retried.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(for_word):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    # log and fall through to the next attempt
                    print(f'[{name}]:{e}')
            return False

        return wrapper

    return decorator
Write
Preview
Loading…
Cancel
Save
Reference in new issue