
Handle the website homepage

Branch: master
zhangwen committed 2 weeks ago
Commit: 3a0a14d957
6 changed files:

  1. htmldata_get/api_main.py (21 changes)
  2. htmldata_get/api_main.py_ (118 additions, new file)
  3. htmldata_get/main.py (20 changes)
  4. htmldata_get/res.json (80 changes)
  5. htmldata_get/tools/class_int.py (33 changes)
  6. htmldata_get/tools/deal_html.py (20 changes)

htmldata_get/api_main.py (21 changes)

@@ -10,8 +10,6 @@ import hashlib
 app = FastAPI()
-task_queue = queue.Queue()

 def get_conn():
     return sqlite3.connect("spider.db", check_same_thread=False)
@@ -22,21 +20,12 @@ class Spider(BaseModel):
 # 🔁 worker
-def worker():
-    while True:
-        task_id, url = task_queue.get()
+def worker(task_id, url):
     conn = get_conn()
     cur = conn.cursor()
     try:
         data = Start(url).run()
-        # cur.execute("""
-        #     UPDATE tasks
-        #     SET status=?,
-        #         result=?
-        #     WHERE task_id = ?
-        # """, (0, json.dumps(data, ensure_ascii=False), task_id))
         cur.execute("""
             INSERT OR REPLACE INTO tasks (task_id, status, result)
             VALUES (?, ?, ?)
@@ -49,13 +38,9 @@ def worker():
                 error=?
             WHERE task_id = ?
         """, ("error", str(e), task_id))
+    finally:
         conn.commit()
         conn.close()
-        task_queue.task_done()
-
-Thread(target=worker, daemon=True).start()

 # ✅ Submit a task
@@ -78,7 +63,7 @@ def put_task(req: Spider):
     conn.commit()
     conn.close()
-    task_queue.put((task_id, req.url))
+    Thread(target=worker, args=(task_id, req.url)).start()
     return {'code': 0, 'data': {"task_id": task_id}, 'msg': '操作成功'}
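This change replaces the single queue-draining worker with one worker thread per submitted task, so crawls no longer serialize behind each other. A minimal client sketch of the resulting submit-then-poll flow; the base URL and port are assumptions taken from the uvicorn comment at the bottom of api_main.py_:

    import time
    import requests

    BASE = "http://127.0.0.1:32000"  # assumed host/port, per the uvicorn comment

    # Submit a crawl; the task_id is the MD5 of the URL, so resubmits are idempotent
    resp = requests.post(f"{BASE}/crawler/put", json={"url": "https://example.com"}).json()
    task_id = resp["data"]["task_id"]

    # Poll until the worker thread flips status from -2 (processing) to 0 (done) or -1 (error)
    while True:
        res = requests.get(f"{BASE}/crawler/get/{task_id}").json()
        if res["code"] != -2:
            break
        time.sleep(1)
    print(res["code"], res["msg"])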

htmldata_get/api_main.py_ (new file, 118 lines)

@@ -0,0 +1,118 @@
+from main import Start
+from fastapi import FastAPI
+from pydantic import BaseModel
+from threading import Thread
+import queue
+import uuid
+import sqlite3
+import json
+import hashlib
+
+app = FastAPI()
+task_queue = queue.Queue()
+
+def get_conn():
+    return sqlite3.connect("spider.db", check_same_thread=False)
+
+class Spider(BaseModel):
+    url: str
+
+# 🔁 worker
+def worker():
+    while True:
+        task_id, url = task_queue.get()
+        conn = get_conn()
+        cur = conn.cursor()
+        try:
+            data = Start(url).run()
+            # cur.execute("""
+            #     UPDATE tasks
+            #     SET status=?,
+            #         result=?
+            #     WHERE task_id = ?
+            # """, (0, json.dumps(data, ensure_ascii=False), task_id))
+            cur.execute("""
+                INSERT OR REPLACE INTO tasks (task_id, status, result)
+                VALUES (?, ?, ?)
+            """, (task_id, 0, json.dumps(data, ensure_ascii=False)))
+        except Exception as e:
+            cur.execute("""
+                UPDATE tasks
+                SET status=?,
+                    error=?
+                WHERE task_id = ?
+            """, ("error", str(e), task_id))
+        conn.commit()
+        conn.close()
+        task_queue.task_done()
+
+Thread(target=worker, daemon=True).start()
+
+# ✅ Submit a task
+@app.post("/crawler/put", summary='Submit crawl task')
+def put_task(req: Spider):
+    """
+    url: the URL to crawl
+    """
+    task_id = str(hashlib.md5(req.url.encode('utf-8')).hexdigest())
+    print(task_id)
+    conn = get_conn()
+    cur = conn.cursor()
+    cur.execute("""
+        INSERT OR REPLACE INTO tasks (task_id, url, status)
+        VALUES (?, ?, ?)
+    """, (task_id, req.url, -2))
+    conn.commit()
+    conn.close()
+    task_queue.put((task_id, req.url))
+    return {'code': 0, 'data': {"task_id": task_id}, 'msg': '操作成功'}
+
+# ✅ Fetch a result
+@app.get("/crawler/get/{task_id}", summary='Fetch data')
+def get_result(task_id: str):
+    """
+    task_id: the task_id returned by the submit endpoint
+    code: -1 = error, -2 = processing, 0 = success
+    """
+    conn = get_conn()
+    cur = conn.cursor()
+    cur.execute("SELECT status, result, error FROM tasks WHERE task_id=?", (task_id,))
+    row = cur.fetchone()
+    conn.close()
+    if not row:
+        return {"code": -1, 'data': task_id, 'msg': '无此task_id'}
+    status, result, error = row
+    if status == "error":
+        return {"code": -1, 'data': task_id, 'msg': '任务处理失败'}
+    if int(status) == 0:
+        msg = '操作成功'
+    else:
+        msg = '任务正在处理'
+    return {
+        "code": int(status),
+        "data": json.loads(result) if result else None,
+        "msg": msg
+    }
+
+# uvicorn api_main:app --host 0.0.0.0 --port 8000
+# uvicorn api_main:app --host 0.0.0.0 --port 32000 --log-level debug
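This underscore-suffixed file preserves the previous queue-based variant, in which a single daemon worker drains a shared queue and tasks run one at a time in submission order. A stripped-down sketch of that pattern, independent of FastAPI and sqlite:

    import queue
    from threading import Thread

    task_queue = queue.Queue()

    def worker():
        # Single consumer: tasks are processed strictly one at a time, in FIFO order
        while True:
            task_id, url = task_queue.get()
            try:
                print(f"crawling {url} for task {task_id}")
            finally:
                task_queue.task_done()  # mark the item as handled

    Thread(target=worker, daemon=True).start()
    task_queue.put(("demo", "https://example.com"))
    task_queue.join()  # block until every queued task has been processed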

htmldata_get/main.py (20 changes)

@@ -2,7 +2,7 @@ import random
 import time
 from urllib.parse import urljoin, urlparse
 from tools.class_int import RequestsInt
-from tools.deal_html import deal_html
+from tools.deal_html import deal_html, extract_chinese_and_english
 from bs4 import BeautifulSoup
 import json
@@ -74,7 +74,7 @@ class Start(object):
         return data

     def dealpage(self, url, html):
-        data = deal_html(html, url)
+        data = extract_chinese_and_english(html, url)
         self.res.append(data)
         return data
@@ -92,12 +92,13 @@ class Start(object):
     def run(self):
         html = self.requests.get(self.url).text
-        res = self.deal(url=self.url, html=html)
-        print(f'主页获取完成:{res.get("title")}')
         all_links = self.fet_all_links(html)
-        if len(all_links) > 0:
+        if len(all_links) > 1:
             print('静态页面---')
-            for index, link in enumerate(all_links):
+            res = self.deal(url=self.url, html=html)
+            print(f'主页获取完成:{res.get("title")}')
+            for index, link in enumerate(all_links[:50]):
                 try:
                     res = self.deal(url=link)
                     print(f'{link}:{res.get("title")} {index + 1}')
@@ -111,13 +112,14 @@ class Start(object):
         else:
             print('动态页面---')
             html = self.requests.get_page(self.url)
-            print(html)
+            # print(html)
             res = self.dealpage(url=self.url, html=html)
             print(f'主页获取完成:{res.get("title")}')
             all_links = self.fet_all_links(html)
-            for index, link in enumerate(all_links):
+            for index, link in enumerate(all_links[:10]):
                 try:
-                    res = self.deal(url=link)
+                    html = self.requests.get_page(link)
+                    res = self.dealpage(url=link, html=html)
                     print(f'{link}:{res.get("title")} {index + 1}')
                     self._time_sleep()
                 except Exception as e:
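Net effect in run(): a homepage is treated as static only when the plain HTTP fetch yields more than one link, static crawls are capped at 50 links, browser-rendered crawls at 10, and each dynamic link is now fetched through get_page before parsing. A minimal driver sketch, assuming it runs from the htmldata_get/ directory (so the tools imports resolve) and that run() returns the records accumulated in self.res, as the json.dumps of its result in api_main.py suggests:

    from main import Start

    # Crawl the homepage plus a capped number of its links; each record is
    # expected to carry url/title/keywords/description/content fields.
    records = Start("https://example.com").run()
    print(f"{len(records)} pages collected")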

htmldata_get/res.json (80 changes)
File diff suppressed because it is too large

htmldata_get/tools/class_int.py (33 changes)

@@ -1,6 +1,22 @@
 import requests
+from DrissionPage._configs.chromium_options import ChromiumOptions
 from tools.retry import retry
-from DrissionPage import SessionPage
+from DrissionPage import SessionPage, ChromiumPage, Chromium
+
+def co_int():
+    co = ChromiumOptions()
+    # skip image loading and mute audio
+    co.no_imgs(True).mute(True)
+    co.incognito()  # incognito mode
+    co.headless()  # headless mode
+    co.set_argument('--no-sandbox')  # no-sandbox mode
+    co.auto_port(on_off=True)
+    return co

 class RequestsInt(object):
@@ -31,9 +47,20 @@ class RequestsInt(object):
         return r

+    @retry('get_page请求', 3)
     def get_page(self, url):
-        page = SessionPage()
-        # visit the page
-        page.get(url)
-        return page.html
+        try:
+            co = co_int()
+            page = Chromium(addr_or_opts=co).new_tab()
+            # visit the page
+            page.get(url)
+            html = page.html  # capture the rendered DOM before closing the tab
+            page.close()
+            return html
+        except Exception as e:
+            print('e', e)
+            return ''
+        finally:
+            try:
+                pass
+                # page.close()
+            except:
+                pass
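This swaps the requests-backed SessionPage for a real headless Chromium, which is what lets get_page see JavaScript-rendered markup. A standalone sketch of the same fetch pattern, without the retry decorator, using the identifiers as they appear in the diff:

    from DrissionPage import Chromium
    from DrissionPage._configs.chromium_options import ChromiumOptions

    co = ChromiumOptions()
    co.no_imgs(True).mute(True)      # skip images and mute audio to speed up loads
    co.incognito()                   # throwaway profile
    co.headless()                    # no visible browser window
    co.set_argument('--no-sandbox')  # needed when running as root, e.g. in Docker
    co.auto_port(on_off=True)        # pick a free debugging port per instance

    tab = Chromium(addr_or_opts=co).new_tab()
    tab.get("https://example.com")
    html = tab.html                  # read the rendered DOM before closing
    tab.close()
    print(len(html))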

htmldata_get/tools/deal_html.py (20 changes)

@@ -95,7 +95,7 @@ def extract_blocks(html):
     # 👉 title (prefer short text)
     for t in tag.find_all(["h1", "h2", "h3", "p", "span", "div"]):
         txt = t.get_text(strip=True)
-        if len(txt) >=10:
+        if len(txt) >= 10:
             print(txt)
             results.append({
                 "text": txt,
@@ -103,19 +103,29 @@
             break
     return results

-def extract_chinese_and_english(html):
-    if not html:
-        return [], []
+def extract_chinese_and_english(html, url):
     # 1. match Chinese text (general version)
     # [\u4e00-\u9fff] covers common simplified/traditional Han characters
     # \u3400-\u4dbf covers Extension A (rare and archaic characters)
     # re.UNICODE is used for encoding compatibility
     chinese_pattern = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf]+', re.UNICODE)
     chinese_list = chinese_pattern.findall(html)
-    return chinese_list
+    chinese_list = set(chinese_list)  # deduplicate the matched runs
+    soup = BeautifulSoup(html, "lxml")
+    title = soup.title.string if soup.title else ""
+    return {
+        'url': url,
+        "title": title,
+        "keywords": '',
+        "description": '',
+        "content": ','.join(chinese_list)
+    }

 def deal_html(html, url):
     soup = BeautifulSoup(html, "lxml")
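The rewritten extractor now returns a page record (url, title, keywords, description, content) instead of a bare list of matches, which is what lets main.py swap it in for dynamic pages. A small sketch of what it yields; the sample HTML and URL are made up, and since the matches pass through a set, the join order of content is arbitrary:

    from tools.deal_html import extract_chinese_and_english

    html = "<html><head><title>测试页</title></head><body><p>你好,世界</p></body></html>"
    record = extract_chinese_and_english(html, "https://example.com")
    # -> {'url': 'https://example.com', 'title': '测试页', 'keywords': '',
    #     'description': '', 'content': '你好,世界,测试页'}  # content order is arbitrary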
