
Handle the website homepage

Branch: master
zhangwen committed 2 weeks ago
Commit: 3a0a14d957
6 changed files:

  1. htmldata_get/api_main.py (21 changes)
  2. htmldata_get/api_main.py_ (118 additions, new file)
  3. htmldata_get/main.py (20 changes)
  4. htmldata_get/res.json (80 changes)
  5. htmldata_get/tools/class_int.py (33 changes)
  6. htmldata_get/tools/deal_html.py (20 changes)

htmldata_get/api_main.py (21 changes)

@@ -10,8 +10,6 @@ import hashlib
 app = FastAPI()
-task_queue = queue.Queue()

 def get_conn():
     return sqlite3.connect("spider.db", check_same_thread=False)
@@ -22,21 +20,12 @@ class Spider(BaseModel):
 # 🔁 worker
-def worker():
-    while True:
-        task_id, url = task_queue.get()
+def worker(task_id, url):
     conn = get_conn()
     cur = conn.cursor()
     try:
         data = Start(url).run()
-        # cur.execute("""
-        #     UPDATE tasks
-        #     SET status=?,
-        #         result=?
-        #     WHERE task_id = ?
-        # """, (0, json.dumps(data, ensure_ascii=False), task_id))
         cur.execute("""
             INSERT OR REPLACE INTO tasks (task_id, status, result)
             VALUES (?, ?, ?)
@@ -49,13 +38,9 @@ def worker():
                 error=?
             WHERE task_id = ?
         """, ("error", str(e), task_id))
+    finally:
         conn.commit()
         conn.close()
-        task_queue.task_done()
-
-Thread(target=worker, daemon=True).start()

 # ✅ Submit a task
@@ -78,7 +63,7 @@ def put_task(req: Spider):
     conn.commit()
     conn.close()
-    task_queue.put((task_id, req.url))
+    Thread(target=worker, args=(task_id, req.url)).start()
     return {'code': 0, 'data': {"task_id": task_id}, 'msg': '操作成功'}
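This change replaces the single queue-draining worker with one worker thread per submitted task, so crawls no longer serialize behind each other. A minimal client sketch of the resulting submit-then-poll flow; the base URL and port are assumptions taken from the uvicorn comment at the bottom of api_main.py_:

    import time
    import requests

    BASE = "http://127.0.0.1:32000"  # assumed host/port, per the uvicorn comment

    # Submit a crawl; the task_id is the MD5 of the URL, so resubmits are idempotent
    resp = requests.post(f"{BASE}/crawler/put", json={"url": "https://example.com"}).json()
    task_id = resp["data"]["task_id"]

    # Poll until the worker thread flips status from -2 (processing) to 0 (done) or -1 (error)
    while True:
        res = requests.get(f"{BASE}/crawler/get/{task_id}").json()
        if res["code"] != -2:
            break
        time.sleep(1)
    print(res["code"], res["msg"])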

htmldata_get/api_main.py_ (new file, 118 lines)

@@ -0,0 +1,118 @@
+from main import Start
+from fastapi import FastAPI
+from pydantic import BaseModel
+from threading import Thread
+import queue
+import uuid
+import sqlite3
+import json
+import hashlib
+
+app = FastAPI()
+task_queue = queue.Queue()
+
+def get_conn():
+    return sqlite3.connect("spider.db", check_same_thread=False)
+
+class Spider(BaseModel):
+    url: str
+
+# 🔁 worker
+def worker():
+    while True:
+        task_id, url = task_queue.get()
+        conn = get_conn()
+        cur = conn.cursor()
+        try:
+            data = Start(url).run()
+            # cur.execute("""
+            #     UPDATE tasks
+            #     SET status=?,
+            #         result=?
+            #     WHERE task_id = ?
+            # """, (0, json.dumps(data, ensure_ascii=False), task_id))
+            cur.execute("""
+                INSERT OR REPLACE INTO tasks (task_id, status, result)
+                VALUES (?, ?, ?)
+            """, (task_id, 0, json.dumps(data, ensure_ascii=False)))
+        except Exception as e:
+            cur.execute("""
+                UPDATE tasks
+                SET status=?,
+                    error=?
+                WHERE task_id = ?
+            """, ("error", str(e), task_id))
+        conn.commit()
+        conn.close()
+        task_queue.task_done()
+
+Thread(target=worker, daemon=True).start()
+
+# ✅ Submit a task
+@app.post("/crawler/put", summary='Submit crawl task')
+def put_task(req: Spider):
+    """
+    url: the URL to crawl
+    """
+    task_id = str(hashlib.md5(req.url.encode('utf-8')).hexdigest())
+    print(task_id)
+    conn = get_conn()
+    cur = conn.cursor()
+    cur.execute("""
+        INSERT OR REPLACE INTO tasks (task_id, url, status)
+        VALUES (?, ?, ?)
+    """, (task_id, req.url, -2))
+    conn.commit()
+    conn.close()
+    task_queue.put((task_id, req.url))
+    return {'code': 0, 'data': {"task_id": task_id}, 'msg': '操作成功'}
+
+# ✅ Fetch a result
+@app.get("/crawler/get/{task_id}", summary='Fetch data')
+def get_result(task_id: str):
+    """
+    task_id: the task_id returned by the submit endpoint
+    code: -1 = error, -2 = processing, 0 = success
+    """
+    conn = get_conn()
+    cur = conn.cursor()
+    cur.execute("SELECT status, result, error FROM tasks WHERE task_id=?", (task_id,))
+    row = cur.fetchone()
+    conn.close()
+    if not row:
+        return {"code": -1, 'data': task_id, 'msg': '无此task_id'}
+    status, result, error = row
+    if status == "error":
+        return {"code": -1, 'data': task_id, 'msg': '任务处理失败'}
+    if int(status) == 0:
+        msg = '操作成功'
+    else:
+        msg = '任务正在处理'
+    return {
+        "code": int(status),
+        "data": json.loads(result) if result else None,
+        "msg": msg
+    }
+
+# uvicorn api_main:app --host 0.0.0.0 --port 8000
+# uvicorn api_main:app --host 0.0.0.0 --port 32000 --log-level debug
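This underscore-suffixed file preserves the previous queue-based variant, in which a single daemon worker drains a shared queue and tasks run one at a time in submission order. A stripped-down sketch of that pattern, independent of FastAPI and sqlite:

    import queue
    from threading import Thread

    task_queue = queue.Queue()

    def worker():
        # Single consumer: tasks are processed strictly one at a time, in FIFO order
        while True:
            task_id, url = task_queue.get()
            try:
                print(f"crawling {url} for task {task_id}")
            finally:
                task_queue.task_done()  # mark the item as handled

    Thread(target=worker, daemon=True).start()
    task_queue.put(("demo", "https://example.com"))
    task_queue.join()  # block until every queued task has been processed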

htmldata_get/main.py (20 changes)

@@ -2,7 +2,7 @@ import random
 import time
 from urllib.parse import urljoin, urlparse
 from tools.class_int import RequestsInt
-from tools.deal_html import deal_html
+from tools.deal_html import deal_html, extract_chinese_and_english
 from bs4 import BeautifulSoup
 import json
@@ -74,7 +74,7 @@ class Start(object):
         return data

     def dealpage(self, url, html):
-        data = deal_html(html, url)
+        data = extract_chinese_and_english(html, url)
         self.res.append(data)
         return data
@@ -92,12 +92,13 @@ class Start(object):
     def run(self):
         html = self.requests.get(self.url).text
-        res = self.deal(url=self.url, html=html)
-        print(f'主页获取完成:{res.get("title")}')
         all_links = self.fet_all_links(html)
-        if len(all_links) > 0:
+        if len(all_links) > 1:
             print('静态页面---')
-            for index, link in enumerate(all_links):
+            res = self.deal(url=self.url, html=html)
+            print(f'主页获取完成:{res.get("title")}')
+            for index, link in enumerate(all_links[:50]):
                 try:
                     res = self.deal(url=link)
                     print(f'{link}:{res.get("title")} {index + 1}')
@@ -111,13 +112,14 @@ class Start(object):
         else:
             print('动态页面---')
             html = self.requests.get_page(self.url)
-            print(html)
+            # print(html)
             res = self.dealpage(url=self.url, html=html)
             print(f'主页获取完成:{res.get("title")}')
             all_links = self.fet_all_links(html)
-            for index, link in enumerate(all_links):
+            for index, link in enumerate(all_links[:10]):
                 try:
-                    res = self.deal(url=link)
+                    html = self.requests.get_page(link)
+                    res = self.dealpage(url=link, html=html)
                     print(f'{link}:{res.get("title")} {index + 1}')
                     self._time_sleep()
                 except Exception as e:
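Net effect in run(): a homepage is treated as static only when the plain HTTP fetch yields more than one link, static crawls are capped at 50 links, browser-rendered crawls at 10, and each dynamic link is now fetched through get_page before parsing. A minimal driver sketch, assuming it runs from the htmldata_get/ directory (so the tools imports resolve) and that run() returns the records accumulated in self.res, as the json.dumps of its result in api_main.py suggests:

    from main import Start

    # Crawl the homepage plus a capped number of its links; each record is
    # expected to carry url/title/keywords/description/content fields.
    records = Start("https://example.com").run()
    print(f"{len(records)} pages collected")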

htmldata_get/res.json (80 changes)
File diff suppressed because it is too large

htmldata_get/tools/class_int.py (33 changes)

@@ -1,6 +1,22 @@
 import requests
+from DrissionPage._configs.chromium_options import ChromiumOptions
 from tools.retry import retry
-from DrissionPage import SessionPage
+from DrissionPage import SessionPage, ChromiumPage, Chromium
+
+def co_int():
+    co = ChromiumOptions()
+    # skip image loading and mute audio
+    co.no_imgs(True).mute(True)
+    co.incognito()  # incognito mode
+    co.headless()  # headless mode
+    co.set_argument('--no-sandbox')  # no-sandbox mode
+    co.auto_port(on_off=True)
+    return co

 class RequestsInt(object):
@@ -31,9 +47,20 @@ class RequestsInt(object):
         return r

+    @retry('get_page请求', 3)
     def get_page(self, url):
-        page = SessionPage()
-        # visit the page
-        page.get(url)
-        return page.html
+        try:
+            co = co_int()
+            page = Chromium(addr_or_opts=co).new_tab()
+            # visit the page
+            page.get(url)
+            html = page.html  # capture the rendered DOM before closing the tab
+            page.close()
+            return html
+        except Exception as e:
+            print('e', e)
+            return ''
+        finally:
+            try:
+                pass
+                # page.close()
+            except:
+                pass
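This swaps the requests-backed SessionPage for a real headless Chromium, which is what lets get_page see JavaScript-rendered markup. A standalone sketch of the same fetch pattern, without the retry decorator, using the identifiers as they appear in the diff:

    from DrissionPage import Chromium
    from DrissionPage._configs.chromium_options import ChromiumOptions

    co = ChromiumOptions()
    co.no_imgs(True).mute(True)      # skip images and mute audio to speed up loads
    co.incognito()                   # throwaway profile
    co.headless()                    # no visible browser window
    co.set_argument('--no-sandbox')  # needed when running as root, e.g. in Docker
    co.auto_port(on_off=True)        # pick a free debugging port per instance

    tab = Chromium(addr_or_opts=co).new_tab()
    tab.get("https://example.com")
    html = tab.html                  # read the rendered DOM before closing
    tab.close()
    print(len(html))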

htmldata_get/tools/deal_html.py (20 changes)

@@ -95,7 +95,7 @@ def extract_blocks(html):
     # 👉 title (prefer short text)
     for t in tag.find_all(["h1", "h2", "h3", "p", "span", "div"]):
         txt = t.get_text(strip=True)
-        if len(txt) >=10:
+        if len(txt) >= 10:
             print(txt)
             results.append({
                 "text": txt,
@@ -103,19 +103,29 @@
             break
     return results

-def extract_chinese_and_english(html):
-    if not html:
-        return [], []
+def extract_chinese_and_english(html, url):
     # 1. match Chinese text (general version)
     # [\u4e00-\u9fff] covers common simplified/traditional Han characters
     # \u3400-\u4dbf covers Extension A (rare and archaic characters)
     # re.UNICODE is used for encoding compatibility
     chinese_pattern = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf]+', re.UNICODE)
     chinese_list = chinese_pattern.findall(html)
-    return chinese_list
+    chinese_list = set(chinese_list)  # deduplicate the matched runs
+    soup = BeautifulSoup(html, "lxml")
+    title = soup.title.string if soup.title else ""
+    return {
+        'url': url,
+        "title": title,
+        "keywords": '',
+        "description": '',
+        "content": ','.join(chinese_list)
+    }

 def deal_html(html, url):
     soup = BeautifulSoup(html, "lxml")
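The rewritten extractor now returns a page record (url, title, keywords, description, content) instead of a bare list of matches, which is what lets main.py swap it in for dynamic pages. A small sketch of what it yields; the sample HTML and URL are made up, and since the matches pass through a set, the join order of content is arbitrary:

    from tools.deal_html import extract_chinese_and_english

    html = "<html><head><title>测试页</title></head><body><p>你好,世界</p></body></html>"
    record = extract_chinese_and_english(html, "https://example.com")
    # -> {'url': 'https://example.com', 'title': '测试页', 'keywords': '',
    #     'description': '', 'content': '你好,世界,测试页'}  # content order is arbitrary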
