The business scenario is this: the admin dashboard template needed an upgrade, because the set we were using is far more complicated than Semantic UI. I wanted to rewrite it with Semantic UI, so I went looking for a suitable template online and found https://yellowred.info/andiamo/andiamo/restaurant-dashboard.html, which looks quite polished and is reasonably feature-rich. That meant downloading all of its files. At first I tried wget, but it could not fetch the resource and stylesheet files, so I wrote this tool myself.
Downloading all of a website's files sounds simple, and in practice it is not especially troublesome either, but there are a few points worth paying attention to, which are handled in the code below:
import os
import traceback  # traceback module, used to print exception information
import concurrent.futures  # thread pool module
import re
import requests
from bs4 import BeautifulSoup  # BeautifulSoup, used to parse HTML
from urllib.parse import urljoin, urlparse  # urljoin builds absolute URLs, urlparse parses URLs
download_config_path = 'downloaded_resources'  # directory where downloaded resources are saved
had_download_resources_set = set()  # URLs already downloaded, to avoid duplicates; for a large project something like Redis could be used instead
# target_url = 'https://yellowred.info/andiamo/andiamo/restaurant-dashboard.html'
target_url = 'https://semantic-ui.com/examples/homepage.html'  # target page to crawl
parse_result = urlparse(target_url)
base_hostname = parse_result.hostname  # hostname, used to decide whether a URL belongs to the target site; URLs on other hosts are not downloaded
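# For example, with the target_url above, urlparse(target_url).hostname evaluates to
# 'semantic-ui.com', so only URLs containing that hostname will be collected later.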
def get_local_path(resource_url):
    path = urlparse(resource_url).path  # path component of the resource URL
    local_path = os.path.join(download_config_path, path.lstrip('/'))  # full local save path
    local_dir = os.path.dirname(local_path)
    if not os.path.exists(local_dir):
        os.makedirs(local_dir)  # create the directory
    return local_path
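# For example, get_local_path('https://semantic-ui.com/examples/homepage.html') returns
# 'downloaded_resources/examples/homepage.html' and creates 'downloaded_resources/examples' if it is missing.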
# Check whether a URL is a valid absolute URL
def is_valid(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
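# For example, is_valid('https://semantic-ui.com/dist/semantic.min.css') is True,
# while is_valid('dist/semantic.min.css') is False because a relative URL has neither a scheme nor a host.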
# Download a single resource
def download_resource(url):
    try:
        local_path = get_local_path(url)
        # Record the URL so the same resource is not fetched twice
        had_download_resources_set.add(url)
        if os.path.exists(local_path):
            return True
        print(f"Downloading {url} to {local_path}")
        response = requests.get(url, stream=True)
        # Stream the response body into the local file
        with open(local_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
        return True
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        return False
# Check whether a path is relative (i.e. not an absolute URL)
def is_relative_path(path):
    pattern = re.compile(r'^(?:https?|ftp|file)://')  # absolute URLs start with a scheme such as http://, ftp:// or file://
    if pattern.search(path):
        return False
    # resources embedded as base64 data: URIs do not need to be downloaded either
    if path.startswith("data:"):
        return False
    return True
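# For example, is_relative_path('themes/default/assets/fonts/icons.woff2') is True,
# is_relative_path('https://cdn.example.com/icons.woff2') is False,
# and is_relative_path('data:image/png;base64,...') is False.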
# Extract resource URLs referenced inside a CSS file
def get_resources_urls_from_css(css_url):
    content = get_url_content(css_url)
    font_urls = []
    # Match url(...) references (fonts, images, ...) in the CSS with a regular expression
    pattern = re.compile(r"url\((.+?)\)", re.IGNORECASE)
    for match in pattern.finditer(content):
        font_url = match.group(1).replace("'", '').replace('"', '')
        if is_relative_path(font_url):
            resource_url = urljoin(css_url, font_url)
            if resource_url in had_download_resources_set:
                continue
            font_urls.append(resource_url)
    return font_urls
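# For example (the CSS URL here is illustrative): if https://semantic-ui.com/dist/semantic.min.css
# contains url(themes/default/assets/fonts/icons.woff2), urljoin resolves it to
# https://semantic-ui.com/dist/themes/default/assets/fonts/icons.woff2 before it is queued for download.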
# Fetch the content of a URL
def get_url_content(resource_url):
    local_path = get_local_path(resource_url)
    if os.path.exists(local_path):  # if the resource was saved before, return it directly; this lets an already-saved page be re-parsed recursively without another request
        with open(local_path, encoding='utf-8') as f:
            return f.read()
    else:
        # If you need a proxy or custom request headers, adjust the requests.get call below
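        # e.g. requests.get(resource_url, headers={'User-Agent': 'Mozilla/5.0'},
        #                   proxies={'https': 'http://127.0.0.1:8080'})  # header and proxy values here are only placeholders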
        print(resource_url)
        response = requests.get(resource_url)
        content = response.content
        with open(local_path, 'wb') as f:
            f.write(content)
        return content.decode("utf-8")
# Recursively collect and download all resources of a page
def get_all_resources(origin_url):
    try:
        # Fetch the page content (re-using the local copy if it has already been saved)
        content = get_url_content(origin_url)
        soup = BeautifulSoup(content, 'html.parser')
        thread_counter = 20  # number of worker threads in the pool
        # Download the files with a thread pool
        with concurrent.futures.ThreadPoolExecutor(max_workers=thread_counter,
                                                   thread_name_prefix='download resources') as executor:
            resources_urls = []
            # Collect every resource URL referenced by the page
            for tag in soup.find_all():
                if tag.has_attr("href"):
                    url = tag['href']
                    if is_valid(url) and base_hostname in url:
                        resources_urls.append(url)
                if tag.has_attr("src"):
                    url = tag['src']
                    if is_valid(url):
                        resources_urls.append(url)
            # Download the page resources with the thread pool
            results = executor.map(download_resource, resources_urls)
            # Recursively crawl the resources referenced from the CSS files
            css_urls = [url for url in resources_urls if url.endswith('.css')]
            for css_url in css_urls:
                font_urls = get_resources_urls_from_css(css_url)
                executor.map(download_resource, font_urls)
        print('All resources downloaded successfully!')
    except Exception as e:
        traceback.print_exc()
if __name__ == "__main__":
    # Replace target_url with the website whose resources you want to download
    get_all_resources(target_url)
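Because executor.map returns an iterator rather than a list, the unused results variable in get_all_resources can also be consumed to report which downloads failed. A minimal sketch, assuming it is placed right after the executor.map call inside the with block (the failed name is illustrative, not part of the original code):

            # zip pairs each URL with the True/False value returned by download_resource
            failed = [u for u, ok in zip(resources_urls, results) if not ok]
            if failed:
                print(f"{len(failed)} resources failed to download:", failed)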
The code above implements recursively crawling all resources from a given page; the main steps are: