import requests
import urllib
import os
from urllib import parse
from lxml import etree
from concurrent.futures import ThreadPoolExecutor, wait
def get_html(url, encoding="GBK", timeout=15):
    """Fetch *url* and return its decoded HTML text.

    Args:
        url: Page URL to fetch.
        encoding: Response encoding to force; defaults to "GBK" because the
            target site (pic.netbian.com) serves GBK-encoded pages.
        timeout: Seconds before the request is aborted. A timeout is required
            here: this is called from worker threads, and a hung connection
            would otherwise block a pool slot forever.

    Returns:
        The response body as a str.

    Raises:
        requests.HTTPError: if the server returns an error status.
        requests.RequestException: on connection failure or timeout.
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()  # fail loudly instead of parsing an error page
    res.encoding = encoding
    return res.text
def get_img_li(html):
    """Extract detail-page links from a listing page.

    Args:
        html: Raw HTML of a listing page.

    Returns:
        List of href strings found under ``ul.clearfix`` anchors
        (site-relative paths; callers join them against the base URL).
    """
    document = etree.HTML(html)
    return document.xpath('//ul[contains(@class,"clearfix")]//a/@href')
def down_img(url):
    """Download the full-size image from a single detail page into ./img/.

    Args:
        url: Absolute URL of an image detail page.

    Side effects:
        Creates ./img/ if missing and writes one .jpg file into it.
    """
    html = get_html(url)
    tree = etree.HTML(html)
    titles = tree.xpath('//div[contains(@class,"photo-pic")]/a/img/@data-pic')
    srcs = tree.xpath('//div[contains(@class,"photo-pic")]/a/img/@src')
    if not titles or not srcs:
        # Page layout changed or an error page slipped through; skip rather
        # than crash the worker thread with an IndexError.
        print(f"未找到图片: {url}")
        return
    title, src = titles[0], srcs[0]
    f_url = urllib.parse.urljoin(url, src)
    print(f"开始下载{f_url}")
    res = requests.get(f_url, timeout=30)  # bounded: runs in a worker thread
    res.raise_for_status()
    # exist_ok=True: this function runs in up to 30 threads concurrently, so
    # the old check-then-create pattern raced (two threads pass the exists()
    # check, the second makedirs() raises FileExistsError).
    os.makedirs("./img", exist_ok=True)
    # NOTE(review): @data-pic appears to hold an image URL/path, not a plain
    # title; using it verbatim in the file path would embed slashes. Keep
    # only its base name (without extension) as a filesystem-safe file name.
    name = os.path.splitext(os.path.basename(urllib.parse.urlparse(title).path))[0] or "image"
    with open(f'./img/{name}.jpg', "wb") as f:
        f.write(res.content)
    print(f"完成下载{f_url}")
def down(url, all_href):
    """Download all detail pages concurrently with a 30-thread pool.

    Args:
        url: Base URL used to resolve the relative hrefs.
        all_href: Iterable of (possibly relative) detail-page links.

    Blocks until every download task has finished.
    """
    # Context manager guarantees the pool's worker threads are shut down
    # even if submission fails; the original leaked the executor.
    with ThreadPoolExecutor(30) as pool:
        tasks = [
            pool.submit(down_img, urllib.parse.urljoin(url, h))
            for h in all_href
        ]
        wait(tasks)
def main(url):
    """Scrape one listing page: collect detail links, then download each image.

    Args:
        url: Listing-page URL to scrape.
    """
    listing_html = get_html(url)
    detail_links = get_img_li(listing_html)
    down(url, detail_links)
    print("任务完成")
if __name__ == "__main__":
    # Entry point: scrape the mobile-wallpaper listing page.
    main("https://pic.netbian.com/shoujibizhi/")