import requests
from bs4 import BeautifulSoup
import re
import os
import json
import sys
downsize = 512
geturl = "https://www.veryicon.com/icons/healthcate-medical/medical-icon-two-color-icon"
baseurl = "https://www.veryicon.com"
headers = {
    # "referer": geturl,
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; "
                  "Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/95.0.4638.54 "
                  "Safari/537.36 Edg/95.0.1020.40"}
save_path = "./savepic/"
def getHtml(url, headers=headers):
    try:
        r = requests.get(url, headers=headers)
        # r.raise_for_status()
        r.encoding = "utf-8"
        return r
    except requests.RequestException:
        print("fetch failed")
def getHtmlCode(url):
    # note: the original passed 'lxml' as the second positional argument,
    # which requests treats as params; headers is what was intended.
    # After this fix the function duplicates getHtml and could be merged.
    try:
        r1 = requests.get(url, headers=headers)
        r1.encoding = 'utf-8'
        return r1
    except requests.RequestException:
        print("fetch failed")
def savefile(url, filename):
    picture = getHtml(url)
    if picture is None:  # fetch failed; nothing to save
        return
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    with open(save_path + filename, "wb") as f:
        f.write(picture.content)
    print(filename + " downloaded")
    picture.close()
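# For very large files, a streaming variant (a sketch, not used by this
# script) avoids holding the whole response body in memory at once:
def savefileStream(url, filename, chunk_size=8192):
    os.makedirs(save_path, exist_ok=True)
    with requests.get(url, headers=headers, stream=True, timeout=10) as r:
        r.raise_for_status()
        with open(save_path + filename, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
    print(filename + " downloaded")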
def parserHtml(url):
    herflist = []
    r = getHtmlCode(url)
    soup = BeautifulSoup(r.text, "html.parser")
    pages = soup.find(name='div', class_='pager')
    if pages is None:
        herflist = herflist + getonepage(soup)
    else:
        # the last page number is the max over the pager links;
        # non-numeric links such as "Next" are skipped.
        # (The original took max() over the characters of one link's
        # text, which breaks for any page count above 9.)
        numlist = pages.find_all(name='a')
        mxnum = max(int(a.text) for a in numlist if a.text.isdigit())
        num = 1
        while num <= mxnum:
            r001 = getHtmlCode(url + "/" + str(num))
            soup001 = BeautifulSoup(r001.text, "html.parser")
            herflist = herflist + getonepage(soup001)
            r001.close()
            num = num + 1
    r.close()
    return herflist
def getonepage(bs4_soup):
    # collect an (href, title) pair for every icon link on one listing page
    herflist = []
    icons = bs4_soup.find(name='div', id='sider-left')
    icons_items = icons.find_all(name='a')
    for icon_one in icons_items:
        if icon_one.parent.name == 'h4':
            herflist.append((icon_one.attrs["href"], icon_one.attrs["title"]))
    return herflist
def downimg(href, filename):
    r = getHtmlCode(baseurl + href)
    soup = BeautifulSoup(r.text, "html.parser")
    # the listing-page title is discarded: the last path segment of the
    # href (minus its extension) is already a URL-safe file name, so the
    # original chain of .replace() calls on the title was dead code
    h_split = href.split('/')
    filename = h_split[len(h_split) - 1].split('.')[0]
    if not os.path.exists(save_path + filename + "_" + str(downsize) + ".png"):
        png_data = soup.find(name='div', class_='sub-downloads')
        png_items = png_data.find_all(name='a')
        for png_dow in png_items:
            add_name = png_dow.text
            add_name = add_name.replace('PNG ', '')
            add_name = add_name.replace('px', '')
            if int(add_name) == downsize:
                # some download links are protocol-relative ("//...")
                if png_dow.attrs["href"][0:2] == "//":
                    savefile("https:" + png_dow.attrs["href"], filename + "_" + add_name + ".png")
                else:
                    savefile(baseurl + png_dow.attrs["href"], filename + "_" + add_name + ".png")
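# If the human-readable title is preferred over the URL slug, a one-step
# sanitizer (a sketch; it uses the re module imported at the top) could do
# the cleanup the discarded .replace() chain attempted:
def cleanName(name):
    name = re.sub(r"[ ,]+", "_", name)  # spaces/commas -> one underscore
    return re.sub(r"['?]", "", name)    # drop quotes and question marks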
def main():
    # cache the scraped (href, title) list as JSON so a re-run can resume
    if not os.path.exists(save_path + "task_down.json"):
        herflist = parserHtml(geturl)
        task_down = json.dumps(herflist)
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        with open(save_path + "task_down.json", "w") as f:
            f.write(task_down)
    else:
        with open(save_path + "task_down.json", 'r') as file_obj:
            herflist = json.loads(file_obj.read())
    for herf, filename in herflist:
        downimg(herf, filename)

if __name__ == "__main__":
    main()
Some categories contain too many icons to download in a single run, so the script caches the task list locally and checks against it before downloading:
# Cache the scraped icon-list pages to a local JSON file;
# if the file already exists, load the data from it instead
if not os.path.exists(save_path + "task_down.json"):
    herflist = parserHtml(geturl)
    task_down = json.dumps(herflist)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    with open(save_path + "task_down.json", "w") as f:
        f.write(task_down)
else:
    with open(save_path + "task_down.json", 'r') as file_obj:
        herflist = json.loads(file_obj.read())
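A further refinement (my suggestion, not part of the original script) would prune the cached task list against the PNGs already on disk, so a resumed run skips finished entries without refetching their pages:
done = set(os.listdir(save_path)) if os.path.exists(save_path) else set()
herflist = [(h, t) for h, t in herflist
            if h.split('/')[-1].split('.')[0] + "_" + str(downsize) + ".png" not in done]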
The following checks whether the image file already exists locally, and downloads it only if it does not:
if not os.path.exists(save_path + filename + "_512.png"):
    png_data = soup.find(name='div', class_='sub-downloads')
    png_items = png_data.find_all(name='a')
    for png_dow in png_items:
        add_name = png_dow.text
        add_name = add_name.replace('PNG ', '')
        add_name = add_name.replace('px', '')
        if int(add_name) == 512:
            savefile(baseurl + png_dow.attrs["href"], filename + "_" + add_name + ".png")
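The category URL and icon size are hard-coded at the top of the script; as an optional tweak (again my suggestion, not part of the original), they could be read from the command line via the sys module that is already imported:
if len(sys.argv) > 1:   # e.g. python download_icons.py <category-url> [size]
    geturl = sys.argv[1]
if len(sys.argv) > 2:
    downsize = int(sys.argv[2])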