网址:https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4
运行如下:
结果:
源码:
from bs4 import BeautifulSoup
import requests
import os
import urllib.request
import random
import time
# Pool of browser User-Agent strings; one is chosen at random per request.
# Each entry is assembled with implicit adjacent-literal concatenation and an
# explicit trailing space on every fragment, so the tokens stay separated.
# (A backslash continuation inside a string literal joins the lines with no
# separator at all, which glues tokens together.)
user_agent = [
    'Mozilla/5.0 (Windows NT 6.1) '
    'AppleWebKit/537.11 (KHTML, like Gecko) '
    'Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/47.0.2526.106 Safari/537.36',
    'Mozilla/5.0 '
    '(Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0',
    'Mozilla/5.0 '
    '(X11; Linux x86_64) AppleWebKit/537.17 (KHTML, like Gecko) '
    'Chrome/24.0.1312.56 Safari/537.17',
    'Mozilla/5.0 '
    '(Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0',
]
# Build the URLs of the 100 listing pages to crawl. The first page is
# requested without paging parameters; every following page carries a
# `start` offset that advances by 20 per page (20, 40, ..., 1980).
url = ['https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?']
url.extend(
    'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start={0}&type=T'.format(page * 20)
    for page in range(1, 100)
)
print("url is done!")
# Crawl the listing pages in order and save each book's cover image into the
# current working directory as "<title>.jpg".
b = 0  # number of covers downloaded so far
for page_url in url:
    # Wait one second before every page request.
    time.sleep(1)
    header = {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        # A User-Agent is picked at random for each page request.
        'User-Agent': random.choice(user_agent),
    }
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(page_url, headers=header, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")
    items = soup('li', 'subject-item')
    if not items:
        # No book entries on this page: stop crawling further pages.
        break
    for item in items:
        # Resolve each step separately so that an entry missing any piece of
        # markup is skipped instead of raising AttributeError/TypeError.
        pic = item.find('div', 'pic')
        info = item.find('div', 'info')
        img = pic.img if pic is not None else None
        link = info.a if info is not None else None
        src = img.get('src') if img is not None else None
        title = link.get('title') if link is not None else None
        if not src or not title:
            # Entry lacks a cover URL or a title; nothing to fetch or name.
            continue
        # basename() keeps a path separator inside the title from pointing
        # the output file into another directory.
        urllib.request.urlretrieve(src, os.path.basename(title + '.jpg'))
        b += 1
        print("下载%d张" % b)
print("Finish Down %d Picture" % b)