脚本学习笔记:Python 爬虫示例 —— 抓取 diskgirl.com 上的图片(按文章分目录保存)。
import requests
import re
import os
import json
import time
import random
# Pool of desktop-browser User-Agent strings. One entry is picked at random
# (see the headers dict below) so requests look less like an automated bot.
UserAgent_List = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# Shared request headers for every HTTP call in this script.
# NOTE(review): random.choice runs exactly once, at module load — every request
# in a run reuses the SAME User-Agent. Move the choice inside the loop if
# per-request rotation is actually wanted.
headers = {
"User-Agent": random.choice(UserAgent_List),
'Proxy-Connection': 'keep-alive'
}
# Main crawl loop: walk listing pages 1..46, find every article on each page,
# then download every image in each article into /app/down/<article title>/.
# A network error on a single resource is logged and skipped so one bad URL
# does not abort the whole run. (Indentation restored — the pasted original
# had lost it entirely and was not valid Python.)
for page in range(1, 47):
    r = requests.get(
        "https://diskgirl.com/imageslist?page={}".format(page), headers=headers)
    # Decode the listing page once and reuse it for both regexes
    # (the original called r.content.decode() twice per page).
    html = r.content.decode()
    # Article titles shown on the listing page.
    img_title = re.findall(
        r"<h2 class=\"imagetuijian\">(.*?)</h2>", html, re.S)
    # Relative links to the article pages that hold the images.
    img_list = re.findall(
        r"<div class=\"col-md-4\"><div class=\"card nopadding images-relative\"><a href=\"(.*?)\">", html, re.S)
    # zip() stops at the shorter list, so a count mismatch between titles and
    # links can no longer raise IndexError (the original indexed img_title[i]).
    for i, (title, link) in enumerate(zip(img_title, img_list)):
        # Build the absolute article URL from the relative link.
        page_url = "https://diskgirl.com" + link
        try:
            r1 = requests.get(page_url, headers=headers, timeout=2)
            # The article page embeds a JS loop "for(...; i< N; ...)" whose
            # bound N is the number of images — scrape N out of the script.
            img_counts = re.findall(r"i< (.*?);", r1.content.decode(), re.S)
        except requests.exceptions.RequestException:
            # Best-effort fallback kept from the original: blindly try up to
            # 100 images when the article page itself cannot be fetched.
            img_counts = ["100"]
        if not img_counts:
            # Regex found no image count — skip instead of crashing on [0].
            print("资源错误,已跳过")
            continue
        print("当前在第:" + str(page) + "页,第" + str(i) + "个", title)
        print("当前文章共计图片" + img_counts[0] + "张")
        # One folder per article title; exist_ok makes re-runs after a partial
        # crawl safe instead of raising FileExistsError.
        # NOTE(review): the title is used verbatim as a path component —
        # assumes it contains no '/' or other path-hostile characters; verify.
        path = "/app/down/" + title
        os.makedirs(path, exist_ok=True)
        for j in range(int(img_counts[0])):
            # Image files live under /images/<id>/<j>.jpg while the article
            # link uses /image/<id> — hence the replace().
            img_url = "https://diskgirl.com" + link.replace("image", "images") + "/" + str(j) + ".jpg"
            print(img_url)
            img_name = path + "/" + str(j) + ".jpg"
            try:
                res = requests.get(img_url, stream=True,
                                   headers=headers, timeout=2)
                start = time.time()  # download start time
                chunk_size = 1024    # bytes streamed per iteration
                if res.status_code == 200:
                    # Total size, used only for the progress message.
                    content_size = int(res.headers['content-length'])
                    print('Start download,[File size]:{size:.2f} MB'.format(
                        size=content_size / chunk_size / 1024))
                    # Stream to disk in chunks; the with-block closes the file
                    # even if iter_content raises mid-transfer.
                    with open(img_name, 'wb') as file:
                        for data in res.iter_content(chunk_size=chunk_size):
                            file.write(data)
                    print('Download completed!,times: %.2f秒' % (time.time() - start))
            except requests.exceptions.RequestException:
                print("资源错误,已跳过")
快速运行(将本脚本保存为 woc.py 后,在同目录执行):
docker run -it --rm -v "$(pwd)":/app -w /app python:3.8 sh -c "pip install requests && python woc.py"
原文出处:(原始链接在转载时丢失,待补充)