user_agent = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
# user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
headers = {
'User-Agent': random.choice(user_agent)
}
The approach below requires installing the fake_useragent package first (e.g. pip install fake-useragent):
from fake_useragent import UserAgent
ua = UserAgent()
requestHeader = {
'User-Agent': ua.random
}
#Read the proxy IP configuration file
# Randomly pick a proxy dict from the list until a usable IP is matched
txtPath = "AgencyIP.txt"
def GetIPList():
    data = []
    with open(txtPath, "r") as f:
        for line in f:  # iterate over the proxy file line by line
            line = line.strip()  # drop the trailing newline
            dic = {'http': line}
            data.append(dic)
    # print(data)
    return data
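For reference, GetIPList assumes AgencyIP.txt holds one proxy address per line (the addresses below are placeholders, not real proxies), for example:

http://123.45.67.89:8080
http://98.76.54.32:3128

which GetIPList() then turns into requests-style proxy dicts:

[{'http': 'http://123.45.67.89:8080'}, {'http': 'http://98.76.54.32:3128'}]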
Wrapping requests.get
Check whether the fetch succeeded via page.status_code (the HTTP status code);
page.status_code == 200 means success.
time.sleep()
The sleep function comes with Python's built-in time module
and has to be imported first:
import time
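As a quick illustration (a minimal sketch, not part of the scraper itself), a randomized pause between requests looks like this:

import random
import time

time.sleep(random.randint(1, 3))  # sleep for a random 1-3 seconds between requests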
def getPage(url, header='', proxies=''):
    try:
        time.sleep(random.randint(1, 3))  # pause 1-3 seconds using Python's built-in time module
        iplist = GetIPList()
        ua = UserAgent()
        requestHeader = {
            'User-Agent': ua.random
        }
        # pick a random proxy
        proxy = random.choice(iplist)
        page = requests.get(url, headers=requestHeader, proxies=proxy, timeout=30)
        print(proxy)
        if page.status_code != 200:
            print("Network error, waiting to retry!")
            while 1:
                # pick another random proxy
                proxy = random.choice(iplist)
                time.sleep(5)
                page = requests.get(url, headers=requestHeader, proxies=proxy, timeout=30)
                if page.status_code == 200:
                    print("Page fetched successfully!")
                    break
                else:
                    print("Failed to fetch page, waiting to retry!")
        return page
    except Exception as e:
        print(e)
        return ""
Fetching the page and converting the encoding
This is where the page source's encoding is handled.
#Fetch the page source
def getHTML(url):
    try:
        r = getPage(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding  # decode using the encoding guessed from the content
        return r.text
    except Exception as e:
        print(e)
        print('gethtml_error')
        return ""
First, grab all of the image links within a single chapter.
The tricky part: what you see with F12 in the browser looks like the snippet below (the image elements are assembled at runtime by the page's own JavaScript), so I went straight to writing code to search the page source for them. Nothing matched no matter how I searched, and at first I suspected my regular expression.
// ... (the imgStr HTML string that builds the <img> tags is truncated here)
var imgInfo = '(' + page + '/' + SinMH.getChapterImageCount() + ')';
var loading = $('#imgLoading');
loading.show();
if (getChapterScroll()) {
chapter.imageList.append(imgStr);
} else {
$("#page").text(page);
chapter.imageList.html(imgStr);
}
var lastObj = chapter.imageList.find('img').last();
lastObj.hide();
var showAndPreload = function () {
lastObj.show();
loading.hide();
if (chapter.scrollStart > 1) {
chapter.imageList.prepend('查看完整章节');  // prepend a "view full chapter" link (original markup truncated)
chapter.scrollStart = 0;
}
SinMH.getChapterImageCount() > 0 && chapter.imageList.append(imgInfo);
renderPageSelect(SinMH.getChapterImageCount(),chapter.currentIndex,'.pageSelect');
preloadImage();
};
if (getChapterAutoSize()) {
lastObj.imgAutoSize(chapter.imageList.width(), 0, function () {
showAndPreload();
});
} else {
$.preLoadImg(imgUrl, function () {
showAndPreload();
});
}
}
Looking at the HTML source via F12, something much more interesting turns up inside a script tag:
var chapterImages (an array of image file names)
var chapterPath
Example: https://res.xiaoqinre.com/images/comic/123/244953/1512990454Oy96QbmOEIO_NL2R.jpg
This URL can be read as three segments:
1: https://res.xiaoqinre.com/
2: images/comic/123/244953/
3: 1512990454Oy96QbmOEIO_NL2R.jpg
(A short sketch of reassembling these segments follows the variable dump below.)
;var siteName = "";var siteUrl = "https://www.gufengmh8.com";;var chapterImages = ["1512990451xQbK-tMVcpYvvW-Q.jpg","1512990453c0RNuvuhKMRfy_Mt.jpg","1512990454Oy96QbmOEIO_NL2R.jpg","1512990455z5eemhDedrAWj7DJ.jpg","1512990456nQKlABXzA-7hk3U2.jpg","1512990458UWSPTY4isZA0Le-5.jpg","1512990459rL2yAqtQ5JoagL5x.jpg"];
var chapterPath = "images/comic/123/244953/";
var pageTitle = "堕玄师序章在线观看";
var comicUrl = "https://www.gufengmh8.com/manhua/duoxuanshi/";
var pageUrl = "https://www.gufengmh8.com/manhua/duoxuanshi/";
var pageImage = "https://res.xiaoqinre.com/images/cover/201712/1513565570IVbMltgdBGcHMO3f.jpg";var pageDomain = "https://www.gufengmh8.com";var pageId = "comic.7099";
var prevChapterData ={"id":null,"comic_id":null,"comic_name":null,"status":null,"vip":null,"is_end":null,"name":null,"type":null,"rtl":null,"image_mode":null,"category":null,"link":null,"link_name":null,"image_type":null,"count":null,"sort":null,"price":null,"created_at":null,"updated_at":null};var nextChapterData = {"id":245852,"comic_id":7099,"comic_name":"堕玄师","status":1,"vip":0,"is_end":0,"name":"第1话 九荒镜","type":0,"rtl":0,"image_mode":0,"category":1,"link":"","link_name":"","image_type":8,"count":33,"sort":999,"price":0,"created_at":1513408087,"updated_at":1513408131};
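To make the three-segment structure concrete, here is a minimal sketch of joining the pieces back together, using the chapterPath value and the first two chapterImages entries from the dump above (the fixed image host comes from the example URL):

chapterPath = "images/comic/123/244953/"
chapterImages = ["1512990451xQbK-tMVcpYvvW-Q.jpg", "1512990454Oy96QbmOEIO_NL2R.jpg"]

base = 'https://res.xiaoqinre.com/'  # segment 1, the image host
urls = [base + chapterPath + name for name in chapterImages]
# -> ['https://res.xiaoqinre.com/images/comic/123/244953/1512990451xQbK-tMVcpYvvW-Q.jpg', ...]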
With that, the code came together smoothly.
#Return the list of image URLs for the current chapter
def getImagelist(url):
    html = getHTML(url)
    soup = BeautifulSoup(html, 'lxml')
    # title = soup.select('body > script')[0]
    laterbody = soup.find('body')
    try:
        title = laterbody.find('script')
    except:
        print(laterbody)
    # print(html)
    # extract the per-comic middle segment of the path
    imgPathRe = re.compile(r'chapterPath.*?".*?"')
    imgPathchar = imgPathRe.findall(str(title))  # e.g. ['chapterPath = "images/comic/568/1135637/"']
    pathArray = imgPathchar[0].split('=')
    path = pathArray[1].replace('"', '').replace(' ', '')  # images/comic/568/1135637/
    # prepend the image host
    path = 'https://res.xiaoqinre.com/' + path
    print(path)
    # extract the list of image file names
    listre = re.compile(r'("\d.*?jpg")')
    namelist = listre.findall(str(title))  # ['"1574474397PEkNn-r1a038oGyx.jpg"', '"1574474396fO6jANuAPUich78W.jpg"', ...]
    print(namelist)
    _list = []
    # join host + path + file name into the full URL
    for name in namelist:
        name = name.replace('"', '')
        s = path + name
        # print(s)
        _list.append(s)
    print(_list)
    return _list
Getting the link for every chapter
Python's libraries really are powerful here;
BeautifulSoup is one of the essential scraping tools.
#Get the chapter list (table of contents)
def getCatalog(url):
html = getHTML(url)
soup = BeautifulSoup(html, 'lxml')
# laterbody = soup.find('body')
perentdiv = soup.find('ul',attrs={'id':'chapter-list-1'}).find_all('a')
# print(perentdiv)
catlist = []
for a in perentdiv:
p = a['href']
p ='https://www.gufengmh8.com'+p
# print(p)
catlist.append(p)
return catlist
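Putting the two helpers together, a minimal sketch of walking a comic's catalog and collecting every image URL (this mirrors the commented-out loop in main() below; the URL is the example comic from the variable dump above):

catlist = getCatalog('https://www.gufengmh8.com/manhua/duoxuanshi/')
all_images = []
for caturl in catlist:
    all_images.extend(getImagelist(caturl))
print(len(all_images))  # total number of pages across all chapters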
The full code is below:
import re
import time
import random
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
#Scraper targeting the Gufeng manhua site (古风漫画网)
#https://www.gufengmh8.com/
user_agent = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
# user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
headers = {
'User-Agent': random.choice(user_agent)
}
# Randomly pick a proxy dict from the list until a usable IP is matched
txtPath = "AgencyIP.txt"
def GetIPList():
    data = []
    with open(txtPath, "r") as f:
        for line in f:  # iterate over the proxy file line by line
            line = line.strip()  # drop the trailing newline
            dic = {'http': line}
            data.append(dic)
    # print(data)
    return data
def getPage(url, header='', proxies=''):
    try:
        time.sleep(random.randint(1, 3))  # pause 1-3 seconds using Python's built-in time module
        iplist = GetIPList()
        ua = UserAgent()
        requestHeader = {
            'User-Agent': ua.random
        }
        # pick a random proxy
        proxy = random.choice(iplist)
        page = requests.get(url, headers=requestHeader, proxies=proxy, timeout=30)
        print(proxy)
        if page.status_code != 200:
            print("Network error, waiting to retry!")
            while 1:
                # pick another random proxy
                proxy = random.choice(iplist)
                time.sleep(5)
                page = requests.get(url, headers=requestHeader, proxies=proxy, timeout=30)
                if page.status_code == 200:
                    print("Page fetched successfully!")
                    break
                else:
                    print("Failed to fetch page, waiting to retry!")
        return page
    except Exception as e:
        print(e)
        return ""
#Fetch the page source
def getHTML(url):
    try:
        r = getPage(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding  # decode using the encoding guessed from the content
        return r.text
    except Exception as e:
        print(e)
        print('gethtml_error')
        return ""
#Return the list of image URLs for the current chapter
def getImagelist(url):
    html = getHTML(url)
    soup = BeautifulSoup(html, 'lxml')
    # title = soup.select('body > script')[0]
    laterbody = soup.find('body')
    try:
        title = laterbody.find('script')
    except:
        print(laterbody)
    # print(html)
    # extract the per-comic middle segment of the path
    imgPathRe = re.compile(r'chapterPath.*?".*?"')
    imgPathchar = imgPathRe.findall(str(title))  # e.g. ['chapterPath = "images/comic/568/1135637/"']
    pathArray = imgPathchar[0].split('=')
    path = pathArray[1].replace('"', '').replace(' ', '')  # images/comic/568/1135637/
    # prepend the image host
    path = 'https://res.xiaoqinre.com/' + path
    print(path)
    # extract the list of image file names
    listre = re.compile(r'("\d.*?jpg")')
    namelist = listre.findall(str(title))  # ['"1574474397PEkNn-r1a038oGyx.jpg"', '"1574474396fO6jANuAPUich78W.jpg"', ...]
    print(namelist)
    _list = []
    # join host + path + file name into the full URL
    for name in namelist:
        name = name.replace('"', '')
        s = path + name
        # print(s)
        _list.append(s)
    print(_list)
    return _list
# image URL list / save path / index of the image to resume from
def down_pic_list(pic_urls, savePath, loadIndex=0):
    """Download every image in the given list of image URLs."""
    nowIndex = 0
    for pic_url in pic_urls:
        nowIndex += 1
        if nowIndex < loadIndex:
            continue
        try:
            pic = getPage(pic_url)
            print(pic)
            print(pic.status_code)
            string = savePath + str(nowIndex) + '.jpg'
            with open(string, 'wb') as f:
                f.write(pic.content)
            print('Downloaded image %s: %s' % (str(nowIndex), str(pic_url)))
        except Exception as e:
            print('Failed to download image %s: %s' % (str(nowIndex), str(pic_url)))
            print(e)
            continue
# image URL / save path / current chapter number
# def down_pic(pic_urls, savePath, index):
#     for pic_url in pic_urls:
#         try:
#             pic = getPage(pic_url)
#             print(pic.status_code)
#             string = savePath + str(index) + '.jpg'
#             with open(string, 'wb') as f:
#                 f.write(pic.content)
#             print('Downloaded image %s: %s' % (str(index), str(pic_url)))
#         except Exception as e:
#             print('Failed to download image %s: %s' % (str(index), str(pic_url)))
#             print(e)
#             continue
#Get the chapter list (table of contents)
def getCatalog(url):
html = getHTML(url)
soup = BeautifulSoup(html, 'lxml')
# laterbody = soup.find('body')
perentdiv = soup.find('ul',attrs={'id':'chapter-list-1'}).find_all('a')
# print(perentdiv)
catlist = []
for a in perentdiv:
p = a['href']
p ='https://www.gufengmh8.com'+p
# print(p)
catlist.append(p)
return catlist
imagUrlPath = "ImageUrl.txt"
def main():
    # catalog page URL of the comic to scrape
    url = 'https://www.gufengmh8.com/manhua/woweicangsheng/'
    # where the downloaded images are saved
    savePath = r'C:\Users\11975\Desktop\AnimeTest/'
    catlist = getCatalog(url)
    loadIndex = 0
    imageList = []
    #
    # for caturl in catlist:
    #     print(caturl)
    #     for img in getImagelist(caturl):
    #         loadIndex += 1
    #         print(img + ' ' + str(loadIndex))
    #         imageList.append(img)
    #         with open(imagUrlPath, "a") as file:  # open with "a" instead of "w" to append
    #             file.write(str(img) + "\n")
    with open(imagUrlPath, "r") as f:
        for line in f:  # read the saved image URLs line by line
            line = line.strip()
            imageList.append(line)
    down_pic_list(imageList, savePath)

if __name__ == '__main__':
    main()
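One closing note on resuming: down_pic_list's loadIndex argument skips everything before that image number, so a partially finished run can be continued with, for example, down_pic_list(imageList, savePath, loadIndex=120).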
Author: igsove