from bs4 import BeautifulSoup
import requests
import json
class Spider():
    """Crawl book-download pages from d4j.cn and dump the results to JSON.

    Iterates over a fixed range of download-page IDs, extracts each book's
    title, file name, extraction password and download link, and writes all
    collected records to ``allbook.json``.
    """

    def __init__(self):
        # Pages to crawl: download.php?id=10000 .. 19999.
        self.urls = ['https://www.d4j.cn/download.php?id={}'.format(i)
                     for i in range(10000, 20000)]
        # Accumulated book records (list of dicts), written out at the end.
        self.allbook = []
        # Logged-in session cookie copied from a browser; presumably required
        # for the download links to be visible — TODO confirm.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
                        'Cookie': 'Hm_lvt_fc36dd13d291f50d4944e1947213dcc0=1583824277,1584262814; __gads=ID=f073b0179e5c7628:T=1583824282:S=ALNI_MbjUXHUJZtX0Ub2eDGbZ-lDPStrLA; wordpress_logged_in_465e1b74d84ae866dbadf45fe0ad17ac=xiejunyan%7C1585641184%7Cgw4Mpq9HnpCxLLUiFZ566OPppiEhXYZNTyBGIjdNlgn%7C78ff3743a1bfa3aad5cc76a52d3b6fa49234027ba144dea4780f07b0d5c2b704; wp-settings-time-23618=1584516125; Hm_lpvt_fc36dd13d291f50d4944e1947213dcc0=1584588726; wordpress_test_cookie=WP+Cookie+check'
                        }
        # One shared TCP session: connection reuse across all 10k requests.
        self.s = requests.Session()

    def _fetch_contents(self, url):
        """Fetch *url* and return its HTML text, or None on any network error.

        Best-effort: a page that times out or fails is skipped, not fatal.
        """
        try:
            response = self.s.get(url, headers=self.headers, timeout=5)
        except requests.RequestException:
            # Narrowed from a bare except: only network-level failures are
            # expected here; anything else should surface.
            return None
        return response.text

    def _analysis(self, html):
        """Parse one page's HTML and append its book record to self.allbook.

        Pages that failed to fetch (html is None) or that lack a download
        link are silently skipped.
        """
        if html is None:
            return
        soup = BeautifulSoup(html, 'lxml')
        try:
            title = soup.select('.plus_l > ul:nth-child(1) > li:nth-child(1)')
            name = soup.select('.plus_l > ul:nth-child(1) > li:nth-child(3)')
            password = soup.select('.plus_l > ul:nth-child(1) > li:nth-child(4) > font:nth-child(1)')
            sourse = soup.select('span.downfile:nth-child(4) > a:nth-child(1)')
            if sourse:
                # NOTE(review): 'sourse' is a typo for 'source', but it is the
                # key emitted in the JSON output — kept for compatibility.
                data = {'title': title[0].get_text(),
                        'name': name[0].get_text(),
                        'password': password[0].get_text(),
                        'sourse': sourse[0].get('href'),
                        }
                self.allbook.append(data)
        except IndexError:
            # Page layout differs from the expected template (a selector
            # matched nothing); skip this page.
            pass

    def _write_json(self):
        """Dump all collected records to allbook.json (UTF-8, human-readable)."""
        with open('allbook.json', 'w', encoding='utf-8') as file:
            file.write(json.dumps(self.allbook, indent=2, ensure_ascii=False))

    def go(self):
        """Crawl every URL in sequence, then write the aggregated JSON file."""
        for num, url in enumerate(self.urls):
            html = self._fetch_contents(url)
            self._analysis(html)
            print(num)  # progress indicator
        self._write_json()
if __name__ == '__main__':
    # Guarded entry point: importing this module no longer kicks off a
    # 10,000-page crawl as a side effect.
    spider = Spider()
    spider.go()
这个爬虫结构和使用的方法都比较简单,使用requests请求,BeautifulSoup解析,最后写入文件。所以就不多注释了。
关键是挺实用的。
在这个过程中遇到了：'gbk' codec can't encode character '\xXX' in position XX的问题。我从下面两篇文章中得到了解答：
https://blog.csdn.net/yq0632/article/details/80254587?depth_1-utm_source=distribute.pc_relevant.none-task&utm_source=distribute.pc_relevant.none-task
https://www.cnblogs.com/feng18/p/5646925.html
作者:小云同学