'''Crawler practice, 2020-02-22
Target site: biedoul.com ("别逗了", a joke site)
Requirement: crawl the paginated jokes and, for each page, store the jokes
locally as JSON, with each joke's title as a dict key and its body as the value.
Approach: requests + BeautifulSoup + json; zip() pairs the odd-position items
of a list (the keys) with the even-position items (the values).

Hands-on crawler: joke site
version: 01
author: 金鞍少年
date: 2020-03-22
'''
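
# A minimal illustration of the zip() pairing trick described above,
# using hypothetical sample data (not fetched from the site):
#   flat = ['title A', 'body A', 'title B', 'body B']
#   dict(zip(flat[::2], flat[1::2]))
#   # -> {'title A': 'body A', 'title B': 'body B'}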
import json
import os

import requests
from bs4 import BeautifulSoup

class biedoul:

    def __init__(self, url, count):
        self.url = url
        self.count = int(count)
        self.path = r'./别逗了笑话网/index'  # output file prefix: index<page>.json
        os.makedirs(os.path.dirname(self.path), exist_ok=True)  # make sure the output directory exists
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
            "referer": "https://www.biedoul.com/"
        }

    # Fetch a page and parse it into a BeautifulSoup tree
    def getHtml(self, url):
        res = requests.get(url, headers=self.headers)
        res.raise_for_status()  # fail loudly on a non-200 response instead of silently returning None
        return BeautifulSoup(res.text, 'html.parser')

    # Extract the jokes from one page
    def getcontent(self, html):
        p = html.select("dd")
        content = [i.get_text() for i in p]  # odd positions (1-based) are titles, even positions are bodies
        return dict(zip(content[::2], content[1::2]))  # pair titles (keys) with bodies (values)
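
    # Assumed markup (hypothetical sketch, inferred from how the selector
    # above is used): each joke contributes two consecutive <dd> elements,
    #   <dd>joke title</dd>
    #   <dd>joke body</dd>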

    # Serialize one page's jokes to a local JSON file
    def saveJoke(self, Joke, i):
        path_file = self.path + str(i)  # one JSON file per page of jokes
        with open(path_file + '.json', 'w', encoding='utf-8') as f:
            json.dump(Joke, f, ensure_ascii=False)  # keep the Chinese text human-readable

    # Main flow: fetch, parse, and save each page
    def func(self):
        for i in range(1, self.count + 1):  # page numbers are assumed to start at 1
            url = self.url + str(i) + '/'  # build the paginated URL
            self.saveJoke(self.getcontent(self.getHtml(url)), i)

if __name__ == '__main__':
    b = biedoul('https://www.biedoul.com/wenzi/', 3)  # crawl the first three pages of jokes
    b.func()
    print('Jokes saved successfully!')
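
# Reading a saved page back (sketch; 'index1.json' assumes the 1-based page
# numbering used above):
#   with open('./别逗了笑话网/index1.json', encoding='utf-8') as f:
#       jokes = json.load(f)
#   for title, body in jokes.items():
#       print(title, body)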