实现目标:爬取某地历史天气情况(以深圳2019年为例)
需要的库:requests,bs4,pandas
PS:requests 和 bs4 库都很小,大约只有 150K
第一步:确定目标 url
第二步:获取网页源代码
# Download the history page for Shenzhen, January 2019.
url = 'http://www.tianqihoubao.com/lishi/shenzhen/month/201901.html'
# Without a timeout, requests may wait on an unresponsive server forever.
req = requests.get(url, timeout=10)
# Fail fast on 4xx/5xx instead of parsing an error page as weather data.
req.raise_for_status()
# NOTE(review): req.text is decoded with the charset requests infers from the
# response; if the Chinese text comes out garbled, confirm the page's real
# encoding and set req.encoding before reading req.text.
html = req.text
第三步:数据提取,提取自己需要的内容
# Parse the page and collect one entry per table row.
soup = BeautifulSoup(html, 'html.parser')
tr_list = soup.find_all('tr')
datas, condition, temp = [], [], []
# One-pass replacement: 年 and 月 become '/', 日 is removed
# (e.g. 2019年01月01日 -> 2019/01/01). Optional; adapt to your own needs.
date_table = str.maketrans({'年': '/', '月': '/', '日': None})
for data in tr_list[1:]:  # skip the first <tr> (presumably the table header)
    sub_data = data.text.split()
    if not sub_data:
        # A row without any text would make sub_data[0] raise IndexError.
        continue
    datas.append(sub_data[0].translate(date_table))
    # Tokens 1-2 are stored as the weather condition, tokens 3-5 as the
    # temperature; each group is glued together without separators.
    condition.append(''.join(sub_data[1:3]))
    temp.append(''.join(sub_data[3:6]))
打印效果如下:
第四步:保存数据
# Assemble the three collected lists into a table, one column per list,
# in the order date / weather condition / temperature.
_data = pd.DataFrame({
    '日期': datas,
    '天气状况': condition,
    '温度': temp,
})
# Write the table to disk without the row-index column.
_data.to_csv('深圳2019.01天气记录.csv', encoding='utf-8', index=False)
效果如下:
以下是爬取一个月和多个月数据的完整代码。
import requests
from bs4 import BeautifulSoup
import pandas as pd
'''====================================深圳2019.1历史天气数据======================================'''
# Target URL: Shenzhen history page for January 2019.
url = 'http://www.tianqihoubao.com/lishi/shenzhen/month/201901.html'
# Fetch the page source. Without a timeout, requests may wait on an
# unresponsive server forever.
req = requests.get(url, timeout=10)
# Fail fast on 4xx/5xx instead of parsing an error page as weather data.
req.raise_for_status()
html = req.text
soup = BeautifulSoup(html, 'html.parser')
tr_list = soup.find_all('tr')
datas, condition, temp = [], [], []
# One-pass replacement: 年 and 月 become '/', 日 is removed
# (e.g. 2019年01月01日 -> 2019/01/01).
date_table = str.maketrans({'年': '/', '月': '/', '日': None})
for data in tr_list[1:]:  # skip the first <tr> (presumably the table header)
    sub_data = data.text.split()
    if not sub_data:
        # A row without any text would make sub_data[0] raise IndexError.
        continue
    datas.append(sub_data[0].translate(date_table))
    # Tokens 1-2 are stored as the weather condition, tokens 3-5 as the
    # temperature; each group is glued together without separators.
    condition.append(''.join(sub_data[1:3]))
    temp.append(''.join(sub_data[3:6]))
# Save the data.
_data = pd.DataFrame()  # create an empty table
_data['日期'] = datas  # add the collected columns to the table
_data['天气状况'] = condition
_data['温度'] = temp
_data.to_csv('深圳2019.01天气记录.csv', index=False, encoding='utf-8')
'''====================================深圳2019 1——4月的历史天气数据======================================'''
# Fetch one monthly history page and turn its table rows into a DataFrame.
def get_data(url: str, timeout: float = 10) -> pd.DataFrame:
    """Download a monthly weather page and return its rows as a table.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, requests may block on an unresponsive server forever.

    Returns:
        A DataFrame with the columns '日期' (date, reformatted as
        year/month/day), '天气状况' (weather condition) and '温度'
        (temperature), one row per non-empty ``<tr>`` after the first.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: If the request fails or times out.
    """
    req = requests.get(url, timeout=timeout)
    # Fail fast instead of parsing an error page as weather data.
    req.raise_for_status()
    # NOTE(review): req.text is decoded with the charset requests infers from
    # the response; if the Chinese text comes out garbled, confirm the page's
    # real encoding and set req.encoding before reading req.text.
    html = req.text

    # Data extraction.
    soup = BeautifulSoup(html, 'html.parser')
    tr_list = soup.find_all('tr')
    datas, condition, temp = [], [], []
    # One-pass replacement: 年 and 月 become '/', 日 is removed.
    date_table = str.maketrans({'年': '/', '月': '/', '日': None})
    for data in tr_list[1:]:  # skip the first <tr> (presumably the header)
        sub_data = data.text.split()
        if not sub_data:
            # A row without any text would make sub_data[0] raise IndexError.
            continue
        datas.append(sub_data[0].translate(date_table))
        # Tokens 1-2 are stored as the weather condition, tokens 3-5 as the
        # temperature; each group is glued together without separators.
        condition.append(''.join(sub_data[1:3]))
        temp.append(''.join(sub_data[3:6]))

    # Collect the three lists into a table.
    _data = pd.DataFrame()
    _data['日期'] = datas
    _data['天气状况'] = condition
    _data['温度'] = temp
    return _data
# Scrape January through April 2019, one page per month, in calendar order.
monthly_urls = (
    'http://www.tianqihoubao.com/lishi/shenzhen/month/201901.html',
    'http://www.tianqihoubao.com/lishi/shenzhen/month/201902.html',
    'http://www.tianqihoubao.com/lishi/shenzhen/month/201903.html',
    'http://www.tianqihoubao.com/lishi/shenzhen/month/201904.html',
)
data_01, data_02, data_03, data_04 = map(get_data, monthly_urls)
# Stack the four monthly tables; ignore_index=True renumbers the rows from 0
# so the per-month indexes do not repeat in the merged table.
data = pd.concat([data_01, data_02, data_03, data_04], ignore_index=True)
data.to_csv('深圳2019.01-04月天气记录.csv', encoding='utf-8', index=False)