导入模块
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
加载文件
# Read the house-price CSV (Beijing data, judging by the file name) into
# the DataFrame that every later step works on.
csv_path = r'D:\学习文件\python案例\Python案例\beijing_houst_price.csv'
df = pd.read_csv(csv_path)
查看文件
# Preview the first rows of the loaded table.
df.head()
# Common summary statistics for the numeric columns.
df.describe()
# Number of non-null values in each column.
df.count()
# Rows that are exact duplicates of an earlier row.
fully_duplicated = df.duplicated()
df.loc[fully_duplicated]
# Assuming the `id` values themselves are correct, list rows whose id
# has already appeared.
repeated_id = df['id'].duplicated()
df.loc[repeated_id]
# Begin cleaning the tradeTime column: how often does each value occur?
trade_time_raw = df['tradeTime']
trade_time_raw.value_counts()
# Check the dtype of every column.
df.dtypes
# Convert tradeTime to datetimes so the `.dt` accessor can be used below.
df['tradeTime'] = pd.to_datetime(trade_time_raw)
# Number of rows per trade year.
df['tradeTime'].dt.year.value_counts()
# Drop trades dated before 2012 (and after 2017).
# NOTE(review): the original statement was garbled -- it read
# `.dt.year 2017` with no comparison operator and ended in stray
# code-fence backticks, so it could not even be parsed. It looks like two
# statements (`year < 2012` and `year > 2017`) were fused when the text
# between `<` and `>` was stripped; confirm the upper bound is intended.
trade_year = df['tradeTime'].dt.year
outside_range = (trade_year < 2012) | (trade_year > 2017)
df.drop(df[outside_range].index, inplace=True)
# Listings whose totalPrice is below 50 (the original note describes these
# as homes under 500,000, i.e. the column is in units of 10,000).
low_price_rows = df.loc[df['totalPrice'] < 50]
low_price_rows
# Remove those low-priced listings from the working table.
df.drop(low_price_rows.index, inplace=True)
# Are there rows with no communityAverage value?
missing_community = df.loc[df['communityAverage'].isna()]
missing_community
# Remove the rows whose communityAverage is missing as well.
df.drop(missing_community.index, inplace=True)
# Look at the data again after cleaning.
df.head()
df.describe()
# Average `price` of the listings traded on each date.
# Select the column *before* aggregating: calling .mean() on the whole
# grouped frame averages every column, which is wasted work and raises
# TypeError on pandas >= 2.0 when the frame has non-numeric columns.
df_price = df.groupby('tradeTime')['price'].mean()
# Show the result as a trend (line) chart.
df_price.plot()
![每日房源均价趋势图](https://img-blog.csdnimg.cn/20200502123925420.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L1JvY2t5XzUwOA==,size_16,color_FFFFFF,t_70)
# Number of listings per total-price range, using 2017 as the example year.
# First store the trade year in its own column, then keep only 2017 rows.
df['year'] = df['tradeTime'].dt.year
is_2017 = df['year'].eq(2017)
df_2017 = df[is_2017]
# Take a look at the selected rows.
df_2017
# Plot the distribution so it is easier to read.
# Bin edges run 50, 100, ..., 1950; pd.cut builds right-closed intervals,
# so values equal to 50 or above 1950 fall outside every bin and are not
# counted.
bins_arr = np.arange(50, 2000, 50)
bins = pd.cut(df_2017['totalPrice'], bins_arr)
# observed=False is passed explicitly: empty bins stay in the result (and
# on the plot) regardless of the pandas version's default, and the
# FutureWarning about that default changing is avoided.
bin_count = df_2017['totalPrice'].groupby(bins, observed=False).count()
bin_count.plot()
![2017年 totalPrice 各区间房源数](https://img-blog.csdnimg.cn/20200502124313167.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L1JvY2t5XzUwOA==,size_16,color_FFFFFF,t_70)
# Distribution of the 2017 `price` column.
# Bin edges run 5000, 10000, ..., 150000 (right-closed intervals); values
# outside that range are not counted.
bins_arr = np.arange(5000, 155000, 5000)
bins = pd.cut(df_2017['price'], bins_arr)
# observed=False is passed explicitly so empty bins are kept no matter
# which default the installed pandas uses for categorical groupers.
price_count = df_2017['price'].groupby(bins, observed=False).count()
price_count
price_count.plot()
![2017年 price 各区间房源数](https://img-blog.csdnimg.cn/20200502124322621.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L1JvY2t5XzUwOA==,size_16,color_FFFFFF,t_70)
# Distribution of the 2017 `square` column.
# Bin edges run 10, 20, ..., 200 (right-closed intervals); values outside
# that range are not counted.
bins_arr = np.arange(10, 210, 10)
bins = pd.cut(df_2017['square'], bins_arr)
# observed=False is passed explicitly so empty bins are kept no matter
# which default the installed pandas uses for categorical groupers.
square_count = df_2017['square'].groupby(bins, observed=False).count()
square_count
square_count.plot()
![2017年 square 各区间房源数](https://img-blog.csdnimg.cn/20200502124332817.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L1JvY2t5XzUwOA==,size_16,color_FFFFFF,t_70)
Rocky_508
原创文章 2获赞 2访问量 41
关注
私信
展开阅读全文