透视图与交叉表
在数据分析中,数据透视表是常见的工具之一,需要根据行或列对数据进行各个维度数据的汇总,在pandas中,提供了相关函数解决此类问题
交叉表更多用于频数的分析
pivot_table(data, index, columns, values, aggfunc, fill_value, margins, margins_name=)
data:数据
index: 行分组键
columns:列分组键
values:分组的字段,只能为数值型变量
aggfunc:聚合函数
fill_value: 缺失值填补
margins:是否需要总计 (字段均值/总和)
margins_name: 总计名称
pd.crosstab(index, columns, normalize)
index: 行索引
columns:列索引
normalize:对数据进行标准化,index表示行,column表示列
import pandas as pd
import numpy as np
import os
os.getcwd()
'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据清洗之数据统计'
os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')
df = pd.read_csv('online_order.csv', encoding='gbk', dtype={'cunstomer':str, 'order':str})
df.head(5)
|
customer |
order |
total_items |
discount% |
weekday |
hour |
Food% |
Fresh% |
Drinks% |
Home% |
Beauty% |
Health% |
Baby% |
Pets% |
0 |
0 |
0 |
45 |
23.03 |
4 |
13 |
9.46 |
87.06 |
3.48 |
0.00 |
0.00 |
0.00 |
0.0 |
0.0 |
1 |
0 |
1 |
38 |
1.22 |
5 |
13 |
15.87 |
75.80 |
6.22 |
2.12 |
0.00 |
0.00 |
0.0 |
0.0 |
2 |
0 |
2 |
51 |
18.08 |
4 |
13 |
16.88 |
56.75 |
3.37 |
16.48 |
6.53 |
0.00 |
0.0 |
0.0 |
3 |
1 |
3 |
57 |
16.51 |
1 |
12 |
28.81 |
35.99 |
11.78 |
4.62 |
2.87 |
15.92 |
0.0 |
0.0 |
4 |
1 |
4 |
53 |
18.31 |
2 |
11 |
24.13 |
60.38 |
7.78 |
7.72 |
0.00 |
0.00 |
0.0 |
0.0 |
df.info()
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 14 columns):
customer 30000 non-null int64
order 30000 non-null object
total_items 30000 non-null int64
discount% 30000 non-null float64
weekday 30000 non-null int64
hour 30000 non-null int64
Food% 30000 non-null float64
Fresh% 30000 non-null float64
Drinks% 30000 non-null float64
Home% 30000 non-null float64
Beauty% 30000 non-null float64
Health% 30000 non-null float64
Baby% 30000 non-null float64
Pets% 30000 non-null float64
dtypes: float64(9), int64(4), object(1)
memory usage: 3.2+ MB
df.columns
Index(['customer', 'order', 'total_items', 'discount%', 'weekday', 'hour',
'Food%', 'Fresh%', 'Drinks%', 'Home%', 'Beauty%', 'Health%', 'Baby%',
'Pets%'],
dtype='object')
pd.pivot_table(data=df, index='weekday', values='total_items', aggfunc=[np.mean, np.sum],
margins=True, margins_name='总计商品情况')
|
mean |
sum |
|
total_items |
total_items |
weekday |
|
|
1 |
30.662177 |
191240 |
2 |
31.868612 |
158387 |
3 |
31.869796 |
150043 |
4 |
32.251899 |
131620 |
5 |
31.406619 |
79710 |
6 |
32.154814 |
74149 |
7 |
32.373837 |
167049 |
总计 |
31.739933 |
952198 |
pd.pivot_table(data=df, index='weekday', columns='customer', values='total_items', aggfunc=[np.mean, np.sum],
margins=True, margins_name='总计商品情况', fill_value=0)
|
mean |
... |
sum |
customer |
0 |
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
... |
10230 |
10231 |
10232 |
10233 |
10234 |
10235 |
10236 |
10237 |
10238 |
总计商品情况 |
weekday |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
0 |
32.538462 |
0 |
28.571429 |
15.818182 |
0 |
40.000000 |
40 |
26.333333 |
9.000000 |
... |
13 |
15 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
191240 |
2 |
0 |
42.333333 |
0 |
9.500000 |
13.400000 |
21 |
40.500000 |
0 |
54.500000 |
18.500000 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
158387 |
3 |
0 |
0.000000 |
0 |
0.000000 |
22.333333 |
0 |
38.333333 |
0 |
0.000000 |
0.000000 |
... |
39 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
150043 |
4 |
48 |
12.333333 |
0 |
41.500000 |
12.666667 |
0 |
44.000000 |
0 |
28.500000 |
0.000000 |
... |
31 |
8 |
2 |
5 |
1 |
0 |
0 |
54 |
2 |
131620 |
5 |
38 |
36.000000 |
0 |
28.000000 |
3.000000 |
0 |
0.000000 |
0 |
37.000000 |
0.000000 |
... |
0 |
15 |
0 |
0 |
0 |
4 |
0 |
10 |
0 |
79710 |
6 |
0 |
0.000000 |
26 |
0.000000 |
0.000000 |
0 |
0.000000 |
0 |
0.000000 |
26.333333 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
74149 |
7 |
0 |
0.000000 |
0 |
45.000000 |
24.500000 |
0 |
46.000000 |
0 |
0.000000 |
20.333333 |
... |
0 |
22 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
167049 |
总计商品情况 |
44 |
31.150000 |
26 |
27.782609 |
17.103448 |
21 |
40.769231 |
40 |
35.250000 |
17.750000 |
... |
83 |
60 |
2 |
5 |
1 |
4 |
1 |
64 |
2 |
952198 |
8 rows × 20480 columns
# 不同折扣下的样本(订单)数量
pd.crosstab(index=df['weekday'], columns=df['discount%'])
discount% |
-65.15 |
-63.64 |
-47.26 |
-39.84 |
-26.15 |
-23.98 |
-23.89 |
-20.59 |
-19.79 |
-18.06 |
... |
95.49 |
95.79 |
96.65 |
97.12 |
98.02 |
98.36 |
98.39 |
98.54 |
98.6 |
100.0 |
weekday |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
0 |
71 |
2 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
2 |
1 |
1 |
... |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
78 |
3 |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
1 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
59 |
4 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
59 |
5 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
52 |
6 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
25 |
7 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
1 |
0 |
0 |
0 |
1 |
0 |
0 |
52 |
7 rows × 3579 columns
pd.crosstab(index=df['weekday'], columns=df['discount%'], margins=True, normalize='index')
discount% |
-65.15 |
-63.64 |
-47.26 |
-39.84 |
-26.15 |
-23.98 |
-23.89 |
-20.59 |
-19.79 |
-18.06 |
... |
95.49 |
95.79 |
96.65 |
97.12 |
98.02 |
98.36 |
98.39 |
98.54 |
98.6 |
100.0 |
weekday |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
0.000160 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000160 |
... |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000160 |
0.000000 |
0.000160 |
0.000000 |
0.011384 |
2 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000201 |
0.000402 |
0.000201 |
0.000201 |
... |
0.000000 |
0.000000 |
0.000000 |
0.000201 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.015694 |
3 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000212 |
0.000212 |
0.000000 |
0.000212 |
0.000000 |
0.000000 |
... |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.012532 |
4 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
... |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.014457 |
5 |
0.000000 |
0.000000 |
0.000394 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
... |
0.000394 |
0.000394 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000394 |
0.020489 |
6 |
0.000000 |
0.000000 |
0.000000 |
0.000434 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
... |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000434 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.010841 |
7 |
0.000000 |
0.000194 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
... |
0.000000 |
0.000000 |
0.000194 |
0.000000 |
0.000000 |
0.000000 |
0.000194 |
0.000000 |
0.000000 |
0.010078 |
All |
0.000033 |
0.000033 |
0.000033 |
0.000033 |
0.000033 |
0.000033 |
0.000033 |
0.000100 |
0.000033 |
0.000067 |
... |
0.000033 |
0.000033 |
0.000033 |
0.000033 |
0.000033 |
0.000033 |
0.000033 |
0.000033 |
0.000033 |
0.013200 |
8 rows × 3579 columns
作者:若尘
交叉表
透视图
数据
交叉
视图
数据清洗