Frequency Analysis

魔法师LQ

最关键的就一句代码,直接跳到最后即可。

数据挖掘课程作业貌似要用到MATLAB以及SPSS,而 实验设计和数据分析课程主要使用Excel。除了SPSS之外其余两个都接触过,今天安装了SPSS,安装后的新手指引很贴心,给了一个收入分析的例子,全程不到5分钟,最后出来了很棒的统计以及可视化结果,长成下面这样。感觉Pandas,NumPy,Matplotlib也能做,试着复现一下,同时回顾一下Pandas的操作。

1
import pandas as pd
1
import numpy as np
1
2
3
# 导入数据,../DataSets/demoFreq1.csv
demoFreq_path = "../DataSets/demoFreq1.csv"
df = pd.read_csv(demoFreq_path)
1
df
id gender bdate age educ jobcat salbegin salary service prevexp
0 1 m 7/26/1981 36 15 4 44010.0 98799.81000 98 144
1 2 m 11/13/1987 30 16 5 30562.5 69679.86600 98 36
2 3 f 1/16/1959 58 12 2 19560.0 37179.92850 98 381
3 4 f 10/6/1994 23 8 1 21516.0 37959.92700 98 190
4 5 m 8/1/1984 33 15 5 34230.0 77999.85000 98 138
5 6 m 2/12/1988 29 15 5 22005.0 55639.89300 98 67
6 7 m 10/17/1985 32 15 5 30562.5 62399.88000 98 114
7 8 f 10/27/1995 22 12 2 15892.5 37959.92700 98 0
8 9 f 7/16/1975 42 15 5 20782.5 48359.90700 98 115
9 10 f 8/6/1975 42 12 2 22005.0 41599.92000 98 244
10 11 f 7/31/1979 38 16 5 26895.0 52519.89900 98 143
11 12 m 7/3/1993 24 8 1 19560.0 49139.90550 98 26
12 13 m 1/7/1990 27 15 5 23227.5 48099.90750 98 34
13 14 f 8/19/1978 39 15 5 27384.0 60839.88300 98 137
14 15 m 2/19/1992 25 12 2 22005.0 47319.90900 97 66
15 16 m 5/10/1994 23 12 2 24450.0 70719.86400 97 24
16 17 m 1/8/1992 25 15 5 23227.5 79733.18000 97 48
17 18 m 9/10/1985 32 16 4 44841.3 179832.98750 97 70
18 19 m 2/9/1992 25 12 2 23227.5 73319.85900 97 103
19 20 f 7/15/1969 48 12 2 18826.5 45499.91250 97 48
20 21 f 8/11/1992 25 16 5 24450.0 67339.87050 97 17
21 22 m 3/17/1970 47 12 2 20782.5 37699.92750 97 315
22 23 f 9/5/1994 23 15 5 18093.0 41599.92000 97 75
23 24 f 9/17/1962 55 12 2 14670.0 29379.94350 97 124
24 25 f 12/22/1971 45 15 5 14670.0 36659.92950 97 171
25 26 m 4/30/1996 21 15 5 20538.0 53819.89650 96 14
26 27 m 9/9/1983 34 19 4 44792.4 104649.79875 96 96
27 28 m 10/1/1992 25 15 5 23227.5 56419.89150 96 43
28 29 m 7/20/1973 44 19 6 130367.4 233999.55000 96 199
29 30 m 3/10/1991 26 15 5 23227.5 54079.89600 96 54
... ... ... ... ... ... ... ... ... ... ...
444 445 m 1/24/1993 24 15 5 26895.0 48619.90650 66 84
445 446 m 1/24/1988 29 16 4 71883.0 173333.00000 66 128
446 447 m 12/23/1990 26 15 5 33496.5 84933.17000 66 86
447 448 f 11/26/1962 54 12 2 16626.0 28339.94550 66 163
448 449 m 6/25/1995 22 16 4 35452.5 121333.10000 65 19
449 450 m 1/11/1984 33 19 4 57017.4 95333.15000 65 129
450 451 m 1/9/1999 18 15 5 23227.5 49399.90500 65 20
451 452 m 2/3/1983 34 12 2 29340.0 49919.90400 65 210
452 453 m 1/28/1960 57 15 5 25672.5 42379.91850 65 338
453 454 m 1/18/1995 22 19 4 50937.5 157083.03125 65 18
454 455 m 7/9/1993 24 16 4 31785.0 75659.85450 65 19
455 456 m 4/8/1989 28 19 4 69291.3 129999.75000 65 54
456 457 m 11/17/1997 19 15 5 23227.5 54859.89450 65 10
457 458 m 12/27/1994 22 19 4 46846.2 107249.79375 65 26
458 459 f 8/2/2000 17 12 2 18337.5 37699.92750 65 0
459 460 f 2/2/1999 18 12 2 20782.5 38999.92500 65 24
460 461 f 5/1/1995 22 8 1 22005.0 37439.92800 65 173
461 462 f 4/9/1993 24 16 4 31785.0 59643.88530 65 79
462 463 f 4/6/1964 53 15 5 23227.5 35879.93100 65 241
463 464 m 9/10/1991 26 19 4 53790.0 82419.84150 64 27
464 465 m 1/10/1992 25 12 2 26895.0 58759.88700 64 106
465 466 f 12/6/1977 39 12 2 22005.0 40559.92200 64 198
466 467 f 2/7/1997 20 16 5 31785.0 56939.89050 64 20
467 468 f 5/21/1995 22 16 4 32567.4 96633.14750 64 36
468 469 f 11/22/1993 23 15 5 22738.5 43679.91600 64 57
469 470 m 7/14/1993 24 12 2 25672.5 45499.91250 64 69
470 471 m 1/24/1996 21 15 5 25672.5 45759.91200 64 32
471 472 m 8/14/1995 22 15 5 25672.5 67859.86950 63 46
472 473 f 5/18/1967 50 12 2 20782.5 37179.92850 63 139
473 474 f 4/28/1998 19 12 2 23227.5 50959.90200 63 9

474 rows × 10 columns

1
print(df['salary'])
0       98799.81000
1       69679.86600
2       37179.92850
3       37959.92700
4       77999.85000
5       55639.89300
6       62399.88000
7       37959.92700
8       48359.90700
9       41599.92000
10      52519.89900
11      49139.90550
12      48099.90750
13      60839.88300
14      47319.90900
15      70719.86400
16      79733.18000
17     179832.98750
18      73319.85900
19      45499.91250
20      67339.87050
21      37699.92750
22      41599.92000
23      29379.94350
24      36659.92950
25      53819.89650
26     104649.79875
27      56419.89150
28     233999.55000
29      54079.89600
           ...     
444     48619.90650
445    173333.00000
446     84933.17000
447     28339.94550
448    121333.10000
449     95333.15000
450     49399.90500
451     49919.90400
452     42379.91850
453    157083.03125
454     75659.85450
455    129999.75000
456     54859.89450
457    107249.79375
458     37699.92750
459     38999.92500
460     37439.92800
461     59643.88530
462     35879.93100
463     82419.84150
464     58759.88700
465     40559.92200
466     56939.89050
467     96633.14750
468     43679.91600
469     45499.91250
470     45759.91200
471     67859.86950
472     37179.92850
473     50959.90200
Name: salary, Length: 474, dtype: float64
1
print(type(df['salary'])) # 每一列就是一个Series,相当于SPSS中的变量值
<class 'pandas.core.series.Series'>
1
df.head(10) # 查看前n条记录
id gender bdate age educ jobcat salbegin salary service prevexp
0 1 m 7/26/1981 36 15 4 44010.0 98799.8100 98 144
1 2 m 11/13/1987 30 16 5 30562.5 69679.8660 98 36
2 3 f 1/16/1959 58 12 2 19560.0 37179.9285 98 381
3 4 f 10/6/1994 23 8 1 21516.0 37959.9270 98 190
4 5 m 8/1/1984 33 15 5 34230.0 77999.8500 98 138
5 6 m 2/12/1988 29 15 5 22005.0 55639.8930 98 67
6 7 m 10/17/1985 32 15 5 30562.5 62399.8800 98 114
7 8 f 10/27/1995 22 12 2 15892.5 37959.9270 98 0
8 9 f 7/16/1975 42 15 5 20782.5 48359.9070 98 115
9 10 f 8/6/1975 42 12 2 22005.0 41599.9200 98 244
1
df[0:10] # 类似于数组的形式来操作
id gender bdate age educ jobcat salbegin salary service prevexp
0 1 m 7/26/1981 36 15 4 44010.0 98799.8100 98 144
1 2 m 11/13/1987 30 16 5 30562.5 69679.8660 98 36
2 3 f 1/16/1959 58 12 2 19560.0 37179.9285 98 381
3 4 f 10/6/1994 23 8 1 21516.0 37959.9270 98 190
4 5 m 8/1/1984 33 15 5 34230.0 77999.8500 98 138
5 6 m 2/12/1988 29 15 5 22005.0 55639.8930 98 67
6 7 m 10/17/1985 32 15 5 30562.5 62399.8800 98 114
7 8 f 10/27/1995 22 12 2 15892.5 37959.9270 98 0
8 9 f 7/16/1975 42 15 5 20782.5 48359.9070 98 115
9 10 f 8/6/1975 42 12 2 22005.0 41599.9200 98 244
1
df['age']  # 根据变量名(关键码)来查看
0      36
1      30
2      58
3      23
4      33
5      29
6      32
7      22
8      42
9      42
10     38
11     24
12     27
13     39
14     25
15     23
16     25
17     32
18     25
19     48
20     25
21     47
22     23
23     55
24     45
25     21
26     34
27     25
28     44
29     26
       ..
444    24
445    29
446    26
447    54
448    22
449    33
450    18
451    34
452    57
453    22
454    24
455    28
456    19
457    22
458    17
459    18
460    22
461    24
462    53
463    26
464    25
465    39
466    20
467    22
468    23
469    24
470    21
471    22
472    50
473    19
Name: age, Length: 474, dtype: object
1
print(df.columns)  # 查看所有变量名
Index(['id', 'gender', 'bdate', 'age', 'educ', 'jobcat', 'salbegin', 'salary',
       'service', 'prevexp'],
      dtype='object')
1
2
var_names = df.columns.tolist() # 转换为list
print(var_names)
['id', 'gender', 'bdate', 'age', 'educ', 'jobcat', 'salbegin', 'salary', 'service', 'prevexp']
1
df.describe()  # 基本统计信息
id educ jobcat salbegin salary service prevexp
count 474.000000 474.000000 474.000000 474.000000 474.000000 474.000000 474.000000
mean 237.500000 13.491561 3.278481 27736.220992 59660.468953 81.109705 95.860759
std 136.976275 2.884846 1.462166 12829.140192 29597.756286 10.060945 104.586236
min 1.000000 8.000000 1.000000 14670.000000 27299.947500 63.000000 0.000000
25% 119.250000 12.000000 2.000000 20354.625000 41599.920000 72.000000 19.250000
50% 237.500000 12.000000 3.000000 24450.000000 50049.903750 81.000000 55.000000
75% 355.750000 15.000000 5.000000 28508.700000 64024.876875 90.000000 138.750000
max 474.000000 21.000000 6.000000 130367.400000 233999.550000 98.000000 476.000000
1
df.describe(include=[np.number])  # 只统计数值型的内容
id educ jobcat salbegin salary service prevexp
count 474.000000 474.000000 474.000000 474.000000 474.000000 474.000000 474.000000
mean 237.500000 13.491561 3.278481 27736.220992 59660.468953 81.109705 95.860759
std 136.976275 2.884846 1.462166 12829.140192 29597.756286 10.060945 104.586236
min 1.000000 8.000000 1.000000 14670.000000 27299.947500 63.000000 0.000000
25% 119.250000 12.000000 2.000000 20354.625000 41599.920000 72.000000 19.250000
50% 237.500000 12.000000 3.000000 24450.000000 50049.903750 81.000000 55.000000
75% 355.750000 15.000000 5.000000 28508.700000 64024.876875 90.000000 138.750000
max 474.000000 21.000000 6.000000 130367.400000 233999.550000 98.000000 476.000000
1
df['salary'].describe()  # 没有统计missing的数量,当其余信息比spss丰富
count       474.000000
mean      59660.468953
std       29597.756286
min       27299.947500
25%       41599.920000
50%       50049.903750
75%       64024.876875
max      233999.550000
Name: salary, dtype: float64
1
df.describe(include=[np.object])  # 只统计字符串类型的内容
gender bdate age
count 474 474 474
unique 2 458 44
top m 5/3/1995 24
freq 258 3 44
1
2
3
'''
问题1 收入分布
'''
'\n问题1    收入分布\n'
1
2
# 纵坐标,数量;横坐标,收入
salary = df['salary'].unique()
1
len(salary)
221
1
min(salary), max(salary)  # 收入差距上下限
(27299.9475, 233999.55)
1
df.salary
0       98799.81000
1       69679.86600
2       37179.92850
3       37959.92700
4       77999.85000
5       55639.89300
6       62399.88000
7       37959.92700
8       48359.90700
9       41599.92000
10      52519.89900
11      49139.90550
12      48099.90750
13      60839.88300
14      47319.90900
15      70719.86400
16      79733.18000
17     179832.98750
18      73319.85900
19      45499.91250
20      67339.87050
21      37699.92750
22      41599.92000
23      29379.94350
24      36659.92950
25      53819.89650
26     104649.79875
27      56419.89150
28     233999.55000
29      54079.89600
           ...     
444     48619.90650
445    173333.00000
446     84933.17000
447     28339.94550
448    121333.10000
449     95333.15000
450     49399.90500
451     49919.90400
452     42379.91850
453    157083.03125
454     75659.85450
455    129999.75000
456     54859.89450
457    107249.79375
458     37699.92750
459     38999.92500
460     37439.92800
461     59643.88530
462     35879.93100
463     82419.84150
464     58759.88700
465     40559.92200
466     56939.89050
467     96633.14750
468     43679.91600
469     45499.91250
470     45759.91200
471     67859.86950
472     37179.92850
473     50959.90200
Name: salary, Length: 474, dtype: float64
1
df.iloc[0, 0]  # 控制其中任意位置
1
1
df.loc[0, 'id']  # 这种方式也可以为DataFrame添加行 https://blog.csdn.net/tanzuozhev/article/details/76735660
1
1
2
# 收入,计数,占比,累计占比  salary, frequency, percent, cumulative percent
df_new = pd.DataFrame(columns=['Salary', 'Frequency', 'Percent', 'CumulativePercent'])
1
counts = df.salary.value_counts()
1
2
print(type(counts))
print(counts)
<class 'pandas.core.series.Series'>
53299.89750     13
41599.92000      8
42379.91850      8
45499.91250      7
46279.91100      7
48099.90750      7
38999.92500      7
38739.92550      6
58759.88700      6
34059.93450      6
49399.90500      6
59799.88500      5
46019.91150      5
50959.90200      5
47579.90850      5
50439.90300      5
43679.91600      5
36139.93050      5
47319.90900      5
51999.90000      5
37959.92700      5
52519.89900      4
55379.89350      4
69679.86600      4
39779.92350      4
42899.91750      4
36919.92900      4
51739.90050      4
62399.88000      4
44979.91350      4
                ..
60007.88460      1
91259.82450      1
68119.86900      1
145166.38750     1
31979.93850      1
127833.08750     1
50855.90220      1
83199.84000      1
34319.93400      1
27299.94750      1
71239.86300      1
96199.81500      1
49659.90450      1
64999.87500      1
50179.90350      1
28339.94550      1
102959.80200     1
179399.65500     1
52779.89850      1
94249.81875      1
173333.00000     1
72799.86000      1
92083.15625      1
75243.85530      1
191749.63125     1
149499.71250     1
98019.81150      1
88833.16250      1
77999.85000      1
69419.86650      1
Name: salary, Length: 221, dtype: int64
1
2
3
4
## 绘制柱状图

p = counts.plot(kind='bar').get_figure()
p.show()
D:\Anaconda3\Anaconda3_py36\lib\site-packages\matplotlib\figure.py:459: UserWarning: matplotlib is currently using a non-GUI backend, so cannot show the figure
  "matplotlib is currently using a non-GUI backend, "

png

1
p.savefig('demo_01.png')
1
2
# 分段累积,绘制柱状图
counts.plot.hist() # 注意,使用counts来绘制柱状图显然是错误的
<matplotlib.axes._subplots.AxesSubplot at 0x1742a430588>

png

1
counts.sort_index()
27299.94750     1
27559.94700     1
28079.94600     3
28339.94550     1
28599.94500     1
28859.94450     1
29119.94400     1
29379.94350     3
29639.94300     2
29899.94250     1
30159.94200     2
30679.94100     1
31459.93950     2
31979.93850     1
32499.93750     1
33279.93600     2
34059.93450     6
34319.93400     1
34579.93350     2
34839.93300     2
35359.93200     3
35619.93150     1
35879.93100     2
36139.93050     5
36399.93000     2
36659.92950     2
36919.92900     4
37179.92850     3
37439.92800     3
37699.92750     3
               ..
114833.11250    1
115699.77750    1
115916.44375    2
116999.77500    1
118083.10625    2
119166.43750    2
120033.10250    1
121333.10000    2
122849.76375    1
125666.42500    1
127399.75500    1
127833.08750    1
129999.75000    2
135416.40625    1
135633.07250    1
136066.40500    1
138666.40000    1
140833.06250    1
142999.72500    1
145166.38750    1
149499.71250    1
157083.03125    1
158166.36250    1
159466.36000    1
168133.01000    1
173333.00000    1
179399.65500    1
179832.98750    1
191749.63125    1
233999.55000    1
Name: salary, Length: 221, dtype: int64
1
counts.plot.hist(color='k', alpha=0.5, bins=30)
<matplotlib.axes._subplots.AxesSubplot at 0x17429f56710>

png

1
counts.plot.hist(color='g', alpha=0.6, bins=18, figsize=(16, 9))
<matplotlib.axes._subplots.AxesSubplot at 0x17424e1db38>

png

1
counts.diff().hist(color='k', alpha=0.5, bins=50)
<matplotlib.axes._subplots.AxesSubplot at 0x174257b5748>

png

1
plt_hist = df['salary'].plot.hist(alpha=0.8, bins=28, figsize=(16, 9))

png

1
plt_hist.set_xlabel('Current Salary($)')
Text(0.5,3.2,'Current Salary($)')
1
plt_hist
<matplotlib.axes._subplots.AxesSubplot at 0x17429117390>
1
2
import matplotlib.pyplot as plt
plt_hist = df['salary'].plot.hist(alpha=0.8, bins=28, figsize=(16, 9)).set_title('Histogram')

收入分析柱状图(待完善)