Pandas数据操作归纳总结

Pandas建立在NumPy之上,旨在与许多其他第三方库完美地集成在科学计算环境中。它的两个主要数据结构是Series(1维)和DataFrame(2维)。

一、Pandas数据结构:一维数组(Series)

1、创建Series

Series是一个一维的数据类型,其中每一个元素都有一个标签。类似于Numpy中元素带标签的数组。其中,标签可以是数字或者字符串。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
import numpy as np
import pandas as pd

# 直接创建
a = pd.Series([1, 2, 5, 6, 8])
print(a)

# 用标量创建
b = pd.Series(5, index = [0,1,2,3])
print(b)

# 从numpy数组创建
data1 = np.array(['a','b', 3, 4])
c = pd.Series(data1, index = ['a','b','c','d'])
print(c)

# 从字典创建
data2 = {'a':1, 'b':2, 'c':3}
d = pd.Series(data2, index = ['a','b','c','d'])
print(d)

2、Series的索引

Series对象提供了几种不同的索引方法。

(1)枚举索引

枚举索引按照index值进行索引。
index值默认使用下标,下标总是从0开始的,而且索引值总是数字。也可以用关键字为index赋值,关键字既可以是数字,也可以是字符串等。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import numpy as np
import pandas as pd

# 使用下标作为index(默认)
a = pd.Series([1, 2, 5, 6, 8])
print(a.index)
print(a.values) # [1 2 5 6 8]
print(a[4]) # 8
print(a[[1,3]].values) # [2, 6]

# 使用关键字作为index
b = pd.Series([1, 2, 5, 6, 8], index = [1, 2, 'k', 'j', 'k'])
print(b.index)
print(b.values) # [1 2 5 6 8]
print(b[4]) # 优先查找关键字如果没有匹配的关键字则按照下标查
print(b[[2,'j']].values) # [2, 6]
print(b[[2,'k']].values) # [2 5 8]
(2)布尔索引
1
2
3
4
5
6
7
8
9
import numpy as np
import pandas as pd

# 使用布尔索引
a = pd.Series([1, 2, 5, 6, 8], index=['a', 'b', 'c', 'd', 'd'])
print(a.index)
print(a.values) # [1 2 5 6 8]
print(a>3) # False False True True True
print(a[a>3].values) # [5 6 8]
(3)切片索引

切片索引使用的是下标,不会受到index的影响

1
2
3
4
5
6
7
8
import numpy as np
import pandas as pd

a = pd.Series([1, 2, 5, 6, 8])
print(a[1:3].values) # [2 5]

b = pd.Series([1, 2, 5, 6, 8], index = [1, 2, 3, 4, 5])
print(b[1:3].values) # [2 5]

二、Pandas数据结构:数据表(Dataframe)

Pandas提供了一种名为Dataframe的二维表结构。可以把DataFrame想象成一个电子表格,它由行名(index)、列名(columns)和数据(values)组成。

1、创建Dataframe

(1)从字典创建
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import numpy as np
import pandas as pd

data = {
"a":[1,2,3],
"b":[4,5,6],
"c":[7,8,9]
}

# 使用默认的下标作为行(index)
df1 = pd.DataFrame(data)
print(df1)

# 使用提供的关键字作为行(index)
df2 = pd.DataFrame(data, index=['A','B','C'])
print(df2)
(2)从Series创建
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import numpy as np
import pandas as pd

data = {"a":pd.Series([1,2,3],['A','B','C']),
"b":pd.Series([4,5,6],['A','B','C']),
"c":pd.Series([7,8,9],['A','B','C'])}

df = pd.DataFrame(data)
print(df)

data2 = {"a":pd.Series([1,2,3]),
"b":pd.Series([4,5,6]),
"c":pd.Series([7,8,9])}

df2 = pd.DataFrame(data2)
print(df2)
(3)从二维数组直接创建

由二维数组直接创建DataFrame,得到一样形状的结果数据,如果不指定index和columns,两者均返回默认数字格式

1
2
3
4
5
6
7
import numpy as np
import pandas as pd

data = np.random.rand(9).reshape(3,3)

df = pd.DataFrame(data, index = ['a', 'b', 'c'], columns = ['one','two','three'])
print(df)

2、Dataframe的查看

1
2
3
4
5
6
7
8
9
10
11
12
13
14
import numpy as np
import pandas as pd

data = np.random.rand(9).reshape(3,3)

df = pd.DataFrame(data, columns = ['one','two','three'])

print(df.head(2)) # 查看DataFrame对象的前2行
print(df.tail(2)) # 查看DataFrame对象的最后2行

print(df.shape) # 查看行数和列数
print(df.info()) # 查看索引、数据类型和内存信息

print(df.columns) # 查看字段名称

3、Dataframe的遍历

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd

data = {
"a":[1,2,3],
"b":[4,5,6],
"c":[7,8,9]
}
df = pd.DataFrame(data, index=['x','y','z'])
print(df)

# 按行遍历, 每一行迭代为(行名, Series)对
for i,v in df.iterrows():
# print(i) # 输出每行的索引值
print(v[1]) # 输出每一行第二列元素

# 按行遍历, 每一行迭代为元组
for row in df.itertuples():
print(getattr(row, 'a')) # 输出'a'列元素
print(row[1]) # 输出'a'列元素(行序号占了一列)

# 按列遍历, 每一列迭代为(列名, Series)对
for i,v in df.iteritems():
# print(i) # 输出每行的索引值
print(v[1]) # 输出每一列第二行元素

4、设置Dataframe的索引

(1)普通索引
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import pandas as pd

df = pd.DataFrame({ 'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})

print(df)

# 将某列设为索引列,可接受列表参数,即同时设置多个索引列
drop_col = df.set_index('A', drop=True, append=False, inplace=False, verify_integrity=False)
print(drop_col)
drop_col2 = df.set_index(['A','D'], drop=True, append=False, inplace=False, verify_integrity=False)
print(drop_col2)

no_drop_col = df.set_index('A', drop=False, append=False, inplace=False, verify_integrity=False)
print(no_drop_col)

# 还原索引列为普通列
reset_drop_col = drop_col.reset_index(drop=False)
print(reset_drop_col)
reset_no_drop_col = no_drop_col.reset_index(drop=True)
print(reset_no_drop_col)
(2)多级索引
1
2
3
4
5
6
7
8
9
10
import pandas as pd

midx = pd.MultiIndex(levels=[['Tom', 'Bob', 'Jam'], ['income', 'weight', 'length']],
codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],[0, 1, 2, 0, 1, 2, 0, 1, 2]])
df = pd.DataFrame(index=midx,
columns=['max', 'min'],
data=[[200, 100],[55, 50], [1.5, 1.4], # Tom收入,体重,身高
[400, 300],[65, 60], [1.6, 1.5], # Bob收入,体重,身高
[600, 500],[75, 70], [1.8, 1.7]]) # Jam收入,体重,身高
print(df)

5、用Dataframe的索引筛选数据

(1)枚举索引
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import numpy as np
import pandas as pd

data = {
"a":[1,2,3],
"b":[4,5,6],
"c":[7,8,9]
}

df = pd.DataFrame(data, index=['x','y','z'])
print(df)

# 访问某列的某个元素
print(df['a'][0])
print(df['a']['x'])

# 访问某行的某个元素
print(df.T['x'][0])
print(df.T['x']['b'])

# 访问某列的某几个元素
print(df['a'][['x','y','z']].values)
print(df['a'][[0,1,2]].values)
(2)布尔索引
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import numpy as np
import pandas as pd

data = {
"a":[1,2,3],
"b":[4,5,6],
"c":[7,8,9]
}

df = pd.DataFrame(data, index=['x','y','z'])
print(df)

# 单个列的布尔索引
print(df['b']>3)
print(df['b'][df['b']>3])
print(df['b'][df['b']>3].values)

# 整个表的布尔索引
print(df>3)
print(df[df>3])
print(df[df>3].values)
(3)切片索引
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import numpy as np
import pandas as pd

data = {
"a":[1,2,3],
"b":[4,5,6],
"c":[7,8,9]
}

df = pd.DataFrame(data, index=['x','y','z'])
print(df)

# 单个列的切片索引
print(df['b'][0:2])

# 整个表的切片索引
print(df[0:1][0:2])
(4)loc和iloc索引
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import numpy as np
import pandas as pd

data = {
"a":[1,2,3],
"b":[4,5,6],
"c":[7,8,9]
}

df1 = pd.DataFrame(data)
df2 = pd.DataFrame(data, index=['x','y','z'])

# loc索引
print(df1.loc[1, 'a'])
print(df1.loc[[0,1,2], 'a'])
print(df1.loc[0:2, 'a'])
print(df2.loc['y', 'a'])
print(df2.loc[['x','y','z'], 'a'])

# iloc索引
print(df1.iloc[1, 0])
print(df1.iloc[[0,1,2], 0])
print(df1.iloc[0:3, 0])
print(df2.iloc[1, 0])
print(df2.iloc[[0,1,2], 0])

5、变更Dataframe字段名

1
2
3
4
5
6
7
8
9
10
11
12
import pandas as pd

data = {
"a":[1,2,3],
"b":[4,5,6],
"c":[7,8,9]
}

df = pd.DataFrame(data)

print(df.rename(columns={'a':'a_2'})) # 选择性更改列名
print(df.rename(columns=lambda x:x+'1')) # 批量更改列名

三、Pandas数据 I/O

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import numpy as np
import pandas as pd

data = {
"a":[1,2,3],
"b":[4,5,6],
"c":[7,8,9]
}

df = pd.DataFrame(data)

# 保存和读取csv格式
df.to_csv('01.csv', index=False, encoding='utf-8')
df1 = pd.read_csv('01.csv', quoting=3) # read_csv不设置quoting默认会去除英文双引号,只留下英文双引号内的内容。设置quoting=3会如实读取内容
print(df1)

# 保存和读取execl格式,需安装xlrd和openpyxl
df.to_excel('02.xlsx', index=False, encoding='utf-8')
df2 = pd.read_excel('02.xlsx')
print(df2)

# 保存和读取txt格式
df.to_csv('03.txt', sep='@', index=False, encoding='utf-8')
df3 = pd.read_csv('03.txt', sep='@', quoting=3)
print(df3)

# 保存和读取pickling格式
df.to_pickle('04.pkl')
df4 = pd.read_pickle('04.pkl')
print(df4)

# 保存和读取SQL表
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://root:Zxc123456@localhost/test')
df.to_sql('pandas_learn', engine) # 导出数据到SQL表
df5 = pd.read_sql('select * from pandas_learn', engine) # 从SQL表读取数据
print(df5)

# 保存和读取Json
df.to_json('t.json', orient='split')
df6 = pd.read_json('t.json', orient='split')
print(df6)

# 解析URL、字符串或者HTML⽂件,抽取其中的tables表格
tab = pd.read_html('https://downloads.mysql.com/archives/community/')
print(tab)

了解更多

四、Series和DataFrame的基本操作

1、Series和DataFrame添加删除元素

(1)Series添加删除元素
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import numpy as np
import pandas as pd

s1 = pd.Series([-1, -2, -3], index=['a', 'b', 'c'])
s2 = pd.Series([1, 2, 3, 4], index=['e', 'f', 'c', 'd'])

# 插入元素
s1['d'] = -4
s3 = s1.append(pd.Series([-4], index=['a'])) # verify_integrity=True 标签重复报错
print(s3)
s4 = s1.append(s2)
print(s4)

# 删除元素
s1.drop('d', inplace=False) # 设置为True时修改原Series
print(s1)
s1.pop('d')
print(s1)
(1)DataFrame添加删除元素
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import numpy as np
import pandas as pd

df = pd.DataFrame({'one' : pd.Series([-1, -2, -3]),
'two' : pd.Series([1, 2, 3, 4])})

# 添加新列
df['three'] = pd.Series([10,20,30])
df['four'] = df['one']+df['three']
df.loc[:, "five"] = [1, 4, 5, 9]
print(df)

# 添加新行
temp_df = pd.DataFrame([[5, 6], [7, 8]], columns=['one','two'])
df = df.append(temp_df, sort=False, ignore_index=True) # ignore_index=True用于重新生成索引
print(df)
df.loc[df.shape[0]] = [0, 0, 0, 0, 0]
print(df)

# 删除行
df.drop(3, axis=0, inplace=True)
print(df)

# 删除列
df.drop(['four','five'], axis=1, inplace=True)
print(df)

2、Series和DataFrame数据排重

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import numpy as np
import pandas as pd

s = pd.Series([-1, -1, -2, -2, -3])
df = pd.DataFrame({'k1':['one','two']*3+['two'],
'k2':[1,1,1,2,1,3,3]})

# 查看是否重复
print(s.duplicated())
print(df.duplicated())
# 删除所有重复
print(s.drop_duplicates())
print(df.drop_duplicates())
# 删除某列的重复值
print(df.drop_duplicates(['k1']))

3、Series和DataFrame数据缺失值处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd

s = pd.Series([-1, -1, None, -2, -3])
df = pd.DataFrame({'k1':['one','two']*3+[None],
'k2':[1,None,1,2,1,3,3]})

# 查看是否有缺失值
print(s.isnull().any())
print(df.isnull().any())
# 查看哪些是缺失值
print(s.isnull())
print(df.isnull())

# 一种更优雅的查看DataFrame缺失值的做法
# for columname in df.columns:
# if df[columname].count() != len(df):
# loc = df[columname][df[columname].isnull().values==True].index.tolist()
# print('列名:"{}", 第{}行位置有缺失值'.format(columname,loc))

# 查看缺失值
print(s[s.isnull().values==True])
print(df[df.isnull().values==True])

# 抛弃所有含缺失值的行
# print(s.dropna())
# print(df.dropna())

# 抛弃所有含缺失值的列
# print(df.dropna(axis=1))

# 以某个值填补所有缺失值
# print(df.fillna(value=10))

# 以某个值填补某个字段的缺失值
# print(df['k2'].fillna(value=10))

4、Series和DataFrame数据替换和映射

(1)数据替换
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import numpy as np
import pandas as pd

s = pd.Series([-1, -1, -2, -2, -3])
df = pd.DataFrame({'k1':['one','two']*3+['two'],
'k2':[1,1,1,2,1,3,3]})

# Series数据替换
print(s.replace(-3, 999))
print(s.replace([-1,-2], 3))
print(s.replace({-1:'a',-2:'b'}))

# DataFrame数据替换
print(df.replace(1, 999))
print(df.replace([1,2], 3))
print(df.replace({1:'a',2:'b'}))
(2)数据映射

map()函数就是一个数据到一个数据的映射关系,中间没有对数据的增删改
map(function,list)就是将list里面的数据取出来然后用于function

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import numpy as np
import pandas as pd


df = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
'Pastrami', 'corned beef', 'Bacon',
'pastrami', 'honey ham', 'nova lox'],
'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}

###################### 方法一 ########################

# 字符串数据才有lower()
lowercased = df['food'].str.lower()
print(lowercased)

# 数据映射
print(lowercased.map(meat_to_animal))
df['animal1']=lowercased.map(meat_to_animal)
print(df)

###################### 方法二 ########################

# df['animal2']=df['food'].map(lambda x : meat_to_animal[x.lower()])
transform = lambda x : meat_to_animal[x.lower()]
df['animal2']=df['food'].map(transform)
print(df)

5、DataFrame行列变换

(1)使用 numpy 实现转置
1
2
3
4
5
6
7
8
9
import numpy as np
import pandas as pd

df = pd.DataFrame([[0, 1, 2],
[3, 4, 5]], columns=['c1', 'c2', 'c3'])
print(df)

df2 = pd.DataFrame(df.values.T, index=df.columns, columns=df.index)
print(df2)
(2)利用 pandas 多级索引实现转置

stack() 会将数据的列”旋转”为行,新的行索引比原来的行索引低了一个级别(0的级别最高)。
unstack() 将数据的行”旋转”为列,默认旋转的是级别最低的索引。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import numpy as np
import pandas as pd

df = pd.DataFrame([[0, 1, 2],
[3, 4, 5]], columns=['c1', 'c2', 'c3'])
print(df)

# 列索引转为行索引
df = df.stack()
print(df)

# 最高级的行索引转为列索引
df = df.unstack(0)
print(df)

# 一步完成转换
print(df.stack().unstack(0))

6、DataFrame合并与拆分

(1)堆叠

concat()能够实现轴向数据集的堆叠

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import numpy as np
import pandas as pd

s1 = pd.Series([0, 1])
s2 = pd.Series([2, 3, 4])
s3 = pd.Series([5, 6])

# 纵向堆叠
df1 = pd.concat([s1, s2, s3], ignore_index=True) # 默认axis=0
print(df1)
df2 = pd.concat([s1, s2, s3])
df2.set_axis(range(len(df2)), inplace=True)
print(df2)

# 横向堆叠
df3 = pd.concat([s1, s2, s3], axis=1)
print(df3)
(2)连接

merge()可根据一个或者多个键将不同的DataFrame连接在一起,类似于SQL数据库中的join操作

1
2
3
4
5
6
7
8
9
10
11
12
13
import numpy as np
import pandas as pd

df1 = pd.DataFrame({'userid': [1,2,3,4], 'level': list('ssab')})
df2 = pd.DataFrame({'userid': [1,3,5], 'age': [16, 22, 34]})

print(pd.merge(df1, df2)) # 默认使用交集(how='inner')作为合并方式
print(pd.merge(df1, df2, how='outer')) # 使用并集作为合并方式

print(pd.merge(df1, df2, how='left')) # 左合并
print(pd.merge(df1, df2, how='right')) # 右合并

print(pd.merge(df1, df2, on=['userid'])) # 指定连接列
(3)拆分
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import numpy as np
import pandas as pd

df = pd.DataFrame([['Computer', 'Mac-Dell'],
['Computer', 'Mac-XiaoMi'],
['Computer', 'Mac-HuaWei-9X']], columns=['Type', 'Brands_rank'])

print(df)

# 一列拆多列
df['Brands-Second'] = df.Brands_rank.apply(lambda x: x.split('-')[1])
df['Brands-Third'] = df.Brands_rank.apply(lambda x: x.split('-')[2] if x.count('-') >= 2 else np.nan)
print(df)

# 一行拆多行
df2 = df
df2 = df.Brands_rank.str.split('-', expand=True).stack().to_frame()
df2 = df2.reset_index(level=1, drop=True).rename(columns={0:'Brands'})
print(df[['Type']].join(df2))

7、DataFrame排序

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import numpy as np
import pandas as pd

df = pd.DataFrame([[0, 1, 2],
[3, 4, 5]], columns=['c1', 'c2', 'c3'])

############ 列索引排序 #############

# 按行索引升序排序
print(df.sort_index())
# 按行索引降序排序
print(df.sort_index(ascending=False))

############ 行索引排序 #############
# 按列索引升序排序
print(df.sort_index(axis=1))
# 按列索引降序排序
print(df.sort_index(axis=1, ascending=False))

############ 值排序 #################
# 按某列的值升序排序
print(df.sort_values(by='c2'))
# 按某列的值降序排序
print(df.sort_values(by='c2', ascending=False))
# 多列值降序排序
print(df.sort_values(by=['c2', 'c3'], ascending=False))

8、DataFrame分组

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd

data = {
"a":[1,2,3],
"b":[4,5,6],
"c":[7,7,9]
}
df = pd.DataFrame(data, index=['x','y','z'])
print(df)

# 按c列分组计数
print(df.groupby('c'))
# 按c列分组求和
print(df.groupby('c').agg(['sum']))
print(df.groupby('c').b.agg(['sum']))
# 按c列分组计算其他各列平均数
print(df.groupby('c').agg(np.mean))
# 按c列分组计算其他各列最小值和最大值
print(df.groupby('c').agg(['min','max']))

五、Pandas与数据统计

1、Series的统计

1
2
3
4
5
6
7
8
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd

a = pd.Series([1, 2, 5, 5, 6, 8])

print(a.value_counts(dropna=False)) # 查看唯一值和计数
print(a.value_counts(sort=True)) # 返回a的频数统计并降序排列

2、DataFrame的统计

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd

data = {
"a":[1,2,3],
"b":[4,5,6],
"c":[7,8,9]
}
df = pd.DataFrame(data, index=['x','y','z'])

print(df)

print(df.shape) # 查看⾏数和列数
print(df.info()) # 查看索引、数据类型和内存信息

print(df.count()) # 返回每一列中的非空值的个数

print(df.describe()) # 查看数值型列的汇总统计

print(df.corr()) # 返回列与列之间的相关系数

print(df.max()) # 返回每一列的最大值
print(df.quantile(0.5)) # 返回每列的50分位数
print(df.min()) # 返回每一列的最小值

print(df.mean()) # 返回所有列的均值
print(df.median()) # 返回每一列的中位数
print(df.mad()) # 返回每一列的平均绝对离差
print(df.std()) # 返回每一列的标准差
print(df.var()) # 返回每一列的方差
print(df.head(2)) # 查看DataFrame对象的前2⾏
print(df.tail(2)) # 查看DataFrame对象的最后2⾏