Pandas数值分箱操作的几种方法

首先创建以下合成数据用于演示（1000 个 0 到 100 之间的随机整数分数）：

import pandas as pd
import numpy as np

def create_df(n_rows=1000, seed=None):
    """Build a demo DataFrame of random integer scores.

    Args:
        n_rows: Number of rows to generate (default 1000).
        seed: Optional seed for a reproducible sample. When None, the
            global NumPy random state is used.

    Returns:
        A DataFrame with a single integer column ``score`` whose values
        lie in the closed range [0, 100].
    """
    # randint's upper bound is exclusive, so 101 makes a score of 100 possible.
    rng = np.random if seed is None else np.random.RandomState(seed)
    scores = rng.randint(0, 101, n_rows)
    return pd.DataFrame({'score': scores})

# Generate the demo data and preview its first five rows.
df = create_df()
print(df.head(5))

between & loc：先用 between 生成布尔掩码，再通过 loc 为对应的行赋值

import pandas as pd
import numpy as np

def create_df(n_rows=1000, seed=None):
    """Build a demo DataFrame of random integer scores.

    Args:
        n_rows: Number of rows to generate (default 1000).
        seed: Optional seed for a reproducible sample. When None, the
            global NumPy random state is used.

    Returns:
        A DataFrame with a single integer column ``score`` whose values
        lie in the closed range [0, 100].
    """
    # randint's upper bound is exclusive, so 101 makes a score of 100 possible.
    rng = np.random if seed is None else np.random.RandomState(seed)
    scores = rng.randint(0, 101, n_rows)
    return pd.DataFrame({'score': scores})

df = create_df()

# Label each score by interval: C = [0, 50], B = (50, 80], A = (80, 100].
scores = df['score']
grade_intervals = (
    ('C', 0, 50, 'both'),
    ('B', 50, 80, 'right'),
    ('A', 80, 100, 'right'),
)
for label, low, high, closed in grade_intervals:
    # between() yields a boolean mask; loc writes the label only where it is True.
    df.loc[scores.between(low, high, inclusive=closed), 'grade'] = label
print(df['grade'].value_counts())

cut：按指定的区间边界分箱

import pandas as pd
import numpy as np

def create_df(n_rows=1000, seed=None):
    """Build a demo DataFrame of random integer scores.

    Args:
        n_rows: Number of rows to generate (default 1000).
        seed: Optional seed for a reproducible sample. When None, the
            global NumPy random state is used.

    Returns:
        A DataFrame with a single integer column ``score`` whose values
        lie in the closed range [0, 100].
    """
    # randint's upper bound is exclusive, so 101 makes a score of 100 possible.
    rng = np.random if seed is None else np.random.RandomState(seed)
    scores = rng.randint(0, 101, n_rows)
    return pd.DataFrame({'score': scores})

df = create_df()

# Bin edges and the label for each resulting interval, lowest first.
labels = ['C', 'B', 'A']
bins = [0, 50, 80, 100]

# include_lowest=True makes the first interval left-inclusive, so a score of 0 is binned too.
df['grade'] = pd.cut(df['score'], bins, labels=labels, include_lowest=True)
print(df['grade'].value_counts())

qcut：按分位数分箱，使每个箱内的样本数大致相等

import pandas as pd
import numpy as np

def create_df(n_rows=1000, seed=None):
    """Build a demo DataFrame of random integer scores.

    Args:
        n_rows: Number of rows to generate (default 1000).
        seed: Optional seed for a reproducible sample. When None, the
            global NumPy random state is used.

    Returns:
        A DataFrame with a single integer column ``score`` whose values
        lie in the closed range [0, 100].
    """
    # randint's upper bound is exclusive, so 101 makes a score of 100 possible.
    rng = np.random if seed is None else np.random.RandomState(seed)
    scores = rng.randint(0, 101, n_rows)
    return pd.DataFrame({'score': scores})

df = create_df()

# Quantile-based binning into three groups. With retbins=True, qcut also
# returns the bin edges it computed.
grades, cut_bin = pd.qcut(df['score'], 3, labels=['C', 'B', 'A'], retbins=True)
df['grade'] = grades
print(df['grade'].value_counts())
print(cut_bin)

value_counts：通过 bins 参数在计数的同时完成分箱

import pandas as pd
import numpy as np

def create_df(n_rows=1000, seed=None):
    """Build a demo DataFrame of random integer scores.

    Args:
        n_rows: Number of rows to generate (default 1000).
        seed: Optional seed for a reproducible sample. When None, the
            global NumPy random state is used.

    Returns:
        A DataFrame with a single integer column ``score`` whose values
        lie in the closed range [0, 100].
    """
    # randint's upper bound is exclusive, so 101 makes a score of 100 possible.
    rng = np.random if seed is None else np.random.RandomState(seed)
    scores = rng.randint(0, 101, n_rows)
    return pd.DataFrame({'score': scores})

df = create_df()

# value_counts(bins=...) groups the scores into intervals before counting:
# an int asks for that many bins, a list gives explicit bin edges.
# sort=False skips the sort by frequency.
for bin_spec in (3, [0, 50, 80, 100]):
    print(df['score'].value_counts(bins=bin_spec, sort=False))