Pandas循环提速攻略

一、标准循环————101.6秒

DataFrame是具有行和列的Pandas对象。如果使用标准循环,就需要按位置逐行遍历整个对象;此时Python无法利用任何内置的优化函数,因此速度非常慢。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# coding:utf-8
import numpy as np
import pandas as pd
import random
from time import time
import warnings; warnings.filterwarnings("ignore")

# Synthetic fixture: 80000 matches with three columns. The integer grid only
# fixes the frame's shape; every column is then replaced by random labels.
data = np.arange(240000).reshape(80000, 3)
df = pd.DataFrame(data, columns=['HomeTeam', 'AwayTeam', 'FTR'])
df['HomeTeam'] = [random.choice(['Arsenal', 'non-Arsenal']) for _ in range(len(df))]
df['AwayTeam'] = [random.choice(['Arsenal', 'non-Arsenal']) for _ in range(len(df))]
# FTR is 'D' when the random draw exceeds 0.5, otherwise 'non-D'.
df['FTR'] = ['non-D' if random.random() <= 0.5 else 'D' for _ in range(len(df))]

def soc_loop(df, TEAM):
    """Label every match in ``df`` by whether ``TEAM`` drew it.

    A ``'Draws'`` column is added to ``df`` in place and filled row by row
    with a plain Python loop:

    * ``'Draw'``    -- TEAM is the home or away side and ``FTR == 'D'``
    * ``'No_Draw'`` -- TEAM is the home or away side and ``FTR != 'D'``
    * ``'No_Game'`` -- TEAM is neither the home nor the away side

    Args:
        df: DataFrame with ``HomeTeam``, ``AwayTeam`` and ``FTR`` columns.
        TEAM: Team name to look for.

    Returns:
        The same ``df`` object, now carrying the ``'Draws'`` column.
    """
    # Start from the fallback label so the column holds strings from the
    # outset instead of an integer placeholder that is later overwritten.
    df['Draws'] = 'No_Game'
    draws_col = df.columns.get_loc('Draws')
    for row in range(len(df)):
        played = (df['HomeTeam'].iloc[row] == TEAM
                  or df['AwayTeam'].iloc[row] == TEAM)
        if not played:
            continue
        # Write through one positional indexer on the frame itself; a chained
        # df['Draws'].iloc[row] = ... may assign to a temporary copy.
        df.iloc[row, draws_col] = (
            'Draw' if df['FTR'].iloc[row] == 'D' else 'No_Draw'
        )
    return df

# Wall-clock timing of one full soc_loop pass over df.
started = time()
soc_loop(df, 'Arsenal')
elapsed = time() - started
print(elapsed)

二、使用Pandas内置函数:iterrows()————6.8秒,快15倍

在第一个示例中,循环按位置逐行访问了整个DataFrame。iterrows()是Pandas内置的行迭代器:它以(索引, Series)对的形式逐行遍历DataFrame,每次返回该行的索引,以及一个包含该行各列取值的Series,之后即可按列名取出感兴趣的列。这使得它比标准循环更快。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# coding:utf-8
import numpy as np
import pandas as pd
import random
from time import time
import warnings; warnings.filterwarnings("ignore")

# Synthetic fixture: 80000 matches with three columns. The integer grid only
# fixes the frame's shape; every column is then replaced by random labels.
data = np.arange(240000).reshape(80000, 3)
df = pd.DataFrame(data, columns=['HomeTeam', 'AwayTeam', 'FTR'])
df['HomeTeam'] = [random.choice(['Arsenal', 'non-Arsenal']) for _ in range(len(df))]
df['AwayTeam'] = [random.choice(['Arsenal', 'non-Arsenal']) for _ in range(len(df))]
# FTR is 'D' when the random draw exceeds 0.5, otherwise 'non-D'.
df['FTR'] = ['non-D' if random.random() <= 0.5 else 'D' for _ in range(len(df))]

def soc_iter(TEAM, HomeTeam, AwayTeam, FTR):
    """Classify a single match from ``TEAM``'s point of view.

    Args:
        TEAM: Team name to look for.
        HomeTeam: Home side of the match.
        AwayTeam: Away side of the match.
        FTR: Result label of the match; ``'D'`` is treated as a draw.

    Returns:
        ``'Draw'`` if TEAM is the home or away side and ``FTR == 'D'``,
        ``'No_Draw'`` if TEAM is the home or away side and ``FTR != 'D'``,
        ``'No_Game'`` if TEAM is neither side.
    """
    # Plain boolean tests on the scalars: wrapping a condition in a list
    # literal makes it always truthy, so every match would come out 'Draw'.
    if HomeTeam != TEAM and AwayTeam != TEAM:
        return 'No_Game'
    return 'Draw' if FTR == 'D' else 'No_Draw'

# Time a row-by-row pass over df.iterrows(), collecting one label per row
# and storing the labels as the 'Draws' column.
started = time()
draw_series = [
    soc_iter('Arsenal', match['HomeTeam'], match['AwayTeam'], match['FTR'])
    for _, match in df.iterrows()
]
df['Draws'] = draw_series
elapsed = time() - started
print(elapsed)

三、apply()方法————1.7秒,快59.7倍

apply()本身并不快,但与DataFrame结合使用时具有优势:指定axis=1后,它会把函数依次应用到每一行,无需手写循环。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# coding:utf-8
import numpy as np
import pandas as pd
import random
from time import time
import warnings; warnings.filterwarnings("ignore")

# Synthetic fixture: 80000 matches with three columns. The integer grid only
# fixes the frame's shape; every column is then replaced by random labels.
data = np.arange(240000).reshape(80000, 3)
df = pd.DataFrame(data, columns=['HomeTeam', 'AwayTeam', 'FTR'])
df['HomeTeam'] = [random.choice(['Arsenal', 'non-Arsenal']) for _ in range(len(df))]
df['AwayTeam'] = [random.choice(['Arsenal', 'non-Arsenal']) for _ in range(len(df))]
# FTR is 'D' when the random draw exceeds 0.5, otherwise 'non-D'.
df['FTR'] = ['non-D' if random.random() <= 0.5 else 'D' for _ in range(len(df))]

def soc_iter(TEAM, HomeTeam, AwayTeam, FTR):
    """Classify a single match from ``TEAM``'s point of view.

    Args:
        TEAM: Team name to look for.
        HomeTeam: Home side of the match.
        AwayTeam: Away side of the match.
        FTR: Result label of the match; ``'D'`` is treated as a draw.

    Returns:
        ``'Draw'`` if TEAM is the home or away side and ``FTR == 'D'``,
        ``'No_Draw'`` if TEAM is the home or away side and ``FTR != 'D'``,
        ``'No_Game'`` if TEAM is neither side.
    """
    # Plain boolean tests on the scalars: wrapping a condition in a list
    # literal makes it always truthy, so every match would come out 'Draw'.
    if HomeTeam != TEAM and AwayTeam != TEAM:
        return 'No_Game'
    return 'Draw' if FTR == 'D' else 'No_Draw'

# Time the apply() variant: DataFrame.apply with axis=1 hands each row to the
# lambda as a Series, replacing the explicit iterrows() loop.
begin = time()
draw_series = df.apply(
    lambda row: soc_iter('Arsenal', row['HomeTeam'], row['AwayTeam'], row['FTR']),
    axis=1,
)
df['Draws'] = draw_series
end = time()
print(end - begin)

四、Pandas向量化————0.04秒,快2540倍

可以利用向量化的优点来创建非常快的代码。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# coding:utf-8
import numpy as np
import pandas as pd
import random
from time import time
import warnings; warnings.filterwarnings("ignore")

# Synthetic fixture: 80000 matches with three columns. The integer grid only
# fixes the frame's shape; every column is then replaced by random labels.
data = np.arange(240000).reshape(80000, 3)
df = pd.DataFrame(data, columns=['HomeTeam', 'AwayTeam', 'FTR'])
df['HomeTeam'] = [random.choice(['Arsenal', 'non-Arsenal']) for _ in range(len(df))]
df['AwayTeam'] = [random.choice(['Arsenal', 'non-Arsenal']) for _ in range(len(df))]
# FTR is 'D' when the random draw exceeds 0.5, otherwise 'non-D'.
df['FTR'] = ['non-D' if random.random() <= 0.5 else 'D' for _ in range(len(df))]


def soc_iter(TEAM, HomeTeam, AwayTeam, FTR):
    """Classify all matches at once using boolean masks on pandas Series.

    Args:
        TEAM: Team name to look for.
        HomeTeam: Series of home sides.
        AwayTeam: Series of away sides, indexed like ``HomeTeam``.
        FTR: Series of result labels, indexed like ``HomeTeam``; ``'D'`` is
            treated as a draw.

    Returns:
        A Series indexed like ``HomeTeam`` holding ``'Draw'``, ``'No_Draw'``
        or ``'No_Game'`` for each match.
    """
    played = (HomeTeam == TEAM) | (AwayTeam == TEAM)
    drawn = FTR == 'D'
    # Build and return the labels instead of writing into a module-level
    # frame and returning None, so the caller can assign the result directly.
    draws = pd.Series('No_Game', index=HomeTeam.index)
    draws[played & drawn] = 'Draw'
    draws[played & ~drawn] = 'No_Draw'
    return draws

# Time a single call on whole columns; whatever soc_iter returns is stored
# as the 'Draws' column.
started = time()
df['Draws'] = soc_iter('Arsenal', df['HomeTeam'], df['AwayTeam'], df['FTR'])
elapsed = time() - started
print(elapsed)

五、Numpy向量化————0.02秒,快5080倍

在Series后面添加.values,可以得到其底层的Numpy数组,再对数组做向量化运算。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# coding:utf-8
import numpy as np
import pandas as pd
import random
from time import time
import warnings; warnings.filterwarnings("ignore")

# Synthetic fixture: 80000 matches with three columns. The integer grid only
# fixes the frame's shape; every column is then replaced by random labels.
data = np.arange(240000).reshape(80000, 3)
df = pd.DataFrame(data, columns=['HomeTeam', 'AwayTeam', 'FTR'])
df['HomeTeam'] = [random.choice(['Arsenal', 'non-Arsenal']) for _ in range(len(df))]
df['AwayTeam'] = [random.choice(['Arsenal', 'non-Arsenal']) for _ in range(len(df))]
# FTR is 'D' when the random draw exceeds 0.5, otherwise 'non-D'.
df['FTR'] = ['non-D' if random.random() <= 0.5 else 'D' for _ in range(len(df))]


def soc_iter(TEAM, HomeTeam, AwayTeam, FTR):
    """Classify all matches at once using boolean masks on NumPy arrays.

    Args:
        TEAM: Team name to look for.
        HomeTeam: Array of home sides.
        AwayTeam: Array of away sides, same length as ``HomeTeam``.
        FTR: Array of result labels, same length as ``HomeTeam``; ``'D'`` is
            treated as a draw.

    Returns:
        A NumPy array holding ``'Draw'``, ``'No_Draw'`` or ``'No_Game'`` for
        each match, in input order.
    """
    played = (HomeTeam == TEAM) | (AwayTeam == TEAM)
    drawn = FTR == 'D'
    # Build and return the labels instead of writing into a module-level
    # frame and returning None, so the caller can assign the result directly.
    return np.where(played, np.where(drawn, 'Draw', 'No_Draw'), 'No_Game')

# Time a single call on the columns' .values; whatever soc_iter returns is
# stored as the 'Draws' column.
started = time()
df['Draws'] = soc_iter(
    'Arsenal',
    df['HomeTeam'].values,
    df['AwayTeam'].values,
    df['FTR'].values,
)
elapsed = time() - started
print(elapsed)