一、标准循环————101.6秒
DataFrame是具有行和列的Pandas对象。如果使用循环,就需要逐行遍历整个对象。这种写法无法利用Pandas或NumPy内置的向量化函数,因此速度很慢。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
| import numpy as np import pandas as pd import random from time import time import warnings; warnings.filterwarnings("ignore")
# Build an 80000 x 3 integer frame, then replace every column with random
# labels: each team column gets 'Arsenal' / 'non-Arsenal', and the result
# column FTR gets 'D' (draw) / 'non-D'.
data = np.arange(240000).reshape(80000, 3)
df = pd.DataFrame(data, columns=['HomeTeam', 'AwayTeam', 'FTR'])
for side in ('HomeTeam', 'AwayTeam'):
    df[side] = df[side].apply(
        lambda x: random.choice(['Arsenal', 'non-Arsenal'])
    )
df['FTR'] = df['FTR'].apply(
    lambda x: 'D' if random.random() > 0.5 else 'non-D'
)
def soc_loop(df, TEAM):
    """Label each match as a draw, non-draw, or not involving TEAM.

    Adds (or overwrites) a ``Draws`` column on ``df`` in place:

    * ``'Draw'``    - TEAM is the home or away side and ``FTR == 'D'``
    * ``'No_Draw'`` - TEAM is the home or away side and ``FTR != 'D'``
    * ``'No_Game'`` - TEAM is neither side

    The rows are visited one at a time on purpose: this function is the
    plain-loop baseline that the faster variants are compared against.

    Args:
        df: DataFrame with ``HomeTeam``, ``AwayTeam`` and ``FTR`` columns.
        TEAM: Team name to look for.

    Returns:
        The same ``df`` object, with the ``Draws`` column filled in.
    """
    # Start from the fall-through label so the loop only touches rows that
    # involve TEAM, and the column holds strings from the outset.
    df['Draws'] = 'No_Game'
    draws_pos = df.columns.get_loc('Draws')

    for row in range(len(df)):
        home = df['HomeTeam'].iloc[row]
        away = df['AwayTeam'].iloc[row]
        if home == TEAM or away == TEAM:
            drawn = df['FTR'].iloc[row] == 'D'
            # Write through df.iloc in a single indexing step; the chained
            # form df['Draws'].iloc[row] = ... assigns to a temporary Series
            # and may never reach df.
            df.iloc[row, draws_pos] = 'Draw' if drawn else 'No_Draw'
    return df
# Time one full pass of the row-by-row baseline over the frame and print
# the elapsed wall-clock seconds.
begin = time()
soc_loop(df, 'Arsenal')
end = time()
print(end - begin)
|
二、使用Pandas内置函数:iterrows()————6.8秒,快15倍
在第一个示例中,我们用整数位置逐行索引了整个DataFrame。iterrows()则以(索引, Series)对的形式逐行遍历DataFrame:每一行作为一个Series返回,可以直接按列名取出感兴趣的列。这使得它比标准循环更快。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
| import numpy as np import pandas as pd import random from time import time import warnings; warnings.filterwarnings("ignore")
# Build an 80000 x 3 integer frame, then replace every column with random
# labels: each team column gets 'Arsenal' / 'non-Arsenal', and the result
# column FTR gets 'D' (draw) / 'non-D'.
data = np.arange(240000).reshape(80000, 3)
df = pd.DataFrame(data, columns=['HomeTeam', 'AwayTeam', 'FTR'])
for side in ('HomeTeam', 'AwayTeam'):
    df[side] = df[side].apply(
        lambda x: random.choice(['Arsenal', 'non-Arsenal'])
    )
df['FTR'] = df['FTR'].apply(
    lambda x: 'D' if random.random() > 0.5 else 'non-D'
)
def soc_iter(TEAM, HomeTeam, AwayTeam, FTR):
    """Classify a single match from TEAM's point of view.

    Args:
        TEAM: Team name to look for.
        HomeTeam: Home side of the match.
        AwayTeam: Away side of the match.
        FTR: Result label of the match; ``'D'`` means a draw.

    Returns:
        ``'Draw'`` if TEAM played and the result is ``'D'``, ``'No_Draw'``
        if TEAM played with any other result, ``'No_Game'`` if TEAM is
        neither the home nor the away side.
    """
    # The conditions must be tested directly: wrapping them in a list
    # literal (``if [cond]:``) is always truthy and labels every row 'Draw'.
    if HomeTeam != TEAM and AwayTeam != TEAM:
        return 'No_Game'
    return 'Draw' if FTR == 'D' else 'No_Draw'
# Time the iterrows() variant: classify each row, collect the labels in a
# list, and attach the list as the 'Draws' column.
begin = time()
draw_series = [
    soc_iter('Arsenal', row['HomeTeam'], row['AwayTeam'], row['FTR'])
    for _, row in df.iterrows()
]
df['Draws'] = draw_series
end = time()
print(end - begin)
|
三、apply()方法————1.7秒,快59.7倍
apply()本身并不快,但与DataFrame结合使用时具有优势:传入axis=1后,它会对每一行调用给定的函数,无需显式编写循环。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
| import numpy as np import pandas as pd import random from time import time import warnings; warnings.filterwarnings("ignore")
# Build an 80000 x 3 integer frame, then replace every column with random
# labels: each team column gets 'Arsenal' / 'non-Arsenal', and the result
# column FTR gets 'D' (draw) / 'non-D'.
data = np.arange(240000).reshape(80000, 3)
df = pd.DataFrame(data, columns=['HomeTeam', 'AwayTeam', 'FTR'])
for side in ('HomeTeam', 'AwayTeam'):
    df[side] = df[side].apply(
        lambda x: random.choice(['Arsenal', 'non-Arsenal'])
    )
df['FTR'] = df['FTR'].apply(
    lambda x: 'D' if random.random() > 0.5 else 'non-D'
)
def soc_iter(TEAM, HomeTeam, AwayTeam, FTR):
    """Classify a single match from TEAM's point of view.

    Args:
        TEAM: Team name to look for.
        HomeTeam: Home side of the match.
        AwayTeam: Away side of the match.
        FTR: Result label of the match; ``'D'`` means a draw.

    Returns:
        ``'Draw'`` if TEAM played and the result is ``'D'``, ``'No_Draw'``
        if TEAM played with any other result, ``'No_Game'`` if TEAM is
        neither the home nor the away side.
    """
    # The conditions must be tested directly: wrapping them in a list
    # literal (``if [cond]:``) is always truthy and labels every row 'Draw'.
    if HomeTeam != TEAM and AwayTeam != TEAM:
        return 'No_Game'
    return 'Draw' if FTR == 'D' else 'No_Draw'
# Time the apply() variant. axis=1 makes DataFrame.apply call the lambda
# once per row, passing that row as a Series; the returned Series of labels
# becomes the 'Draws' column.
begin = time()
df['Draws'] = df.apply(
    lambda row: soc_iter(
        'Arsenal', row['HomeTeam'], row['AwayTeam'], row['FTR']
    ),
    axis=1,
)
end = time()
print(end - begin)
|
三、Pandas向量化————0.04秒,快2540倍
可以利用向量化的优点来创建非常快的代码。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
| import numpy as np import pandas as pd import random from time import time import warnings; warnings.filterwarnings("ignore")
# Build an 80000 x 3 integer frame, then replace every column with random
# labels: each team column gets 'Arsenal' / 'non-Arsenal', and the result
# column FTR gets 'D' (draw) / 'non-D'.
data = np.arange(240000).reshape(80000, 3)
df = pd.DataFrame(data, columns=['HomeTeam', 'AwayTeam', 'FTR'])
for side in ('HomeTeam', 'AwayTeam'):
    df[side] = df[side].apply(
        lambda x: random.choice(['Arsenal', 'non-Arsenal'])
    )
df['FTR'] = df['FTR'].apply(
    lambda x: 'D' if random.random() > 0.5 else 'non-D'
)
def soc_iter(TEAM, HomeTeam, AwayTeam, FTR):
    """Label every match at once from TEAM's point of view.

    Fills the ``Draws`` column of the module-level ``df`` using boolean
    masks and returns that column, so that both a bare call and
    ``df['Draws'] = soc_iter(...)`` leave the labels in place.

    Args:
        TEAM: Team name to look for.
        HomeTeam: Column of home sides, one entry per row of ``df``.
        AwayTeam: Column of away sides, one entry per row of ``df``.
        FTR: Column of result labels; ``'D'`` means a draw.

    Returns:
        The ``Draws`` column of ``df``: ``'Draw'`` / ``'No_Draw'`` for rows
        where TEAM is the home or away side, ``'No_Game'`` otherwise.
    """
    involved = (HomeTeam == TEAM) | (AwayTeam == TEAM)
    drawn = FTR == 'D'

    df['Draws'] = 'No_Game'
    df.loc[involved & drawn, 'Draws'] = 'Draw'
    df.loc[involved & ~drawn, 'Draws'] = 'No_Draw'
    # Return the column: without this the function yields None, and a caller
    # that assigns the result to df['Draws'] would erase the labels.
    return df['Draws']
# Time the pandas-vectorized variant. soc_iter writes the 'Draws' column
# into df itself, so its return value is deliberately not assigned back:
# doing so would overwrite the column whenever the function returns None.
begin = time()
soc_iter('Arsenal', df['HomeTeam'], df['AwayTeam'], df['FTR'])
end = time()
print(end - begin)
|
四、Numpy向量化————0.02秒,快5080倍
通过在列后添加.values,可以得到底层的Numpy数组;直接在数组上做比较,可以省去Pandas索引对齐等额外开销。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
| import numpy as np import pandas as pd import random from time import time import warnings; warnings.filterwarnings("ignore")
# Build an 80000 x 3 integer frame, then replace every column with random
# labels: each team column gets 'Arsenal' / 'non-Arsenal', and the result
# column FTR gets 'D' (draw) / 'non-D'.
data = np.arange(240000).reshape(80000, 3)
df = pd.DataFrame(data, columns=['HomeTeam', 'AwayTeam', 'FTR'])
for side in ('HomeTeam', 'AwayTeam'):
    df[side] = df[side].apply(
        lambda x: random.choice(['Arsenal', 'non-Arsenal'])
    )
df['FTR'] = df['FTR'].apply(
    lambda x: 'D' if random.random() > 0.5 else 'non-D'
)
def soc_iter(TEAM, HomeTeam, AwayTeam, FTR):
    """Label every match at once from TEAM's point of view.

    Fills the ``Draws`` column of the module-level ``df`` using boolean
    masks and returns that column, so that both a bare call and
    ``df['Draws'] = soc_iter(...)`` leave the labels in place.

    Args:
        TEAM: Team name to look for.
        HomeTeam: Array of home sides, one entry per row of ``df``.
        AwayTeam: Array of away sides, one entry per row of ``df``.
        FTR: Array of result labels; ``'D'`` means a draw.

    Returns:
        The ``Draws`` column of ``df``: ``'Draw'`` / ``'No_Draw'`` for rows
        where TEAM is the home or away side, ``'No_Game'`` otherwise.
    """
    involved = (HomeTeam == TEAM) | (AwayTeam == TEAM)
    drawn = FTR == 'D'

    df['Draws'] = 'No_Game'
    df.loc[involved & drawn, 'Draws'] = 'Draw'
    df.loc[involved & ~drawn, 'Draws'] = 'No_Draw'
    # Return the column: without this the function yields None, and a caller
    # that assigns the result to df['Draws'] would erase the labels.
    return df['Draws']
# Time the NumPy-vectorized variant: .values hands soc_iter the columns'
# underlying arrays instead of Series. soc_iter writes the 'Draws' column
# into df itself, so its return value is deliberately not assigned back:
# doing so would overwrite the column whenever the function returns None.
begin = time()
soc_iter(
    'Arsenal',
    df['HomeTeam'].values,
    df['AwayTeam'].values,
    df['FTR'].values,
)
end = time()
print(end - begin)
|