# 时间序列预处理之缺失值填充
# (Time-series preprocessing: filling missing values)

import pandas as pd
import copy

# Load the CSV; the 'Date' column is parsed as datetimes and used as the index.
ts = pd.read_csv(
    'daily-min-temperatures.csv',
    index_col='Date',
    parse_dates=['Date'],
)
# Preview the first rows of the loaded frame.
print(ts.head())

def ts_fillna(data, date_col, val_col, *, method='linear', fill_value=0):
    """Insert the missing calendar days of a daily series and fill their values.

    The frame is expanded so that it contains a row for every day between its
    earliest and latest date (an outer merge against a full daily range), and
    the missing entries of ``val_col`` are then filled with the chosen
    strategy.

    Args:
        data: DataFrame holding the series. The dates may be the index (named
            ``date_col``) or an ordinary column called ``date_col``. The
            input frame is not modified.
        date_col: Name of the date index level / column.
        val_col: Name of the column whose missing values are filled.
        method: Fill strategy. ``'linear'`` (default) interpolates linearly
            between neighbouring rows, ``'ffill'`` propagates the previous
            value forward, ``'bfill'`` propagates the next value backward,
            ``'constant'`` writes ``fill_value``.
        fill_value: Replacement used when ``method='constant'``.

    Returns:
        A new DataFrame indexed by ``date_col`` (converted to datetimes) and
        sorted by date. Columns other than ``val_col`` are not filled on the
        inserted rows.

    Raises:
        ValueError: If ``method`` is not one of the supported strategies.
        KeyError: If ``date_col`` is neither a column nor an index level name.
    """
    strategies = ('linear', 'ffill', 'bfill', 'constant')
    if method not in strategies:
        raise ValueError(
            f"method must be one of {strategies}, got {method!r}"
        )

    # Normalise the input to a fresh frame with the dates as a plain column,
    # so the rest of the function never touches the caller's object.
    if date_col in data.columns:
        frame = data.copy()
    elif date_col in data.index.names:
        frame = data.reset_index()
    else:
        raise KeyError(date_col)

    # Convert before merging: both merge keys must share a datetime dtype.
    frame[date_col] = pd.to_datetime(frame[date_col])

    # Nothing to expand for an empty frame (date_range cannot start at NaT).
    if frame.empty:
        return frame.set_index(date_col)

    # Helper frame with every day between the first and last observation.
    calendar = pd.DataFrame(
        {date_col: pd.date_range(frame[date_col].min(), frame[date_col].max())}
    )
    # Outer merge adds one row per missing day, with missing values.
    expanded = frame.merge(calendar, on=date_col, how='outer')
    expanded = expanded.sort_values(date_col)

    values = expanded[val_col]
    if method == 'linear':
        values = values.interpolate(method='linear')
    elif method == 'ffill':
        # .ffill()/.bfill() instead of fillna(method=...), which newer
        # pandas releases deprecate.
        values = values.ffill()
    elif method == 'bfill':
        values = values.bfill()
    else:
        values = values.fillna(fill_value)
    expanded[val_col] = values

    # Rebuild the date index.
    return expanded.set_index(date_col)

# Fill the 'Temp' column of the loaded frame, keyed by 'Date', and preview the result.
ts2 = ts_fillna(data=ts, date_col='Date', val_col='Temp')
print(ts2.head())