文本预处理之停用词过滤

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import os

def stopwords_filter(base_path, output_path, stopwords_path):
    """Strip stop words from every file in ``base_path``.

    Each regular file directly inside ``base_path`` is read as UTF-8
    (undecodable bytes are ignored), its newline characters are removed and
    the remaining text is split on single spaces. Tokens found in the
    stop-word list are dropped; the surviving tokens are joined with single
    spaces, terminated with one newline, and written to a file of the same
    name inside ``output_path``.

    Args:
        base_path: Directory containing the space-separated input files.
            A trailing path separator is optional.
        output_path: Directory that receives the filtered files. It is
            created (including parents) when it does not exist.
        stopwords_path: UTF-8 text file holding one stop word per line.

    Raises:
        OSError: If the stop-word file or an input file cannot be read, or
            an output file cannot be written.
    """
    os.makedirs(output_path, exist_ok=True)

    # The stop-word file is newline-separated; blank lines are ignored.
    # (A space-separated list would instead be read with
    # ``.read().replace('\n', '').split(' ')``.)
    # A set gives constant-time membership tests in the filtering loop below.
    with open(stopwords_path, 'r', encoding='utf-8', errors='ignore') as handle:
        stopwords = {word for word in handle.read().split('\n') if word != ''}

    for file_name in os.listdir(base_path):
        source_file = os.path.join(base_path, file_name)
        # os.listdir also yields sub-directories, which cannot be opened.
        if not os.path.isfile(source_file):
            continue

        with open(source_file, 'r', encoding='utf-8', errors='ignore') as handle:
            # NOTE(review): newlines are removed, not replaced by a space, so
            # the last token of one line and the first token of the next are
            # glued together unless the line ends with a space. Kept as-is to
            # preserve the existing output -- confirm this is intended.
            words = handle.read().replace('\n', '').split(' ')

        kept = [word for word in words if word not in stopwords]

        target_file = os.path.join(output_path, file_name)
        with open(target_file, 'w', encoding='utf-8') as handle:
            handle.write(' '.join(kept) + '\n')

# Run the filter once with the hard-coded input, output and stop-word paths.
stopwords_filter(
    base_path="D:/data_train_cut/",
    output_path="D:/data_train_clean/",
    stopwords_path="D:/stopwords.txt",
)