1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
| import os
def stopwords_filter(base_path: str, output_path: str, stopwords_path: str) -> None:
    """Remove stopwords from every file in ``base_path`` and save the results.

    Each regular file directly inside ``base_path`` is read as UTF-8
    (undecodable bytes are ignored), its newlines are removed, and the
    remaining text is split on single spaces.  Tokens that appear in the
    stopword list are dropped; the surviving tokens are re-joined with single
    spaces and written, followed by one newline, to a file of the same name
    inside ``output_path``.

    Args:
        base_path: Directory holding the space-separated input files.
        output_path: Directory that receives the filtered files; it is
            created (including parents) when it does not exist yet.
        stopwords_path: UTF-8 text file with one stopword per line; blank
            lines are ignored.

    Raises:
        OSError: If the stopword file or ``base_path`` cannot be read, or an
            output file cannot be written.
    """
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(output_path, exist_ok=True)

    with open(stopwords_path, 'r', encoding='utf-8', errors='ignore') as stop_file:
        # A set gives O(1) membership tests.  The empty string is kept out on
        # purpose: empty tokens come from consecutive spaces in the input and
        # must survive filtering so the original spacing is reproduced.
        stopwords = {word for word in stop_file.read().split('\n') if word != ''}

    for file_name in os.listdir(base_path):
        # os.path.join works whether or not the directory arguments end with
        # a path separator; plain string concatenation required one.
        source_file = os.path.join(base_path, file_name)
        if not os.path.isfile(source_file):
            # Sub-directories (and other non-regular entries) cannot be
            # opened as text files, so they are skipped.
            continue

        with open(source_file, 'r', encoding='utf-8', errors='ignore') as src:
            # NOTE: newlines are removed, not replaced by a space, so words
            # on adjacent lines are only separated if a space precedes the
            # line break.
            wordlist = src.read().replace('\n', '').split(' ')

        text = ' '.join(word for word in wordlist if word not in stopwords)

        with open(os.path.join(output_path, file_name), 'w', encoding='utf-8') as dst:
            dst.write(text + '\n')
# Filter the files under D:/data_train_cut/ using the stopword list in
# D:/stopwords.txt and write the results to D:/data_train_clean/.
stopwords_filter(
    base_path="D:/data_train_cut/",
    output_path="D:/data_train_clean/",
    stopwords_path="D:/stopwords.txt",
)
|