# 文本预处理工具之特殊字符过滤 (Text preprocessing utility: filtering special characters)

1
2
3
4
5
6
7
8
9
import re

# Matches any single character that is NOT a CJK ideograph in U+4E00-U+9FA5,
# an ASCII letter, or an ASCII digit. Compiled once at import time.
# Inside a character class only a leading '^' negates; any later '^' would be
# a literal member of the set, so exactly one is used here.
_DISALLOWED_CHAR_RE = re.compile(r"[^\u4e00-\u9fa5a-zA-Z0-9]")


def filter_string(des_string, re_string=''):
    """Filter out every character except Chinese, English letters and digits.

    Args:
        des_string: The text to clean.
        re_string: Replacement substituted for each removed character.
            Defaults to the empty string, which simply drops the character.
            It is handed to ``re.sub`` as the replacement template, so
            backslash escapes inside it are processed by ``re``.

    Returns:
        The cleaned string. Disallowed characters are replaced one by one,
        so a run of N disallowed characters yields N copies of ``re_string``.
    """
    return _DISALLOWED_CHAR_RE.sub(re_string, des_string)

# Demo: run the filter on a sample social-media caption, substituting a space
# for each character that gets removed, and print the result.
s = (
    'Reply to @ray.ya.bae There is many more colors but the time ran out! '
    '😅 link in bio! 🌟 #fyp #foryou #MyStyle"'
)
print(filter_string(s, re_string=' '))