文本预处理
过滤标点、符号和数字，去掉上标和变音符号（NFKD 规范化后删除组合标记），转换为小写，非 ASCII 字符两侧用空格隔开，同一字符连续重复 3 次及以上的只保留 1 个，去掉指定单词（fword_list）中间的空格。
import unicodedata

import regex as re

# Patterns are written as u"..." literals with doubled backslashes (not
# ur"...") so the module parses on both Python 2 and Python 3 while the
# patterns stay unicode (the `regex` module then applies Unicode matching).
_NOISE_TO_SPACE_RE = re.compile(u"\\p{P}+|\\p{Z}+|\\p{S}+|\\p{N}+")
_MARKS_RE = re.compile(u"\\p{M}+")
_NOISE_TO_DROP_RE = re.compile(
    u"\\p{P}+|\\p{S}+|\\p{N}+|\\p{Cs}+|\\p{Cf}+|\\p{Co}+")
_ASCII_LETTERS_RE = re.compile(u"[A-Za-z]+")
_NON_ASCII_RE = re.compile(u"([^\\x00-\\x7f])")
_REPEATED_CHAR_RE = re.compile(u"(\\w)\\1{2,}")
_WHITESPACE_RE = re.compile(u"\\s+")


def _fword_pattern(fword):
    """Build a regex matching *fword* with repeated letters and inner spaces.

    For "abc" the result is ``a+\\s*b+\\s*c+``.  Every character is escaped
    so that regex metacharacters in the word are matched literally.
    """
    return u"\\s*".join(re.escape(ch) + u"+" for ch in fword)


def process(text):
    """Normalize a piece of text for downstream processing.

    Steps, in order:
      1. Replace runs of punctuation, separators, symbols and numbers with
         a single space.
      2. Apply NFKD normalization and delete combining marks.
      3. Delete any punctuation, symbols, numbers, surrogates, format and
         private-use characters still present after normalization.
      4. Lower-case ASCII letters.
      5. Surround every non-ASCII character with spaces.
      6. Collapse a word character repeated three or more times in a row
         into a single occurrence.
      7. Collapse whitespace runs into one space.
      8. For every entry of the module-level ``fword_list`` (defined outside
         this block), rewrite occurrences of that word -- possibly written
         with repeated letters or spaces between its letters -- as the plain
         word surrounded by spaces, then collapse whitespace again.

    Leading/trailing spaces are not stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.  Processing is best-effort: if any step raises
        (for example because ``text`` is not a string), ``text`` is returned
        as it stood when the error occurred -- the unmodified input if the
        very first step failed.
    """
    try:
        text = _NOISE_TO_SPACE_RE.sub(u" ", text)
        text = unicodedata.normalize("NFKD", text)
        text = _MARKS_RE.sub(u"", text)
        text = _NOISE_TO_DROP_RE.sub(u"", text)
        text = _ASCII_LETTERS_RE.sub(lambda m: m.group(0).lower(), text)
        text = _NON_ASCII_RE.sub(u" \\1 ", text)
        text = _REPEATED_CHAR_RE.sub(u"\\1", text)
        text = _WHITESPACE_RE.sub(u" ", text)
        for fword in fword_list:
            if not fword:
                # An empty pattern matches at every position and would put
                # a space between all characters of the text.
                continue
            replacement = u" " + fword + u" "
            # A callable replacement keeps backslashes in the word literal
            # instead of having them parsed as a substitution template.
            text = re.sub(_fword_pattern(fword),
                          lambda _m, rep=replacement: rep,
                          text)
        text = _WHITESPACE_RE.sub(u" ", text)
        return text
    except Exception:
        # Deliberate best-effort: a value that cannot be cleaned is passed
        # through rather than aborting the whole DataFrame.apply. Unlike a
        # bare ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        return text


df['text'] = df['text'].apply(process)