cpuwdl 发布的文章

合并词向量预训练模型和自训练模型

合并预训练词向量模型和自训练词向量模型:单词优先使用预训练向量,缺失时再使用自训练向量。其中的 tokenizer 和 max_features 来自上一篇文章(即下文“使用keras.preprocessing进行文本序列化”一节)。

import numpy as np
import codecs
# Number of components expected in every word vector.
embed_size = 300

# Pre-trained vectors; consulted first when filling the embedding matrix.
EMBEDDING_FILE = '../glove.6B/crawl-300d-2M.vec'
# Alternative pre-trained file:
#EMBEDDING_FILE=INPUT+'glove.6B/glove.6B.300d.txt'

# Self-trained vectors; fallback for words missing from EMBEDDING_FILE.
EMBEDDING_TRAIN = '../glove.6B/vectors_train.txt'

# Running count of lines handed to get_coefs (reset before each file).
cn = 0
def get_coefs(word, *arr):
    """Turn one parsed embedding-file line into a ``(word, vector)`` pair.

    Args:
        word: First whitespace-separated token of the line.
        *arr: Remaining tokens, expected to be ``embed_size`` numeric strings.

    Returns:
        ``(word, vector)`` where ``vector`` is a float32 array of length
        ``embed_size``.  A line with the wrong number of components (for
        example a "count dim" header line) or with a non-numeric component
        yields an all-zero vector instead.

    Side effects:
        Increments the module-level counter ``cn`` on every call.
    """
    global cn
    cn += 1
    try:
        dict_v = np.asarray(arr, dtype='float32')
    except ValueError:
        # A component is not a number (e.g. the token itself contained a
        # space); treat the line as malformed rather than aborting the load.
        dict_v = None
    if dict_v is None or len(dict_v) != embed_size:
        # Same dtype as the regular vectors so stacking them does not upcast.
        dict_v = np.zeros(embed_size, dtype='float32')
    return word, dict_v
def _load_embeddings(path, encoding=None):
    """Parse an embedding text file into a ``{word: vector}`` dict.

    Every non-blank line is split on whitespace and passed to ``get_coefs``.
    The file is streamed line by line (no full in-memory copy of the text)
    and is closed even when parsing raises.

    Args:
        path: Path of the embedding file.
        encoding: Passed to ``codecs.open``; ``None`` keeps the platform
            default handling, exactly like ``codecs.open(path)``.
    """
    with codecs.open(path, 'r', encoding) as handle:
        return dict(get_coefs(*line.split()) for line in handle if line.strip())


# Pre-trained vectors.  ``cn`` is incremented by get_coefs, so after each load
# it holds the number of lines parsed from that file.
# NOTE(review): this file is read without an explicit encoding while the
# self-trained one is read as UTF-8; confirm the key types match the
# tokenizer's words.
embeddings_index = _load_embeddings(EMBEDDING_FILE)
print(cn)

# Self-trained vectors.
cn = 0
embeddings_index_train = _load_embeddings(EMBEDDING_TRAIN, 'utf-8')
print(cn)

# Mean / std over every component of the pre-trained vectors; used to draw
# random rows for words that have no vector in either file.
# list(...) is required because np.stack needs a real sequence, not a dict view.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
print('%s %s' % (emb_mean, emb_std))

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

# NOTE(review): if tokenizer indices start at 1 (presumably the Keras
# convention - confirm), row 0 keeps its random value and the word whose
# index equals nb_words is skipped.
novector = 0
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    embedding_vector_train = embeddings_index_train.get(word)
    if embedding_vector is not None:
        # Pre-trained vector wins when both sources know the word.
        embedding_matrix[i] = embedding_vector
    elif embedding_vector_train is not None:
        embedding_matrix[i] = embedding_vector_train
    else:
        # Unknown to both sources: keep the random row and report the word.
        print(word)
        novector += 1

其中的自训练模型如下。

# Write the "text" column of train + test, one row per line, as the corpus
# file used to train the vectors.  A space is used as both separator and
# quote character.
full_df = pd.concat([train, test])
full_df.to_csv(
    'text.txt',
    columns=['text'],
    header=False,
    index=False,
    sep=' ',
    quotechar=' ',
    encoding='utf-8'
)

"""
The code above writes the training corpus to 'text.txt'.
Download GloVe from:
https://nlp.stanford.edu/projects/glove/

GloVe-1.2.zip
demo.sh:

CORPUS=text.txt
VOCAB_FILE=vocab.txt
COOCCURRENCE_FILE=cooccurrence.bin
COOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin
BUILDDIR=build
SAVE_FILE=vectors_train
VERBOSE=2
MEMORY=4.0
VOCAB_MIN_COUNT=2
VECTOR_SIZE=300
MAX_ITER=60
WINDOW_SIZE=15
BINARY=2
NUM_THREADS=8
X_MAX=10

Running demo.sh with these settings produces "vectors_train.txt".
"""

使用keras.preprocessing进行文本序列化

使用keras.preprocessing进行文本序列化。

from keras.preprocessing import text, sequence
# Value passed as num_words to the Tokenizer, and the length every sequence
# is padded to.
max_features = 200000
maxlen = 500

# Raw texts; missing entries are replaced by the literal string "[na]".
list_sentences_train = train["text"].fillna("[na]").values
list_sentences_test = test["text"].fillna("[na]").values

# The tokenizer is fitted on the training texts only.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Train: texts -> integer sequences -> padded array.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
train_sequence = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)

# Test: same two steps with the already-fitted tokenizer.
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
test_sequence = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)


多个句子词频统计

使用collections.Counter计算词频,顺便去除只出现1次的单词。

from collections import Counter
# Corpus-wide word frequencies, accumulated by get_count.
cc = Counter()


def get_count(text):
    """Split ``text`` on single spaces and add its words to the counter ``cc``.

    Args:
        text: One document as a string.  Anything that cannot be stripped and
            split (for example ``None`` or a float NaN) is passed through.

    Returns:
        The list of words of ``text``, or ``text`` itself unchanged when it is
        not a splittable string.
    """
    try:
        text_split = text.strip().split(' ')
    except (AttributeError, TypeError):
        # Not a text value: nothing to count.
        return text
    cc.update(text_split)
    return text_split
def remove_one(text):
    """Drop every word of ``text`` whose corpus count in ``cc`` is not above 1.

    ``cc`` must already hold the counts for the whole corpus (see
    ``get_count``); words are delimited by single spaces.

    Args:
        text: One document as a string.  Anything that cannot be stripped and
            split (for example ``None`` or a float NaN) is passed through.

    Returns:
        The surviving words joined by single spaces, or ``text`` itself
        unchanged when it is not a splittable string.
    """
    try:
        words = text.strip().split(u" ")
    except (AttributeError, TypeError, UnicodeError):
        # Not a text value (or not decodable): leave it as it is.
        return text
    # Looking up a missing key in a Counter yields 0 and does not insert it.
    return u" ".join(word for word in words if cc[word] > 1)
# Pass 1: tokenise every row; get_count also accumulates the word counts in cc.
df['text_split'] = df['text'].apply(get_count)
# Pass 2: must run after pass 1 has seen all rows, because remove_one reads cc.
df['text'] = df['text'].apply(remove_one)


文本预处理

过滤标点、符号和数字,去掉上标和变音符号(NFKD 规范化后删除组合标记),英文字母转换为小写,非 ASCII 字符两侧用空格隔开,同一字符连续重复 3 次及以上的只保留 1 个,去掉指定单词(fword_list)中字母之间的空格。

import regex as re
import unicodedata
def process(text):
    """Clean one text value for tokenisation.

    Steps, in order:
      1. runs of punctuation, separators, symbols and numbers become a space;
      2. NFKD normalisation, then combining marks are deleted;
      3. punctuation / symbols / numbers / surrogate, format and private-use
         characters still present are deleted;
      4. ASCII letters are lower-cased;
      5. every non-ASCII character is surrounded by spaces;
      6. a word character repeated three or more times in a row is reduced
         to one occurrence;
      7. whitespace runs are collapsed;
      8. for each entry of the module-level ``fword_list``, any occurrence of
         its letters (each possibly repeated, optionally separated by
         whitespace) is replaced by the entry surrounded by spaces;
      9. whitespace runs are collapsed again.

    ``re`` must be the third-party ``regex`` module: the Unicode property
    classes used here are not supported by the standard ``re``.

    Args:
        text: Unicode text.  Other values make a step raise.

    Returns:
        The cleaned text.  If any step raises, ``text`` is returned as it
        stood when the failure happened (unchanged for non-text input).
    """
    try:
        # Patterns are spelled with doubled backslashes instead of the
        # Python-2-only ur"..." prefix; the pattern text is identical.
        text = re.sub(u"\\p{P}+|\\p{Z}+|\\p{S}+|\\p{N}+", u' ', text)
        text = unicodedata.normalize('NFKD', text)
        text = re.sub(u"\\p{M}+", u'', text)
        text = re.sub(u"\\p{P}+|\\p{S}+|\\p{N}+|\\p{Cs}+|\\p{Cf}+|\\p{Co}+", u'', text)
        text = re.sub("([A-Za-z]+)", lambda m: m.group(1).lower(), text)
        text = re.sub(u'([^\\x00-\\x7f])', lambda m: u' ' + m.group(1) + u' ', text)
        text = re.sub(u"(\\w)\\1{2,}", lambda m: m.group(1), text)
        text = re.sub(r"(\s+)", u' ', text)
        for fword in fword_list:
            # "abc" -> a+\s*b+\s*c+ .  Characters are inserted unescaped, so
            # entries are expected to be plain letters.
            f_re = r"\s*".join(w + "+" for w in fword)
            text = re.sub(f_re, u' ' + fword + u' ', text)
        text = re.sub(r"(\s+)", u' ', text)
        return text
    except Exception:
        # Best effort: hand back whatever has been produced so far.
        return text
# Clean every row of the text column in place.
df['text'] = df['text'].apply(process)


回归LightGBM Cross Validation

使用 LightGBM 结合 K 折交叉验证(Cross Validation)进行回归,评估指标为 RMSE。

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import random
import os
from lightgbm import LGBMClassifier
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import gc
def kfold_lightgbm(train_df, test_df, y, num_folds, stratified=False, debug=False):
    """Fit a LightGBM regressor with K-fold cross validation.

    One model is trained per fold.  Out-of-fold predictions are used to report
    the RMSE per fold and overall; test predictions are averaged over folds.

    Args:
        train_df: Training features (DataFrame); all of its columns are used.
        test_df: Test features (DataFrame); restricted to ``train_df``'s
            columns, in the same order.
        y: Target values, indexed positionally with ``.iloc``.
        num_folds: Number of folds.
        stratified: Use ``StratifiedKFold`` instead of ``KFold``.
        debug: Unused; kept so existing calls keep working.

    Returns:
        ``(feature_importance_df, sub_preds)``: the per-fold feature
        importances (columns ``feature``, ``importance``, ``fold``) and the
        fold-averaged predictions for ``test_df``.

    Side effects:
        Prints progress and scores, and calls ``display_importances``.
    """
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=17)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=17)

    # Result holders: out-of-fold predictions, averaged test predictions and
    # the per-fold importances.
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()

    # Take the feature list from the frame that was passed in, not from a
    # module-level ``train``.
    feats = train_df.columns.tolist()
    test_df = test_df[feats]

    # LightGBM parameters found by Bayesian optimization.  They do not depend
    # on the fold, so they are built once.
    lgb_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        "learning_rate": 0.01,
        'num_leaves': 60,
        'subsample': 0.6143,
        'colsample_bytree': 0.6453,
        'min_split_gain': np.power(10, -2.5988),
        'reg_alpha': np.power(10, -2.2887),
        'reg_lambda': np.power(10, 1.7570),
        'min_child_weight': np.power(10, -0.1477),
        'max_depth': -1,
    }

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df, y)):
        print('FOLD {}'.format(n_fold))
        train_x, train_y = train_df.iloc[train_idx], y.iloc[train_idx]
        valid_x, valid_y = train_df.iloc[valid_idx], y.iloc[valid_idx]

        lgtrain = lgb.Dataset(train_x, train_y,
                              feature_name=feats,
                              categorical_feature='auto')
        lgvalid = lgb.Dataset(valid_x, valid_y,
                              feature_name=feats,
                              categorical_feature='auto')
        clf = lgb.train(
            lgb_params,
            lgtrain,
            num_boost_round=3000,
            valid_sets=[lgtrain, lgvalid],
            valid_names=['train', 'valid'],
            early_stopping_rounds=200,
            verbose_eval=100
        )

        # Predict with the best iteration found by early stopping.
        oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
        sub_preds += clf.predict(test_df, num_iteration=clf.best_iteration) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        fold_rmse = np.sqrt(metrics.mean_squared_error(valid_y, oof_preds[valid_idx]))
        print('Fold %2d RMSE : %.6f' % (n_fold, fold_rmse))

        # Release the fold's model and data before the next fold is built.
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    rmse = np.sqrt(metrics.mean_squared_error(y, oof_preds))
    print('Full RMSE score %.6f' % rmse)

    display_importances(feature_importance_df)
    return feature_importance_df, sub_preds
# Display/plot feature importance
def display_importances(feature_importance_df_):
    """Plot the 40 features with the highest mean importance.

    Args:
        feature_importance_df_: DataFrame with at least the columns
            ``feature`` and ``importance``, one row per feature and fold.

    Side effects:
        Draws a bar plot and saves it as ``lgb_importances.png``.
    """
    # Rank features by their importance averaged over all rows.
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    cols = mean_importance[:40].index

    # Keep every per-fold row of the selected features for the plot.
    is_selected = feature_importance_df_.feature.isin(cols)
    best_features = feature_importance_df_.loc[is_selected]
    plot_data = best_features.sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=plot_data)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgb_importances.png')