多分类LightGBM Cross Validation
使用 LightGBM 结合交叉验证(Cross Validation)进行多分类,并以 multi log loss 作为评估指标。
import gc
import os
import random

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# Seed used for fold shuffling. The original snippet referenced SEED without
# defining it; kfold_lightgbm reads the global at call time, so a later
# module-level assignment still takes precedence over this fallback.
SEED = 42


def multi_log_loss(y_true, y_pred, num_classes):
    """Return the mean multi-class log loss of ``y_pred`` against ``y_true``.

    Used as the scoring function for cross-validation.

    Args:
        y_true: Array-like of class indices, one per sample (cast to int).
        y_pred: 2-D array-like of per-class scores, one row per sample. The
            input is copied, so the caller's array is never modified.
        num_classes: Number of classes; a row of all zeros is replaced by the
            uniform distribution ``1 / num_classes``.

    Returns:
        The negative mean log of the normalised probability assigned to each
        sample's true class.
    """
    eps = 1e-12
    labels = np.asarray(y_true).astype('int').ravel()
    # Work on a float copy: the original in-place ops altered the caller's data.
    probs = np.array(y_pred, dtype=float)

    # Handle all-zero rows *before* adding eps; once eps is added no row can
    # be all zeros and the check would never fire.
    all_zeros = np.all(probs == 0, axis=1)
    probs[all_zeros] = 1 / num_classes

    # Keep log() finite, then normalise every row to sum to one.
    probs += eps
    probs /= probs.sum(axis=1, keepdims=True)

    # Probability assigned to the true class of each row.
    true_class_probs = probs[np.arange(labels.size), labels]
    return -np.mean(np.log(true_class_probs))


def kfold_lightgbm(train_df, test_df, y, num_folds, stratified=False,
                   debug=False, num_classes=6):
    """Train a multi-class LightGBM model with K-fold cross-validation.

    For every fold a booster is trained on the training split and early-stopped
    on the validation split. Out-of-fold predictions are scored with
    ``multi_log_loss`` and test predictions are averaged over the folds.
    Feature importances are plotted through ``display_importances``.

    Args:
        train_df: pandas DataFrame of training features; all columns are used.
        test_df: pandas DataFrame containing at least the columns of
            ``train_df``.
        y: pandas Series of integer class labels aligned with ``train_df``
            (accessed through ``.iloc`` and ``.values``).
        num_folds: Number of cross-validation folds.
        stratified: Use ``StratifiedKFold`` instead of ``KFold`` when true.
        debug: Unused; kept for interface compatibility.
        num_classes: Number of target classes (defaults to 6, the value the
            original code hard-coded).

    Returns:
        A tuple ``(feature_importance_df, sub_preds)``: the per-fold feature
        importances as a DataFrame with columns ``feature``, ``importance``
        and ``fold``, and the fold-averaged test predictions with shape
        ``(len(test_df), num_classes)``.
    """
    splitter_cls = StratifiedKFold if stratified else KFold
    folds = splitter_cls(n_splits=num_folds, shuffle=True, random_state=SEED)

    feats = train_df.columns.tolist()
    cat_feats = 'auto'

    # Hyper-parameters are identical for every fold, so build them once.
    lgb_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class': num_classes,
        'metric': 'multi_logloss',
        "learning_rate": 0.02,
        'num_leaves': 32,
        'max_depth': 8,
        'bagging_fraction': 0.7,
        'bagging_freq': 5,
        'feature_fraction': 0.7,
        "reg_alpha": 0.3,
        "reg_lambda": 0.1,
        'min_child_samples': 100,
        "min_split_gain": 0.2,
        'nthread': 4,
        "min_child_weight": 10,
    }

    # Arrays that accumulate the results across folds.
    oof_preds = np.zeros((train_df.shape[0], num_classes))
    sub_preds = np.zeros((test_df.shape[0], num_classes))
    fold_importances = []

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], y)):
        train_x, train_y = train_df[feats].iloc[train_idx], y.iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], y.iloc[valid_idx]

        lgtrain = lgb.Dataset(train_x, train_y, feature_name=feats,
                              categorical_feature=cat_feats)
        lgvalid = lgb.Dataset(valid_x, valid_y, feature_name=feats,
                              categorical_feature=cat_feats)
        print('get lgb train valid dataset end')

        # NOTE(review): `early_stopping_rounds` and `verbose_eval` belong to
        # older `lgb.train` releases; LightGBM 4.0 removed them in favour of
        # `callbacks=[lgb.early_stopping(...), lgb.log_evaluation(...)]`.
        # Confirm against the installed version before changing this call.
        clf = lgb.train(
            dict(lgb_params),  # fresh copy so folds never share mutated params
            lgtrain,
            num_boost_round=3000,
            valid_sets=[lgtrain, lgvalid],
            valid_names=['train', 'valid'],
            early_stopping_rounds=100,
            verbose_eval=100,
        )

        oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
        # Select `feats` so test columns match the training columns and order.
        sub_preds += clf.predict(test_df[feats], num_iteration=clf.best_iteration) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = n_fold + 1
        fold_importances.append(fold_importance_df)

        print('Fold %2d Multi Log Loss : %.6f' % (
            n_fold + 1,
            multi_log_loss(valid_y.values, oof_preds[valid_idx], num_classes)))

        # Release the fold's data before the next fold allocates its own.
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    # Concatenate once instead of growing a DataFrame inside the loop.
    feature_importance_df = pd.concat(fold_importances, axis=0)

    print('Full Multi Log Loss %.6f' % multi_log_loss(y.values, oof_preds, num_classes))
    display_importances(feature_importance_df)
    return feature_importance_df, sub_preds


# Display/plot feature importance
def display_importances(feature_importance_df_):
    """Plot the top features by mean importance and save the figure.

    The 40 features with the highest mean ``importance`` are selected, their
    rows are drawn as a seaborn bar plot, and the figure is written to
    ``lgb_importances.png`` in the current working directory.

    Args:
        feature_importance_df_: DataFrame with at least the columns
            ``feature`` and ``importance``.
    """
    cols = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:40]
        .index
    )
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature",
                data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgb_importances.png')