回归LightGBM Cross Validation
使用 LightGBM 结合 K 折交叉验证(Cross Validation)训练回归模型,并以 RMSE 作为评估指标。
import gc
import os
import random

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder


def _train_control_kwargs(stopping_rounds, log_period):
    """Build the ``lgb.train`` arguments for early stopping and eval logging.

    LightGBM 4.0 removed the ``early_stopping_rounds`` / ``verbose_eval``
    keyword arguments in favour of callbacks. ``lgb.log_evaluation`` is
    available from LightGBM 3.3 on, so its presence selects the callback API;
    older releases fall back to the legacy keyword arguments.
    """
    if hasattr(lgb, "log_evaluation"):
        return {
            "callbacks": [
                lgb.early_stopping(stopping_rounds=stopping_rounds),
                lgb.log_evaluation(period=log_period),
            ]
        }
    return {
        "early_stopping_rounds": stopping_rounds,
        "verbose_eval": log_period,
    }


def kfold_lightgbm(train_df, test_df, y, num_folds, stratified=False,
                   debug=False):
    """Train a LightGBM regressor with K-fold cross validation.

    For every fold a booster is trained on the training split with early
    stopping on the validation split. Out-of-fold predictions are used to
    report the per-fold and overall RMSE; test predictions are averaged over
    the folds. The feature-importance plot is written by
    ``display_importances`` before returning.

    Args:
        train_df: pandas DataFrame of training features (indexed via
            ``.iloc`` and ``.columns``).
        test_df: pandas DataFrame of test features; it must contain every
            column of ``train_df`` and is re-ordered to match it.
        y: pandas Series with the regression target, aligned row-by-row with
            ``train_df`` (indexed via ``.iloc``).
        num_folds: number of cross-validation folds.
        stratified: use ``StratifiedKFold`` instead of ``KFold``.
            NOTE(review): scikit-learn's stratified splitter expects a
            discrete target, so this is presumably only usable when ``y``
            is binned/categorical -- confirm before enabling for regression.
        debug: unused; kept so existing callers keep working.

    Returns:
        A tuple ``(feature_importance_df, sub_preds)``: a DataFrame with the
        columns ``feature``, ``importance`` and ``fold`` (one row per feature
        per fold, folds numbered from 1), and a NumPy array with the
        fold-averaged predictions for ``test_df``.
    """
    splitter_cls = StratifiedKFold if stratified else KFold
    folds = splitter_cls(n_splits=num_folds, shuffle=True, random_state=17)

    # Use the training frame's own columns (the original read an undefined
    # global ``train`` here) and align the test frame to the same order.
    feats = train_df.columns.tolist()
    test_df = test_df[feats]

    # Out-of-fold predictions, averaged test predictions, per-fold importances.
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    fold_importances = []

    # LightGBM parameters found by Bayesian optimization. They do not depend
    # on the fold, so they are built once outside the loop.
    # NOTE(review): per the LightGBM parameter docs, 'subsample'
    # (bagging_fraction) only takes effect when 'subsample_freq'
    # (bagging_freq) is > 0, which is not set here -- confirm whether row
    # subsampling was intended.
    lgb_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'learning_rate': 0.01,
        'num_leaves': 60,
        'subsample': 0.6143,
        'colsample_bytree': 0.6453,
        'min_split_gain': np.power(10, -2.5988),
        'reg_alpha': np.power(10, -2.2887),
        'reg_lambda': np.power(10, 1.7570),
        'min_child_weight': np.power(10, -0.1477),
        'max_depth': -1,
    }

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df, y)):
        print(f'FOLD {n_fold}')
        train_x, train_y = train_df.iloc[train_idx], y.iloc[train_idx]
        valid_x, valid_y = train_df.iloc[valid_idx], y.iloc[valid_idx]

        lgtrain = lgb.Dataset(train_x, train_y, feature_name=feats,
                              categorical_feature='auto')
        lgvalid = lgb.Dataset(valid_x, valid_y, feature_name=feats,
                              categorical_feature='auto')

        clf = lgb.train(
            dict(lgb_params),  # copy so no fold can alter the shared dict
            lgtrain,
            num_boost_round=3000,
            valid_sets=[lgtrain, lgvalid],
            valid_names=['train', 'valid'],
            **_train_control_kwargs(stopping_rounds=200, log_period=100)
        )

        # Predict with the best iteration selected by early stopping.
        oof_preds[valid_idx] = clf.predict(
            valid_x, num_iteration=clf.best_iteration)
        sub_preds += clf.predict(
            test_df, num_iteration=clf.best_iteration) / folds.n_splits

        fold_importances.append(pd.DataFrame({
            "feature": feats,
            "importance": clf.feature_importance(),
            "fold": n_fold + 1,
        }))

        fold_rmse = np.sqrt(
            metrics.mean_squared_error(valid_y, oof_preds[valid_idx]))
        print(f'Fold {n_fold:2d} RMSE : {fold_rmse:.6f}')

        # Free the per-fold objects before the next fold allocates its own.
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    # Concatenate once instead of growing a DataFrame inside the loop.
    feature_importance_df = pd.concat(fold_importances, axis=0)

    rmse = np.sqrt(metrics.mean_squared_error(y, oof_preds))
    print(f'Full RMSE score {rmse:.6f}')

    display_importances(feature_importance_df)
    return feature_importance_df, sub_preds


# Display/plot feature importance
def display_importances(feature_importance_df_):
    """Plot the most important features and save the chart to disk.

    The 40 features with the highest mean importance across folds are
    selected, drawn as a horizontal bar plot from their per-fold rows and
    written to ``lgb_importances.png`` in the current working directory.

    Args:
        feature_importance_df_: DataFrame with at least the columns
            ``feature`` and ``importance`` (one row per feature per fold).
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
    )
    cols = mean_importance.sort_values(
        by="importance", ascending=False)[:40].index

    # Keep every per-fold row of the selected features for the bar plot.
    selected = feature_importance_df_["feature"].isin(cols)
    best_features = feature_importance_df_.loc[selected]

    plt.figure(figsize=(8, 10))
    sns.barplot(
        x="importance",
        y="feature",
        data=best_features.sort_values(by="importance", ascending=False),
    )
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgb_importances.png')