# Regression with LightGBM using K-fold cross-validation, evaluated with RMSE loss.
import gc
import os
import random

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
def kfold_lightgbm(train_df, test_df, y, num_folds, stratified=False, debug=False):
    """Train LightGBM regressors with K-fold cross-validation and report RMSE.

    For every fold a booster is trained on the training split and monitored
    on the validation split. Out-of-fold predictions are stored for the
    validation rows, and predictions for ``test_df`` are averaged over all
    folds. Per-fold and overall RMSE are printed, and the collected feature
    importances are passed to ``display_importances``.

    Args:
        train_df: Training features. Must provide ``.columns``, ``.shape``
            and ``.iloc`` (e.g. a pandas DataFrame).
        test_df: Test features; it is restricted to the columns of
            ``train_df`` before prediction.
        y: Regression target, row-aligned with ``train_df``. Must support
            ``.iloc`` (e.g. a pandas Series).
        num_folds: Number of cross-validation splits.
        stratified: If true, split with ``StratifiedKFold`` instead of
            ``KFold``.
        debug: Unused; kept so existing callers remain compatible.

    Returns:
        A tuple ``(feature_importance_df, sub_preds)``:
        ``feature_importance_df`` has the columns ``feature``,
        ``importance`` and ``fold`` (1-based) with one row per feature per
        fold; ``sub_preds`` is the fold-averaged prediction for each row of
        ``test_df``.
    """
    splitter_cls = StratifiedKFold if stratified else KFold
    folds = splitter_cls(n_splits=num_folds, shuffle=True, random_state=17)

    # Create arrays to store out-of-fold and test predictions.
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])

    # Use the training frame passed in as a parameter (the original read an
    # undefined global named ``train``), and align the test columns with it.
    feats = train_df.columns.tolist()
    test_df = test_df[feats]

    # LightGBM parameters found by Bayesian optimization.
    # They do not depend on the fold, so they are built once.
    lgb_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        "learning_rate": 0.01,
        'num_leaves': 60,
        'subsample': 0.6143,
        'colsample_bytree': 0.6453,
        'min_split_gain': np.power(10, -2.5988),
        'reg_alpha': np.power(10, -2.2887),
        'reg_lambda': np.power(10, 1.7570),
        'min_child_weight': np.power(10, -0.1477),
        'max_depth': -1,
    }

    # Collect per-fold frames and concatenate once at the end instead of
    # growing a DataFrame inside the loop.
    fold_importances = []

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df, y)):
        print('FOLD {}'.format(n_fold))
        train_x, train_y = train_df.iloc[train_idx], y.iloc[train_idx]
        valid_x, valid_y = train_df.iloc[valid_idx], y.iloc[valid_idx]

        lgtrain = lgb.Dataset(
            train_x,
            label=train_y,
            feature_name=feats,
            categorical_feature='auto',
        )
        lgvalid = lgb.Dataset(
            valid_x,
            label=valid_y,
            feature_name=feats,
            categorical_feature='auto',
        )

        # NOTE(review): `early_stopping_rounds` and `verbose_eval` were
        # removed from `lgb.train` in LightGBM 4.0. When upgrading, switch to
        # callbacks=[lgb.early_stopping(200), lgb.log_evaluation(100)];
        # verify against the installed version.
        clf = lgb.train(
            lgb_params,
            lgtrain,
            num_boost_round=3000,
            valid_sets=[lgtrain, lgvalid],
            valid_names=['train', 'valid'],
            early_stopping_rounds=200,
            verbose_eval=100,
        )

        # Predict with the iteration selected by early stopping.
        oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration)
        sub_preds += clf.predict(test_df, num_iteration=clf.best_iteration) / folds.n_splits

        fold_importances.append(
            pd.DataFrame({
                "feature": feats,
                "importance": clf.feature_importance(),
                "fold": n_fold + 1,
            })
        )

        fold_rmse = np.sqrt(metrics.mean_squared_error(valid_y, oof_preds[valid_idx]))
        print('Fold %2d RMSE : %.6f' % (n_fold, fold_rmse))

        # Release per-fold objects before the next fold allocates its own.
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    feature_importance_df = pd.concat(fold_importances, axis=0)

    rmse = np.sqrt(metrics.mean_squared_error(y, oof_preds))
    print('Full RMSE score %.6f' % rmse)

    display_importances(feature_importance_df)
    return feature_importance_df, sub_preds
# Display/plot feature importance
def display_importances(feature_importance_df_):
    """Plot the top-40 features by mean importance and save the chart.

    Features are ranked by their mean ``importance`` over all rows sharing
    the same ``feature`` value; the 40 highest-ranked are kept. All rows of
    those features are drawn as a horizontal seaborn bar plot, and the
    figure is written to ``lgb_importances.png``.

    Args:
        feature_importance_df_: Frame with at least the columns
            ``feature`` and ``importance``.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
    )
    ranked = mean_importance.sort_values(by="importance", ascending=False)
    top_features = ranked[:40].index

    # Keep every row of the selected features, not just their mean.
    keep_mask = feature_importance_df_.feature.isin(top_features)
    best_features = feature_importance_df_.loc[keep_mask]
    plot_data = best_features.sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=plot_data)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgb_importances.png')