Machine-Learning-for-Algorithmic-Trading-Second-Edition/ gradient boosting
本文作者:何百圣 哈尔滨工业大学(威海) 经济管理学院 数量金融方向
MLAT系列文章致力于完成校内课程作业,并采用博客形式发布学习内容。笔者主攻方向是12gradient_boosting这一技术方向,在上一篇中已经完成了数据集的创建,在本篇中将继续运用多种boosting方法对数据集进行优化处理。
Imports and Settings
import sys, os
import warnings
from time import time
from itertools import product
import joblib
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_validate
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
# needed for HistGradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier
from sklearn.inspection import partial_dependence, plot_partial_dependence
from sklearn.metrics import roc_auc_score
AI写代码
其他设置
# Directory where cross-validation results and fitted models are cached.
results_path = Path(r'E:/machine learning for algorithmic trading', 'results', 'baseline')
# joblib.dump does not create missing directories; without this the first
# cache write fails only after the (slow) cross-validation has finished.
results_path.mkdir(parents=True, exist_ok=True)

# Silence all library warnings for the rest of the session.
warnings.filterwarnings('ignore')
sns.set_style("whitegrid")
idx = pd.IndexSlice
# Fix NumPy's global RNG so runs are repeatable.
np.random.seed(42)
# HDF5 store holding the engineered feature table read by get_data().
DATA_STORE = r'E:/machine learning for algorithmic trading/wiki.h5'
AI写代码
Prepare Data
这里使用的就是上一节得到的数据集,在原书GitHub中属于第4节。
def get_data(start='2000', end='2018', task='classification', holding_period=1, dropna=False):
    """Load the engineered features and derive a binary direction label.

    Args:
        start, end: bounds used to slice the second index level (presumably
            the date level - confirm against the stored table). The slice is
            applied only when both are not None.
        task: accepted for interface compatibility; not used in this body.
        holding_period: selects the label column ``target_{holding_period}m``.
        dropna: when True, drop every row that contains a missing value.

    Returns:
        (y, X): ``y`` is 1 where the selected target is strictly positive and
        0 otherwise; ``X`` is the table without any column whose name starts
        with 'target'.
    """
    label_col = f'target_{holding_period}m'
    with pd.HDFStore(DATA_STORE) as store:
        data = store['engineered_features']

    if not (start is None or end is None):
        window = pd.IndexSlice[:, start:end]
        data = data.loc[window, :]
    if dropna:
        data = data.dropna()

    # Direction label for classification: positive return -> 1, else 0.
    y = data[label_col].gt(0).astype(int)
    target_cols = [col for col in data.columns if col.startswith('target')]
    X = data.drop(columns=target_cols)
    return y, X
AI写代码
Factorize Categories
2. factorize函数的作用是将sector划分为类别(如将建筑业和制造业归类为0和1号类别),该函数还可以通过参数sort来实现有序分类。在没有排序的情况下,默认会以首次出现的类别作为基准,在后续遇到新的类别时会自动新增编号。返回变量为tuple类型数据结构。
cat_cols = ['year', 'month', 'age', 'msize', 'sector']


def factorize_cats(df, cats=('sector',)):
    """Integer-encode categorical columns of ``df`` in place and return it.

    Args:
        df: DataFrame containing 'year', 'month', 'age', 'msize' and every
            column named in ``cats``. It is modified in place.
        cats: columns to label-encode with ``pd.factorize`` (codes follow
            order of first appearance; missing values become -1).

    Returns:
        The same DataFrame, with the four fixed columns plus ``cats`` stored
        as integers and remaining missing values replaced by -1.
    """
    # An immutable default avoids sharing one list object across calls.
    cats = list(cats)
    int_cols = ['year', 'month', 'age', 'msize'] + cats
    for cat in cats:
        df[cat] = pd.factorize(df[cat])[0]
    # Assign whole columns: ``df.loc[:, cols] = ...`` writes into the existing
    # arrays on recent pandas, which silently keeps float columns as float.
    df[int_cols] = df[int_cols].fillna(-1).astype(int)
    return df
AI写代码
One-Hot Encoding
3. One-hot encoding transforms categorical variables into a form that machine learning algorithms can use directly: each category becomes its own 0/1 indicator column. pd.get_dummies returns the result as a DataFrame; the indicator columns are dense by default and are only stored as sparse columns when sparse=True is passed.
注意:
当应用 get_dummies 进行编码时,请务必注意避免出现多重共线性现象。举个例子来说,在处理性别数据时若采用 get_dummies 方法,则必须删除一列(因为一旦知道某一列的数据就自然知道了另一列的数据)。
除了常见的 onehot 编码和 factorize 方法外, 还可以选择使用 map() 函数来达到类似的分类效果。
在进行分类任务时需要注意数据是否具有区分度。例如, 红色和黄色类别没有明显的区分度, 但像年龄这样的属性则具有较强的区分度。
def get_one_hot_data(df, cols=cat_cols[:-1]):
    """One-hot encode ``cols`` plus the 'sector' column of ``df``.

    Columns in ``cols`` get dummy names of the form ``<col>_<value>``. The
    sector dummies use an empty prefix and separator, so they are named by
    the bare sector value. Afterwards every occurrence of '.0' is removed
    from every column name.

    Returns:
        A new DataFrame produced by ``pd.get_dummies``.
    """
    n_prefixed = len(cols)
    encoded = pd.get_dummies(
        df,
        columns=cols + ['sector'],          # columns to expand into dummies
        prefix=cols + [''],                 # no prefix for sector
        prefix_sep=['_'] * n_prefixed + [''],
    )
    renamed = {name: name.replace('.0', '') for name in encoded.columns}
    return encoded.rename(columns=renamed)
AI写代码
Get Holdout Set
holdout set用于估计交叉验证后的泛化误差
def get_holdout_set(target, features, period=6):
    """Split data chronologically into a cross-validation set and a holdout set.

    The holdout set is used to estimate generalization error after
    cross-validation.

    Args:
        target: Series whose index has a level named 'date'; its ``name``
            becomes the label column after joining.
        features: DataFrame joined with ``target`` on the index. The code
            slices the second index level by date, so 'date' is assumed to be
            that level.
        period: the last ``period + 1`` distinct dates form the holdout
            window; all earlier dates form the cross-validation window.

    Returns:
        y_train, X_train, y_test, X_test
    """
    idx = pd.IndexSlice
    label = target.name
    # Read the dates from the ``target`` argument; the previous version used a
    # module-level ``y`` and therefore ignored the data actually passed in.
    dates = target.index.get_level_values('date').unique().sort_values()
    cv_start, cv_end = dates[0], dates[-period - 2]
    holdout_start, holdout_end = dates[-period - 1], dates[-1]

    df = features.join(target.to_frame())
    train = df.loc[idx[:, cv_start: cv_end], :]
    y_train, X_train = train[label], train.drop(label, axis=1)

    test = df.loc[idx[:, holdout_start: holdout_end], :]
    y_test, X_test = test[label], test.drop(label, axis=1)
    return y_train, X_train, y_test, X_test
AI写代码
Load Data
# Full sample (rows with missing values are kept): binary target and raw features.
y, features = get_data()
# One-hot encode first: factorize_cats() below overwrites columns of
# `features` in place, so the order of these two calls matters.
X_dummies = get_one_hot_data(features)
X_factors = factorize_cats(features)
# "Clean" variant: every row containing a NaN is dropped (dropna=True).
y_clean, features_clean = get_data(dropna=True)
X_dummies_clean = get_one_hot_data(features_clean)
X_factors_clean = factorize_cats(features_clean)
AI写代码
Cross-Validation Setup
交叉验证:这里采用按时间顺序的滚动(walk-forward)划分,共12折;每一折以最近的一个日期作为测试集,并且只使用该日期之前的数据进行训练。
class OneStepTimeSeriesSplit:
    """Walk-forward cross-validation splitter for panel data.

    Generates tuples of train_idx, test_idx pairs.
    Assumes the index contains a level labeled 'date'.

    The most recent ``n_splits * test_period_length`` distinct dates are cut
    into consecutive test windows, newest first. Each training set holds
    every row dated strictly before its test window.
    """

    def __init__(self, n_splits=3, test_period_length=1, shuffle=False):
        self.n_splits = n_splits
        self.test_period_length = test_period_length
        self.shuffle = shuffle

    @staticmethod
    def chunks(l, n):
        """Yield successive slices of ``l`` containing at most ``n`` items."""
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def split(self, X, y=None, groups=None):
        """Yield (train_idx, test_idx) row positions for each fold.

        ``y`` and ``groups`` are accepted for scikit-learn compatibility and
        are not used.
        """
        unique_dates = (X.index
                        .get_level_values('date')
                        .unique()
                        .sort_values(ascending=False)
                        [:self.n_splits * self.test_period_length])

        # After reset_index the row labels are 0..n-1, i.e. row positions.
        dates = X.reset_index()[['date']]
        for test_dates in self.chunks(unique_dates, self.test_period_length):
            train_idx = dates[dates['date'] < test_dates.min()].index
            test_idx = dates[dates['date'].isin(test_dates)].index
            if self.shuffle:
                # Shuffling ``list(train_idx)`` only reordered a temporary
                # copy; build a permuted array and yield that instead.
                train_idx = np.random.permutation(train_idx.to_numpy())
            yield train_idx, test_idx

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the configured number of splits (arguments are ignored)."""
        return self.n_splits
AI写代码
class实例化
# Twelve walk-forward folds, one test date per fold, no shuffling.
cv = OneStepTimeSeriesSplit(n_splits=12, test_period_length=1, shuffle=False)
# Elapsed seconds per algorithm; an entry is written only when a CV run is
# actually executed (not when cached results are loaded).
run_time = dict()
AI写代码
CV Metrics
交叉验证的各项评价指标
# scikit-learn scorer name -> label shown in result tables and plot panels.
# Insertion order matters: plot_result() uses it as the panel order.
# NOTE(review): 'balanced_accuracy' is displayed simply as 'Accuracy'.
metrics = dict(
    balanced_accuracy='Accuracy',
    roc_auc='AUC',
    neg_log_loss='Log Loss',
    f1_weighted='F1',
    precision_weighted='Precision',
    recall_weighted='Recall',
)
def run_cv(clf, X=X_dummies, y=y, metrics=metrics, cv=cv, fit_params=None, n_jobs=-1):
    """Cross-validate ``clf`` and time the run.

    The defaults for ``X``, ``y``, ``metrics`` and ``cv`` are the module-level
    objects as they exist when this function is defined.

    Args:
        clf: estimator passed to ``cross_validate``.
        X, y: features and labels.
        metrics: mapping whose keys are the scorer names to evaluate.
        cv: cross-validation splitter.
        fit_params: optional extra arguments forwarded to the estimator's fit.
        n_jobs: parallelism passed to ``cross_validate``.

    Returns:
        (scores, duration): the dict returned by ``cross_validate`` (train
        scores included) and the elapsed wall-clock time in seconds.
    """
    started = time()
    cv_kwargs = dict(estimator=clf,
                     X=X,
                     y=y,
                     scoring=list(metrics.keys()),
                     cv=cv,
                     return_train_score=True,
                     n_jobs=n_jobs,
                     verbose=1,
                     fit_params=fit_params)
    scores = cross_validate(**cv_kwargs)
    return scores, time() - started
AI写代码
CV Result Handler Functions
结果处理函数,包括生成 metrics DataFrame 的函数以及绘图(plot)函数。
4.melt() 是 pivot() 逆转操作函数,也是数据透视的一种处理方法,非常好用,只是一言半语讲不清楚。python melt()用法
def stack_results(scores):
    """Reshape a ``cross_validate`` score dict into a long-format DataFrame.

    Each key is split once on '_' into (Dataset, Metric), e.g.
    'test_roc_auc' -> ('test', 'roc_auc'). The first two columns are dropped
    by position (presumably the fit_time / score_time entries - this relies on
    the key order of ``scores``).

    Returns:
        DataFrame with columns 'Dataset' ('Train' / 'Test'), 'Metric' (the
        display label looked up in the module-level ``metrics``) and 'Value'.
    """
    # Two-level column header built from the split key names.
    header = pd.MultiIndex.from_tuples(
        [tuple(key.split('_', 1)) for key in scores.keys()],
        names=['Dataset', 'Metric'])
    values = np.array(list(scores.values())).T
    wide = pd.DataFrame(data=values, columns=header)
    wide = wide.iloc[:, 2:]

    results = pd.melt(wide, value_name='Value')
    results['Metric'] = results['Metric'].map(lambda key: metrics.get(key))
    results['Dataset'] = results['Dataset'].str.capitalize()
    return results
def plot_result(df, model=None, fname=None):
    """Draw one box-plot panel per metric comparing train and test scores.

    Args:
        df: long-format frame with 'Dataset', 'Metric' and 'Value' columns.
        model: text used as the figure title.
        fname: when truthy, the figure is also saved there at 300 dpi.
    """
    labels = list(metrics.values())
    # Categorical box plot; panel width = aspect * height.
    grid = sns.catplot(data=df,
                       x='Dataset',
                       y='Value',
                       hue='Dataset',
                       col='Metric',
                       col_order=labels,
                       order=['Train', 'Test'],
                       kind="box",
                       col_wrap=3,
                       sharey=False,
                       height=4, aspect=1.2)

    means = df.groupby(['Metric', 'Dataset']).Value.mean().unstack().loc[labels]
    box_style = dict(facecolor='white', edgecolor='grey', boxstyle='round,pad=0.5')
    # Annotate every panel with the mean train/test value of its metric.
    for label, ax in zip(labels, grid.axes.flat):
        train_mean = means.loc[label, 'Train']
        test_mean = means.loc[label, 'Test']
        note = f"Train: {train_mean:>7.4f}\nTest: {test_mean:>7.4f}"
        # transform=ax.transAxes places the text in axes-fraction coordinates
        # (0-1 across the panel) instead of data coordinates.
        ax.text(0.05, 0.85, note, fontsize=10, transform=ax.transAxes,
                bbox=box_style)

    grid.fig.suptitle(model, fontsize=16)
    grid.fig.subplots_adjust(top=.9)
    if fname:
        grid.savefig(fname, dpi=300)
AI写代码
Baseline Classifier
流程存在较多重复性,在对不同算法构建模型时采用cross validated方法后不久即可完成后续操作,并随后进行可视化展示;最终比较各算法的表现以评估其性能差异
5. DummyClassifier是一种只按简单规则(例如按训练集的类别分布随机预测)给出预测的分类器,它不从特征中学习。该分类器通常用作简单基准(baseline),用来判断其他更复杂的分类器是否真正优于随机猜测。
在机器学习过程中完成模型训练后,必须将训练好的模型保存至指定本地位置进行存储。其中,在本地存储时我们选择了使用 joblib 库来进行处理。
baseline
# Baseline that ignores the features (strategy='stratified').
dummy_clf = DummyClassifier(strategy='stratified', random_state=42)

algo = 'dummy_clf'
fname = results_path / f'{algo}.joblib'
# Reuse cached CV scores when present; otherwise run CV and cache the result.
if fname.exists():
    dummy_cv_result = joblib.load(fname)
else:
    dummy_cv_result, run_time[algo] = run_cv(dummy_clf)
    joblib.dump(dummy_cv_result, fname)

dummy_result = stack_results(dummy_cv_result)
# Mean score per metric and data set (shown when run as a notebook cell).
dummy_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
AI写代码

# Box plots of the train/test CV score distributions for the dummy baseline.
plot_result(dummy_result, model='Dummy Classifier')
AI写代码

RandomForest
7. 参数解释,详见代码段。
# Random forest with fully grown trees (no depth limit, single-sample leaves).
rf_clf = RandomForestClassifier(n_estimators=100,  # number of trees in the forest
                                criterion='gini',
                                max_depth=None,
                                # min_samples_split: minimum number of samples a node must
                                # hold to be split; e.g. with 6, a node holding 4 samples
                                # is never split.
                                min_samples_split=2,
                                # min_samples_leaf: minimum number of samples required in
                                # each leaf; e.g. with 3, a 5-sample node cannot be split
                                # into leaves of size 2 and 3.
                                min_samples_leaf=1,
                                min_weight_fraction_leaf=0.0,
                                # 'auto' stands for sqrt(n_features) candidate features
                                # when searching for a split.
                                max_features='auto',
                                max_leaf_nodes=None,
                                min_impurity_decrease=0.0,
                                min_impurity_split=None,
                                # draw bootstrap samples (sampling with replacement)
                                bootstrap=True,
                                # use out-of-bag samples to estimate generalization accuracy
                                oob_score=True,
                                # -1 dispatches the work to all CPUs
                                n_jobs=-1,
                                random_state=42,
                                # 0 prints nothing; higher values log more frequently
                                verbose=1)
AI写代码
疑问:random forest 用的clean数据,区别是什么呢
# Random forest CV on the sample with NaN rows removed (dropna=True).
algo = 'random_forest'
fname = results_path / f'{algo}.joblib'
# Reuse cached CV scores when present; otherwise run CV and cache the result.
if fname.exists():
    rf_cv_result = joblib.load(fname)
else:
    rf_cv_result, run_time[algo] = run_cv(rf_clf, y=y_clean, X=X_dummies_clean)
    joblib.dump(rf_cv_result, fname)

rf_result = stack_results(rf_cv_result)
# Mean score per metric and data set (shown when run as a notebook cell).
rf_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
AI写代码

8.对于train 和 test相差过大的问题的探索
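#注意,随机森林在训练集上完美拟合了数据。
由于训练集与测试集之间的指标出现了较大的差异,
最初认为可能是测试集样本数量过少,
但增大交叉验证中测试集的大小之后,
仍然没有明显的改进,
同时训练集上的各项指标始终保持在1。
这个问题仍待解决;一个可能的原因是 max_depth=None 且 min_samples_leaf=1 使每棵树完全生长,从而记住了训练样本(过拟合),可以通过限制树深或增大 min_samples_leaf 来验证。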
# Box plots of the train/test CV score distributions for the random forest.
plot_result(rf_result, model='Random Forest')
AI写代码

scikit-learn: AdaBoost
9. 参数解释,详见代码段。
#基础模型设置为单层决策树
# Weak learner for AdaBoost: a depth-1 decision tree (a "stump").
base_estimator = DecisionTreeClassifier(criterion='gini',
                                        # 'best' searches all candidate split points for the
                                        # optimum; 'random' picks the best of randomly drawn
                                        # candidates, which is cheaper on very large samples.
                                        splitter='best',
                                        max_depth=1,
                                        min_samples_split=2,
                                        min_samples_leaf=20,
                                        # Minimum weighted fraction of the total sample weight
                                        # required in a leaf; 0 means sample weights impose no
                                        # constraint. Worth setting when many samples have
                                        # missing values or the classes are very imbalanced.
                                        min_weight_fraction_leaf=0.0,
                                        max_features=None,
                                        random_state=None,
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        class_weight=None)
AI写代码
为了避免Adaboost发生过拟合现象,可以通过引入正则化项来实现. 其中, 正则化项中的缩减系数ν等同于learning rate, 也被称为学习率. 它们的取值范围均为(0,1], 其中较大的ν取值意味着达到相同的学习效果所需迭代次数减少、训练所需的弱学习器数量相应降低. 相反地, 较小的ν取值则要求进行更多的迭代步骤以达到指定的分类精度.
# AdaBoost ensemble built on the decision stump defined above.
ada_clf = AdaBoostClassifier(base_estimator=base_estimator,
                             # n_estimators controls the number of boosting stages
                             n_estimators=100,
                             learning_rate=1.0,
                             algorithm='SAMME.R',
                             random_state=42)
AI写代码
SAMME表示根据弱学习器对样本集的分类效果(分类误差)来确定各弱学习器的权重;而SAMME.R则表示根据样本集分类的预测概率来调整各弱学习器的权重;其中默认采用的是SAMME.R方法
# AdaBoost CV on the sample with NaN rows removed (dropna=True).
algo = 'adaboost'
fname = results_path / f'{algo}.joblib'
# Reuse cached CV scores when present; otherwise run CV and cache the result.
if fname.exists():
    ada_cv_result = joblib.load(fname)
else:
    ada_cv_result, run_time[algo] = run_cv(ada_clf, y=y_clean, X=X_dummies_clean)
    joblib.dump(ada_cv_result, fname)

ada_result = stack_results(ada_cv_result)
# Mean score per metric and data set (shown when run as a notebook cell).
ada_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
AI写代码

# Box plots of the train/test CV score distributions for AdaBoost.
plot_result(ada_result, model='AdaBoost')
AI写代码

HistGradientBoostingClassifier
The following HistGradientBoostingClassifier initialization code illustrates the key tuning parameters that we previously introduced, in addition to those that we are familiar with from looking at standalone decision tree models.
This estimator is much faster than GradientBoostingClassifier for big datasets (n_samples >= 10 000).
This estimator has native support for missing values (NaNs). During training, the tree grower learns at each split point whether samples with missing values should go to the left or right child, based on the potential gain. When predicting, samples with missing values are assigned to the left or right child consequently. If no missing values were encountered for a given feature during training, then samples with missing values are mapped to whichever child has the most samples.
# Histogram-based gradient boosting classifier for the binary direction label.
gb_clf = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                        learning_rate=0.1,
                                        # AdaBoost uses n_estimators for the number of
                                        # boosting stages; this estimator uses max_iter.
                                        max_iter=100,
                                        min_samples_leaf=20,
                                        max_depth=None,
                                        random_state=None,
                                        max_leaf_nodes=31,  # opt value depends on feature interaction
                                        warm_start=False,
                                        verbose=0,
                                        tol=0.0001)
AI写代码
建模后交叉验证
# Gradient boosting CV on the sample with NaN rows removed (dropna=True).
algo = 'sklearn_gbm'
fname = results_path / f'{algo}.joblib'
# Reuse cached CV scores when present; otherwise run CV and cache the result.
if fname.exists():
    gb_cv_result = joblib.load(fname)
else:
    gb_cv_result, run_time[algo] = run_cv(gb_clf, y=y_clean, X=X_dummies_clean)
    joblib.dump(gb_cv_result, fname)

gb_result = stack_results(gb_cv_result)
# Mean score per metric and data set (shown when run as a notebook cell).
gb_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
AI写代码

# Box plots of the train/test CV score distributions for gradient boosting.
plot_result(gb_result, model='Gradient Boosting Classifier')
AI写代码

Partial Dependence Plots(重要)
12.Partial_dependence概念等同于统计学领域中的边际效应概念,在固定其他变量不变的前提下变动目标特征变量的取值范围并观察模型预测效果的变化情况
在分析过程中需要注意的是,在进行分析前需要去除与公司IPO时间相关的数据字段,并采取措施以防止数据因时间和时序关系而产生的过度依赖性影响
13. product(A,B)函数,返回A和B中的元素组成的笛卡尔积的元组 ([0, 1], repeat=2)表示([0, 1],[0, 1])
14. '{:.0%}'.format(y)是格式化的灵活用法之一,应当掌握:它把小数格式化为不带小数位的百分数,例如 '{:.0%}'.format(0.25) 返回 '25%'。
当我们仅聚焦于单个 feature 与 target 之间的相互作用时, 其呈现的结果则为二维(2D)。如果综合考虑两个 feature 的影响, 则其表现为三维(3D)结构。
# Feature matrix for the partial-dependence analysis: the integer-encoded
# clean sample without the 'year' and 'month' columns.
X_ = X_factors_clean.drop(columns=['year', 'month'])
# NOTE: relies on `algo` still holding the value set in the CV cell above.
fname = results_path / f'{algo}_model.joblib'
# Load the cached fitted model when present; otherwise fit and cache it.
if fname.exists():
    gb_clf = joblib.load(fname)
else:
    gb_clf.fit(y=y_clean, X=X_)
    joblib.dump(gb_clf, fname)
# Score on the same data passed to fit above (in-sample when fitted here).
gb_clf.score(X=X_, y=y_clean)
>>>0.5889181460403748
# Predicted probability of the positive class (second predict_proba column).
y_score = gb_clf.predict_proba(X_)[:, 1]
# AUC of those probabilities against the labels of the same sample.
roc_auc_score(y_true=y_clean, y_score=y_score)
>>>0.6183261924270361
AI写代码
画图
# 2x2 grid: three one-way partial dependence plots plus one two-way plot.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

plot_partial_dependence(
    estimator=gb_clf,
    X=X_,
    features=['return_12m', 'return_6m', 'CMA', ('return_12m', 'return_6m')],
    percentiles=(0.05, 0.95),
    n_jobs=-1,
    n_cols=2,
    response_method='decision_function',
    grid_resolution=250,
    ax=axes)

# Percent tick labels on every x-axis except the CMA panel (row 1, column 0).
for i, j in product([0, 1], repeat=2):
    if (i, j) == (1, 0):
        continue
    axes[i][j].xaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))

# The two-way panel shows a return on its y-axis as well.
axes[1][1].yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))

# Axis labels, set after plotting so they replace the default ones.
axes[0][0].set_xlabel('12-Months Return')
axes[0][0].set_ylabel('Partial Dependence')
axes[0][1].set_xlabel('6-Months Return')
axes[1][0].set_xlabel('Conservative Minus Aggressive')
axes[1][0].set_ylabel('Partial Dependence')
axes[1][1].set_xlabel('12-Month Return')
axes[1][1].set_ylabel('6-Months Return')

fig.suptitle('Partial Dependence Plots', fontsize=16)
fig.tight_layout()
fig.subplots_adjust(top=.95)
AI写代码

3D
# Two-way partial dependence of the model output on 12- and 6-month returns,
# drawn as a 3D surface.
targets = ['return_12m', 'return_6m']
# `axes` holds the grid values evaluated for each target feature.
pdp, axes = partial_dependence(estimator=gb_clf,
                               features=targets,
                               X=X_,
                               grid_resolution=100)

XX, YY = np.meshgrid(axes[0], axes[1])
# Reshape to (n_grid_12m, n_grid_6m), then transpose to match meshgrid layout.
Z = pdp[0].reshape(list(map(np.size, axes))).T

fig = plt.figure(figsize=(14, 8))
# Create the 3D axes through the figure: on current Matplotlib a bare
# Axes3D(fig) is no longer attached to the figure, leaving it blank.
ax = fig.add_subplot(111, projection='3d')
surface = ax.plot_surface(XX, YY, Z,
                          rstride=1,
                          cstride=1,
                          cmap=plt.cm.BuPu,
                          edgecolor='k')
ax.set_xlabel('12-Month Return')
ax.set_ylabel('6-Month Return')
ax.set_zlabel('Partial Dependence')
# Camera position: elevation and azimuth angles in degrees.
ax.view_init(elev=22, azim=30)
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))
ax.xaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))

fig.suptitle('Partial Dependence by 6- and 12-month Returns', fontsize=16)
fig.tight_layout()
AI写代码

ax.view_init() 用于调整三维图像的观察视角, 即确定观察图像的相机位置: azim参数是绕z轴转动的方位角, elev参数是视线相对于x-y平面的仰角.
grid_resolution参数表示每个目标特征在网格上等间距取点的数量,默认设置为100。通常建议避免将grid_resolution参数设置过大,否则计算量会明显增加,并且可能导致图形出现明显的锯齿状效果
18. 该函数被设计用于基于输入的一组独立坐标向量序列来创建相应的网格结构 meshgrid用法
XGBoost
# XGBoost classifier with shallow trees for the binary direction label.
# NOTE(review): newer XGBoost releases use `verbosity` instead of `silent` -
# confirm against the installed version.
xgb_clf = XGBClassifier(max_depth=3,
                        learning_rate=0.1,
                        n_estimators=100,  # Number of boosted trees to fit.
                        silent=True,  # Whether to print messages while running
                        objective='binary:logistic',  # Task and objective or custom objective function
                        booster='gbtree',  # Select booster: gbtree, gblinear or dart
                        n_jobs=-1,
                        gamma=0,  # Min loss reduction for further splits
                        min_child_weight=1,  # Min sum of sample weight(hessian) needed
                        max_delta_step=0,  # Max delta step for each tree's weight estimation
                        subsample=1,  # Subsample ratio of training samples
                        colsample_bytree=1,  # Subsample ratio of cols for each tree
                        colsample_bylevel=1,  # Subsample ratio of cols for each split
                        reg_alpha=0,  # L1 regularization term on weights
                        reg_lambda=1,  # L2 regularization term on weights
                        scale_pos_weight=1,  # Balancing class weights
                        base_score=0.5,  # Initial prediction score; global bias
                        random_state=42)  # random seed
AI写代码
建模后cv
# XGBoost CV on run_cv's default data (the full one-hot encoded sample).
algo = 'xgboost'
fname = results_path / f'{algo}.joblib'
# Reuse cached CV scores when present; otherwise run CV and cache the result.
if fname.exists():
    xgb_cv_result = joblib.load(fname)
else:
    xgb_cv_result, run_time[algo] = run_cv(xgb_clf)
    joblib.dump(xgb_cv_result, fname)

# The name `xbg_result` (sic) is kept because later cells refer to it.
xbg_result = stack_results(xgb_cv_result)
# Mean score per metric and data set (shown when run as a notebook cell).
xbg_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
AI写代码

# Box plots of the train/test CV score distributions for XGBoost.
plot_result(xbg_result, model='XG Boost')
AI写代码

Feature Importance
特征重要性分析

# cross_validate() fits clones of the estimator, so `xgb_clf` itself has not
# been fitted at this point; fit it on the CV data before reading importances.
xgb_clf.fit(X=X_dummies, y=y)

fi = pd.Series(xgb_clf.feature_importances_,
               index=X_dummies.columns)
# Horizontal bar chart of the 25 most important features, largest on top.
fi.nlargest(25).sort_values().plot.barh(figsize=(10, 5),
                                        title='Feature Importance')
sns.despine()
plt.tight_layout()
AI写代码

LightGBM
# LightGBM classifier; feature importances are reported by gain.
# NOTE(review): the `silent` argument may not be accepted by newer LightGBM
# releases - confirm against the installed version.
lgb_clf = LGBMClassifier(boosting_type='gbdt',
                         objective='binary',  # learning task
                         metric='auc',
                         num_leaves=31,  # Maximum tree leaves for base learners.
                         max_depth=-1,  # Maximum tree depth for base learners, -1 means no limit.
                         learning_rate=0.1,  # Adaptive lr via callback override in .fit() method
                         n_estimators=100,  # Number of boosted trees to fit
                         subsample_for_bin=200000,  # Number of samples for constructing bins.
                         class_weight=None,  # dict, 'balanced' or None
                         min_split_gain=0.0,  # Minimum loss reduction for further split
                         min_child_weight=0.001,  # Minimum sum of instance weight(hessian)
                         min_child_samples=20,  # Minimum number of data need in a child(leaf)
                         subsample=1.0,  # Subsample ratio of training samples
                         subsample_freq=0,  # Frequency of subsampling, <=0: disabled
                         colsample_bytree=1.0,  # Subsampling ratio of features
                         reg_alpha=0.0,
                         reg_lambda=0.0,
                         random_state=42,  # Random number seed; default: C++ seed
                         n_jobs=-1,  # Number of parallel threads.
                         silent=False,
                         importance_type='gain',  # default: 'split' or 'gain'
                         )
AI写代码
cv
# LightGBM CV on the integer-encoded features; the columns in `cat_cols` are
# passed to fit() as categorical features.
algo = 'lgb_factors'
fname = results_path / f'{algo}.joblib'
# Reuse cached CV scores when present; otherwise run CV and cache the result.
if fname.exists():
    lgb_factor_cv_result = joblib.load(fname)
else:
    lgb_factor_cv_result, run_time[algo] = run_cv(lgb_clf, X=X_factors, fit_params={'categorical_feature': cat_cols})
    joblib.dump(lgb_factor_cv_result, fname)

lgb_factor_result = stack_results(lgb_factor_cv_result)
# Mean score per metric and data set (shown when run as a notebook cell).
lgb_factor_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
AI写代码

# Box plots of the train/test CV score distributions for LightGBM on the
# integer-encoded features.
plot_result(lgb_factor_result, model='Light GBM | Factors')
AI写代码

# LightGBM CV on run_cv's default data (the full one-hot encoded sample).
algo = 'lgb_dummies'
fname = results_path / f'{algo}.joblib'
# Reuse cached CV scores when present; otherwise run CV and cache the result.
if fname.exists():
    lgb_dummy_cv_result = joblib.load(fname)
else:
    lgb_dummy_cv_result, run_time[algo] = run_cv(lgb_clf)
    joblib.dump(lgb_dummy_cv_result, fname)

lgb_dummy_result = stack_results(lgb_dummy_cv_result)
# Mean score per metric and data set (shown when run as a notebook cell).
lgb_dummy_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
AI写代码

# Box plots for LightGBM on the one-hot encoded features. The title
# previously read "Factors", duplicating the label of the other LightGBM plot.
plot_result(lgb_dummy_result, model='Light GBM | Dummies')
AI写代码

Compare Results
# Long-format CV results of every model, keyed by display name.
results = dict([
    ('Baseline', dummy_result),
    ('Random Forest', rf_result),
    ('AdaBoost', ada_result),
    ('Gradient Booster', gb_result),
    ('XGBoost', xbg_result),
    ('LightGBM Dummies', lgb_dummy_result),
    ('LightGBM Factors', lgb_factor_result),
])

# One column per model holding its mean test score for each metric.
df = pd.DataFrame()
for model, result in results.items():
    mean_by_split = result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
    test_column = mean_by_split['Test'].to_frame(model)
    df = pd.concat([df, test_column], axis=1)

# Models as rows, ranked by mean test AUC (shown when run as a notebook cell).
df.T.sort_values('AUC', ascending=False)
AI写代码

