
Machine-Learning-for-Algorithmic-Trading-Second-Edition / Gradient Boosting


Author: He Baisheng, Harbin Institute of Technology (Weihai), School of Economics and Management, Quantitative Finance.

The MLAT series of posts is written for a course assignment and published in blog form. My assigned topic is gradient boosting (the book's 12_gradient_boosting notebooks). The previous post built the dataset; this post applies several boosting methods to it.

Imports and Settings

    import sys, os
    import warnings
    from time import time
    from itertools import product
    import joblib
    from pathlib import Path
    import numpy as np
    import pandas as pd

    import matplotlib.pyplot as plt
    from matplotlib.ticker import FuncFormatter
    from mpl_toolkits.mplot3d import Axes3D
    import seaborn as sns

    from xgboost import XGBClassifier
    from lightgbm import LGBMClassifier
    from catboost import CatBoostClassifier
    from sklearn.model_selection import cross_validate
    from sklearn.dummy import DummyClassifier
    from sklearn.tree import DecisionTreeClassifier
    # needed for HistGradientBoostingClassifier
    from sklearn.experimental import enable_hist_gradient_boosting
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier
    from sklearn.inspection import partial_dependence, plot_partial_dependence
    from sklearn.metrics import roc_auc_score

Other settings

    results_path = Path(r'E:/machine learning for algorithmic trading', 'results', 'baseline')

    warnings.filterwarnings('ignore')
    sns.set_style("whitegrid")
    idx = pd.IndexSlice
    np.random.seed(42)

    DATA_STORE = r'E:/machine learning for algorithmic trading/wiki.h5'

Prepare Data

This is the dataset built in the previous post; in the book's GitHub repository it belongs to Chapter 4.

    def get_data(start='2000', end='2018', task='classification', holding_period=1, dropna=False):
        idx = pd.IndexSlice
        target = f'target_{holding_period}m'

        with pd.HDFStore(DATA_STORE) as store:
            df = store['engineered_features']

        if start is not None and end is not None:
            df = df.loc[idx[:, start: end], :]
        if dropna:
            df = df.dropna()

        # y = 1 if the target return is positive, else 0;
        # used by the classification task to predict the direction of the return
        y = (df[target] > 0).astype(int)

        X = df.drop([c for c in df.columns if c.startswith('target')], axis=1)
        return y, X

Factorize Categories

2. pd.factorize encodes sector as integer category codes (e.g., construction and manufacturing become categories 0 and 1). The sort parameter produces ordered codes; without sorting, codes are assigned in order of first appearance, and each newly encountered category gets the next code. The function returns a tuple (codes, uniques); a small illustration follows the code below.

    cat_cols = ['year', 'month', 'age', 'msize', 'sector']

    def factorize_cats(df, cats=['sector']):
        cat_cols = ['year', 'month', 'age', 'msize'] + cats
        for cat in cats:
            df[cat] = pd.factorize(df[cat])[0]

        df.loc[:, cat_cols] = df.loc[:, cat_cols].fillna(-1).astype(int)
        return df
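A minimal illustration of what pd.factorize returns (toy values, not from the project's dataset):

    import pandas as pd

    sectors = pd.Series(['Construction', 'Manufacturing', 'Construction', 'Finance'])

    codes, uniques = pd.factorize(sectors)               # codes assigned by order of first appearance
    # codes   -> array([0, 1, 0, 2])
    # uniques -> Index(['Construction', 'Manufacturing', 'Finance'], dtype='object')

    codes_sorted, _ = pd.factorize(sectors, sort=True)   # codes follow the sorted category order
    # codes_sorted -> array([0, 2, 0, 1])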

One-Hot Encoding

3. One-hot encoding transforms categorical variables into a form that machine learning algorithms can use directly. The get_dummies function returns a DataFrame of indicator (dummy) columns; it only produces a sparse representation if sparse=True is passed.

Notes:

When applying get_dummies, be careful to avoid multicollinearity. For example, when encoding a gender column with get_dummies, one of the resulting columns must be dropped, because knowing one column fully determines the other (see the short sketch after these notes).

Besides one-hot encoding and factorize, the map() function can be used to achieve a similar categorical encoding.

When encoding for a classification task, also consider whether the categories are genuinely distinguishable: categories such as red and yellow carry little inherent distinction, whereas an attribute like age is far more discriminative.
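A minimal sketch of the multicollinearity point from the first note, using a hypothetical gender column (not part of this project's data); drop_first=True removes the redundant column:

    import pandas as pd

    df = pd.DataFrame({'gender': ['M', 'F', 'F', 'M']})

    pd.get_dummies(df, columns=['gender'])
    # columns: gender_F, gender_M      <- knowing one column determines the other
    pd.get_dummies(df, columns=['gender'], drop_first=True)
    # columns: gender_M                <- one column dropped to avoid perfect collinearity

Dropping a column mainly matters for linear models; the project's get_one_hot_data below keeps all dummy columns, since the tree-based models used here are less sensitive to this redundancy.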

    def get_one_hot_data(df, cols=cat_cols[:-1]):
        # columns lists the columns to be dummy-encoded;
        # get_dummies has default prefixes, but prefix/prefix_sep are set here
        # so that the sector dummies do not carry a 'sector' prefix
        df = pd.get_dummies(df,
                            columns=cols + ['sector'],
                            prefix=cols + [''],
                            prefix_sep=['_'] * len(cols) + [''])
        return df.rename(columns={c: c.replace('.0', '') for c in df.columns})

Get Holdout Set

The holdout set is used to estimate the generalization error after cross-validation.

    def get_holdout_set(target, features, period=6):
        idx = pd.IndexSlice
        label = target.name
        # use the target's own index (avoids relying on the global y)
        dates = np.sort(target.index.get_level_values('date').unique())
        cv_start, cv_end = dates[0], dates[-period - 2]
        holdout_start, holdout_end = dates[-period - 1], dates[-1]
        # most of the data is used for cross-validation;
        # the last seven dates are kept as the holdout (test) set

        df = features.join(target.to_frame())
        train = df.loc[idx[:, cv_start: cv_end], :]
        y_train, X_train = train[label], train.drop(label, axis=1)

        test = df.loc[idx[:, holdout_start: holdout_end], :]
        y_test, X_test = test[label], test.drop(label, axis=1)
        return y_train, X_train, y_test, X_test

Load Data

    y, features = get_data()
    X_dummies = get_one_hot_data(features)
    X_factors = factorize_cats(features)

    y_clean, features_clean = get_data(dropna=True)
    X_dummies_clean = get_one_hot_data(features_clean)
    X_factors_clean = factorize_cats(features_clean)
    # the 'clean' versions drop all rows where lagged returns are NaN

Cross-Validation Setup

Cross-validation; the idea is straightforward. Twelve folds are used here.

    class OneStepTimeSeriesSplit:
        """Generates tuples of train_idx, test_idx pairs
        Assumes the index contains a level labeled 'date'"""

        def __init__(self, n_splits=3, test_period_length=1, shuffle=False):
            self.n_splits = n_splits
            self.test_period_length = test_period_length
            self.shuffle = shuffle

        @staticmethod
        def chunks(l, n):
            for i in range(0, len(l), n):
                yield l[i:i + n]

        def split(self, X, y=None, groups=None):
            unique_dates = (X.index
                            .get_level_values('date')
                            .unique()
                            .sort_values(ascending=False)
                            [:self.n_splits * self.test_period_length])

            dates = X.reset_index()[['date']]
            for test_date in self.chunks(unique_dates, self.test_period_length):
                train_idx = dates[dates.date < min(test_date)].index
                test_idx = dates[dates.date.isin(test_date)].index
                if self.shuffle:
                    np.random.shuffle(list(train_idx))
                # yield returns one (train, test) pair per loop iteration (generator)
                yield train_idx, test_idx

        def get_n_splits(self, X, y, groups=None):
            return self.n_splits

Instantiating the class

    cv = OneStepTimeSeriesSplit(n_splits=12,
                                test_period_length=1,
                                shuffle=False)

    run_time = {}
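As a quick sanity check (a sketch, not part of the original notebook), you can iterate over the splits directly; each fold tests on one date and trains on all earlier dates. It assumes X_dummies, built above, has a MultiIndex with a 'date' level:

    for fold, (train_idx, test_idx) in enumerate(cv.split(X_dummies)):
        # training sets shrink for later folds, since each fold only uses dates before its test date
        print(f'fold {fold:2d} | train: {len(train_idx):7,d} | test: {len(test_idx):6,d}')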

CV Metrics

Evaluation metrics used in cross-validation

    metrics = {'balanced_accuracy': 'Accuracy',
               'roc_auc': 'AUC',
               'neg_log_loss': 'Log Loss',
               'f1_weighted': 'F1',
               'precision_weighted': 'Precision',
               'recall_weighted': 'Recall'}


    def run_cv(clf, X=X_dummies, y=y, metrics=metrics, cv=cv, fit_params=None, n_jobs=-1):
        start = time()
        # scores is a dict: keys are the metric names, values are the per-fold scores
        scores = cross_validate(estimator=clf,
                                X=X,
                                y=y,
                                scoring=list(metrics.keys()),
                                cv=cv,
                                return_train_score=True,
                                n_jobs=n_jobs,
                                verbose=1,
                                fit_params=fit_params)

        duration = time() - start
        return scores, duration

CV Result Handler Functions

Functions to handle the CV results: building the metrics DataFrame and plotting.

4. melt() is the inverse of pivot() and another way of reshaping (unpivoting) data. It is very handy but hard to explain in a sentence or two; a small example follows.
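A minimal pd.melt illustration on toy data (column names and values are made up):

    import pandas as pd

    wide = pd.DataFrame({'train_AUC': [0.70, 0.72],
                         'test_AUC':  [0.55, 0.57]})

    long = pd.melt(wide, var_name='Dataset_Metric', value_name='Value')
    #   Dataset_Metric  Value
    #   train_AUC        0.70
    #   train_AUC        0.72
    #   test_AUC         0.55
    #   test_AUC         0.57
    # stack_results below does the same thing, but on a (Dataset, Metric) MultiIndex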

    def stack_results(scores):
        # build a MultiIndex from tuples such as ('train', 'roc_auc')
        columns = pd.MultiIndex.from_tuples(
            [tuple(m.split('_', 1)) for m in scores.keys()],
            names=['Dataset', 'Metric'])
        data = np.array(list(scores.values())).T
        df = (pd.DataFrame(data=data,
                           columns=columns)
              .iloc[:, 2:])
        results = pd.melt(df, value_name='Value')
        results.Metric = results.Metric.apply(lambda x: metrics.get(x))
        results.Dataset = results.Dataset.str.capitalize()
        return results


    def plot_result(df, model=None, fname=None):
        m = list(metrics.values())

        # catplot draws categorical data; one box plot panel per metric
        g = sns.catplot(x='Dataset',
                        y='Value',
                        hue='Dataset',
                        col='Metric',
                        data=df,
                        col_order=m,
                        order=['Train', 'Test'],
                        kind="box",
                        col_wrap=3,
                        sharey=False,
                        height=4, aspect=1.2)    # width = aspect * height

        df = df.groupby(['Metric', 'Dataset']).Value.mean().unstack().loc[m]

        # iterate over the subplots
        for i, ax in enumerate(g.axes.flat):
            # limit the number of decimal places
            s = f"Train: {df.loc[m[i], 'Train'] :>7.4f}\nTest:  {df.loc[m[i], 'Test'] :>7.4f}"

            # ax.text adds an annotation box to each panel;
            # transform=ax.transAxes switches to axes coordinates (0 to 1 across the panel),
            # without it the text would be placed in data coordinates
            ax.text(0.05, 0.85, s, fontsize=10, transform=ax.transAxes,
                    bbox=dict(facecolor='white', edgecolor='grey', boxstyle='round,pad=0.5'))
        g.fig.suptitle(model, fontsize=16)
        g.fig.subplots_adjust(top=.9)
        if fname:
            g.savefig(fname, dpi=300)

Baseline Classifier

The workflow from here on is largely repetitive: for each algorithm we set up the model, run cross-validation, visualize the results, and finally compare performance across the algorithms.

5. DummyClassifier is a classifier that predicts using simple rules. It is typically used only as a baseline against which more complex classifiers are compared.

6. After each run, the results are saved to a local path so they do not have to be recomputed; joblib is used for this.

Baseline

    dummy_clf = DummyClassifier(strategy='stratified',
                                random_state=42)

    algo = 'dummy_clf'

    fname = results_path / f'{algo}.joblib'
    if not Path(fname).exists():
        dummy_cv_result, run_time[algo] = run_cv(dummy_clf)
        joblib.dump(dummy_cv_result, fname)
    else:
        dummy_cv_result = joblib.load(fname)

    dummy_result = stack_results(dummy_cv_result)
    dummy_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
    plot_result(dummy_result, model='Dummy Classifier')

RandomForest

7. Parameter explanations are given in the code comments.

    rf_clf = RandomForestClassifier(n_estimators=100,        # number of trees in the forest
                                    criterion='gini',
                                    max_depth=None,
                                    min_samples_split=2,
                                    min_samples_leaf=1,
                                    # min_samples_split is the minimum number of samples required to split
                                    # a node; e.g. with min_samples_split=6, a node holding only 4 samples
                                    # is not split (regardless of impurity).
                                    # min_samples_leaf is the minimum number of samples a leaf must hold;
                                    # e.g. with min_samples_leaf=3, a node with 5 samples is not split into
                                    # leaves of size 2 and 3, because the smaller leaf would fall below 3.
                                    min_weight_fraction_leaf=0.0,
                                    max_features='auto',
                                    # 'auto' (the default) means sqrt(n_features): the number of features
                                    # drawn at random when looking for the best split
                                    max_leaf_nodes=None,
                                    min_impurity_decrease=0.0,
                                    min_impurity_split=None,
                                    bootstrap=True,          # sample with replacement (bootstrap)
                                    oob_score=True,          # use out-of-bag samples to estimate generalization accuracy
                                    n_jobs=-1,
                                    random_state=42,
                                    # verbose: 0 = no logging, 1 = occasional progress messages,
                                    # >1 = more frequent logging
                                    verbose=1)

    # n_jobs controls parallelism; the default (None, i.e. 1) uses one core, -1 uses all available CPUs

Question: the random forest run uses the 'clean' data; what difference does this make?

    algo = 'random_forest'

    fname = results_path / f'{algo}.joblib'
    if not Path(fname).exists():
        rf_cv_result, run_time[algo] = run_cv(rf_clf, y=y_clean, X=X_dummies_clean)
        joblib.dump(rf_cv_result, fname)
    else:
        rf_cv_result = joblib.load(fname)

    rf_result = stack_results(rf_cv_result)
    rf_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()

8. Investigating the large gap between train and test performance.

Note that the random forest fits the training data perfectly. Because of the large difference between train and test scores, I first suspected the test sets were too small; enlarging the test sets in cross-validation brought no clear improvement, and the training score remained at 1 throughout. The question is still open.

    plot_result(rf_result, model='Random Forest')

scikit-learn: AdaBoost

9. Parameter explanations are given in the code comments.

    # base learner: a decision stump (a tree of depth 1)
    base_estimator = DecisionTreeClassifier(criterion='gini',
                                            splitter='best',
                                            max_depth=1,
                                            min_samples_split=2,
                                            min_samples_leaf=20,
                                            min_weight_fraction_leaf=0.0,
                                            max_features=None,
                                            random_state=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            class_weight=None)

    # splitter: 'best' searches all split points of a feature for the optimal split,
    # which suits smaller samples; 'random' searches a random subset of split points
    # for a locally optimal split, which suits very large samples. Default: 'best'.

    # min_weight_fraction_leaf: the minimum weighted fraction of total sample weights
    # required at a leaf; default 0, i.e. weights are ignored. If a leaf falls below it,
    # the leaf and its sibling are pruned (the split is undone). Worth considering when
    # many samples have missing values or the class distribution is heavily skewed.

To reduce overfitting in AdaBoost, a regularization (shrinkage) term can be introduced. The shrinkage coefficient ν plays the role of the learning rate and takes values in (0, 1]. A larger ν reaches a given level of accuracy with fewer iterations, and therefore fewer weak learners; a smaller ν requires more iterations to reach the same classification accuracy.
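A small sketch of this trade-off on synthetic data (not the project's dataset; the 0.85 threshold and the toy data are arbitrary choices). staged_score tracks training accuracy after each boosting stage, so you can see how many weak learners a given learning rate needs:

    from sklearn.datasets import make_classification
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.tree import DecisionTreeClassifier

    X_toy, y_toy = make_classification(n_samples=2000, n_features=20, random_state=0)

    for lr in (1.0, 0.1):
        ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                                 n_estimators=200, learning_rate=lr,
                                 random_state=0).fit(X_toy, y_toy)
        # staged_score yields the training accuracy after each boosting stage
        staged = list(ada.staged_score(X_toy, y_toy))
        # a smaller learning rate typically needs more stages to reach the same accuracy;
        # n_needed is None if the threshold is not reached within n_estimators stages
        n_needed = next((i + 1 for i, s in enumerate(staged) if s >= 0.85), None)
        print(f'learning_rate={lr}: stages to reach 85% train accuracy: {n_needed}')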

    ada_clf = AdaBoostClassifier(base_estimator=base_estimator,
                                 # n_estimators controls the number of boosting stages
                                 n_estimators=100,
                                 learning_rate=1.0,
                                 algorithm='SAMME.R',
                                 random_state=42)

SAMME improves the ensemble by re-weighting the weak learners based on their predicted class labels, whereas SAMME.R re-weights them based on the predicted class probabilities; SAMME.R is the default.

    algo = 'adaboost'

    fname = results_path / f'{algo}.joblib'
    if not Path(fname).exists():
        ada_cv_result, run_time[algo] = run_cv(ada_clf, y=y_clean, X=X_dummies_clean)
        joblib.dump(ada_cv_result, fname)
    else:
        ada_cv_result = joblib.load(fname)

    ada_result = stack_results(ada_cv_result)
    ada_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
    plot_result(ada_result, model='AdaBoost')

HistGradientBoostingClassifier

The following HistGradientBoostingClassifier initialization code illustrates the key tuning parameters that we previously introduced, in addition to those that we are familiar with from looking at standalone decision tree models.

This estimator is much faster than GradientBoostingClassifier for big datasets (n_samples >= 10 000).

This estimator has native support for missing values (NaNs). During training, the tree grower learns at each split point whether samples with missing values should go to the left or right child, based on the potential gain. When predicting, samples with missing values are assigned to the left or right child accordingly. If no missing values were encountered for a given feature during training, then samples with missing values are mapped to whichever child has the most samples.
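A tiny sketch of the native NaN handling on toy data (X_toy, y_toy and the values are made up): the model trains and predicts even though X contains missing values, with no imputation step.

    import numpy as np
    from sklearn.experimental import enable_hist_gradient_boosting  # only needed for older scikit-learn
    from sklearn.ensemble import HistGradientBoostingClassifier

    X_toy = np.array([[1.0, np.nan], [2.0, 0.5], [np.nan, 1.5], [3.0, 2.0]] * 25)
    y_toy = np.array([0, 0, 1, 1] * 25)

    clf = HistGradientBoostingClassifier(max_iter=20).fit(X_toy, y_toy)
    print(clf.predict([[np.nan, 1.4]]))   # rows with NaN are routed to a child at each split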

    gb_clf = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                            learning_rate=0.1,
                                            # AdaBoost uses n_estimators to control the number of
                                            # boosting stages; HistGradientBoosting uses max_iter
                                            max_iter=100,
                                            min_samples_leaf=20,
                                            max_depth=None,
                                            random_state=None,
                                            max_leaf_nodes=31,    # optimal value depends on feature interaction
                                            warm_start=False,
                                            verbose=0,
                                            tol=0.0001)

Cross-validation after setting up the model

    algo = 'sklearn_gbm'

    fname = results_path / f'{algo}.joblib'
    if not Path(fname).exists():
        gb_cv_result, run_time[algo] = run_cv(gb_clf, y=y_clean, X=X_dummies_clean)
        joblib.dump(gb_cv_result, fname)
    else:
        gb_cv_result = joblib.load(fname)

    gb_result = stack_results(gb_cv_result)
    gb_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
    plot_result(gb_result, model='Gradient Boosting Classifier')

Partial Dependence Plots (important)

12. Partial dependence is analogous to the marginal-effect concept in statistics: holding the other variables fixed, vary the target feature over its range and observe how the model's predictions change. Before the analysis, the time-related fields (year and month in the code below) are dropped, to keep the results from being dominated by calendar-time and serial dependence.

13. product(A, B) returns tuples forming the Cartesian product of A and B; product([0, 1], repeat=2) is equivalent to product([0, 1], [0, 1]).

14. '{:.0%}'.format(y) is one of the flexible uses of string formatting and is worth knowing.

When we look at the interaction between a single feature and the target, the resulting plot is two-dimensional (2D); considering two features jointly gives a three-dimensional (3D) surface. A short illustration of notes 13 and 14 follows.
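A minimal, self-contained illustration (toy values, unrelated to the project's data):

    from itertools import product

    # product([0, 1], repeat=2) is the same as product([0, 1], [0, 1])
    print(list(product([0, 1], repeat=2)))   # [(0, 0), (0, 1), (1, 0), (1, 1)]

    # '{:.0%}' formats a fraction as a percentage with no decimals
    print('{:.0%}'.format(0.256))            # '26%'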

    X_ = X_factors_clean.drop(['year', 'month'], axis=1)

    fname = results_path / f'{algo}_model.joblib'
    if not Path(fname).exists():
        gb_clf.fit(y=y_clean, X=X_)
        joblib.dump(gb_clf, fname)
    else:
        gb_clf = joblib.load(fname)

    gb_clf.score(X=X_, y=y_clean)
    # 0.5889181460403748

    y_score = gb_clf.predict_proba(X_)[:, 1]
    roc_auc_score(y_score=y_score, y_true=y_clean)
    # 0.6183261924270361

Plotting

    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

    plot_partial_dependence(
        estimator=gb_clf,
        X=X_,
        features=['return_12m', 'return_6m', 'CMA', ('return_12m', 'return_6m')],
        percentiles=(0.05, 0.95),
        n_jobs=-1,
        n_cols=2,
        response_method='decision_function',
        grid_resolution=250,
        ax=axes)

    for i, j in product([0, 1], repeat=2):
        if i != 1 or j != 0:
            axes[i][j].xaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))

    axes[1][1].yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))

    axes[0][0].set_ylabel('Partial Dependence')
    axes[1][0].set_ylabel('Partial Dependence')
    axes[0][0].set_xlabel('12-Month Return')
    axes[0][1].set_xlabel('6-Month Return')
    axes[1][0].set_xlabel('Conservative Minus Aggressive')

    axes[1][1].set_xlabel('12-Month Return')
    axes[1][1].set_ylabel('6-Month Return')
    fig.suptitle('Partial Dependence Plots', fontsize=16)
    fig.tight_layout()
    fig.subplots_adjust(top=.95)

3D

    targets = ['return_12m', 'return_6m']
    pdp, axes = partial_dependence(estimator=gb_clf,
                                   features=targets,
                                   X=X_,
                                   grid_resolution=100)

    XX, YY = np.meshgrid(axes[0], axes[1])
    Z = pdp[0].reshape(list(map(np.size, axes))).T

    fig = plt.figure(figsize=(14, 8))
    ax = Axes3D(fig)
    surface = ax.plot_surface(XX, YY, Z,
                              rstride=1,
                              cstride=1,
                              cmap=plt.cm.BuPu,
                              edgecolor='k')
    ax.set_xlabel('12-Month Return')
    ax.set_ylabel('6-Month Return')
    ax.set_zlabel('Partial Dependence')
    ax.view_init(elev=22, azim=30)
    ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))
    ax.xaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))

    # fig.colorbar(surface)
    fig.suptitle('Partial Dependence by 6- and 12-month Returns', fontsize=16)
    fig.tight_layout()

ax.view_init() adjusts the viewing angle of the plot, i.e. the camera position: azim rotates around the z axis, while elev adjusts the elevation.

grid_resolution sets the number of points drawn along each axis (default 100). It is usually advisable not to set it too high, otherwise the surface may show a noticeably jagged, step-like pattern.

18. np.meshgrid builds coordinate grids from a set of 1-D coordinate vectors; a small example follows.
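A minimal np.meshgrid illustration (toy vectors):

    import numpy as np

    x = np.array([0.0, 0.5, 1.0])
    y = np.array([10.0, 20.0])

    XX, YY = np.meshgrid(x, y)
    # XX: each row repeats x           YY: each column repeats y
    # [[0.  0.5 1. ]                   [[10. 10. 10.]
    #  [0.  0.5 1. ]]                   [20. 20. 20.]]
    # together, (XX[i, j], YY[i, j]) enumerates every (x, y) grid point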

XGBoost

    xgb_clf = XGBClassifier(max_depth=3,
                            learning_rate=0.1,
                            n_estimators=100,             # Number of boosted trees to fit
                            silent=True,                  # Whether to print messages while running
                            objective='binary:logistic',  # Task and objective or custom objective function
                            booster='gbtree',             # Select booster: gbtree, gblinear or dart
                            n_jobs=-1,
                            gamma=0,                      # Min loss reduction for further splits
                            min_child_weight=1,           # Min sum of sample weight (hessian) needed
                            max_delta_step=0,             # Max delta step for each tree's weight estimation
                            subsample=1,                  # Subsample ratio of training samples
                            colsample_bytree=1,           # Subsample ratio of cols for each tree
                            colsample_bylevel=1,          # Subsample ratio of cols for each split
                            reg_alpha=0,                  # L1 regularization term on weights
                            reg_lambda=1,                 # L2 regularization term on weights
                            scale_pos_weight=1,           # Balancing class weights
                            base_score=0.5,               # Initial prediction score; global bias
                            random_state=42)              # Random seed

Cross-validation after setting up the model

    algo = 'xgboost'

    fname = results_path / f'{algo}.joblib'
    if not Path(fname).exists():
        xgb_cv_result, run_time[algo] = run_cv(xgb_clf)
        joblib.dump(xgb_cv_result, fname)
    else:
        xgb_cv_result = joblib.load(fname)

    xgb_result = stack_results(xgb_cv_result)
    xgb_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
    plot_result(xgb_result, model='XGBoost')

Feature Importance

Feature importance analysis

    # note: feature_importances_ requires a fitted model; run_cv/cross_validate fit clones,
    # so fit xgb_clf on the full data first if it has not been fitted yet
    fi = pd.Series(xgb_clf.feature_importances_,
                   index=X_dummies.columns)

    fi.nlargest(25).sort_values().plot.barh(figsize=(10, 5),
                                            title='Feature Importance')
    sns.despine()
    plt.tight_layout();

LightGBM

    lgb_clf = LGBMClassifier(boosting_type='gbdt',
                             objective='binary',          # learning task
                             metric='auc',
                             num_leaves=31,               # Maximum tree leaves for base learners
                             max_depth=-1,                # Maximum tree depth for base learners, -1 means no limit
                             learning_rate=0.1,           # Adaptive lr via callback override in .fit() method
                             n_estimators=100,            # Number of boosted trees to fit
                             subsample_for_bin=200000,    # Number of samples for constructing bins
                             class_weight=None,           # dict, 'balanced' or None
                             min_split_gain=0.0,          # Minimum loss reduction for further split
                             min_child_weight=0.001,      # Minimum sum of instance weight (hessian)
                             min_child_samples=20,        # Minimum number of data needed in a child (leaf)
                             subsample=1.0,               # Subsample ratio of training samples
                             subsample_freq=0,            # Frequency of subsampling, <=0: disabled
                             colsample_bytree=1.0,        # Subsampling ratio of features
                             reg_alpha=0.0,               # L1 regularization term on weights
                             reg_lambda=0.0,              # L2 regularization term on weights
                             random_state=42,             # Random number seed; default: C++ seed
                             n_jobs=-1,                   # Number of parallel threads
                             silent=False,
                             importance_type='gain')      # 'split' (default) or 'gain'

Cross-validation

    algo = 'lgb_factors'

    fname = results_path / f'{algo}.joblib'
    if not Path(fname).exists():
        lgb_factor_cv_result, run_time[algo] = run_cv(lgb_clf, X=X_factors,
                                                      fit_params={'categorical_feature': cat_cols})
        joblib.dump(lgb_factor_cv_result, fname)
    else:
        lgb_factor_cv_result = joblib.load(fname)

    lgb_factor_result = stack_results(lgb_factor_cv_result)
    lgb_factor_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
    plot_result(lgb_factor_result, model='Light GBM | Factors')
    algo = 'lgb_dummies'

    fname = results_path / f'{algo}.joblib'
    if not Path(fname).exists():
        lgb_dummy_cv_result, run_time[algo] = run_cv(lgb_clf)
        joblib.dump(lgb_dummy_cv_result, fname)
    else:
        lgb_dummy_cv_result = joblib.load(fname)

    lgb_dummy_result = stack_results(lgb_dummy_cv_result)
    lgb_dummy_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()
    plot_result(lgb_dummy_result, model='Light GBM | Dummies')

Compare Results

    results = {'Baseline': dummy_result,
               'Random Forest': rf_result,
               'AdaBoost': ada_result,
               'Gradient Booster': gb_result,
               'XGBoost': xgb_result,
               'LightGBM Dummies': lgb_dummy_result,
               'LightGBM Factors': lgb_factor_result}

    df = pd.DataFrame()
    for model, result in results.items():
        df = pd.concat([df, result.groupby(['Metric', 'Dataset'])
                        .Value.mean().unstack()['Test'].to_frame(model)], axis=1)

    df.T.sort_values('AUC', ascending=False)
