
Tianchi Industrial Steam Volume Prediction


    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    
    import seaborn as sns  # plotting
    
    from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor,AdaBoostRegressor,ExtraTreesRegressor
    from xgboost import XGBRegressor
    from lightgbm import LGBMRegressor
    
    # support vector machine
    from sklearn.svm import SVR
    
    # evaluation metric
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler,StandardScaler,PolynomialFeatures

Data Concatenation

    train = pd.read_csv('./zhengqi_train.txt',sep = '\t')
    test = pd.read_csv('./zhengqi_test.txt',sep = '\t')
    # concatenate train and test; some preparation is needed first:
    # add a column to both frames marking where each row came from
    train['origin'] = 'train'
    test['origin'] = 'test'
    
    # concatenate train and test
    data_all = pd.concat([train,test])
    print(data_all.shape)
    data_all.head()
    
    
    # 38 features; drop the unimportant ones.
    # Where a feature's distribution differs between the training and test data, drop it.
    plt.figure(figsize=(9,38*6))
    for i,col in enumerate(data_all.columns[:-2]):
        cond = data_all['origin'] == 'train'
        train_col = data_all[col][cond]  # training data
        cond = data_all['origin'] == 'test'
        test_col = data_all[col][cond]   # test data
        axes = plt.subplot(38,1,i+1)
        ax = sns.kdeplot(train_col,shade = True)
        sns.kdeplot(test_col,shade = True,ax = ax)
        plt.legend(['train','test'])
        plt.xlabel(col)
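The KDE overlays above are judged by eye. To put a number on the train/test distribution gap, a two-sample Kolmogorov-Smirnov test can be used; this is a minimal sketch (not in the original notebook) based on scipy.stats.ks_2samp:

    from scipy.stats import ks_2samp
    
    # For each feature, compare the train and test samples;
    # a small p-value means the two distributions differ significantly.
    for col in data_all.columns[:-2]:
        train_col = data_all.loc[data_all['origin'] == 'train', col]
        test_col = data_all.loc[data_all['origin'] == 'test', col]
        stat, p = ks_2samp(train_col, test_col)
        if p < 0.01:
            print(f'{col}: KS statistic = {stat:.3f}, p = {p:.3g}')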
    plt.figure(figsize=(9,6))
    for col in data_all.columns[:-2]:
        g = sns.FacetGrid(data_all,col = 'origin')
        g.map(sns.distplot,col)  # distribution plot per origin
    drop_labels = ['V11','V17','V22','V5']
    data_all.drop(drop_labels,axis = 1,inplace=True)
    data_all.shape

Correlation Coefficients

    # covariance matrix
    cov = data_all.cov()
    cov.head()
    # correlation coefficient matrix
    corr = data_all.corr()
    corr.head()
    # Use the correlation coefficients to find the 7 attributes weakly correlated with the target
    cond = corr.loc['target'].abs() < 0.1
    drop_labels = corr.loc['target'].index[cond]
    # Index(['V14', 'V21', 'V25', 'V26', 'V32', 'V33', 'V34'], dtype='object')
    drop_labels
    
    # After inspecting the attribute distributions, drop only the poorly distributed ones
    drop_labels = ['V14', 'V21']
    data_all.drop(drop_labels,axis = 1,inplace=True)
    data_all.shape
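Before settling on the 0.1 cutoff, it can help to see the whole ranking of correlations; a quick check (not in the original):

    # absolute correlation of every feature with the target, weakest first
    corr.loc['target'].abs().sort_values().head(10)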
    # visualize the pairwise correlation strengths
    plt.figure(figsize=(20, 16))  # set figure width and height
    mcorr = train.corr()  # correlation matrix: correlation of every pair of variables
    mask = np.zeros_like(mcorr, dtype=bool)  # boolean matrix with the same shape as mcorr
    
    mask[np.triu_indices_from(mask)] = True  # True above the diagonal
    # colors
    cmap = sns.diverging_palette(220, 10, as_cmap=True)  # returns a matplotlib colormap
    g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')  # heatmap of pairwise correlations
    plt.show()

Standardization

    data_all[data_all['origin'] == 'test'].describe()
    data_all[data_all['origin'] == 'train'].describe()
    stand = StandardScaler()
    data = data_all.iloc[:,:-2]
    data2 = stand.fit_transform(data)
    data2
    cols = data_all.columns
    data_all_std = pd.DataFrame(data2,columns=cols[:-2])
    data_all_std
    # reset the index to 0..n-1 so the positional merge below lines up
    data_all.index = np.arange(data_all.shape[0])
    data_all
    data_all_std = pd.merge(data_all_std,data_all.iloc[:,-2:],right_index=True,left_index=True)
    data_all_std.head()
    data_all_std.describe()

Training and Testing with Different Algorithms

    # outlier detection
    from sklearn.linear_model import RidgeCV
    data_all_std.head()
    ridge = RidgeCV(alphas=[0.0001,0.001,0.01,0.1,0.2,0.5,1,2,3,4,5,10,20,30,50])
    
    cond = data_all_std['origin'] == 'train'
    
    X_train = data_all_std[cond].iloc[:,:-2]
    # ground-truth targets
    y_train = data_all_std[cond]['target']
    # the model can never fit the data and the targets 100% perfectly
    ridge.fit(X_train,y_train)
    # predictions always deviate somewhat from the truth; treat very large deviations as outliers
    y_ = ridge.predict(X_train)
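RidgeCV picks the regularization strength from the alphas list by cross-validation (leave-one-out by default); after fitting, the chosen value is available on the estimator. A quick check (not part of the original notebook):

    # which alpha did cross-validation select?
    print(ridge.alpha_)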
    cond = abs(y_train - y_) > y_train.std()*0.8
    cond.sum()
    # plot the flagged outliers in red
    plt.figure(figsize=(12,6))
    axes = plt.subplot(1,3,1)
    axes.scatter(y_train,y_)
    axes.scatter(y_train[cond],y_[cond],c = 'red',s = 20)
    
    axes = plt.subplot(1,3,2)
    axes.scatter(y_train,y_train - y_)
    axes.scatter(y_train[cond],(y_train - y_)[cond],c = 'red')
    
    axes = plt.subplot(1,3,3)
    # _ = axes.hist(y_train,bins = 50)
    (y_train - y_).plot.hist(bins = 50,ax = axes)
    (y_train - y_).loc[cond].plot.hist(bins = 50,ax = axes,color = 'r')
    data_all_std
    # filter out the outlier rows
    drop_index = cond[cond].index
    print(data_all_std.shape)
    data_all_std.drop(drop_index,axis = 0,inplace=True)
    data_all_std.shape
    def detect_model(estimators, data):
        for key, estimator in estimators.items():
            estimator.fit(data[0], data[2])
            y_ = estimator.predict(data[1])
            mse = mean_squared_error(data[3], y_)
            print('-------------------mse%s' % (key), mse)
            r2 = estimator.score(data[1], data[3])
            print('+++++++++++++++++++r2_score%s' % (key), r2)
            print('\n')
    cond = data_all_std['origin'] == 'train'
    X = data_all_std[cond].iloc[:,:-2]
    y = data_all_std[cond]['target']
    data = train_test_split(X,y,test_size = 0.2)
    estimators = {}
    estimators['knn'] = KNeighborsRegressor()
    estimators['linear'] = LinearRegression()
    estimators['ridge'] = Ridge()
    estimators['lasso'] = Lasso()
    estimators['elasticnet'] = ElasticNet()
    estimators['forest'] = RandomForestRegressor()
    estimators['gbdt'] = GradientBoostingRegressor()
    estimators['ada'] = AdaBoostRegressor()
    estimators['extreme'] = ExtraTreesRegressor()
    estimators['svm_rbf'] = SVR(kernel='rbf')
    estimators['svm_poly'] = SVR(kernel='poly')
    estimators['light'] = LGBMRegressor()
    estimators['xgb'] = XGBRegressor()
    # On this hold-out split, KNN, Lasso, ElasticNet and SVM_poly perform poorly
    detect_model(estimators,data)
    # drop the methods that did not work well: the linear models
    estimators = {}
    # estimators['linear'] = LinearRegression()
    # estimators['ridge'] = Ridge()
    # estimators['lasso'] = Lasso()
    estimators['forest'] = RandomForestRegressor()
    estimators['gbdt'] = GradientBoostingRegressor()
    estimators['ada'] = AdaBoostRegressor()
    estimators['extreme'] = ExtraTreesRegressor()
    estimators['svm_rbf'] = SVR(kernel='rbf')
    estimators['light'] = LGBMRegressor()
    estimators['xgb'] = XGBRegressor()
    cond = data_all_std['origin'] == 'train'
    
    X_train = data_all_std[cond].iloc[:,:-2]
    y_train = data_all_std[cond]['target']
    
    cond = data_all_std['origin'] == 'test'
    X_test = data_all_std[cond].iloc[:,:-2]
    # let every model predict, then average the predictions
    y_pred = []
    for key, model in estimators.items():
        model.fit(X_train, y_train)
        y_ = model.predict(X_test)
        y_pred.append(y_)
    
    y_ = np.mean(y_pred, axis=0)
    pd.Series(y_).to_csv('./ensemble2.txt', index=False)
    # Use the predictions as new features and let the models learn again:
    # the gap between predicted and true values becomes something a second pass can exploit
    for key, model in estimators.items():
        model.fit(X_train, y_train)
        y_ = model.predict(X_train)
        X_train[key] = y_
        y_ = model.predict(X_test)
        X_test[key] = y_
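This loop is a hand-rolled form of stacking, and fitting the base models and then reading their predictions on the same training rows risks leakage. scikit-learn's built-in StackingRegressor sidesteps that by training the final estimator on cross-validated predictions; a minimal sketch of the same idea (not in the original, reusing the estimators dict):

    from sklearn.ensemble import StackingRegressor
    
    # base models from the estimators dict; Ridge combines their out-of-fold predictions
    stack = StackingRegressor(estimators=list(estimators.items()),
                              final_estimator=Ridge())
    stack.fit(X_train, y_train)
    y_stack = stack.predict(X_test)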
    # let every (now augmented) model predict, then average the predictions
    y_pred = []
    for key, model in estimators.items():
        model.fit(X_train, y_train)
        y_ = model.predict(X_test)
        y_pred.append(y_)
    
    y_ = np.mean(y_pred, axis=0)
    pd.Series(y_).to_csv('./ensemble3.txt', index=False)
    sns.distplot(y_)
    y_.mean()
    y_.std()
    # add a little Gaussian noise to the predictions
    y_ += np.random.randn(len(y_))*0.1
    pd.Series(y_).to_csv('./ensemble4.txt', index=False)

Normalizing the Data

    data_all.head()
    data = data_all.iloc[:,:-2]
    
    minmaxscaler = MinMaxScaler()
    data3 = minmaxscaler.fit_transform(data)
    data3
    # the normalized data
    
    data_all_norm = pd.DataFrame(data3,columns = data_all.columns[:-2])
    data_all_norm
    # merge the origin and target columns back on
    data_all_norm = pd.merge(data_all_norm,data_all.iloc[:,-2:],left_index = True,right_index = True)
    data_all_norm.describe()
    def scale_minmax(data):
        return (data - data.min()) / (data.max() - data.min())
    # plots for the continuous variables
    from scipy import stats
    fcols = 6
    frows = len(data_all_norm.columns[:10])
    plt.figure(figsize = (4*fcols,4*frows))
    i = 0
    for col in data_all_norm.columns[:10]:
    
        dat = data_all_norm[[col,'target']].dropna()
        # plot 1: distribution of the raw feature
        i += 1
        plt.subplot(frows,fcols,i)
        sns.distplot(dat[col],fit = stats.norm)  # overlay a fitted normal curve
        plt.title(col + ' Original')
        # plot 2: skewness, a statistic measuring departure from normality
        i += 1
        plt.subplot(frows,fcols,i)
        _ = stats.probplot(dat[col],plot = plt)  # probability plot against the normal
        plt.title('skew='+'{:.4f}'.format(stats.skew(dat[col])))
        plt.xlabel('')
        plt.ylabel('')
    
        # plot 3: scatter of feature vs target
        i += 1
        plt.subplot(frows,fcols,i)
        plt.scatter(dat[col],dat['target'],alpha=0.5)
        plt.title('corr='+'{:.2f}'.format(np.corrcoef(dat[col], dat['target'])[0][1]))
    
        # !!! the same three plots after a Box-Cox transform !!!
        # plot 4: distribution of the transformed feature
        i += 1
        plt.subplot(frows,fcols,i)
        trans_var, lambda_var = stats.boxcox(dat[col].dropna()+1)
        trans_var = scale_minmax(trans_var)
        sns.distplot(trans_var, fit=stats.norm)
        plt.title(col + ' Transformed')
        plt.xlabel('')
    
        # plot 5: skewness after the transform
        i += 1
        plt.subplot(frows,fcols,i)
        _ = stats.probplot(trans_var, plot=plt)
        plt.title('skew='+'{:.4f}'.format(stats.skew(trans_var)))
        plt.xlabel('')
        plt.ylabel('')
    
        # plot 6: scatter after the transform
        i += 1
        plt.subplot(frows,fcols,i)
        plt.plot(trans_var, dat['target'],'.',alpha=0.5)
        plt.title('corr='+'{:.2f}'.format(np.corrcoef(trans_var,dat['target'])[0][1]))
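The skew printed in the panel titles is the sample skewness: the average cubed deviation from the mean, scaled by the cubed standard deviation; it is zero for a symmetric distribution. A tiny sanity check (not in the original, using V0 as an example column):

    x = data_all_norm['V0']
    # sample skewness: E[(x - mean)^3] / std^3, with population std (ddof=0)
    manual = ((x - x.mean())**3).mean() / x.std(ddof=0)**3
    print(manual, stats.skew(x))  # the two values should agree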
    # Apply a Box-Cox transform to every feature column (via a for loop).
    # A common transformation in statistical modeling: it makes the data more normal, more standardized.
    for col in data_all_norm.columns[:-2]:
        # +1 because the minimum of data_all_norm is 0; stats.boxcox returns two values
        boxcox, maxlog = stats.boxcox(data_all_norm[col] + 1)
        data_all_norm[col] = scale_minmax(boxcox)  # rescale back to [0, 1]
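For reference, the maxlog returned above is the λ that stats.boxcox picks by maximum likelihood; the transform itself is simple. A sketch (not in the original) of what it computes:

    def boxcox_transform(x, lam):
        """Box-Cox: (x**lam - 1) / lam for lam != 0, log(x) for lam == 0; requires x > 0."""
        return np.log(x) if lam == 0 else (x**lam - 1) / lam
    
    # stats.boxcox(x) is equivalent to boxcox_transform(x, maxlog),
    # with maxlog chosen to make the result as close to normal as possible.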

Filtering Outliers

    ridge = RidgeCV(alphas=[0.0001,0.001,0.01,0.1,0.2,0.5,1,2,3,4,5,10,20,30,50])
    
    cond = data_all_norm['origin'] == 'train'
    
    X_train = data_all_norm[cond].iloc[:,:-2]
    # ground-truth targets
    y_train = data_all_norm[cond]['target']
    # the model can never fit the data and the targets 100% perfectly
    ridge.fit(X_train,y_train)
    # predictions always deviate somewhat from the truth; treat very large deviations as outliers
    y_ = ridge.predict(X_train)
    
    cond = abs(y_ - y_train) > y_train.std()
    print(cond.sum())
    # plot the flagged outliers in red
    plt.figure(figsize=(12,6))
    axes = plt.subplot(1,3,1)
    axes.scatter(y_train,y_)
    axes.scatter(y_train[cond],y_[cond],c = 'red',s = 20)
    
    axes = plt.subplot(1,3,2)
    axes.scatter(y_train,y_train - y_)
    axes.scatter(y_train[cond],(y_train - y_)[cond],c = 'red')
    
    axes = plt.subplot(1,3,3)
    # _ = axes.hist(y_train,bins = 50)
    (y_train - y_).plot.hist(bins = 50,ax = axes)
    (y_train - y_).loc[cond].plot.hist(bins = 50,ax = axes,color = 'r')
    # filter out the outlier rows
    index = cond[cond].index
    data_all_norm.drop(index,axis=0,inplace = True)
    cond = data_all_norm['origin'] == 'train'
    X_train = data_all_norm[cond].iloc[:,:-2]
    y_train = data_all_norm[cond]['target']
    
    cond = data_all_norm['origin'] == 'test'
    X_test = data_all_norm[cond].iloc[:,:-2]
    # drop the methods that did not work well: the linear models
    estimators = {}
    # estimators['linear'] = LinearRegression()
    # estimators['ridge'] = Ridge()
    # estimators['lasso'] = Lasso()
    estimators['forest'] = RandomForestRegressor(n_estimators = 300)
    estimators['gbdt'] = GradientBoostingRegressor(n_estimators = 300)
    estimators['ada'] = AdaBoostRegressor(n_estimators = 300)
    estimators['extreme'] = ExtraTreesRegressor(n_estimators = 300)
    estimators['svm_rbf'] = SVR(kernel='rbf')
    estimators['light'] = LGBMRegressor(n_estimators = 300)
    estimators['xgb'] = XGBRegressor(n_estimators = 300)
    result = []
    for key, model in estimators.items():
        model.fit(X_train, y_train)
        y_ = model.predict(X_test)
        result.append(y_)
    
    y_ = np.mean(result, axis=0)
    
    pd.Series(y_).to_csv('./norm.txt', index=False)
