Kaggle Course (2): Intro to Machine Learning

1. Your First Machine Learning Model

    # Code you have previously used to load data
    import pandas as pd

    # Load the data
    iowa_file_path = '../input/home-data-for-ml-course/train.csv'

    home_data = pd.read_csv(iowa_file_path)

    # Drop rows with missing values
    home_data = home_data.dropna(axis=0)

    # Preview the available columns
    home_data.columns

    # Specify the prediction target
    y = home_data.SalePrice

    # Create the list of features below
    feature_names = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
    # Select data corresponding to features in feature_names
    X = home_data[feature_names]

    # Review the data
    # print description or statistics from X
    print(X.describe())
    # print the top few lines
    print(X.head())

    from sklearn.tree import DecisionTreeRegressor
    # Specify the model.
    # For model reproducibility, set a numeric value for random_state when specifying the model
    iowa_model = DecisionTreeRegressor(random_state=1)

    # Fit the model
    iowa_model.fit(X, y)

    # Make predictions
    predictions = iowa_model.predict(X)
    print(predictions)
    print("Making predictions for the following 5 houses:")
    print(X.head())
    print("The predictions are")
    print(iowa_model.predict(X.head()))
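
The predictions above are made on the same rows the model was trained on, so they look deceptively good. As a quick check (my addition, not part of the course code), you can put the in-sample predictions next to the actual sale prices:

    # In-sample check: the tree was fit on exactly these rows, so the predicted
    # and actual prices match almost perfectly, which is why the next section
    # introduces a held-out validation set.
    comparison = pd.DataFrame({
        'actual': y.head(),
        'predicted': iowa_model.predict(X.head()),
    })
    print(comparison)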

2. Model Validation

First, we need to split the data into a training set and a validation set.

    from sklearn.model_selection import train_test_split
    
    # split data into training and validation data, for both features and target
    # The split is based on a random number generator. Supplying a numeric value to
    # the random_state argument guarantees we get the same split every time we
    # run this script.
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1)
    # Define model
    iowa_model = DecisionTreeRegressor(random_state=1)
    # Fit model
    iowa_model.fit(train_X, train_y)
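
As a quick sanity check (my addition), you can confirm how the rows were divided; with its default settings, train_test_split holds out 25% of the data for validation:

    # By default train_test_split keeps 75% of the rows for training
    # and 25% for validation.
    print(train_X.shape, val_X.shape)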

Then make predictions on the validation set:

    # Predict with all validation observations
    val_predictions = iowa_model.predict(val_X)
    # print the top few validation predictions
    print(val_predictions[:5])
    # print the top few actual prices from validation data
    print(val_y[:5])

Finally, print the MAE:

    from sklearn.metrics import mean_absolute_error
    val_mae = mean_absolute_error(val_y,val_predictions)
    
    # print the validation MAE
    print(val_mae)
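
For reference, the mean absolute error is simply the average absolute gap between the actual and predicted prices. A minimal sketch of the same computation done by hand, using the variables defined above:

    import numpy as np

    # MAE by hand: mean of |actual - predicted| over the validation rows.
    manual_mae = np.mean(np.abs(val_y - val_predictions))
    print(manual_mae)  # matches val_mae from mean_absolute_error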

3. Underfitting and Overfitting

    # Code you have previously used to load data
    import pandas as pd
    from sklearn.metrics import mean_absolute_error
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeRegressor
    
    
    # Path of the file to read
    iowa_file_path = '../input/home-data-for-ml-course/train.csv'
    
    home_data = pd.read_csv(iowa_file_path)
    # Create target object and call it y
    y = home_data.SalePrice
    # Create X
    features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
    X = home_data[features]
    
    # Split into validation and training data
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
    
    # Specify Model
    iowa_model = DecisionTreeRegressor(random_state=1)
    # Fit Model
    iowa_model.fit(train_X, train_y)
    
    # Make validation predictions and calculate mean absolute error
    val_predictions = iowa_model.predict(val_X)
    val_mae = mean_absolute_error(val_predictions, val_y)
    print("Validation MAE: {:,.0f}".format(val_mae))

Pick the value of max_leaf_nodes that gives the lowest validation MAE:

    def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
        model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
        model.fit(train_X, train_y)
        preds_val = model.predict(val_X)
        mae = mean_absolute_error(val_y, preds_val)
        return mae
    
    candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
    # Write loop to find the ideal tree size from candidate_max_leaf_nodes
    scores = {leaf_size: get_mae(leaf_size, train_X, val_X, train_y, val_y) for leaf_size in candidate_max_leaf_nodes}
    best_tree_size = min(scores, key=scores.get)
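
To see the underfitting/overfitting trade-off rather than just the winner, it helps to print the validation MAE for every candidate size (an addition to the exercise code):

    # Very small trees underfit (high MAE) and very large trees overfit
    # (MAE creeps back up); the best size sits in between.
    for leaf_size in candidate_max_leaf_nodes:
        print("max_leaf_nodes: {:>4}  Validation MAE: {:,.0f}".format(leaf_size, scores[leaf_size]))
    print("Best tree size:", best_tree_size)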

Build the final model:

    # Fit the model with best_tree_size. Fill in argument to make optimal size
    final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)
    
    # fit the final model
    final_model.fit(X, y)
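
Note that the final model is fit on all of X and y rather than only train_X and train_y: the best value of max_leaf_nodes has already been chosen with the validation set, so every row can now be used for fitting. A small illustrative usage example:

    # Predict prices for a few houses with the fully-fitted final model
    # (illustrative only).
    print(final_model.predict(X.head()))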

4. Random Forests

    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_absolute_error
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeRegressor
    
    # Path of the file to read
    iowa_file_path = '../input/home-data-for-ml-course/train.csv'
    
    home_data = pd.read_csv(iowa_file_path)
    # Create target object and call it y
    y = home_data.SalePrice
    # Create X
    features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
    X = home_data[features]
    
    # Split into validation and training data
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
    
    # Define the model. Set random_state to 1
    rf_model = RandomForestRegressor(random_state = 1)
    
    # fit your model
    rf_model.fit(train_X,train_y)
    
    # Calculate the mean absolute error of your Random Forest model on the validation data
    rf_val_predictions = rf_model.predict(val_X)
    rf_val_mae = mean_absolute_error(rf_val_predictions, val_y)
    
    print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))
