李宏毅机器学习HW1总结
发布时间
阅读量:
阅读量
李宏毅机器学习HW1总结
问题
回归(Regression)是根据一组输入变量预测连续输出值的统计学习方法。该课程作业(HW1)要求运用回归模型,对 COVID-19 调查数据(covid.train.csv / covid.test.csv)完成建模与预测任务。
固定参数
config = {  # Hyper-parameters used throughout training.
    'seed': 5201314,                  # random seed for reproducibility
    'valid_ratio': 0.2,               # fraction of training data held out for validation
    'select_all': True,               # use every feature column when True
    'feat_idx': [0, 1, 2, 3, 4],      # hand-picked columns used when select_all is False
    'batch_size': 256,                # mini-batch size for every DataLoader
    'n_epochs': 3000,                 # maximum number of training epochs
    'learning_rate': 1e-5,            # SGD learning rate
    'save_path': './models/model.ckpt',  # where the best checkpoint is written
    'early_step': 400,                # patience (validation rounds) before early stopping
}
def same_seed(seed):
    """Make runs reproducible by seeding numpy and torch (CPU and all GPUs)."""
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)  # seed every visible GPU
    # Pin cuDNN to deterministic convolution kernels; benchmark mode would
    # auto-tune at runtime and could pick non-deterministic algorithms.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
数据处理
导入库
from torch.utils.data import DataLoader,Dataset,random_split
#DataLoader 用于将数据分为多个batch用于加速训练
#Dataset 处理数据方便后续的 DataLoader
#random_split 用于将一个数据随机分为多份
读入数据
import pandas as pd
# Load the raw csv files as numpy arrays; `[:, 1:]` drops the first column —
# presumably a row id — TODO confirm against the csv header.
train_data,test_data = pd.read_csv('./covid.train.csv').values[:,1:],pd.read_csv('./covid.test.csv').values[:,1:]
对数据进行划分,得到训练集、验证集、与测试集
def train_valid_split(data_set, valid_ratio, seed):
    """Randomly split `data_set` into train/valid numpy arrays.

    `valid_ratio` is the fraction reserved for validation; `seed` makes the
    split reproducible via a seeded torch Generator.
    """
    n_valid = int(valid_ratio * len(data_set))
    n_train = len(data_set) - n_valid
    gen = torch.Generator().manual_seed(seed)
    train_part, valid_part = random_split(data_set, [n_train, n_valid], generator=gen)
    # random_split returns Subset views; materialise them as numpy arrays.
    return np.array(train_part), np.array(valid_part)
选择特征并分别生成训练集、验证集的特征和标签,与测试集的特征
def select_feat(train_data, valid_data, test_data, select_all=True):
    """Separate the label (last column) from the features and pick columns.

    Returns (x_train, x_valid, x_test, y_train, y_valid); the test set
    carries no labels.
    """
    y_train, y_valid = train_data[:, -1], valid_data[:, -1]
    if select_all:
        # Every column except the trailing label column.
        feat_idx = list(range(train_data.shape[1] - 1))
    else:
        # Hand-picked columns taken from the module-level config.
        feat_idx = config['feat_idx']
    x_train = train_data[:, feat_idx]
    x_valid = valid_data[:, feat_idx]
    x_test = test_data[:, feat_idx]
    return x_train, x_valid, x_test, y_train, y_valid
# Build feature matrices and label vectors per the config's feature policy.
# NOTE(review): `valid_data` must come from train_valid_split(train_data, ...)
# — that call is not shown in this snippet; confirm it exists in the full script.
x_train,x_valid,x_test,y_train,y_valid = select_feat(train_data,valid_data,test_data,config['select_all'])
创建Dataset子类产出标准化数据集便于后续批量处理及模型训练
class COVID19Dataset(Dataset):
    """Dataset wrapping a feature matrix `x` and an optional target vector `y`.

    When `y` is None the dataset serves inference and __getitem__ yields
    features only; otherwise it yields (features, target) pairs.
    """
    def __init__(self, x, y=None):
        super(COVID19Dataset, self).__init__()
        self.x = torch.FloatTensor(x)
        # Keep y as None for test-time data; otherwise store it as a float tensor.
        self.y = None if y is None else torch.FloatTensor(y)

    def __len__(self):
        """Number of samples."""
        return len(self.x)

    def __getitem__(self, item):
        """Return x[item] (inference) or (x[item], y[item]) (training).

        BUG FIX: the original tested `self.y == None`; on a tensor that is an
        element-wise comparison, not an identity check — use `is None`.
        """
        if self.y is None:
            return self.x[item]
        return self.x[item], self.y[item]
# Wrap the selected features/labels in Dataset objects; the test set has no labels.
train_dataset = COVID19Dataset(x_train,y_train)
valid_dataset = COVID19Dataset(x_valid,y_valid)
test_dataset = COVID19Dataset(x_test)
对数据集进行批量化
# Batch the datasets. Shuffling is disabled for the test loader so predictions
# keep the original row order expected by the submission file.
# NOTE(review): shuffle=True on the validation loader does not affect the mean
# loss but is unnecessary — confirm whether it was intended.
train_loader = DataLoader(train_dataset,config['batch_size'],shuffle=True)
valid_loader = DataLoader(valid_dataset,config['batch_size'],shuffle=True)
test_loader = DataLoader(test_dataset,config['batch_size'],shuffle=False)
数据处理完成
搭建模型
导入库
import torch
import numpy as np
from torch import nn
搭建神经网络
class Mymodel(torch.nn.Module):
    """Small MLP regressor: input_size -> 16 -> 8 -> 1 with ReLU activations."""

    def __init__(self, input_size):
        super(Mymodel, self).__init__()
        stack = [
            nn.Linear(input_size, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 1),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run the MLP and drop the trailing size-1 dimension."""
        out = self.layers(x)
        # (batch, 1) -> (batch,) so the output matches the 1-D label tensor.
        return out.squeeze(1)
# Instantiate the MLP with one input unit per selected feature column.
model = Mymodel(input_size=x_train.shape[1])
训练模型
导入库
from tqdm import tqdm #用于显示训练进度与训练参数
from torch.utils.tensorboard import SummaryWriter#方便画出最后的Loss图或acc图
import os
开始训练
找到合适的Loss函数 和 优化器
def trainer(model, train_loader, valid_loader):
    """Train `model` with SGD + MSE loss, log both losses to tensorboard,
    checkpoint the best model on validation loss, and early-stop after
    config['early_step'] validation rounds without improvement.

    NOTE(review): the original snippet interleaved prose headings inside this
    function body (invalid Python) and used `math.inf` although `math` is
    never imported in this file; `float('inf')` avoids that NameError.
    """
    criterion = nn.MSELoss(reduction='mean')
    optimizer = torch.optim.SGD(params=model.parameters(),
                                lr=config['learning_rate'], momentum=0.9)
    # Tensorboard logger for the Loss/train and Loss/valid curves.
    writer = SummaryWriter()
    # Make sure the checkpoint directory exists before the first save.
    if not os.path.isdir('./models'):
        os.mkdir('./models')
    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], float('inf'), 0, 0
    for epoch in range(n_epochs):
        # ---- training phase ----
        model.train()
        loss_record = []
        train_pbar = tqdm(train_loader, position=0, leave=True)
        for x, y in train_pbar:
            optimizer.zero_grad()
            pred = model(x)
            loss = criterion(pred, y)
            loss.backward()
            optimizer.step()
            step += 1
            # detach() so the stored scalar does not keep the autograd graph alive.
            loss_record.append(loss.detach().item())
            train_pbar.set_description(f'Epoch[{epoch}/{n_epochs}]')  # typo 'Eopch' fixed
            train_pbar.set_postfix({'loss': loss.detach().item()})
        mean_train_loss = sum(loss_record) / len(loss_record)
        writer.add_scalar('Loss/train', mean_train_loss, step)
        # ---- validation phase ----
        model.eval()  # disable train-only behaviour (e.g. dropout)
        loss_record = []
        for x, y in valid_loader:
            with torch.no_grad():  # no autograd graph during evaluation
                pred = model(x)
                loss = criterion(pred, y)
            loss_record.append(loss.item())
        mean_valid_loss = sum(loss_record) / len(loss_record)
        print(f'Epoch{epoch+1}/{n_epochs}: Trainloss:{mean_train_loss:.4f} Validloss:{mean_valid_loss:.4f}')
        writer.add_scalar('Loss/valid', mean_valid_loss, step)
        # Checkpoint on improvement; otherwise count towards early stopping.
        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config['save_path'])
            print('Saving model with loss{:.3f}...'.format(best_loss))
            early_stop_count = 0
        else:
            early_stop_count += 1
        if early_stop_count >= config['early_step']:
            print("The model is not improving,so we stop the training session")
            return
在终端环境中使用 tensorboard 命令,并将 logdir 设置为指定路径以查看生成的 tensorboard 图。
预测
model.load_state_dict(torch.load(config['save_path']))#load the best checkpoint saved by trainer()
def predict(test_loader, model):
    """Run `model` over every batch of `test_loader` and return all
    predictions as a single numpy array."""
    model.eval()  # switch off train-only behaviour (e.g. dropout)
    batch_outputs = []
    for x in tqdm(test_loader):
        with torch.no_grad():  # pure inference: no autograd graph
            batch_outputs.append(model(x).detach())
    # Concatenate the per-batch predictions into one array.
    return torch.cat(batch_outputs, dim=0).numpy()
preds = predict(test_loader,model)
def save_pred(preds, file):
    """Write predictions to a Kaggle-style CSV with header (id, tested_positive).

    BUG FIXES vs. the original: `csv` is never imported anywhere in this file
    (NameError at call time), and `open(..., "w")` without newline='' produces
    blank rows on Windows; the original comment also said it "saves the model"
    when it actually saves predictions.
    """
    import csv  # local import: `csv` is not imported elsewhere in this file
    # newline='' is required by the csv module to avoid doubled line endings.
    with open(file, "w", newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        for i, p in enumerate(preds):
            writer.writerow([i, p.item()])
# Use a forward slash so the path works on every OS — the original Windows-style
# '.\pred.csv' would create a file literally named '.\pred.csv' on Linux/macOS.
save_pred(preds,'./pred.csv')
全部评论 (0)
还没有任何评论哟~
