(17-7-04) A Reinforcement-Learning-Based Autonomous Driving System: Training a DDPG Agent for Autonomous Driving
17.8.5 Training a DDPG Agent for Autonomous Driving
Create the code file train_ddpg_agent.py to train a Deep Deterministic Policy Gradient (DDPG) agent for autonomous driving in the CARLA simulator. Through continuous interaction with the environment, the agent collects experience and repeatedly updates its policy (actor) and value (critic) networks, with the goal of completing the driving task more reliably over time.
import os
import pickle

import numpy as np
from argparse import ArgumentParser

# Project-local modules. The exact import paths depend on this chapter's
# project layout and are assumed here.
import config
from carla_env import CarlaEnv, Route
from ddpg_agent import DDPGAgent
from autoencoder import Autoencoder, AutoencoderSEM, VAE


def train_agent(env, weather_list, agent, nb_training_episodes, save_folder, route_id, nb_updates=250, episode_skip=10):
    # Evaluate and checkpoint the untrained agent once before training starts.
    if (agent.episode_nb + 1) == 1:
        avg_reward, std_reward, success_rate = test_agent(env, weather_list, agent, route_id)
        agent.tr_steps_vec.append(agent.tr_step + 1)
        agent.avg_reward_vec.append(avg_reward)
        agent.std_reward_vec.append(std_reward)
        agent.success_rate_vec.append(success_rate)
        agent.save_actor(os.path.join(save_folder, f"actor_ep_{agent.episode_nb+1}.pt"))
    max_steps = 200
    noise = True
    for agent.episode_nb in range(agent.episode_nb, nb_training_episodes):
        # Allow longer episodes once the agent has had some initial practice.
        if agent.episode_nb > episode_skip * 2:
            max_steps = 1000
        done = False
        episode_reward = 0
        episode_steps = 0
        agent.reset_noise()
        weather = weather_list[agent.episode_nb % len(weather_list)]
        env.set_weather(weather)
        obs = env.reset(route_id)
        transitions = []
        # Roll out one episode with exploration noise.
        while not done and episode_steps < max_steps:
            act = agent.select_action(obs, noise=noise)
            obs_t1, reward, done, info = env.step(act)
            transitions.append((obs, act, reward, obs_t1, done))
            obs = obs_t1
            episode_reward += reward
            episode_steps += 1
            agent.tr_step += 1
        print('Global training step %5d | Training episode %5d | Steps: %4d | Reward: %4d | Success: %5r' %
              (agent.tr_step + 1, agent.episode_nb + 1, episode_steps, episode_reward, reward >= 450))
        # Episodes that end in a collision store their last 50 transitions in
        # the dedicated collision buffer.
        if info['collision']:
            print("Collision")
            for transition in transitions[-50:]:
                agent.store_transition_collision(*transition)
            for transition in transitions[:-50]:
                agent.store_transition(*transition)
        else:
            for transition in transitions:
                agent.store_transition(*transition)
        # Update the actor and critic networks after the warm-up episodes.
        if agent.episode_nb + 1 > episode_skip:
            for _ in range(nb_updates):
                agent.update()
        # Periodically evaluate the agent and checkpoint the actor network.
        if (agent.episode_nb + 1) % 20 == 0 and (agent.episode_nb + 1) > episode_skip:
            avg_reward, std_reward, success_rate = test_agent(env, weather_list, agent, route_id)
            agent.tr_steps_vec.append(agent.tr_step + 1)
            agent.avg_reward_vec.append(avg_reward)
            agent.std_reward_vec.append(std_reward)
            agent.success_rate_vec.append(success_rate)
            agent.save_actor(os.path.join(save_folder, f"actor_ep_{agent.episode_nb+1}.pt"))
        agent.save(os.path.join(save_folder, "agent.pkl"))


def test_agent(env, weather_list, agent, route_id):
    # Evaluate the current policy without exploration noise over several
    # episodes that cycle through the training weather presets.
    ep_rewards = []
    success_rate = 0
    avg_steps = 0
    nb_episodes = 3 * len(weather_list)
    for episode in range(nb_episodes):
        weather = weather_list[episode % len(weather_list)]
        env.set_weather(weather)
        obs = env.reset(route_id)
        done = False
        episode_reward = 0
        nb_steps = 0
        while not done:
            act = agent.select_action(obs, noise=False)
            print(act)
            obs_t1, reward, done, _ = env.step(act)
            obs = obs_t1
            episode_reward += reward
            nb_steps += 1
            if done:
                if reward > 450:
                    success_rate += 1
                avg_steps += nb_steps
        ep_rewards.append(episode_reward)
        print('Evaluation episode %3d | Steps: %4d | Reward: %4d | Success: %r' %
              (episode + 1, nb_steps, episode_reward, reward > 450))
    ep_rewards = np.array(ep_rewards)
    avg_reward = np.average(ep_rewards)
    std_reward = np.std(ep_rewards)
    success_rate /= nb_episodes
    avg_steps /= nb_episodes
    print('Average Reward: %.2f, Reward Deviation: %.2f | Average Steps: %.2f, Success Rate: %.2f' %
          (avg_reward, std_reward, avg_steps, success_rate))
    return avg_reward, std_reward, success_rate


if __name__ == '__main__':
    argparser = ArgumentParser()
    argparser.add_argument('--world-port', type=int, default=config.WORLD_PORT)
    argparser.add_argument('--host', type=str, default=config.WORLD_HOST)
    argparser.add_argument('--cam_height', type=int, default=config.CAM_HEIGHT, help="Camera height")
    argparser.add_argument('--cam_width', type=int, default=config.CAM_WIDTH, help="Camera width")
    argparser.add_argument('--fov', type=int, default=config.CAM_FOV, help="Camera field of view")
    argparser.add_argument('--tick', type=float, default=config.TICK, help="Sensor tick length")
    argparser.add_argument('--model', type=str, default=config.AE_MODEL, help='model',
                           choices=['Autoencoder', 'AutoencoderSEM', 'VAE'])
    argparser.add_argument('--autoencoder_model', type=str, help="Autoencoder model path", default=config.AE_PRETRAINED)
    argparser.add_argument('--device', type=str, default='cpu', help="Device to use for training", choices=['cuda', 'cpu'])
    argparser.add_argument('--nb_episodes', type=int, default=config.TRAIN_EPISODES, help="Number of episodes of training")
    argparser.add_argument('--save_folder', type=str, default=config.AGENT_FOLDER, help="Path to save the agent and data")
    argparser.add_argument('--route_id', type=int, default=config.ROUTE_ID, help="Route id to use for training")
    argparser.add_argument('--nb_updates', type=int, default=config.DDPG_NB_UPDATES, help="Number of updates per episode")
    args = argparser.parse_args()

    if not os.path.exists(args.autoencoder_model):
        raise Exception('Autoencoder model not found')
    os.makedirs(args.save_folder, exist_ok=True)
    save_agent_path = os.path.join(args.save_folder, 'agent.pkl')

    # Load the pretrained autoencoder that compresses camera images into a
    # low-dimensional feature vector for the agent.
    if args.model == 'AutoencoderSEM':
        autoencoder = AutoencoderSEM.load_from_checkpoint(args.autoencoder_model)
    elif args.model == 'VAE':
        autoencoder = VAE.load_from_checkpoint(args.autoencoder_model)
    elif args.model == 'Autoencoder':
        autoencoder = Autoencoder.load_from_checkpoint(args.autoencoder_model)
    else:
        raise ValueError(f"Unknown model {args.model}")
    autoencoder.freeze()
    autoencoder.eval()

    env = CarlaEnv(autoencoder, args.world_port, args.host, config.TRAIN_MAP, 'ClearNoon',
                   args.cam_height, args.cam_width, args.fov, args.tick, 500, exo_vehicles=config.USE_EXO_VEHICLES)
    num_routes = len(Route.get_possibilities(config.TRAIN_MAP))
    weather_list = config.TRAIN_WEATHER

    # Resume a previously saved agent if one exists; otherwise create a new one.
    if os.path.exists(save_agent_path):
        with open(save_agent_path, 'rb') as f:
            agent = pickle.load(f)
    else:
        agent = DDPGAgent(obs_dim=260, nb_actions=2, device='cpu', lr_actor=1e-4, lr_critic=1e-3,
                          batch_size=config.DDPG_BATCH_SIZE, gamma=0.95, tau=0.005, clip_norm=5e-3, buffer_size=40000,
                          action_clip=(-1, 1), collision_percentage=0.2, noise_sigma=config.DDPG_NOISE_SIGMA,
                          noise_decay=1/300, sch_gamma=0.9, sch_steps=config.DDPG_SCH_STEPS,
                          use_expert_data=config.DDPG_USE_EXPERT_DATA, expert_percentage=0.25,
                          lambda_bc=0.5, use_env_model=config.DDPG_USE_ENV_MODEL, lambda_env=0.2,
                          env_steps=config.DDPG_ENV_STEPS)
        # Optionally warm-start a new agent from expert demonstrations.
        if config.DDPG_USE_EXPERT_DATA:
            agent.load_expert_data(config.DDPG_EXPERT_DATA_FILE)
            print("Pretraining...")
            for _ in range(config.DDPG_PRETRAIN_STEPS):
                agent.pretrain_update()

    try:
        train_agent(env, weather_list, agent, args.nb_episodes, args.save_folder, args.route_id, args.nb_updates)
    finally:
        env.reset_settings()
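The DDPGAgent class itself is not listed in this section; the training loop only relies on its interface (select_action, store_transition, store_transition_collision, update, and so on). As a rough illustration of how the collision-aware replay memory behind store_transition_collision and the collision_percentage=0.2 setting might work, here is a minimal sketch. The class name, structure, and all details are assumptions for illustration, not the book's actual implementation.

import random
from collections import deque

class CollisionAwareReplayBuffer:
    """Minimal sketch (assumed design) of a replay memory with a separate
    collision buffer, so each training batch can contain a fixed fraction of
    transitions that preceded a crash."""

    def __init__(self, buffer_size=40000, collision_percentage=0.2):
        self.normal = deque(maxlen=buffer_size)
        self.collision = deque(maxlen=buffer_size)
        self.collision_percentage = collision_percentage

    def store_transition(self, obs, act, reward, obs_t1, done):
        self.normal.append((obs, act, reward, obs_t1, done))

    def store_transition_collision(self, obs, act, reward, obs_t1, done):
        self.collision.append((obs, act, reward, obs_t1, done))

    def sample(self, batch_size):
        # Mixed batch: a fraction from the collision buffer (when available)
        # and the remainder from ordinary experience.
        nb_col = min(int(batch_size * self.collision_percentage), len(self.collision))
        nb_norm = min(batch_size - nb_col, len(self.normal))
        batch = random.sample(self.collision, nb_col) + random.sample(self.normal, nb_norm)
        random.shuffle(batch)
        return batch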
The implementation flow of the above code is as follows:
(1) Parse the command-line arguments that configure the CARLA simulation, including the host and port, camera settings, number of training episodes, and other key options.
(2) Load the pretrained autoencoder model used to extract image features.
(3) Create the CARLA simulation environment and set the route, weather, and related parameters.
(4) Initialize the DDPG agent, including its training hyperparameters and experience replay buffer.
(5) Train the agent by repeatedly performing the following steps:
- Let the agent select actions in the environment and collect the resulting observations, actions, and rewards.
- Store the collected transitions in the agent's experience replay buffer.
- Periodically run DDPG updates to improve the actor (policy) and critic (value) network parameters (see the sketch after this list).
- Periodically evaluate the agent and compute the average reward and success rate.
- Log training progress and save snapshots of the model parameters for later use.
(6) After training finishes, save the final DDPG agent model and the training statistics.
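For reference, the sketch below outlines what a single DDPG update step typically does: the critic is trained toward a one-step TD target computed with the target networks, the actor is trained to maximize the critic's value of its own actions, and both target networks are moved toward the online networks with a soft update controlled by tau. This is a generic, simplified sketch of the standard algorithm under assumed attribute names (actor, critic, actor_target, critic_target and their optimizers); the DDPGAgent.update() used in this chapter may additionally include behavior-cloning and environment-model terms (lambda_bc, lambda_env), which are omitted here.

import torch
import torch.nn.functional as F

def ddpg_update(agent, batch, gamma=0.95, tau=0.005):
    """One simplified DDPG update step (illustrative sketch only)."""
    obs, act, reward, obs_t1, done = batch  # batched float tensors

    # Critic update: regress Q(s, a) toward the one-step TD target computed
    # with the target actor and target critic.
    with torch.no_grad():
        next_act = agent.actor_target(obs_t1)
        target_q = reward + gamma * (1.0 - done) * agent.critic_target(obs_t1, next_act)
    critic_loss = F.mse_loss(agent.critic(obs, act), target_q)
    agent.critic_optimizer.zero_grad()
    critic_loss.backward()
    agent.critic_optimizer.step()

    # Actor update: deterministic policy gradient, i.e. maximize the critic's
    # value of the actor's own actions.
    actor_loss = -agent.critic(obs, agent.actor(obs)).mean()
    agent.actor_optimizer.zero_grad()
    actor_loss.backward()
    agent.actor_optimizer.step()

    # Soft (Polyak) update of both target networks.
    for target, online in ((agent.actor_target, agent.actor),
                           (agent.critic_target, agent.critic)):
        for tp, p in zip(target.parameters(), online.parameters()):
            tp.data.copy_((1.0 - tau) * tp.data + tau * p.data)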