
The Actor-Critic Algorithm


Actor-Critic (literally the "actor and critic" algorithm, an easy name to understand):

It fuses two networks: an Actor network and a Critic network. The Actor generates actions according to a probability distribution, while the Critic scores the Actor's behaviour by estimating how good it is; the Actor then uses the Critic's score to keep adjusting the probabilities with which it chooses actions. A minimal sketch of one such update step is shown below.
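Before the full TensorFlow programs, here is a minimal, framework-free sketch of one Actor-Critic update, assuming a toy linear critic and a linear-softmax actor over two actions; all names, sizes, and numbers here are illustrative and are not taken from the programs below.

import numpy as np

GAMMA = 0.9          # discount factor
LR_ACTOR = 0.001     # actor (policy) learning rate
LR_CRITIC = 0.01     # critic (value-function) learning rate

w = np.zeros(3)              # critic weights: V(s) ~= w . s
theta = np.zeros((2, 3))     # actor weights: action logits = theta @ s

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def update(s, a, r, s_next):
    """One Actor-Critic update for a single transition (s, a, r, s_next)."""
    # Critic: TD error, delta = r + gamma * V(s') - V(s)
    delta = r + GAMMA * w.dot(s_next) - w.dot(s)
    # Critic update: move V(s) toward the TD target (gradient of the squared TD error)
    w[:] += LR_CRITIC * delta * s
    # Actor update: policy gradient grad log pi(a|s), weighted by the TD error;
    # for a linear-softmax policy, grad log pi(a|s) = outer(one_hot(a) - pi, s)
    pi = softmax(theta @ s)
    theta[:] += LR_ACTOR * delta * np.outer(np.eye(2)[a] - pi, s)

# Example transition: 3-dimensional state, action index 1, reward 1.0
update(np.array([0.1, -0.2, 0.3]), a=1, r=1.0, s_next=np.array([0.0, 0.1, 0.2]))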

In the Pendulum environment the action is a continuous value:

Actor network:

Critic network:

import tensorflow as tf
import numpy as np
import gym

np.random.seed(2)
tf.set_random_seed(2)


class Actor(object):
    def __init__(self, sess, n_features, action_bound, lr=0.0001):
        self.sess = sess
        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.a = tf.placeholder(tf.float32, None, name="act")
        self.td_error = tf.placeholder(tf.float32, None, name="td_error")
        '''
        action: from the current state s, the actor network predicts mu of a
        normal distribution and sigma of that normal distribution, then draws
        one sample from Normal(mu, sigma) to build the action.
        The distribution obtained from mu and sigma is a stochastic normal
        distribution.
        '''
        l1 = tf.layers.dense(
            inputs=self.s,
            units=30,
            activation=tf.nn.relu,
            kernel_initializer=tf.random_normal_initializer(0., .1),
            bias_initializer=tf.constant_initializer(0.1),
            name='l1'
        )
        mu = tf.layers.dense(
            inputs=l1,
            units=1,
            activation=tf.nn.tanh,
            kernel_initializer=tf.random_normal_initializer(0., .1),
            bias_initializer=tf.constant_initializer(0.1),
            name='mu'
        )
        sigma = tf.layers.dense(
            inputs=l1,
            units=1,
            activation=tf.nn.softplus,
            kernel_initializer=tf.random_normal_initializer(0., .1),
            bias_initializer=tf.constant_initializer(1.),
            name='sigma'
        )
        global_step = tf.Variable(0, trainable=False)
        self.mu, self.sigma = tf.squeeze(mu * 2), tf.squeeze(sigma + 0.1)
        self.normal_dist = tf.distributions.Normal(self.mu, self.sigma)

        self.action = tf.clip_by_value(self.normal_dist.sample(1),
                                       action_bound[0], action_bound[1])
        '''
        Fit the normal distribution to the chosen action a: maximizing the
        log-probability shifts mu and sigma towards a distribution whose
        centre lies closer to a.

        The action is not necessarily good, so the log-probability is weighted
        by the critic's TD error (the error in v).
        The policy entropy is added to enlarge the exploration space and avoid
        settling into a local optimum too early.
            X ~ Normal(loc=0, scale=1)
            `loc = mu` is the mean, `scale = sigma` is the std.
            def _entropy(self):
                scale = self.scale * array_ops.ones_like(self.loc)
                return 0.5 * math.log(2. * math.pi * math.e) + math_ops.log(scale)
        '''

        with tf.name_scope('exp_v'):
            log_prob = self.normal_dist.log_prob(self.a)
            self.exp_v = log_prob * self.td_error
            self.exp_v += 0.01 * self.normal_dist.entropy()

        with tf.name_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v, global_step)

    def learn(self, s, a, td):
        s = s[np.newaxis, :]
        feed_dict = {self.s: s, self.a: a, self.td_error: td}
        _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict)
        return exp_v

    def choose_action(self, s):
        s = s[np.newaxis, :]
        return self.sess.run(self.action, {self.s: s})


class Critic(object):
    def __init__(self, sess, n_features, lr=0.01):
        self.sess = sess
        with tf.name_scope('inputs'):
            self.s = tf.placeholder(tf.float32, [1, n_features], "state")
            self.v_ = tf.placeholder(tf.float32, [1, 1], name="v_next")
            self.r = tf.placeholder(tf.float32, name="r")

        with tf.variable_scope('Critic'):
            l1 = tf.layers.dense(
                inputs=self.s,
                units=30,
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., .1),
                bias_initializer=tf.constant_initializer(0.1),
                name='l1'
            )

            self.v = tf.layers.dense(
                inputs=l1,
                units=1,
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0., .1),
                bias_initializer=tf.constant_initializer(0.1),
                name='v'
            )

        with tf.variable_scope('squared_TD_error'):
            # v : value of the current state s, predicted by the critic network
            # v_: value of the next state s_, predicted by the critic network
            self.td_error = tf.reduce_mean(self.r + GAMMA * self.v_ - self.v)
            self.loss = tf.square(self.td_error)

        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

    def learn(self, s, r, s_):
        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]
        v_ = self.sess.run(self.v, {self.s: s_})
        td_error, _ = self.sess.run([self.td_error, self.train_op],
                                    {self.s: s, self.v_: v_, self.r: r})
        return td_error


OUTPUT_GRAPH = True
MAX_EPISODE = 1000
MAX_EP_STEPS = 200
DISPLAY_REWARD_THRESHOLD = -100
RENDER = False
GAMMA = 0.9
LR_A = 0.001
LR_C = 0.01

env = gym.make('Pendulum-v0')
env.seed(1)
env = env.unwrapped

N_S = env.observation_space.shape[0]
A_BOUND = env.action_space.high
sess = tf.Session()

actor = Actor(sess, n_features=N_S, lr=LR_A, action_bound=[-A_BOUND, A_BOUND])
critic = Critic(sess, n_features=N_S, lr=LR_C)

sess.run(tf.global_variables_initializer())

if OUTPUT_GRAPH:
    tf.summary.FileWriter("logs/", sess.graph)

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    ep_rs = []
    while True:
        if RENDER:
            env.render()

        a = actor.choose_action(s)
        s_, r, done, info = env.step(a)

        r /= 10
        # gradient = grad[r + gamma * V(s_) - V(s)]
        td_error = critic.learn(s, r, s_)
        # true_gradient = grad[log pi(s, a) * td_error]
        actor.learn(s, a, td_error)

        s = s_
        t += 1
        ep_rs.append(r)
        if t > MAX_EP_STEPS:
            ep_rs_sum = sum(ep_rs)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.9 + ep_rs_sum * 0.1

            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True
            print("episode:", i_episode, "  reward:", int(running_reward))
            break
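For reference, the continuous-action head built above boils down to the following sampling rule. This is a standalone numpy sketch with made-up network outputs (mu_raw and sigma_raw are illustrative assumptions, not values from the program):

import numpy as np

A_BOUND = 2.0                    # Pendulum-v0 torque bound: actions live in [-2, 2]
mu_raw, sigma_raw = 0.3, 0.5     # pretend outputs of the tanh / softplus layers

mu = 2 * mu_raw                  # scale the tanh output to the action range
sigma = sigma_raw + 0.1          # keep the standard deviation strictly positive
a = np.clip(np.random.normal(mu, sigma), -A_BOUND, A_BOUND)
print(a)                         # one clipped torque value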

In the CartPole environment the action is a discrete value; see the short action-space check below, followed by the full program:
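Unlike Pendulum, whose action space is a continuous Box, CartPole exposes a Discrete(2) action space, so the actor outputs a softmax over the two actions instead of the parameters of a Gaussian. A quick sanity check (a small sketch, assuming the same classic gym environment versions used in this post):

import gym

# Compare the two action spaces used in this post
print(gym.make('Pendulum-v0').action_space)   # Box(1,)     -> continuous torque in [-2, 2]
print(gym.make('CartPole-v0').action_space)   # Discrete(2) -> push cart left or right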

import numpy as np
import tensorflow as tf
import gym

np.random.seed(2)
tf.set_random_seed(2)

OUTPUT_GRAPH = True
MAX_EPISODE = 3000
DISPLAY_REWARD_THRESHOLD = 200
MAX_EP_STEPS = 1000
RENDER = False
GAMMA = 0.9
LR_A = 0.001
LR_C = 0.01

env = gym.make("CartPole-v0")
env.seed(1)
env = env.unwrapped

N_F = env.observation_space.shape[0]
N_A = env.action_space.n


class Actor(object):
    def __init__(self, sess, n_features, n_actions, lr=0.001):
        self.sess = sess
        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.a = tf.placeholder(tf.int32, None, "act")
        self.td_error = tf.placeholder(tf.float32, None, "td_error")

        with tf.variable_scope('Actor'):
            l1 = tf.layers.dense(
                inputs=self.s,
                units=20,
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., .1),
                bias_initializer=tf.constant_initializer(0.1),
                name='l1'
            )

            self.acts_prob = tf.layers.dense(
                inputs=l1,
                units=n_actions,
                activation=tf.nn.softmax,
                kernel_initializer=tf.random_normal_initializer(0., .1),
                bias_initializer=tf.constant_initializer(0.1),
                name='acts_prob'
            )

        with tf.variable_scope('exp_v'):
            log_prob = tf.log(self.acts_prob[0, self.a])
            self.exp_v = tf.reduce_mean(log_prob * self.td_error)

        # minimize(-exp_v) = maximize(exp_v)
        with tf.variable_scope('train'):
            self.train_optimizer = tf.train.AdamOptimizer(lr).minimize(-self.exp_v)

    def learn(self, s, a, td):
        s = s[np.newaxis, :]
        feed_dict = {self.s: s, self.a: a, self.td_error: td}
        _, exp_v = self.sess.run([self.train_optimizer, self.exp_v], feed_dict)
        return exp_v

    def choose_action(self, s):
        s = s[np.newaxis, :]
        probs = self.sess.run(self.acts_prob, {self.s: s})
        return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel())


class Critic(object):
    def __init__(self, sess, n_features, lr=0.01):
        self.sess = sess
        self.s = tf.placeholder(tf.float32, [1, n_features], "state")
        self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next")
        self.r = tf.placeholder(tf.float32, None, 'r')

        with tf.variable_scope('Critic'):
            l1 = tf.layers.dense(
                inputs=self.s,
                units=20,
                activation=tf.nn.relu,
                kernel_initializer=tf.random_normal_initializer(0., .1),
                bias_initializer=tf.constant_initializer(0.1),
                name='l1'
            )

            self.v = tf.layers.dense(
                inputs=l1,
                units=1,
                activation=None,
                kernel_initializer=tf.random_normal_initializer(0., .1),
                bias_initializer=tf.constant_initializer(0.1),
                name='v'
            )
        with tf.variable_scope('squared_TD_error'):
            self.td_error = self.r + GAMMA * self.v_ - self.v
            self.loss = tf.square(self.td_error)
        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss)

    def learn(self, s, r, s_):
        s, s_ = s[np.newaxis, :], s_[np.newaxis, :]
        v_ = self.sess.run(self.v, {self.s: s_})
        td_error, _ = self.sess.run([self.td_error, self.train_op],
                                    {self.s: s, self.v_: v_, self.r: r})
        return td_error


sess = tf.Session()
actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A)
critic = Critic(sess, n_features=N_F, lr=LR_C)

sess.run(tf.global_variables_initializer())

if OUTPUT_GRAPH:
    tf.summary.FileWriter("logs/", sess.graph)

for i_episode in range(MAX_EPISODE):
    s = env.reset()
    t = 0
    track_r = []
    while True:
        if RENDER:
            env.render()

        a = actor.choose_action(s)

        s_, r, done, info = env.step(a)

        if done:
            r = -20

        track_r.append(r)
        # gradient = grad[r + gamma * V(s_) - V(s)]
        td_error = critic.learn(s, r, s_)
        # true_gradient = grad[log pi(s, a) * td_error]
        actor.learn(s, a, td_error)

        s = s_
        t += 1

        if done or t >= MAX_EP_STEPS:
            ep_rs_sum = sum(track_r)

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.95 + ep_rs_sum * 0.05

            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True
            print("episode: ", i_episode, "  reward: ", int(running_reward))
            break
