1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
class PPO(object):
    """PPO agent whose critic, actor and training ops live in one TF 1.x graph."""

    def __init__(self):
        """Build the computation graph, dump it for TensorBoard and init variables.

        Reads the module-level settings ``S_DIM``, ``A_DIM``, ``C_LR``, ``A_LR``
        and ``METHOD`` and exposes the following graph handles on ``self``:

        * ``tfs``, ``tfa``, ``tfadv``, ``tfdc_r``: placeholders for states,
          actions, advantages and discounted returns.
        * ``v``, ``advantage``, ``closs``, ``ctrain_op``: critic output, the
          difference ``tfdc_r - v``, its mean square, and the Adam step on it.
        * ``sample_op``: one action sample drawn from the current policy.
        * ``update_oldpi_op``: assign ops copying every ``pi`` parameter into
          the matching ``oldpi`` parameter.
        * ``aloss``, ``atrain_op``: actor loss and the Adam step on it.
        * ``tflam``, ``kl_mean``: created only when ``METHOD['name']`` is
          ``'kl_pen'``.

        Side effects: opens a ``tf.Session``, writes the graph to ``log/`` and
        runs the global variable initializer.

        Raises:
            NotImplementedError: if ``METHOD['name']`` is neither ``'kl_pen'``
                nor ``'clip'``.
        """
        self.sess = tf.Session()
        self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')

        # Critic: one 100-unit ReLU layer followed by a single linear output,
        # trained to minimise the squared gap to the fed discounted return.
        with tf.variable_scope('critic'):
            l1 = tf.layers.dense(self.tfs, 100, activation=tf.nn.relu)
            self.v = tf.layers.dense(l1, 1)
            self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
            self.advantage = self.tfdc_r - self.v
            self.closs = tf.reduce_mean(tf.square(self.advantage))
            self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs)

        # Actor: `pi` is the policy being optimised; `oldpi` is requested as
        # non-trainable and only changes through `update_oldpi_op`.
        pi, pi_params = self._build_anet('pi', trainable=True)
        oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)
        with tf.variable_scope('sample_action'):
            # sample(1) adds a leading sample axis; drop it again.
            self.sample_op = tf.squeeze(pi.sample(1), axis=0)
        with tf.variable_scope('update_oldpi'):
            self.update_oldpi_op = [
                oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)
            ]

        self.tfa = tf.placeholder(tf.float32, [None, A_DIM], 'action')
        self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
        with tf.variable_scope('loss'):
            with tf.variable_scope('surrogate'):
                # Probability ratio pi(a|s) / oldpi(a|s), evaluated in log
                # space: dividing raw densities yields inf/NaN as soon as
                # oldpi.prob underflows to 0 for an unlikely action.
                # NOTE(review): assumes _build_anet returns a
                # tf.distributions.Distribution (it is already used with
                # prob/sample/kl_divergence), so log_prob is available.
                ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
                surr = ratio * self.tfadv
            if METHOD['name'] == 'kl_pen':
                # KL-penalised objective; the penalty weight is fed at run time.
                self.tflam = tf.placeholder(tf.float32, None, 'lambda')
                kl = tf.distributions.kl_divergence(oldpi, pi)
                self.kl_mean = tf.reduce_mean(kl)
                self.aloss = -tf.reduce_mean(surr - self.tflam * kl)
            elif METHOD['name'] == 'clip':
                # Clipped objective: take the smaller of the raw surrogate and
                # the surrogate built from the ratio clipped to 1 +/- epsilon.
                clipped_ratio = tf.clip_by_value(
                    ratio, 1.0 - METHOD['epsilon'], 1.0 + METHOD['epsilon']
                )
                self.aloss = -tf.reduce_mean(
                    tf.minimum(surr, clipped_ratio * self.tfadv)
                )
            else:
                raise NotImplementedError

        with tf.variable_scope('atrain'):
            self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.aloss)

        # The writer is only needed to dump the graph once. Closing it flushes
        # the event file immediately and releases the file handle instead of
        # leaving an unreferenced writer open for the life of the process.
        graph_writer = tf.summary.FileWriter('log/', self.sess.graph)
        graph_writer.close()

        self.sess.run(tf.global_variables_initializer())
|