import tensorflow as tf
import functools

from baselines.common.tf_util import get_session, save_variables, load_variables
from baselines.common.tf_util import initialize

try:
    from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
    from mpi4py import MPI
    from baselines.common.mpi_util import sync_from_root
except ImportError:
    MPI = None

class Model(object):
    """
    We use this object to:

    __init__:
    - Creates the step_model (used for sampling)
    - Creates the train_model (used for training)

    train():
    - Runs the training step (forward pass and backpropagation of the gradients)

    save()/load():
    - Saves / loads the model
    """
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, comm=None, microbatch_size=None):
        self.sess = sess = get_session()

        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        with tf.compat.v1.variable_scope('ppo2_model', reuse=tf.compat.v1.AUTO_REUSE):
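            # CREATE OUR TWO MODELS
            # act_model is used to sample actions when stepping the environment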
            act_model = policy(nbatch_act, 1, sess)

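            # train_model is used to update the policy on collected batches of experience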
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

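        # CREATE THE PLACEHOLDERS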
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.compat.v1.placeholder(tf.float32, [None])
        self.R = R = tf.compat.v1.placeholder(tf.float32, [None])
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.compat.v1.placeholder(tf.float32, [None])
        self.OLDVPRED = OLDVPRED = tf.compat.v1.placeholder(tf.float32, [None])
        self.LR = LR = tf.compat.v1.placeholder(tf.float32, [])
        self.CLIPRANGE = CLIPRANGE = tf.compat.v1.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

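        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy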
        entropy = tf.reduce_mean(input_tensor=train_model.pd.entropy())

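        # CALCULATE THE LOSS
        # Total loss = policy gradient loss - entropy * ent_coef + vf_coef * value loss

        # Clip the value estimate to reduce variability during critic training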
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)

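        # Unclipped value loss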
        vf_losses1 = tf.square(vpred - R)
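        # Clipped value loss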
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(input_tensor=tf.maximum(vf_losses1, vf_losses2))

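        # Probability ratio between the current policy and the old policy: exp(log pi - log pi_old)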
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

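        # Defining Loss = -J is equivalent to maximizing the surrogate objective J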
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

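        # Final PG loss, plus diagnostics (approximate KL divergence and clipping fraction)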
        pg_loss = tf.reduce_mean(input_tensor=tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(input_tensor=tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(input_tensor=tf.cast(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE), dtype=tf.float32))

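        # Total loss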
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

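        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters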
        params = tf.compat.v1.trainable_variables('ppo2_model')
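        # 2. Build our trainer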
        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
        else:
            self.trainer = tf.compat.v1.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
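        # 3. Calculate the gradients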
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
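            # Clip the gradients by global norm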
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm)

    def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
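        # Calculate the advantages: advantage = returns - predicted values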
        advs = returns - values

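        # Normalize the advantages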
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        td_map = {
            self.train_model.X : obs,
            self.A : actions,
            self.ADV : advs,
            self.R : returns,
            self.LR : lr,
            self.CLIPRANGE : cliprange,
            self.OLDNEGLOGPAC : neglogpacs,
            self.OLDVPRED : values
        }
        if states is not None:
            td_map[self.train_model.S] = states
            td_map[self.train_model.M] = masks

        return self.sess.run(
            self.stats_list + [self._train_op],
            td_map
        )[:-1]