import numpy as np

from baselines.common.runners import AbstractEnvRunner


class Runner(AbstractEnvRunner):
    """
    We use this object to make a mini batch of experiences.

    __init__:
    - Initialize the runner

    run():
    - Make a mini batch
    """

    def __init__(self, *, env, model, nsteps, gamma, lam, num_embeddings):
        super().__init__(env=env, model=model, nsteps=nsteps)
        # Lambda used in GAE (generalized advantage estimation)
        self.lam = lam
        # Discount rate
        self.gamma = gamma
        self.num_embeddings = num_embeddings

    def run(self):
        # Initialize the lists that will collect the mini batch of experiences
        mb_obs, mb_rewards, mb_actions, mb_values, mb_encoding_indices, mb_dones, mb_neglogpacs = [], [], [], [], [], [], []
        mb_states = self.states
        epinfos = []

        # For nsteps, interact with the vectorized environments
        for _ in range(self.nsteps):
            # Given the current observations, get actions, value estimates, encoding
            # indices and negative log-probabilities, and update the recurrent states (if any)
            actions, values, encoding_indices, self.states, neglogpacs = self.model.step(
                self.obs, S=self.states, M=self.dones)

            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            encoding_indices = np.squeeze(encoding_indices)
            mb_encoding_indices.append(encoding_indices)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(self.dones)

            # Take the actions in the environments and record the results
            self.obs[:], rewards, self.dones, infos = self.env.step(actions)
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo:
                    epinfos.append(maybeepinfo)
            mb_rewards.append(rewards)

        # Convert the batch of steps to arrays of shape (nsteps, nenv, ...)
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_encoding_indices = np.asarray(mb_encoding_indices, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=bool)
        # Bootstrap value estimate for the observations reached after the last step
        last_values = self.model.value(self.obs, S=self.states, M=self.dones)

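        # Compute advantages with generalized advantage estimation (GAE),
        # working backwards in time:
        #   delta_t = r_t + gamma * V(s_{t+1}) * nonterminal_{t+1} - V(s_t)
        #   A_t     = delta_t + gamma * lam * nonterminal_{t+1} * A_{t+1}
        # The value-function targets are then the returns R_t = A_t + V(s_t).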
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0
        for t in reversed(range(self.nsteps)):
            if t == self.nsteps - 1:
                nextnonterminal = 1.0 - self.dones
                nextvalues = last_values
            else:
                nextnonterminal = 1.0 - mb_dones[t + 1]
                nextvalues = mb_values[t + 1]
            delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
            mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
        mb_returns = mb_advs + mb_values

        # Flatten each (nsteps, nenv, ...) batch to (nsteps * nenv, ...) before returning
        return (*map(sf01, (mb_obs, mb_returns, mb_encoding_indices, mb_dones,
                            mb_actions, mb_values, mb_neglogpacs)),
                mb_states, epinfos)


def sf01(arr):
    """
    Swap axes 0 and 1, then flatten them: (nsteps, nenv, ...) -> (nsteps * nenv, ...).
    """
    s = arr.shape
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])
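
# A minimal usage sketch, assuming `env` is a vectorized environment and `model`
# exposes the step()/value() interface used above; the names and hyperparameter
# values below are illustrative only.
#
#     runner = Runner(env=env, model=model, nsteps=128,
#                     gamma=0.99, lam=0.95, num_embeddings=8)
#     obs, returns, encoding_indices, dones, actions, values, neglogpacs, states, epinfos = runner.run()
#
# The seven arrays are passed through sf01, so their leading axis has length
# nsteps * nenvs; states and epinfos are returned as-is.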