Versioni tf: 2.3.0
import numpy as np
import tensorflow as tf
from tf_agents.agents.reinforce import reinforce_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import actor_distribution_network
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
tf.compat.v1.enable_v2_behavior()
env_name='CartPole-v0'
num_iterations=1
collect_episodes_per_iteration=2
replay_buffer_capacity=2000
fc_layer_params=(100, )
learning_rate=1e-3
log_interval=5
num_eval_episodes=10
eval_interval=10
env=suite_gym.load(env_name)
env.reset()
time_step=env.reset()
train_py_env=suite_gym.load(env_name)
train_env=tf_py_environment.TFPyEnvironment(train_py_env)
actor_net=actor_distribution_network.ActorDistributionNetwork(train_env.observation_spec(), train_env.action_spec(), fc_layer_params=fc_layer_params)
optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
train_step_counter=tf.compat.v2.Variable(0)
tf_agent=reinforce_agent.ReinforceAgent(train_env.time_step_spec(),
train_env.action_spec(),
actor_network=actor_net,
optimizer=optimizer,
normalize_returns=True,
train_step_counter=train_step_counter)
tf_agent.initialize()
eval_policy=tf_agent.policy
collect_policy=tf_agent.collect_policy
replay_buffer=tf_uniform_replay_buffer.TFUniformReplayBuffer(
data_spec=tf_agent.collect_data_spec,
batch_size=train_env.batch_size,
max_length=replay_buffer_capacity
)
tf_agent.train=common.function(tf_agent.train)
def collect_episode(environment, policy, num_episodes):
episode_counter=0
environment.reset()
while episode_counter<num_episodes:
time_step=environment.current_time_step()
action_step=policy.action(time_step)
next_time_step=environment.step(action_step.action)
traj=trajectory.from_transition(time_step, action_step, next_time_step)
replay_buffer.add_batch(traj)
if traj.is_boundary():
episode_counter+=1
collect_episode(train_env, tf_agent.collect_policy, 1)
experience=replay_buffer.gather_all()
for _ in range(num_iterations):
collect_episode(train_env, tf_agent.collect_policy, collect_episodes_per_iteration)
from copy import copy
before=copy(tf_agent.trainable_variables)
experience=replay_buffer.gather_all()
train_loss=tf_agent.train(experience)
replay_buffer.clear()
after=copy(tf_agent.trainable_variables)
print('before==after?', before==after)
https://www.tensorflow.org/agents/tutorials/6_reinforce_tutorial
Unë po ndiqja mësimet e TFAgents, por e gjeta atë
before=copy(tf_agent.trainable_variables)
tf_agent.train(experience)
after=copy(tf_agent.trainable_variables)
Atëherë 'para' duhet të jetë e ndryshme me 'pas'. Por (para==pas) gjithmonë përfaqëson 'E vërtetë'.
Unë jam shumë konfuz për këtë. Mendova se gradientët mund të jenë zero.
Megjithatë, është e paarsyeshme që humbja e modeles vazhdon të ulet në një hap trajnimi.
Në modulin reinforce_agent, hapi i shiritit gradient ishte shkruar mirë..
Nuk mund ta gjej se cili është problemi... edhe tf_agent.policy.trainable_variables është i njëjtë pavarësisht nga një hap trajnimi..