
Commit 76611cf

polish(xjy): standardize decode text related code for jericho (#366)
1 parent 2204abc commit 76611cf

12 files changed: +88 -333 lines changed


lzero/entry/eval_muzero.py

Lines changed: 0 additions & 2 deletions
@@ -60,10 +60,8 @@ def eval_muzero(
 
     # load pretrained model
     if model_path is not None:
-        # print(policy._learn_model.representation_network.pretrained_model.encoder.layer[0].attention.output.LayerNorm.weight)
         logging.info(f"Loading pretrained model from {model_path}...")
         policy.learn_mode.load_state_dict(torch.load(model_path, map_location=cfg.policy.device))
-        # policy.eval_mode.load_state_dict(torch.load(model_path, map_location=cfg.policy.device))
         logging.info("Pretrained model loaded successfully!")
     else:
         logging.warning("model_path is None!!!")

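For reference, a minimal sketch of the checkpoint-loading pattern this hunk cleans up, using a toy module and a throwaway checkpoint path in place of LightZero's policy and config (both hypothetical, not part of this commit):

import logging
from typing import Optional

import torch
import torch.nn as nn

def load_pretrained(model: nn.Module, model_path: Optional[str], device: str) -> None:
    if model_path is not None:
        logging.info(f"Loading pretrained model from {model_path}...")
        # map_location lets a checkpoint trained on GPU be loaded on a CPU-only machine.
        model.load_state_dict(torch.load(model_path, map_location=device))
        logging.info("Pretrained model loaded successfully!")
    else:
        logging.warning("model_path is None!!!")

net = nn.Linear(4, 2)
torch.save(net.state_dict(), "/tmp/toy_ckpt.pth")
load_pretrained(net, "/tmp/toy_ckpt.pth", device="cpu")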
lzero/mcts/buffer/game_buffer.py

Lines changed: 6 additions & 3 deletions
@@ -151,13 +151,16 @@ def _sample_orig_data(self, batch_size: int) -> Tuple:
             # Indices exceeding `game_segment_length` are padded with the next segment and are not updated
             # in the current implementation. Therefore, we need to sample `pos_in_game_segment` within
             # [0, game_segment_length - num_unroll_steps] to avoid padded data.
-
+
             if self._cfg.action_type == 'varied_action_space':
-                # TODO: Consider increasing `self._cfg.game_segment_length` to ensure sampling efficiency.
+                # For multi-environment training (e.g., Jericho), each environment may have a different discrete action space size.
+                # To ensure we can always unroll `num_unroll_steps` steps starting from the sampled position (without exceeding segment length),
+                # we avoid sampling from the last `num_unroll_steps` steps of the game segment.
                 if pos_in_game_segment >= self._cfg.game_segment_length - self._cfg.num_unroll_steps:
                     pos_in_game_segment = np.random.choice(self._cfg.game_segment_length - self._cfg.num_unroll_steps, 1).item()
             else:
-                # NOTE: Sample the init position from the whole segment, but not from the padded part
+                # For environments with a fixed action space (e.g., Atari),
+                # we can safely sample from the entire game segment range.
                 if pos_in_game_segment >= self._cfg.game_segment_length:
                     pos_in_game_segment = np.random.choice(self._cfg.game_segment_length, 1).item()
 

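Read on its own, the new comments describe a simple clipping rule for the sampled start position. Below is a standalone sketch of that rule with made-up segment and unroll lengths; the helper name is illustrative, not part of the buffer API:

import numpy as np

def clip_start_position(pos: int, game_segment_length: int, num_unroll_steps: int,
                        varied_action_space: bool) -> int:
    # Re-sample the start position so that `num_unroll_steps` transitions can be
    # unrolled from it without running into padded data at the end of the segment.
    if varied_action_space:
        # e.g., Jericho: action space size differs per environment, so stay clear
        # of the last `num_unroll_steps` positions.
        if pos >= game_segment_length - num_unroll_steps:
            pos = np.random.choice(game_segment_length - num_unroll_steps, 1).item()
    else:
        # e.g., Atari: fixed action space, any in-segment position is acceptable.
        if pos >= game_segment_length:
            pos = np.random.choice(game_segment_length, 1).item()
    return pos

# With a 400-step segment and 10 unroll steps, a start position of 395 is re-drawn from [0, 390).
print(clip_start_position(395, game_segment_length=400, num_unroll_steps=10, varied_action_space=True))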
lzero/mcts/tree_search/mcts_ctree.py

Lines changed: 2 additions & 0 deletions
@@ -93,6 +93,8 @@ def search(
 
        # preparation some constant
        batch_size = roots.num
+
+        # Store the latent state of each possible action at the MCTS root for each environment.
        first_action_latent_map = {env_id: {} for env_id in range(batch_size)}  # {env_id: {action: latent_state}}
 
        pb_c_base, pb_c_init, discount_factor = self._cfg.pb_c_base, self._cfg.pb_c_init, self._cfg.discount_factor

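The new comment documents the per-environment map from root actions to latent states. A small sketch of how such a map could be filled on the first expansion below the root follows; the `record_root_child` helper and the toy latent values are illustrative only, not the tree-search API:

from typing import Any, Dict

batch_size = 2  # number of parallel environments at the MCTS roots
# {env_id: {action: latent_state}}, mirroring the structure added in this hunk.
first_action_latent_map: Dict[int, Dict[int, Any]] = {env_id: {} for env_id in range(batch_size)}

def record_root_child(env_id: int, depth: int, action: int, latent_state: Any) -> None:
    # Cache the latent state only for actions taken directly below the root (depth 0),
    # keeping the first latent state seen for each such action.
    if depth == 0 and action not in first_action_latent_map[env_id]:
        first_action_latent_map[env_id][action] = latent_state

record_root_child(env_id=0, depth=0, action=3, latent_state=[0.1, 0.2])
record_root_child(env_id=0, depth=1, action=5, latent_state=[0.3, 0.4])  # ignored: not at the root
print(first_action_latent_map)  # {0: {3: [0.1, 0.2]}, 1: {}}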
lzero/model/common.py

Lines changed: 0 additions & 1 deletion
@@ -729,7 +729,6 @@ def __init__(
            # last_linear_layer_init_zero=True is beneficial for convergence speed.
            last_linear_layer_init_zero=True,
        )
-        # self.sim_norm = SimNorm(simnorm_dim=group_size)
 
        # # Select the normalization method based on the final_norm_option_in_encoder parameter.
        if final_norm_option_in_encoder.lower() == "simnorm":

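The surrounding context selects the encoder's final normalization from the `final_norm_option_in_encoder` string. Below is a minimal sketch of that dispatch, assuming a SimNorm implementation like the one referenced in the removed comment and a LayerNorm fallback; the exact options in `common.py` may differ:

import torch
import torch.nn as nn

class SimNorm(nn.Module):
    # Simplicial normalization: softmax over groups of size `simnorm_dim` (sketch only).
    def __init__(self, simnorm_dim: int):
        super().__init__()
        self.dim = simnorm_dim

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        shape = x.shape
        x = x.view(*shape[:-1], -1, self.dim)
        return x.softmax(dim=-1).view(*shape)

def build_final_norm(option: str, embedding_dim: int, group_size: int) -> nn.Module:
    option = option.lower()
    if option == "simnorm":
        return SimNorm(simnorm_dim=group_size)
    elif option == "layernorm":
        return nn.LayerNorm(embedding_dim)
    raise ValueError(f"Unsupported final_norm_option_in_encoder: {option}")

norm = build_final_norm("simnorm", embedding_dim=768, group_size=8)
print(norm(torch.randn(2, 768)).shape)  # torch.Size([2, 768])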
lzero/model/unizero_world_models/tokenizer.py

Lines changed: 4 additions & 6 deletions
@@ -188,25 +188,23 @@ def decode_to_plain_text_for_decoder(
            List[List[int]]: List of decoded strings, one per input in batch.
        """
 
-        # 设置 decoder_network projection_layer 为评估模式,关闭 dropout 等训练行为
+        # Set decoder_network and projection_layer to evaluation mode to disable dropout and other training-specific behaviors.
        self.decoder_network.eval()
        self.projection_layer.eval()
 
-        # 如果 embeddings 不是 Tensor,则转换为 torch.Tensor
+        # If embeddings is not a Tensor, convert it to a torch.Tensor.
        if not isinstance(embeddings, torch.Tensor):
            embeddings = torch.tensor(embeddings, dtype=torch.float32)
 
-        # 尝试从 decoder_network 获取设备信息,如果没有则从模型参数中获取
+        # Attempt to retrieve the device information from decoder_network; if unavailable, fall back to the model's parameters.
        try:
            device = self.decoder_network.device
        except AttributeError:
            device = next(self.decoder_network.parameters()).device
 
-        # 将 embeddings 移动到正确的设备上
        embeddings = embeddings.to(device)
 
-        with torch.no_grad():  # 在推理过程中关闭梯度计算,节约显存和计算
-
+        with torch.no_grad():
            if embeddings.dim() == 2:
                embeddings = embeddings.unsqueeze(1)

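The same defensive pattern (eval mode, tensor coercion, device fallback, no-grad inference) can be shown end to end with a toy decoder standing in for the project's `decoder_network`; everything below is a self-contained sketch, not the actual tokenizer API:

import torch
import torch.nn as nn

class ToyDecoder(nn.Module):
    def __init__(self, embed_dim: int = 8, vocab_size: int = 16):
        super().__init__()
        self.proj = nn.Linear(embed_dim, vocab_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.proj(x)

def decode_embeddings(decoder: nn.Module, embeddings) -> torch.Tensor:
    decoder.eval()  # disable dropout and other training-only behavior
    if not isinstance(embeddings, torch.Tensor):
        embeddings = torch.tensor(embeddings, dtype=torch.float32)
    # Prefer an explicit `.device` attribute; otherwise fall back to the parameters' device.
    device = getattr(decoder, "device", next(decoder.parameters()).device)
    embeddings = embeddings.to(device)
    with torch.no_grad():
        if embeddings.dim() == 2:  # (batch, embed_dim) -> (batch, 1, embed_dim)
            embeddings = embeddings.unsqueeze(1)
        return decoder(embeddings).argmax(dim=-1)  # greedy token ids per position

token_ids = decode_embeddings(ToyDecoder(), [[0.0] * 8, [1.0] * 8])
print(token_ids.shape)  # torch.Size([2, 1])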
lzero/model/unizero_world_models/tokenizer_bkp20250428.py

Lines changed: 0 additions & 244 deletions
This file was deleted.

lzero/model/unizero_world_models/world_model.py

Lines changed: 19 additions & 4 deletions
@@ -97,17 +97,19 @@ def __init__(self, config: TransformerConfig, tokenizer) -> None:
 
        # print(self.tokenizer.encoder.pretrained_model.encoder.layer[0].attention.output.LayerNorm.weight)
 
-        # 首先,构建需要跳过初始化的模块集合
+        # First, build the set of modules to skip during re-initialization
        skip_modules = set(self.tokenizer.encoder.pretrained_model.modules())
        skip_modules.update(self.tokenizer.decoder_network.modules())
 
        def custom_init(module):
-            # 如果当前 module 属于跳过初始化的模块,则直接返回
+            # If the current module is part of the skip list, return without reinitializing
            if module in skip_modules:
                return
-            # 否则使用指定的初始化方法
+            # Otherwise, apply the specified initialization method
            init_weights(module, norm_type=self.config.norm_type)
-        # 递归地对模型中所有子模块应用 custom_init 函数
+
+        # Recursively apply `custom_init` to all submodules of the model
+        # NOTE: This step is crucial — without skipping, pretrained modules (e.g., encoder/decoder) would be unintentionally re-initialized
        self.apply(custom_init)
 
        # Apply weight initialization, the order is important

@@ -1414,6 +1416,19 @@ def compute_loss(self, batch, target_tokenizer: Tokenizer = None, inverse_scalar
        else:
            dormant_ratio_world_model = torch.tensor(0.)
 
+        # ========== for visualization ==========
+        # Uncomment the lines below for visualization
+        # predict_policy = outputs.logits_policy
+        # predict_policy = F.softmax(outputs.logits_policy, dim=-1)
+        # predict_value = inverse_scalar_transform_handle(outputs.logits_value.reshape(-1, 101)).reshape(batch['observations'].shape[0], batch['observations'].shape[1], 1)
+        # predict_rewards = inverse_scalar_transform_handle(outputs.logits_rewards.reshape(-1, 101)).reshape(batch['observations'].shape[0], batch['observations'].shape[1], 1)
+        # import pdb; pdb.set_trace()
+        # visualize_reward_value_img_policy(original_images, reconstructed_images, target_predict_value, true_rewards, target_policy, predict_value, predict_rewards, predict_policy, not_plot_timesteps=[], suffix='pong_H10_H4_0613')
+
+        # visualize_reward_value_img_policy(original_images, reconstructed_images, target_predict_value, true_rewards, target_policy, predict_value, predict_rewards, predict_policy, not_plot_timesteps=list(np.arange(4,60)), suffix='visual_match_memlen1-60-15/one_success_episode')
+        # visualize_reward_value_img_policy(original_images, reconstructed_images, target_predict_value, true_rewards, target_policy, predict_value, predict_rewards, predict_policy, not_plot_timesteps=list(np.arange(4,60)), suffix='visual_match_memlen1-60-15/one_fail_episode')
+        # ========== for visualization ==========
+
        # For training stability, use target_tokenizer to compute the true next latent state representations
        with torch.no_grad():
            target_obs_embeddings = target_tokenizer.encode_to_obs_embeddings(batch['observations'])

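As a side note, the skip-set trick in `__init__` generalizes to any model that wraps pretrained submodules: collect the pretrained modules first, then re-initialize everything else via `nn.Module.apply`. A minimal sketch with a toy initializer standing in for `init_weights` (helper and class names here are illustrative):

import torch.nn as nn

class WrapperModel(nn.Module):
    def __init__(self, pretrained: nn.Module):
        super().__init__()
        self.pretrained = pretrained   # must keep its pretrained weights
        self.head = nn.Linear(16, 4)   # newly added part that should be re-initialized

    def reinit_except_pretrained(self) -> None:
        # Build the set of modules to skip: every submodule of the pretrained part.
        skip_modules = set(self.pretrained.modules())

        def custom_init(module: nn.Module) -> None:
            if module in skip_modules:
                return  # leave pretrained weights untouched
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                nn.init.zeros_(module.bias)

        # nn.Module.apply visits this module and all submodules recursively.
        self.apply(custom_init)

model = WrapperModel(pretrained=nn.Linear(16, 16))
model.reinit_except_pretrained()  # only `head` is re-initialized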