---
# Training configuration for an InternVLA-M1 VLA fine-tuning run.
# NOTE(review): this file was recovered from a whitespace-flattened dump;
# the nesting below is reconstructed from key semantics — verify it against
# the config loader's expected schema before relying on it.

# --- Run bookkeeping ---
run_id: 0906_bestvla_retrain_lr_v2
run_root_dir: ./results/Checkpoints
seed: 42
trackers:
  - jsonl
  - wandb
wandb_entity: jinhuiye
wandb_project: InternVLA
is_debug: false

# --- Model / framework definition ---
framework:
  framework_py: InternVLA-M1
  qwenvl:
    base_vlm: InternRobotics/InternVLA-M1
    attn_implementation: flash_attention_2
    vl_hidden_dim: 2048
  dino:
    dino_backbone: dinov2_vits14
  layer_qformer:
    qformer_end_layer: 37
    qformer_start_layer: 36
    num_query_tokens: 64
    input_dim: 2048
    # NOTE(review): "ouptput_dim" looks like a typo for "output_dim", but the
    # key is kept as-is because the loader may expect this exact spelling —
    # confirm in the framework code before renaming.
    ouptput_dim: 768
    grad_scale: 0.5
  action_model:
    action_model_type: DiT-B
    action_hidden_dim: 768
    action_dim: 7
    use_ema: false
    future_action_window_size: 15
    past_action_window_size: 0
    repeated_diffusion_steps: 8
    reduce_in_full_precision: true

# --- Datasets ---
datasets:
  vlm_data:
    dataformat: llava_json
    # NOTE(review): several dataset names (ao_droid_data, ao_droid_molmo_sam2,
    # ao_hoi4d_data, ao_hoi4d_frame_data, ao_maniskills) appear twice in this
    # list — possibly deliberate up-weighting; confirm it is intentional.
    # Value preserved byte-for-byte from the original.
    dataset_use: ao_droid_data,ao_droid_molmo_sam2,ao_hoi4d_data,ao_maniskills,ao_hoi4d_frame_data,pixmo_point,refspatial_sim%10,xudong_spatial_interact%10,xudong_invalid_task%10,xudong_task_onlyaction%10,xudong_task_cot_cap_resp_act%10,gsys2_14kv2_gd_coco_rule%10,gsys2_14kv2_obj_attr%10,gsys2_14kv2_obj_nearby%10,gsys2_14kv2_obj_senmatic%10,gsys2_14kv2_action_plan%10,asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_neg_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en,robo_point_refobj,robo_point_refregion,roborefit,ao_droid_data,ao_droid_molmo_sam2,ao_hoi4d_data,ao_hoi4d_frame_data,ao_maniskills,molmo_traj_auxiliary_trace,molmo_traj_bridge_dataset,molmo_traj_bc_z,molmo_traj_fractal
    eval_dataset: aokvqa_cauldron_llava_format
    data_flatten: false
    base_interval: 2
    max_pixels: 12845056
    min_pixels: 3136
    model_max_length: 2048
    model_type: qwen2.5vl
    per_device_batch_size: 2
  vla_data:
    dataset_py: rlds_datasets
    data_root_dir: playground/Datasets/OXE_openvla
    data_mix: bridge_rt_1
    # Double-quoted so the embedded newline (and the trailing space before it)
    # from the original prompt string is preserved exactly.
    CoT_prompt: "Your task is {instruction}. \nTo identify the key objects for your task. Locate their bounding boxes in [x1,y1,x2,y2] format."
    CoT_answer: bbox
    default_image_resolution:
      - 3
      - 224
      - 224
    shuffle_buffer_size: 250000
    image_aug: true
    per_device_batch_size: 16
    load_all_data_for_training: true

# --- Trainer / optimization ---
trainer:
  epochs: 100
  max_train_steps: 100000
  num_warmup_steps: 5000
  save_interval: 5000
  eval_interval: 100
  learning_rate:
    base: 4.0e-05
    qwen_vl_interface: 1.0e-05
    action_model: 0.0001
  lr_scheduler_type: cosine_with_min_lr
  scheduler_specific_kwargs:
    min_lr: 5.0e-07
  freeze_modules: null
  loss_scale:
    vla: 1.0
    vlm: 0.1
  max_grad_norm: 1.0
  warmup_ratio: 0.1
  weight_decay: 0.0
  logging_frequency: 10
  gradient_clipping: 1.0
  gradient_accumulation_steps: 1
  optimizer:
    name: AdamW
    betas:
      - 0.9
      - 0.95
    eps: 1.0e-08
    weight_decay: 1.0e-08
  is_resume: false
  resume_epoch: null
  resume_step: null
  enable_gradient_checkpointing: true
  enable_mixed_precision_training: true

# NOTE(review): "is_resume: false" appeared twice in the flattened original;
# its scope is ambiguous. Both occurrences are kept (one under trainer above,
# one here at top level) to avoid dropping data — confirm which one the
# loader actually reads and delete the other.
is_resume: false
output_dir: ./results/Checkpoints/0906_internvla_m1