# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

from nemo_run.config import get_nemorun_home

from .utils import DEFAULT_NEMO_HOME


def parse_cli_args():
    """
    Command line arguments corresponding to the Slurm cluster and NeMo 2.0 settings
    for running pre-training and fine-tuning experiments.
    """
    parser = argparse.ArgumentParser(description="NeMo2.0 Performance Pretraining and Fine-Tuning")

    parser.add_argument(
        "-a",
        "--account",
        type=str,
        help="Slurm account to use for experiment",
        required=True,
    )
    parser.add_argument(
        "-p",
        "--partition",
        type=str,
        help="Slurm partition to use for experiment",
        required=True,
    )
    parser.add_argument(
        "-g",
        "--gpu",
        type=str,
        choices=["h100", "b200", "gb200"],
        help="Target gpu type.",
        required=True,
    )
    parser.add_argument(
        "-l",
        "--log_dir",
        type=str,
        help=f"Directory for logging experiment results. Defaults to {get_nemorun_home()}",
        required=False,
        default=get_nemorun_home(),
    )
    parser.add_argument(
        "-t",
        "--time_limit",
        type=str,
        help="Maximum time limit to run experiment for. Defaults to 30 minutes (format: 'HH:MM:SS')",
        required=False,
        default="00:30:00",
    )
    parser.add_argument(
        "--additional_slurm_params",
        type=str,
        help="Additional SLURM parameters as key=value pairs. "
        "Use semicolons (;) to separate parameters when values contain commas. "
        "Examples: 'nodelist=node001,node002;constraint=gpu' or 'reservation=my_res;exclusive'",
        required=False,
        default=None,
    )
    container_img_msg = [
        "NeMo container to use for experiment. Defaults to latest dev container: 'nvcr.io/nvidia/nemo:dev'.",
        "Make sure your NGC credentials are accessible in your environment.",
    ]
    parser.add_argument(
        "-i",
        "--container_image",
        type=str,
        help=" ".join(container_img_msg),
        required=False,
        default="nvcr.io/nvidia/nemo:dev",
    )
    parser.add_argument(
        "-c",
        "--compute_dtype",
        type=str,
        choices=["bf16", "fp8"],
        help="Compute precision. Options: bf16 or fp8. Defaults to bf16",
        required=False,
        default="bf16",
    )
    fp8_recipe_msg = (
        "FP8 recipe. Options: ds (per-tensor delayed scaling), cs (per-tensor current scaling), "
        "mxfp8, ss (subchannel scaling). Defaults to ds"
    )
    parser.add_argument(
        "-fr",
        "--fp8_recipe",
        type=str,
        choices=["ds", "cs", "mxfp8", "ss"],
        help=fp8_recipe_msg,
        required=False,
        default="ds",
    )
    parser.add_argument(
        "-en",
        "--enable_nsys",
        help="Enable Nsys profiling. Disabled by default",
        action="store_true",
    )
    parser.add_argument(
        "-em",
        "--enable_memory_profile",
        help="Enable memory usage profiling. Disabled by default",
        action="store_true",
    )
    parser.add_argument(
        "-mp",
        "--memory_profile_out_path",
        type=str,
        help="Path to the memory profiling output file",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-tb",
        "--tensorboard",
        help="Enable tensorboard logging. Disabled by default",
        action="store_true",
    )
    parser.add_argument(
        "-wd",
        "--wandb",
        help="Enable wandb logging. Disabled by default",
        action="store_true",
    )
    parser.add_argument(
        "-wdk",
        "--wandb_key",
        type=str,
        help="wandb API key. Needed for the wandb logger to connect to the server",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-wdp",
        "--wandb_prj_name",
        type=str,
        help="wandb project name",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-wdj",
        "--wandb_job_name",
        type=str,
        help="wandb job name",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-f",
        "--finetuning",
        choices=["sft", "lora"],
        help="Finetuning scheme to use. Defaults to 'lora'",
        default="lora",
    )
    parser.add_argument(
        "-hf",
        "--hf_token",
        type=str,
        help="HuggingFace token. Defaults to None. Required for accessing tokenizers and checkpoints.",
        default=None,
    )
    nemo_home_msg = [
        "Sets env var `NEMO_HOME` (on compute node using sbatch script): directory where NeMo searches",
        "for models and datasets. This saves a lot of time (especially for bigger models) if checkpoints already",
        f"exist here. Missing files will be downloaded here from HuggingFace. Defaults to {DEFAULT_NEMO_HOME}",
    ]
    parser.add_argument(
        "-nh",
        "--nemo_home",
        type=str,
        help=" ".join(nemo_home_msg),
        default=DEFAULT_NEMO_HOME,
    )
    parser.add_argument(
        "-d",
        "--dryrun",
        help="If set, prints the sbatch script to the terminal without launching the experiment.",
        required=False,
        action="store_true",
    )
    parser.add_argument(
        "-tp",
        "--tensor_parallel_size",
        type=int,
        help="Intra-layer model parallelism. Splits tensors across GPU ranks.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-pp",
        "--pipeline_parallel_size",
        type=int,
        help="Inter-layer model parallelism. Splits transformer layers across GPU ranks.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-cp",
        "--context_parallel_size",
        type=int,
        help="Splits network input along the sequence dimension across GPU ranks.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-vp",
        "--virtual_pipeline_parallel_size",
        type=int,
        help="Virtual pipeline model parallel size, i.e. the number of virtual blocks per pipeline model parallel rank.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-ep",
        "--expert_parallel_size",
        type=int,
        help="Distributes MoE experts across the sub data parallel dimension.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-et",
        "--expert_tensor_parallel_size",
        type=lambda x: int(x) if x is not None else None,
        nargs="?",
        const=None,
        help="Intra-layer tensor model parallelism for the expert layers. Splits tensors across GPU ranks. "
        "Pass '-et/--expert_tensor_parallel_size' without a value for None, or with an integer to set the size.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-mb",
        "--micro_batch_size",
        type=int,
        help="Micro batch size.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-gb",
        "--global_batch_size",
        type=int,
        help="Global batch size.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-ng",
        "--num_gpus",
        type=int,
        help="Number of gpus.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-gn",
        "--gpus_per_node",
        type=int,
        help="Number of gpus per node. Defaults to 8",
        required=False,
        default=8,
    )
    parser.add_argument(
        "-ms",
        "--max_steps",
        type=int,
        help="Number of train steps. Defaults to 100",
        required=False,
        default=100,
    )

    def bool_arg(arg):
        # Accept common truthy/falsy spellings so boolean flags can be set
        # explicitly, e.g. '--cuda_graphs true' or '--cuda_graphs 0'.
        if arg.lower() in ["true", "1", "t", "yes", "y"]:
            return True
        elif arg.lower() in ["false", "0", "f", "no", "n"]:
            return False
        else:
            raise ValueError(f"Invalid value for boolean argument: {arg}")

    parser.add_argument(
        "-cg",
        "--cuda_graphs",
        help="Enable CUDA graphs. Disabled by default",
        type=bool_arg,
        required=False,
        default=None,  # NOTE: DO NOT SET DEFAULT TO FALSE, IT WILL BE OVERRIDDEN BY THE RECOMMENDED MODEL CONFIGS
    )
    parser.add_argument(
        "-fsdp",
        "--use_mcore_fsdp",
        help="Enable Megatron Core (MCore) FSDP. Disabled by default",
        type=bool_arg,
        required=False,
        default=None,
    )
    parser.add_argument(
        "-fsdp_db",
        "--use_fsdp_double_buffer",
        help="Enable FSDP double buffer. Disabled by default",
        type=bool_arg,
        required=False,
        default=None,
    )
    parser.add_argument(
        "-ubr",
        "--use_user_buffer_registration",
        help="Enable user buffer registration. Disabled by default",
        type=bool_arg,
        required=False,
        default=None,
    )
    parser.add_argument(
        "-sharp",
        "--use_sharp",
        help="Enable SHARP. Disabled by default",
        type=bool_arg,
        required=False,
        default=None,
    )
    parser.add_argument(
        "-rl",
        "--recompute_layers",
        type=int,
        help="Number of Transformer layers to recompute; all intermediate activations "
        "of these layers are recomputed in the backward pass. Defaults to None",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-ol",
        "--activation_offload_layers",
        type=int,
        help="Number of Transformer layers whose activations are offloaded to CPU memory. Defaults to None",
        required=False,
        default=None,
    )
    parser.add_argument(
        "--nccl_communicator_config_path",
        type=str,
        help="Path to NCCL communicator config yaml file",
        required=False,
        default=None,
    )

    def list_of_strings(arg):
        # Split a comma separated string into a list, e.g. "src:dst,foo:bar" -> ["src:dst", "foo:bar"].
        return arg.split(",")

    parser.add_argument(
        "-rm",
        "--recompute_modules",
        nargs="*",
        const=None,
        type=str,
        help="List of modules to perform selective activation recompute on. "
        "Users can provide 0 or any number of arguments. Defaults to None",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-cm",
        "--custom_mounts",
        type=list_of_strings,
        help="Comma separated string of mounts",
        required=False,
        default=[],
    )
    parser.add_argument(
        "--use_hf_tokenizer",
        help="Use the HuggingFace tokenizer. Disabled by default; a null tokenizer is used if not provided.",
        action="store_true",
        required=False,
    )
    parser.add_argument(
        "-dcdfr",
        "--dump_config_diff_from_base_recipe",
        help="Dump the config diff from the base recipe. Defaults to False",
        action="store_true",
        required=False,
        default=False,
    )
    parser.add_argument(
        "--keep_fsdp_fp8_transpose_cache",
        help="Keep the FSDP FP8 transpose cache. Disabled by default",
        type=bool_arg,
        required=False,
        default=None,
    )
    parser.add_argument(
        "-vb",
        "--enable_vboost",
        help="Enable VBoost, which steers more power towards the tensor cores. Disabled by default",
        type=bool_arg,
        required=False,
        default=None,
    )
    parser.add_argument(
        "--use_te_op_fuser",
        help="Enable Transformer Engine's operation fuser. This feature is experimental and disabled by default",
        type=bool_arg,
        required=False,
        default=None,
    )
    parser.add_argument(
        "--use_te_act_func",
        help="Use the TE activation function for the MLP part.",
        type=bool_arg,
        required=False,
        default=None,
    )
    parser.add_argument(
        "--act_func_fp8_input_store",
        help="Store the input of the activation function in FP8 (tensorwise recipe). Disabled by default",
        type=bool_arg,
        required=False,
        default=False,
    )
    parser.add_argument(
        "--detach",
        help="Detach from the experiment. Defaults to True. Set to False to keep the process running.",
        type=bool_arg,
        required=False,
        default=True,
    )

    return parser
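

# Illustrative sketch (not part of this module's API): building the parser and
# parsing a typical set of arguments for a dry run. The account and partition
# values below are hypothetical; '-a', '-p' and '-g' are the only required flags.
#
#     args = parse_cli_args().parse_args(
#         ["-a", "my_account", "-p", "batch", "-g", "h100", "--dryrun"]
#     )
#     assert args.compute_dtype == "bf16" and args.max_steps == 100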


def parse_additional_slurm_params(params_str):
    """
    Parse additional SLURM parameters from a string of key=value pairs.

    This function handles different separator formats:
    1. Semicolon-separated: "key1=value1;key2=value2" (recommended for multiple parameters)
    2. Space-separated: "key1=value1 key2=value2"
    3. Single parameter: "key1=value1,value2" (no separators means a single parameter)

    Args:
        params_str (str): String with parameters

    Returns:
        dict: Dictionary of parameters, or None if params_str is None/empty

    Example:
        parse_additional_slurm_params("nodelist=node001,node002")
            returns {"nodelist": "node001,node002"}
        parse_additional_slurm_params("nodelist=node001,node002;constraint=gpu")
            returns {"nodelist": "node001,node002", "constraint": "gpu"}
        parse_additional_slurm_params("reservation=my_res;constraint=gpu")
            returns {"reservation": "my_res", "constraint": "gpu"}
    """
    if not params_str:
        return None

    params = {}

    # Try semicolon separation first (most reliable for complex values)
    if ";" in params_str:
        parts = params_str.split(";")
    # Try space separation next
    elif " " in params_str:
        parts = params_str.split()
    # No separators found - treat as single parameter
    else:
        parts = [params_str]

    for part in parts:
        part = part.strip()
        if not part:
            continue
        if "=" in part:
            key, value = part.split("=", 1)
            params[key.strip()] = value.strip()
        else:
            # Boolean flag (no value)
            params[part] = True

    return params if params else None
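

# Illustrative usage sketch (hypothetical launcher-side code, not part of this
# module): after CLI parsing, the extra Slurm parameters can be expanded into a
# dict before being handed to the scheduler configuration.
#
#     args = parse_cli_args().parse_args()
#     extra_slurm = parse_additional_slurm_params(args.additional_slurm_params)
#     # "nodelist=node001,node002;constraint=gpu"
#     #     -> {"nodelist": "node001,node002", "constraint": "gpu"}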