Spaces:
Runtime error
Runtime error
| # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| import argparse | |
| from nemo_run.config import get_nemorun_home | |
| from .utils import DEFAULT_NEMO_HOME | |
def parse_cli_args():
    """
    Build the argument parser for Slurm cluster and NeMo2.0 settings used to run
    pre-training and fine-tuning performance experiments.

    Returns:
        argparse.ArgumentParser: fully configured parser. Callers are expected to
        invoke ``parse_args()``/``parse_known_args()`` on the returned object.
    """
    parser = argparse.ArgumentParser(description="NeMo2.0 Performance Pretraining and Fine-Tuning")

    # --- Slurm / launcher options ---
    parser.add_argument(
        "-a",
        "--account",
        type=str,
        help="Slurm account to use for experiment",
        required=True,
    )
    parser.add_argument(
        "-p",
        "--partition",
        type=str,
        help="Slurm partition to use for experiment",
        required=True,
    )
    parser.add_argument(
        "-g",
        "--gpu",
        type=str,
        choices=["h100", "b200", "gb200"],
        help="Target gpu type.",
        required=True,
    )
    parser.add_argument(
        "-l",
        "--log_dir",
        type=str,
        help=f"Directory for logging experiment results. Defaults to {get_nemorun_home()}",
        required=False,
        default=get_nemorun_home(),
    )
    parser.add_argument(
        "-t",
        "--time_limit",
        type=str,
        help="Maximum time limit to run experiment for. Defaults to 30 minutes (format- 'HH:MM:SS')",
        required=False,
        default="00:30:00",
    )
    parser.add_argument(
        "--additional_slurm_params",
        type=str,
        help="Additional SLURM parameters as key=value pairs. "
        "Use semicolons (;) to separate parameters when values contain commas. "
        "Examples: 'nodelist=node001,node002;constraint=gpu' or 'reservation=my_res;exclusive'",
        required=False,
        default=None,
    )
    container_img_msg = [
        "NeMo container to use for experiment. Defaults to latest dev container- 'nvcr.io/nvidia/nemo:dev'",
        "Make sure your NGC credentials are accessible in your environment.",
    ]
    parser.add_argument(
        "-i",
        "--container_image",
        type=str,
        help=" ".join(container_img_msg),
        required=False,
        default="nvcr.io/nvidia/nemo:dev",
    )

    # --- Precision options ---
    parser.add_argument(
        "-c",
        "--compute_dtype",
        type=str,
        choices=["bf16", "fp8"],
        help="Compute precision. Options- bf16 or fp8. Defaults to bf16",
        required=False,
        default="bf16",
    )
    fp8_recipe_msg = (
        "FP8 recipe. Options- ds (per-tensor delayed scaling), cs (per-tensor current scaling), "
        "mxfp8, ss (subchannel scaling). Defaults to ds"
    )
    parser.add_argument(
        "-fr",
        "--fp8_recipe",
        type=str,
        choices=["ds", "cs", "mxfp8", "ss"],
        help=fp8_recipe_msg,
        required=False,
        default="ds",
    )

    # --- Profiling / logging options ---
    parser.add_argument(
        "-en",
        "--enable_nsys",
        help="Enable Nsys profiling. Disabled by default",  # typo fix: "Diabled" -> "Disabled"
        action="store_true",
    )
    parser.add_argument(
        "-em",
        "--enable_memory_profile",
        help="Enable memory usage profiling. Disabled by default",  # typo fix: "Diabled" -> "Disabled"
        action="store_true",
    )
    parser.add_argument(
        "-mp",
        "--memory_profile_out_path",
        type=str,
        help="Path to the output file of memory profiling",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-tb",
        "--tensorboard",
        help="Enable tensorboard logging. Disabled by default",
        action="store_true",
    )
    parser.add_argument(
        "-wd",
        "--wandb",
        help="Enable wandb logging. Disabled by default",
        action="store_true",
    )
    parser.add_argument(
        "-wdk",
        "--wandb_key",
        type=str,
        help="wandb key. Needed for wandb logger projection to server",  # typo fix: "projetion"
        required=False,
        default=None,
    )
    parser.add_argument(
        "-wdp",
        "--wandb_prj_name",
        type=str,
        help="wandb project name",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-wdj",
        "--wandb_job_name",
        type=str,
        help="wandb job name",
        required=False,
        default=None,
    )

    # --- Task / model access options ---
    parser.add_argument(
        "-f",
        "--finetuning",
        choices=["sft", "lora"],
        help="Finetuning scheme to use. Defaults to 'lora'",
        default='lora',
    )
    parser.add_argument(
        "-hf",
        "--hf_token",
        type=str,
        help="HuggingFace token. Defaults to None. Required for accessing tokenizers and checkpoints.",
        default=None,
    )
    nemo_home_msg = [
        "Sets env var `NEMO_HOME` (on compute node using sbatch script)- directory where NeMo searches",
        "for models and datasets. This saves a lot of time (especially for bigger models) if checkpoints already",
        f"exist here. Missing files will be downloaded here from HuggingFace. Defaults to {DEFAULT_NEMO_HOME}",
    ]
    parser.add_argument(
        "-nh",
        "--nemo_home",
        type=str,
        help=" ".join(nemo_home_msg),
        default=DEFAULT_NEMO_HOME,
    )
    parser.add_argument(
        "-d",
        "--dryrun",
        help="If true, prints sbatch script to terminal without launching experiment.",
        required=False,
        action="store_true",
    )

    # --- Parallelism options (None means "use the recipe's recommended value") ---
    parser.add_argument(
        "-tp",
        "--tensor_parallel_size",
        type=int,
        help="Intra-layer model parallelism. Splits tensors across GPU ranks.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-pp",
        "--pipeline_parallel_size",
        type=int,
        help="Inter-layer model parallelism. Splits transformer layers across GPU ranks.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-cp",
        "--context_parallel_size",
        type=int,
        help="Splits network input along sequence dimension across GPU ranks.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-vp",
        "--virtual_pipeline_parallel_size",
        type=int,
        help="Number of virtual blocks per pipeline model parallel rank is the virtual model parallel size.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-ep",
        "--expert_parallel_size",
        type=int,
        help="Distributes Moe Experts across sub data parallel dimension.",
        required=False,
        default=None,
    )
    # nargs="?" + const=None lets the flag be passed without a value to mean None.
    # argparse only applies `type` to command-line strings (never to `const`), so a
    # plain int converter is sufficient; the old None-check in the lambda was dead code.
    parser.add_argument(
        "-et",
        "--expert_tensor_parallel_size",
        type=int,
        nargs="?",
        const=None,
        help="Intra-layer tensor model parallelism for expert layer. Splits tensors across GPU ranks. "
        "Use -et/--expert_tensor_parallel_size <space> for None or -et/--expert_tensor_parallel_size <int>",
        required=False,
        default=None,
    )

    # --- Batch size / schedule options ---
    parser.add_argument(
        "-mb",
        "--micro_batch_size",
        type=int,
        required=False,
        default=None,
    )
    parser.add_argument(
        "-gb",
        "--global_batch_size",
        type=int,
        required=False,
        default=None,
    )
    parser.add_argument(
        "-ng",
        "--num_gpus",
        type=int,
        help="Number of gpus.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-gn",
        "--gpus_per_node",
        type=int,
        help="Number of gpus per node. Defaults to 8",
        required=False,
        default=8,
    )
    parser.add_argument(
        "-ms",
        "--max_steps",
        type=int,
        help="Number of train steps. Defaults to 100",
        required=False,
        default=100,
    )

    def bool_arg(arg):
        """Convert a truthy/falsy CLI string ('true'/'t'/'1'/'yes'/... or the
        'false' equivalents, case-insensitive) to a bool; raise on anything else."""
        if arg.lower() in ['true', '1', 't', 'yes', 'y']:
            return True
        elif arg.lower() in ['false', '0', 'f', 'no', 'n']:
            return False
        else:
            raise ValueError(f"Invalid value for boolean argument: {arg}")

    # --- Feature toggles (tri-state: None defers to the recommended model config) ---
    parser.add_argument(
        "-cg",
        "--cuda_graphs",
        help="Enable CUDA graphs. Disabled by default",
        type=bool_arg,
        required=False,
        default=None,  # NOTE: DO NOT SET DEFAULT TO FALSE, IT WILL BE OVERRIDDEN BY THE RECOMMENDED MODEL CONFIGS
    )
    parser.add_argument(
        "-fsdp",
        "--use_mcore_fsdp",
        help="Enable Megatron Core (Mcore) FSDP. Disabled by default",
        type=bool_arg,
        required=False,
        default=None,
    )
    parser.add_argument(
        "-fsdp_db",
        "--use_fsdp_double_buffer",
        help="Enable FSDP double buffer. Disabled by default",
        type=bool_arg,
        required=False,
        default=None,
    )
    parser.add_argument(
        "-ubr",
        "--use_user_buffer_registration",
        help="Enable user buffer registration. Disabled by default",
        type=bool_arg,
        required=False,
        default=None,
    )
    parser.add_argument(
        "-sharp",
        "--use_sharp",
        help="Enable sharp. Disabled by default",
        type=bool_arg,
        required=False,
        default=None,
    )

    # --- Activation recompute / offload options ---
    parser.add_argument(
        "-rl",
        "--recompute_layers",
        type=int,
        help="Number of Transformer layers to recompute, where all the intermediate "
        "activations of a Transformer layer are computed. Defaults to None",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-ol",
        "--activation_offload_layers",
        type=int,
        help="Number of Transformer layers to offload to the CPU memory. Defaults to None",
        required=False,
        default=None,
    )
    parser.add_argument(
        "--nccl_communicator_config_path",
        type=str,
        help="Path to NCCL communicator config yaml file",
        required=False,
        default=None,
    )

    def list_of_strings(arg):
        """Split a comma-separated CLI string into a list of strings."""
        return arg.split(',')

    # `const` removed: argparse only consults `const` with nargs="?"; with
    # nargs="*" an option given zero values already yields an empty list.
    parser.add_argument(
        "-rm",
        "--recompute_modules",
        nargs="*",
        type=str,
        help="List of modules to perform selective activation recompute. "
        "Users can provide 0 or any number of arguments. Defaults to None",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-cm",
        "--custom_mounts",
        type=list_of_strings,
        help="Comma separated string of mounts",
        required=False,
        default=[],
    )
    parser.add_argument(
        "--use_hf_tokenizer",
        help="Use HuggingFace tokenizer. Disabled by default. Null tokenizer will be used if not provided.",
        action="store_true",
        required=False,
    )
    parser.add_argument(
        "-dcdfr",
        "--dump_config_diff_from_base_recipe",
        help="Dump the config diff from the base recipe. Defaults to False",
        action="store_true",
        required=False,
        default=False,
    )
    parser.add_argument(
        "--keep_fsdp_fp8_transpose_cache",
        help="Keep FSDP FP8 transpose cache. Disabled by default",
        type=bool_arg,
        required=False,
        default=None,
    )
    parser.add_argument(
        "-vb",
        "--enable_vboost",
        help="Enable VBoost which steers more power towards tensor cores. Disabled by default",
        type=bool_arg,
        required=False,
        default=None,
    )
    parser.add_argument(
        "--use_te_op_fuser",
        help="Enable Transformer Engine's operation fuser. This feature is experimental and disabled by default",
        type=bool_arg,
        required=False,
        default=None,
    )
    parser.add_argument(
        "--use_te_act_func",
        help="Use TE activation function for the MLP part.",
        type=bool_arg,
        required=False,
        default=None,
    )
    parser.add_argument(
        "--act_func_fp8_input_store",
        help="Store input of activation function in FP8 (tensorwise recipe). Disabled by default",
        type=bool_arg,
        required=False,
        default=False,
    )
    parser.add_argument(
        "--detach",
        help="Detach from experiment. Default is True. Unset to keep the process running.",
        type=bool_arg,
        required=False,
        default=True,
    )
    return parser
def parse_additional_slurm_params(params_str):
    """
    Parse additional SLURM parameters from a string of key=value pairs.

    Supported separator formats:
    1. Semicolon-separated: "key1=value1;key2=value2" (recommended for multiple parameters)
    2. Space-separated: "key1=value1 key2=value2"
    3. Single parameter: "key1=value1,value2" (no separators = single parameter)

    Args:
        params_str (str): String with parameters

    Returns:
        dict: Dictionary of parameters, or None if params_str is None/empty

    Example:
        parse_additional_slurm_params("nodelist=node001,node002")
        returns {"nodelist": "node001,node002"}

        parse_additional_slurm_params("nodelist=node001,node002;constraint=gpu")
        returns {"nodelist": "node001,node002", "constraint": "gpu"}

        parse_additional_slurm_params("reservation=my_res;constraint=gpu")
        returns {"reservation": "my_res", "constraint": "gpu"}
    """
    if not params_str:
        return None

    # Pick a tokenizer: semicolons take priority (safe when values contain
    # commas), then whitespace; with neither present the whole string is one token.
    if ';' in params_str:
        tokens = params_str.split(';')
    elif ' ' in params_str:
        tokens = params_str.split()
    else:
        tokens = [params_str]

    parsed = {}
    for token in (t.strip() for t in tokens):
        if not token:
            continue  # ignore empty fragments such as trailing separators
        key, sep, value = token.partition('=')
        # A token without '=' is treated as a boolean flag.
        parsed[key.strip()] = value.strip() if sep else True

    return parsed or None