# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
from typing import Any, Dict, List, Optional
import nemo_run as run
from nemo_run.config import get_nemorun_home
from nemo_run.core.execution.launcher import SlurmTemplate
from nemo.lightning.base import DEFAULT_NEMO_CACHE_HOME
from nemo.utils import logging
DEFAULT_NEMO_HOME = os.getenv('NEMO_HOME', DEFAULT_NEMO_CACHE_HOME)
# NOTE: If you update this template, please test it by submitting a job to a GPU node/cluster
# and verifying the generated sbatch and bash scripts.
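# `pre_cmds` below is filled from `custom_bash_cmds` via the SlurmTemplate launcher constructed later in
# this file; `command` is expected to be substituted by nemo_run with the task's launch command when the
# launcher template is rendered.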
INLINE_TEMPLATE = r"""
#!/usr/bin/env bash
set -euo pipefail
# NOTE: DO NOT change the single quotes to double quotes.
bash -c '{{ pre_cmds }} {{ command }}'
"""
def slurm_executor(
gpu: str,
account: str,
partition: str,
log_dir: str,
nodes: int,
num_gpus_per_node: int,
time_limit: str = "00:30:00",
container_image: str = "nvcr.io/nvidia/nemo:dev",
    custom_mounts: Optional[List[str]] = None,
    custom_env_vars: Optional[Dict[str, str]] = None,
    custom_srun_args: Optional[List[str]] = None,
    hf_token: Optional[str] = None,
    nemo_home: str = DEFAULT_NEMO_HOME,
    wandb_key: Optional[str] = None,
    network: Optional[str] = None,
    custom_bash_cmds: Optional[List[str]] = None,
    optional_gpus_per_node: Optional[int] = None,
    additional_slurm_params: Optional[Dict[str, Any]] = None,
) -> run.SlurmExecutor:
"""
    Slurm cluster definition with the cluster and NeMo container parameters needed for pre-training
    and fine-tuning experiments.
Args:
additional_slurm_params: Dict[str, Any], optional
Additional SLURM parameters to pass to sbatch. These will be converted to #SBATCH directives.
Example: {"nodelist": "node001,node002", "constraint": "gpu"} will generate:
#SBATCH --nodelist=node001,node002
#SBATCH --constraint=gpu
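
    Example (a minimal sketch; the GPU type, account, and partition are placeholders for your cluster):
        executor = slurm_executor(
            gpu="h100",
            account="your_account",
            partition="your_partition",
            log_dir=get_nemorun_home(),  # must match NEMORUN_HOME (see the check below)
            nodes=2,
            num_gpus_per_node=8,
        )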
"""
PERF_ENV_VARS = {
"TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
"HF_HUB_OFFLINE": "1", # Enable online downloads from HuggingFace
"TOKENIZERS_PARALLELISM": "False", # Restrict warning message prints
"NCCL_NVLS_ENABLE": "0", # Disable NVLink SHARP to save memory
"NVTE_FLASH_ATTN": "1", # Enable Flash Attention, which is needed to enable cuDNN fused attention
"NVTE_FUSED_ATTN": "1", # Enable cuDNN fused attention
"NEMO_LOG_MEMORY_USAGE": "1", # Print memory allocation
}
    # Avoid sharing mutable default arguments: normalize the optional collections to fresh objects.
    custom_bash_cmds = [] if custom_bash_cmds is None else custom_bash_cmds
    custom_mounts = [] if custom_mounts is None else custom_mounts
    custom_env_vars = {} if custom_env_vars is None else custom_env_vars
    custom_srun_args = [] if custom_srun_args is None else custom_srun_args
    err_msgs = []
    mounts = []
    srun_args = custom_srun_args.copy() + ["--mpi=pmix", "--no-container-mount-home", "--container-writable"]
    if log_dir != get_nemorun_home():
        err_msgs.append(f"\nlog_dir must match NEMORUN_HOME. Run `export NEMORUN_HOME={log_dir}` in your shell environment and rerun this script.")
if len(err_msgs) > 0:
logging.error("\n".join(err_msgs))
sys.exit(1)
if gpu.lower() not in ['b200']:
# TODO: we currently disable PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"
# on B200 as it causes an unexpected error. Add back when issue is debugged and fixed.
PERF_ENV_VARS["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
PERF_ENV_VARS["NEMORUN_HOME"] = log_dir
if wandb_key is not None:
PERF_ENV_VARS["WANDB_API_KEY"] = wandb_key
if gpu.lower() == 'gb200':
PERF_ENV_VARS["NCCL_NET_GDR_LEVEL"] = "PHB" # For NCCL 2.25
PERF_ENV_VARS["NCCL_NET_GDR_C2C"] = "1" # For NCCL 2.26
if nemo_home != DEFAULT_NEMO_CACHE_HOME: # DO NOT change this to 'DEFAULT_NEMO_HOME'/'NEMO_HOME'
PERF_ENV_VARS["NEMO_HOME"] = nemo_home
mounts.extend([f"{nemo_home}:{nemo_home}"])
if hf_token is not None:
PERF_ENV_VARS.update({"HF_TOKEN": hf_token, "HF_HUB_OFFLINE": "0"})
PERF_ENV_VARS |= custom_env_vars
mounts.extend(custom_mounts)
# add --segment flag to sbatch if job uses GB200 and goes beyond one rack.
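    # A GB200 NVL72 rack is assumed to hold 18 nodes with 4 GPUs each; pick the largest divisor of
    # `nodes` that is at most 18 as the segment size so that each segment fits within a single rack.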
segment = None
if num_gpus_per_node == 4 and nodes > 18:
for segment_candidate in range(18, 0, -1):
if nodes % segment_candidate == 0:
segment = segment_candidate
break
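    # Bind each rank's CPUs and memory to a NUMA domain derived from its local rank (assumes two NUMA
    # domains per node: 2 ranks per domain on GB200 nodes, 4 ranks per domain on other node types).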
numa_divisor = 2 if gpu.lower() == 'gb200' else 4
numa_cmd = f"numactl --cpunodebind=$((SLURM_LOCALID/{numa_divisor})) --membind=$((SLURM_LOCALID/{numa_divisor}))"
custom_bash_cmds.append(numa_cmd)
launcher = SlurmTemplate(
template_inline=INLINE_TEMPLATE,
template_vars={"pre_cmds": " ; ".join(custom_bash_cmds)},
)
executor = run.SlurmExecutor(
account=account,
partition=partition,
tunnel=run.LocalTunnel(job_dir=os.path.join(log_dir, "experiments")),
nodes=nodes,
ntasks_per_node=num_gpus_per_node,
gpus_per_node=optional_gpus_per_node,
container_image=container_image,
container_mounts=mounts,
env_vars=PERF_ENV_VARS,
srun_args=srun_args,
time=time_limit,
mem="0",
exclusive=True,
packager=run.GitArchivePackager(),
segment=segment,
network=network,
launcher=launcher,
additional_parameters=additional_slurm_params,
)
return executor
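

# A minimal sketch of plugging the executor into a nemo_run experiment (hypothetical `recipe` object;
# the exact task/recipe and experiment name depend on your workload):
#
#     executor = slurm_executor(...)  # see the Example in the docstring above
#     with run.Experiment("pretrain") as exp:
#         exp.add(recipe, executor=executor)
#         exp.run()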