# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

import nemo_run as run

from nemo.collections import llm


def local_executor_torchrun(devices: int = 2) -> run.LocalExecutor:
    """Build a local executor that launches `devices` processes via torchrun."""
    env_vars = {
        "TRANSFORMERS_OFFLINE": "1",  # Run HuggingFace in offline mode (no online downloads)
        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",  # Disable caching NCCL communication buffer memory
        "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
    }

    executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)

    return executor


def slurm_executor(
    user: str,
    host: str,
    remote_job_dir: str,
    account: str,
    partition: str,
    nodes: int,
    devices: int,
    time: str = "01:00:00",
    custom_mounts: Optional[list[str]] = None,
    custom_env_vars: Optional[dict[str, str]] = None,
    container_image: str = "nvcr.io/nvidia/nemo:dev",
    retries: int = 0,
) -> run.SlurmExecutor:
    """Build a Slurm executor that submits the job to a remote cluster over an SSH tunnel."""
    if not (user and host and remote_job_dir and account and partition and nodes and devices):
        raise RuntimeError(
            "Please set the user, host, remote_job_dir, account, partition, nodes, and devices arguments to use this function."
        )

    mounts = []
    if custom_mounts:
        mounts.extend(custom_mounts)

    env_vars = {
        "TRANSFORMERS_OFFLINE": "1",  # Run HuggingFace in offline mode (no online downloads)
        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",  # Disable caching NCCL communication buffer memory
        "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
    }
    if custom_env_vars:
        env_vars |= custom_env_vars

    executor = run.SlurmExecutor(
        account=account,
        partition=partition,
        tunnel=run.SSHTunnel(
            user=user,
            host=host,
            job_dir=remote_job_dir,
        ),
        nodes=nodes,
        ntasks_per_node=devices,
        gpus_per_node=devices,
        mem="0",
        exclusive=True,
        gres="gpu:8",  # Assumes nodes with 8 GPUs; adjust to match your cluster.
        packager=run.GitArchivePackager(subpath="examples/llm/run"),
    )

    executor.container_image = container_image
    executor.container_mounts = mounts
    executor.env_vars = env_vars
    executor.retries = retries
    executor.time = time

    return executor


def my_slurm_executor():
    # TODO: Set your custom parameters for the Slurm executor.
    return slurm_executor(
        user="",
        host="",
        remote_job_dir="",
        account="",
        partition="",
        nodes=1,
        devices=2,
    )


if __name__ == "__main__":
    # This re-exposes the pretrain entrypoint with your custom local executor as the default.
    run.cli.main(llm.pretrain, default_executor=local_executor_torchrun)

    # To run, for instance, the llama3_8b recipe, use the following command:
    # python default_executor.py --factory llama3_8b

    # To run with any overrides, use the following command:
    # python default_executor.py --factory llama3_8b trainer.max_steps=2000

    # To use your custom Slurm executor, use the following command:
    # python default_executor.py --executor my_slurm_executor --factory llama3_8b
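
    # As a rough sketch, a filled-in Slurm factory might look like the block below.
    # All values (user, host, paths, account, partition, node/GPU counts) are
    # hypothetical placeholders; replace them with your own cluster details.
    #
    # def my_slurm_executor():
    #     return slurm_executor(
    #         user="jdoe",                                  # hypothetical cluster username
    #         host="login.example-cluster.com",             # hypothetical login node
    #         remote_job_dir="/scratch/jdoe/nemo-runs",     # hypothetical remote job directory
    #         account="my_account",
    #         partition="gpu",
    #         nodes=2,
    #         devices=8,
    #     )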