subhankarg committed on
Commit 0558aa4 · verified · 1 Parent(s): 5b5c9b5

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the full set.

Files changed (50)
  1. .coveragerc +36 -0
  2. .dockerignore +19 -0
  3. .flake8 +9 -0
  4. .flake8.other +9 -0
  5. .flake8.speech +9 -0
  6. .gitattributes +88 -0
  7. .github/CODEOWNERS +4 -0
  8. .github/ISSUE_TEMPLATE/bug_report.md +42 -0
  9. .github/ISSUE_TEMPLATE/config.yml +2 -0
  10. .github/ISSUE_TEMPLATE/dev_container_bug_report.md +35 -0
  11. .github/ISSUE_TEMPLATE/feature_request.md +25 -0
  12. .github/PULL_REQUEST_TEMPLATE.md +57 -0
  13. .github/actions/cancel-workflow/action.yml +25 -0
  14. .github/actions/test-template/action.yml +227 -0
  15. .github/labeler.yml +55 -0
  16. .github/scripts/__init__.py +0 -0
  17. .github/scripts/components_to_run.py +84 -0
  18. .github/scripts/nemo_dependencies.py +400 -0
  19. .github/scripts/notify.py +79 -0
  20. .github/workflows/_build_container.yml +89 -0
  21. .github/workflows/_bump_mcore_tag.yml +56 -0
  22. .github/workflows/build-test-publish-wheel.yml +38 -0
  23. .github/workflows/changelog-build.yml +123 -0
  24. .github/workflows/cherry-pick-release-commit.yml +14 -0
  25. .github/workflows/cicd-approve-test-queue.yml +175 -0
  26. .github/workflows/cicd-main-nemo2.yml +299 -0
  27. .github/workflows/cicd-main-speech.yml +216 -0
  28. .github/workflows/cicd-main-testcopy.yml +472 -0
  29. .github/workflows/cicd-main-unit-tests.yml +212 -0
  30. .github/workflows/cicd-main.yml +450 -0
  31. .github/workflows/cicd-relabel-bot.yml +36 -0
  32. .github/workflows/close-inactive-issue-pr.yml +25 -0
  33. .github/workflows/code-formatting.yml +73 -0
  34. .github/workflows/code-init-file-checker.yml +23 -0
  35. .github/workflows/code-linting.yml +160 -0
  36. .github/workflows/codeql.yml +75 -0
  37. .github/workflows/community-bot.yml +15 -0
  38. .github/workflows/config/changelog-config.json +134 -0
  39. .github/workflows/config/codeql.yml +9 -0
  40. .github/workflows/copyright-check.yml +22 -0
  41. .github/workflows/gh-docs.yml +81 -0
  42. .github/workflows/install-test.yml +286 -0
  43. .github/workflows/labeler.yml +14 -0
  44. .github/workflows/mcore-tag-bump-bot.yml +62 -0
  45. .github/workflows/monitor-single-vm.yml +54 -0
  46. .github/workflows/monitor-vms.yml +54 -0
  47. .github/workflows/release-freeze.yml +85 -0
  48. .github/workflows/release.yml +48 -0
  49. .github/workflows/secrets-detector.yml +43 -0
  50. .github/workflows/update-buildcache.yml +110 -0
.coveragerc ADDED
@@ -0,0 +1,36 @@
+ [run]
+ concurrency = thread,multiprocessing
+ omit =
+     /tmp/*
+     /home/TestData/*
+     /workspace/Megatron-LM/*
+     nemo/collections/multimodal/*
+     nemo/collections/multimodal_autoregressive/*
+     nemo/collections/vision/*
+     nemo/collections/diffusion/*
+     nemo/collections/nlp/*
+
+     nemo/collections/asr/*
+     nemo/collections/speechlm/*
+     nemo/collections/tts/*
+
+     # omit from audio
+     nemo/collections/audio/data/data_simulation.py
+     nemo/collections/audio/metrics/squim.py
+     nemo/collections/audio/losses/maxine/*
+     nemo/collections/audio/models/maxine/*
+     nemo/collections/audio/parts/utils/maxine.py
+
+     nemo/core/*
+     nemo/collections/common/*
+
+     /workspace/config-3.12.py
+     /workspace/config-3.py
+     /workspace/config.py
+
+ [paths]
+ source =
+     nemo/
+     /home/runner/work/NeMo/NeMo/nemo
+     /workspace/nemo
+
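For orientation, a minimal sketch of how a CI job might exercise this config; the test path and commands are illustrative, not taken from this commit:

    # `coverage` reads .coveragerc from the working directory by default.
    coverage run -m pytest tests/collections/audio   # hypothetical test target
    coverage combine   # merge per-process data files (thread,multiprocessing above)
    coverage xml       # emit coverage.xml with the omit globs applied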
.dockerignore ADDED
@@ -0,0 +1,19 @@
+ __pycache__
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+ env
+ pip-log.txt
+ pip-delete-this-directory.txt
+ .tox
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *,cover
+ *.log
+ .git
+ **/*.nemo
+ **/*.ckpt
.flake8 ADDED
@@ -0,0 +1,9 @@
+ [flake8]
+ max-line-length = 119
+ select =
+     F541, # f-string without any placeholders
+     F841, # local variable 'x' is assigned to but never used
+     F401, # 'x' imported but unused
+     E741, # ambiguous variable name 'l'
+     F821, # undefined name 'x'
+     E266, # too many leading '#' for block comment
.flake8.other ADDED
@@ -0,0 +1,9 @@
+ [flake8]
+ max-line-length = 119
+ select =
+     F541, # f-string without any placeholders
+     F841, # local variable 'x' is assigned to but never used
+     F401, # 'x' imported but unused
+     E741, # ambiguous variable name 'l'
+     F821, # undefined name 'x'
+     E266, # too many leading '#' for block comment
.flake8.speech ADDED
@@ -0,0 +1,9 @@
+ [flake8]
+ max-line-length = 119
+ select =
+     F541, # f-string without any placeholders
+     F841, # local variable 'x' is assigned to but never used
+     F401, # 'x' imported but unused
+     E741, # ambiguous variable name 'l'
+     F821, # undefined name 'x'
+     E266, # too many leading '#' for block comment
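The three flake8 configs are identical; presumably they let separate CI jobs lint different slices of the tree. A hedged usage sketch (the paths are illustrative):

    flake8 --config .flake8 nemo/
    flake8 --config .flake8.speech nemo/collections/asr/   # assumed speech-only lint job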
.gitattributes CHANGED
@@ -33,3 +33,91 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/images/citrinet_vertical.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/images/conf-ensembles-overview.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/images/conformer_ctc.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/images/hat.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/images/hybrid_asr_tts_model.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/images/jasper_vertical.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/images/quartz_vertical.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/images/squeezeformer.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/aosc_3spk_example.gif filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/asr_sd_diagram.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/ats.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/cache_fifo_chunk.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/data_flow.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/e2e_and_cascaded_diar_systems.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/intro_comparison.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/loss_types.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/main_dataflow.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/ms_trade_off.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/msdd_train_and_infer.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/scale_weight_cnn.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/sortformer.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/streaming_steps.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/weighted_sum.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_recognition/images/ICASPP_SpeakerNet.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_recognition/images/titanet_network.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speech_classification/images/marblenet_vertical.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speech_classification/images/matchboxnet_vertical.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speech_intent_slot/images/example.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/core/whyntypes.gif filter=lfs diff=lfs merge=lfs -text
+ docs/source/nlp/entity_linking_overview.jpg filter=lfs diff=lfs merge=lfs -text
+ docs/source/nlp/nemo_megatron/customization_forward.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/nlp/nemo_megatron/customization_module.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/nlp/nemo_megatron/images/ddp.gif filter=lfs diff=lfs merge=lfs -text
+ docs/source/nlp/nemo_megatron/images/pnom.gif filter=lfs diff=lfs merge=lfs -text
+ docs/source/nlp/nemo_megatron/images/pp.gif filter=lfs diff=lfs merge=lfs -text
+ docs/source/nlp/nemo_megatron/images/pp_comm_overlap.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/nlp/nemo_megatron/images/tp1.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/nlp/nemo_megatron/images/tp2.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/nlp/nemo_megatron/images/tp_comm_overlap.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tools/images/scrsh_2.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tools/images/scrsh_9.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tools/images/sde_mls_player.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tools/images/sde_player.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tools/images/sde_samples.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/audiocodec_model.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/data_labeling_pipeline.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/fastpitch_model.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/hifigan_d_model.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/hifigan_g_model.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/mixertts_model.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/radaligner_model.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/radtts_model.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/tacotron2_model.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/univnet_model.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/waveglow_model.png filter=lfs diff=lfs merge=lfs -text
+ nemo/collections/diffusion/assets/mixed_training.png filter=lfs diff=lfs merge=lfs -text
+ nemo/collections/diffusion/assets/pipeline_conditioning.png filter=lfs diff=lfs merge=lfs -text
+ nemo/collections/diffusion/assets/st_dit_hybrid_parallel.png filter=lfs diff=lfs merge=lfs -text
+ output_audio_context.wav filter=lfs diff=lfs merge=lfs -text
+ output_baked.wav filter=lfs diff=lfs merge=lfs -text
+ tools/speech_data_explorer/screenshot.png filter=lfs diff=lfs merge=lfs -text
+ tools/speech_data_simulator/pictures/audio_session.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/asr/images/canary2_timestamps.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/asr/images/multi_instance.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/asr/images/multilang_asr_inference.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/asr/images/multilang_asr_train.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/asr/images/promptformat.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/asr/images/speaker_injection.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/asr/images/test_wer_wandb.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/asr/images/tokenizer.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/llm/llama/domain-adaptive-pretraining/code/imgs/tokenization_diagram.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/llm/qwen/pruning-distillation/imgs/val_loss_comparison.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/affinity_matrix_fusion.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/ats.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/cache_fifo_chunk.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/cascaded_diar_diagram.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/intro_comparison.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/loss_types.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/main_dataflow.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/msdd_inputs.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/msdd_output_loss.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/msdd_train_and_infer.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/multiscale_example.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/sortformer.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/streaming_steps.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/tts/audio_samples/new_dict_entry.wav filter=lfs diff=lfs merge=lfs -text
+ tutorials/tts/audio_samples/phonemes_as_input.wav filter=lfs diff=lfs merge=lfs -text
+ tutorials/tts/images/tacotron2_diagram.png filter=lfs diff=lfs merge=lfs -text
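As a quick sanity check that a newly tracked path resolves to the LFS filter, something like this should work in a checkout (the path is taken from the list above):

    # Prints "filter: lfs" for a matching path.
    git check-attr filter docs/source/asr/images/conformer_ctc.png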
.github/CODEOWNERS ADDED
@@ -0,0 +1,4 @@
+ .github/ @pablo-garay @ko3n1g @thomasdhc @chtruong814
+ docker/Dockerfile.ci @pablo-garay @ko3n1g @thomasdhc @chtruong814
+ .pylintrc.* @pablo-garay @ko3n1g @thomasdhc @chtruong814
+ .flake8.* @pablo-garay @ko3n1g @thomasdhc @chtruong814
.github/ISSUE_TEMPLATE/bug_report.md ADDED
@@ -0,0 +1,42 @@
+ ---
+ name: Bug report
+ about: Create a report to help us improve
+ title: ''
+ labels: bug
+ assignees: ''
+
+ ---
+
+ **Describe the bug**
+
+ A clear and concise description of what the bug is.
+
+ **Steps/Code to reproduce bug**
+
+ Please list *minimal* steps or a code snippet for us to be able to reproduce the bug.
+
+ A helpful guide on how to craft a minimal bug report: http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports.
+
+
+ **Expected behavior**
+
+ A clear and concise description of what you expected to happen.
+
+ **Environment overview (please complete the following information)**
+
+ - Environment location: [Bare-metal, Docker, Cloud (specify cloud provider - AWS, Azure, GCP, Colab)]
+ - Method of NeMo install: [pip install or from source]. Please specify the exact commands you used to install.
+ - If the method of install is [Docker], provide the `docker pull` & `docker run` commands used
+
+ **Environment details**
+
+ If an NVIDIA docker image is used you don't need to specify these.
+ Otherwise, please provide:
+ - OS version
+ - PyTorch version
+ - Python version
+
+ **Additional context**
+
+ Add any other context about the problem here.
+ Example: GPU model
.github/ISSUE_TEMPLATE/config.yml ADDED
@@ -0,0 +1,2 @@
+ blank_issues_enabled: false
+
.github/ISSUE_TEMPLATE/dev_container_bug_report.md ADDED
@@ -0,0 +1,35 @@
+ ---
+ container pulled on date: mm/dd/yyyy
+ name: Dev container - Bug report
+ about: Create a report to help us improve
+ title: ''
+ labels: bug
+ assignees: ''
+
+ ---
+
+ **Describe the bug**
+
+ A clear and concise description of what the bug is.
+
+ **Steps/Code to reproduce bug**
+
+ Please list *minimal* steps or a code snippet for us to be able to reproduce the bug.
+
+ A helpful guide on how to craft a minimal bug report: http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports.
+
+
+ **Expected behavior**
+
+ A clear and concise description of what you expected to happen.
+
+ **Environment overview (please complete the following information)**
+
+ - Environment location: Docker
+ - Method of install: Please specify the exact commands you used to install.
+ - If the method of install is [Docker], provide the `docker pull` & `docker run` commands used
+
+ **Additional context**
+
+ Add any other context about the problem here.
+ Example: GPU model
.github/ISSUE_TEMPLATE/feature_request.md ADDED
@@ -0,0 +1,25 @@
+ ---
+ name: Feature request
+ about: Suggest an idea for this project
+ title: ''
+ labels: feature request
+ assignees: okuchaiev
+
+ ---
+
+ **Is your feature request related to a problem? Please describe.**
+
+ A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+ **Describe the solution you'd like**
+
+ A clear and concise description of what you want to happen.
+ Provide a code snippet on how new APIs/changes would be used by others.
+
+ **Describe alternatives you've considered**
+
+ A clear and concise description of any alternative solutions or features you've considered.
+
+ **Additional context**
+
+ Add any other context or screenshots about the feature request here.
.github/PULL_REQUEST_TEMPLATE.md ADDED
@@ -0,0 +1,57 @@
+ > [!IMPORTANT]
+ > The `Update branch` button must only be pressed on very rare occasions.
+ > An outdated branch is never blocking the merge of a PR.
+ > Please reach out to the automation team before pressing that button.
+
+ # What does this PR do ?
+
+ Add a one-line overview of what this PR aims to accomplish.
+
+ **Collection**: [Note which collection this PR will affect]
+
+ # Changelog
+
+ - Add specific line-by-line info of high-level changes in this PR.
+
+ # Usage
+
+ - You can potentially add a usage example below
+
+ ```python
+ # Add a code snippet demonstrating how to use this
+ ```
+
+ # GitHub Actions CI
+
+ The Jenkins CI system has been replaced by GitHub Actions self-hosted runners.
+
+ The GitHub Actions CI will run automatically when the "Run CICD" label is added to the PR.
+ To re-run CI, remove and add the label again.
+ To run CI on an untrusted fork, a NeMo user with write access must first click "Approve and run".
+
+ # Before your PR is "Ready for review"
+
+ **Pre checks**:
+
+ - [ ] Make sure you read and followed [Contributor guidelines](https://github.com/NVIDIA/NeMo/blob/main/CONTRIBUTING.md)
+ - [ ] Did you write any new necessary tests?
+ - [ ] Did you add or update any necessary documentation?
+ - [ ] Does the PR affect components that are optional to install? (Ex: Numba, Pynini, Apex etc)
+ - [ ] Reviewer: Does the PR have correct import guards for all optional libraries?
+
+ **PR Type**:
+
+ - [ ] New Feature
+ - [ ] Bugfix
+ - [ ] Documentation
+
+ If you haven't finished some of the above items you can still open a "Draft" PR.
+
+ ## Who can review?
+
+ Anyone in the community is free to review the PR once the checks have passed.
+ The [Contributor guidelines](https://github.com/NVIDIA/NeMo/blob/main/CONTRIBUTING.md) list specific people who can review PRs in various areas.
+
+ # Additional Information
+
+ - Related to # (issue)
.github/actions/cancel-workflow/action.yml ADDED
@@ -0,0 +1,25 @@
+ name: Cancel Workflow
+ description: >
+   Cancels the current workflow run, i.e. all jobs. Useful if you want to cancel the rest of the workflow when one job
+   fails. Note that this will cause the workflow to appear cancelled, not failed.
+
+ # Cancelling the workflow in a post-script (like this:
+ # https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runspost; can also be done with
+ # this action: https://github.com/webiny/action-post-run, see Git history of this file) wouldn't help the status, it
+ # would still be cancelled. It actually indeed is, but it would be nicer to set it to failed, but there seems to be no
+ # way to do this.
+
+ runs:
+   using: "composite"
+   steps:
+     - name: Cancel Workflow
+       # # Fork PRs won't have a token with write access to Actions, thus won't be able to cancel the workflow.
+       # if: github.event.pull_request == '' || github.event.pull_request.head.repo.fork == false
+       shell: bash
+       run: |
+         curl --verbose \
+           -X POST \
+           -H "Accept: application/vnd.github+json" \
+           -H "Authorization: Bearer ${{ github.token }}" \
+           -H "X-GitHub-Api-Version: 2022-11-28" \
+           https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/cancel
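For reference, the curl call above should be equivalent to this `gh` invocation; the run id is a placeholder and a token with Actions write access is assumed:

    # Hypothetical manual equivalent of the action's API call.
    gh api -X POST "repos/NVIDIA/NeMo/actions/runs/<run-id>/cancel"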
.github/actions/test-template/action.yml ADDED
@@ -0,0 +1,227 @@
+ # Copyright (c) 2025, NVIDIA CORPORATION.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ name: "Test Template"
+ description: "Template for running NeMo tests in a containerized environment"
+
+ inputs:
+   runner:
+     description: "Runner to use for test"
+     required: true
+   timeout:
+     description: "Max runtime of test in minutes"
+     required: false
+     default: "10"
+   script:
+     description: "Test script to execute"
+     required: true
+   after_script:
+     description: "Script to run after main test"
+     required: false
+     default: ":"
+   is_optional:
+     description: "Failure will cancel all other tests if set to true"
+     required: false
+     default: "false"
+   is_unit_test:
+     description: "Upload coverage as unit test"
+     required: false
+     default: "false"
+   tests_to_run:
+     description: "Tests to run"
+     required: false
+     default: '["all"]'
+   image:
+     description: "Image to use for test"
+     required: false
+     default: "nemo_container"
+   cpu-only:
+     description: "Run tests on CPU only"
+     required: false
+     default: "false"
+ runs:
+   using: "composite"
+   steps:
+     - name: Noop
+       shell: bash
+       run: |
+         chmod -R u+rwX ${{ github.run_id }}
+         echo "noop"
+
+     - name: Docker system cleanup
+       shell: bash
+       run: |
+         docker system prune -af --filter "until=24h" --filter "label!=nemo.pr_number=${{ github.event.pull_request.number || 0 }}" --force || true
+
+     - name: Docker pull image
+       shell: bash
+       run: |
+         docker pull nemoci.azurecr.io/${{ inputs.image }}:${{ github.run_id }}
+
+     - name: Clean repos
+       shell: bash
+       run: |
+
+     - name: Create UUID
+       id: uuid
+       shell: bash
+       run: |
+         echo "id=$(uuidgen)" >> "$GITHUB_OUTPUT"
+
+     - name: Checkout NeMo
+       uses: actions/checkout@v4
+       env:
+         DIR: ${{ github.run_id }}
+       with:
+         path: ${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo
+
+     - name: Start container
+       shell: bash
+       env:
+         DIR: ${{ github.run_id }}
+       run: |
+         mkdir -p $DIR
+
+         # Map of runner names to GPU device configurations
+         declare -A GPU_CONFIGS=(
+           ["myVm-01"]="0,1"
+           ["myVm-02"]="2,3"
+           ["myVm-03"]="4,5"
+           ["myVm-04"]="6,7"
+         )
+
+         ARG=("")
+         if [[ "${{ inputs.cpu-only }}" == "false" ]]; then
+           ARG=("--runtime=nvidia --gpus all")
+         fi
+
+         cmd=$(cat <<RUN_TEST_EOF
+         #!/bin/bash
+         docker container rm -f nemo_container_${{ github.run_id }}_${{ inputs.runner }} || true
+         docker run \
+           --rm \
+           -d \
+           --name nemo_container_${{ github.run_id }}_${{ inputs.runner }} ${ARG[@]} \
+           --shm-size=64g \
+           --env TRANSFORMERS_OFFLINE=0 \
+           --env HYDRA_FULL_ERROR=1 \
+           --env HF_HOME=/home/TestData/HF_HOME \
+           --env RUN_ID=${{ github.run_id }} \
+           --volume $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo:/workspace \
+           --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/${{ inputs.image }}:${{ github.run_id }} \
+           bash -c "sleep $(( ${{ inputs.timeout }} * 60 + 60 ))"
+         RUN_TEST_EOF
+         )
+
+         echo "$cmd" | tee "$DIR/retry_job.sh"
+         bash $DIR/retry_job.sh
+
+     - name: Create run-script
+       id: create
+       env:
+         DIR: ${{ github.run_id }}
+       shell: bash
+       run: |
+         COVERAGE_PREFIX=$([[ "${{ inputs.is_unit_test }}" == "true" ]] && echo "unit-test" || echo "e2e")
+         echo "coverage-prefix=$COVERAGE_PREFIX" | tee -a "$GITHUB_OUTPUT"
+
+         mkdir -p $DIR
+         rm $DIR/.coverage || true
+         rm $DIR/err.log || true
+
+         cmd=$(cat <<RUN_TEST_EOF
+         #!/bin/bash
+
+         (
+           set -e
+
+           docker exec -t nemo_container_${{ github.run_id }}_${{ inputs.runner }} bash -c '\
+             cp -r /opt/Megatron-LM/ /workspace/ && \
+             bash tests/functional_tests/${{ inputs.script }}.sh && \
+             echo "Finished successfully." || echo "Did not finish."'
+         ) 2>&1 | tee $DIR/err.log
+
+         RUN_TEST_EOF
+         )
+
+         echo "timeout_in_seconds=$(( ${{ inputs.timeout }} * 60 ))" | tee -a "$GITHUB_OUTPUT"
+         echo "$cmd" | tee "$DIR/job.sh"
+
+     - name: Run main script
+       uses: nick-fields/retry@v3
+       with:
+         timeout_seconds: ${{ steps.create.outputs.timeout_in_seconds }}
+         max_attempts: 3
+         shell: bash
+         retry_on: timeout
+         command: /bin/bash ${{ github.run_id }}/job.sh
+         on_retry_command: /bin/bash ${{ github.run_id }}/retry_job.sh
+
+     - name: Check result
+       id: check
+       shell: bash
+       env:
+         DIR: ${{ github.run_id }}
+       run: |
+         cat $DIR/err.log
+
+         log=$(tail -c 2000 $DIR/err.log | base64 -w 0)
+         echo "log=$log" >> "$GITHUB_OUTPUT"
+
+         potential_infra_failure=$(cat $DIR/err.log | grep -Eqiw "device" && echo true || echo false)
+         echo "potential_infra_failure=$potential_infra_failure" >> "$GITHUB_OUTPUT"
+
+         docker exec nemo_container_${{ github.run_id }}_${{ inputs.runner }} coverage combine
+         docker exec nemo_container_${{ github.run_id }}_${{ inputs.runner }} coverage xml
+         docker cp nemo_container_${{ github.run_id }}_${{ inputs.runner }}:/workspace/.coverage $DIR/.coverage
+         docker cp nemo_container_${{ github.run_id }}_${{ inputs.runner }}:/workspace/coverage.xml $DIR/coverage.xml
+
+         coverage_report=coverage-${{ steps.create.outputs.coverage-prefix }}-${{ github.run_id }}-$(uuidgen)
+         echo "coverage_report=$coverage_report" >> "$GITHUB_OUTPUT"
+
+         IS_SUCCESS=$(tail -n 1 $DIR/err.log | grep -q "Finished successfully." && echo "true" || echo "false")
+
+         if [[ "$IS_SUCCESS" == "false" && "${{ inputs.is_optional }}" == "true" ]]; then
+           echo "::warning:: Test failed, but displayed as successful because it is marked as optional."
+           IS_SUCCESS=true
+         fi
+
+         if [[ "$IS_SUCCESS" == "false" ]]; then
+           echo Test did not finish successfully.
+           exit 1
+         fi
+
+         exit $EXIT_CODE
+
+     - name: Test coverage
+       shell: bash -x -e -u -o pipefail {0}
+       run: |
+         docker exec -t nemo_container_${{ github.run_id }}_${{ inputs.runner }} coverage report -i
+
+     - name: Upload artifacts
+       uses: actions/upload-artifact@v4
+       if: ${{ steps.check.outputs.coverage_report != 'none' }}
+       with:
+         name: ${{ steps.check.outputs.coverage_report }}
+         path: |
+           ${{ github.run_id }}/coverage.xml
+           ${{ github.run_id }}/.coverage
+         include-hidden-files: true
+
+     - name: Container shutdown
+       if: always()
+       shell: bash
+       run: |
+         docker exec nemo_container_${{ github.run_id }}_${{ inputs.runner }} bash -c "chown -R $(id -u):$(id -g) /workspace"
+         rm -rf $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }} || true
+         docker container rm -f nemo_container_${{ github.run_id }}_${{ inputs.runner }} || true
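A rough sketch of what this action ends up executing per retry attempt; the directory name is the workflow run id and is purely illustrative:

    bash <run-id>/retry_job.sh   # (re)creates the long-lived test container
    bash <run-id>/job.sh         # runs the functional test inside it, teeing err.log
    tail -n 1 <run-id>/err.log   # success is detected by "Finished successfully."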
.github/labeler.yml ADDED
@@ -0,0 +1,55 @@
+ ASR:
+   - nemo/collections/asr/**/*
+   - examples/asr/**/*
+   - tutorials/asr/**/*
+   - docs/source/asr/**/*
+   - tests/collections/asr/**
+
+ NLP:
+   - nemo/collections/nlp/**/*
+   - examples/nlp/**/*
+   - tutorials/nlp/**/*
+   - docs/source/nlp/**/*
+   - tests/collections/nlp/**
+
+ Multi Modal:
+   - nemo/collections/multimodal/**/*
+   - examples/multimodal/**/*
+   - tutorials/multimodal/**/*
+   - docs/source/multimodal/**/*
+   - tests/collections/multimodal/**
+
+ Speaker Tasks:
+   - examples/speaker_tasks/**/*
+   - tutorials/speaker_tasks/**/*
+
+ TTS:
+   - nemo/collections/tts/**/*
+   - nemo/collections/common/tokenizers/text_to_speech/**
+   - examples/tts/**/*
+   - tutorials/tts/**/*
+   - docs/source/tts/**/*
+   - scripts/dataset_processing/tts/**
+   - scripts/tts_dataset_files/**
+   - tests/collections/tts/**
+   - tests/collections/common/tokenizers/text_to_speech/**
+
+ Audio:
+   - nemo/collections/audio/**/*
+   - examples/audio/**/*
+   - tutorials/audio/**/*
+   - docs/source/audio/**/*
+   - tests/collections/audio/**
+
+ core:
+   - nemo/core/**/*
+   - tests/core/**
+
+ common:
+   - nemo/collections/common/**/*
+
+ CI:
+   - .github/**/*
+   - Jenkinsfile
+   - Dockerfile
+   - ci.groovy
.github/scripts/__init__.py ADDED
File without changes
.github/scripts/components_to_run.py ADDED
@@ -0,0 +1,84 @@
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ #!/usr/bin/env python3
+ import json
+ import os
+ import sys
+ from typing import Any, Dict, List, Set
+
+ import click
+ import git
+
+ import nemo_dependencies
+
+
+ def get_changed_files(source_sha: str, target_sha: str) -> List[str]:
+     """
+     Compute the diff between two commits.
+     Returns a list of changed file paths.
+     """
+     try:
+         # Initialize the repo object - go up two levels from this file's location
+         repo = git.Repo(os.path.join(os.path.dirname(__file__), "..", ".."))
+
+         # Get the diff between target and source
+         diff_index = repo.commit(target_sha).diff(repo.commit(source_sha))
+
+         # Get just the changed filenames
+         changed_files = []
+         for diff in diff_index:
+             changed_files.append(diff.a_path if diff.a_path else diff.b_path)
+
+         return changed_files
+
+     except git.exc.GitCommandError as e:
+         print(f"Error fetching changelog: {e}", file=sys.stderr)
+         sys.exit(1)
+     except Exception as e:
+         print(f"Unexpected error: {e}", file=sys.stderr)
+         sys.exit(1)
+
+
+ @click.command()
+ @click.option('--source-sha', type=str, required=True, help='Source commit SHA')
+ @click.option('--target-sha', type=str, required=True, help='Target commit SHA')
+ def main(source_sha: str, target_sha: str):
+     """
+     Main function to fetch the changed files and output the affected test modules.
+     """
+
+     # Output unique changed files
+     print("\nChanged files:")
+     changed_files = get_changed_files(source_sha, target_sha)
+
+     print(json.dumps(sorted(list(changed_files)), indent=2))
+
+     nemo_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+     # Build dependency graph
+     dependencies = nemo_dependencies.build_dependency_graph(nemo_root)
+
+     test_modules: List[str] = []
+     for changed_file in changed_files:
+         if changed_file in dependencies:
+             test_modules.extend(dependencies[changed_file])
+
+     test_modules = list(set(test_modules))
+
+     with open("test_modules.json", "w", encoding="utf-8") as f:
+         json.dump(test_modules, f)
+
+
+ if __name__ == "__main__":
+     main()
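A hedged example invocation; in CI the SHAs would presumably come from the pull-request event payload, and the environment variable names here are placeholders:

    # Assumed to run from the repo root so `import nemo_dependencies` resolves.
    python .github/scripts/components_to_run.py \
      --source-sha "$HEAD_SHA" --target-sha "$BASE_SHA"
    cat test_modules.json   # e.g. ["speech", "unit-tests"]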
.github/scripts/nemo_dependencies.py ADDED
@@ -0,0 +1,400 @@
+ #!/usr/bin/env python3
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """
+ NeMo dependency structure definition.
+ This module analyzes the codebase to determine internal dependencies between NeMo collections and core components.
+ """
+
+ import ast
+ import json
+ import os
+ from typing import Dict, List, Set
+
+
+ def find_python_files(directory: str) -> List[str]:
+     """Find all Python files in the given directory and its subdirectories."""
+     python_files = []
+     # Look in nemo directory and other relevant directories
+     relevant_dirs = ['nemo', 'scripts', 'examples', 'tests']
+
+     for dir_name in relevant_dirs:
+         dir_path = os.path.join(directory, dir_name)
+         if os.path.exists(dir_path):
+             for root, _, files in os.walk(dir_path):
+                 for file in files:
+                     if file.endswith('.py'):
+                         python_files.append(os.path.join(root, file))
+
+     return python_files
+
+
+ def analyze_imports(nemo_root: str, file_path: str) -> Set[str]:
+     """Analyze a Python file and return its NeMo package dependencies using AST parsing."""
+     imports = set()
+     visited = set()  # Track visited modules to prevent circular imports
+
+     def get_init_imports(module_path: str, depth: int = 0) -> Dict[str, str]:
+         """Recursively analyze imports from __init__.py files and map them to their final destinations."""
+         # Prevent infinite recursion
+         if depth > 10 or module_path in visited:  # Limit depth to 10 levels
+             return {}
+
+         visited.add(module_path)
+         init_path = os.path.join(module_path, '__init__.py')
+         if not os.path.exists(init_path):
+             return {}
+
+         try:
+             with open(init_path, 'r', encoding='utf-8') as f:
+                 init_tree = ast.parse(f.read(), filename=init_path)
+
+             import_map = {}
+             for node in ast.walk(init_tree):
+                 if isinstance(node, ast.ImportFrom) and node.module and node.module.startswith('nemo.'):
+                     if node.names:
+                         for name in node.names:
+                             if name.name == '*':
+                                 continue
+
+                             # Get the full module path for the import
+                             module_parts = node.module.split('.')
+                             module_dir = os.path.join(nemo_root, *module_parts)
+
+                             # If the imported module has an __init__.py, recursively analyze it
+                             if os.path.exists(os.path.join(module_dir, '__init__.py')):
+                                 sub_imports = get_init_imports(module_dir, depth + 1)
+                                 if name.name in sub_imports:
+                                     import_map[name.name] = sub_imports[name.name]
+                                 else:
+                                     # If not found in sub-imports, it might be from the module itself
+                                     module_file = os.path.join(module_dir, f"{module_parts[-1]}.py")
+                                     if os.path.exists(module_file):
+                                         import_map[name.name] = f"{node.module}.{name.name}"
+                             else:
+                                 # Direct module import
+                                 import_map[name.name] = f"{node.module}.{name.name}"
+
+             return import_map
+         except Exception as e:
+             print(f"Error analyzing {init_path}: {e}")
+             return {}
+
+     try:
+         with open(file_path, 'r', encoding='utf-8') as f:
+             tree = ast.parse(f.read(), filename=file_path)
+
+         for node in ast.walk(tree):
+             if isinstance(node, ast.ImportFrom) and node.module and node.module.startswith('nemo.'):
+                 # Split the module path
+                 parts = node.module.split('.')
+
+                 if len(parts) == 1:
+                     continue
+
+                 if len(parts) >= 2:
+                     module_type = parts[1]
+
+                     if module_type == 'collections':
+                         if len(parts) == 2:
+                             continue
+                         if node.names:
+                             for name in node.names:
+                                 if name.name == '*':
+                                     continue
+
+                                 # Check if this is an __init__ import
+                                 module_path = os.path.join(nemo_root, *parts)
+                                 init_imports = get_init_imports(module_path)
+
+                                 if name.name in init_imports:
+                                     # Use the mapped import path
+                                     imports.add(init_imports[name.name])
+                                 else:
+                                     imports.add(f"{node.module}.{name.name}")
+
+                     elif module_type in find_top_level_packages(nemo_root):
+                         if node.names:
+                             for name in node.names:
+                                 if name.name == '*':
+                                     continue
+
+                                 # Check if this is an __init__ import
+                                 module_path = os.path.join(nemo_root, *parts)
+                                 init_imports = get_init_imports(module_path)
+
+                                 if name.name in init_imports:
+                                     # Use the mapped import path
+                                     imports.add(init_imports[name.name])
+                                 else:
+                                     imports.add(f"{node.module}.{name.name}")
+
+     except Exception as e:
+         print(f"Error analyzing {file_path}: {e}")
+
+     return imports
+
+
+ def find_top_level_packages(nemo_root: str) -> List[str]:
+     """Find all top-level packages under the nemo directory."""
+     packages: List[str] = []
+     nemo_dir = os.path.join(nemo_root, 'nemo')
+     tests_dir = os.path.join(nemo_root, 'tests')
+
+     if not os.path.exists(nemo_dir):
+         print(f"Warning: nemo directory not found at {nemo_dir}")
+         return packages
+     if not os.path.exists(tests_dir):
+         print(f"Warning: tests directory not found at {tests_dir}")
+         return packages
+
+     for item in os.listdir(nemo_dir) + os.listdir(tests_dir):
+         item_path = os.path.join(nemo_dir, item)
+         if os.path.isdir(item_path) and not item.startswith('__'):
+             packages.append(item)
+
+     return sorted(packages)
+
+
+ def find_collection_modules(nemo_root: str) -> Dict[str, List[str]]:
+     """Find all modules within collections."""
+     collection_modules: Dict[str, List[str]] = {}
+     collections_dir = os.path.join(nemo_root, 'nemo', 'collections')
+
+     if not os.path.exists(collections_dir):
+         print(f"Warning: collections directory not found at {collections_dir}")
+         return collection_modules
+
+     for collection in os.listdir(collections_dir):
+         collection_path = os.path.join(collections_dir, collection)
+         if os.path.isdir(collection_path) and not collection.startswith('__'):
+             collection_modules[f"nemo.collections.{collection}"] = []
+
+     return collection_modules
+
+
+ def build_dependency_graph(nemo_root: str) -> Dict[str, List[str]]:
+     """Build a dependency graph by analyzing all Python files."""
+     # Find all top-level packages
+     top_level_packages = find_top_level_packages(nemo_root)
+     print(f"Found top-level packages: {top_level_packages}")
+
+     dependencies: Dict[str, List[str]] = {}
+
+     for file_path in find_python_files(nemo_root):
+         relative_path = os.path.relpath(file_path, nemo_root)
+
+         parts = relative_path.split(os.sep)
+
+         if len(parts) == 1 or (parts[0] != "nemo" and parts[0] != "tests"):
+             continue
+
+         module_path = relative_path.replace(".py", "").replace("/", ".")
+         if parts[1] in top_level_packages and parts[1] != 'collections' and parts[0] != 'tests':
+             dependencies[module_path] = list(set(analyze_imports(nemo_root, file_path)))
+         elif parts[0] == 'tests':
+             dependencies[module_path] = [relative_path.replace("/", ".").replace(".py", "")]
+         elif parts[1] == 'collections':
+             dependencies[module_path] = list(set(analyze_imports(nemo_root, file_path)))
+
+     # Flip the dependency graph to show reverse dependencies
+     reverse_dependencies: Dict[str, List[str]] = {}
+     # Handle top-level package dependencies
+     for package, deps in dependencies.items():
+         for dep in deps:
+             if dep not in reverse_dependencies:
+                 reverse_dependencies[dep] = []
+             reverse_dependencies[dep].append(package)
+     dependencies = reverse_dependencies
+
+     # Follow and extend records with transitive dependencies
+     transitive_dependencies = dependencies.copy()
+     # Keep iterating until no new dependencies are added
+     while True:
+         changes_made = False
+         new_dependencies = transitive_dependencies.copy()
+
+         # For each package and its direct dependencies
+         for package, deps in transitive_dependencies.items():
+             # For each direct dependency
+             for dep in deps:
+                 # If the dependency has its own dependencies
+                 if dep in transitive_dependencies:
+                     # Add those transitive dependencies to the original package
+                     for transitive_dep in transitive_dependencies[dep]:
+                         if transitive_dep not in new_dependencies[package]:
+                             new_dependencies[package].append(transitive_dep)
+                             changes_made = True
+
+         # Update dependencies with new transitive ones
+         transitive_dependencies = new_dependencies
+
+         # If no new dependencies were added, we're done
+         if not changes_made:
+             break
+
+     dependencies = transitive_dependencies
+
+     # Simplify values: Either top-level package or collection module
+     simplified_dependencies: Dict[str, List[str]] = {}
+     for package, deps in dependencies.items():
+         package_parts = package.split('.')
+
+         if package_parts[0] == "tests":
+             simplified_package_path = f"{os.path.join(*package_parts)}.py"
+         elif os.path.isfile((file_path := f"{os.path.join(*package_parts[:-1])}.py")):
+             simplified_package_path = file_path
+         elif os.path.isdir((file_path := f"{os.path.join(*package_parts[:-1])}")):
+             simplified_package_path = file_path
+         else:
+             simplified_package_path = package
+
+         for dep in deps:
+             dep_parts = dep.split('.')
+
+             if simplified_package_path not in simplified_dependencies:
+                 simplified_dependencies[simplified_package_path] = []
+
+             if (
+                 len(dep_parts) >= 2
+                 and (dep_parts[1] in find_top_level_packages(nemo_root))
+                 and dep_parts[1] != 'collections'
+             ):
+                 simplified_dependencies[simplified_package_path].append(f"{dep_parts[0]}.{dep_parts[1]}")
+             elif dep_parts[0] == "tests":
+                 simplified_dependencies[simplified_package_path].append(".".join(dep_parts))
+             elif len(dep_parts) >= 3 and (
+                 simplified_name := f"nemo.{dep_parts[1]}.{dep_parts[2]}"
+             ) in find_collection_modules(nemo_root):
+                 simplified_dependencies[simplified_package_path].append(simplified_name)
+
+         simplified_dependencies[simplified_package_path].append(package)
+         simplified_dependencies[simplified_package_path] = sorted(
+             list(set(simplified_dependencies[simplified_package_path]))
+         )
+     dependencies = simplified_dependencies
+
+     # Bucket
+     bucket_deps: Dict[str, List[str]] = {}
+     for package, deps in dependencies.items():
+         new_deps = []
+         for dep in deps:
+             if (
+                 "nemo.collections.asr" in dep
+                 or "nemo.collections.tts" in dep
+                 or "nemo.collections.speechlm" in dep
+                 or "nemo.collections.audio" in dep
+                 or "tests.collections.asr" in dep
+                 or "tests.collections.tts" in dep
+                 or "tests.collections.speechlm" in dep
+                 or "tests.collections.audio" in dep
+             ):
+                 new_deps.append("speech")
+                 new_deps.append("unit-tests")
+
+             if "nemo.export" in dep or "nemo.deploy" in dep or "tests.export" in dep or "tests.deploy" in dep:
+                 new_deps.append("export-deploy")
+                 new_deps.append("unit-tests")
+
+             if (
+                 "nemo.collections.llm" in dep
+                 or "nemo.collections.vlm" in dep
+                 or "nemo.automodel" in dep
+                 or "tests.collections.llm" in dep
+                 or "tests.collections.vlm" in dep
+                 or "tests.automodel" in dep
+             ):
+                 new_deps.append("automodel")
+                 new_deps.append("unit-tests")
+
+             if "tests" in dep and "tests.functional_tests" not in dep:
+                 new_deps.append("unit-tests")
+
+             if (
+                 "nemo.collections" in dep
+                 and "nemo.collections.asr" not in dep
+                 and "nemo.collections.tts" not in dep
+                 and "nemo.collections.speechlm" not in dep
+                 and "nemo.collections.audio" not in dep
+                 and "tests.collections.asr" not in dep
+                 and "tests.collections.tts" not in dep
+                 and "tests.collections.speechlm" not in dep
+                 and "tests.collections.audio" not in dep
+             ):
+                 new_deps.append("nemo2")
+                 new_deps.append("unit-tests")
+
+         bucket_deps[package] = sorted(list(set(new_deps)))
+
+     dependencies = bucket_deps
+
+     # Additional dependencies
+     # Add all files in requirements/ directory
+     requirements_dir = os.path.join(nemo_root, "requirements")
+     if os.path.exists(requirements_dir):
+         for filename in os.listdir(requirements_dir):
+             filepath = os.path.join("requirements", filename)
+             relative_path = os.path.relpath(filepath, nemo_root)
+
+             dependencies[relative_path] = [
+                 "nemo2",
+                 "unit-tests",
+                 "speech",
+                 "automodel",
+                 "export-deploy",
+             ]
+
+     # Add all Dockerfile files
+     for root, _, files in os.walk(nemo_root):
+         for file_path in files:
+             full_path = os.path.join(root, file_path)
+             relative_path = os.path.relpath(full_path, nemo_root)
+
+             if "cicd-main-export-deploy" in file_path:
+                 dependencies[relative_path] = ["export-deploy"]
+             if "cicd-main-nemo2" in file_path:
+                 dependencies[relative_path] = ["nemo2"]
+             if "cicd-main-speech" in file_path:
+                 dependencies[relative_path] = ["speech"]
+             if "cicd-main-automodel" in file_path:
+                 dependencies[relative_path] = ["automodel"]
+             if "cicd-main-unit-tests" in file_path:
+                 dependencies[relative_path] = ["unit-tests"]
+             if "Dockerfile" in file_path:
+                 dependencies[relative_path] = ["nemo2", "unit-tests", "speech", "automodel", "export-deploy"]
+
+     # Sort dependencies by length of values (number of dependencies)
+     dependencies = dict(sorted(dependencies.items(), key=lambda x: len(x[1]), reverse=True))
+
+     return dependencies
+
+
+ def main():
+     """Main function to analyze dependencies and output JSON."""
+     # Get the root directory of the NeMo project
+     nemo_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+     # Build dependency graph
+     dependencies = build_dependency_graph(nemo_root)
+
+     # Output as JSON
+     data = json.dumps(dependencies, indent=4)
+
+     with open('nemo_dependencies.json', 'w', encoding='utf-8') as f:
+         f.write(data)
+
+
+ if __name__ == "__main__":
+     main()
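A minimal sketch of running the analysis standalone; the output shape (file path mapped to a list of CI buckets) is inferred from the code above:

    # Assumed to run from the repo root of a NeMo checkout.
    python .github/scripts/nemo_dependencies.py
    jq 'to_entries | .[0]' nemo_dependencies.json   # inspect the entry with the most buckets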
.github/scripts/notify.py ADDED
@@ -0,0 +1,79 @@
+ # Copyright (c) 2025, NVIDIA CORPORATION.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import os
+
+ import requests
+ from github import Github
+
+
+ def send_slack_notification():
+     # Get environment variables
+     gh_token = os.environ.get('GH_TOKEN')
+     webhook_url = os.environ.get('SLACK_WEBHOOK')
+     repository = os.environ.get('REPOSITORY')
+     run_id = os.environ.get('RUN_ID')
+     server_url = os.environ.get('SERVER_URL', 'https://github.com')
+     pr_number = int(os.environ.get('PR_NUMBER', 0))
+     branch_name = os.environ.get('BRANCH_NAME')
+
+     # Get failure info from GitHub API
+     gh = Github(gh_token)
+     repo = gh.get_repo(repository)
+
+     # Get failed jobs
+     failed_jobs = [job.name for job in repo.get_workflow_run(int(run_id)).jobs() if job.conclusion == 'failure']
+
+     if pr_number != 0:
+         pr = repo.get_pull(pr_number)
+
+         title = f"*<{server_url}/{repository}/pull/{pr_number}|PR#{pr_number}>: {pr.title.replace('`', '')}*"
+         author = f"<{server_url}/{pr.user.login}|{pr.user.login}>"
+         branch = f"<{server_url}/{pr.head.repo.full_name}/tree/{pr.head.ref}|{pr.head.ref}>"
+     else:
+         title = f"*Run on <{server_url}/{repository}/tree/{branch_name}|{branch_name}>*"
+         author = "No author"
+         branch = f"<{server_url}/{repository}/tree/{branch_name}|{branch_name}>"
+
+     blocks = [
+         {
+             "type": "section",
+             "text": {
+                 "type": "mrkdwn",
+                 "text": (
+                     f"{title}\n"
+                     f"• Author: {author}\n"
+                     f"• Branch: {branch}\n"
+                     f"• Pipeline: <{server_url}/{repository}/actions/runs/{run_id}|View Run>\n"
+                     f"• Failed Jobs:\n"
+                     + "\n".join(
+                         [
+                             f"  • <{server_url}/{repository}/actions/runs/{run_id}|{job.split('/')[-1]}>"
+                             for job in failed_jobs
+                             if job.split('/')[-1] != 'Nemo_CICD_Test'
+                         ]
+                     )
+                 ),
+             },
+         }
+     ]
+
+     print({"blocks": blocks})
+
+     # Send to Slack
+     response = requests.post(webhook_url, json={"blocks": blocks})
+     response.raise_for_status()
+
+
+ if __name__ == "__main__":
+     send_slack_notification()
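A hedged local dry-run; all values below are placeholders, and a real Slack webhook and GitHub token would be needed:

    export GH_TOKEN=... SLACK_WEBHOOK=... REPOSITORY=NVIDIA/NeMo \
           RUN_ID=1234567890 BRANCH_NAME=main PR_NUMBER=0
    python .github/scripts/notify.py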
.github/workflows/_build_container.yml ADDED
@@ -0,0 +1,89 @@
+ name: ~Build container template
+ on:
+   workflow_call:
+     inputs:
+       image-name:
+         required: true
+         type: string
+         description: "The name of the image to build"
+       dockerfile:
+         required: true
+         type: string
+       runner:
+         required: false
+         default: self-hosted-azure-builder
+         type: string
+         description: "The runner to use for the build"
+
+ jobs:
+   pre-flight:
+     runs-on: ubuntu-latest
+     outputs:
+       build_args: ${{ steps.manifest.outputs.BUILD_ARGS }}
+       cache-from: ${{ steps.cache_from.outputs.LAST_PRS }}
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v4
+
+       - name: Parse manifest.json
+         id: manifest
+         run: |
+           BUILD_ARGS=$(cat << EOF
+           BASE_IMAGE=$(cat requirements/manifest.json | jq -r '."ngc-pytorch"')
+           TRTLLM_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."trt-llm".repo')
+           TRTLLM_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."trt-llm".ref')
+           MLM_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."megatron-lm".repo')
+           MLM_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."megatron-lm".ref')
+           TE_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".transformer_engine.repo')
+           TE_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".transformer_engine.ref')
+           APEX_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".apex.repo')
+           APEX_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".apex.ref')
+           EOF
+           )
+
+           echo "BUILD_ARGS<<EOF" >> $GITHUB_OUTPUT
+           echo "$BUILD_ARGS" >> $GITHUB_OUTPUT
+           echo "EOF" >> $GITHUB_OUTPUT
+
+       - name: Get last merged PR
+         id: cache_from
+         env:
+           GH_TOKEN: ${{ github.token }}
+         run: |
+           LAST_PRS=$(gh api graphql -f query='
+             query {
+               repository(owner: "NVIDIA", name: "NeMo") {
+                 pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) {
+                   nodes {
+                     number
+                   }
+                 }
+               }
+             }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do
+             echo "nemoci.azurecr.io/${{ inputs.image-name }}-buildcache:$number"
+           done)
+
+           echo "LAST_PRS<<EOF" >> $GITHUB_OUTPUT
+           echo "$LAST_PRS" >> $GITHUB_OUTPUT
+           echo "EOF" >> $GITHUB_OUTPUT
+
+   build:
+     uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
+     needs: [pre-flight]
+     with:
+       image-name: ${{ inputs.image-name }}
+       dockerfile: ${{ inputs.dockerfile }}
+       image-label: nemo-core
+       build-args: |
+         IMAGE_LABEL=nemo-core
+         NEMO_TAG=${{ github.sha }}
+         NEMO_REPO=https://github.com/NVIDIA/NeMo
+         PR_NUMBER=${{ github.event.pull_request.number || 0 }}
+         ${{ needs.pre-flight.outputs.build_args }}
+       prune-filter-timerange: 24h
+       use-inline-cache: false
+       cache-from: |
+         nemoci.azurecr.io/${{ inputs.image-name }}-buildcache:main
+         nemoci.azurecr.io/${{ inputs.image-name }}-buildcache:${{ github.event.pull_request.number || 0 }}
+         ${{ needs.pre-flight.outputs.cache-from }}
+       runner: ${{ inputs.runner }}
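The manifest parsing above boils down to jq lookups; a quick sketch to verify the keys locally (requirements/manifest.json must exist in the checkout):

    jq -r '."ngc-pytorch", ."vcs-dependencies"."megatron-lm".ref' requirements/manifest.json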
.github/workflows/_bump_mcore_tag.yml ADDED
@@ -0,0 +1,56 @@
name: ~Bump Megatron Tag template
on:
  workflow_call:
    inputs:
      nemo-target-branch:
        required: true
        type: string
        description: "The NeMo branch to bump"
      mcore-target-branch:
        required: true
        type: string
        description: "The Megatron-LM branch to track"
      # Declared so the `pr-reviewers` reference below resolves to a real input
      pr-reviewers:
        required: false
        type: string
        description: "Reviewers to request on the bump PR"
    secrets:
      PAT:
        required: true

jobs:
  update-branch:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
        with:
          ref: ${{ inputs.nemo-target-branch }}

      - name: Set Git config
        run: |
          git config --local user.email "[email protected]"
          git config --local user.name "Github Actions"

      - name: Merge weekly-bump-${{ inputs.nemo-target-branch }} back to base branch
        env:
          SOURCE_BRANCH: weekly-bump-${{ inputs.nemo-target-branch }}
          TARGET_BRANCH: ${{ inputs.nemo-target-branch }}
        run: |
          if git ls-remote --exit-code origin $SOURCE_BRANCH; then
            git fetch --unshallow
            git checkout $SOURCE_BRANCH
            git pull
            git merge --no-ff $TARGET_BRANCH -m "chore: Auto-merge $TARGET_BRANCH into $SOURCE_BRANCH"
          else
            git checkout -b $SOURCE_BRANCH $TARGET_BRANCH
          fi
          git push -u origin $SOURCE_BRANCH

  mcore:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
    needs: [update-branch]
    with:
      source-repository: NVIDIA/Megatron-LM
      source-ref: ${{ inputs.mcore-target-branch }}
      yaml-path: '."vcs-dependencies"."megatron-lm".ref'
      file: requirements/manifest.json
      base-branch: weekly-bump-${{ inputs.nemo-target-branch }}
      cicd-labels: Run CICD,no-fail-fast
      pr-reviewers: ${{ inputs.pr-reviewers }}
    secrets:
      PAT: ${{ secrets.PAT }}
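The actual tag bump is delegated to the FW-CI-templates `bump.yml` workflow; the `yaml-path` above addresses a nested key in `requirements/manifest.json`. A rough Python equivalent of that single-key update (a sketch only, not what the template itself runs):

    import json

    def bump_mcore_ref(manifest_path, new_ref):
        # Set ."vcs-dependencies"."megatron-lm".ref, leaving everything else untouched.
        with open(manifest_path) as f:
            manifest = json.load(f)
        manifest["vcs-dependencies"]["megatron-lm"]["ref"] = new_ref
        with open(manifest_path, "w") as f:
            json.dump(manifest, f, indent=2)
            f.write("\n")

    # bump_mcore_ref("requirements/manifest.json", "some-new-ref")  # illustrative call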
.github/workflows/build-test-publish-wheel.yml ADDED
@@ -0,0 +1,38 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Build, test, and publish a PyPI wheel (to TestPyPI)

on:
  push:
    branches:
      - main
      - "r**"

defaults:
  run:
    shell: bash -x -e -u -o pipefail {0}

jobs:
  build-test-publish-wheel:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
    with:
      dry-run: true
      python-package: nemo
      python-version: "3.10"
    secrets:
      TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
      TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
      SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
.github/workflows/changelog-build.yml ADDED
@@ -0,0 +1,123 @@
name: 'Changelog Build (Release)'

on:
  workflow_dispatch:
    inputs:
      last-release-tag:
        description: Last Git tag to start from (exclusive) (e.g. `v2.0.0`)
        type: string
        required: true
      release-branch:
        description: Release branch to build changelog on (e.g. `r2.1.0`)
        type: string
        required: true
      changelog-main-content:
        description: Custom changelog content to include before detailed changelogs
        type: string
        required: false
        default: ''

jobs:
  changelog:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout branch
        uses: actions/checkout@v4
        with:
          ref: main
          fetch-depth: 0

      - name: Build Changelog
        id: github_tag
        uses: mikepenz/[email protected]
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          # Configuration file is set up with filters for domains
          # owner:repo must point to the current repo
          # fromTag: auto-resolved from historical tag order (previous tag compared to current tag)
          # toTag: current tag reference
          configuration: ".github/workflows/config/changelog-config.json"
          owner: ${{ github.repository_owner }}
          repo: ${{ github.event.repository.name }}
          ignorePreReleases: "false"
          failOnError: "false"
          fromTag: ${{ inputs.last-release-tag }}
          toTag: ${{ inputs.release-branch }}

      - name: Update changelog file
        env:
          RELEASE_BRANCH: ${{ inputs.release-branch }}
          CHANGELOG: ${{ steps.github_tag.outputs.changelog }}
          MAIN_CONTENT: ${{ inputs.changelog-main-content }}
        shell: bash -x -e -u -o pipefail {0}
        run: |
          RELEASE_VERSION=${RELEASE_BRANCH#r}
          # Demote generated headings by two levels (e.g. `#` -> `###`)
          CHANGELOG=$(echo "$CHANGELOG" | sed '/^[[:blank:]]*#/s/#/###/')

          # Build release notes starting with version header
          RELEASE_NOTES="## NVIDIA Neural Modules $RELEASE_VERSION"

          # Add custom content if provided
          if [ -n "$MAIN_CONTENT" ]; then
            RELEASE_NOTES="$RELEASE_NOTES

          $MAIN_CONTENT"
          fi

          # Add detailed changelogs section
          RELEASE_NOTES="$RELEASE_NOTES

          ### Detailed Changelogs:

          $CHANGELOG"

          # Splice the new notes in right after the marker line
          printf "%s\n" "$RELEASE_NOTES" | sed '/<!-- Next changelog -->/r /dev/stdin' CHANGELOG.md > CHANGELOG.tmp.md

          mv CHANGELOG.tmp.md CHANGELOG.md

      - name: Inspect new changelog file
        run: cat CHANGELOG.md

      - name: Create or update label
        uses: actions/github-script@v6
        with:
          script: |
            const labelName = '${{ inputs.release-branch }}';
            const labelColor = '0366d6'; // Blue color
            const labelDescription = `Release ${labelName}`;

            try {
              // Try to get the label
              await github.rest.issues.getLabel({
                owner: context.repo.owner,
                repo: context.repo.repo,
                name: labelName
              });
              console.log(`Label '${labelName}' already exists`);
            } catch (error) {
              if (error.status === 404) {
                // Label doesn't exist, create it
                await github.rest.issues.createLabel({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  name: labelName,
                  color: labelColor,
                  description: labelDescription
                });
                console.log(`Created label '${labelName}'`);
              } else {
                throw error;
              }
            }

      - name: Create Pull Request
        uses: peter-evans/create-pull-request@v7
        with:
          commit-message: "beep boop: Update changelog"
          title: "Update changelog for `${{ inputs.release-branch }}`"
          signoff: true
          sign-commits: true
          base: main
          branch: bot/chore/update-changelog-into-${{ inputs.release-branch }}
          labels: ${{ inputs.release-branch }}
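The `sed '/<!-- Next changelog -->/r /dev/stdin'` invocation above splices the freshly built release notes into CHANGELOG.md immediately after the marker line. A small Python sketch of the same splice (the marker string comes from the workflow; everything else is illustrative):

    def insert_release_notes(changelog_text, release_notes, marker="<!-- Next changelog -->"):
        out = []
        for line in changelog_text.splitlines(keepends=True):
            out.append(line)
            if line.strip() == marker:
                # `sed .../r` appends the new block right after the matching line.
                out.append(release_notes + "\n")
        return "".join(out)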
.github/workflows/cherry-pick-release-commit.yml ADDED
@@ -0,0 +1,14 @@
name: Create PR to main with cherry-pick from release

on:
  push:
    branches:
      - main

jobs:
  cherry-pick:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
    secrets:
      PAT: ${{ secrets.PAT }}
      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
.github/workflows/cicd-approve-test-queue.yml ADDED
@@ -0,0 +1,175 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Approve Test Queue

on:
  schedule:
    - cron: '*/5 * * * *' # Runs every 5 minutes
  workflow_dispatch: # Allows manual triggering

jobs:
  approve-queue:
    runs-on: ubuntu-latest
    environment: main
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install requests

      - name: Approve waiting deployments
        env:
          GITHUB_TOKEN: ${{ secrets.PAT }}
          MAX_CONCURRENCY: ${{ vars.MAX_CONCURRENCY || 1 }}
        run: |
          python - <<EOF
          import os
          import requests

          # GitHub API configuration
          GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
          REPO = os.environ["GITHUB_REPOSITORY"]
          MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"])
          API_BASE = f"https://api.github.com/repos/{REPO}"

          # Headers for GitHub API
          headers = {
              "Authorization": f"token {GITHUB_TOKEN}",
              "Accept": "application/vnd.github.v3+json",
              "X-GitHub-Api-Version": "2022-11-28",
          }

          def make_request(endpoint, method="GET", data=None):
              """Make a request to the GitHub API with error handling."""
              url = f"{API_BASE}/{endpoint}"
              try:
                  if method == "GET":
                      response = requests.get(url, headers=headers)
                  else:
                      response = requests.post(url, headers=headers, json=data)
                  response.raise_for_status()
                  response_json = response.json()
                  # Surface the pagination link for workflow-run listings
                  if hasattr(response, "links") and "actions/runs?status" in endpoint:
                      response_json["next"] = response.links.get("next", {}).get("url")

                  return response_json
              except requests.exceptions.RequestException as e:
                  print(f"Error making request to {endpoint}: {str(e)}")
                  if hasattr(e.response, 'text'):
                      print(f"Response: {e.response.text}")
                  return None

          def get_workflow_runs(status):
              """Get all workflow runs for a given status, following pagination."""
              all_results = []
              endpoint = f"actions/runs?status={status}"
              while endpoint:
                  response = make_request(endpoint)
                  if not response:
                      break

                  all_results.extend(response.get("workflow_runs", []))
                  endpoint = None
                  next_url = response.get("next")
                  if next_url:
                      endpoint = f"actions/runs?{next_url.split('?')[1]}"

              return all_results

          # Get current running and queued workflows
          print("Fetching workflow runs...")
          queued_workflow_runs = get_workflow_runs("queued")
          in_progress_workflow_runs = get_workflow_runs("in_progress")

          # Count running and queued workflows
          queued_workflows = sum(1 for run in queued_workflow_runs if run["name"] == "CICD NeMo")
          in_progress_workflows = sum(1 for run in in_progress_workflow_runs if run["name"] == "CICD NeMo")

          total_workflows = queued_workflows + in_progress_workflows
          print(f"Current queued workflows: {queued_workflows}")
          print(f"Current running workflows: {in_progress_workflows}")
          print(f"Total workflows: {total_workflows}")
          print(f"Max concurrency: {MAX_CONCURRENCY}")

          if total_workflows >= MAX_CONCURRENCY:
              print("Maximum concurrency reached, no new approvals will be made")
              exit(0)

          # Get waiting CI workflow runs (those blocked on an environment approval)
          print("Fetching deployments...")
          pending_workflows = get_workflow_runs("waiting")
          pending_workflows = [run for run in pending_workflows if run["name"] == "CICD NeMo"]

          # Sort runs by creation date (oldest first)
          print("Sorting workflows...")
          pending_workflows = sorted(pending_workflows, key=lambda x: x["created_at"])

          # Approve pending runs until the concurrency budget is used up
          print("Processing ...")
          for workflow in pending_workflows:
              if total_workflows >= MAX_CONCURRENCY:
                  print("Maximum concurrency reached, stopping approvals")
                  break

              workflow_id = workflow["id"]
              workflow_name = workflow["display_title"]
              print(f"Approving workflow {workflow_name} with Run Id: {workflow_id}")

              deployment_url = f"actions/runs/{workflow_id}/pending_deployments"
              # Guard against a failed request or an empty deployment list
              pending_deployments = make_request(deployment_url)
              if not pending_deployments:
                  print(f"No pending deployments found for run {workflow_id}, skipping")
                  continue
              deployment = pending_deployments[0]
              environment_id = deployment["environment"]["id"]

              # Approve the deployment
              status_data = {
                  "environment_ids": [environment_id],
                  "state": "approved",
                  "comment": "Automatically approved by queue manager"
              }
              result = make_request(deployment_url, method="POST", data=status_data)

              if result:
                  total_workflows += 1
              else:
                  print(f"Failed to approve deployment {deployment['id']}")
                  exit(1)

          EOF

  notify:
    if: failure()
    runs-on: ubuntu-latest
    needs: [approve-queue]
    steps:
      - name: Notify
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
          SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_WEBHOOK_ADMIN }}>
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_REPOSITORY: ${{ github.repository }}
        run: |
          curl -X POST \
            -H 'Content-type: application/json' \
            --data "{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Test-queue-approval-bot workflow> failed. Please review manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \
            $SLACK_WEBHOOK
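The approval policy implemented by the inline script boils down to: count queued plus in-progress "CICD NeMo" runs, then approve waiting runs oldest-first until MAX_CONCURRENCY is reached. Distilled into a pure function for illustration (field names follow the GitHub API objects used above):

    def select_runs_to_approve(pending, queued, in_progress, max_concurrency):
        budget = max_concurrency - (queued + in_progress)
        if budget <= 0:
            return []
        # Oldest waiting runs are approved first.
        return sorted(pending, key=lambda run: run["created_at"])[:budget]

    runs = [{"id": 2, "created_at": "2025-01-02"}, {"id": 1, "created_at": "2025-01-01"}]
    print(select_runs_to_approve(runs, queued=0, in_progress=0, max_concurrency=1))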
.github/workflows/cicd-main-nemo2.yml ADDED
@@ -0,0 +1,299 @@
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: NeMo E2E NeMo2 Tests
on:
  workflow_call:
    inputs:
      test_to_run:
        required: true
        type: string
      image-name:
        required: false
        default: nemo_container_nemo2
        type: string

jobs:
  build:
    uses: ./.github/workflows/_build_container.yml
    with:
      image-name: ${{ inputs.image-name }}
      dockerfile: docker/Dockerfile.ci

  e2e-tests:
    strategy:
      fail-fast: false
      matrix:
        include:
          - script: L2_NeMo_2_GPT_Pretraining_no_transformer_engine
            runner: self-hosted-azure
          - script: L2_NeMo_2_llama3_pretraining_recipe
            runner: self-hosted-azure
          # - script: L2_NeMo_2_llama3_pytorch_profiler
          #   runner: self-hosted-azure
          #   timeout: 20
          - script: L2_NeMo_2_llama3_fault_tolerance_plugin
            runner: self-hosted-azure
          - script: L2_NeMo_2_llama3_straggler_detection
            runner: self-hosted-azure
          - script: L2_NeMo_2_llama3_local_ckpt
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_DDP_Param_Parity_check
            runner: self-hosted-azure
          - script: L2_NeMo_2_Hyena_Conversion_from_HF
            runner: self-hosted-azure
          - script: L2_NeMo_2_Hyena_DDP_Pretraining_Test
            runner: self-hosted-azure
          - script: L2_NeMo_2_Hyena_Mixer_Test
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_Hyena_PP_Pretraining_Test
            runner: self-hosted-azure
          - script: L2_NeMo_2_Hyena_TP_Pretraining_Test
            runner: self-hosted-azure
          - script: L2_NeMo_2_Hyena_CP_Pretraining_Test
            runner: self-hosted-azure
          - script: L2_NeMo_2_SSM_Pretraining
            runner: self-hosted-azure
          - script: L2_NeMo_2_SSM_Finetuning
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_HF_MODEL_IMPORT
            runner: self-hosted-azure
          - script: L2_NeMo_2_jit_callback
            runner: self-hosted-azure
          - script: L2_NeMo_2_T5_Pretraining
            runner: self-hosted-azure
          - script: L2_NeMo_2_T5_MockData_Pretraining
            runner: self-hosted-azure
          - script: L2_NeMo_2_T5_Finetuning
            runner: self-hosted-azure
          - script: L2_NeMo_2_T5_Squad
            runner: self-hosted-azure
          - script: L2_NeMo_2_T5_LoRA
            runner: self-hosted-azure
          - script: L2_NeMo_2_BERT_Pretraining_Megatron
            runner: self-hosted-azure
          - script: L2_NeMo_2_BERT_Pretraining_HuggingFace
            runner: self-hosted-azure
          - script: L2_NeMo_2_NEVA_MOCK_PRETRAIN_TP2
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_NEVA_MOCK_PRETRAIN_PP2
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_NEVA_MOCK_PRETRAIN_CP2
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_NEVA_MOCK_FINETUNE_TP2
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_NEVA_ENERGON_FINETUNE_TP2
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_NEVA_MOCK_FINETUNE_PP2
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_NEVA_MOCK_FINETUNE_CP2
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_NEVA_PRELOADED_FINETUNE_PP2_SEQPACK_PAD
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_NEVA_PRELOADED_FINETUNE_PP2_SEQPACK_TRUNC
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_NEVA_LOAD_GENERATE
            runner: self-hosted-azure-gpus-1
          - script: L2_NeMo_2_LLAVA_IMPORT
            runner: self-hosted-azure-gpus-1
          - script: L2_NEMO_2_MLLAMA_Inference
            runner: self-hosted-azure-gpus-1
          - script: L2_NeMo_2_MLLAMA_MOCK_FINETUNE_TP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_MLLAMA_PRELOADED_FINETUNE_TP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_MLLAMA_ENERGON_FINETUNE_TP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_MLLAMA_IMPORT
            runner: self-hosted-azure-gpus-1
          - script: L2_NeMo_2_Mixtral_Pretraining
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_SFT_TP1PP1_MBS1
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_SFT_TP1PP1_MBS2
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_SFT_TP1PP2_MBS2
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_SFT_TP2PP1_MBS2
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_SFT_TP1PP1_MBS1_PACKED
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_LoRA_TP1PP1_MBS2
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_LoRA_TP1PP2_MBS2
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_LoRA_TP2PP1_MBS2
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_DoRA_TP1PP1_MBS1_PACKED
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_CLoRA_TP1PP1_MBS1_PACKED
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_Chat
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_TE_op_fuser
            runner: self-hosted-azure
          - script: L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2_exclude
            runner: self-hosted-azure
          - script: L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2
            runner: self-hosted-azure
          - script: L2_NeMo_2_Mixtral_LoRA_TP1PP1_MBS1
            runner: self-hosted-azure
          - script: L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1
            runner: self-hosted-azure
          - script: L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1
            runner: self-hosted-azure
          - script: L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1_exclude
            runner: self-hosted-azure
          - script: L2_NeMo_2_Mistral_LoRA_TP2PP1_MBS1
            runner: self-hosted-azure
          - script: L2_NEMO_2_LoRA_MERGE
            runner: self-hosted-azure
          - script: L2_NEMO_2_LoRA_Inference
            runner: self-hosted-azure-gpus-1
          - script: L2_NeMo_2_NeMo_Mcore_Mixtral_bitexact
            runner: self-hosted-azure
            is-optional: true
          - script: L2_NeMo_2_PTQ_Llama2_FP8_trtllm
            runner: self-hosted-azure
          - script: L2_NeMo_2_PTQ_Llama2_FP8_nemo
            runner: self-hosted-azure
          - script: L2_NeMo_2_Distill_Llama3_TP1PP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_Prune_Llama_TP1PP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_Speculative_Llama3_TP2PP1
            runner: self-hosted-azure
          - script: L2_NeMo_2_LLAVA_NEXT_MOCK_TRAINING
            runner: self-hosted-azure
          - script: L2_NeMo_2_LLAVA_NEXT_HF_CONVERSION
            runner: self-hosted-azure
          - script: L2_NeMo_2_LLAVA_NEXT_ENERGON_TRAIN
            runner: self-hosted-azure
          - script: L2_NeMo_2_LLAVA_NEXT_ENERGON_PACKED_TRAIN
            runner: self-hosted-azure
          - script: L2_NeMo_2_AVLM_MOCK_TRAINING
            runner: self-hosted-azure
          - script: L2_NeMo_2_AVLM_ENERGON_TRAIN
            runner: self-hosted-azure
          - script: L2_NeMo_2_AVLM_ENERGON_CP2_TRAIN
            runner: self-hosted-azure
          - script: L2_NeMo_2_CLIP_PRETRAIN
            runner: self-hosted-azure
            timeout: 20
          - script: L2_NeMo_2_CLIP_INFER
            runner: self-hosted-azure
          - script: L2_NeMo_2_Auto_Configurator_llama_TP1_PP1_MBS124
            runner: self-hosted-azure-gpus-1
          - script: L2_NeMo_2_Auto_Configurator_bert_TP1_PP1_MBS124
            runner: self-hosted-azure-gpus-1
          - script: L2_NeMo_2_Auto_Configurator_t5_TP1_PP1_MBS124
            runner: self-hosted-azure-gpus-1
          - script: L2_NeMo_2_Auto_Configurator_callbacks
            runner: self-hosted-azure-gpus-1
          - script: L2_NeMo_2_Conversion_Test_Baichuan2
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_ChatGLM
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_DeepSeek
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Gemma
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Gemma2
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Gemma3_llm
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Gemma3_vlm
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Mistral
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Llama
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Llama_Embedding
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Llama4
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Llama4_Text
            runner: self-hosted-azure
          - script: L2_NeMo_2_PTQ_Llama4_FP8_nemo
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Nemotron
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Nemotron_H_4B
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Phi3Mini
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Qwen2
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Qwen3
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Starcoder
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Starcoder2
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_BERT
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_T5
            runner: self-hosted-azure
          - script: L2_NeMo_2_QWEN2VL_MOCK_FINETUNE_TP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_QWEN2VL_PRELOADED_FINETUNE_TP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_QWEN2VL_ENERGON_FINETUNE_TP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_LLAMA4_MOCK_FINETUNE_PP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_LLAMA4_MOCK_FINETUNE_CP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_LLAMA4_ENERGON_FINETUNE_EP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_Diffusion_Recipe_Test
            runner: self-hosted-azure
          - script: L2_NeMo_2_Diffusion_Taskencoder_Test
            runner: self-hosted-azure
          - script: L2_NeMo_2_Flux_Import_Test
            runner: self-hosted-azure
            is-optional: true
          - script: L2_NeMo_2_Flux_Inference_Test
            runner: self-hosted-azure
          - script: L2_NeMo_2_Flux_Training_DDP_Test
            runner: self-hosted-azure
          - script: L2_NeMo_2_Flux_Training_FSDP_Test
            runner: self-hosted-azure
          - script: L2_NeMo_2_Flux_ControlNet_Training_DDP_Test
            runner: self-hosted-azure
          - script: L2_NeMo_2_Flux_ControlNet_Training_FSDP_Test
            runner: self-hosted-azure
            is-optional: true

    needs: [build]
    runs-on: ${{ matrix.runner }}
    name: ${{ matrix.is-optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          path: ${{ github.run_id }}
      - name: main
        uses: NVIDIA/NeMo/.github/actions/test-template@main
        with:
          runner: ${{ runner.name }}
          script: ${{ matrix.script }}
          tests_to_run: ${{ inputs.test_to_run }}
          image: ${{ inputs.image-name }}
          is_optional: ${{ matrix.is-optional || false }}
          timeout: ${{ matrix.timeout || 10 }}
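Each matrix entry above only has to name a `script` and a `runner`; the timeout, optional flag, and the `PLEASEFIXME_` display prefix fall back to defaults via the `||` expressions. A Python sketch of that expansion, for illustration only:

    def expand_matrix(include, default_timeout=10):
        jobs = []
        for entry in include:
            optional = entry.get("is-optional", False)
            jobs.append({
                # Optional jobs get a prefix so their failures are easy to spot.
                "name": ("PLEASEFIXME_" if optional else "") + entry["script"],
                "runner": entry["runner"],
                "timeout": entry.get("timeout", default_timeout),
                "is_optional": optional,
            })
        return jobs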
.github/workflows/cicd-main-speech.yml ADDED
@@ -0,0 +1,216 @@
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: NeMo E2E Speech Tests
on:
  workflow_call:
    inputs:
      test_to_run:
        required: true
        type: string
      image-name:
        required: false
        default: nemo_container_speech
        type: string

jobs:
  build:
    uses: ./.github/workflows/_build_container.yml
    with:
      image-name: ${{ inputs.image-name }}
      dockerfile: docker/Dockerfile.ci

  unit-tests:
    strategy:
      fail-fast: false
      matrix:
        include:
          - script: L0_Unit_Tests_GPU_ASR
            runner: self-hosted-azure-gpus-1
            timeout: 30
          - script: L0_Unit_Tests_CPU_ASR
            runner: azure-gpu-vm-runner1-cpu
            cpu-only: true
            timeout: 30
          - script: L0_Unit_Tests_GPU_TTS
            runner: self-hosted-azure-gpus-1
          - script: L0_Unit_Tests_CPU_TTS
            runner: self-hosted-azure-cpu
            cpu-only: true
          - script: L0_Unit_Tests_GPU_Audio
            runner: self-hosted-azure-gpus-1
          - script: L0_Unit_Tests_CPU_Audio
            runner: self-hosted-azure-cpu
            cpu-only: true
          - script: L0_Unit_Tests_GPU_SpeechLM2
            runner: self-hosted-azure-gpus-1
            timeout: 20
          - script: L0_Unit_Tests_CPU_SpeechLM2
            runner: self-hosted-azure-cpu
            cpu-only: true
            timeout: 20
    needs: [build]
    runs-on: ${{ matrix.runner }}
    name: ${{ matrix.script }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          path: ${{ github.run_id }}
      - name: main
        uses: NVIDIA/NeMo/.github/actions/test-template@main
        with:
          runner: ${{ runner.name }}
          script: ${{ matrix.script }}
          is_unit_test: true
          tests_to_run: ${{ inputs.test_to_run }}
          image: ${{ inputs.image-name }}
          timeout: ${{ matrix.timeout || 10 }}
          cpu-only: ${{ matrix.cpu-only || false }}
          is_optional: ${{ matrix.is-optional || false }}

  e2e-tests:
    strategy:
      fail-fast: false
      matrix:
        include:
          - script: ASR_dev_run_Speech_to_Text
            runner: self-hosted-azure-gpus-1
          - script: ASR_dev_run_Speech_to_Text_WPE_CitriNet
            runner: self-hosted-azure-gpus-1
          - script: ASR_dev_run_Speech_Pre-training_-_CitriNet
            runner: self-hosted-azure-gpus-1
          - script: Optional_ASR_dev_run_Speech_To_Text_Finetuning
            runner: self-hosted-azure-gpus-1
            is-optional: true
          - script: Optional_ASR_dev_run_Speech_To_Text_HF_Finetuning
            runner: self-hosted-azure-gpus-1
            is-optional: true
          - script: ASR_dev_run_Speech_to_Text_WPE_-_Conformer
            runner: self-hosted-azure-gpus-1
          - script: ASR_dev_run_Speech_to_Text_Hybrid_RNNT_CTC_Prompt
            runner: self-hosted-azure-gpus-1
          - script: ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer
            runner: self-hosted-azure-gpus-1
          - script: L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader
            runner: self-hosted-azure-gpus-1
          - script: L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader
            runner: self-hosted-azure-gpus-1
          - script: L2_ASR_Adapters_Linear_Adapters
            runner: self-hosted-azure-gpus-1
          - script: L2_ASR_Adapters_RelPos_MHA_Adapters
            runner: self-hosted-azure-gpus-1
          - script: L2_Speech_to_Text_EMA
            runner: self-hosted-azure
          - script: L2_Speech_to_Text_AED
            runner: self-hosted-azure-gpus-1
          - script: L2_Speaker_dev_run_Speech_to_Label
            runner: self-hosted-azure-gpus-1
          - script: L2_Speech_Estimate_Duration_Bins
            runner: self-hosted-azure
          - script: L2_Speech_Batch_Size_OOMptimizer
            runner: self-hosted-azure
          - script: Optional_L2_Speech_Batch_Size_OOMptimizer_Canary
            runner: self-hosted-azure
            is-optional: true
          - script: L2_Speech_Transcription_Speech_to_Text_Transcribe
            runner: self-hosted-azure
          - script: L2_Speech_Transcription_Speech_to_Text_Streaming_Infer
            runner: self-hosted-azure
          - script: L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer
            runner: self-hosted-azure
          - script: L2_Speech_Transcription_Streaming_Inference
            runner: self-hosted-azure
          - script: L2_Speech_Transcription_Canary_Transcribe_Full_Manifest
            runner: self-hosted-azure
          - script: L2_Speech_Transcription_Canary_Transcribe_With_Prompt
            runner: self-hosted-azure
          - script: L2_Speech_Transcription_Canary_Transcribe_Audio_Dir
            runner: self-hosted-azure
          - script: L2_Speech_Transcription_Canary_Streaming_Full_Manifest
            runner: self-hosted-azure
          - script: L2_Longform_Speech_Transcription_Canary_Chunked_Infer_from_Audio_Dir
            runner: self-hosted-azure
          - script: L2_Longform_Speech_Transcription_with_TimeStamps_Canary_Chunked_Infer_from_Audio_Dir
            runner: self-hosted-azure
          - script: L2_Longform_Speech_Transcription_with_TimeStamps_Canary_Chunked_Infer_from_Manifest
            runner: self-hosted-azure
          - script: Speech_Checkpoints_tests
            runner: self-hosted-azure-gpus-1
            timeout: 20
          - script: L2_Speaker_dev_run_Speaker_Recognition
            runner: self-hosted-azure-gpus-1
          - script: L2_Speaker_dev_run_Speaker_Diarization
            runner: self-hosted-azure-gpus-1
          - script: L2_Speaker_dev_run_EndtoEnd_Speaker_Diarization_Sortformer
            runner: self-hosted-azure-gpus-1
          - script: L2_Speaker_dev_run_EndtoEnd_Diarizer_Inference
            runner: self-hosted-azure
          - script: L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference
            runner: self-hosted-azure
          - script: L2_Speaker_dev_run_Clustering_Diarizer_Inference
            runner: self-hosted-azure
          - script: L2_Speaker_dev_run_Neural_Diarizer_Inference
            runner: self-hosted-azure
          - script: L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation
            runner: self-hosted-azure
          - script: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav
            runner: self-hosted-azure
          - script: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3
            runner: self-hosted-azure
          - script: L2_SpeechLM_LoRA_TP1PP1_MBS2
            runner: self-hosted-azure
          - script: L2_TTS_Fast_dev_runs_1_Tacotron_2
            runner: self-hosted-azure-gpus-1
          - script: L2_TTS_Fast_dev_runs_1_WaveGlow
            runner: self-hosted-azure
          - script: L2_TTS_Fast_dev_runs_1_FastPitch
            runner: self-hosted-azure
          - script: L2_TTS_Fast_dev_runs_1_Hifigan
            runner: self-hosted-azure
          - script: L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference
            runner: self-hosted-azure
          - script: SPEECHLM_HF_Training_DuplexS2S
            runner: self-hosted-azure
          - script: SPEECHLM_HF_Training_DuplexS2SSpeechDecoder
            runner: self-hosted-azure
          - script: SPEECHLM_HF_Training_SALM
            runner: self-hosted-azure
            timeout: 20
          - script: L2_TTS_Fast_dev_runs_Magpietts_DecoderContext
            runner: self-hosted-azure
          - script: L2_TTS_Fast_dev_runs_Magpietts_MultiEncoder
            runner: self-hosted-azure
          - script: L2_TTS_Fast_dev_runs_Magpietts_OnlinePO
            runner: self-hosted-azure
          - script: L2_TTS_InferEvaluate_Magpietts_ZeroShot
            runner: self-hosted-azure
          - script: L2_TTS_InferEvaluate_Magpietts_SeenSpeakers
            runner: self-hosted-azure

    needs: [unit-tests]
    runs-on: ${{ matrix.runner }}
    name: ${{ matrix.is-optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          path: ${{ github.run_id }}
      - name: main
        uses: NVIDIA/NeMo/.github/actions/test-template@main
        with:
          runner: ${{ runner.name }}
          script: ${{ matrix.script }}
          tests_to_run: ${{ inputs.test_to_run }}
          image: ${{ inputs.image-name }}
          timeout: ${{ matrix.timeout || 10 }}
          is_optional: ${{ matrix.is-optional || false }}
.github/workflows/cicd-main-testcopy.yml ADDED
@@ -0,0 +1,472 @@
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: "[debug] CICD NeMo"
on:
  schedule:
    - cron: 0 0 * * *
    - cron: "*/5 * * * *" # Runs every 5 minutes
  push:
    branches:
      - main
  workflow_dispatch:
    inputs:
      test_to_run:
        required: false
        default: all
        type: string
        description: Comma-separated list of tests to run. Use "all" to run the full test suite.

jobs:
  pre-flight:
    runs-on: ubuntu-latest
    outputs:
      test_to_run: ${{ steps.test_to_run.outputs.main }}
      is_ci_workload: ${{ steps.is_ci_workload.outputs.main }}
      no_fail_fast: ${{ steps.no_fail_fast.outputs.main }}
      components_to_run: ${{ steps.components_to_run.outputs.main }}
    env:
      TESTS_TO_RUN: ${{ inputs.test_to_run }}
      EVENT_NAME: ${{ github.event_name }}
      HAS_LABEL: ${{ github.event.label.name == 'Run CICD' }}
    steps:
      - name: Checkout branch
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Select components to run
        id: components_to_run
        run: |
          pip install -U pip
          pip install git-python

          if [[ "$EVENT_NAME" == "pull_request" ]]; then
            python .github/scripts/components_to_run.py --source-sha ${{ github.event.pull_request.head.sha }} --target-sha ${{ github.event.pull_request.base.sha }}
          else
            echo '["nemo2", "automodel", "export-deploy", "speech"]' | tee -a test_modules.json
          fi

          components_to_run=$(cat test_modules.json)

          echo "main=${components_to_run}" | tee -a "$GITHUB_OUTPUT"

      - name: Select tests to run
        id: test_to_run
        run: |
          # For manual dispatch, keep the user-provided list as-is
          if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
            TESTS_TO_RUN=$TESTS_TO_RUN

          # For a correctly labeled PR, run the full test suite
          elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" == "true" ]]; then
            TESTS_TO_RUN=all

          # For an incorrectly labeled PR, run no tests
          elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" != "true" ]]; then
            TESTS_TO_RUN=""

          # For push events, run all tests. This is so that we can generate coverage
          # on branch `main`.
          elif [[ "$EVENT_NAME" == "push" || "$EVENT_NAME" == "schedule" ]]; then
            TESTS_TO_RUN=all

          else
            echo "Unsupported event_name $EVENT_NAME provided."
            exit 1
          fi

          parsed_string=$(echo "$TESTS_TO_RUN" | jq -c --raw-input 'split(",")')
          echo "main=${parsed_string}" | tee -a "$GITHUB_OUTPUT"

      - name: Check if this is a CI workload
        shell: bash
        id: is_ci_workload
        run: |
          branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}

          if [[ "$branch_name" =~ ^bump-ci-container || "$EVENT_NAME" == "schedule" ]]; then
            is_ci_workload=true
          else
            is_ci_workload=false
          fi

          echo "main=$is_ci_workload" | tee -a "$GITHUB_OUTPUT"

      - name: Check if no-fail-fast is set
        shell: bash
        id: no_fail_fast
        env:
          HAS_FAIL_FAST_LABEL: ${{ contains(github.event.pull_request.labels.*.name, 'no-fail-fast') }}
        run: |
          if [[ "$HAS_FAIL_FAST_LABEL" == "true" || "$EVENT_NAME" == "schedule" ]]; then
            no_fail_fast=true
          else
            no_fail_fast=false
          fi

          echo "main=$no_fail_fast" | tee -a "$GITHUB_OUTPUT"

  code-linting:
    if: needs.pre-flight.outputs.test_to_run != '[]'
    needs: [pre-flight]
    uses: ./.github/workflows/code-linting.yml

  cicd-wait-in-queue:
    needs: [pre-flight]
    runs-on: ubuntu-latest
    environment: test
    if: |
      needs.pre-flight.outputs.test_to_run != '[]'
      && needs.pre-flight.outputs.is_ci_workload == 'false'
    steps:
      - name: Running CI tests
        run: |
          echo "Running CI tests"

  cicd-test-container-build:
    uses: ./.github/workflows/_build_container.yml
    needs: [pre-flight, code-linting, cicd-wait-in-queue]
    if: |
      needs.pre-flight.outputs.test_to_run != '[]'
      && (
        success()
        || (
          needs.cicd-wait-in-queue.result == 'skipped'
          && needs.pre-flight.outputs.is_ci_workload == 'true'
        )
      )
      && !cancelled()
    with:
      image-name: nemo_container
      dockerfile: docker/Dockerfile.ci

  # cicd-import-tests:
  #   if: |
  #     needs.pre-flight.outputs.test_to_run != '[]'
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   needs: [cicd-test-container-build, pre-flight]
  #   runs-on: self-hosted-azure-gpus-1
  #   steps:
  #     - name: Create UUID
  #       id: uuid
  #       run: |
  #         echo "id=$(uuidgen)" >> "$GITHUB_OUTPUT"

  #     - name: Checkout NeMo
  #       uses: actions/checkout@v2
  #       with:
  #         repository: NVIDIA/NeMo
  #         path: ${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo

  #     - name: Run some checks
  #       run: |
  #         docker run \
  #           --rm \
  #           --device=/dev/nvidia0 \
  #           --gpus all \
  #           --shm-size=8g \
  #           --volume $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo:/workspace \
  #           --env TRANSFORMERS_OFFLINE=0 \
  #           --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 nemoci.azurecr.io/nemo_container:${{ github.run_id }} bash -c '\
  #           # PyTorch Lightning version
  #           python -c "import lightning.pytorch; print(lightning.pytorch.__version__)"

  #           # PyTorch Lightning DDP Checks
  #           CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py"

  #           # Basic Import Checks
  #           python tests/core_ptl/check_imports.py --domain asr
  #           python tests/core_ptl/check_imports.py --domain nlp
  #           python tests/core_ptl/check_imports.py --domain tts
  #           '

  # L0_Setup_Test_Data_And_Models:
  #   needs: [pre-flight, cicd-test-container-build, cicd-wait-in-queue]
  #   runs-on: self-hosted-azure
  #   if: |
  #     needs.pre-flight.outputs.test_to_run != '[]'
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   steps:
  #     - name: Checkout
  #       uses: actions/checkout@v4
  #       with:
  #         path: ${{ github.run_id }}

  #     - name: main
  #       uses: NVIDIA/NeMo/.github/actions/test-template@main
  #       with:
  #         runner: ${{ runner.name }}
  #         script: L0_Setup_Test_Data_And_Models
  #         tests_to_run: '["L0_Setup_Test_Data_And_Models"]'

  # cicd-main-unit-tests:
  #   needs: [pre-flight, cicd-test-container-build]
  #   uses: ./.github/workflows/cicd-main-unit-tests.yml
  #   if: |
  #     needs.pre-flight.outputs.test_to_run != '[]'
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   with:
  #     test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}

  # cicd-main-export-deploy:
  #   needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
  #   uses: ./.github/workflows/cicd-main-export-deploy.yml
  #   if: |
  #     (
  #       needs.pre-flight.outputs.test_to_run != '[]'
  #       && (
  #         contains(fromJson(needs.pre-flight.outputs.components_to_run), 'export-deploy')
  #       )
  #     )
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   with:
  #     test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}

  # cicd-main-speech:
  #   needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
  #   uses: ./.github/workflows/cicd-main-speech.yml
  #   if: |
  #     (
  #       needs.pre-flight.outputs.test_to_run != '[]'
  #       && (
  #         contains(fromJson(needs.pre-flight.outputs.components_to_run), 'speech')
  #       )
  #     )
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   with:
  #     test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}

  # cicd-main-automodel:
  #   needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
  #   uses: ./.github/workflows/cicd-main-automodel.yml
  #   if: |
  #     (
  #       needs.pre-flight.outputs.test_to_run != '[]'
  #       && (
  #         contains(fromJson(needs.pre-flight.outputs.components_to_run), 'automodel')
  #       )
  #     )
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   with:
  #     test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}

  # cicd-main-nemo2:
  #   needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
  #   uses: ./.github/workflows/cicd-main-nemo2.yml
  #   if: |
  #     (
  #       needs.pre-flight.outputs.test_to_run != '[]'
  #       && (
  #         contains(fromJson(needs.pre-flight.outputs.components_to_run), 'nemo2')
  #         || needs.pre-flight.outputs.components_to_run == '["all"]'
  #       )
  #     )
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   with:
  #     test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}

  Nemo_CICD_Test_Debug:
    needs:
      - pre-flight
      - cicd-test-container-build
      # - cicd-import-tests
      # - L0_Setup_Test_Data_And_Models
      # - cicd-main-unit-tests
      # - cicd-main-nemo2
      # - cicd-main-export-deploy
      # - cicd-main-automodel
      # - cicd-main-speech
    if: always()
    runs-on: ubuntu-latest
    permissions: write-all
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Get workflow result
        id: result
        env:
          GH_TOKEN: ${{ github.token }}
          RUN_ID: ${{ github.run_id }}
        run: |
          # Get workflow run details and check job conclusions
          NUM_FAILED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "failure") | .name] | length')
          NUM_CANCELLED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "cancelled") | .name] | length')

          if [[ $NUM_FAILED -eq 0 && $NUM_CANCELLED -eq 0 ]]; then
            RESULT="success"
          else
            RESULT="failure"
          fi

          # Output the final status
          echo "code=$RESULT" | tee -a $GITHUB_OUTPUT

      - name: Checkout for GH CLI
        uses: actions/checkout@v4

      - name: Remove label if not cancelled
        if: ${{ steps.result.outputs.code != 'cancelled' && github.event.label.name == 'Run CICD' && github.event.pull_request.head.repo.full_name == github.repository }}
        env:
          GH_TOKEN: ${{ github.token }}
          PR_NUMBER: ${{ github.event.number }}
        run: gh pr edit "$PR_NUMBER" --remove-label "Run CICD"

      - name: Pipeline successful, add PR comment
        if: ${{ always() && steps.result.outputs.code == 'success' && github.event_name == 'pull_request' && env.SLACK_WEBHOOK != '' }}
        uses: peter-evans/create-or-update-comment@v4
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
          REPOSITORY: ${{ github.repository }}
          RUN_ID: ${{ github.run_id }}
        with:
          issue-number: ${{ github.event.number }}
          body: |
            [🤖]: Hi @${{ github.event.pull_request.user.login }} 👋,

            We wanted to let you know that a [CICD pipeline](https://github.com/${{ env.REPOSITORY }}/actions/runs/${{ env.RUN_ID }}) for this PR just finished successfully.

            So it might be time to merge this PR or get some approvals.

            Due to a major CI change, merges are currently handled by the automation team.
            We will reach out to you quickly to merge this PR, but you can always reach us with the following handles:

            //cc @chtruong814 @ko3n1g @pablo-garay @thomasdhc

      - name: "Pipeline not successful and not cancelled: Send Slack alert & create step summary"
        if: ${{ always() && steps.result.outputs.code == 'failure' && env.SLACK_WEBHOOK != '' }}
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPOSITORY: ${{ github.repository }}
          RUN_ID: ${{ github.run_id }}
          PR_NUMBER: ${{ github.event.number }}
          SERVER_URL: ${{ github.server_url }}
        run: |
          set -x
          pip install PyGithub
          export BRANCH_NAME=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}

          python .github/scripts/notify.py

      - name: Exit
        if: ${{ always() }}
        env:
          RESULT: ${{ steps.result.outputs.code }}
        run: |
          if [ "$RESULT" == "success" ]; then
            exit 0
          else
            exit 1
          fi

  Coverage:
    runs-on: ubuntu-latest
    needs: [Nemo_CICD_Test_Debug]
    strategy:
      matrix:
        flag: [unit-test, e2e]
    if: |
      (
        success()
        || needs.Nemo_CICD_Test_Debug.result == 'success'
      )
      && !cancelled()
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Download coverage reports of current branch
        uses: actions/download-artifact@v4
        with:
          pattern: coverage-${{ matrix.flag }}-*

      - name: Get total coverage of current branch
        shell: bash -x -e -u -o pipefail {0}
        if: always()
        run: |
          pip install coverage

          ls -al .
          ls -al coverage-*/
          coverage combine --keep $(ls coverage-*/.coverage)
          coverage report -i
          rm -rf coverage-*
          ls -al

      - name: Upload coverage reports to Codecov
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          verbose: true
          flags: ${{ matrix.flag }}

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: coverage-${{ matrix.flag }}-aggregated
          path: |
            .coverage
          include-hidden-files: true
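For reference, the component gating used by the (currently commented-out) sub-workflow triggers follows the `contains(fromJson(...), '<component>')` pattern over the pre-flight output. A hypothetical Python rendering of that decision, assuming the `test_modules.json` file written by the pre-flight job:

    import json

    SUB_WORKFLOWS = ["nemo2", "automodel", "export-deploy", "speech"]

    def workflows_to_trigger(path="test_modules.json"):
        with open(path) as f:
            components = json.load(f)
        # '["all"]' selects every sub-workflow; otherwise match by component name.
        if components == ["all"]:
            return SUB_WORKFLOWS
        return [w for w in SUB_WORKFLOWS if w in components]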
.github/workflows/cicd-main-unit-tests.yml ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ name: NeMo Unit Tests
15
+ on:
16
+ workflow_call:
17
+ inputs:
18
+ test_to_run:
19
+ required: true
20
+ type: string
21
+
22
+ jobs:
23
+ collections-common-tests:
24
+ strategy:
25
+ fail-fast: false
26
+ matrix:
27
+ include:
28
+ - script: L0_Unit_Tests_GPU_Common
29
+ runner: self-hosted-azure-gpus-1
30
+ - script: L0_Unit_Tests_CPU_Common
31
+ runner: self-hosted-azure-cpu
32
+ cpu-only: true
33
+ runs-on: ${{ matrix.runner }}
34
+ name: ${{ matrix.script }}
35
+ steps:
36
+ - name: Checkout
37
+ uses: actions/checkout@v4
38
+ with:
39
+ path: ${{ github.run_id }}
40
+ - name: main
41
+ uses: NVIDIA/NeMo/.github/actions/test-template@main
42
+ with:
43
+ runner: ${{ runner.name }}
44
+ script: ${{ matrix.script }}
45
+ is_unit_test: true
46
+ tests_to_run: ${{ inputs.test_to_run }}
47
+ cpu-only: ${{ matrix.cpu-only || false }}
48
+
49
+ collections-llm-tests:
50
+ strategy:
51
+ fail-fast: false
52
+ matrix:
53
+ include:
54
+ - script: L0_Unit_Tests_GPU_LLM
55
+ runner: self-hosted-azure-gpus-1
56
+ - script: L0_Unit_Tests_CPU_LLM
57
+ runner: self-hosted-azure-cpu
58
+ cpu-only: true
59
+    runs-on: ${{ matrix.runner }}
+    name: ${{ matrix.script }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          path: ${{ github.run_id }}
+      - name: main
+        uses: NVIDIA/NeMo/.github/actions/test-template@main
+        with:
+          runner: ${{ runner.name }}
+          script: ${{ matrix.script }}
+          is_unit_test: true
+          tests_to_run: ${{ inputs.test_to_run }}
+          cpu-only: ${{ matrix.cpu-only || false }}
+          is_optional: ${{ matrix.is-optional || false }}
+
+  collections-multimodal-tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - script: L0_Unit_Tests_GPU_Multimodal
+            runner: self-hosted-azure-gpus-1
+          - script: L0_Unit_Tests_CPU_Multimodal
+            runner: self-hosted-azure-cpu
+            cpu-only: true
+    runs-on: ${{ matrix.runner }}
+    name: ${{ matrix.script }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          path: ${{ github.run_id }}
+      - name: main
+        uses: NVIDIA/NeMo/.github/actions/test-template@main
+        with:
+          runner: ${{ runner.name }}
+          script: ${{ matrix.script }}
+          is_unit_test: true
+          tests_to_run: ${{ inputs.test_to_run }}
+          cpu-only: ${{ matrix.cpu-only || false }}
+          is_optional: ${{ matrix.is-optional || false }}
+  collections-vlm-tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - script: L0_Unit_Tests_GPU_VLM
+            runner: self-hosted-azure-gpus-1
+          - script: L0_Unit_Tests_CPU_VLM
+            runner: self-hosted-azure-cpu
+            cpu-only: true
+    runs-on: ${{ matrix.runner }}
+    name: ${{ matrix.script }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          path: ${{ github.run_id }}
+      - name: main
+        uses: NVIDIA/NeMo/.github/actions/test-template@main
+        with:
+          runner: ${{ runner.name }}
+          script: ${{ matrix.script }}
+          is_unit_test: true
+          tests_to_run: ${{ inputs.test_to_run }}
+          cpu-only: ${{ matrix.cpu-only || false }}
+          is_optional: ${{ matrix.is-optional || false }}
+
+  core-tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - script: L0_Unit_Tests_GPU_Core
+            runner: self-hosted-azure-gpus-1
+          - script: L0_Unit_Tests_CPU_Core
+            runner: self-hosted-azure-cpu
+            cpu-only: true
+          - script: L0_Unit_Tests_GPU_Hydra
+            runner: self-hosted-azure-gpus-1
+          - script: L0_Unit_Tests_CPU_Hydra
+            runner: self-hosted-azure-cpu
+            cpu-only: true
+    runs-on: ${{ matrix.runner }}
+    name: ${{ matrix.script }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          path: ${{ github.run_id }}
+      - name: main
+        uses: NVIDIA/NeMo/.github/actions/test-template@main
+        with:
+          runner: ${{ runner.name }}
+          script: ${{ matrix.script }}
+          is_unit_test: true
+          tests_to_run: ${{ inputs.test_to_run }}
+          cpu-only: ${{ matrix.cpu-only || false }}
+
+  lightning-tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - script: L0_Unit_Tests_GPU_Lightning
+            runner: self-hosted-azure
+          - script: L0_Unit_Tests_CPU_Lightning
+            runner: self-hosted-azure-cpu
+            cpu-only: true
+    runs-on: ${{ matrix.runner }}
+    name: ${{ matrix.script }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          path: ${{ github.run_id }}
+      - name: main
+        uses: NVIDIA/NeMo/.github/actions/test-template@main
+        with:
+          runner: ${{ runner.name }}
+          script: ${{ matrix.script }}
+          is_unit_test: true
+          tests_to_run: ${{ inputs.test_to_run }}
+          cpu-only: ${{ matrix.cpu-only || false }}
+          is_optional: ${{ matrix.is-optional || false }}
+
+  other-tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - script: L0_Unit_Tests_GPU_Others
+            runner: self-hosted-azure-gpus-1
+          - script: L0_Unit_Tests_CPU_Others
+            runner: self-hosted-azure-cpu
+            cpu-only: true
+    runs-on: ${{ matrix.runner }}
+    name: ${{ matrix.script }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          path: ${{ github.run_id }}
+      - name: main
+        uses: NVIDIA/NeMo/.github/actions/test-template@main
+        with:
+          runner: ${{ runner.name }}
+          script: ${{ matrix.script }}
+          is_unit_test: true
+          tests_to_run: ${{ inputs.test_to_run }}
+          cpu-only: ${{ matrix.cpu-only || false }}
+          is_optional: ${{ matrix.is-optional || false }}
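
Each unit-test job above follows the same shape: a small matrix of GPU and CPU variants, a checkout into a per-run directory, and a delegation to the shared NVIDIA/NeMo test-template action with `tests_to_run` passed through. As a rough illustration only (the action's internals are outside this diff, so the function below is a hypothetical sketch), the per-script selection it has to make boils down to a membership check against the JSON list:

import json

def should_run(script: str, tests_to_run_json: str) -> bool:
    # Hypothetical gating sketch: a script is selected when the caller
    # passed "all" or named the script explicitly.
    selected = json.loads(tests_to_run_json)
    return "all" in selected or script in selected

assert should_run("L0_Unit_Tests_CPU_Multimodal", '["all"]')
assert not should_run("L0_Unit_Tests_CPU_Multimodal", '["L0_Unit_Tests_GPU_Core"]')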
.github/workflows/cicd-main.yml ADDED
@@ -0,0 +1,450 @@
+ # Copyright (c) 2025, NVIDIA CORPORATION.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ name: CICD NeMo
+ on:
+   schedule:
+     - cron: 0 0 * * *
+   pull_request:
+     branches:
+       - main
+       - r**
+       - weekly-bump*
+     types: [labeled]
+   workflow_dispatch:
+     inputs:
+       test_to_run:
+         required: false
+         default: all
+         type: string
+         description: Comma-separated list of tests to run. Use "all" to run the full test suite.
+
+ concurrency:
+   # group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.event.pull_request.number || github.ref }}-${{ github.event_name }}
+   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
+   cancel-in-progress: true
+
+ jobs:
+   pre-flight:
+     runs-on: ubuntu-latest
+     outputs:
+       test_to_run: ${{ steps.test_to_run.outputs.main }}
+       is_ci_workload: ${{ steps.is_ci_workload.outputs.main }}
+       no_fail_fast: ${{ steps.no_fail_fast.outputs.main }}
+       components_to_run: ${{ steps.components_to_run.outputs.main }}
+     env:
+       TESTS_TO_RUN: ${{ inputs.test_to_run }}
+       EVENT_NAME: ${{ github.event_name }}
+       HAS_LABEL: ${{ github.event.label.name == 'Run CICD' }}
+     steps:
+       - name: Checkout branch
+         uses: actions/checkout@v4
+         with:
+           fetch-depth: 0
+
+       - name: Select components to run
+         id: components_to_run
+         run: |
+           pip install -U pip
+           pip install git-python
+
+           if [[ "$EVENT_NAME" == "pull_request" ]]; then
+             python .github/scripts/components_to_run.py --source-sha ${{ github.event.pull_request.head.sha }} --target-sha ${{ github.event.pull_request.base.sha }}
+           else
+             echo '["nemo2", "export-deploy", "speech"]' | tee -a test_modules.json
+           fi
+
+           components_to_run=$(cat test_modules.json)
+
+           echo "main=${components_to_run}" | tee -a "$GITHUB_OUTPUT"
+
+       - name: Select tests to run
+         id: test_to_run
+         run: |
+           # For manual dispatch, use the list of tests provided by the user
+           if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
+             TESTS_TO_RUN=$TESTS_TO_RUN
+
+           # For a correctly labeled PR, run the full test suite
+           elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" == "true" ]]; then
+             TESTS_TO_RUN=all
+
+           # For an unlabeled PR, run no tests
+           elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" != "true" ]]; then
+             TESTS_TO_RUN=""
+
+           # For push events, run all tests. This is so that we can generate coverage
+           # on branch `main`.
+           elif [[ "$EVENT_NAME" == "push" || "$EVENT_NAME" == "schedule" ]]; then
+             TESTS_TO_RUN=all
+
+           else
+             echo "Unsupported event_name $EVENT_NAME provided."
+             exit 1
+           fi
+
+           parsed_string=$(echo "$TESTS_TO_RUN" | jq -c --raw-input 'split(",")')
+           echo "main=${parsed_string}" | tee -a "$GITHUB_OUTPUT"
+
+       - name: Check if this is a CI workload
+         shell: bash
+         id: is_ci_workload
+         run: |
+           branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}
+
+           if [[ "$branch_name" =~ ^bump-ci-container || "$EVENT_NAME" == "schedule" ]]; then
+             is_ci_workload=true
+             echo "main=true" | tee -a "$GITHUB_OUTPUT"
+           else
+             is_ci_workload=false
+           fi
+
+           echo "main=$is_ci_workload" | tee -a "$GITHUB_OUTPUT"
+
+       - name: Check if no-fail-fast is set
+         shell: bash
+         id: no_fail_fast
+         env:
+           HAS_FAIL_FAST_LABEL: ${{ contains(github.event.pull_request.labels.*.name, 'no-fail-fast') }}
+         run: |
+           if [[ "$HAS_FAIL_FAST_LABEL" == "true" || "$EVENT_NAME" == "schedule" ]]; then
+             no_fail_fast=true
+           else
+             no_fail_fast=false
+           fi
+
+           echo "main=$no_fail_fast" | tee -a "$GITHUB_OUTPUT"
+
128
+ code-linting:
129
+ if: needs.pre-flight.outputs.test_to_run != '[]'
130
+ needs: [pre-flight]
131
+ uses: ./.github/workflows/code-linting.yml
132
+
133
+ cicd-wait-in-queue:
134
+ needs: [pre-flight, code-linting]
135
+ runs-on: ubuntu-latest
136
+ environment: test
137
+ if: |
138
+ needs.pre-flight.outputs.test_to_run != '[]'
139
+ && needs.pre-flight.outputs.components_to_run != '[]'
140
+ && needs.pre-flight.outputs.is_ci_workload == 'false'
141
+ steps:
142
+ - name: Running CI tests
143
+ run: |
144
+ echo "Running CI tests"
145
+
146
+ cicd-test-container-build:
147
+ uses: ./.github/workflows/_build_container.yml
148
+ needs: [pre-flight, code-linting, cicd-wait-in-queue]
149
+ if: |
150
+ needs.pre-flight.outputs.test_to_run != '[]'
151
+ && needs.pre-flight.outputs.components_to_run != '[]'
152
+ && (
153
+ success()
154
+ || (
155
+ needs.cicd-wait-in-queue.result == 'skipped'
156
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
157
+ )
158
+ )
159
+ && !cancelled()
160
+ with:
161
+ image-name: nemo_container
162
+ dockerfile: docker/Dockerfile.ci
163
+
164
+ cicd-import-tests:
165
+ if: |
166
+ needs.pre-flight.outputs.test_to_run != '[]'
167
+ && needs.pre-flight.outputs.components_to_run != '[]'
168
+ && (
169
+ success()
170
+ || (
171
+ needs.cicd-wait-in-queue.result == 'skipped'
172
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
173
+ )
174
+ )
175
+ && !cancelled()
176
+ needs: [cicd-test-container-build, pre-flight]
177
+ runs-on: self-hosted-azure-gpus-1
178
+ steps:
179
+ - name: Create UUID
180
+ id: uuid
181
+ run: |
182
+ echo "id=$(uuidgen)" >> "$GITHUB_OUTPUT"
183
+
184
+ - name: Checkout NeMo
185
+ uses: actions/checkout@v4
186
+ with:
187
+ path: ${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo
188
+
189
+ - name: Run some checks
190
+ run: |
191
+ docker run \
192
+ --rm \
193
+ --device=/dev/nvidia0 \
194
+ --gpus all \
195
+ --shm-size=8g \
196
+ --volume $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo:/workspace \
197
+ --env TRANSFORMERS_OFFLINE=0 \
198
+ --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 nemoci.azurecr.io/nemo_container:${{ github.run_id }} bash -c '\
199
+ # PyTorch Lightning version
200
+ python -c "import lightning.pytorch; print(lightning.pytorch.__version__)"
201
+
202
+ # PyTorch Lightning DDP Checks
203
+ CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py"
204
+
205
+ # Basic Import Checks
206
+ python tests/core_ptl/check_imports.py --domain asr
207
+ python tests/core_ptl/check_imports.py --domain tts
208
+ '
209
+
210
+ L0_Setup_Test_Data_And_Models:
211
+ needs: [pre-flight, cicd-test-container-build, cicd-wait-in-queue]
212
+ runs-on: self-hosted-azure
213
+ if: |
214
+ needs.pre-flight.outputs.test_to_run != '[]'
215
+ && needs.pre-flight.outputs.components_to_run != '[]'
216
+ && (
217
+ success()
218
+ || (
219
+ needs.cicd-wait-in-queue.result == 'skipped'
220
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
221
+ )
222
+ )
223
+ && !cancelled()
224
+ steps:
225
+ - name: Checkout
226
+ uses: actions/checkout@v4
227
+ with:
228
+ path: ${{ github.run_id }}
229
+
230
+ - name: main
231
+ uses: NVIDIA/NeMo/.github/actions/test-template@main
232
+ with:
233
+ runner: ${{ runner.name }}
234
+ script: L0_Setup_Test_Data_And_Models
235
+ tests_to_run: '["L0_Setup_Test_Data_And_Models"]'
236
+
237
+ cicd-main-unit-tests:
238
+ needs: [pre-flight, cicd-test-container-build]
239
+ uses: ./.github/workflows/cicd-main-unit-tests.yml
240
+ if: |
241
+ needs.pre-flight.outputs.test_to_run != '[]'
242
+ && needs.pre-flight.outputs.components_to_run != '[]'
243
+ && (
244
+ success()
245
+ || (
246
+ needs.cicd-wait-in-queue.result == 'skipped'
247
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
248
+ )
249
+ )
250
+ && !cancelled()
251
+ with:
252
+ test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
253
+
254
+ cicd-main-speech:
255
+ needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
256
+ uses: ./.github/workflows/cicd-main-speech.yml
257
+ if: |
258
+ (
259
+ needs.pre-flight.outputs.test_to_run != '[]'
260
+ && (
261
+ contains(fromJson(needs.pre-flight.outputs.components_to_run), 'speech')
262
+ )
263
+ )
264
+ && (
265
+ success()
266
+ || (
267
+ needs.cicd-wait-in-queue.result == 'skipped'
268
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
269
+ )
270
+ )
271
+ && !cancelled()
272
+ with:
273
+ test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
274
+
275
+ cicd-main-nemo2:
276
+ needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
277
+ uses: ./.github/workflows/cicd-main-nemo2.yml
278
+ if: |
279
+ (
280
+ needs.pre-flight.outputs.test_to_run != '[]'
281
+ && (
282
+ contains(fromJson(needs.pre-flight.outputs.components_to_run), 'nemo2')
283
+ || needs.pre-flight.outputs.components_to_run == '["all"]'
284
+ )
285
+ )
286
+ && (
287
+ success()
288
+ || (
289
+ needs.cicd-wait-in-queue.result == 'skipped'
290
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
291
+ )
292
+ )
293
+ && !cancelled()
294
+ with:
295
+ test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
296
+
297
+ Nemo_CICD_Test:
298
+ needs:
299
+ - pre-flight
300
+ - cicd-test-container-build
301
+ - cicd-import-tests
302
+ - L0_Setup_Test_Data_And_Models
303
+ - cicd-main-unit-tests
304
+ - cicd-main-nemo2
305
+ - cicd-main-speech
306
+ if: always()
307
+ runs-on: ubuntu-latest
308
+ permissions: write-all
309
+ steps:
310
+ - name: Checkout
311
+ uses: actions/checkout@v4
312
+
313
+ - name: Get workflow result
314
+ id: result
315
+ env:
316
+ GH_TOKEN: ${{ github.token }}
317
+ RUN_ID: ${{ github.run_id }}
318
+ HAS_LABEL: ${{ github.event.label.name == 'Run CICD' }}
319
+ IS_SCHEDULED: ${{ github.event_name == 'schedule' }}
320
+ run: |
321
+ # Get workflow run details and check job conclusions
322
+ LATEST_ATTEMPT=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion != null) | .conclusion] | last')
323
+ NUM_FAILED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "failure") | .name] | length')
324
+ NUM_CANCELLED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "cancelled") | .name] | length')
325
+
326
+ if [[ $NUM_FAILED -eq 0 && $NUM_CANCELLED -eq 0 && ("$HAS_LABEL" == "true" || "$IS_SCHEDULED" == "true") ]]; then
327
+ RESULT="success"
328
+ elif [[ $NUM_CANCELLED -gt 0 ]]; then
329
+ RESULT="cancelled"
330
+ else
331
+ RESULT="failure"
332
+ fi
333
+
334
+ # Output the final status
335
+ echo "code=$RESULT" | tee -a $GITHUB_OUTPUT
336
+
337
+ - name: Checkout for GH CLI
338
+ uses: actions/checkout@v4
339
+
340
+ - name: Remove label if not cancelled
341
+ if: |
342
+ steps.result.outputs.code != 'cancelled'
343
+ && github.event.label.name == 'Run CICD'
344
+ && github.event.pull_request.head.repo.full_name == github.repository
345
+ env:
346
+ GH_TOKEN: ${{ github.token }}
347
+ PR_NUMBER: ${{ github.event.number }}
348
+ run: gh pr edit "$PR_NUMBER" --remove-label "Run CICD"
349
+
350
+ - name: Pipeline successful, add PR comment
351
+ if: |
352
+ steps.result.outputs.code == 'success'
353
+ && github.event_name == 'pull_request'
354
+ && env.SLACK_WEBHOOK != ''
355
+ uses: peter-evans/create-or-update-comment@v4
356
+ env:
357
+ SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
358
+ REPOSITORY: ${{ github.repository }}
359
+ RUN_ID: ${{ github.run_id }}
360
+ with:
361
+ issue-number: ${{ github.event.number }}
362
+ body: |
363
+ [🤖]: Hi @${{ github.event.pull_request.user.login }} 👋,
364
+
365
+ We wanted to let you know that a [CICD pipeline](https://github.com/${{ env.REPOSITORY }}/actions/runs/${{ env.RUN_ID }}) for this PR just finished successfully.
366
+
367
+ So it might be time to merge this PR or get some approvals.
368
+
369
+ //cc @chtruong814 @ko3n1g @pablo-garay @thomasdhc
370
+
371
+ - name: "Pipeline not successful and not cancelled: Send Slack alert & create step summary"
372
+ if: |
373
+ steps.result.outputs.code == 'failure'
374
+ && github.event.label.name == 'Run CICD'
375
+ && env.SLACK_WEBHOOK != ''
376
+ env:
377
+ SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
378
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
379
+ REPOSITORY: ${{ github.repository }}
380
+ RUN_ID: ${{ github.run_id }}
381
+ PR_NUMBER: ${{ github.event.number }}
382
+ SERVER_URL: ${{ github.server_url }}
383
+ run: |
384
+ set -x
385
+ pip install PyGithub
386
+ export BRANCH_NAME=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}
387
+
388
+ python .github/scripts/notify.py
389
+
390
+ - name: Exit
391
+ if: ${{ always() }}
392
+ env:
393
+ RESULT: ${{ steps.result.outputs.code }}
394
+ run: |
395
+ if [ $RESULT == "success" ]; then
396
+ exit 0
397
+ else
398
+ exit 1
399
+ fi
400
+
401
+ Coverage:
402
+ runs-on: ubuntu-latest
403
+ needs: [pre-flight, Nemo_CICD_Test]
404
+ if: |
405
+ needs.pre-flight.outputs.test_to_run != '[]'
406
+ && needs.pre-flight.outputs.components_to_run != '[]'
407
+ && (
408
+ success()
409
+ || needs.Nemo_CICD_Test.result == 'success'
410
+ )
411
+ && !cancelled()
412
+ strategy:
413
+ matrix:
414
+ flag: [unit-test, e2e]
415
+ steps:
416
+ - name: Checkout
417
+ uses: actions/checkout@v4
418
+
419
+ - name: Download coverage reports of current branch
420
+ uses: actions/download-artifact@v4
421
+ with:
422
+ pattern: coverage-${{ matrix.flag }}-*
423
+
424
+ - name: Get total coverage of current branch
425
+ shell: bash -x -e -u -o pipefail {0}
426
+ if: always()
427
+ run: |
428
+ pip install coverage
429
+
430
+ ls -al .
431
+ ls -al coverage-*/
432
+ coverage combine --keep $(ls coverage-*/.coverage)
433
+ coverage report -i
434
+ rm -rf coverage-*
435
+ ls -al
436
+
437
+ - name: Upload coverage reports to Codecov
438
+ uses: codecov/codecov-action@v5
439
+ with:
440
+ token: ${{ secrets.CODECOV_TOKEN }}
441
+ verbose: true
442
+ flags: ${{ matrix.flag }}
443
+
444
+ - name: Upload artifacts
445
+ uses: actions/upload-artifact@v4
446
+ with:
447
+ name: coverage-${{ matrix.flag }}-aggregated
448
+ path: |
449
+ .coverage
450
+ include-hidden-files: true
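
The pre-flight job above turns the comma-separated `TESTS_TO_RUN` string into a JSON array with `jq -c --raw-input 'split(",")'`, and the downstream jobs compare that output against the literal `'[]'` to decide whether anything runs at all. A minimal Python re-implementation of that parsing, for illustration only (the function name is ours):

import json

def parse_tests_to_run(raw: str) -> str:
    # Mirror jq's split(","): an empty input yields an empty array,
    # which is what makes the downstream `!= '[]'` guards skip all jobs.
    return json.dumps(raw.split(",") if raw else [])

print(parse_tests_to_run("all"))                                          # ["all"]
print(parse_tests_to_run("L0_Unit_Tests_GPU_Core,L0_Unit_Tests_CPU_Core"))
print(parse_tests_to_run(""))                                             # []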
.github/workflows/cicd-relabel-bot.yml ADDED
@@ -0,0 +1,36 @@
+ # If the PR gets updated by a new commit, auto-merge is prevented,
+ # since there is no CI event attached to the new commit anymore.
+ # This workflow re-attaches the label after a push, if the PR
+ # was already labeled prior to the push.
+ name: CICD Relabel bot
+
+ on:
+   pull_request_target:
+
+ jobs:
+   relabel:
+     runs-on: ubuntu-latest
+     env:
+       PR_NUMBER: ${{ github.event.number }}
+       GH_TOKEN: ${{ secrets.NEMO_RELABEL_TOKEN }}
+       HOSTNAME: ${{ github.server_url }}
+     permissions: write-all
+     steps:
+       - name: Checkout repo
+         uses: actions/checkout@v4
+
+       - name: Check if PR was already labeled with `Run CICD`
+         id: pre-flight
+         run: |
+           LABELS=$(gh pr view "$PR_NUMBER" --json labels)
+           HAS_LABEL=$(echo $LABELS \
+             | jq '[.labels[].name] | any(. == "Run CICD")'
+           )
+
+           echo "has-label=$HAS_LABEL" | tee -a "$GITHUB_OUTPUT"
+
+       - name: Relabel PR
+         if: ${{ steps.pre-flight.outputs.has-label == 'true' }}
+         run: |
+           gh pr edit "$PR_NUMBER" --remove-label "Run CICD"
+           gh pr edit "$PR_NUMBER" --add-label "Run CICD"
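
The relabel bot's whole trick is that remove-then-add round-trip, which fires a fresh `labeled` event against the new head commit. The workflow shells out to `gh pr edit`; a hedged sketch of the same round-trip with PyGithub (the library the CI's notify step already installs) might look like this:

import os
from github import Github  # pip install PyGithub

gh = Github(os.environ["GH_TOKEN"])
# PR labels live on the underlying issue, so the Issues API suffices here.
issue = gh.get_repo("NVIDIA/NeMo").get_issue(int(os.environ["PR_NUMBER"]))

if any(label.name == "Run CICD" for label in issue.labels):
    # Removing and re-adding the label emits a new `labeled` event,
    # re-attaching a CI run to the freshly pushed commit.
    issue.remove_from_labels("Run CICD")
    issue.add_to_labels("Run CICD")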
.github/workflows/close-inactive-issue-pr.yml ADDED
@@ -0,0 +1,25 @@
+ name: Stale-Close-Inactive-Issues-PRs
+ on:
+   schedule:
+     - cron: "30 1 * * *"
+
+ jobs:
+   close-issues:
+     runs-on: ubuntu-latest
+     permissions:
+       issues: write
+       pull-requests: write
+     steps:
+       - uses: actions/stale@v6
+         with:
+           operations-per-run: 100
+           days-before-issue-stale: 30
+           days-before-issue-close: 7
+           stale-issue-label: "stale"
+           stale-issue-message: "This issue is stale because it has been open for 30 days with no activity. Remove stale label or comment or this will be closed in 7 days."
+           close-issue-message: "This issue was closed because it has been inactive for 7 days since being marked as stale."
+           days-before-pr-stale: 14
+           days-before-pr-close: 7
+           stale-pr-message: "This PR is stale because it has been open for 14 days with no activity. Remove stale label or comment or update or this will be closed in 7 days."
+           close-pr-message: "This PR was closed because it has been inactive for 7 days since being marked as stale."
+           repo-token: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/code-formatting.yml ADDED
@@ -0,0 +1,73 @@
+ name: Isort and Black Formatting
+ # Incrementally reformat only changed files with black, all files with isort
+ #
+ # Replaces pre-commit.ci, since it reformats all the files.
+ # See issue https://github.com/pre-commit-ci/issues/issues/90
+ #
+ # The action requires a custom token to trigger workflow after pushing reformatted files back to the branch.
+ # `secrets.GITHUB_TOKEN` can be used instead, but this will result
+ # in not running necessary checks after reformatting, which is undesirable.
+ # For details see https://github.com/orgs/community/discussions/25702
+
+ on:
+   pull_request_target:
+     paths:
+       - "**.py"
+     types: [opened, synchronize, reopened, labeled, unlabeled]
+
+ defaults:
+   run:
+     shell: bash -x -e -u -o pipefail {0}
+
+ jobs:
+   reformat_with_isort_and_black:
+     runs-on: ubuntu-latest
+     permissions:
+       # write permissions required to commit changes
+       contents: write
+     steps:
+       - name: Checkout branch
+         uses: actions/checkout@v4
+         with:
+           # setup repository and ref for PRs, see
+           # https://github.com/EndBug/add-and-commit?tab=readme-ov-file#working-with-prs
+           repository: ${{ github.event.pull_request.head.repo.full_name }}
+           ref: ${{ github.event.pull_request.head.ref }}
+           # custom token is required to trigger actions after reformatting + pushing
+           token: ${{ secrets.NEMO_REFORMAT_TOKEN }}
+           fetch-depth: 0
+
+       - name: Get changed files
+         id: changed-files
+         uses: step-security/[email protected]
+         with:
+           files: |
+             **.py
+
+       - name: Setup Python env
+         uses: actions/setup-python@v5
+         with:
+           python-version: "3.10"
+
+       - name: black
+         uses: psf/black@stable
+         if: ${{ steps.changed-files.outputs.any_changed == 'true' }}
+         with:
+           options: "--verbose"
+           # apply only to changed files (pass explicitly the files)
+           src: "${{ steps.changed-files.outputs.all_changed_files }}"
+           version: "~= 24.3"
+
+       - name: isort
+         uses: isort/isort-action@v1
+         if: ${{ steps.changed-files.outputs.any_changed == 'true' }}
+         with:
+           isort-version: "5.13.2"
+           # reformat all files with isort – safe since the whole repo is already reformatted
+           configuration: ""
+
+       - uses: EndBug/add-and-commit@v9
+         # Commit changes. Nothing is committed if no changes.
+         with:
+           message: Apply isort and black reformatting
+           commit: --signoff
.github/workflows/code-init-file-checker.yml ADDED
@@ -0,0 +1,23 @@
+ name: Check __init__ files
+
+ on:
+   pull_request:
+     types: [opened, synchronize, reopened]
+
+ jobs:
+   check-init-files:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v4
+
+       - name: Set up Python
+         uses: actions/setup-python@v4
+         with:
+           python-version: "3.11"
+
+       - name: Install init-file-checker
+         run: pip install init-file-checker
+
+       - name: Run init-file-checker
+         run: init-file-checker nemo/
.github/workflows/code-linting.yml ADDED
@@ -0,0 +1,160 @@
+ name: PyLint and flake8 linting
+
+ on:
+   pull_request:
+     types: [opened, synchronize, reopened, labeled, unlabeled]
+   workflow_call:
+
+ jobs:
+   linting:
+     name: "Domain: ${{ matrix.domain }}"
+     runs-on: ubuntu-latest
+     strategy:
+       fail-fast: false
+       matrix:
+         domain: [speech, other]
+     env:
+       DOMAIN: ${{ matrix.domain }}
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v4
+
+       - name: Select filter
+         id: filter
+         run: |
+           if [[ "$DOMAIN" == "speech" ]]; then
+             FILTER=$(jq -crn '[
+               "nemo/collections/common/data/lhotse/*.py",
+               "nemo/collections/asr/**/*.py",
+               "nemo/collections/tts/**/*.py",
+               "nemo/collections/audio/**/*.py",
+               "nemo/collections/multimodal/speech_llm/**/*.py",
+               "nemo/collections/speechlm/**/*.py",
+               "nemo/collections/speechlm2/**/*.py"
+             ] | join(",")')
+
+           else
+             FILTER=$(jq -crn '[
+               "nemo/**/*.py",
+               "!nemo/collections/common/data/lhotse/*.py",
+               "!nemo/collections/asr/**/*.py",
+               "!nemo/collections/tts/**/*.py",
+               "!nemo/collections/audio/**/*.py",
+               "!nemo/collections/multimodal/speech_llm/**/*.py",
+               "!nemo/collections/speechlm/**/*.py",
+               "!nemo/collections/speechlm2/**/*.py",
+               "!nemo/export/**/*.py"
+             ] | join(",")')
+           fi
+
+           echo "main=$FILTER" | tee -a "$GITHUB_OUTPUT"
+
+       - name: Get changed files
+         id: changed-files
+         uses: step-security/[email protected]
+         with:
+           files: ${{ steps.filter.outputs.main }}
+           files_separator: ","
+           separator: " "
+
+       - name: Run PyLint
+         id: pylint
+         env:
+           CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
+           SKIP_DOCS: ${{ contains(github.event.pull_request.labels.*.name, 'skip-docs') }}
+           SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
+         run: |
+           if [[ -z "$CHANGED_FILES" ]]; then
+             echo Nothing to lint.
+             echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
+             exit 0
+           fi
+
+           if [[ $SKIP_DOCS == true ]]; then
+             ADDITIONAL_PYLINT_ARGS="--disable=C0115,C0116"
+           else
+             ADDITIONAL_PYLINT_ARGS=""
+           fi
+
+           if [[ $SKIP_LINTING == true ]]; then
+             ADDITIONAL_PYLINT_ARGS="--exit-zero"
+           fi
+
+           pip install pylint
+           set +e
+           pylint $ADDITIONAL_PYLINT_ARGS --output "pylintrc.$DOMAIN.txt" --rcfile ".pylintrc.$DOMAIN" ${CHANGED_FILES[@]}
+           echo "exit-code=$?" | tee -a "$GITHUB_OUTPUT"
+
+       - name: Run flake8
+         id: flake8
+         env:
+           CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
+           SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
+         run: |
+           if [[ -z "$CHANGED_FILES" ]]; then
+             echo Nothing to lint.
+             echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
+             exit 0
+           fi
+
+           if [[ $SKIP_LINTING == true ]]; then
+             ADDITIONAL_FLAKE8_ARGS="--exit-zero"
+           else
+             ADDITIONAL_FLAKE8_ARGS=""
+           fi
+
+           pip install flake8
+           set +e
+           flake8 $ADDITIONAL_FLAKE8_ARGS --output "flake8.$DOMAIN.txt" --config ".flake8.$DOMAIN" ${CHANGED_FILES[@]}
+           echo "exit-code=$?" | tee -a "$GITHUB_OUTPUT"
+
+       - name: Summary
+         env:
+           PYLINT: ${{ steps.pylint.outputs.exit-code == 0 }}
+           FLAKE8: ${{ steps.flake8.outputs.exit-code == 0 }}
+         run: |
+
+           if [[ "$PYLINT" != "true" ]]; then
+             echo "Pylint output:" | tee -a $GITHUB_STEP_SUMMARY
+
+             echo '```' | tee -a $GITHUB_STEP_SUMMARY
+             cat pylintrc.$DOMAIN.txt | tee -a $GITHUB_STEP_SUMMARY
+             echo '```' | tee -a $GITHUB_STEP_SUMMARY
+           fi
+
+           if [[ "$FLAKE8" != "true" ]]; then
+             echo "Flake8 output:" | tee -a $GITHUB_STEP_SUMMARY
+
+             echo '```' | tee -a $GITHUB_STEP_SUMMARY
+             cat flake8.$DOMAIN.txt | tee -a $GITHUB_STEP_SUMMARY
+             echo '```' | tee -a $GITHUB_STEP_SUMMARY
+           fi
+
+           if [[ "$PYLINT" != "true" || "$FLAKE8" != "true" ]]; then
+             echo "The following directories got scanned:" | tee -a $GITHUB_STEP_SUMMARY
+
+             echo '```' | tee -a $GITHUB_STEP_SUMMARY
+             echo ${{ steps.filter.outputs.main }} | tee -a $GITHUB_STEP_SUMMARY
+             echo '```' | tee -a $GITHUB_STEP_SUMMARY
+
+             exit 1
+           fi
+
+   Nemo_Linting_Test:
+     needs: linting
+     runs-on: ubuntu-latest
+     if: always()
+     steps:
+       - name: Main
+         env:
+           RESULTS: ${{ toJson(needs.linting) }}
+         run: |
+           RESULT=$(echo "$RESULTS" | jq -r '.result')
+
+           if [[ "$RESULT" == "success" ]]; then
+             echo "All passed."
+             exit 0
+           else
+             echo "Some linting domains failed."
+             exit 1
+           fi
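
The speech/other split above is encoded as two complementary glob filters handed to the changed-files action. A simplified Python approximation of that partition (fnmatch does not implement the action's `**` semantics, so the globs below are loosened; treat this as illustrative only):

from fnmatch import fnmatch

SPEECH_GLOBS = [
    "nemo/collections/common/data/lhotse/*.py",
    "nemo/collections/asr/*",
    "nemo/collections/tts/*",
    "nemo/collections/audio/*",
    "nemo/collections/multimodal/speech_llm/*",
    "nemo/collections/speechlm/*",
    "nemo/collections/speechlm2/*",
]

def domain_of(path: str) -> str:
    # Paths matching a speech glob get the speech linter config;
    # everything else falls through to the "other" domain.
    return "speech" if any(fnmatch(path, g) for g in SPEECH_GLOBS) else "other"

print(domain_of("nemo/collections/asr/models/ctc_models.py"))  # speech
print(domain_of("nemo/core/classes/modelPT.py"))               # other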
.github/workflows/codeql.yml ADDED
@@ -0,0 +1,75 @@
+ # For most projects, this workflow file will not need changing; you simply need
+ # to commit it to your repository.
+ #
+ # You may wish to alter this file to override the set of languages analyzed,
+ # or to provide custom queries or build logic.
+ #
+ # ******** NOTE ********
+ # We have attempted to detect the languages in your repository. Please check
+ # the `language` matrix defined below to confirm you have the correct set of
+ # supported CodeQL languages.
+ #
+ name: "CodeQL"
+
+ on:
+   push:
+     branches: [ "main", "[rv][0-9]*", "gh-pages-src" ]
+   pull_request:
+     # The branches below must be a subset of the branches above
+     branches: [ "main" ]
+   schedule:
+     - cron: '19 1 * * 4'
+
+ jobs:
+   analyze:
+     name: Analyze
+     runs-on: ubuntu-latest
+     permissions:
+       actions: read
+       contents: read
+       security-events: write
+
+     strategy:
+       fail-fast: false
+       matrix:
+         language: [ 'python' ]
+         # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
+         # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
+
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v3
+
+       # Initializes the CodeQL tools for scanning.
+       - name: Initialize CodeQL
+         uses: github/codeql-action/init@v2
+         with:
+           languages: ${{ matrix.language }}
+           # If you wish to specify custom queries, you can do so here or in a config file.
+           # By default, queries listed here will override any specified in a config file.
+           # Prefix the list here with "+" to use these queries and those in the config file.
+
+           # For details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
+           queries: security-and-quality # security-extended,
+           config-file: ./.github/workflows/config/codeql.yml
+
+       # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java).
+       # If this step fails, then you should remove it and run the build manually (see below)
+       - name: Autobuild
+         uses: github/codeql-action/autobuild@v2
+
+       # ℹ️ Command-line programs to run using the OS shell.
+       # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
+
+       # If the Autobuild fails above, remove it and uncomment the following three lines,
+       # then modify them (or add more) to build your code if your project uses a compiled language.
+
+       # - run: |
+       #     echo "Run, Build Application using script"
+       #     ./location_of_script_within_repo/buildscript.sh
+
+       - name: Perform CodeQL Analysis
+         uses: github/codeql-action/analyze@v2
+         with:
+           category: "/language:${{matrix.language}}"
.github/workflows/community-bot.yml ADDED
@@ -0,0 +1,15 @@
+ name: Community Bot
+
+ on:
+   issues:
+     types: [opened, edited, reopened, closed, deleted]
+   issue_comment:
+     types: [created, edited, deleted]
+
+ jobs:
+   community-bot:
+     uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
+     with:
+       community_project_id: ${{ vars.COMMUNITY_PROJECT_ID }}
+     secrets:
+       GH_TOKEN: ${{ secrets.PAT }}
.github/workflows/config/changelog-config.json ADDED
@@ -0,0 +1,134 @@
+ {
+   "categories": [
+     {
+       "title": "## ASR\n\n<details><summary>Changelog</summary>",
+       "labels": ["asr"],
+       "exclude_labels": ["cherry-pick"]
+     },
+     {
+       "title": "</details>\n\n## TTS\n\n<details><summary>Changelog</summary>",
+       "labels": ["tts"],
+       "exclude_labels": ["cherry-pick"]
+     },
+     {
+       "title": "</details>\n\n## NLP / NMT\n\n<details><summary>Changelog</summary>",
+       "labels": ["nlp", "nmt", "megatron"],
+       "exclude_labels": ["cherry-pick"]
+     },
+     {
+       "title": "</details>\n\n## Text Normalization / Inverse Text Normalization\n\n<details><summary>Changelog</summary>",
+       "labels": ["tn", "itn"],
+       "exclude_labels": ["cherry-pick"]
+     },
+     {
+       "title": "</details>\n\n## NeMo Tools\n\n<details><summary>Changelog</summary>",
+       "labels": ["tools"],
+       "exclude_labels": ["cherry-pick"]
+     },
+     {
+       "title": "</details>\n\n## Export\n\n<details><summary>Changelog</summary>",
+       "labels": ["export"],
+       "exclude_labels": ["cherry-pick"]
+     },
+     {
+       "title": "</details>\n\n## Documentation\n\n<details><summary>Changelog</summary>",
+       "labels": ["docs"],
+       "exclude_labels": ["cherry-pick"]
+     },
+     {
+       "title": "</details>\n\n## Bugfixes\n\n<details><summary>Changelog</summary>",
+       "labels": ["bug"],
+       "exclude_labels": ["cherry-pick"]
+     },
+     {
+       "title": "</details>\n\n## Cherrypick\n\n<details><summary>Changelog</summary>",
+       "labels": ["cherry-pick"],
+       "exclude_labels": ["cherry-pick"]
+     }
+   ],
+   "ignore_labels": [
+     "ignore"
+   ],
+   "sort": "ASC",
+   "template": "\n${{CHANGELOG}}</details>\n\n## Uncategorized:\n\n<details><summary>Changelog</summary>\n\n${{UNCATEGORIZED}}\n</details>\n",
+   "pr_template": "- ${{TITLE}} by @${{AUTHOR}} :: PR: #${{NUMBER}}",
+   "empty_template": "${{OWNER}}\n${{REPO}}\n${{FROM_TAG}}\n${{TO_TAG}}",
+   "label_extractor": [
+     {
+       "pattern": "(.*tts.*)|(.*g2p.*)",
+       "target": "tts",
+       "flags": "gimu",
+       "on_property": ["title", "body"]
+     },
+     {
+       "pattern": "(.*asr.*)|(.*ctc.*)|(.*rnnt.*)|(.*transducer.*)|(.*dali.*)|(.*k2.*)",
+       "target": "asr",
+       "flags": "gimu",
+       "on_property": ["title", "body"]
+     },
+     {
+       "pattern": "(.*nlp.*)|(.*punctuation.*)|(.*capitalization.*)|(.*entity.*)|(.*glue.*)|(.*entity.*)|(.*retrieval.*)|(.*entity.*)|(.*intent.*)|(.*slot.*)|(.*entity.*)|(.*language.*)|(.*qa.*)|(.*token class.*)|(.*text class.*)",
+       "target": "nlp",
+       "flags": "gimu",
+       "on_property": ["title", "body"]
+     },
+     {
+       "pattern": "(.*nmt.*)|(.*bignlp.*)|(.*megatron.*)|(.*machine.*)|(.*translation.*)|(.*gpt.*)",
+       "target": "nmt",
+       "flags": "gimu",
+       "on_property": ["title", "body"]
+     },
+     {
+       "pattern": "(.*tn.*)|(.*itn.*)|(.*text norm.*)",
+       "target": "tn",
+       "flags": "gimu",
+       "on_property": ["title", "body"]
+     },
+     {
+       "pattern": "(.*sde.*)|(.*ctc segment.*)",
+       "target": "tools",
+       "flags": "gimu",
+       "on_property": ["title", "body"]
+     },
+     {
+       "pattern": "(.*trt.*)|(.*onnx.*)|(.*export.*)",
+       "target": "export",
+       "flags": "gimu",
+       "on_property": ["title", "body"]
+     },
+     {
+       "pattern": "(.*\\[x\\] Documentation.*)",
+       "target": "docs",
+       "flags": "gmu",
+       "on_property": ["title", "body"]
+     },
+     {
+       "pattern": "(.*\\[x\\] Bugfix.*)|(.*patch.*)",
+       "target": "bug",
+       "flags": "gmu",
+       "on_property": ["title", "body"]
+     },
+     {
+       "pattern": "(.*cherry-pick.*)|(.*cherrypick.*)",
+       "target": "cherrypick",
+       "flags": "gimu",
+       "on_property": ["title", "body"]
+     }
+   ],
+   "duplicate_filter": {
+     "pattern": ".+",
+     "on_property": "title",
+     "method": "match"
+   },
+   "transformers": [
+   ],
+   "max_tags_to_fetch": 100,
+   "max_pull_requests": 500,
+   "max_back_track_time_days": 365,
+   "exclude_merge_branches": [
+   ],
+   "tag_resolver": {
+     "method": "semver"
+   }
+ }
+
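
Each `label_extractor` entry above is a regex applied to the PR title and body; on a match, the `target` label feeds the corresponding changelog category. A rough Python sketch of that classification (two entries shown; flags `gimu` are approximated here with `re.IGNORECASE`):

import re

EXTRACTORS = [
    (re.compile(r"(.*tts.*)|(.*g2p.*)", re.IGNORECASE), "tts"),
    (re.compile(r"(.*trt.*)|(.*onnx.*)|(.*export.*)", re.IGNORECASE), "export"),
]

def labels_for(text: str) -> list[str]:
    # Attach every target whose pattern matches the title/body text.
    return [target for pattern, target in EXTRACTORS if pattern.search(text)]

print(labels_for("Fix TTS vocoder export to ONNX"))  # ['tts', 'export']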
.github/workflows/config/codeql.yml ADDED
@@ -0,0 +1,9 @@
+ name: "CodeQL config"
+
+ paths:
+   - nemo/
+   - tests/
+   - tools/
+   - scripts/
+   - examples/
+   - .github/
.github/workflows/copyright-check.yml ADDED
@@ -0,0 +1,22 @@
+ # Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ name: Copyright check
+
+ on:
+   pull_request:
+
+ jobs:
+   copyright-check:
+     uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
.github/workflows/gh-docs.yml ADDED
@@ -0,0 +1,81 @@
+ name: gh-docs-build
+ on:
+   push:
+   pull_request:
+     paths:
+       - "**"
+
+ # Set the access for individual scopes
+ permissions: write-all
+
+ env:
+   PYTHON_VERSION: "3.11"
+
+ jobs:
+   deploy:
+     runs-on: ubuntu-latest
+
+     container:
+       image: squidfunk/mkdocs-material
+
+     steps:
+       - uses: actions/checkout@v4
+         if: github.event.repository.fork == false
+         with:
+           ref: gh-pages-src
+
+       - name: "Correct github config"
+         if: github.event.repository.fork == false
+         run: |
+           git config --global --add safe.directory "$GITHUB_WORKSPACE"
+           git config --global user.name "${GITHUB_ACTOR}"
+           git config --global user.email "${GITHUB_ACTOR}@users.noreply.${GITHUB_DOMAIN:-"github.com"}"
+           remote_repo="https://x-access-token:${GITHUB_TOKEN}@${GITHUB_DOMAIN:-"github.com"}/${GITHUB_REPOSITORY}.git"
+           echo "${remote_repo}"
+           git remote rm origin
+           git remote add origin "${remote_repo}"
+
+       - name: "Deploy Github Page"
+         continue-on-error: true
+         run: mkdocs gh-deploy --force
+
+   linkcheck:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v4
+
+       - name: Get changed files
+         id: changed-files
+         uses: step-security/[email protected]
+         with:
+           files: docs/**
+           files_separator: ","
+           separator: " "
+
+       - name: Set up Python ${{ env.PYTHON_VERSION }}
+         if: steps.changed-files.outputs.any_changed == 'true'
+         uses: actions/setup-python@v5
+         with:
+           python-version: ${{ env.PYTHON_VERSION }}
+
+       - name: Install Sphinx dependencies
+         if: steps.changed-files.outputs.any_changed == 'true'
+         run: python3 -m pip install -r requirements/requirements_docs.txt
+
+       - name: Linkcheck docs build
+         if: steps.changed-files.outputs.any_changed == 'true'
+         run: make -C docs linkcheck || true
+
+       - name: Eliminate false positives
+         if: steps.changed-files.outputs.any_changed == 'true'
+         run: ./docs/check_for_broken_links.sh
+
+       - name: Upload linkcheck output
+         if: steps.changed-files.outputs.any_changed == 'true'
+         uses: actions/upload-artifact@v4
+         with:
+           name: linkcheck-artifact
+           path: docs/build/linkcheck
+           if-no-files-found: error
+           retention-days: 7
.github/workflows/install-test.yml ADDED
@@ -0,0 +1,286 @@
+ name: CI-Install-Check
+
+ on:
+   pull_request:
+     paths:
+       - "**"
+
+ concurrency:
+   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+   cancel-in-progress: true
+
+ jobs:
+   test-installs-macos:
+     name: ${{ matrix.os }}-py${{ matrix.python }}-${{ matrix.installer }}
+     runs-on: ${{ matrix.os }}
+     strategy:
+       fail-fast: false
+       matrix:
+         os: [macos-latest]
+         python: ["3.10", "3.11", "3.12"]
+         installer: ["pip-install", "nemo-install"]
+     steps:
+       - name: Checkout repo
+         uses: actions/checkout@v2
+
+       - name: Check disk space before cleanup
+         run: df -h
+
+       - name: Free up disk space
+         run: |
+           # Remove unnecessary files on macOS
+           sudo rm -rf /usr/local/lib/android || true
+           sudo rm -rf /usr/local/.ghcup || true
+           sudo rm -rf /usr/local/lib/node_modules || true
+           brew cleanup || true
+           # Clear pip cache
+           pip cache purge || true
+
+       - name: Check disk space after cleanup
+         run: df -h
+
+       - uses: actions/setup-python@v5
+         with:
+           python-version: "${{ matrix.python }}"
+
+       - name: Install NeMo
+         env:
+           INSTALLER: ${{ matrix.installer }}
+           NEMO_TAG: ${{ github.sha }}
+           NEMO_REPO: ${{ github.server_url }}/${{ github.repository }}
+         run: |
+           if [[ "$INSTALLER" == "pip-install" ]]; then
+             pip install --no-cache-dir -U pip
+             pip install --no-cache-dir ".[all]"
+           else
+             export NEMO_TAG
+             export NEMO_REPO
+             export INSTALL_DIR=$(pwd)
+
+             bash docker/common/install_dep.sh --library "te,mcore,extra" --mode install
+             pip install --no-cache-dir ".[all]"
+           fi
+
+       - name: Check disk space after installation
+         run: df -h
+
+       - name: Run import checks
+         run: |
+           # Run import checks
+           for collection in "asr" "tts" "lightning" "core"; do
+             python tests/core_ptl/check_imports.py --domain "$collection"
+           done
+
+   test-installs-linux-amd:
+     name: ubuntu-22.04-amd-py${{ matrix.python }}-${{ matrix.installer }}
+     runs-on: ubuntu-22.04
+     strategy:
+       fail-fast: false
+       matrix:
+         python: ["3.10", "3.11", "3.12"]
+         installer: ["pip-install", "nemo-install"]
+     steps:
+       - name: Checkout repo
+         uses: actions/checkout@v2
+
+       - name: Check disk space before cleanup
+         run: df -h
+
+       - name: Free up disk space
+         run: |
+           # Remove unnecessary packages and files on Ubuntu
+           sudo apt-get clean
+           sudo rm -rf /usr/local/lib/android || true
+           sudo rm -rf /opt/ghc || true
+           sudo rm -rf /usr/local/.ghcup || true
+           sudo rm -rf /usr/share/dotnet || true
+           sudo rm -rf /opt/az || true
+           # Clear pip and npm caches
+           pip cache purge || true
+           sudo npm cache clean --force || true
+
+       - name: Check disk space after cleanup
+         run: df -h
+
+       - name: Install Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python }}
+
+       - name: Install NeMo
+         env:
+           INSTALLER: ${{ matrix.installer }}
+         run: |
+           if [ "$INSTALLER" = "pip-install" ]; then
+             pip install --no-cache-dir --upgrade pip
+             pip install --no-cache-dir ".[all]"
+           else
+             export INSTALL_DIR=$(pwd)
+             bash docker/common/install_dep.sh --library "te,mcore,extra" --mode install
+             pip install --no-cache-dir ".[all]"
+           fi
+
+       - name: Check disk space after installation
+         run: df -h
+
+       - name: Run import checks
+         run: |
+           # Run import checks
+           for collection in "asr" "tts" "lightning" "core"; do
+             python tests/core_ptl/check_imports.py --domain "$collection"
+           done
+
+   test-asr-install-linux-amd:
+     name: ubuntu-22.04-amd-py${{ matrix.python }}-asr
+     runs-on: ubuntu-22.04
+     strategy:
+       fail-fast: false
+       matrix:
+         python: ["3.10", "3.11", "3.12"]
+     steps:
+       - name: Checkout repo
+         uses: actions/checkout@v2
+
+       - name: Check disk space before cleanup
+         run: df -h
+
+       - name: Free up disk space
+         run: |
+           # Remove unnecessary packages and files on Ubuntu
+           sudo apt-get clean
+           sudo rm -rf /usr/local/lib/android || true
+           sudo rm -rf /opt/ghc || true
+           sudo rm -rf /usr/local/.ghcup || true
+           sudo rm -rf /usr/share/dotnet || true
+           sudo rm -rf /opt/az || true
+           # Clear pip and npm caches
+           pip cache purge || true
+           sudo npm cache clean --force || true
+
+       - name: Check disk space after cleanup
+         run: df -h
+
+       - name: Install Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python }}
+
+       - name: Install NeMo
+         run: |
+           pip install --no-cache-dir --upgrade pip
+           pip install --no-cache-dir ".[asr]"
+
+       - name: Check disk space after installation
+         run: df -h
+
+       - name: Run import checks
+         run: |
+           # Run import checks
+           python tests/core_ptl/check_imports.py --domain asr
+
+   test-installs-linux-arm:
+     name: ubuntu-22.04-arm-py${{ matrix.python }}-${{ matrix.installer }}
+     runs-on: ubuntu-22.04-arm
+     strategy:
+       fail-fast: false
+       matrix:
+         python: ["3.10", "3.11", "3.12"]
+         installer: ["pip-install", "nemo-install"]
+     steps:
+       - name: Checkout repo
+         uses: actions/checkout@v2
+
+       - name: Check disk space before cleanup
+         run: df -h
+
+       - name: Free up disk space
+         run: |
+           # Remove unnecessary packages and files on Ubuntu ARM
+           sudo apt-get clean
+           sudo rm -rf /usr/local/lib/android || true
+           sudo rm -rf /opt/ghc || true
+           sudo rm -rf /usr/local/.ghcup || true
+           sudo rm -rf /usr/share/dotnet || true
+           sudo rm -rf /opt/az || true
+           # Clear pip and npm caches
+           pip cache purge || true
+           sudo npm cache clean --force || true
+
+       - name: Check disk space after cleanup
+         run: df -h
+
+       - name: Install Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python }}
+
+       - name: Install NeMo
+         env:
+           INSTALLER: ${{ matrix.installer }}
+         run: |
+           if [ "$INSTALLER" = "pip-install" ]; then
+             pip install --no-cache-dir --upgrade pip
+             pip install --no-cache-dir ".[all]"
+           else
+             export INSTALL_DIR=$(pwd)
+             bash docker/common/install_dep.sh --library "te,mcore,extra" --mode install
+             pip install --no-cache-dir ".[all]"
+           fi
+
+       - name: Check disk space after installation
+         run: df -h
+
+       - name: Run import checks
+         run: |
+           # Run import checks
+           for collection in "asr" "tts" "lightning" "core"; do
+             python tests/core_ptl/check_imports.py --domain "$collection"
+           done
+
+   test-asr-installs-linux-arm:
+     name: ubuntu-22.04-arm-py${{ matrix.python }}-asr
+     runs-on: ubuntu-22.04-arm
+     strategy:
+       fail-fast: false
+       matrix:
+         python: ["3.10", "3.11", "3.12"]
+     steps:
+       - name: Checkout repo
+         uses: actions/checkout@v2
+
+       - name: Check disk space before cleanup
+         run: df -h
+
+       - name: Free up disk space
+         run: |
+           # Remove unnecessary packages and files on Ubuntu ARM
+           sudo apt-get clean
+           sudo rm -rf /usr/local/lib/android || true
+           sudo rm -rf /opt/ghc || true
+           sudo rm -rf /usr/local/.ghcup || true
+           sudo rm -rf /usr/share/dotnet || true
+           sudo rm -rf /opt/az || true
+           # Clear pip and npm caches
+           pip cache purge || true
+           sudo npm cache clean --force || true
+
+       - name: Check disk space after cleanup
+         run: df -h
+
+       - name: Install Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python }}
+
+       - name: Install NeMo
+         run: |
+           pip install --no-cache-dir --upgrade pip
+           pip install --no-cache-dir ".[asr]"
+
+       - name: Check disk space after installation
+         run: df -h
+
+       - name: Run import checks
+         run: |
+           # Run import checks
+           python tests/core_ptl/check_imports.py --domain asr
.github/workflows/labeler.yml ADDED
@@ -0,0 +1,14 @@
+ name: "Pull Request Labeler"
+ on:
+   - pull_request_target
+
+ jobs:
+   triage:
+     permissions:
+       contents: read
+       pull-requests: write
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/labeler@v4
+         with:
+           repo-token: "${{ secrets.GITHUB_TOKEN }}"
.github/workflows/mcore-tag-bump-bot.yml ADDED
@@ -0,0 +1,62 @@
+ # Regularly bumps the Megatron-LM and NeMo tags used in CI
+ name: Megatron Tag Bump Bot
+ on:
+   workflow_dispatch:
+   schedule:
+     - cron: 0 0 * * *
+
+ jobs:
+   get-release-branch-names:
+     runs-on: ubuntu-latest
+     outputs:
+       mcore: ${{ steps.get-branch.outputs.mcore_release_branch }}
+       nemo: ${{ steps.get-branch.outputs.nemo_release_branch }}
+     steps:
+       - name: Get release branch names
+         id: get-branch
+         run: |
+           latest_branch=$(git ls-remote --heads https://github.com/NVIDIA/Megatron-LM.git 'refs/heads/core_r*' |
+             grep -o 'core_r[0-9]\+\.[0-9]\+\.[0-9]\+' |
+             sort -V |
+             tail -n1)
+           echo "mcore_release_branch=$latest_branch" >> $GITHUB_OUTPUT
+
+           latest_branch=$(git ls-remote --heads https://github.com/NVIDIA/NeMo.git 'refs/heads/r*' |
+             grep -o 'r[0-9]\+\.[0-9]\+\.[0-9]\+' |
+             sort -V |
+             tail -n1)
+           echo "nemo_release_branch=$latest_branch" >> $GITHUB_OUTPUT
+
+   bump-tags:
+     needs: [get-release-branch-names]
+     strategy:
+       fail-fast: false
+       matrix:
+         include:
+           - nemo-target-branch: ${{ needs.get-release-branch-names.outputs.nemo }}
+             mcore-target-branch: ${{ needs.get-release-branch-names.outputs.mcore }}
+           - nemo-target-branch: main
+             mcore-target-branch: main
+     uses: ./.github/workflows/_bump_mcore_tag.yml
+     with:
+       nemo-target-branch: ${{ matrix.nemo-target-branch }}
+       mcore-target-branch: ${{ matrix.mcore-target-branch }}
+     secrets:
+       PAT: ${{ secrets.PAT }}
+
+   notify:
+     if: failure()
+     runs-on: ubuntu-latest
+     needs: [bump-tags]
+     steps:
+       - name: Notify
+         env:
+           SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+           SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_WEBHOOK_ADMIN }}>
+           GITHUB_RUN_ID: ${{ github.run_id }}
+           GITHUB_REPOSITORY: ${{ github.repository }}
+         run: |
+           curl -X POST \
+             -H 'Content-type: application/json' \
+             --data "{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Mcore-bump-bot workflow> failed. Please fix manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \
+             $SLACK_WEBHOOK
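
The `git ls-remote | grep -o ... | sort -V | tail -n1` pipeline above picks the highest `core_rX.Y.Z` release branch by version sort. An equivalent sketch in Python (the function name and sample refs are ours, for illustration):

import re

def latest_release_branch(refs: list[str]) -> str:
    # Parse each ref's X.Y.Z into an integer tuple and take the maximum,
    # which is what `sort -V | tail -n1` does for these branch names.
    versions = []
    for ref in refs:
        m = re.search(r"core_r(\d+)\.(\d+)\.(\d+)", ref)
        if m:
            versions.append((tuple(map(int, m.groups())), m.group(0)))
    return max(versions)[1]

refs = ["refs/heads/core_r0.9.0", "refs/heads/core_r0.10.0", "refs/heads/core_r0.2.1"]
print(latest_release_branch(refs))  # core_r0.10.0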
.github/workflows/monitor-single-vm.yml ADDED
@@ -0,0 +1,54 @@
+ name: ~shut down a single VM
+
+ on:
+   workflow_call:
+     inputs:
+       vm:
+         type: string
+         description: Name of VM
+         required: true
+       n_gpus:
+         type: string
+         description: Number of GPUs this VM has
+         required: true
+
+ jobs:
+   check-status-and-maybe-shutdown:
+     environment: main
+     runs-on: ${{ inputs.vm }}
+     outputs:
+       status: ${{ steps.status.outputs.main }}
+     steps:
+       - name: Check status
+         id: status
+         run: |
+           docker run --rm --runtime=nvidia --gpus ${{ inputs.n_gpus }} ubuntu nvidia-smi
+
+           NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+
+           if [[ $NUM_GPUS -ne ${{ inputs.n_gpus }} ]]; then
+             echo "Issues with GPU detected, will take this runner offline."
+             echo "main=degraded" >> "$GITHUB_OUTPUT"
+           else
+             echo "main=healthy" >> "$GITHUB_OUTPUT"
+           fi
+
+       - name: Send Slack message & Disconnect runner from GitHub
+         if: ${{ steps.status.outputs.main == 'degraded' || failure() }}
+         run: |
+           MESSAGE='{
+             "blocks": [
+               {
+                 "type": "section",
+                 "text": {
+                   "type": "mrkdwn",
+                   "text": ":alert: VM bot 🤖: Hey <!subteam^${{ secrets.SLACK_WEBHOOK_ADMIN }}>: VM `${{ inputs.vm }}` is having not the best day of their life, maybe bring them an apple or so."
+                 }
+               }
+             ]
+           }'
+
+           curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${{ secrets.SLACK_WEBHOOK }}
+
+           cd /home/azureuser/actions-runner
+           echo ${{ secrets.VM_KEY }} | sudo -S ./svc.sh stop
.github/workflows/monitor-vms.yml ADDED
@@ -0,0 +1,54 @@
+ # Regularly checks the self-hosted VMs and takes degraded runners offline
+ name: Reboots VMs in a controlled way
+ on:
+   schedule:
+     - cron: 0/15 * * * *
+   workflow_dispatch:
+
+ jobs:
+   pre-flight:
+     runs-on: ubuntu-latest
+     if: github.repository_owner == 'NVIDIA'
+     outputs:
+       list-of-vms: ${{ steps.main.outputs.main }}
+     environment: main
+     steps:
+       - name: Get list of VMs
+         id: main
+         env:
+           GITHUB_TOKEN: ${{ secrets.PAT }}
+         run: |
+           RUNNERS=$(curl -L \
+             -H "Accept: application/vnd.github+json" \
+             -H "Authorization: Bearer $GITHUB_TOKEN" \
+             -H "X-GitHub-Api-Version: 2022-11-28" \
+             https://api.github.com/repos/NVIDIA/NeMo/actions/runners)
+
+           MATRIX=$(echo $RUNNERS \
+             | jq -c '[
+                 .runners[]
+                 | select(.status == "online")
+                 | select(.name | contains("cpu") | not)
+                 | {
+                     "vm": .name,
+                     "n_gpus": [
+                       .labels[]
+                       | select(.name | endswith("gpu")) | .name
+                     ][0][:1]
+                   }
+               ]
+             '
+           )
+           echo main=$MATRIX | tee -a "$GITHUB_OUTPUT"
+
+   maintenance:
+     needs: pre-flight
+     strategy:
+       fail-fast: false
+       matrix:
+         include: ${{ fromJSON(needs.pre-flight.outputs.list-of-vms) }}
+     uses: ./.github/workflows/monitor-single-vm.yml
+     with:
+       vm: ${{ matrix.vm }}
+       n_gpus: ${{ matrix.n_gpus }}
+     secrets: inherit # pragma: allowlist secret
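
The jq query in pre-flight keeps online, non-CPU runners and derives `n_gpus` from the first character of a label ending in `gpu` (e.g. `2gpu`). An approximate Python rendering of that matrix construction, for illustration only:

def build_matrix(runners: list[dict]) -> list[dict]:
    # Keep online GPU runners; read the GPU count off a label like "2gpu".
    matrix = []
    for runner in runners:
        if runner["status"] != "online" or "cpu" in runner["name"]:
            continue
        gpu_labels = [l["name"] for l in runner["labels"] if l["name"].endswith("gpu")]
        if gpu_labels:
            matrix.append({"vm": runner["name"], "n_gpus": gpu_labels[0][:1]})
    return matrix

runners = [
    {"name": "azure-gpu-vm-1", "status": "online",
     "labels": [{"name": "self-hosted"}, {"name": "2gpu"}]},
    {"name": "azure-cpu-vm-1", "status": "online", "labels": [{"name": "cpu"}]},
]
print(build_matrix(runners))  # [{'vm': 'azure-gpu-vm-1', 'n_gpus': '2'}]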
.github/workflows/release-freeze.yml ADDED
@@ -0,0 +1,85 @@
+ name: "Code freeze"
+
+ on:
+   workflow_dispatch:
+     inputs:
+       type_of_release:
+         type: choice
+         description: Type of release
+         options:
+           - major
+           - minor
+       freeze-commit:
+         type: string
+         description: Commit SHA to use for cut-off
+         required: false
+         default: main
+       mcore_version:
+         description: "Version of MCore to use (must be a valid git ref)"
+         required: true
+         type: string
+       dry-run:
+         type: boolean
+         description: Dry-run of code-freeze
+         required: false
+         default: true
+
+ jobs:
+   code-freeze:
+     uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
+     with:
+       library-name: NeMo-Toolkit
+       python-package: nemo
+       release-type: ${{ inputs.type_of_release }}
+       freeze-commit: ${{ inputs.freeze-commit }}
+       dry-run: ${{ inputs.dry-run }}
+       use-pat: true
+     secrets:
+       SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
+       SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
+       PAT: ${{ secrets.PAT }}
+
+   freeze-tags:
+     runs-on: ubuntu-latest
+     needs: [code-freeze]
+     environment: main
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v4
+         with:
+           path: ${{ github.run_id }}
+           token: ${{ secrets.PAT }}
+           fetch-depth: 0
+           fetch-tags: true
+           ref: ${{ inputs.dry-run == true && inputs.freeze-commit || needs.code-freeze.outputs.release-branch }}
+
+       - name: Pin branch name in Notebooks
+         run: |
+           cd ${{ github.run_id }}
+           find tutorials -type f -name "*.ipynb" -exec sed -i "s/BRANCH = 'main'/BRANCH = '${{ needs.code-freeze.outputs.release-branch }}'/g" {} +
+
+       - name: Pin MCore in Dockerfile
+         run: |
+           cd ${{ github.run_id }}
+           sed -i 's/^ARG MCORE_TAG=.*$/ARG MCORE_TAG=${{ inputs.mcore_version }}/' docker/Dockerfile.ci
+
+       - name: Show status
+         run: |
+           cd ${{ github.run_id }}
+           git status
+
+       - name: Create PR
+         uses: peter-evans/create-pull-request@v6
+         id: create-pull-request
+         if: ${{ inputs.dry-run != true }}
+         with:
+           path: ${{ github.run_id }}
+           base: ${{ needs.code-freeze.outputs.release-branch }}
+           branch: ci/freeze-tags-${{ needs.code-freeze.outputs.release-branch }}
+           title: "Freeze tags in `${{ needs.code-freeze.outputs.release-branch }}`"
+           body: |
+             🚀 PR to freeze tags in `${{ needs.code-freeze.outputs.release-branch }}`.
+
+           commit-message: "[🤠]: Howdy folks, let's release NeMo `${{ needs.code-freeze.outputs.release-branch }}` !"
+           signoff: true
+           assignees: okoenig
.github/workflows/release.yml ADDED
@@ -0,0 +1,48 @@
+ # Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ name: "Release Neural Modules"
+
+ on:
+   workflow_dispatch:
+     inputs:
+       release-ref:
+         description: Ref (SHA or branch name) to release
+         required: true
+         type: string
+       version-bump-branch:
+         description: Branch for version bump
+         required: true
+         type: string
+       dry-run:
+         description: Do not publish a wheel or GitHub release.
+         required: true
+         default: true
+         type: boolean
+
+ jobs:
+   release:
+     uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
+     with:
+       release-ref: ${{ inputs.release-ref }}
+       python-package: nemo
+       python-version: "3.10"
+       library-name: Neural Modules
+       dry-run: ${{ inputs.dry-run }}
+       version-bump-branch: ${{ inputs.version-bump-branch }}
+     secrets:
+       TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
+       TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
+       SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
+       SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
+       PAT: ${{ secrets.PAT }}
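Similarly, a hedged sketch of dispatching a dry-run release; the ref and branch names here are placeholders:

    gh workflow run release.yml \
      --repo NVIDIA/NeMo \
      -f release-ref=r9.9.9 \
      -f version-bump-branch=ci/bump-after-r9.9.9 \
      -f dry-run=true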
.github/workflows/secrets-detector.yml ADDED
@@ -0,0 +1,43 @@
+ # Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ name: Secrets detector
+
+ on:
+   pull_request_target:
+     branches:
+       - 'main'
+
+ jobs:
+   main:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v4
+         with:
+           fetch-depth: 0
+           token: ${{ secrets.NEMO_REFORMAT_TOKEN }}
+
+       - name: Install secrets detector
+         run: pip install detect-secrets
+
+       - name: Run on change-set
+         run: |
+           git diff --name-only --diff-filter=d --merge-base origin/main -z | xargs -0 detect-secrets-hook --disable-plugin HexHighEntropyString --baseline .secrets.baseline
+
+       - uses: EndBug/add-and-commit@v9
+         # Commit changes; nothing is committed if there are none.
+         if: always()
+         with:
+           message: Update baseline
+           commit: --signoff
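The same check can be reproduced locally before pushing (a sketch assuming `origin/main` is the upstream base and a git version that supports `--merge-base`, i.e. 2.30+):

    pip install detect-secrets
    git diff --name-only --diff-filter=d --merge-base origin/main -z \
      | xargs -0 detect-secrets-hook --disable-plugin HexHighEntropyString --baseline .secrets.baseline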
.github/workflows/update-buildcache.yml ADDED
@@ -0,0 +1,110 @@
+ # Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ name: Update build cache
+ on:
+   schedule:
+     - cron: 0 0 * * *
+   push:
+     branches:
+       - main
+   workflow_dispatch:
+     inputs:
+       runner:
+         required: false
+         default: self-hosted-azure-builder
+         type: string
+         description: VM to use for build
+
+ jobs:
+   pre-flight:
+     runs-on: ubuntu-latest
+     outputs:
+       build_args: ${{ steps.manifest.outputs.BUILD_ARGS }}
+       cache-from: ${{ steps.cache_from.outputs.LAST_PRS }}
+     steps:
+       - name: Checkout branch
+         uses: actions/checkout@v4
+
+       - name: Parse manifest.json
+         id: manifest
+         run: |
+           BUILD_ARGS=$(cat << EOF
+           BASE_IMAGE=$(cat requirements/manifest.json | jq -r '."ngc-pytorch"')
+           TRTLLM_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."trt-llm".repo')
+           TRTLLM_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."trt-llm".ref')
+           MLM_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."megatron-lm".repo')
+           MLM_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."megatron-lm".ref')
+           TE_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".transformer_engine.repo')
+           TE_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".transformer_engine.ref')
+           APEX_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".apex.repo')
+           APEX_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".apex.ref')
+           EOF
+           )
+
+           echo "BUILD_ARGS<<EOF" >> $GITHUB_OUTPUT
+           echo "$BUILD_ARGS" >> $GITHUB_OUTPUT
+           echo "EOF" >> $GITHUB_OUTPUT
+
+       - name: Get last merged PRs
+         id: cache_from
+         env:
+           GH_TOKEN: ${{ github.token }}
+         run: |
+           LAST_PRS=$(gh api graphql -f query='
+             query {
+               repository(owner: "NVIDIA", name: "NeMo") {
+                 pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) {
+                   nodes {
+                     number
+                   }
+                 }
+               }
+             }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do
+             echo "nemoci.azurecr.io/nemo_container-buildcache:$number"
+           done)
+
+           echo "LAST_PRS<<EOF" >> $GITHUB_OUTPUT
+           echo "$LAST_PRS" >> $GITHUB_OUTPUT
+           echo "EOF" >> $GITHUB_OUTPUT
+
+   cicd-test-container-build:
+     needs: [pre-flight]
+     uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
+     strategy:
+       fail-fast: false
+       matrix:
+         include:
+           - dockerfile: docker/Dockerfile.ci
+             image-name: nemo_container_automodel
+           - dockerfile: docker/Dockerfile.ci
+             image-name: nemo_container_nemo2
+           - dockerfile: docker/Dockerfile.ci
+             image-name: nemo_container_speech
+           - dockerfile: docker/Dockerfile.ci
+             image-name: nemo_container
+     with:
+       image-name: ${{ matrix.image-name }}
+       dockerfile: ${{ matrix.dockerfile }}
+       image-label: nemo-core
+       build-args: |
+         IMAGE_LABEL=nemo-core
+         NEMO_TAG=${{ github.sha }}
+         NEMO_REPO=https://github.com/NVIDIA/NeMo
+         ${{ needs.pre-flight.outputs.build_args }}
+       runner: ${{ inputs.runner || 'self-hosted-azure-builder' }}
+       use-inline-cache: false
+       prune-filter-timerange: 24h
+       cache-from: |
+         nemoci.azurecr.io/${{ matrix.image-name }}-buildcache:main
+         ${{ needs.pre-flight.outputs.cache-from }}
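For reference, a minimal sketch of the `requirements/manifest.json` shape the jq queries in `pre-flight` assume; all repo URLs and refs below are placeholders, not the real pins:

    # Hypothetical manifest contents; the URLs and refs are illustrative only.
    MANIFEST='{"ngc-pytorch": "nvcr.io/nvidia/pytorch:99.99-py3",
      "vcs-dependencies": {
        "trt-llm": {"repo": "https://github.com/example/trt-llm", "ref": "deadbeef"},
        "megatron-lm": {"repo": "https://github.com/example/megatron-lm", "ref": "deadbeef"},
        "transformer_engine": {"repo": "https://github.com/example/te", "ref": "deadbeef"},
        "apex": {"repo": "https://github.com/example/apex", "ref": "deadbeef"}}}'
    # Same jq paths as the "Parse manifest.json" step above:
    echo "$MANIFEST" | jq -r '."ngc-pytorch"'                          # base image
    echo "$MANIFEST" | jq -r '."vcs-dependencies"."megatron-lm".ref'   # MLM_TAG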