subhankarg committed on
Commit 0558aa4 · verified · 1 Parent(s): 5b5c9b5

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the full set.

Files changed (50)
  1. .coveragerc +36 -0
  2. .dockerignore +19 -0
  3. .flake8 +9 -0
  4. .flake8.other +9 -0
  5. .flake8.speech +9 -0
  6. .gitattributes +88 -0
  7. .github/CODEOWNERS +4 -0
  8. .github/ISSUE_TEMPLATE/bug_report.md +42 -0
  9. .github/ISSUE_TEMPLATE/config.yml +2 -0
  10. .github/ISSUE_TEMPLATE/dev_container_bug_report.md +35 -0
  11. .github/ISSUE_TEMPLATE/feature_request.md +25 -0
  12. .github/PULL_REQUEST_TEMPLATE.md +57 -0
  13. .github/actions/cancel-workflow/action.yml +25 -0
  14. .github/actions/test-template/action.yml +227 -0
  15. .github/labeler.yml +55 -0
  16. .github/scripts/__init__.py +0 -0
  17. .github/scripts/components_to_run.py +84 -0
  18. .github/scripts/nemo_dependencies.py +400 -0
  19. .github/scripts/notify.py +79 -0
  20. .github/workflows/_build_container.yml +89 -0
  21. .github/workflows/_bump_mcore_tag.yml +56 -0
  22. .github/workflows/build-test-publish-wheel.yml +38 -0
  23. .github/workflows/changelog-build.yml +123 -0
  24. .github/workflows/cherry-pick-release-commit.yml +14 -0
  25. .github/workflows/cicd-approve-test-queue.yml +175 -0
  26. .github/workflows/cicd-main-nemo2.yml +299 -0
  27. .github/workflows/cicd-main-speech.yml +216 -0
  28. .github/workflows/cicd-main-testcopy.yml +472 -0
  29. .github/workflows/cicd-main-unit-tests.yml +212 -0
  30. .github/workflows/cicd-main.yml +450 -0
  31. .github/workflows/cicd-relabel-bot.yml +36 -0
  32. .github/workflows/close-inactive-issue-pr.yml +25 -0
  33. .github/workflows/code-formatting.yml +73 -0
  34. .github/workflows/code-init-file-checker.yml +23 -0
  35. .github/workflows/code-linting.yml +160 -0
  36. .github/workflows/codeql.yml +75 -0
  37. .github/workflows/community-bot.yml +15 -0
  38. .github/workflows/config/changelog-config.json +134 -0
  39. .github/workflows/config/codeql.yml +9 -0
  40. .github/workflows/copyright-check.yml +22 -0
  41. .github/workflows/gh-docs.yml +81 -0
  42. .github/workflows/install-test.yml +286 -0
  43. .github/workflows/labeler.yml +14 -0
  44. .github/workflows/mcore-tag-bump-bot.yml +62 -0
  45. .github/workflows/monitor-single-vm.yml +54 -0
  46. .github/workflows/monitor-vms.yml +54 -0
  47. .github/workflows/release-freeze.yml +85 -0
  48. .github/workflows/release.yml +48 -0
  49. .github/workflows/secrets-detector.yml +43 -0
  50. .github/workflows/update-buildcache.yml +110 -0
.coveragerc ADDED
@@ -0,0 +1,36 @@
+ [run]
+ concurrency = thread,multiprocessing
+ omit =
+     /tmp/*
+     /home/TestData/*
+     /workspace/Megatron-LM/*
+     nemo/collections/multimodal/*
+     nemo/collections/multimodal_autoregressive/*
+     nemo/collections/vision/*
+     nemo/collections/diffusion/*
+     nemo/collections/nlp/*
+
+     nemo/collections/asr/*
+     nemo/collections/speechlm/*
+     nemo/collections/tts/*
+
+     # omit from audio
+     nemo/collections/audio/data/data_simulation.py
+     nemo/collections/audio/metrics/squim.py
+     nemo/collections/audio/losses/maxine/*
+     nemo/collections/audio/models/maxine/*
+     nemo/collections/audio/parts/utils/maxine.py
+
+     nemo/core/*
+     nemo/collections/common/*
+
+     /workspace/config-3.12.py
+     /workspace/config-3.py
+     /workspace/config.py
+
+ [paths]
+ source =
+     nemo/
+     /home/runner/work/NeMo/NeMo/nemo
+     /workspace/nemo
+
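For orientation, a minimal sketch of how a CI job might exercise this config; the test path and commands are illustrative, not taken from this commit:

    # `coverage` reads .coveragerc from the working directory by default.
    coverage run -m pytest tests/collections/audio   # hypothetical test target
    coverage combine   # merge per-process data files (thread,multiprocessing above)
    coverage xml       # emit coverage.xml with the omit globs applied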
.dockerignore ADDED
@@ -0,0 +1,19 @@
+ __pycache__
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+ env
+ pip-log.txt
+ pip-delete-this-directory.txt
+ .tox
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *,cover
+ *.log
+ .git
+ **/*.nemo
+ **/*.ckpt
.flake8 ADDED
@@ -0,0 +1,9 @@
+ [flake8]
+ max-line-length = 119
+ select =
+     F541, # f-string without any placeholders
+     F841, # local variable 'x' is assigned to but never used
+     F401, # 'x' imported but unused
+     E741, # ambiguous variable name 'l'
+     F821, # undefined name 'x'
+     E266, # too many leading '#' for block comment
.flake8.other ADDED
@@ -0,0 +1,9 @@
+ [flake8]
+ max-line-length = 119
+ select =
+     F541, # f-string without any placeholders
+     F841, # local variable 'x' is assigned to but never used
+     F401, # 'x' imported but unused
+     E741, # ambiguous variable name 'l'
+     F821, # undefined name 'x'
+     E266, # too many leading '#' for block comment
.flake8.speech ADDED
@@ -0,0 +1,9 @@
+ [flake8]
+ max-line-length = 119
+ select =
+     F541, # f-string without any placeholders
+     F841, # local variable 'x' is assigned to but never used
+     F401, # 'x' imported but unused
+     E741, # ambiguous variable name 'l'
+     F821, # undefined name 'x'
+     E266, # too many leading '#' for block comment
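The three flake8 configs are identical; presumably they let separate CI jobs lint different slices of the tree. A hedged usage sketch (the paths are illustrative):

    flake8 --config .flake8 nemo/
    flake8 --config .flake8.speech nemo/collections/asr/   # assumed speech-only lint job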
.gitattributes CHANGED
@@ -33,3 +33,91 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/images/citrinet_vertical.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/images/conf-ensembles-overview.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/images/conformer_ctc.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/images/hat.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/images/hybrid_asr_tts_model.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/images/jasper_vertical.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/images/quartz_vertical.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/images/squeezeformer.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/aosc_3spk_example.gif filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/asr_sd_diagram.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/ats.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/cache_fifo_chunk.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/data_flow.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/e2e_and_cascaded_diar_systems.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/intro_comparison.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/loss_types.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/main_dataflow.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/ms_trade_off.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/msdd_train_and_infer.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/scale_weight_cnn.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/sortformer.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/streaming_steps.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_diarization/images/weighted_sum.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_recognition/images/ICASPP_SpeakerNet.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speaker_recognition/images/titanet_network.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speech_classification/images/marblenet_vertical.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speech_classification/images/matchboxnet_vertical.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/asr/speech_intent_slot/images/example.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/core/whyntypes.gif filter=lfs diff=lfs merge=lfs -text
+ docs/source/nlp/entity_linking_overview.jpg filter=lfs diff=lfs merge=lfs -text
+ docs/source/nlp/nemo_megatron/customization_forward.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/nlp/nemo_megatron/customization_module.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/nlp/nemo_megatron/images/ddp.gif filter=lfs diff=lfs merge=lfs -text
+ docs/source/nlp/nemo_megatron/images/pnom.gif filter=lfs diff=lfs merge=lfs -text
+ docs/source/nlp/nemo_megatron/images/pp.gif filter=lfs diff=lfs merge=lfs -text
+ docs/source/nlp/nemo_megatron/images/pp_comm_overlap.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/nlp/nemo_megatron/images/tp1.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/nlp/nemo_megatron/images/tp2.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/nlp/nemo_megatron/images/tp_comm_overlap.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tools/images/scrsh_2.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tools/images/scrsh_9.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tools/images/sde_mls_player.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tools/images/sde_player.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tools/images/sde_samples.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/audiocodec_model.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/data_labeling_pipeline.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/fastpitch_model.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/hifigan_d_model.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/hifigan_g_model.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/mixertts_model.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/radaligner_model.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/radtts_model.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/tacotron2_model.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/univnet_model.png filter=lfs diff=lfs merge=lfs -text
+ docs/source/tts/images/waveglow_model.png filter=lfs diff=lfs merge=lfs -text
+ nemo/collections/diffusion/assets/mixed_training.png filter=lfs diff=lfs merge=lfs -text
+ nemo/collections/diffusion/assets/pipeline_conditioning.png filter=lfs diff=lfs merge=lfs -text
+ nemo/collections/diffusion/assets/st_dit_hybrid_parallel.png filter=lfs diff=lfs merge=lfs -text
+ output_audio_context.wav filter=lfs diff=lfs merge=lfs -text
+ output_baked.wav filter=lfs diff=lfs merge=lfs -text
+ tools/speech_data_explorer/screenshot.png filter=lfs diff=lfs merge=lfs -text
+ tools/speech_data_simulator/pictures/audio_session.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/asr/images/canary2_timestamps.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/asr/images/multi_instance.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/asr/images/multilang_asr_inference.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/asr/images/multilang_asr_train.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/asr/images/promptformat.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/asr/images/speaker_injection.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/asr/images/test_wer_wandb.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/asr/images/tokenizer.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/llm/llama/domain-adaptive-pretraining/code/imgs/tokenization_diagram.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/llm/qwen/pruning-distillation/imgs/val_loss_comparison.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/affinity_matrix_fusion.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/ats.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/cache_fifo_chunk.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/cascaded_diar_diagram.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/intro_comparison.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/loss_types.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/main_dataflow.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/msdd_inputs.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/msdd_output_loss.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/msdd_train_and_infer.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/multiscale_example.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/sortformer.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/speaker_tasks/images/streaming_steps.png filter=lfs diff=lfs merge=lfs -text
+ tutorials/tts/audio_samples/new_dict_entry.wav filter=lfs diff=lfs merge=lfs -text
+ tutorials/tts/audio_samples/phonemes_as_input.wav filter=lfs diff=lfs merge=lfs -text
+ tutorials/tts/images/tacotron2_diagram.png filter=lfs diff=lfs merge=lfs -text
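As a quick sanity check that a newly tracked path resolves to the LFS filter, something like this should work in a checkout (the path is taken from the list above):

    # Prints "filter: lfs" for a matching path.
    git check-attr filter docs/source/asr/images/conformer_ctc.png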
.github/CODEOWNERS ADDED
@@ -0,0 +1,4 @@
+ .github/ @pablo-garay @ko3n1g @thomasdhc @chtruong814
+ docker/Dockerfile.ci @pablo-garay @ko3n1g @thomasdhc @chtruong814
+ .pylintrc.* @pablo-garay @ko3n1g @thomasdhc @chtruong814
+ .flake8.* @pablo-garay @ko3n1g @thomasdhc @chtruong814
.github/ISSUE_TEMPLATE/bug_report.md ADDED
@@ -0,0 +1,42 @@
+ ---
+ name: Bug report
+ about: Create a report to help us improve
+ title: ''
+ labels: bug
+ assignees: ''
+
+ ---
+
+ **Describe the bug**
+
+ A clear and concise description of what the bug is.
+
+ **Steps/Code to reproduce bug**
+
+ Please list *minimal* steps or a code snippet for us to be able to reproduce the bug.
+
+ A helpful guide on how to craft a minimal bug report: http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports.
+
+
+ **Expected behavior**
+
+ A clear and concise description of what you expected to happen.
+
+ **Environment overview (please complete the following information)**
+
+ - Environment location: [Bare-metal, Docker, Cloud (specify cloud provider - AWS, Azure, GCP, Colab)]
+ - Method of NeMo install: [pip install or from source]. Please specify the exact commands you used to install.
+ - If the method of install is [Docker], provide the `docker pull` & `docker run` commands used
+
+ **Environment details**
+
+ If an NVIDIA docker image is used you don't need to specify these.
+ Otherwise, please provide:
+ - OS version
+ - PyTorch version
+ - Python version
+
+ **Additional context**
+
+ Add any other context about the problem here.
+ Example: GPU model
.github/ISSUE_TEMPLATE/config.yml ADDED
@@ -0,0 +1,2 @@
+ blank_issues_enabled: false
+
.github/ISSUE_TEMPLATE/dev_container_bug_report.md ADDED
@@ -0,0 +1,35 @@
+ ---
+ container pulled on date: mm/dd/yyyy
+ name: Dev container - Bug report
+ about: Create a report to help us improve
+ title: ''
+ labels: bug
+ assignees: ''
+
+ ---
+
+ **Describe the bug**
+
+ A clear and concise description of what the bug is.
+
+ **Steps/Code to reproduce bug**
+
+ Please list *minimal* steps or a code snippet for us to be able to reproduce the bug.
+
+ A helpful guide on how to craft a minimal bug report: http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports.
+
+
+ **Expected behavior**
+
+ A clear and concise description of what you expected to happen.
+
+ **Environment overview (please complete the following information)**
+
+ - Environment location: Docker
+ - Method of install: Please specify the exact commands you used to install.
+ - If the method of install is [Docker], provide the `docker pull` & `docker run` commands used
+
+ **Additional context**
+
+ Add any other context about the problem here.
+ Example: GPU model
.github/ISSUE_TEMPLATE/feature_request.md ADDED
@@ -0,0 +1,25 @@
+ ---
+ name: Feature request
+ about: Suggest an idea for this project
+ title: ''
+ labels: feature request
+ assignees: okuchaiev
+
+ ---
+
+ **Is your feature request related to a problem? Please describe.**
+
+ A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+ **Describe the solution you'd like**
+
+ A clear and concise description of what you want to happen.
+ Provide a code snippet on how new APIs/changes would be used by others.
+
+ **Describe alternatives you've considered**
+
+ A clear and concise description of any alternative solutions or features you've considered.
+
+ **Additional context**
+
+ Add any other context or screenshots about the feature request here.
.github/PULL_REQUEST_TEMPLATE.md ADDED
@@ -0,0 +1,57 @@
+ > [!IMPORTANT]
+ > The `Update branch` button must only be pressed on very rare occasions.
+ > An outdated branch is never blocking the merge of a PR.
+ > Please reach out to the automation team before pressing that button.
+
+ # What does this PR do ?
+
+ Add a one-line overview of what this PR aims to accomplish.
+
+ **Collection**: [Note which collection this PR will affect]
+
+ # Changelog
+
+ - Add specific line-by-line info of high-level changes in this PR.
+
+ # Usage
+
+ - You can potentially add a usage example below
+
+ ```python
+ # Add a code snippet demonstrating how to use this
+ ```
+
+ # GitHub Actions CI
+
+ The Jenkins CI system has been replaced by GitHub Actions self-hosted runners.
+
+ The GitHub Actions CI will run automatically when the "Run CICD" label is added to the PR.
+ To re-run CI, remove and add the label again.
+ To run CI on an untrusted fork, a NeMo user with write access must first click "Approve and run".
+
+ # Before your PR is "Ready for review"
+
+ **Pre checks**:
+
+ - [ ] Make sure you read and followed [Contributor guidelines](https://github.com/NVIDIA/NeMo/blob/main/CONTRIBUTING.md)
+ - [ ] Did you write any new necessary tests?
+ - [ ] Did you add or update any necessary documentation?
+ - [ ] Does the PR affect components that are optional to install? (Ex: Numba, Pynini, Apex etc)
+ - [ ] Reviewer: Does the PR have correct import guards for all optional libraries?
+
+ **PR Type**:
+
+ - [ ] New Feature
+ - [ ] Bugfix
+ - [ ] Documentation
+
+ If you haven't finished some of the above items you can still open a "Draft" PR.
+
+ ## Who can review?
+
+ Anyone in the community is free to review the PR once the checks have passed.
+ The [Contributor guidelines](https://github.com/NVIDIA/NeMo/blob/main/CONTRIBUTING.md) list specific people who can review PRs in various areas.
+
+ # Additional Information
+
+ - Related to # (issue)
.github/actions/cancel-workflow/action.yml ADDED
@@ -0,0 +1,25 @@
+ name: Cancel Workflow
+ description: >
+   Cancels the current workflow run, i.e. all jobs. Useful if you want to cancel the rest of the workflow when one job
+   fails. Note that this will cause the workflow to appear cancelled, not failed.
+
+ # Cancelling the workflow in a post-script (like this:
+ # https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runspost; can also be done with
+ # this action: https://github.com/webiny/action-post-run, see Git history of this file) wouldn't help the status, it
+ # would still be cancelled. It actually indeed is, but it would be nicer to set it to failed, but there seems to be no
+ # way to do this.
+
+ runs:
+   using: "composite"
+   steps:
+     - name: Cancel Workflow
+       # # Fork PRs won't have a token with write access to Actions, thus won't be able to cancel the workflow.
+       # if: github.event.pull_request == '' || github.event.pull_request.head.repo.fork == false
+       shell: bash
+       run: |
+         curl --verbose \
+           -X POST \
+           -H "Accept: application/vnd.github+json" \
+           -H "Authorization: Bearer ${{ github.token }}" \
+           -H "X-GitHub-Api-Version: 2022-11-28" \
+           https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/cancel
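For reference, the curl call above should be equivalent to this `gh` invocation; the run id is a placeholder and a token with Actions write access is assumed:

    # Hypothetical manual equivalent of the action's API call.
    gh api -X POST "repos/NVIDIA/NeMo/actions/runs/<run-id>/cancel"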
.github/actions/test-template/action.yml ADDED
@@ -0,0 +1,227 @@
+ # Copyright (c) 2025, NVIDIA CORPORATION.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ name: "Test Template"
+ description: "Template for running NeMo tests in a containerized environment"
+
+ inputs:
+   runner:
+     description: "Runner to use for test"
+     required: true
+   timeout:
+     description: "Max runtime of test in minutes"
+     required: false
+     default: "10"
+   script:
+     description: "Test script to execute"
+     required: true
+   after_script:
+     description: "Script to run after main test"
+     required: false
+     default: ":"
+   is_optional:
+     description: "Failure will cancel all other tests if set to true"
+     required: false
+     default: "false"
+   is_unit_test:
+     description: "Upload coverage as unit test"
+     required: false
+     default: "false"
+   tests_to_run:
+     description: "Tests to run"
+     required: false
+     default: '["all"]'
+   image:
+     description: "Image to use for test"
+     required: false
+     default: "nemo_container"
+   cpu-only:
+     description: "Run tests on CPU only"
+     required: false
+     default: "false"
+ runs:
+   using: "composite"
+   steps:
+     - name: Noop
+       shell: bash
+       run: |
+         chmod -R u+rwX ${{ github.run_id }}
+         echo "noop"
+
+     - name: Docker system cleanup
+       shell: bash
+       run: |
+         docker system prune -af --filter "until=24h" --filter "label!=nemo.pr_number=${{ github.event.pull_request.number || 0 }}" --force || true
+
+     - name: Docker pull image
+       shell: bash
+       run: |
+         docker pull nemoci.azurecr.io/${{ inputs.image }}:${{ github.run_id }}
+
+     - name: Clean repos
+       shell: bash
+       run: |
+
+     - name: Create UUID
+       id: uuid
+       shell: bash
+       run: |
+         echo "id=$(uuidgen)" >> "$GITHUB_OUTPUT"
+
+     - name: Checkout NeMo
+       uses: actions/checkout@v4
+       env:
+         DIR: ${{ github.run_id }}
+       with:
+         path: ${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo
+
+     - name: Start container
+       shell: bash
+       env:
+         DIR: ${{ github.run_id }}
+       run: |
+         mkdir -p $DIR
+
+         # Map of runner names to GPU device configurations
+         declare -A GPU_CONFIGS=(
+           ["myVm-01"]="0,1"
+           ["myVm-02"]="2,3"
+           ["myVm-03"]="4,5"
+           ["myVm-04"]="6,7"
+         )
+
+         ARG=("")
+         if [[ "${{ inputs.cpu-only }}" == "false" ]]; then
+           ARG=("--runtime=nvidia --gpus all")
+         fi
+
+         cmd=$(cat <<RUN_TEST_EOF
+         #!/bin/bash
+         docker container rm -f nemo_container_${{ github.run_id }}_${{ inputs.runner }} || true
+         docker run \
+           --rm \
+           -d \
+           --name nemo_container_${{ github.run_id }}_${{ inputs.runner }} ${ARG[@]} \
+           --shm-size=64g \
+           --env TRANSFORMERS_OFFLINE=0 \
+           --env HYDRA_FULL_ERROR=1 \
+           --env HF_HOME=/home/TestData/HF_HOME \
+           --env RUN_ID=${{ github.run_id }} \
+           --volume $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo:/workspace \
+           --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/${{ inputs.image }}:${{ github.run_id }} \
+           bash -c "sleep $(( ${{ inputs.timeout }} * 60 + 60 ))"
+         RUN_TEST_EOF
+         )
+
+         echo "$cmd" | tee "$DIR/retry_job.sh"
+         bash $DIR/retry_job.sh
+
+     - name: Create run-script
+       id: create
+       env:
+         DIR: ${{ github.run_id }}
+       shell: bash
+       run: |
+         COVERAGE_PREFIX=$([[ "${{ inputs.is_unit_test }}" == "true" ]] && echo "unit-test" || echo "e2e")
+         echo "coverage-prefix=$COVERAGE_PREFIX" | tee -a "$GITHUB_OUTPUT"
+
+         mkdir -p $DIR
+         rm $DIR/.coverage || true
+         rm $DIR/err.log || true
+
+         cmd=$(cat <<RUN_TEST_EOF
+         #!/bin/bash
+
+         (
+           set -e
+
+           docker exec -t nemo_container_${{ github.run_id }}_${{ inputs.runner }} bash -c '\
+             cp -r /opt/Megatron-LM/ /workspace/ && \
+             bash tests/functional_tests/${{ inputs.script }}.sh && \
+             echo "Finished successfully." || echo "Did not finish."'
+         ) 2>&1 | tee $DIR/err.log
+
+         RUN_TEST_EOF
+         )
+
+         echo "timeout_in_seconds=$(( ${{ inputs.timeout }} * 60 ))" | tee -a "$GITHUB_OUTPUT"
+         echo "$cmd" | tee "$DIR/job.sh"
+
+     - name: Run main script
+       uses: nick-fields/retry@v3
+       with:
+         timeout_seconds: ${{ steps.create.outputs.timeout_in_seconds }}
+         max_attempts: 3
+         shell: bash
+         retry_on: timeout
+         command: /bin/bash ${{ github.run_id }}/job.sh
+         on_retry_command: /bin/bash ${{ github.run_id }}/retry_job.sh
+
+     - name: Check result
+       id: check
+       shell: bash
+       env:
+         DIR: ${{ github.run_id }}
+       run: |
+         cat $DIR/err.log
+
+         log=$(tail -c 2000 $DIR/err.log | base64 -w 0)
+         echo "log=$log" >> "$GITHUB_OUTPUT"
+
+         potential_infra_failure=$(cat $DIR/err.log | grep -Eqiw "device" && echo true || echo false)
+         echo "potential_infra_failure=$potential_infra_failure" >> "$GITHUB_OUTPUT"
+
+         docker exec nemo_container_${{ github.run_id }}_${{ inputs.runner }} coverage combine
+         docker exec nemo_container_${{ github.run_id }}_${{ inputs.runner }} coverage xml
+         docker cp nemo_container_${{ github.run_id }}_${{ inputs.runner }}:/workspace/.coverage $DIR/.coverage
+         docker cp nemo_container_${{ github.run_id }}_${{ inputs.runner }}:/workspace/coverage.xml $DIR/coverage.xml
+
+         coverage_report=coverage-${{ steps.create.outputs.coverage-prefix }}-${{ github.run_id }}-$(uuidgen)
+         echo "coverage_report=$coverage_report" >> "$GITHUB_OUTPUT"
+
+         IS_SUCCESS=$(tail -n 1 $DIR/err.log | grep -q "Finished successfully." && echo "true" || echo "false")
+
+         if [[ "$IS_SUCCESS" == "false" && "${{ inputs.is_optional }}" == "true" ]]; then
+           echo "::warning:: Test failed, but displayed as successful because it is marked as optional."
+           IS_SUCCESS=true
+         fi
+
+         if [[ "$IS_SUCCESS" == "false" ]]; then
+           echo Test did not finish successfully.
+           exit 1
+         fi
+
+         exit $EXIT_CODE
+
+     - name: Test coverage
+       shell: bash -x -e -u -o pipefail {0}
+       run: |
+         docker exec -t nemo_container_${{ github.run_id }}_${{ inputs.runner }} coverage report -i
+
+     - name: Upload artifacts
+       uses: actions/upload-artifact@v4
+       if: ${{ steps.check.outputs.coverage_report != 'none' }}
+       with:
+         name: ${{ steps.check.outputs.coverage_report }}
+         path: |
+           ${{ github.run_id }}/coverage.xml
+           ${{ github.run_id }}/.coverage
+         include-hidden-files: true
+
+     - name: Container shutdown
+       if: always()
+       shell: bash
+       run: |
+         docker exec nemo_container_${{ github.run_id }}_${{ inputs.runner }} bash -c "chown -R $(id -u):$(id -g) /workspace"
+         rm -rf $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }} || true
+         docker container rm -f nemo_container_${{ github.run_id }}_${{ inputs.runner }} || true
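A rough sketch of what this action ends up executing per retry attempt; the directory name is the workflow run id and is purely illustrative:

    bash <run-id>/retry_job.sh   # (re)creates the long-lived test container
    bash <run-id>/job.sh         # runs the functional test inside it, teeing err.log
    tail -n 1 <run-id>/err.log   # success is detected by "Finished successfully."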
.github/labeler.yml ADDED
@@ -0,0 +1,55 @@
+ ASR:
+   - nemo/collections/asr/**/*
+   - examples/asr/**/*
+   - tutorials/asr/**/*
+   - docs/source/asr/**/*
+   - tests/collections/asr/**
+
+ NLP:
+   - nemo/collections/nlp/**/*
+   - examples/nlp/**/*
+   - tutorials/nlp/**/*
+   - docs/source/nlp/**/*
+   - tests/collections/nlp/**
+
+ Multi Modal:
+   - nemo/collections/multimodal/**/*
+   - examples/multimodal/**/*
+   - tutorials/multimodal/**/*
+   - docs/source/multimodal/**/*
+   - tests/collections/multimodal/**
+
+ Speaker Tasks:
+   - examples/speaker_tasks/**/*
+   - tutorials/speaker_tasks/**/*
+
+ TTS:
+   - nemo/collections/tts/**/*
+   - nemo/collections/common/tokenizers/text_to_speech/**
+   - examples/tts/**/*
+   - tutorials/tts/**/*
+   - docs/source/tts/**/*
+   - scripts/dataset_processing/tts/**
+   - scripts/tts_dataset_files/**
+   - tests/collections/tts/**
+   - tests/collections/common/tokenizers/text_to_speech/**
+
+ Audio:
+   - nemo/collections/audio/**/*
+   - examples/audio/**/*
+   - tutorials/audio/**/*
+   - docs/source/audio/**/*
+   - tests/collections/audio/**
+
+ core:
+   - nemo/core/**/*
+   - tests/core/**
+
+ common:
+   - nemo/collections/common/**/*
+
+ CI:
+   - .github/**/*
+   - Jenkinsfile
+   - Dockerfile
+   - ci.groovy
.github/scripts/__init__.py ADDED
File without changes
.github/scripts/components_to_run.py ADDED
@@ -0,0 +1,84 @@
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ #!/usr/bin/env python3
+ import json
+ import os
+ import sys
+ from typing import Any, Dict, List, Set
+
+ import click
+ import git
+
+ import nemo_dependencies
+
+
+ def get_changed_files(source_sha: str, target_sha: str) -> List[str]:
+     """
+     Compute the diff between two commits.
+     Returns a list of changed file paths.
+     """
+     try:
+         # Initialize the repo object - go up two levels from this file's location
+         repo = git.Repo(os.path.join(os.path.dirname(__file__), "..", ".."))
+
+         # Get the diff between target and source
+         diff_index = repo.commit(target_sha).diff(repo.commit(source_sha))
+
+         # Get just the changed filenames
+         changed_files = []
+         for diff in diff_index:
+             changed_files.append(diff.a_path if diff.a_path else diff.b_path)
+
+         return changed_files
+
+     except git.exc.GitCommandError as e:
+         print(f"Error fetching changelog: {e}", file=sys.stderr)
+         sys.exit(1)
+     except Exception as e:
+         print(f"Unexpected error: {e}", file=sys.stderr)
+         sys.exit(1)
+
+
+ @click.command()
+ @click.option('--source-sha', type=str, required=True, help='Source commit SHA')
+ @click.option('--target-sha', type=str, required=True, help='Target commit SHA')
+ def main(source_sha: str, target_sha: str):
+     """
+     Main function to fetch the changed files and output the affected test modules.
+     """
+
+     # Output unique changed files
+     print("\nChanged files:")
+     changed_files = get_changed_files(source_sha, target_sha)
+
+     print(json.dumps(sorted(list(changed_files)), indent=2))
+
+     nemo_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+     # Build dependency graph
+     dependencies = nemo_dependencies.build_dependency_graph(nemo_root)
+
+     test_modules: List[str] = []
+     for changed_file in changed_files:
+         if changed_file in dependencies:
+             test_modules.extend(dependencies[changed_file])
+
+     test_modules = list(set(test_modules))
+
+     with open("test_modules.json", "w", encoding="utf-8") as f:
+         json.dump(test_modules, f)
+
+
+ if __name__ == "__main__":
+     main()
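A hedged example invocation; in CI the SHAs would presumably come from the pull-request event payload, and the environment variable names here are placeholders:

    # Assumed to run from the repo root so `import nemo_dependencies` resolves.
    python .github/scripts/components_to_run.py \
      --source-sha "$HEAD_SHA" --target-sha "$BASE_SHA"
    cat test_modules.json   # e.g. ["speech", "unit-tests"]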
.github/scripts/nemo_dependencies.py ADDED
@@ -0,0 +1,400 @@
+ #!/usr/bin/env python3
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """
+ NeMo dependency structure definition.
+ This module analyzes the codebase to determine internal dependencies between NeMo collections and core components.
+ """
+
+ import ast
+ import json
+ import os
+ from typing import Dict, List, Set
+
+
+ def find_python_files(directory: str) -> List[str]:
+     """Find all Python files in the given directory and its subdirectories."""
+     python_files = []
+     # Look in nemo directory and other relevant directories
+     relevant_dirs = ['nemo', 'scripts', 'examples', 'tests']
+
+     for dir_name in relevant_dirs:
+         dir_path = os.path.join(directory, dir_name)
+         if os.path.exists(dir_path):
+             for root, _, files in os.walk(dir_path):
+                 for file in files:
+                     if file.endswith('.py'):
+                         python_files.append(os.path.join(root, file))
+
+     return python_files
+
+
+ def analyze_imports(nemo_root: str, file_path: str) -> Set[str]:
+     """Analyze a Python file and return its NeMo package dependencies using AST parsing."""
+     imports = set()
+     visited = set()  # Track visited modules to prevent circular imports
+
+     def get_init_imports(module_path: str, depth: int = 0) -> Dict[str, str]:
+         """Recursively analyze imports from __init__.py files and map them to their final destinations."""
+         # Prevent infinite recursion
+         if depth > 10 or module_path in visited:  # Limit depth to 10 levels
+             return {}
+
+         visited.add(module_path)
+         init_path = os.path.join(module_path, '__init__.py')
+         if not os.path.exists(init_path):
+             return {}
+
+         try:
+             with open(init_path, 'r', encoding='utf-8') as f:
+                 init_tree = ast.parse(f.read(), filename=init_path)
+
+             import_map = {}
+             for node in ast.walk(init_tree):
+                 if isinstance(node, ast.ImportFrom) and node.module and node.module.startswith('nemo.'):
+                     if node.names:
+                         for name in node.names:
+                             if name.name == '*':
+                                 continue
+
+                             # Get the full module path for the import
+                             module_parts = node.module.split('.')
+                             module_dir = os.path.join(nemo_root, *module_parts)
+
+                             # If the imported module has an __init__.py, recursively analyze it
+                             if os.path.exists(os.path.join(module_dir, '__init__.py')):
+                                 sub_imports = get_init_imports(module_dir, depth + 1)
+                                 if name.name in sub_imports:
+                                     import_map[name.name] = sub_imports[name.name]
+                                 else:
+                                     # If not found in sub-imports, it might be from the module itself
+                                     module_file = os.path.join(module_dir, f"{module_parts[-1]}.py")
+                                     if os.path.exists(module_file):
+                                         import_map[name.name] = f"{node.module}.{name.name}"
+                             else:
+                                 # Direct module import
+                                 import_map[name.name] = f"{node.module}.{name.name}"
+
+             return import_map
+         except Exception as e:
+             print(f"Error analyzing {init_path}: {e}")
+             return {}
+
+     try:
+         with open(file_path, 'r', encoding='utf-8') as f:
+             tree = ast.parse(f.read(), filename=file_path)
+
+         for node in ast.walk(tree):
+             if isinstance(node, ast.ImportFrom) and node.module and node.module.startswith('nemo.'):
+                 # Split the module path
+                 parts = node.module.split('.')
+
+                 if len(parts) == 1:
+                     continue
+
+                 if len(parts) >= 2:
+                     module_type = parts[1]
+
+                     if module_type == 'collections':
+                         if len(parts) == 2:
+                             continue
+                         if node.names:
+                             for name in node.names:
+                                 if name.name == '*':
+                                     continue
+
+                                 # Check if this is an __init__ import
+                                 module_path = os.path.join(nemo_root, *parts)
+                                 init_imports = get_init_imports(module_path)
+
+                                 if name.name in init_imports:
+                                     # Use the mapped import path
+                                     imports.add(init_imports[name.name])
+                                 else:
+                                     imports.add(f"{node.module}.{name.name}")
+
+                     elif module_type in find_top_level_packages(nemo_root):
+                         if node.names:
+                             for name in node.names:
+                                 if name.name == '*':
+                                     continue
+
+                                 # Check if this is an __init__ import
+                                 module_path = os.path.join(nemo_root, *parts)
+                                 init_imports = get_init_imports(module_path)
+
+                                 if name.name in init_imports:
+                                     # Use the mapped import path
+                                     imports.add(init_imports[name.name])
+                                 else:
+                                     imports.add(f"{node.module}.{name.name}")
+
+     except Exception as e:
+         print(f"Error analyzing {file_path}: {e}")
+
+     return imports
+
+
+ def find_top_level_packages(nemo_root: str) -> List[str]:
+     """Find all top-level packages under the nemo directory."""
+     packages: List[str] = []
+     nemo_dir = os.path.join(nemo_root, 'nemo')
+     tests_dir = os.path.join(nemo_root, 'tests')
+
+     if not os.path.exists(nemo_dir):
+         print(f"Warning: nemo directory not found at {nemo_dir}")
+         return packages
+     if not os.path.exists(tests_dir):
+         print(f"Warning: tests directory not found at {tests_dir}")
+         return packages
+
+     for item in os.listdir(nemo_dir) + os.listdir(tests_dir):
+         item_path = os.path.join(nemo_dir, item)
+         if os.path.isdir(item_path) and not item.startswith('__'):
+             packages.append(item)
+
+     return sorted(packages)
+
+
+ def find_collection_modules(nemo_root: str) -> Dict[str, List[str]]:
+     """Find all modules within collections."""
+     collection_modules: Dict[str, List[str]] = {}
+     collections_dir = os.path.join(nemo_root, 'nemo', 'collections')
+
+     if not os.path.exists(collections_dir):
+         print(f"Warning: collections directory not found at {collections_dir}")
+         return collection_modules
+
+     for collection in os.listdir(collections_dir):
+         collection_path = os.path.join(collections_dir, collection)
+         if os.path.isdir(collection_path) and not collection.startswith('__'):
+             collection_modules[f"nemo.collections.{collection}"] = []
+
+     return collection_modules
+
+
+ def build_dependency_graph(nemo_root: str) -> Dict[str, List[str]]:
+     """Build a dependency graph by analyzing all Python files."""
+     # Find all top-level packages
+     top_level_packages = find_top_level_packages(nemo_root)
+     print(f"Found top-level packages: {top_level_packages}")
+
+     dependencies: Dict[str, List[str]] = {}
+
+     for file_path in find_python_files(nemo_root):
+         relative_path = os.path.relpath(file_path, nemo_root)
+
+         parts = relative_path.split(os.sep)
+
+         if len(parts) == 1 or (parts[0] != "nemo" and parts[0] != "tests"):
+             continue
+
+         module_path = relative_path.replace(".py", "").replace("/", ".")
+         if parts[1] in top_level_packages and parts[1] != 'collections' and parts[0] != 'tests':
+             dependencies[module_path] = list(set(analyze_imports(nemo_root, file_path)))
+         elif parts[0] == 'tests':
+             dependencies[module_path] = [relative_path.replace("/", ".").replace(".py", "")]
+         elif parts[1] == 'collections':
+             dependencies[module_path] = list(set(analyze_imports(nemo_root, file_path)))
+
+     # Flip the dependency graph to show reverse dependencies
+     reverse_dependencies: Dict[str, List[str]] = {}
+     # Handle top-level package dependencies
+     for package, deps in dependencies.items():
+         for dep in deps:
+             if dep not in reverse_dependencies:
+                 reverse_dependencies[dep] = []
+             reverse_dependencies[dep].append(package)
+     dependencies = reverse_dependencies
+
+     # Follow and extend records with transitive dependencies
+     transitive_dependencies = dependencies.copy()
+     # Keep iterating until no new dependencies are added
+     while True:
+         changes_made = False
+         new_dependencies = transitive_dependencies.copy()
+
+         # For each package and its direct dependencies
+         for package, deps in transitive_dependencies.items():
+             # For each direct dependency
+             for dep in deps:
+                 # If the dependency has its own dependencies
+                 if dep in transitive_dependencies:
+                     # Add those transitive dependencies to the original package
+                     for transitive_dep in transitive_dependencies[dep]:
+                         if transitive_dep not in new_dependencies[package]:
+                             new_dependencies[package].append(transitive_dep)
+                             changes_made = True
+
+         # Update dependencies with new transitive ones
+         transitive_dependencies = new_dependencies
+
+         # If no new dependencies were added, we're done
+         if not changes_made:
+             break
+
+     dependencies = transitive_dependencies
+
+     # Simplify values: Either top-level package or collection module
+     simplified_dependencies: Dict[str, List[str]] = {}
+     for package, deps in dependencies.items():
+         package_parts = package.split('.')
+
+         if package_parts[0] == "tests":
+             simplified_package_path = f"{os.path.join(*package_parts)}.py"
+         elif os.path.isfile((file_path := f"{os.path.join(*package_parts[:-1])}.py")):
+             simplified_package_path = file_path
+         elif os.path.isdir((file_path := f"{os.path.join(*package_parts[:-1])}")):
+             simplified_package_path = file_path
+         else:
+             simplified_package_path = package
+
+         for dep in deps:
+             dep_parts = dep.split('.')
+
+             if simplified_package_path not in simplified_dependencies:
+                 simplified_dependencies[simplified_package_path] = []
+
+             if (
+                 len(dep_parts) >= 2
+                 and (dep_parts[1] in find_top_level_packages(nemo_root))
+                 and dep_parts[1] != 'collections'
+             ):
+                 simplified_dependencies[simplified_package_path].append(f"{dep_parts[0]}.{dep_parts[1]}")
+             elif dep_parts[0] == "tests":
+                 simplified_dependencies[simplified_package_path].append(".".join(dep_parts))
+             elif len(dep_parts) >= 3 and (
+                 simplified_name := f"nemo.{dep_parts[1]}.{dep_parts[2]}"
+             ) in find_collection_modules(nemo_root):
+                 simplified_dependencies[simplified_package_path].append(simplified_name)
+
+         simplified_dependencies[simplified_package_path].append(package)
+         simplified_dependencies[simplified_package_path] = sorted(
+             list(set(simplified_dependencies[simplified_package_path]))
+         )
+     dependencies = simplified_dependencies
+
+     # Bucket
+     bucket_deps: Dict[str, List[str]] = {}
+     for package, deps in dependencies.items():
+         new_deps = []
+         for dep in deps:
+             if (
+                 "nemo.collections.asr" in dep
+                 or "nemo.collections.tts" in dep
+                 or "nemo.collections.speechlm" in dep
+                 or "nemo.collections.audio" in dep
+                 or "tests.collections.asr" in dep
+                 or "tests.collections.tts" in dep
+                 or "tests.collections.speechlm" in dep
+                 or "tests.collections.audio" in dep
+             ):
+                 new_deps.append("speech")
+                 new_deps.append("unit-tests")
+
+             if "nemo.export" in dep or "nemo.deploy" in dep or "tests.export" in dep or "tests.deploy" in dep:
+                 new_deps.append("export-deploy")
+                 new_deps.append("unit-tests")
+
+             if (
+                 "nemo.collections.llm" in dep
+                 or "nemo.collections.vlm" in dep
+                 or "nemo.automodel" in dep
+                 or "tests.collections.llm" in dep
+                 or "tests.collections.vlm" in dep
+                 or "tests.automodel" in dep
+             ):
+                 new_deps.append("automodel")
+                 new_deps.append("unit-tests")
+
+             if "tests" in dep and "tests.functional_tests" not in dep:
+                 new_deps.append("unit-tests")
+
+             if (
+                 "nemo.collections" in dep
+                 and "nemo.collections.asr" not in dep
+                 and "nemo.collections.tts" not in dep
+                 and "nemo.collections.speechlm" not in dep
+                 and "nemo.collections.audio" not in dep
+                 and "tests.collections.asr" not in dep
+                 and "tests.collections.tts" not in dep
+                 and "tests.collections.speechlm" not in dep
+                 and "tests.collections.audio" not in dep
+             ):
+                 new_deps.append("nemo2")
+                 new_deps.append("unit-tests")
+
+         bucket_deps[package] = sorted(list(set(new_deps)))
+
+     dependencies = bucket_deps
+
+     # Additional dependencies
+     # Add all files in requirements/ directory
+     requirements_dir = os.path.join(nemo_root, "requirements")
+     if os.path.exists(requirements_dir):
+         for filename in os.listdir(requirements_dir):
+             filepath = os.path.join("requirements", filename)
+             relative_path = os.path.relpath(filepath, nemo_root)
+
+             dependencies[relative_path] = [
+                 "nemo2",
+                 "unit-tests",
+                 "speech",
+                 "automodel",
+                 "export-deploy",
+             ]
+
+     # Add all Dockerfile files
+     for root, _, files in os.walk(nemo_root):
+         for file_path in files:
+             full_path = os.path.join(root, file_path)
+             relative_path = os.path.relpath(full_path, nemo_root)
+
+             if "cicd-main-export-deploy" in file_path:
+                 dependencies[relative_path] = ["export-deploy"]
+             if "cicd-main-nemo2" in file_path:
+                 dependencies[relative_path] = ["nemo2"]
+             if "cicd-main-speech" in file_path:
+                 dependencies[relative_path] = ["speech"]
+             if "cicd-main-automodel" in file_path:
+                 dependencies[relative_path] = ["automodel"]
+             if "cicd-main-unit-tests" in file_path:
+                 dependencies[relative_path] = ["unit-tests"]
+             if "Dockerfile" in file_path:
+                 dependencies[relative_path] = ["nemo2", "unit-tests", "speech", "automodel", "export-deploy"]
+
+     # Sort dependencies by length of values (number of dependencies)
+     dependencies = dict(sorted(dependencies.items(), key=lambda x: len(x[1]), reverse=True))
+
+     return dependencies
+
+
+ def main():
+     """Main function to analyze dependencies and output JSON."""
+     # Get the root directory of the NeMo project
+     nemo_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+     # Build dependency graph
+     dependencies = build_dependency_graph(nemo_root)
+
+     # Output as JSON
+     data = json.dumps(dependencies, indent=4)
+
+     with open('nemo_dependencies.json', 'w', encoding='utf-8') as f:
+         f.write(data)
+
+
+ if __name__ == "__main__":
+     main()
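A minimal sketch of running the analysis standalone; the output shape (file path mapped to a list of CI buckets) is inferred from the code above:

    # Assumed to run from the repo root of a NeMo checkout.
    python .github/scripts/nemo_dependencies.py
    jq 'to_entries | .[0]' nemo_dependencies.json   # inspect the entry with the most buckets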
.github/scripts/notify.py ADDED
@@ -0,0 +1,79 @@
+ # Copyright (c) 2025, NVIDIA CORPORATION.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import os
+
+ import requests
+ from github import Github
+
+
+ def send_slack_notification():
+     # Get environment variables
+     gh_token = os.environ.get('GH_TOKEN')
+     webhook_url = os.environ.get('SLACK_WEBHOOK')
+     repository = os.environ.get('REPOSITORY')
+     run_id = os.environ.get('RUN_ID')
+     server_url = os.environ.get('SERVER_URL', 'https://github.com')
+     pr_number = int(os.environ.get('PR_NUMBER', 0))
+     branch_name = os.environ.get('BRANCH_NAME')
+
+     # Get failure info from GitHub API
+     gh = Github(gh_token)
+     repo = gh.get_repo(repository)
+
+     # Get failed jobs
+     failed_jobs = [job.name for job in repo.get_workflow_run(int(run_id)).jobs() if job.conclusion == 'failure']
+
+     if pr_number != 0:
+         pr = repo.get_pull(pr_number)
+
+         title = f"*<{server_url}/{repository}/pull/{pr_number}|PR#{pr_number}>: {pr.title.replace('`', '')}*"
+         author = f"<{server_url}/{pr.user.login}|{pr.user.login}>"
+         branch = f"<{server_url}/{pr.head.repo.full_name}/tree/{pr.head.ref}|{pr.head.ref}>"
+     else:
+         title = f"*Run on <{server_url}/{repository}/tree/{branch_name}|{branch_name}>*"
+         author = "No author"
+         branch = f"<{server_url}/{repository}/tree/{branch_name}|{branch_name}>"
+
+     blocks = [
+         {
+             "type": "section",
+             "text": {
+                 "type": "mrkdwn",
+                 "text": (
+                     f"{title}\n"
+                     f"• Author: {author}\n"
+                     f"• Branch: {branch}\n"
+                     f"• Pipeline: <{server_url}/{repository}/actions/runs/{run_id}|View Run>\n"
+                     f"• Failed Jobs:\n"
+                     + "\n".join(
+                         [
+                             f"  • <{server_url}/{repository}/actions/runs/{run_id}|{job.split('/')[-1]}>"
+                             for job in failed_jobs
+                             if job.split('/')[-1] != 'Nemo_CICD_Test'
+                         ]
+                     )
+                 ),
+             },
+         }
+     ]
+
+     print({"blocks": blocks})
+
+     # Send to Slack
+     response = requests.post(webhook_url, json={"blocks": blocks})
+     response.raise_for_status()
+
+
+ if __name__ == "__main__":
+     send_slack_notification()
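A hedged local dry-run; all values below are placeholders, and a real Slack webhook and GitHub token would be needed:

    export GH_TOKEN=... SLACK_WEBHOOK=... REPOSITORY=NVIDIA/NeMo \
           RUN_ID=1234567890 BRANCH_NAME=main PR_NUMBER=0
    python .github/scripts/notify.py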
.github/workflows/_build_container.yml ADDED
@@ -0,0 +1,89 @@
+ name: ~Build container template
+ on:
+   workflow_call:
+     inputs:
+       image-name:
+         required: true
+         type: string
+         description: "The name of the image to build"
+       dockerfile:
+         required: true
+         type: string
+       runner:
+         required: false
+         default: self-hosted-azure-builder
+         type: string
+         description: "The runner to use for the build"
+
+ jobs:
+   pre-flight:
+     runs-on: ubuntu-latest
+     outputs:
+       build_args: ${{ steps.manifest.outputs.BUILD_ARGS }}
+       cache-from: ${{ steps.cache_from.outputs.LAST_PRS }}
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v4
+
+       - name: Parse manifest.json
+         id: manifest
+         run: |
+           BUILD_ARGS=$(cat << EOF
+           BASE_IMAGE=$(cat requirements/manifest.json | jq -r '."ngc-pytorch"')
+           TRTLLM_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."trt-llm".repo')
+           TRTLLM_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."trt-llm".ref')
+           MLM_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."megatron-lm".repo')
+           MLM_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."megatron-lm".ref')
+           TE_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".transformer_engine.repo')
+           TE_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".transformer_engine.ref')
+           APEX_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".apex.repo')
+           APEX_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".apex.ref')
+           EOF
+           )
+
+           echo "BUILD_ARGS<<EOF" >> $GITHUB_OUTPUT
+           echo "$BUILD_ARGS" >> $GITHUB_OUTPUT
+           echo "EOF" >> $GITHUB_OUTPUT
+
+       - name: Get last merged PR
+         id: cache_from
+         env:
+           GH_TOKEN: ${{ github.token }}
+         run: |
+           LAST_PRS=$(gh api graphql -f query='
+             query {
+               repository(owner: "NVIDIA", name: "NeMo") {
+                 pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) {
+                   nodes {
+                     number
+                   }
+                 }
+               }
+             }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do
+             echo "nemoci.azurecr.io/${{ inputs.image-name }}-buildcache:$number"
+           done)
+
+           echo "LAST_PRS<<EOF" >> $GITHUB_OUTPUT
+           echo "$LAST_PRS" >> $GITHUB_OUTPUT
+           echo "EOF" >> $GITHUB_OUTPUT
+
+   build:
+     uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
+     needs: [pre-flight]
+     with:
+       image-name: ${{ inputs.image-name }}
+       dockerfile: ${{ inputs.dockerfile }}
+       image-label: nemo-core
+       build-args: |
+         IMAGE_LABEL=nemo-core
+         NEMO_TAG=${{ github.sha }}
+         NEMO_REPO=https://github.com/NVIDIA/NeMo
+         PR_NUMBER=${{ github.event.pull_request.number || 0 }}
+         ${{ needs.pre-flight.outputs.build_args }}
+       prune-filter-timerange: 24h
+       use-inline-cache: false
+       cache-from: |
+         nemoci.azurecr.io/${{ inputs.image-name }}-buildcache:main
+         nemoci.azurecr.io/${{ inputs.image-name }}-buildcache:${{ github.event.pull_request.number || 0 }}
+         ${{ needs.pre-flight.outputs.cache-from }}
+       runner: ${{ inputs.runner }}
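The manifest parsing above boils down to jq lookups; a quick sketch to verify the keys locally (requirements/manifest.json must exist in the checkout):

    jq -r '."ngc-pytorch", ."vcs-dependencies"."megatron-lm".ref' requirements/manifest.json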
.github/workflows/_bump_mcore_tag.yml ADDED
@@ -0,0 +1,56 @@
name: ~Bump Megatron Tag template
on:
  workflow_call:
    inputs:
      nemo-target-branch:
        required: true
        type: string
        description: "The NeMo branch to bump"
      mcore-target-branch:
        required: true
        type: string
        description: "The Megatron-LM branch to track"
      # Declared so the `pr-reviewers` reference below resolves to a real input
      pr-reviewers:
        required: false
        type: string
        description: "Reviewers to request on the bump PR"
    secrets:
      PAT:
        required: true

jobs:
  update-branch:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
        with:
          ref: ${{ inputs.nemo-target-branch }}

      - name: Set Git config
        run: |
          git config --local user.email "[email protected]"
          git config --local user.name "Github Actions"

      - name: Merge weekly-bump-${{ inputs.nemo-target-branch }} back to base branch
        env:
          SOURCE_BRANCH: weekly-bump-${{ inputs.nemo-target-branch }}
          TARGET_BRANCH: ${{ inputs.nemo-target-branch }}
        run: |
          if git ls-remote --exit-code origin $SOURCE_BRANCH; then
            git fetch --unshallow
            git checkout $SOURCE_BRANCH
            git pull
            git merge --no-ff $TARGET_BRANCH -m "chore: Auto-merge $TARGET_BRANCH into $SOURCE_BRANCH"
          else
            git checkout -b $SOURCE_BRANCH $TARGET_BRANCH
          fi
          git push -u origin $SOURCE_BRANCH

  mcore:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
    needs: [update-branch]
    with:
      source-repository: NVIDIA/Megatron-LM
      source-ref: ${{ inputs.mcore-target-branch }}
      yaml-path: '."vcs-dependencies"."megatron-lm".ref'
      file: requirements/manifest.json
      base-branch: weekly-bump-${{ inputs.nemo-target-branch }}
      cicd-labels: Run CICD,no-fail-fast
      pr-reviewers: ${{ inputs.pr-reviewers }}
    secrets:
      PAT: ${{ secrets.PAT }}
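The actual tag bump is delegated to the FW-CI-templates `bump.yml` workflow; the `yaml-path` above addresses a nested key in `requirements/manifest.json`. A rough Python equivalent of that single-key update (a sketch only, not what the template itself runs):

    import json

    def bump_mcore_ref(manifest_path, new_ref):
        # Set ."vcs-dependencies"."megatron-lm".ref, leaving everything else untouched.
        with open(manifest_path) as f:
            manifest = json.load(f)
        manifest["vcs-dependencies"]["megatron-lm"]["ref"] = new_ref
        with open(manifest_path, "w") as f:
            json.dump(manifest, f, indent=2)
            f.write("\n")

    # bump_mcore_ref("requirements/manifest.json", "some-new-ref")  # illustrative call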
.github/workflows/build-test-publish-wheel.yml ADDED
@@ -0,0 +1,38 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Build, test, and publish a PyPI wheel (to TestPyPI)

on:
  push:
    branches:
      - main
      - "r**"

defaults:
  run:
    shell: bash -x -e -u -o pipefail {0}

jobs:
  build-test-publish-wheel:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
    with:
      dry-run: true
      python-package: nemo
      python-version: "3.10"
    secrets:
      TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
      TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
      SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
.github/workflows/changelog-build.yml ADDED
@@ -0,0 +1,123 @@
name: 'Changelog Build (Release)'

on:
  workflow_dispatch:
    inputs:
      last-release-tag:
        description: Last Git tag to start from (exclusive) (e.g. `v2.0.0`)
        type: string
        required: true
      release-branch:
        description: Release branch to build changelog on (e.g. `r2.1.0`)
        type: string
        required: true
      changelog-main-content:
        description: Custom changelog content to include before detailed changelogs
        type: string
        required: false
        default: ''

jobs:
  changelog:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout branch
        uses: actions/checkout@v4
        with:
          ref: main
          fetch-depth: 0

      - name: Build Changelog
        id: github_tag
        uses: mikepenz/[email protected]
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          # Configuration file is set up with filters for domains
          # owner:repo must point to the current repo
          # fromTag: auto-resolved from historical tag order (previous tag compared to current tag)
          # toTag: current tag reference
          configuration: ".github/workflows/config/changelog-config.json"
          owner: ${{ github.repository_owner }}
          repo: ${{ github.event.repository.name }}
          ignorePreReleases: "false"
          failOnError: "false"
          fromTag: ${{ inputs.last-release-tag }}
          toTag: ${{ inputs.release-branch }}

      - name: Update changelog file
        env:
          RELEASE_BRANCH: ${{ inputs.release-branch }}
          CHANGELOG: ${{ steps.github_tag.outputs.changelog }}
          MAIN_CONTENT: ${{ inputs.changelog-main-content }}
        shell: bash -x -e -u -o pipefail {0}
        run: |
          RELEASE_VERSION=${RELEASE_BRANCH#r}
          # Demote generated headings by two levels (e.g. `#` -> `###`)
          CHANGELOG=$(echo "$CHANGELOG" | sed '/^[[:blank:]]*#/s/#/###/')

          # Build release notes starting with version header
          RELEASE_NOTES="## NVIDIA Neural Modules $RELEASE_VERSION"

          # Add custom content if provided
          if [ -n "$MAIN_CONTENT" ]; then
            RELEASE_NOTES="$RELEASE_NOTES

          $MAIN_CONTENT"
          fi

          # Add detailed changelogs section
          RELEASE_NOTES="$RELEASE_NOTES

          ### Detailed Changelogs:

          $CHANGELOG"

          # Splice the new notes in right after the marker line
          printf "%s\n" "$RELEASE_NOTES" | sed '/<!-- Next changelog -->/r /dev/stdin' CHANGELOG.md > CHANGELOG.tmp.md

          mv CHANGELOG.tmp.md CHANGELOG.md

      - name: Inspect new changelog file
        run: cat CHANGELOG.md

      - name: Create or update label
        uses: actions/github-script@v6
        with:
          script: |
            const labelName = '${{ inputs.release-branch }}';
            const labelColor = '0366d6'; // Blue color
            const labelDescription = `Release ${labelName}`;

            try {
              // Try to get the label
              await github.rest.issues.getLabel({
                owner: context.repo.owner,
                repo: context.repo.repo,
                name: labelName
              });
              console.log(`Label '${labelName}' already exists`);
            } catch (error) {
              if (error.status === 404) {
                // Label doesn't exist, create it
                await github.rest.issues.createLabel({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  name: labelName,
                  color: labelColor,
                  description: labelDescription
                });
                console.log(`Created label '${labelName}'`);
              } else {
                throw error;
              }
            }

      - name: Create Pull Request
        uses: peter-evans/create-pull-request@v7
        with:
          commit-message: "beep boop: Update changelog"
          title: "Update changelog for `${{ inputs.release-branch }}`"
          signoff: true
          sign-commits: true
          base: main
          branch: bot/chore/update-changelog-into-${{ inputs.release-branch }}
          labels: ${{ inputs.release-branch }}
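The `sed '/<!-- Next changelog -->/r /dev/stdin'` invocation above splices the freshly built release notes into CHANGELOG.md immediately after the marker line. A small Python sketch of the same splice (the marker string comes from the workflow; everything else is illustrative):

    def insert_release_notes(changelog_text, release_notes, marker="<!-- Next changelog -->"):
        out = []
        for line in changelog_text.splitlines(keepends=True):
            out.append(line)
            if line.strip() == marker:
                # `sed .../r` appends the new block right after the matching line.
                out.append(release_notes + "\n")
        return "".join(out)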
.github/workflows/cherry-pick-release-commit.yml ADDED
@@ -0,0 +1,14 @@
name: Create PR to main with cherry-pick from release

on:
  push:
    branches:
      - main

jobs:
  cherry-pick:
    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
    secrets:
      PAT: ${{ secrets.PAT }}
      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
.github/workflows/cicd-approve-test-queue.yml ADDED
@@ -0,0 +1,175 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Approve Test Queue

on:
  schedule:
    - cron: '*/5 * * * *' # Runs every 5 minutes
  workflow_dispatch: # Allows manual triggering

jobs:
  approve-queue:
    runs-on: ubuntu-latest
    environment: main
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install requests

      - name: Approve waiting deployments
        env:
          GITHUB_TOKEN: ${{ secrets.PAT }}
          MAX_CONCURRENCY: ${{ vars.MAX_CONCURRENCY || 1 }}
        run: |
          python - <<EOF
          import os
          import requests

          # GitHub API configuration
          GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
          REPO = os.environ["GITHUB_REPOSITORY"]
          MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"])
          API_BASE = f"https://api.github.com/repos/{REPO}"

          # Headers for GitHub API
          headers = {
              "Authorization": f"token {GITHUB_TOKEN}",
              "Accept": "application/vnd.github.v3+json",
              "X-GitHub-Api-Version": "2022-11-28",
          }

          def make_request(endpoint, method="GET", data=None):
              """Make a request to the GitHub API with error handling."""
              url = f"{API_BASE}/{endpoint}"
              try:
                  if method == "GET":
                      response = requests.get(url, headers=headers)
                  else:
                      response = requests.post(url, headers=headers, json=data)
                  response.raise_for_status()
                  response_json = response.json()
                  # Surface the pagination link for workflow-run listings
                  if hasattr(response, "links") and "actions/runs?status" in endpoint:
                      response_json["next"] = response.links.get("next", {}).get("url")

                  return response_json
              except requests.exceptions.RequestException as e:
                  print(f"Error making request to {endpoint}: {str(e)}")
                  if hasattr(e.response, 'text'):
                      print(f"Response: {e.response.text}")
                  return None

          def get_workflow_runs(status):
              """Get all workflow runs for a given status, following pagination."""
              all_results = []
              endpoint = f"actions/runs?status={status}"
              while endpoint:
                  response = make_request(endpoint)
                  if not response:
                      break

                  all_results.extend(response.get("workflow_runs", []))
                  endpoint = None
                  next_url = response.get("next")
                  if next_url:
                      endpoint = f"actions/runs?{next_url.split('?')[1]}"

              return all_results

          # Get current running and queued workflows
          print("Fetching workflow runs...")
          queued_workflow_runs = get_workflow_runs("queued")
          in_progress_workflow_runs = get_workflow_runs("in_progress")

          # Count running and queued workflows
          queued_workflows = sum(1 for run in queued_workflow_runs if run["name"] == "CICD NeMo")
          in_progress_workflows = sum(1 for run in in_progress_workflow_runs if run["name"] == "CICD NeMo")

          total_workflows = queued_workflows + in_progress_workflows
          print(f"Current queued workflows: {queued_workflows}")
          print(f"Current running workflows: {in_progress_workflows}")
          print(f"Total workflows: {total_workflows}")
          print(f"Max concurrency: {MAX_CONCURRENCY}")

          if total_workflows >= MAX_CONCURRENCY:
              print("Maximum concurrency reached, no new approvals will be made")
              exit(0)

          # Get waiting CI workflow runs (those blocked on an environment approval)
          print("Fetching deployments...")
          pending_workflows = get_workflow_runs("waiting")
          pending_workflows = [run for run in pending_workflows if run["name"] == "CICD NeMo"]

          # Sort runs by creation date (oldest first)
          print("Sorting workflows...")
          pending_workflows = sorted(pending_workflows, key=lambda x: x["created_at"])

          # Approve pending runs until the concurrency budget is used up
          print("Processing ...")
          for workflow in pending_workflows:
              if total_workflows >= MAX_CONCURRENCY:
                  print("Maximum concurrency reached, stopping approvals")
                  break

              workflow_id = workflow["id"]
              workflow_name = workflow["display_title"]
              print(f"Approving workflow {workflow_name} with Run Id: {workflow_id}")

              deployment_url = f"actions/runs/{workflow_id}/pending_deployments"
              # Guard against a failed request or an empty deployment list
              pending_deployments = make_request(deployment_url)
              if not pending_deployments:
                  print(f"No pending deployments found for run {workflow_id}, skipping")
                  continue
              deployment = pending_deployments[0]
              environment_id = deployment["environment"]["id"]

              # Approve the deployment
              status_data = {
                  "environment_ids": [environment_id],
                  "state": "approved",
                  "comment": "Automatically approved by queue manager"
              }
              result = make_request(deployment_url, method="POST", data=status_data)

              if result:
                  total_workflows += 1
              else:
                  print(f"Failed to approve deployment {deployment['id']}")
                  exit(1)

          EOF

  notify:
    if: failure()
    runs-on: ubuntu-latest
    needs: [approve-queue]
    steps:
      - name: Notify
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
          SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_WEBHOOK_ADMIN }}>
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_REPOSITORY: ${{ github.repository }}
        run: |
          curl -X POST \
            -H 'Content-type: application/json' \
            --data "{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Test-queue-approval-bot workflow> failed. Please review manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \
            $SLACK_WEBHOOK
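The approval policy implemented by the inline script boils down to: count queued plus in-progress "CICD NeMo" runs, then approve waiting runs oldest-first until MAX_CONCURRENCY is reached. Distilled into a pure function for illustration (field names follow the GitHub API objects used above):

    def select_runs_to_approve(pending, queued, in_progress, max_concurrency):
        budget = max_concurrency - (queued + in_progress)
        if budget <= 0:
            return []
        # Oldest waiting runs are approved first.
        return sorted(pending, key=lambda run: run["created_at"])[:budget]

    runs = [{"id": 2, "created_at": "2025-01-02"}, {"id": 1, "created_at": "2025-01-01"}]
    print(select_runs_to_approve(runs, queued=0, in_progress=0, max_concurrency=1))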
.github/workflows/cicd-main-nemo2.yml ADDED
@@ -0,0 +1,299 @@
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: NeMo E2E NeMo2 Tests
on:
  workflow_call:
    inputs:
      test_to_run:
        required: true
        type: string
      image-name:
        required: false
        default: nemo_container_nemo2
        type: string

jobs:
  build:
    uses: ./.github/workflows/_build_container.yml
    with:
      image-name: ${{ inputs.image-name }}
      dockerfile: docker/Dockerfile.ci

  e2e-tests:
    strategy:
      fail-fast: false
      matrix:
        include:
          - script: L2_NeMo_2_GPT_Pretraining_no_transformer_engine
            runner: self-hosted-azure
          - script: L2_NeMo_2_llama3_pretraining_recipe
            runner: self-hosted-azure
          # - script: L2_NeMo_2_llama3_pytorch_profiler
          #   runner: self-hosted-azure
          #   timeout: 20
          - script: L2_NeMo_2_llama3_fault_tolerance_plugin
            runner: self-hosted-azure
          - script: L2_NeMo_2_llama3_straggler_detection
            runner: self-hosted-azure
          - script: L2_NeMo_2_llama3_local_ckpt
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_DDP_Param_Parity_check
            runner: self-hosted-azure
          - script: L2_NeMo_2_Hyena_Conversion_from_HF
            runner: self-hosted-azure
          - script: L2_NeMo_2_Hyena_DDP_Pretraining_Test
            runner: self-hosted-azure
          - script: L2_NeMo_2_Hyena_Mixer_Test
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_Hyena_PP_Pretraining_Test
            runner: self-hosted-azure
          - script: L2_NeMo_2_Hyena_TP_Pretraining_Test
            runner: self-hosted-azure
          - script: L2_NeMo_2_Hyena_CP_Pretraining_Test
            runner: self-hosted-azure
          - script: L2_NeMo_2_SSM_Pretraining
            runner: self-hosted-azure
          - script: L2_NeMo_2_SSM_Finetuning
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_HF_MODEL_IMPORT
            runner: self-hosted-azure
          - script: L2_NeMo_2_jit_callback
            runner: self-hosted-azure
          - script: L2_NeMo_2_T5_Pretraining
            runner: self-hosted-azure
          - script: L2_NeMo_2_T5_MockData_Pretraining
            runner: self-hosted-azure
          - script: L2_NeMo_2_T5_Finetuning
            runner: self-hosted-azure
          - script: L2_NeMo_2_T5_Squad
            runner: self-hosted-azure
          - script: L2_NeMo_2_T5_LoRA
            runner: self-hosted-azure
          - script: L2_NeMo_2_BERT_Pretraining_Megatron
            runner: self-hosted-azure
          - script: L2_NeMo_2_BERT_Pretraining_HuggingFace
            runner: self-hosted-azure
          - script: L2_NeMo_2_NEVA_MOCK_PRETRAIN_TP2
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_NEVA_MOCK_PRETRAIN_PP2
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_NEVA_MOCK_PRETRAIN_CP2
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_NEVA_MOCK_FINETUNE_TP2
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_NEVA_ENERGON_FINETUNE_TP2
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_NEVA_MOCK_FINETUNE_PP2
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_NEVA_MOCK_FINETUNE_CP2
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_NEVA_PRELOADED_FINETUNE_PP2_SEQPACK_PAD
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_NEVA_PRELOADED_FINETUNE_PP2_SEQPACK_TRUNC
            runner: self-hosted-azure-gpus-2-h100
          - script: L2_NeMo_2_NEVA_LOAD_GENERATE
            runner: self-hosted-azure-gpus-1
          - script: L2_NeMo_2_LLAVA_IMPORT
            runner: self-hosted-azure-gpus-1
          - script: L2_NEMO_2_MLLAMA_Inference
            runner: self-hosted-azure-gpus-1
          - script: L2_NeMo_2_MLLAMA_MOCK_FINETUNE_TP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_MLLAMA_PRELOADED_FINETUNE_TP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_MLLAMA_ENERGON_FINETUNE_TP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_MLLAMA_IMPORT
            runner: self-hosted-azure-gpus-1
          - script: L2_NeMo_2_Mixtral_Pretraining
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_SFT_TP1PP1_MBS1
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_SFT_TP1PP1_MBS2
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_SFT_TP1PP2_MBS2
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_SFT_TP2PP1_MBS2
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_SFT_TP1PP1_MBS1_PACKED
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_LoRA_TP1PP1_MBS2
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_LoRA_TP1PP2_MBS2
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_LoRA_TP2PP1_MBS2
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_DoRA_TP1PP1_MBS1_PACKED
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_CLoRA_TP1PP1_MBS1_PACKED
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_Chat
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_TE_op_fuser
            runner: self-hosted-azure
          - script: L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2_exclude
            runner: self-hosted-azure
          - script: L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2
            runner: self-hosted-azure
          - script: L2_NeMo_2_Mixtral_LoRA_TP1PP1_MBS1
            runner: self-hosted-azure
          - script: L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1
            runner: self-hosted-azure
          - script: L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1
            runner: self-hosted-azure
          - script: L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1_exclude
            runner: self-hosted-azure
          - script: L2_NeMo_2_Mistral_LoRA_TP2PP1_MBS1
            runner: self-hosted-azure
          - script: L2_NEMO_2_LoRA_MERGE
            runner: self-hosted-azure
          - script: L2_NEMO_2_LoRA_Inference
            runner: self-hosted-azure-gpus-1
          - script: L2_NeMo_2_NeMo_Mcore_Mixtral_bitexact
            runner: self-hosted-azure
            is-optional: true
          - script: L2_NeMo_2_PTQ_Llama2_FP8_trtllm
            runner: self-hosted-azure
          - script: L2_NeMo_2_PTQ_Llama2_FP8_nemo
            runner: self-hosted-azure
          - script: L2_NeMo_2_Distill_Llama3_TP1PP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_Prune_Llama_TP1PP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_GPT_Speculative_Llama3_TP2PP1
            runner: self-hosted-azure
          - script: L2_NeMo_2_LLAVA_NEXT_MOCK_TRAINING
            runner: self-hosted-azure
          - script: L2_NeMo_2_LLAVA_NEXT_HF_CONVERSION
            runner: self-hosted-azure
          - script: L2_NeMo_2_LLAVA_NEXT_ENERGON_TRAIN
            runner: self-hosted-azure
          - script: L2_NeMo_2_LLAVA_NEXT_ENERGON_PACKED_TRAIN
            runner: self-hosted-azure
          - script: L2_NeMo_2_AVLM_MOCK_TRAINING
            runner: self-hosted-azure
          - script: L2_NeMo_2_AVLM_ENERGON_TRAIN
            runner: self-hosted-azure
          - script: L2_NeMo_2_AVLM_ENERGON_CP2_TRAIN
            runner: self-hosted-azure
          - script: L2_NeMo_2_CLIP_PRETRAIN
            runner: self-hosted-azure
            timeout: 20
          - script: L2_NeMo_2_CLIP_INFER
            runner: self-hosted-azure
          - script: L2_NeMo_2_Auto_Configurator_llama_TP1_PP1_MBS124
            runner: self-hosted-azure-gpus-1
          - script: L2_NeMo_2_Auto_Configurator_bert_TP1_PP1_MBS124
            runner: self-hosted-azure-gpus-1
          - script: L2_NeMo_2_Auto_Configurator_t5_TP1_PP1_MBS124
            runner: self-hosted-azure-gpus-1
          - script: L2_NeMo_2_Auto_Configurator_callbacks
            runner: self-hosted-azure-gpus-1
          - script: L2_NeMo_2_Conversion_Test_Baichuan2
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_ChatGLM
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_DeepSeek
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Gemma
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Gemma2
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Gemma3_llm
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Gemma3_vlm
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Mistral
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Llama
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Llama_Embedding
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Llama4
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Llama4_Text
            runner: self-hosted-azure
          - script: L2_NeMo_2_PTQ_Llama4_FP8_nemo
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Nemotron
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Nemotron_H_4B
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Phi3Mini
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Qwen2
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Qwen3
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Starcoder
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_Starcoder2
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_BERT
            runner: self-hosted-azure
          - script: L2_NeMo_2_Conversion_Test_T5
            runner: self-hosted-azure
          - script: L2_NeMo_2_QWEN2VL_MOCK_FINETUNE_TP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_QWEN2VL_PRELOADED_FINETUNE_TP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_QWEN2VL_ENERGON_FINETUNE_TP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_LLAMA4_MOCK_FINETUNE_PP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_LLAMA4_MOCK_FINETUNE_CP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_LLAMA4_ENERGON_FINETUNE_EP2
            runner: self-hosted-azure
          - script: L2_NeMo_2_Diffusion_Recipe_Test
            runner: self-hosted-azure
          - script: L2_NeMo_2_Diffusion_Taskencoder_Test
            runner: self-hosted-azure
          - script: L2_NeMo_2_Flux_Import_Test
            runner: self-hosted-azure
            is-optional: true
          - script: L2_NeMo_2_Flux_Inference_Test
            runner: self-hosted-azure
          - script: L2_NeMo_2_Flux_Training_DDP_Test
            runner: self-hosted-azure
          - script: L2_NeMo_2_Flux_Training_FSDP_Test
            runner: self-hosted-azure
          - script: L2_NeMo_2_Flux_ControlNet_Training_DDP_Test
            runner: self-hosted-azure
          - script: L2_NeMo_2_Flux_ControlNet_Training_FSDP_Test
            runner: self-hosted-azure
            is-optional: true

    needs: [build]
    runs-on: ${{ matrix.runner }}
    name: ${{ matrix.is-optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          path: ${{ github.run_id }}
      - name: main
        uses: NVIDIA/NeMo/.github/actions/test-template@main
        with:
          runner: ${{ runner.name }}
          script: ${{ matrix.script }}
          tests_to_run: ${{ inputs.test_to_run }}
          image: ${{ inputs.image-name }}
          is_optional: ${{ matrix.is-optional || false }}
          timeout: ${{ matrix.timeout || 10 }}
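Each matrix entry above only has to name a `script` and a `runner`; the timeout, optional flag, and the `PLEASEFIXME_` display prefix fall back to defaults via the `||` expressions. A Python sketch of that expansion, for illustration only:

    def expand_matrix(include, default_timeout=10):
        jobs = []
        for entry in include:
            optional = entry.get("is-optional", False)
            jobs.append({
                # Optional jobs get a prefix so their failures are easy to spot.
                "name": ("PLEASEFIXME_" if optional else "") + entry["script"],
                "runner": entry["runner"],
                "timeout": entry.get("timeout", default_timeout),
                "is_optional": optional,
            })
        return jobs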
.github/workflows/cicd-main-speech.yml ADDED
@@ -0,0 +1,216 @@
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: NeMo E2E Speech Tests
on:
  workflow_call:
    inputs:
      test_to_run:
        required: true
        type: string
      image-name:
        required: false
        default: nemo_container_speech
        type: string

jobs:
  build:
    uses: ./.github/workflows/_build_container.yml
    with:
      image-name: ${{ inputs.image-name }}
      dockerfile: docker/Dockerfile.ci

  unit-tests:
    strategy:
      fail-fast: false
      matrix:
        include:
          - script: L0_Unit_Tests_GPU_ASR
            runner: self-hosted-azure-gpus-1
            timeout: 30
          - script: L0_Unit_Tests_CPU_ASR
            runner: azure-gpu-vm-runner1-cpu
            cpu-only: true
            timeout: 30
          - script: L0_Unit_Tests_GPU_TTS
            runner: self-hosted-azure-gpus-1
          - script: L0_Unit_Tests_CPU_TTS
            runner: self-hosted-azure-cpu
            cpu-only: true
          - script: L0_Unit_Tests_GPU_Audio
            runner: self-hosted-azure-gpus-1
          - script: L0_Unit_Tests_CPU_Audio
            runner: self-hosted-azure-cpu
            cpu-only: true
          - script: L0_Unit_Tests_GPU_SpeechLM2
            runner: self-hosted-azure-gpus-1
            timeout: 20
          - script: L0_Unit_Tests_CPU_SpeechLM2
            runner: self-hosted-azure-cpu
            cpu-only: true
            timeout: 20
    needs: [build]
    runs-on: ${{ matrix.runner }}
    name: ${{ matrix.script }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          path: ${{ github.run_id }}
      - name: main
        uses: NVIDIA/NeMo/.github/actions/test-template@main
        with:
          runner: ${{ runner.name }}
          script: ${{ matrix.script }}
          is_unit_test: true
          tests_to_run: ${{ inputs.test_to_run }}
          image: ${{ inputs.image-name }}
          timeout: ${{ matrix.timeout || 10 }}
          cpu-only: ${{ matrix.cpu-only || false }}
          is_optional: ${{ matrix.is-optional || false }}

  e2e-tests:
    strategy:
      fail-fast: false
      matrix:
        include:
          - script: ASR_dev_run_Speech_to_Text
            runner: self-hosted-azure-gpus-1
          - script: ASR_dev_run_Speech_to_Text_WPE_CitriNet
            runner: self-hosted-azure-gpus-1
          - script: ASR_dev_run_Speech_Pre-training_-_CitriNet
            runner: self-hosted-azure-gpus-1
          - script: Optional_ASR_dev_run_Speech_To_Text_Finetuning
            runner: self-hosted-azure-gpus-1
            is-optional: true
          - script: Optional_ASR_dev_run_Speech_To_Text_HF_Finetuning
            runner: self-hosted-azure-gpus-1
            is-optional: true
          - script: ASR_dev_run_Speech_to_Text_WPE_-_Conformer
            runner: self-hosted-azure-gpus-1
          - script: ASR_dev_run_Speech_to_Text_Hybrid_RNNT_CTC_Prompt
            runner: self-hosted-azure-gpus-1
          - script: ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer
            runner: self-hosted-azure-gpus-1
          - script: L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader
            runner: self-hosted-azure-gpus-1
          - script: L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader
            runner: self-hosted-azure-gpus-1
          - script: L2_ASR_Adapters_Linear_Adapters
            runner: self-hosted-azure-gpus-1
          - script: L2_ASR_Adapters_RelPos_MHA_Adapters
            runner: self-hosted-azure-gpus-1
          - script: L2_Speech_to_Text_EMA
            runner: self-hosted-azure
          - script: L2_Speech_to_Text_AED
            runner: self-hosted-azure-gpus-1
          - script: L2_Speaker_dev_run_Speech_to_Label
            runner: self-hosted-azure-gpus-1
          - script: L2_Speech_Estimate_Duration_Bins
            runner: self-hosted-azure
          - script: L2_Speech_Batch_Size_OOMptimizer
            runner: self-hosted-azure
          - script: Optional_L2_Speech_Batch_Size_OOMptimizer_Canary
            runner: self-hosted-azure
            is-optional: true
          - script: L2_Speech_Transcription_Speech_to_Text_Transcribe
            runner: self-hosted-azure
          - script: L2_Speech_Transcription_Speech_to_Text_Streaming_Infer
            runner: self-hosted-azure
          - script: L2_Speech_Transcription_Speech_to_Text_Cache_Aware_Infer
            runner: self-hosted-azure
          - script: L2_Speech_Transcription_Streaming_Inference
            runner: self-hosted-azure
          - script: L2_Speech_Transcription_Canary_Transcribe_Full_Manifest
            runner: self-hosted-azure
          - script: L2_Speech_Transcription_Canary_Transcribe_With_Prompt
            runner: self-hosted-azure
          - script: L2_Speech_Transcription_Canary_Transcribe_Audio_Dir
            runner: self-hosted-azure
          - script: L2_Speech_Transcription_Canary_Streaming_Full_Manifest
            runner: self-hosted-azure
          - script: L2_Longform_Speech_Transcription_Canary_Chunked_Infer_from_Audio_Dir
            runner: self-hosted-azure
          - script: L2_Longform_Speech_Transcription_with_TimeStamps_Canary_Chunked_Infer_from_Audio_Dir
            runner: self-hosted-azure
          - script: L2_Longform_Speech_Transcription_with_TimeStamps_Canary_Chunked_Infer_from_Manifest
            runner: self-hosted-azure
          - script: Speech_Checkpoints_tests
            runner: self-hosted-azure-gpus-1
            timeout: 20
          - script: L2_Speaker_dev_run_Speaker_Recognition
            runner: self-hosted-azure-gpus-1
          - script: L2_Speaker_dev_run_Speaker_Diarization
            runner: self-hosted-azure-gpus-1
          - script: L2_Speaker_dev_run_EndtoEnd_Speaker_Diarization_Sortformer
            runner: self-hosted-azure-gpus-1
          - script: L2_Speaker_dev_run_EndtoEnd_Diarizer_Inference
            runner: self-hosted-azure
          - script: L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference
            runner: self-hosted-azure
          - script: L2_Speaker_dev_run_Clustering_Diarizer_Inference
            runner: self-hosted-azure
          - script: L2_Speaker_dev_run_Neural_Diarizer_Inference
            runner: self-hosted-azure
          - script: L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation
            runner: self-hosted-azure
          - script: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav
            runner: self-hosted-azure
          - script: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3
            runner: self-hosted-azure
          - script: L2_SpeechLM_LoRA_TP1PP1_MBS2
            runner: self-hosted-azure
          - script: L2_TTS_Fast_dev_runs_1_Tacotron_2
            runner: self-hosted-azure-gpus-1
          - script: L2_TTS_Fast_dev_runs_1_WaveGlow
            runner: self-hosted-azure
          - script: L2_TTS_Fast_dev_runs_1_FastPitch
            runner: self-hosted-azure
          - script: L2_TTS_Fast_dev_runs_1_Hifigan
            runner: self-hosted-azure
          - script: L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference
            runner: self-hosted-azure
          - script: SPEECHLM_HF_Training_DuplexS2S
            runner: self-hosted-azure
          - script: SPEECHLM_HF_Training_DuplexS2SSpeechDecoder
            runner: self-hosted-azure
          - script: SPEECHLM_HF_Training_SALM
            runner: self-hosted-azure
            timeout: 20
          - script: L2_TTS_Fast_dev_runs_Magpietts_DecoderContext
            runner: self-hosted-azure
          - script: L2_TTS_Fast_dev_runs_Magpietts_MultiEncoder
            runner: self-hosted-azure
          - script: L2_TTS_Fast_dev_runs_Magpietts_OnlinePO
            runner: self-hosted-azure
          - script: L2_TTS_InferEvaluate_Magpietts_ZeroShot
            runner: self-hosted-azure
          - script: L2_TTS_InferEvaluate_Magpietts_SeenSpeakers
            runner: self-hosted-azure

    needs: [unit-tests]
    runs-on: ${{ matrix.runner }}
    name: ${{ matrix.is-optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          path: ${{ github.run_id }}
      - name: main
        uses: NVIDIA/NeMo/.github/actions/test-template@main
        with:
          runner: ${{ runner.name }}
          script: ${{ matrix.script }}
          tests_to_run: ${{ inputs.test_to_run }}
          image: ${{ inputs.image-name }}
          timeout: ${{ matrix.timeout || 10 }}
          is_optional: ${{ matrix.is-optional || false }}
.github/workflows/cicd-main-testcopy.yml ADDED
@@ -0,0 +1,472 @@
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: "[debug] CICD NeMo"
on:
  schedule:
    - cron: 0 0 * * *
    - cron: "*/5 * * * *" # Runs every 5 minutes
  push:
    branches:
      - main
  workflow_dispatch:
    inputs:
      test_to_run:
        required: false
        default: all
        type: string
        description: Comma-separated list of tests to run. Use "all" to run the full test suite.

jobs:
  pre-flight:
    runs-on: ubuntu-latest
    outputs:
      test_to_run: ${{ steps.test_to_run.outputs.main }}
      is_ci_workload: ${{ steps.is_ci_workload.outputs.main }}
      no_fail_fast: ${{ steps.no_fail_fast.outputs.main }}
      components_to_run: ${{ steps.components_to_run.outputs.main }}
    env:
      TESTS_TO_RUN: ${{ inputs.test_to_run }}
      EVENT_NAME: ${{ github.event_name }}
      HAS_LABEL: ${{ github.event.label.name == 'Run CICD' }}
    steps:
      - name: Checkout branch
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Select components to run
        id: components_to_run
        run: |
          pip install -U pip
          pip install git-python

          if [[ "$EVENT_NAME" == "pull_request" ]]; then
            python .github/scripts/components_to_run.py --source-sha ${{ github.event.pull_request.head.sha }} --target-sha ${{ github.event.pull_request.base.sha }}
          else
            echo '["nemo2", "automodel", "export-deploy", "speech"]' | tee -a test_modules.json
          fi

          components_to_run=$(cat test_modules.json)

          echo "main=${components_to_run}" | tee -a "$GITHUB_OUTPUT"

      - name: Select tests to run
        id: test_to_run
        run: |
          # For manual dispatch, keep the user-provided list as-is
          if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
            TESTS_TO_RUN=$TESTS_TO_RUN

          # For a correctly labeled PR, run the full test suite
          elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" == "true" ]]; then
            TESTS_TO_RUN=all

          # For an incorrectly labeled PR, run no tests
          elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" != "true" ]]; then
            TESTS_TO_RUN=""

          # For push events, run all tests. This is so that we can generate coverage
          # on branch `main`.
          elif [[ "$EVENT_NAME" == "push" || "$EVENT_NAME" == "schedule" ]]; then
            TESTS_TO_RUN=all

          else
            echo "Unsupported event_name $EVENT_NAME provided."
            exit 1
          fi

          parsed_string=$(echo "$TESTS_TO_RUN" | jq -c --raw-input 'split(",")')
          echo "main=${parsed_string}" | tee -a "$GITHUB_OUTPUT"

      - name: Check if this is a CI workload
        shell: bash
        id: is_ci_workload
        run: |
          branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}

          if [[ "$branch_name" =~ ^bump-ci-container || "$EVENT_NAME" == "schedule" ]]; then
            is_ci_workload=true
          else
            is_ci_workload=false
          fi

          echo "main=$is_ci_workload" | tee -a "$GITHUB_OUTPUT"

      - name: Check if no-fail-fast is set
        shell: bash
        id: no_fail_fast
        env:
          HAS_FAIL_FAST_LABEL: ${{ contains(github.event.pull_request.labels.*.name, 'no-fail-fast') }}
        run: |
          if [[ "$HAS_FAIL_FAST_LABEL" == "true" || "$EVENT_NAME" == "schedule" ]]; then
            no_fail_fast=true
          else
            no_fail_fast=false
          fi

          echo "main=$no_fail_fast" | tee -a "$GITHUB_OUTPUT"

  code-linting:
    if: needs.pre-flight.outputs.test_to_run != '[]'
    needs: [pre-flight]
    uses: ./.github/workflows/code-linting.yml

  cicd-wait-in-queue:
    needs: [pre-flight]
    runs-on: ubuntu-latest
    environment: test
    if: |
      needs.pre-flight.outputs.test_to_run != '[]'
      && needs.pre-flight.outputs.is_ci_workload == 'false'
    steps:
      - name: Running CI tests
        run: |
          echo "Running CI tests"

  cicd-test-container-build:
    uses: ./.github/workflows/_build_container.yml
    needs: [pre-flight, code-linting, cicd-wait-in-queue]
    if: |
      needs.pre-flight.outputs.test_to_run != '[]'
      && (
        success()
        || (
          needs.cicd-wait-in-queue.result == 'skipped'
          && needs.pre-flight.outputs.is_ci_workload == 'true'
        )
      )
      && !cancelled()
    with:
      image-name: nemo_container
      dockerfile: docker/Dockerfile.ci

  # cicd-import-tests:
  #   if: |
  #     needs.pre-flight.outputs.test_to_run != '[]'
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   needs: [cicd-test-container-build, pre-flight]
  #   runs-on: self-hosted-azure-gpus-1
  #   steps:
  #     - name: Create UUID
  #       id: uuid
  #       run: |
  #         echo "id=$(uuidgen)" >> "$GITHUB_OUTPUT"

  #     - name: Checkout NeMo
  #       uses: actions/checkout@v2
  #       with:
  #         repository: NVIDIA/NeMo
  #         path: ${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo

  #     - name: Run some checks
  #       run: |
  #         docker run \
  #           --rm \
  #           --device=/dev/nvidia0 \
  #           --gpus all \
  #           --shm-size=8g \
  #           --volume $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo:/workspace \
  #           --env TRANSFORMERS_OFFLINE=0 \
  #           --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 nemoci.azurecr.io/nemo_container:${{ github.run_id }} bash -c '\
  #           # PyTorch Lightning version
  #           python -c "import lightning.pytorch; print(lightning.pytorch.__version__)"

  #           # PyTorch Lightning DDP Checks
  #           CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py"

  #           # Basic Import Checks
  #           python tests/core_ptl/check_imports.py --domain asr
  #           python tests/core_ptl/check_imports.py --domain nlp
  #           python tests/core_ptl/check_imports.py --domain tts
  #           '

  # L0_Setup_Test_Data_And_Models:
  #   needs: [pre-flight, cicd-test-container-build, cicd-wait-in-queue]
  #   runs-on: self-hosted-azure
  #   if: |
  #     needs.pre-flight.outputs.test_to_run != '[]'
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   steps:
  #     - name: Checkout
  #       uses: actions/checkout@v4
  #       with:
  #         path: ${{ github.run_id }}

  #     - name: main
  #       uses: NVIDIA/NeMo/.github/actions/test-template@main
  #       with:
  #         runner: ${{ runner.name }}
  #         script: L0_Setup_Test_Data_And_Models
  #         tests_to_run: '["L0_Setup_Test_Data_And_Models"]'

  # cicd-main-unit-tests:
  #   needs: [pre-flight, cicd-test-container-build]
  #   uses: ./.github/workflows/cicd-main-unit-tests.yml
  #   if: |
  #     needs.pre-flight.outputs.test_to_run != '[]'
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   with:
  #     test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}

  # cicd-main-export-deploy:
  #   needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
  #   uses: ./.github/workflows/cicd-main-export-deploy.yml
  #   if: |
  #     (
  #       needs.pre-flight.outputs.test_to_run != '[]'
  #       && (
  #         contains(fromJson(needs.pre-flight.outputs.components_to_run), 'export-deploy')
  #       )
  #     )
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   with:
  #     test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}

  # cicd-main-speech:
  #   needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
  #   uses: ./.github/workflows/cicd-main-speech.yml
  #   if: |
  #     (
  #       needs.pre-flight.outputs.test_to_run != '[]'
  #       && (
  #         contains(fromJson(needs.pre-flight.outputs.components_to_run), 'speech')
  #       )
  #     )
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   with:
  #     test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}

  # cicd-main-automodel:
  #   needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
  #   uses: ./.github/workflows/cicd-main-automodel.yml
  #   if: |
  #     (
  #       needs.pre-flight.outputs.test_to_run != '[]'
  #       && (
  #         contains(fromJson(needs.pre-flight.outputs.components_to_run), 'automodel')
  #       )
  #     )
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   with:
  #     test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}

  # cicd-main-nemo2:
  #   needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
  #   uses: ./.github/workflows/cicd-main-nemo2.yml
  #   if: |
  #     (
  #       needs.pre-flight.outputs.test_to_run != '[]'
  #       && (
  #         contains(fromJson(needs.pre-flight.outputs.components_to_run), 'nemo2')
  #         || needs.pre-flight.outputs.components_to_run == '["all"]'
  #       )
  #     )
  #     && (
  #       success()
  #       || (
  #         needs.cicd-wait-in-queue.result == 'skipped'
  #         && needs.pre-flight.outputs.is_ci_workload == 'true'
  #       )
  #     )
  #     && !cancelled()
  #   with:
  #     test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}

  Nemo_CICD_Test_Debug:
    needs:
      - pre-flight
      - cicd-test-container-build
      # - cicd-import-tests
      # - L0_Setup_Test_Data_And_Models
      # - cicd-main-unit-tests
      # - cicd-main-nemo2
      # - cicd-main-export-deploy
      # - cicd-main-automodel
      # - cicd-main-speech
    if: always()
    runs-on: ubuntu-latest
    permissions: write-all
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Get workflow result
        id: result
        env:
          GH_TOKEN: ${{ github.token }}
          RUN_ID: ${{ github.run_id }}
        run: |
          # Get workflow run details and check job conclusions
          NUM_FAILED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "failure") | .name] | length')
          NUM_CANCELLED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "cancelled") | .name] | length')

          if [[ $NUM_FAILED -eq 0 && $NUM_CANCELLED -eq 0 ]]; then
            RESULT="success"
          else
            RESULT="failure"
          fi

          # Output the final status
          echo "code=$RESULT" | tee -a $GITHUB_OUTPUT

      - name: Checkout for GH CLI
        uses: actions/checkout@v4

      - name: Remove label if not cancelled
        if: ${{ steps.result.outputs.code != 'cancelled' && github.event.label.name == 'Run CICD' && github.event.pull_request.head.repo.full_name == github.repository }}
        env:
          GH_TOKEN: ${{ github.token }}
          PR_NUMBER: ${{ github.event.number }}
        run: gh pr edit "$PR_NUMBER" --remove-label "Run CICD"

      - name: Pipeline successful, add PR comment
        if: ${{ always() && steps.result.outputs.code == 'success' && github.event_name == 'pull_request' && env.SLACK_WEBHOOK != '' }}
        uses: peter-evans/create-or-update-comment@v4
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
          REPOSITORY: ${{ github.repository }}
          RUN_ID: ${{ github.run_id }}
        with:
          issue-number: ${{ github.event.number }}
          body: |
            [🤖]: Hi @${{ github.event.pull_request.user.login }} 👋,

            We wanted to let you know that a [CICD pipeline](https://github.com/${{ env.REPOSITORY }}/actions/runs/${{ env.RUN_ID }}) for this PR just finished successfully.

            So it might be time to merge this PR or get some approvals.

            Due to a major CI change, merges are currently handled by the automation team.
            We will reach out to you quickly to merge this PR, but you can always reach us with the following handles:

            //cc @chtruong814 @ko3n1g @pablo-garay @thomasdhc

      - name: "Pipeline not successful and not cancelled: Send Slack alert & create step summary"
        if: ${{ always() && steps.result.outputs.code == 'failure' && env.SLACK_WEBHOOK != '' }}
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPOSITORY: ${{ github.repository }}
          RUN_ID: ${{ github.run_id }}
          PR_NUMBER: ${{ github.event.number }}
          SERVER_URL: ${{ github.server_url }}
        run: |
          set -x
          pip install PyGithub
          export BRANCH_NAME=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}

          python .github/scripts/notify.py

      - name: Exit
        if: ${{ always() }}
        env:
          RESULT: ${{ steps.result.outputs.code }}
        run: |
          if [ "$RESULT" == "success" ]; then
            exit 0
          else
            exit 1
          fi

  Coverage:
    runs-on: ubuntu-latest
    needs: [Nemo_CICD_Test_Debug]
    strategy:
      matrix:
        flag: [unit-test, e2e]
    if: |
      (
        success()
        || needs.Nemo_CICD_Test_Debug.result == 'success'
      )
      && !cancelled()
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Download coverage reports of current branch
        uses: actions/download-artifact@v4
        with:
          pattern: coverage-${{ matrix.flag }}-*

      - name: Get total coverage of current branch
        shell: bash -x -e -u -o pipefail {0}
        if: always()
        run: |
          pip install coverage

          ls -al .
          ls -al coverage-*/
          coverage combine --keep $(ls coverage-*/.coverage)
          coverage report -i
          rm -rf coverage-*
          ls -al

      - name: Upload coverage reports to Codecov
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          verbose: true
          flags: ${{ matrix.flag }}

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: coverage-${{ matrix.flag }}-aggregated
          path: |
            .coverage
          include-hidden-files: true
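For reference, the component gating used by the (currently commented-out) sub-workflow triggers follows the `contains(fromJson(...), '<component>')` pattern over the pre-flight output. A hypothetical Python rendering of that decision, assuming the `test_modules.json` file written by the pre-flight job:

    import json

    SUB_WORKFLOWS = ["nemo2", "automodel", "export-deploy", "speech"]

    def workflows_to_trigger(path="test_modules.json"):
        with open(path) as f:
            components = json.load(f)
        # '["all"]' selects every sub-workflow; otherwise match by component name.
        if components == ["all"]:
            return SUB_WORKFLOWS
        return [w for w in SUB_WORKFLOWS if w in components]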
.github/workflows/cicd-main-unit-tests.yml ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ name: NeMo Unit Tests
15
+ on:
16
+ workflow_call:
17
+ inputs:
18
+ test_to_run:
19
+ required: true
20
+ type: string
21
+
22
+ jobs:
23
+ collections-common-tests:
24
+ strategy:
25
+ fail-fast: false
26
+ matrix:
27
+ include:
28
+ - script: L0_Unit_Tests_GPU_Common
29
+ runner: self-hosted-azure-gpus-1
30
+ - script: L0_Unit_Tests_CPU_Common
31
+ runner: self-hosted-azure-cpu
32
+ cpu-only: true
33
+ runs-on: ${{ matrix.runner }}
34
+ name: ${{ matrix.script }}
35
+ steps:
36
+ - name: Checkout
37
+ uses: actions/checkout@v4
38
+ with:
39
+ path: ${{ github.run_id }}
40
+ - name: main
41
+ uses: NVIDIA/NeMo/.github/actions/test-template@main
42
+ with:
43
+ runner: ${{ runner.name }}
44
+ script: ${{ matrix.script }}
45
+ is_unit_test: true
46
+ tests_to_run: ${{ inputs.test_to_run }}
47
+ cpu-only: ${{ matrix.cpu-only || false }}
48
+
49
+ collections-llm-tests:
50
+ strategy:
51
+ fail-fast: false
52
+ matrix:
53
+ include:
54
+ - script: L0_Unit_Tests_GPU_LLM
55
+ runner: self-hosted-azure-gpus-1
56
+ - script: L0_Unit_Tests_CPU_LLM
57
+ runner: self-hosted-azure-cpu
58
+ cpu-only: true
59
+    runs-on: ${{ matrix.runner }}
+    name: ${{ matrix.script }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          path: ${{ github.run_id }}
+      - name: main
+        uses: NVIDIA/NeMo/.github/actions/test-template@main
+        with:
+          runner: ${{ runner.name }}
+          script: ${{ matrix.script }}
+          is_unit_test: true
+          tests_to_run: ${{ inputs.test_to_run }}
+          cpu-only: ${{ matrix.cpu-only || false }}
+          is_optional: ${{ matrix.is-optional || false }}
+
+  collections-multimodal-tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - script: L0_Unit_Tests_GPU_Multimodal
+            runner: self-hosted-azure-gpus-1
+          - script: L0_Unit_Tests_CPU_Multimodal
+            runner: self-hosted-azure-cpu
+            cpu-only: true
+    runs-on: ${{ matrix.runner }}
+    name: ${{ matrix.script }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          path: ${{ github.run_id }}
+      - name: main
+        uses: NVIDIA/NeMo/.github/actions/test-template@main
+        with:
+          runner: ${{ runner.name }}
+          script: ${{ matrix.script }}
+          is_unit_test: true
+          tests_to_run: ${{ inputs.test_to_run }}
+          cpu-only: ${{ matrix.cpu-only || false }}
+          is_optional: ${{ matrix.is-optional || false }}
+  collections-vlm-tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - script: L0_Unit_Tests_GPU_VLM
+            runner: self-hosted-azure-gpus-1
+          - script: L0_Unit_Tests_CPU_VLM
+            runner: self-hosted-azure-cpu
+            cpu-only: true
+    runs-on: ${{ matrix.runner }}
+    name: ${{ matrix.script }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          path: ${{ github.run_id }}
+      - name: main
+        uses: NVIDIA/NeMo/.github/actions/test-template@main
+        with:
+          runner: ${{ runner.name }}
+          script: ${{ matrix.script }}
+          is_unit_test: true
+          tests_to_run: ${{ inputs.test_to_run }}
+          cpu-only: ${{ matrix.cpu-only || false }}
+          is_optional: ${{ matrix.is-optional || false }}
+
+  core-tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - script: L0_Unit_Tests_GPU_Core
+            runner: self-hosted-azure-gpus-1
+          - script: L0_Unit_Tests_CPU_Core
+            runner: self-hosted-azure-cpu
+            cpu-only: true
+          - script: L0_Unit_Tests_GPU_Hydra
+            runner: self-hosted-azure-gpus-1
+          - script: L0_Unit_Tests_CPU_Hydra
+            runner: self-hosted-azure-cpu
+            cpu-only: true
+    runs-on: ${{ matrix.runner }}
+    name: ${{ matrix.script }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          path: ${{ github.run_id }}
+      - name: main
+        uses: NVIDIA/NeMo/.github/actions/test-template@main
+        with:
+          runner: ${{ runner.name }}
+          script: ${{ matrix.script }}
+          is_unit_test: true
+          tests_to_run: ${{ inputs.test_to_run }}
+          cpu-only: ${{ matrix.cpu-only || false }}
+
+  lightning-tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - script: L0_Unit_Tests_GPU_Lightning
+            runner: self-hosted-azure
+          - script: L0_Unit_Tests_CPU_Lightning
+            runner: self-hosted-azure-cpu
+            cpu-only: true
+    runs-on: ${{ matrix.runner }}
+    name: ${{ matrix.script }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          path: ${{ github.run_id }}
+      - name: main
+        uses: NVIDIA/NeMo/.github/actions/test-template@main
+        with:
+          runner: ${{ runner.name }}
+          script: ${{ matrix.script }}
+          is_unit_test: true
+          tests_to_run: ${{ inputs.test_to_run }}
+          cpu-only: ${{ matrix.cpu-only || false }}
+          is_optional: ${{ matrix.is-optional || false }}
+
+  other-tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - script: L0_Unit_Tests_GPU_Others
+            runner: self-hosted-azure-gpus-1
+          - script: L0_Unit_Tests_CPU_Others
+            runner: self-hosted-azure-cpu
+            cpu-only: true
+    runs-on: ${{ matrix.runner }}
+    name: ${{ matrix.script }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          path: ${{ github.run_id }}
+      - name: main
+        uses: NVIDIA/NeMo/.github/actions/test-template@main
+        with:
+          runner: ${{ runner.name }}
+          script: ${{ matrix.script }}
+          is_unit_test: true
+          tests_to_run: ${{ inputs.test_to_run }}
+          cpu-only: ${{ matrix.cpu-only || false }}
+          is_optional: ${{ matrix.is-optional || false }}
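
Each unit-test job above follows the same shape: a small matrix of GPU and CPU variants, a checkout into a per-run directory, and a delegation to the shared NVIDIA/NeMo test-template action with `tests_to_run` passed through. As a rough illustration only (the action's internals are outside this diff, so the function below is a hypothetical sketch), the per-script selection it has to make boils down to a membership check against the JSON list:

import json

def should_run(script: str, tests_to_run_json: str) -> bool:
    # Hypothetical gating sketch: a script is selected when the caller
    # passed "all" or named the script explicitly.
    selected = json.loads(tests_to_run_json)
    return "all" in selected or script in selected

assert should_run("L0_Unit_Tests_CPU_Multimodal", '["all"]')
assert not should_run("L0_Unit_Tests_CPU_Multimodal", '["L0_Unit_Tests_GPU_Core"]')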
.github/workflows/cicd-main.yml ADDED
@@ -0,0 +1,450 @@
+ # Copyright (c) 2025, NVIDIA CORPORATION.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ name: CICD NeMo
+ on:
+   schedule:
+     - cron: 0 0 * * *
+   pull_request:
+     branches:
+       - main
+       - r**
+       - weekly-bump*
+     types: [labeled]
+   workflow_dispatch:
+     inputs:
+       test_to_run:
+         required: false
+         default: all
+         type: string
+         description: Comma-separated list of tests to run. Use "all" to run the full test suite.
+
+ concurrency:
+   # group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.event.pull_request.number || github.ref }}-${{ github.event_name }}
+   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
+   cancel-in-progress: true
+
+ jobs:
+   pre-flight:
+     runs-on: ubuntu-latest
+     outputs:
+       test_to_run: ${{ steps.test_to_run.outputs.main }}
+       is_ci_workload: ${{ steps.is_ci_workload.outputs.main }}
+       no_fail_fast: ${{ steps.no_fail_fast.outputs.main }}
+       components_to_run: ${{ steps.components_to_run.outputs.main }}
+     env:
+       TESTS_TO_RUN: ${{ inputs.test_to_run }}
+       EVENT_NAME: ${{ github.event_name }}
+       HAS_LABEL: ${{ github.event.label.name == 'Run CICD' }}
+     steps:
+       - name: Checkout branch
+         uses: actions/checkout@v4
+         with:
+           fetch-depth: 0
+
+       - name: Select components to run
+         id: components_to_run
+         run: |
+           pip install -U pip
+           pip install git-python
+
+           if [[ "$EVENT_NAME" == "pull_request" ]]; then
+             python .github/scripts/components_to_run.py --source-sha ${{ github.event.pull_request.head.sha }} --target-sha ${{ github.event.pull_request.base.sha }}
+           else
+             echo '["nemo2", "export-deploy", "speech"]' | tee -a test_modules.json
+           fi
+
+           components_to_run=$(cat test_modules.json)
+
+           echo "main=${components_to_run}" | tee -a "$GITHUB_OUTPUT"
+
+       - name: Select tests to run
+         id: test_to_run
+         run: |
+           # For manual dispatch, use the list of tests provided by the user
+           if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
+             TESTS_TO_RUN=$TESTS_TO_RUN
+
+           # For a correctly labeled PR, run the full test suite
+           elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" == "true" ]]; then
+             TESTS_TO_RUN=all
+
+           # For an unlabeled PR, run no tests
+           elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" != "true" ]]; then
+             TESTS_TO_RUN=""
+
+           # For push events, run all tests. This is so that we can generate coverage
+           # on branch `main`.
+           elif [[ "$EVENT_NAME" == "push" || "$EVENT_NAME" == "schedule" ]]; then
+             TESTS_TO_RUN=all
+
+           else
+             echo "Unsupported event_name $EVENT_NAME provided."
+             exit 1
+           fi
+
+           parsed_string=$(echo "$TESTS_TO_RUN" | jq -c --raw-input 'split(",")')
+           echo "main=${parsed_string}" | tee -a "$GITHUB_OUTPUT"
+
+       - name: Check if this is a CI workload
+         shell: bash
+         id: is_ci_workload
+         run: |
+           branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}
+
+           if [[ "$branch_name" =~ ^bump-ci-container || "$EVENT_NAME" == "schedule" ]]; then
+             is_ci_workload=true
+             echo "main=true" | tee -a "$GITHUB_OUTPUT"
+           else
+             is_ci_workload=false
+           fi
+
+           echo "main=$is_ci_workload" | tee -a "$GITHUB_OUTPUT"
+
+       - name: Check if no-fail-fast is set
+         shell: bash
+         id: no_fail_fast
+         env:
+           HAS_FAIL_FAST_LABEL: ${{ contains(github.event.pull_request.labels.*.name, 'no-fail-fast') }}
+         run: |
+           if [[ "$HAS_FAIL_FAST_LABEL" == "true" || "$EVENT_NAME" == "schedule" ]]; then
+             no_fail_fast=true
+           else
+             no_fail_fast=false
+           fi
+
+           echo "main=$no_fail_fast" | tee -a "$GITHUB_OUTPUT"
+
128
+ code-linting:
129
+ if: needs.pre-flight.outputs.test_to_run != '[]'
130
+ needs: [pre-flight]
131
+ uses: ./.github/workflows/code-linting.yml
132
+
133
+ cicd-wait-in-queue:
134
+ needs: [pre-flight, code-linting]
135
+ runs-on: ubuntu-latest
136
+ environment: test
137
+ if: |
138
+ needs.pre-flight.outputs.test_to_run != '[]'
139
+ && needs.pre-flight.outputs.components_to_run != '[]'
140
+ && needs.pre-flight.outputs.is_ci_workload == 'false'
141
+ steps:
142
+ - name: Running CI tests
143
+ run: |
144
+ echo "Running CI tests"
145
+
146
+ cicd-test-container-build:
147
+ uses: ./.github/workflows/_build_container.yml
148
+ needs: [pre-flight, code-linting, cicd-wait-in-queue]
149
+ if: |
150
+ needs.pre-flight.outputs.test_to_run != '[]'
151
+ && needs.pre-flight.outputs.components_to_run != '[]'
152
+ && (
153
+ success()
154
+ || (
155
+ needs.cicd-wait-in-queue.result == 'skipped'
156
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
157
+ )
158
+ )
159
+ && !cancelled()
160
+ with:
161
+ image-name: nemo_container
162
+ dockerfile: docker/Dockerfile.ci
163
+
164
+ cicd-import-tests:
165
+ if: |
166
+ needs.pre-flight.outputs.test_to_run != '[]'
167
+ && needs.pre-flight.outputs.components_to_run != '[]'
168
+ && (
169
+ success()
170
+ || (
171
+ needs.cicd-wait-in-queue.result == 'skipped'
172
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
173
+ )
174
+ )
175
+ && !cancelled()
176
+ needs: [cicd-test-container-build, pre-flight]
177
+ runs-on: self-hosted-azure-gpus-1
178
+ steps:
179
+ - name: Create UUID
180
+ id: uuid
181
+ run: |
182
+ echo "id=$(uuidgen)" >> "$GITHUB_OUTPUT"
183
+
184
+ - name: Checkout NeMo
185
+ uses: actions/checkout@v4
186
+ with:
187
+ path: ${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo
188
+
189
+ - name: Run some checks
190
+ run: |
191
+ docker run \
192
+ --rm \
193
+ --device=/dev/nvidia0 \
194
+ --gpus all \
195
+ --shm-size=8g \
196
+ --volume $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo:/workspace \
197
+ --env TRANSFORMERS_OFFLINE=0 \
198
+ --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 nemoci.azurecr.io/nemo_container:${{ github.run_id }} bash -c '\
199
+ # PyTorch Lightning version
200
+ python -c "import lightning.pytorch; print(lightning.pytorch.__version__)"
201
+
202
+ # PyTorch Lightning DDP Checks
203
+ CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py"
204
+
205
+ # Basic Import Checks
206
+ python tests/core_ptl/check_imports.py --domain asr
207
+ python tests/core_ptl/check_imports.py --domain tts
208
+ '
209
+
210
+ L0_Setup_Test_Data_And_Models:
211
+ needs: [pre-flight, cicd-test-container-build, cicd-wait-in-queue]
212
+ runs-on: self-hosted-azure
213
+ if: |
214
+ needs.pre-flight.outputs.test_to_run != '[]'
215
+ && needs.pre-flight.outputs.components_to_run != '[]'
216
+ && (
217
+ success()
218
+ || (
219
+ needs.cicd-wait-in-queue.result == 'skipped'
220
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
221
+ )
222
+ )
223
+ && !cancelled()
224
+ steps:
225
+ - name: Checkout
226
+ uses: actions/checkout@v4
227
+ with:
228
+ path: ${{ github.run_id }}
229
+
230
+ - name: main
231
+ uses: NVIDIA/NeMo/.github/actions/test-template@main
232
+ with:
233
+ runner: ${{ runner.name }}
234
+ script: L0_Setup_Test_Data_And_Models
235
+ tests_to_run: '["L0_Setup_Test_Data_And_Models"]'
236
+
237
+ cicd-main-unit-tests:
238
+ needs: [pre-flight, cicd-test-container-build]
239
+ uses: ./.github/workflows/cicd-main-unit-tests.yml
240
+ if: |
241
+ needs.pre-flight.outputs.test_to_run != '[]'
242
+ && needs.pre-flight.outputs.components_to_run != '[]'
243
+ && (
244
+ success()
245
+ || (
246
+ needs.cicd-wait-in-queue.result == 'skipped'
247
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
248
+ )
249
+ )
250
+ && !cancelled()
251
+ with:
252
+ test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
253
+
254
+ cicd-main-speech:
255
+ needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
256
+ uses: ./.github/workflows/cicd-main-speech.yml
257
+ if: |
258
+ (
259
+ needs.pre-flight.outputs.test_to_run != '[]'
260
+ && (
261
+ contains(fromJson(needs.pre-flight.outputs.components_to_run), 'speech')
262
+ )
263
+ )
264
+ && (
265
+ success()
266
+ || (
267
+ needs.cicd-wait-in-queue.result == 'skipped'
268
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
269
+ )
270
+ )
271
+ && !cancelled()
272
+ with:
273
+ test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
274
+
275
+ cicd-main-nemo2:
276
+ needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
277
+ uses: ./.github/workflows/cicd-main-nemo2.yml
278
+ if: |
279
+ (
280
+ needs.pre-flight.outputs.test_to_run != '[]'
281
+ && (
282
+ contains(fromJson(needs.pre-flight.outputs.components_to_run), 'nemo2')
283
+ || needs.pre-flight.outputs.components_to_run == '["all"]'
284
+ )
285
+ )
286
+ && (
287
+ success()
288
+ || (
289
+ needs.cicd-wait-in-queue.result == 'skipped'
290
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
291
+ )
292
+ )
293
+ && !cancelled()
294
+ with:
295
+ test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
296
+
297
+ Nemo_CICD_Test:
298
+ needs:
299
+ - pre-flight
300
+ - cicd-test-container-build
301
+ - cicd-import-tests
302
+ - L0_Setup_Test_Data_And_Models
303
+ - cicd-main-unit-tests
304
+ - cicd-main-nemo2
305
+ - cicd-main-speech
306
+ if: always()
307
+ runs-on: ubuntu-latest
308
+ permissions: write-all
309
+ steps:
310
+ - name: Checkout
311
+ uses: actions/checkout@v4
312
+
313
+ - name: Get workflow result
314
+ id: result
315
+ env:
316
+ GH_TOKEN: ${{ github.token }}
317
+ RUN_ID: ${{ github.run_id }}
318
+ HAS_LABEL: ${{ github.event.label.name == 'Run CICD' }}
319
+ IS_SCHEDULED: ${{ github.event_name == 'schedule' }}
320
+ run: |
321
+ # Get workflow run details and check job conclusions
322
+ LATEST_ATTEMPT=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion != null) | .conclusion] | last')
323
+ NUM_FAILED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "failure") | .name] | length')
324
+ NUM_CANCELLED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "cancelled") | .name] | length')
325
+
326
+ if [[ $NUM_FAILED -eq 0 && $NUM_CANCELLED -eq 0 && ("$HAS_LABEL" == "true" || "$IS_SCHEDULED" == "true") ]]; then
327
+ RESULT="success"
328
+ elif [[ $NUM_CANCELLED -gt 0 ]]; then
329
+ RESULT="cancelled"
330
+ else
331
+ RESULT="failure"
332
+ fi
333
+
334
+ # Output the final status
335
+ echo "code=$RESULT" | tee -a $GITHUB_OUTPUT
336
+
337
+ - name: Checkout for GH CLI
338
+ uses: actions/checkout@v4
339
+
340
+ - name: Remove label if not cancelled
341
+ if: |
342
+ steps.result.outputs.code != 'cancelled'
343
+ && github.event.label.name == 'Run CICD'
344
+ && github.event.pull_request.head.repo.full_name == github.repository
345
+ env:
346
+ GH_TOKEN: ${{ github.token }}
347
+ PR_NUMBER: ${{ github.event.number }}
348
+ run: gh pr edit "$PR_NUMBER" --remove-label "Run CICD"
349
+
350
+ - name: Pipeline successful, add PR comment
351
+ if: |
352
+ steps.result.outputs.code == 'success'
353
+ && github.event_name == 'pull_request'
354
+ && env.SLACK_WEBHOOK != ''
355
+ uses: peter-evans/create-or-update-comment@v4
356
+ env:
357
+ SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
358
+ REPOSITORY: ${{ github.repository }}
359
+ RUN_ID: ${{ github.run_id }}
360
+ with:
361
+ issue-number: ${{ github.event.number }}
362
+ body: |
363
+ [🤖]: Hi @${{ github.event.pull_request.user.login }} 👋,
364
+
365
+ We wanted to let you know that a [CICD pipeline](https://github.com/${{ env.REPOSITORY }}/actions/runs/${{ env.RUN_ID }}) for this PR just finished successfully.
366
+
367
+ So it might be time to merge this PR or get some approvals.
368
+
369
+ //cc @chtruong814 @ko3n1g @pablo-garay @thomasdhc
370
+
371
+ - name: "Pipeline not successful and not cancelled: Send Slack alert & create step summary"
372
+ if: |
373
+ steps.result.outputs.code == 'failure'
374
+ && github.event.label.name == 'Run CICD'
375
+ && env.SLACK_WEBHOOK != ''
376
+ env:
377
+ SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
378
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
379
+ REPOSITORY: ${{ github.repository }}
380
+ RUN_ID: ${{ github.run_id }}
381
+ PR_NUMBER: ${{ github.event.number }}
382
+ SERVER_URL: ${{ github.server_url }}
383
+ run: |
384
+ set -x
385
+ pip install PyGithub
386
+ export BRANCH_NAME=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}
387
+
388
+ python .github/scripts/notify.py
389
+
390
+ - name: Exit
391
+ if: ${{ always() }}
392
+ env:
393
+ RESULT: ${{ steps.result.outputs.code }}
394
+ run: |
395
+ if [ $RESULT == "success" ]; then
396
+ exit 0
397
+ else
398
+ exit 1
399
+ fi
400
+
401
+ Coverage:
402
+ runs-on: ubuntu-latest
403
+ needs: [pre-flight, Nemo_CICD_Test]
404
+ if: |
405
+ needs.pre-flight.outputs.test_to_run != '[]'
406
+ && needs.pre-flight.outputs.components_to_run != '[]'
407
+ && (
408
+ success()
409
+ || needs.Nemo_CICD_Test.result == 'success'
410
+ )
411
+ && !cancelled()
412
+ strategy:
413
+ matrix:
414
+ flag: [unit-test, e2e]
415
+ steps:
416
+ - name: Checkout
417
+ uses: actions/checkout@v4
418
+
419
+ - name: Download coverage reports of current branch
420
+ uses: actions/download-artifact@v4
421
+ with:
422
+ pattern: coverage-${{ matrix.flag }}-*
423
+
424
+ - name: Get total coverage of current branch
425
+ shell: bash -x -e -u -o pipefail {0}
426
+ if: always()
427
+ run: |
428
+ pip install coverage
429
+
430
+ ls -al .
431
+ ls -al coverage-*/
432
+ coverage combine --keep $(ls coverage-*/.coverage)
433
+ coverage report -i
434
+ rm -rf coverage-*
435
+ ls -al
436
+
437
+ - name: Upload coverage reports to Codecov
438
+ uses: codecov/codecov-action@v5
439
+ with:
440
+ token: ${{ secrets.CODECOV_TOKEN }}
441
+ verbose: true
442
+ flags: ${{ matrix.flag }}
443
+
444
+ - name: Upload artifacts
445
+ uses: actions/upload-artifact@v4
446
+ with:
447
+ name: coverage-${{ matrix.flag }}-aggregated
448
+ path: |
449
+ .coverage
450
+ include-hidden-files: true
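
The pre-flight job above turns the comma-separated `TESTS_TO_RUN` string into a JSON array with `jq -c --raw-input 'split(",")'`, and the downstream jobs compare that output against the literal `'[]'` to decide whether anything runs at all. A minimal Python re-implementation of that parsing, for illustration only (the function name is ours):

import json

def parse_tests_to_run(raw: str) -> str:
    # Mirror jq's split(","): an empty input yields an empty array,
    # which is what makes the downstream `!= '[]'` guards skip all jobs.
    return json.dumps(raw.split(",") if raw else [])

print(parse_tests_to_run("all"))                                          # ["all"]
print(parse_tests_to_run("L0_Unit_Tests_GPU_Core,L0_Unit_Tests_CPU_Core"))
print(parse_tests_to_run(""))                                             # []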
.github/workflows/cicd-relabel-bot.yml ADDED
@@ -0,0 +1,36 @@
+ # If the PR gets updated by a new commit, auto-merge is prevented,
+ # since there is no CI event attached to the new commit anymore.
+ # This workflow re-attaches the label after a push, if the PR
+ # was already labeled prior to the push.
+ name: CICD Relabel bot
+
+ on:
+   pull_request_target:
+
+ jobs:
+   relabel:
+     runs-on: ubuntu-latest
+     env:
+       PR_NUMBER: ${{ github.event.number }}
+       GH_TOKEN: ${{ secrets.NEMO_RELABEL_TOKEN }}
+       HOSTNAME: ${{ github.server_url }}
+     permissions: write-all
+     steps:
+       - name: Checkout repo
+         uses: actions/checkout@v4
+
+       - name: Check if PR was already labeled with `Run CICD`
+         id: pre-flight
+         run: |
+           LABELS=$(gh pr view "$PR_NUMBER" --json labels)
+           HAS_LABEL=$(echo $LABELS \
+             | jq '[.labels[].name] | any(. == "Run CICD")'
+           )
+
+           echo "has-label=$HAS_LABEL" | tee -a "$GITHUB_OUTPUT"
+
+       - name: Relabel PR
+         if: ${{ steps.pre-flight.outputs.has-label == 'true' }}
+         run: |
+           gh pr edit "$PR_NUMBER" --remove-label "Run CICD"
+           gh pr edit "$PR_NUMBER" --add-label "Run CICD"
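
The relabel bot's whole trick is that remove-then-add round-trip, which fires a fresh `labeled` event against the new head commit. The workflow shells out to `gh pr edit`; a hedged sketch of the same round-trip with PyGithub (the library the CI's notify step already installs) might look like this:

import os
from github import Github  # pip install PyGithub

gh = Github(os.environ["GH_TOKEN"])
# PR labels live on the underlying issue, so the Issues API suffices here.
issue = gh.get_repo("NVIDIA/NeMo").get_issue(int(os.environ["PR_NUMBER"]))

if any(label.name == "Run CICD" for label in issue.labels):
    # Removing and re-adding the label emits a new `labeled` event,
    # re-attaching a CI run to the freshly pushed commit.
    issue.remove_from_labels("Run CICD")
    issue.add_to_labels("Run CICD")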
.github/workflows/close-inactive-issue-pr.yml ADDED
@@ -0,0 +1,25 @@
+ name: Stale-Close-Inactive-Issues-PRs
+ on:
+   schedule:
+     - cron: "30 1 * * *"
+
+ jobs:
+   close-issues:
+     runs-on: ubuntu-latest
+     permissions:
+       issues: write
+       pull-requests: write
+     steps:
+       - uses: actions/stale@v6
+         with:
+           operations-per-run: 100
+           days-before-issue-stale: 30
+           days-before-issue-close: 7
+           stale-issue-label: "stale"
+           stale-issue-message: "This issue is stale because it has been open for 30 days with no activity. Remove stale label or comment or this will be closed in 7 days."
+           close-issue-message: "This issue was closed because it has been inactive for 7 days since being marked as stale."
+           days-before-pr-stale: 14
+           days-before-pr-close: 7
+           stale-pr-message: "This PR is stale because it has been open for 14 days with no activity. Remove stale label or comment or update or this will be closed in 7 days."
+           close-pr-message: "This PR was closed because it has been inactive for 7 days since being marked as stale."
+           repo-token: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/code-formatting.yml ADDED
@@ -0,0 +1,73 @@
+ name: Isort and Black Formatting
+ # Incrementally reformat only changed files with black, all files with isort
+ #
+ # Replaces pre-commit.ci, since it reformats all the files.
+ # See issue https://github.com/pre-commit-ci/issues/issues/90
+ #
+ # The action requires a custom token to trigger workflow after pushing reformatted files back to the branch.
+ # `secrets.GITHUB_TOKEN` can be used instead, but this will result
+ # in not running necessary checks after reformatting, which is undesirable.
+ # For details see https://github.com/orgs/community/discussions/25702
+
+ on:
+   pull_request_target:
+     paths:
+       - "**.py"
+     types: [opened, synchronize, reopened, labeled, unlabeled]
+
+ defaults:
+   run:
+     shell: bash -x -e -u -o pipefail {0}
+
+ jobs:
+   reformat_with_isort_and_black:
+     runs-on: ubuntu-latest
+     permissions:
+       # write permissions required to commit changes
+       contents: write
+     steps:
+       - name: Checkout branch
+         uses: actions/checkout@v4
+         with:
+           # setup repository and ref for PRs, see
+           # https://github.com/EndBug/add-and-commit?tab=readme-ov-file#working-with-prs
+           repository: ${{ github.event.pull_request.head.repo.full_name }}
+           ref: ${{ github.event.pull_request.head.ref }}
+           # custom token is required to trigger actions after reformatting + pushing
+           token: ${{ secrets.NEMO_REFORMAT_TOKEN }}
+           fetch-depth: 0
+
+       - name: Get changed files
+         id: changed-files
+         uses: step-security/[email protected]
+         with:
+           files: |
+             **.py
+
+       - name: Setup Python env
+         uses: actions/setup-python@v5
+         with:
+           python-version: "3.10"
+
+       - name: black
+         uses: psf/black@stable
+         if: ${{ steps.changed-files.outputs.any_changed == 'true' }}
+         with:
+           options: "--verbose"
+           # apply only to changed files (pass explicitly the files)
+           src: "${{ steps.changed-files.outputs.all_changed_files }}"
+           version: "~= 24.3"
+
+       - name: isort
+         uses: isort/isort-action@v1
+         if: ${{ steps.changed-files.outputs.any_changed == 'true' }}
+         with:
+           isort-version: "5.13.2"
+           # reformat all files with isort – safe since the whole repo is already reformatted
+           configuration: ""
+
+       - uses: EndBug/add-and-commit@v9
+         # Commit changes. Nothing is committed if no changes.
+         with:
+           message: Apply isort and black reformatting
+           commit: --signoff
.github/workflows/code-init-file-checker.yml ADDED
@@ -0,0 +1,23 @@
+ name: Check __init__ files
+
+ on:
+   pull_request:
+     types: [opened, synchronize, reopened]
+
+ jobs:
+   check-init-files:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v4
+
+       - name: Set up Python
+         uses: actions/setup-python@v4
+         with:
+           python-version: "3.11"
+
+       - name: Install init-file-checker
+         run: pip install init-file-checker
+
+       - name: Run init-file-checker
+         run: init-file-checker nemo/
.github/workflows/code-linting.yml ADDED
@@ -0,0 +1,160 @@
+ name: PyLint and flake8 linting
+
+ on:
+   pull_request:
+     types: [opened, synchronize, reopened, labeled, unlabeled]
+   workflow_call:
+
+ jobs:
+   linting:
+     name: "Domain: ${{ matrix.domain }}"
+     runs-on: ubuntu-latest
+     strategy:
+       fail-fast: false
+       matrix:
+         domain: [speech, other]
+     env:
+       DOMAIN: ${{ matrix.domain }}
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v4
+
+       - name: Select filter
+         id: filter
+         run: |
+           if [[ "$DOMAIN" == "speech" ]]; then
+             FILTER=$(jq -crn '[
+               "nemo/collections/common/data/lhotse/*.py",
+               "nemo/collections/asr/**/*.py",
+               "nemo/collections/tts/**/*.py",
+               "nemo/collections/audio/**/*.py",
+               "nemo/collections/multimodal/speech_llm/**/*.py",
+               "nemo/collections/speechlm/**/*.py",
+               "nemo/collections/speechlm2/**/*.py"
+             ] | join(",")')
+
+           else
+             FILTER=$(jq -crn '[
+               "nemo/**/*.py",
+               "!nemo/collections/common/data/lhotse/*.py",
+               "!nemo/collections/asr/**/*.py",
+               "!nemo/collections/tts/**/*.py",
+               "!nemo/collections/audio/**/*.py",
+               "!nemo/collections/multimodal/speech_llm/**/*.py",
+               "!nemo/collections/speechlm/**/*.py",
+               "!nemo/collections/speechlm2/**/*.py",
+               "!nemo/export/**/*.py"
+             ] | join(",")')
+           fi
+
+           echo "main=$FILTER" | tee -a "$GITHUB_OUTPUT"
+
+       - name: Get changed files
+         id: changed-files
+         uses: step-security/[email protected]
+         with:
+           files: ${{ steps.filter.outputs.main }}
+           files_separator: ","
+           separator: " "
+
+       - name: Run PyLint
+         id: pylint
+         env:
+           CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
+           SKIP_DOCS: ${{ contains(github.event.pull_request.labels.*.name, 'skip-docs') }}
+           SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
+         run: |
+           if [[ -z "$CHANGED_FILES" ]]; then
+             echo Nothing to lint.
+             echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
+             exit 0
+           fi
+
+           if [[ $SKIP_DOCS == true ]]; then
+             ADDITIONAL_PYLINT_ARGS="--disable=C0115,C0116"
+           else
+             ADDITIONAL_PYLINT_ARGS=""
+           fi
+
+           if [[ $SKIP_LINTING == true ]]; then
+             ADDITIONAL_PYLINT_ARGS="--exit-zero"
+           fi
+
+           pip install pylint
+           set +e
+           pylint $ADDITIONAL_PYLINT_ARGS --output "pylintrc.$DOMAIN.txt" --rcfile ".pylintrc.$DOMAIN" ${CHANGED_FILES[@]}
+           echo "exit-code=$?" | tee -a "$GITHUB_OUTPUT"
+
+       - name: Run flake8
+         id: flake8
+         env:
+           CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
+           SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
+         run: |
+           if [[ -z "$CHANGED_FILES" ]]; then
+             echo Nothing to lint.
+             echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
+             exit 0
+           fi
+
+           if [[ $SKIP_LINTING == true ]]; then
+             ADDITIONAL_FLAKE8_ARGS="--exit-zero"
+           else
+             ADDITIONAL_FLAKE8_ARGS=""
+           fi
+
+           pip install flake8
+           set +e
+           flake8 $ADDITIONAL_FLAKE8_ARGS --output "flake8.$DOMAIN.txt" --config ".flake8.$DOMAIN" ${CHANGED_FILES[@]}
+           echo "exit-code=$?" | tee -a "$GITHUB_OUTPUT"
+
+       - name: Summary
+         env:
+           PYLINT: ${{ steps.pylint.outputs.exit-code == 0 }}
+           FLAKE8: ${{ steps.flake8.outputs.exit-code == 0 }}
+         run: |
+
+           if [[ "$PYLINT" != "true" ]]; then
+             echo "Pylint output:" | tee -a $GITHUB_STEP_SUMMARY
+
+             echo '```' | tee -a $GITHUB_STEP_SUMMARY
+             cat pylintrc.$DOMAIN.txt | tee -a $GITHUB_STEP_SUMMARY
+             echo '```' | tee -a $GITHUB_STEP_SUMMARY
+           fi
+
+           if [[ "$FLAKE8" != "true" ]]; then
+             echo "Flake8 output:" | tee -a $GITHUB_STEP_SUMMARY
+
+             echo '```' | tee -a $GITHUB_STEP_SUMMARY
+             cat flake8.$DOMAIN.txt | tee -a $GITHUB_STEP_SUMMARY
+             echo '```' | tee -a $GITHUB_STEP_SUMMARY
+           fi
+
+           if [[ "$PYLINT" != "true" || "$FLAKE8" != "true" ]]; then
+             echo "The following directories got scanned:" | tee -a $GITHUB_STEP_SUMMARY
+
+             echo '```' | tee -a $GITHUB_STEP_SUMMARY
+             echo ${{ steps.filter.outputs.main }} | tee -a $GITHUB_STEP_SUMMARY
+             echo '```' | tee -a $GITHUB_STEP_SUMMARY
+
+             exit 1
+           fi
+
+   Nemo_Linting_Test:
+     needs: linting
+     runs-on: ubuntu-latest
+     if: always()
+     steps:
+       - name: Main
+         env:
+           RESULTS: ${{ toJson(needs.linting) }}
+         run: |
+           RESULT=$(echo "$RESULTS" | jq -r '.result')
+
+           if [[ "$RESULT" == "success" ]]; then
+             echo "All passed."
+             exit 0
+           else
+             echo "Some linting domains failed."
+             exit 1
+           fi
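
The speech/other split above is encoded as two complementary glob filters handed to the changed-files action. A simplified Python approximation of that partition (fnmatch does not implement the action's `**` semantics, so the globs below are loosened; treat this as illustrative only):

from fnmatch import fnmatch

SPEECH_GLOBS = [
    "nemo/collections/common/data/lhotse/*.py",
    "nemo/collections/asr/*",
    "nemo/collections/tts/*",
    "nemo/collections/audio/*",
    "nemo/collections/multimodal/speech_llm/*",
    "nemo/collections/speechlm/*",
    "nemo/collections/speechlm2/*",
]

def domain_of(path: str) -> str:
    # Paths matching a speech glob get the speech linter config;
    # everything else falls through to the "other" domain.
    return "speech" if any(fnmatch(path, g) for g in SPEECH_GLOBS) else "other"

print(domain_of("nemo/collections/asr/models/ctc_models.py"))  # speech
print(domain_of("nemo/core/classes/modelPT.py"))               # other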
.github/workflows/codeql.yml ADDED
@@ -0,0 +1,75 @@
+ # For most projects, this workflow file will not need changing; you simply need
+ # to commit it to your repository.
+ #
+ # You may wish to alter this file to override the set of languages analyzed,
+ # or to provide custom queries or build logic.
+ #
+ # ******** NOTE ********
+ # We have attempted to detect the languages in your repository. Please check
+ # the `language` matrix defined below to confirm you have the correct set of
+ # supported CodeQL languages.
+ #
+ name: "CodeQL"
+
+ on:
+   push:
+     branches: [ "main", "[rv][0-9]*", "gh-pages-src" ]
+   pull_request:
+     # The branches below must be a subset of the branches above
+     branches: [ "main" ]
+   schedule:
+     - cron: '19 1 * * 4'
+
+ jobs:
+   analyze:
+     name: Analyze
+     runs-on: ubuntu-latest
+     permissions:
+       actions: read
+       contents: read
+       security-events: write
+
+     strategy:
+       fail-fast: false
+       matrix:
+         language: [ 'python' ]
+         # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
+         # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
+
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v3
+
+       # Initializes the CodeQL tools for scanning.
+       - name: Initialize CodeQL
+         uses: github/codeql-action/init@v2
+         with:
+           languages: ${{ matrix.language }}
+           # If you wish to specify custom queries, you can do so here or in a config file.
+           # By default, queries listed here will override any specified in a config file.
+           # Prefix the list here with "+" to use these queries and those in the config file.
+
+           # For details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
+           queries: security-and-quality # security-extended,
+           config-file: ./.github/workflows/config/codeql.yml
+
+       # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java).
+       # If this step fails, then you should remove it and run the build manually (see below)
+       - name: Autobuild
+         uses: github/codeql-action/autobuild@v2
+
+       # ℹ️ Command-line programs to run using the OS shell.
+       # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
+
+       # If the Autobuild fails above, remove it and uncomment the following three lines,
+       # then modify them (or add more) to build your code if your project uses a compiled language.
+
+       # - run: |
+       #     echo "Run, Build Application using script"
+       #     ./location_of_script_within_repo/buildscript.sh
+
+       - name: Perform CodeQL Analysis
+         uses: github/codeql-action/analyze@v2
+         with:
+           category: "/language:${{matrix.language}}"
.github/workflows/community-bot.yml ADDED
@@ -0,0 +1,15 @@
+ name: Community Bot
+
+ on:
+   issues:
+     types: [opened, edited, reopened, closed, deleted]
+   issue_comment:
+     types: [created, edited, deleted]
+
+ jobs:
+   community-bot:
+     uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
+     with:
+       community_project_id: ${{ vars.COMMUNITY_PROJECT_ID }}
+     secrets:
+       GH_TOKEN: ${{ secrets.PAT }}
.github/workflows/config/changelog-config.json ADDED
@@ -0,0 +1,134 @@
+ {
+   "categories": [
+     {
+       "title": "## ASR\n\n<details><summary>Changelog</summary>",
+       "labels": ["asr"],
+       "exclude_labels": ["cherry-pick"]
+     },
+     {
+       "title": "</details>\n\n## TTS\n\n<details><summary>Changelog</summary>",
+       "labels": ["tts"],
+       "exclude_labels": ["cherry-pick"]
+     },
+     {
+       "title": "</details>\n\n## NLP / NMT\n\n<details><summary>Changelog</summary>",
+       "labels": ["nlp", "nmt", "megatron"],
+       "exclude_labels": ["cherry-pick"]
+     },
+     {
+       "title": "</details>\n\n## Text Normalization / Inverse Text Normalization\n\n<details><summary>Changelog</summary>",
+       "labels": ["tn", "itn"],
+       "exclude_labels": ["cherry-pick"]
+     },
+     {
+       "title": "</details>\n\n## NeMo Tools\n\n<details><summary>Changelog</summary>",
+       "labels": ["tools"],
+       "exclude_labels": ["cherry-pick"]
+     },
+     {
+       "title": "</details>\n\n## Export\n\n<details><summary>Changelog</summary>",
+       "labels": ["export"],
+       "exclude_labels": ["cherry-pick"]
+     },
+     {
+       "title": "</details>\n\n## Documentation\n\n<details><summary>Changelog</summary>",
+       "labels": ["docs"],
+       "exclude_labels": ["cherry-pick"]
+     },
+     {
+       "title": "</details>\n\n## Bugfixes\n\n<details><summary>Changelog</summary>",
+       "labels": ["bug"],
+       "exclude_labels": ["cherry-pick"]
+     },
+     {
+       "title": "</details>\n\n## Cherrypick\n\n<details><summary>Changelog</summary>",
+       "labels": ["cherry-pick"],
+       "exclude_labels": ["cherry-pick"]
+     }
+   ],
+   "ignore_labels": [
+     "ignore"
+   ],
+   "sort": "ASC",
+   "template": "\n${{CHANGELOG}}</details>\n\n## Uncategorized:\n\n<details><summary>Changelog</summary>\n\n${{UNCATEGORIZED}}\n</details>\n",
+   "pr_template": "- ${{TITLE}} by @${{AUTHOR}} :: PR: #${{NUMBER}}",
+   "empty_template": "${{OWNER}}\n${{REPO}}\n${{FROM_TAG}}\n${{TO_TAG}}",
+   "label_extractor": [
+     {
+       "pattern": "(.*tts.*)|(.*g2p.*)",
+       "target": "tts",
+       "flags": "gimu",
+       "on_property": ["title", "body"]
+     },
+     {
+       "pattern": "(.*asr.*)|(.*ctc.*)|(.*rnnt.*)|(.*transducer.*)|(.*dali.*)|(.*k2.*)",
+       "target": "asr",
+       "flags": "gimu",
+       "on_property": ["title", "body"]
+     },
+     {
+       "pattern": "(.*nlp.*)|(.*punctuation.*)|(.*capitalization.*)|(.*entity.*)|(.*glue.*)|(.*entity.*)|(.*retrieval.*)|(.*entity.*)|(.*intent.*)|(.*slot.*)|(.*entity.*)|(.*language.*)|(.*qa.*)|(.*token class.*)|(.*text class.*)",
+       "target": "nlp",
+       "flags": "gimu",
+       "on_property": ["title", "body"]
+     },
+     {
+       "pattern": "(.*nmt.*)|(.*bignlp.*)|(.*megatron.*)|(.*machine.*)|(.*translation.*)|(.*gpt.*)",
+       "target": "nmt",
+       "flags": "gimu",
+       "on_property": ["title", "body"]
+     },
+     {
+       "pattern": "(.*tn.*)|(.*itn.*)|(.*text norm.*)",
+       "target": "tn",
+       "flags": "gimu",
+       "on_property": ["title", "body"]
+     },
+     {
+       "pattern": "(.*sde.*)|(.*ctc segment.*)",
+       "target": "tools",
+       "flags": "gimu",
+       "on_property": ["title", "body"]
+     },
+     {
+       "pattern": "(.*trt.*)|(.*onnx.*)|(.*export.*)",
+       "target": "export",
+       "flags": "gimu",
+       "on_property": ["title", "body"]
+     },
+     {
+       "pattern": "(.*\\[x\\] Documentation.*)",
+       "target": "docs",
+       "flags": "gmu",
+       "on_property": ["title", "body"]
+     },
+     {
+       "pattern": "(.*\\[x\\] Bugfix.*)|(.*patch.*)",
+       "target": "bug",
+       "flags": "gmu",
+       "on_property": ["title", "body"]
+     },
+     {
+       "pattern": "(.*cherry-pick.*)|(.*cherrypick.*)",
+       "target": "cherrypick",
+       "flags": "gimu",
+       "on_property": ["title", "body"]
+     }
+   ],
+   "duplicate_filter": {
+     "pattern": ".+",
+     "on_property": "title",
+     "method": "match"
+   },
+   "transformers": [
+   ],
+   "max_tags_to_fetch": 100,
+   "max_pull_requests": 500,
+   "max_back_track_time_days": 365,
+   "exclude_merge_branches": [
+   ],
+   "tag_resolver": {
+     "method": "semver"
+   }
+ }
+
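
Each `label_extractor` entry above is a regex applied to the PR title and body; on a match, the `target` label feeds the corresponding changelog category. A rough Python sketch of that classification (two entries shown; flags `gimu` are approximated here with `re.IGNORECASE`):

import re

EXTRACTORS = [
    (re.compile(r"(.*tts.*)|(.*g2p.*)", re.IGNORECASE), "tts"),
    (re.compile(r"(.*trt.*)|(.*onnx.*)|(.*export.*)", re.IGNORECASE), "export"),
]

def labels_for(text: str) -> list[str]:
    # Attach every target whose pattern matches the title/body text.
    return [target for pattern, target in EXTRACTORS if pattern.search(text)]

print(labels_for("Fix TTS vocoder export to ONNX"))  # ['tts', 'export']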
.github/workflows/config/codeql.yml ADDED
@@ -0,0 +1,9 @@
+ name: "CodeQL config"
+
+ paths:
+   - nemo/
+   - tests/
+   - tools/
+   - scripts/
+   - examples/
+   - .github/
.github/workflows/copyright-check.yml ADDED
@@ -0,0 +1,22 @@
+ # Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ name: Copyright check
+
+ on:
+   pull_request:
+
+ jobs:
+   copyright-check:
+     uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
.github/workflows/gh-docs.yml ADDED
@@ -0,0 +1,81 @@
+ name: gh-docs-build
+ on:
+   push:
+   pull_request:
+     paths:
+       - "**"
+
+ # Set the access for individual scopes
+ permissions: write-all
+
+ env:
+   PYTHON_VERSION: "3.11"
+
+ jobs:
+   deploy:
+     runs-on: ubuntu-latest
+
+     container:
+       image: squidfunk/mkdocs-material
+
+     steps:
+       - uses: actions/checkout@v4
+         if: github.event.repository.fork == false
+         with:
+           ref: gh-pages-src
+
+       - name: "Correct github config"
+         if: github.event.repository.fork == false
+         run: |
+           git config --global --add safe.directory "$GITHUB_WORKSPACE"
+           git config --global user.name "${GITHUB_ACTOR}"
+           git config --global user.email "${GITHUB_ACTOR}@users.noreply.${GITHUB_DOMAIN:-"github.com"}"
+           remote_repo="https://x-access-token:${GITHUB_TOKEN}@${GITHUB_DOMAIN:-"github.com"}/${GITHUB_REPOSITORY}.git"
+           echo "${remote_repo}"
+           git remote rm origin
+           git remote add origin "${remote_repo}"
+
+       - name: "Deploy Github Page"
+         continue-on-error: true
+         run: mkdocs gh-deploy --force
+
+   linkcheck:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v4
+
+       - name: Get changed files
+         id: changed-files
+         uses: step-security/[email protected]
+         with:
+           files: docs/**
+           files_separator: ","
+           separator: " "
+
+       - name: Set up Python ${{ env.PYTHON_VERSION }}
+         if: steps.changed-files.outputs.any_changed == 'true'
+         uses: actions/setup-python@v5
+         with:
+           python-version: ${{ env.PYTHON_VERSION }}
+
+       - name: Install Sphinx dependencies
+         if: steps.changed-files.outputs.any_changed == 'true'
+         run: python3 -m pip install -r requirements/requirements_docs.txt
+
+       - name: Linkcheck docs build
+         if: steps.changed-files.outputs.any_changed == 'true'
+         run: make -C docs linkcheck || true
+
+       - name: Eliminate false positives
+         if: steps.changed-files.outputs.any_changed == 'true'
+         run: ./docs/check_for_broken_links.sh
+
+       - name: Upload linkcheck output
+         if: steps.changed-files.outputs.any_changed == 'true'
+         uses: actions/upload-artifact@v4
+         with:
+           name: linkcheck-artifact
+           path: docs/build/linkcheck
+           if-no-files-found: error
+           retention-days: 7
.github/workflows/install-test.yml ADDED
@@ -0,0 +1,286 @@
+ name: CI-Install-Check
+
+ on:
+   pull_request:
+     paths:
+       - "**"
+
+ concurrency:
+   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+   cancel-in-progress: true
+
+ jobs:
+   test-installs-macos:
+     name: ${{ matrix.os }}-py${{ matrix.python }}-${{ matrix.installer }}
+     runs-on: ${{ matrix.os }}
+     strategy:
+       fail-fast: false
+       matrix:
+         os: [macos-latest]
+         python: ["3.10", "3.11", "3.12"]
+         installer: ["pip-install", "nemo-install"]
+     steps:
+       - name: Checkout repo
+         uses: actions/checkout@v2
+
+       - name: Check disk space before cleanup
+         run: df -h
+
+       - name: Free up disk space
+         run: |
+           # Remove unnecessary files on macOS
+           sudo rm -rf /usr/local/lib/android || true
+           sudo rm -rf /usr/local/.ghcup || true
+           sudo rm -rf /usr/local/lib/node_modules || true
+           brew cleanup || true
+           # Clear pip cache
+           pip cache purge || true
+
+       - name: Check disk space after cleanup
+         run: df -h
+
+       - uses: actions/setup-python@v5
+         with:
+           python-version: "${{ matrix.python }}"
+
+       - name: Install NeMo
+         env:
+           INSTALLER: ${{ matrix.installer }}
+           NEMO_TAG: ${{ github.sha }}
+           NEMO_REPO: ${{ github.server_url }}/${{ github.repository }}
+         run: |
+           if [[ "$INSTALLER" == "pip-install" ]]; then
+             pip install --no-cache-dir -U pip
+             pip install --no-cache-dir ".[all]"
+           else
+             export NEMO_TAG
+             export NEMO_REPO
+             export INSTALL_DIR=$(pwd)
+
+             bash docker/common/install_dep.sh --library "te,mcore,extra" --mode install
+             pip install --no-cache-dir ".[all]"
+           fi
+
+       - name: Check disk space after installation
+         run: df -h
+
+       - name: Run import checks
+         run: |
+           # Run import checks
+           for collection in "asr" "tts" "lightning" "core"; do
+             python tests/core_ptl/check_imports.py --domain "$collection"
+           done
+
+   test-installs-linux-amd:
+     name: ubuntu-22.04-amd-py${{ matrix.python }}-${{ matrix.installer }}
+     runs-on: ubuntu-22.04
+     strategy:
+       fail-fast: false
+       matrix:
+         python: ["3.10", "3.11", "3.12"]
+         installer: ["pip-install", "nemo-install"]
+     steps:
+       - name: Checkout repo
+         uses: actions/checkout@v2
+
+       - name: Check disk space before cleanup
+         run: df -h
+
+       - name: Free up disk space
+         run: |
+           # Remove unnecessary packages and files on Ubuntu
+           sudo apt-get clean
+           sudo rm -rf /usr/local/lib/android || true
+           sudo rm -rf /opt/ghc || true
+           sudo rm -rf /usr/local/.ghcup || true
+           sudo rm -rf /usr/share/dotnet || true
+           sudo rm -rf /opt/az || true
+           # Clear pip and npm caches
+           pip cache purge || true
+           sudo npm cache clean --force || true
+
+       - name: Check disk space after cleanup
+         run: df -h
+
+       - name: Install Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python }}
+
+       - name: Install NeMo
+         env:
+           INSTALLER: ${{ matrix.installer }}
+         run: |
+           if [ "$INSTALLER" = "pip-install" ]; then
+             pip install --no-cache-dir --upgrade pip
+             pip install --no-cache-dir ".[all]"
+           else
+             export INSTALL_DIR=$(pwd)
+             bash docker/common/install_dep.sh --library "te,mcore,extra" --mode install
+             pip install --no-cache-dir ".[all]"
+           fi
+
+       - name: Check disk space after installation
+         run: df -h
+
+       - name: Run import checks
+         run: |
+           # Run import checks
+           for collection in "asr" "tts" "lightning" "core"; do
+             python tests/core_ptl/check_imports.py --domain "$collection"
+           done
+
+   test-asr-install-linux-amd:
+     name: ubuntu-22.04-amd-py${{ matrix.python }}-asr
+     runs-on: ubuntu-22.04
+     strategy:
+       fail-fast: false
+       matrix:
+         python: ["3.10", "3.11", "3.12"]
+     steps:
+       - name: Checkout repo
+         uses: actions/checkout@v2
+
+       - name: Check disk space before cleanup
+         run: df -h
+
+       - name: Free up disk space
+         run: |
+           # Remove unnecessary packages and files on Ubuntu
+           sudo apt-get clean
+           sudo rm -rf /usr/local/lib/android || true
+           sudo rm -rf /opt/ghc || true
+           sudo rm -rf /usr/local/.ghcup || true
+           sudo rm -rf /usr/share/dotnet || true
+           sudo rm -rf /opt/az || true
+           # Clear pip and npm caches
+           pip cache purge || true
+           sudo npm cache clean --force || true
+
+       - name: Check disk space after cleanup
+         run: df -h
+
+       - name: Install Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python }}
+
+       - name: Install NeMo
+         run: |
+           pip install --no-cache-dir --upgrade pip
+           pip install --no-cache-dir ".[asr]"
+
+       - name: Check disk space after installation
+         run: df -h
+
+       - name: Run import checks
+         run: |
+           # Run import checks
+           python tests/core_ptl/check_imports.py --domain asr
+
+   test-installs-linux-arm:
+     name: ubuntu-22.04-arm-py${{ matrix.python }}-${{ matrix.installer }}
+     runs-on: ubuntu-22.04-arm
+     strategy:
+       fail-fast: false
+       matrix:
+         python: ["3.10", "3.11", "3.12"]
+         installer: ["pip-install", "nemo-install"]
+     steps:
+       - name: Checkout repo
+         uses: actions/checkout@v2
+
+       - name: Check disk space before cleanup
+         run: df -h
+
+       - name: Free up disk space
+         run: |
+           # Remove unnecessary packages and files on Ubuntu ARM
+           sudo apt-get clean
+           sudo rm -rf /usr/local/lib/android || true
+           sudo rm -rf /opt/ghc || true
+           sudo rm -rf /usr/local/.ghcup || true
+           sudo rm -rf /usr/share/dotnet || true
+           sudo rm -rf /opt/az || true
+           # Clear pip and npm caches
+           pip cache purge || true
+           sudo npm cache clean --force || true
+
+       - name: Check disk space after cleanup
+         run: df -h
+
+       - name: Install Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python }}
+
+       - name: Install NeMo
+         env:
+           INSTALLER: ${{ matrix.installer }}
+         run: |
+           if [ "$INSTALLER" = "pip-install" ]; then
+             pip install --no-cache-dir --upgrade pip
+             pip install --no-cache-dir ".[all]"
+           else
+             export INSTALL_DIR=$(pwd)
+             bash docker/common/install_dep.sh --library "te,mcore,extra" --mode install
+             pip install --no-cache-dir ".[all]"
+           fi
+
+       - name: Check disk space after installation
+         run: df -h
+
+       - name: Run import checks
+         run: |
+           # Run import checks
+           for collection in "asr" "tts" "lightning" "core"; do
+             python tests/core_ptl/check_imports.py --domain "$collection"
+           done
+
+   test-asr-installs-linux-arm:
+     name: ubuntu-22.04-arm-py${{ matrix.python }}-asr
+     runs-on: ubuntu-22.04-arm
+     strategy:
+       fail-fast: false
+       matrix:
+         python: ["3.10", "3.11", "3.12"]
+     steps:
+       - name: Checkout repo
+         uses: actions/checkout@v2
+
+       - name: Check disk space before cleanup
+         run: df -h
+
+       - name: Free up disk space
+         run: |
+           # Remove unnecessary packages and files on Ubuntu ARM
+           sudo apt-get clean
+           sudo rm -rf /usr/local/lib/android || true
+           sudo rm -rf /opt/ghc || true
+           sudo rm -rf /usr/local/.ghcup || true
+           sudo rm -rf /usr/share/dotnet || true
+           sudo rm -rf /opt/az || true
+           # Clear pip and npm caches
+           pip cache purge || true
+           sudo npm cache clean --force || true
+
+       - name: Check disk space after cleanup
+         run: df -h
+
+       - name: Install Python
+         uses: actions/setup-python@v5
+         with:
+           python-version: ${{ matrix.python }}
+
+       - name: Install NeMo
+         run: |
+           pip install --no-cache-dir --upgrade pip
+           pip install --no-cache-dir ".[asr]"
+
+       - name: Check disk space after installation
+         run: df -h
+
+       - name: Run import checks
+         run: |
+           # Run import checks
+           python tests/core_ptl/check_imports.py --domain asr
.github/workflows/labeler.yml ADDED
@@ -0,0 +1,14 @@
+ name: "Pull Request Labeler"
+ on:
+   - pull_request_target
+
+ jobs:
+   triage:
+     permissions:
+       contents: read
+       pull-requests: write
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/labeler@v4
+         with:
+           repo-token: "${{ secrets.GITHUB_TOKEN }}"
.github/workflows/mcore-tag-bump-bot.yml ADDED
@@ -0,0 +1,62 @@
+ # Regularly bumps the Megatron-LM and NeMo tags used in CI
+ name: Megatron Tag Bump Bot
+ on:
+   workflow_dispatch:
+   schedule:
+     - cron: 0 0 * * *
+
+ jobs:
+   get-release-branch-names:
+     runs-on: ubuntu-latest
+     outputs:
+       mcore: ${{ steps.get-branch.outputs.mcore_release_branch }}
+       nemo: ${{ steps.get-branch.outputs.nemo_release_branch }}
+     steps:
+       - name: Get release branch names
+         id: get-branch
+         run: |
+           latest_branch=$(git ls-remote --heads https://github.com/NVIDIA/Megatron-LM.git 'refs/heads/core_r*' |
+             grep -o 'core_r[0-9]\+\.[0-9]\+\.[0-9]\+' |
+             sort -V |
+             tail -n1)
+           echo "mcore_release_branch=$latest_branch" >> $GITHUB_OUTPUT
+
+           latest_branch=$(git ls-remote --heads https://github.com/NVIDIA/NeMo.git 'refs/heads/r*' |
+             grep -o 'r[0-9]\+\.[0-9]\+\.[0-9]\+' |
+             sort -V |
+             tail -n1)
+           echo "nemo_release_branch=$latest_branch" >> $GITHUB_OUTPUT
+
+   bump-tags:
+     needs: [get-release-branch-names]
+     strategy:
+       fail-fast: false
+       matrix:
+         include:
+           - nemo-target-branch: ${{ needs.get-release-branch-names.outputs.nemo }}
+             mcore-target-branch: ${{ needs.get-release-branch-names.outputs.mcore }}
+           - nemo-target-branch: main
+             mcore-target-branch: main
+     uses: ./.github/workflows/_bump_mcore_tag.yml
+     with:
+       nemo-target-branch: ${{ matrix.nemo-target-branch }}
+       mcore-target-branch: ${{ matrix.mcore-target-branch }}
+     secrets:
+       PAT: ${{ secrets.PAT }}
+
+   notify:
+     if: failure()
+     runs-on: ubuntu-latest
+     needs: [bump-tags]
+     steps:
+       - name: Notify
+         env:
+           SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+           SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_WEBHOOK_ADMIN }}>
+           GITHUB_RUN_ID: ${{ github.run_id }}
+           GITHUB_REPOSITORY: ${{ github.repository }}
+         run: |
+           curl -X POST \
+             -H 'Content-type: application/json' \
+             --data "{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Mcore-bump-bot workflow> failed. Please fix manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \
+             $SLACK_WEBHOOK
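
The `git ls-remote | grep -o ... | sort -V | tail -n1` pipeline above picks the highest `core_rX.Y.Z` release branch by version sort. An equivalent sketch in Python (the function name and sample refs are ours, for illustration):

import re

def latest_release_branch(refs: list[str]) -> str:
    # Parse each ref's X.Y.Z into an integer tuple and take the maximum,
    # which is what `sort -V | tail -n1` does for these branch names.
    versions = []
    for ref in refs:
        m = re.search(r"core_r(\d+)\.(\d+)\.(\d+)", ref)
        if m:
            versions.append((tuple(map(int, m.groups())), m.group(0)))
    return max(versions)[1]

refs = ["refs/heads/core_r0.9.0", "refs/heads/core_r0.10.0", "refs/heads/core_r0.2.1"]
print(latest_release_branch(refs))  # core_r0.10.0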
.github/workflows/monitor-single-vm.yml ADDED
@@ -0,0 +1,54 @@
+ name: ~shut down a single VM
+
+ on:
+   workflow_call:
+     inputs:
+       vm:
+         type: string
+         description: Name of VM
+         required: true
+       n_gpus:
+         type: string
+         description: Number of GPUs this VM has
+         required: true
+
+ jobs:
+   check-status-and-maybe-shutdown:
+     environment: main
+     runs-on: ${{ inputs.vm }}
+     outputs:
+       status: ${{ steps.status.outputs.main }}
+     steps:
+       - name: Check status
+         id: status
+         run: |
+           docker run --rm --runtime=nvidia --gpus ${{ inputs.n_gpus }} ubuntu nvidia-smi
+
+           NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+
+           if [[ $NUM_GPUS -ne ${{ inputs.n_gpus }} ]]; then
+             echo "Issues with GPU detected, will take this runner offline."
+             echo "main=degraded" >> "$GITHUB_OUTPUT"
+           else
+             echo "main=healthy" >> "$GITHUB_OUTPUT"
+           fi
+
+       - name: Send Slack message & Disconnect runner from GitHub
+         if: ${{ steps.status.outputs.main == 'degraded' || failure() }}
+         run: |
+           MESSAGE='{
+             "blocks": [
+               {
+                 "type": "section",
+                 "text": {
+                   "type": "mrkdwn",
+                   "text": ":alert: VM bot 🤖: Hey <!subteam^${{ secrets.SLACK_WEBHOOK_ADMIN }}>: VM `${{ inputs.vm }}` is having not the best day of their life, maybe bring them an apple or so."
+                 }
+               }
+             ]
+           }'
+
+           curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${{ secrets.SLACK_WEBHOOK }}
+
+           cd /home/azureuser/actions-runner
+           echo ${{ secrets.VM_KEY }} | sudo -S ./svc.sh stop
.github/workflows/monitor-vms.yml ADDED
@@ -0,0 +1,54 @@
+ # Regularly checks the self-hosted VMs and takes degraded runners offline
+ name: Reboots VMs in a controlled way
+ on:
+   schedule:
+     - cron: 0/15 * * * *
+   workflow_dispatch:
+
+ jobs:
+   pre-flight:
+     runs-on: ubuntu-latest
+     if: github.repository_owner == 'NVIDIA'
+     outputs:
+       list-of-vms: ${{ steps.main.outputs.main }}
+     environment: main
+     steps:
+       - name: Get list of VMs
+         id: main
+         env:
+           GITHUB_TOKEN: ${{ secrets.PAT }}
+         run: |
+           RUNNERS=$(curl -L \
+             -H "Accept: application/vnd.github+json" \
+             -H "Authorization: Bearer $GITHUB_TOKEN" \
+             -H "X-GitHub-Api-Version: 2022-11-28" \
+             https://api.github.com/repos/NVIDIA/NeMo/actions/runners)
+
+           MATRIX=$(echo $RUNNERS \
+             | jq -c '[
+                 .runners[]
+                 | select(.status == "online")
+                 | select(.name | contains("cpu") | not)
+                 | {
+                     "vm": .name,
+                     "n_gpus": [
+                       .labels[]
+                       | select(.name | endswith("gpu")) | .name
+                     ][0][:1]
+                   }
+               ]
+             '
+           )
+           echo main=$MATRIX | tee -a "$GITHUB_OUTPUT"
+
+   maintenance:
+     needs: pre-flight
+     strategy:
+       fail-fast: false
+       matrix:
+         include: ${{ fromJSON(needs.pre-flight.outputs.list-of-vms) }}
+     uses: ./.github/workflows/monitor-single-vm.yml
+     with:
+       vm: ${{ matrix.vm }}
+       n_gpus: ${{ matrix.n_gpus }}
+     secrets: inherit # pragma: allowlist secret
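
The jq query in pre-flight keeps online, non-CPU runners and derives `n_gpus` from the first character of a label ending in `gpu` (e.g. `2gpu`). An approximate Python rendering of that matrix construction, for illustration only:

def build_matrix(runners: list[dict]) -> list[dict]:
    # Keep online GPU runners; read the GPU count off a label like "2gpu".
    matrix = []
    for runner in runners:
        if runner["status"] != "online" or "cpu" in runner["name"]:
            continue
        gpu_labels = [l["name"] for l in runner["labels"] if l["name"].endswith("gpu")]
        if gpu_labels:
            matrix.append({"vm": runner["name"], "n_gpus": gpu_labels[0][:1]})
    return matrix

runners = [
    {"name": "azure-gpu-vm-1", "status": "online",
     "labels": [{"name": "self-hosted"}, {"name": "2gpu"}]},
    {"name": "azure-cpu-vm-1", "status": "online", "labels": [{"name": "cpu"}]},
]
print(build_matrix(runners))  # [{'vm': 'azure-gpu-vm-1', 'n_gpus': '2'}]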
.github/workflows/release-freeze.yml ADDED
@@ -0,0 +1,85 @@
+ name: "Code freeze"
+
+ on:
+   workflow_dispatch:
+     inputs:
+       type_of_release:
+         type: choice
+         description: Type of release
+         options:
+           - major
+           - minor
+       freeze-commit:
+         type: string
+         description: Commit SHA to use for cut-off
+         required: false
+         default: main
+       mcore_version:
+         description: "Version of MCore to use (must be a valid git ref)"
+         required: true
+         type: string
+       dry-run:
+         type: boolean
+         description: Dry-run of code-freeze
+         required: false
+         default: true
+
+ jobs:
+   code-freeze:
+     uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
+     with:
+       library-name: NeMo-Toolkit
+       python-package: nemo
+       release-type: ${{ inputs.type_of_release }}
+       freeze-commit: ${{ inputs.freeze-commit }}
+       dry-run: ${{ inputs.dry-run }}
+       use-pat: true
+     secrets:
+       SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
+       SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
+       PAT: ${{ secrets.PAT }}
+
+   freeze-tags:
+     runs-on: ubuntu-latest
+     needs: [code-freeze]
+     environment: main
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v4
+         with:
+           path: ${{ github.run_id }}
+           token: ${{ secrets.PAT }}
+           fetch-depth: 0
+           fetch-tags: true
+           ref: ${{ inputs.dry-run == true && inputs.freeze-commit || needs.code-freeze.outputs.release-branch }}
+
+       - name: Pin branch name in Notebooks
+         run: |
+           cd ${{ github.run_id }}
+           find tutorials -type f -name "*.ipynb" -exec sed -i "s/BRANCH = 'main'/BRANCH = '${{ needs.code-freeze.outputs.release-branch }}'/g" {} +
+
+       - name: Pin MCore in Dockerfile
+         run: |
+           cd ${{ github.run_id }}
+           sed -i 's/^ARG MCORE_TAG=.*$/ARG MCORE_TAG=${{ inputs.mcore_version }}/' docker/Dockerfile.ci
+
+       - name: Show status
+         run: |
+           cd ${{ github.run_id }}
+           git status
+
+       - name: Create PR
+         uses: peter-evans/create-pull-request@v6
+         id: create-pull-request
+         if: ${{ inputs.dry-run != true }}
+         with:
+           path: ${{ github.run_id }}
+           base: ${{ needs.code-freeze.outputs.release-branch }}
+           branch: ci/freeze-tags-${{ needs.code-freeze.outputs.release-branch }}
+           title: "Freeze tags in `${{ needs.code-freeze.outputs.release-branch }}`"
+           body: |
+             🚀 PR to freeze tags in `${{ needs.code-freeze.outputs.release-branch }}`.
+
+           commit-message: "[🤠]: Howdy folks, let's release NeMo `${{ needs.code-freeze.outputs.release-branch }}` !"
+           signoff: true
+           assignees: okoenig
.github/workflows/release.yml ADDED
@@ -0,0 +1,48 @@
+ # Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ name: "Release Neural Modules"
+
+ on:
+   workflow_dispatch:
+     inputs:
+       release-ref:
+         description: Ref (SHA or branch name) to release
+         required: true
+         type: string
+       version-bump-branch:
+         description: Branch for version bump
+         required: true
+         type: string
+       dry-run:
+         description: Do not publish a wheel or GitHub release.
+         required: true
+         default: true
+         type: boolean
+
+ jobs:
+   release:
+     uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
+     with:
+       release-ref: ${{ inputs.release-ref }}
+       python-package: nemo
+       python-version: "3.10"
+       library-name: Neural Modules
+       dry-run: ${{ inputs.dry-run }}
+       version-bump-branch: ${{ inputs.version-bump-branch }}
+     secrets:
+       TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
+       TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
+       SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
+       SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
+       PAT: ${{ secrets.PAT }}
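Similarly, a hedged sketch of dispatching a dry-run release; the ref and branch names here are placeholders:

    gh workflow run release.yml \
      --repo NVIDIA/NeMo \
      -f release-ref=r9.9.9 \
      -f version-bump-branch=ci/bump-after-r9.9.9 \
      -f dry-run=true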
.github/workflows/secrets-detector.yml ADDED
@@ -0,0 +1,43 @@
+ # Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ name: Secrets detector
+
+ on:
+   pull_request_target:
+     branches:
+       - 'main'
+
+ jobs:
+   main:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v4
+         with:
+           fetch-depth: 0
+           token: ${{ secrets.NEMO_REFORMAT_TOKEN }}
+
+       - name: Install secrets detector
+         run: pip install detect-secrets
+
+       - name: Run on change-set
+         run: |
+           git diff --name-only --diff-filter=d --merge-base origin/main -z | xargs -0 detect-secrets-hook --disable-plugin HexHighEntropyString --baseline .secrets.baseline
+
+       - uses: EndBug/add-and-commit@v9
+         # Commit changes; nothing is committed if there are none.
+         if: always()
+         with:
+           message: Update baseline
+           commit: --signoff
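The same check can be reproduced locally before pushing (a sketch assuming `origin/main` is the upstream base and a git version that supports `--merge-base`, i.e. 2.30+):

    pip install detect-secrets
    git diff --name-only --diff-filter=d --merge-base origin/main -z \
      | xargs -0 detect-secrets-hook --disable-plugin HexHighEntropyString --baseline .secrets.baseline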
.github/workflows/update-buildcache.yml ADDED
@@ -0,0 +1,110 @@
+ # Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ name: Update build cache
+ on:
+   schedule:
+     - cron: 0 0 * * *
+   push:
+     branches:
+       - main
+   workflow_dispatch:
+     inputs:
+       runner:
+         required: false
+         default: self-hosted-azure-builder
+         type: string
+         description: VM to use for build
+
+ jobs:
+   pre-flight:
+     runs-on: ubuntu-latest
+     outputs:
+       build_args: ${{ steps.manifest.outputs.BUILD_ARGS }}
+       cache-from: ${{ steps.cache_from.outputs.LAST_PRS }}
+     steps:
+       - name: Checkout branch
+         uses: actions/checkout@v4
+
+       - name: Parse manifest.json
+         id: manifest
+         run: |
+           BUILD_ARGS=$(cat << EOF
+           BASE_IMAGE=$(cat requirements/manifest.json | jq -r '."ngc-pytorch"')
+           TRTLLM_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."trt-llm".repo')
+           TRTLLM_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."trt-llm".ref')
+           MLM_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."megatron-lm".repo')
+           MLM_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."megatron-lm".ref')
+           TE_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".transformer_engine.repo')
+           TE_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".transformer_engine.ref')
+           APEX_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".apex.repo')
+           APEX_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".apex.ref')
+           EOF
+           )
+
+           echo "BUILD_ARGS<<EOF" >> $GITHUB_OUTPUT
+           echo "$BUILD_ARGS" >> $GITHUB_OUTPUT
+           echo "EOF" >> $GITHUB_OUTPUT
+
+       - name: Get last merged PRs
+         id: cache_from
+         env:
+           GH_TOKEN: ${{ github.token }}
+         run: |
+           LAST_PRS=$(gh api graphql -f query='
+             query {
+               repository(owner: "NVIDIA", name: "NeMo") {
+                 pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) {
+                   nodes {
+                     number
+                   }
+                 }
+               }
+             }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do
+             echo "nemoci.azurecr.io/nemo_container-buildcache:$number"
+           done)
+
+           echo "LAST_PRS<<EOF" >> $GITHUB_OUTPUT
+           echo "$LAST_PRS" >> $GITHUB_OUTPUT
+           echo "EOF" >> $GITHUB_OUTPUT
+
+   cicd-test-container-build:
+     needs: [pre-flight]
+     uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
+     strategy:
+       fail-fast: false
+       matrix:
+         include:
+           - dockerfile: docker/Dockerfile.ci
+             image-name: nemo_container_automodel
+           - dockerfile: docker/Dockerfile.ci
+             image-name: nemo_container_nemo2
+           - dockerfile: docker/Dockerfile.ci
+             image-name: nemo_container_speech
+           - dockerfile: docker/Dockerfile.ci
+             image-name: nemo_container
+     with:
+       image-name: ${{ matrix.image-name }}
+       dockerfile: ${{ matrix.dockerfile }}
+       image-label: nemo-core
+       build-args: |
+         IMAGE_LABEL=nemo-core
+         NEMO_TAG=${{ github.sha }}
+         NEMO_REPO=https://github.com/NVIDIA/NeMo
+         ${{ needs.pre-flight.outputs.build_args }}
+       runner: ${{ inputs.runner || 'self-hosted-azure-builder' }}
+       use-inline-cache: false
+       prune-filter-timerange: 24h
+       cache-from: |
+         nemoci.azurecr.io/${{ matrix.image-name }}-buildcache:main
+         ${{ needs.pre-flight.outputs.cache-from }}
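For reference, a minimal sketch of the `requirements/manifest.json` shape the jq queries in `pre-flight` assume; all repo URLs and refs below are placeholders, not the real pins:

    # Hypothetical manifest contents; the URLs and refs are illustrative only.
    MANIFEST='{"ngc-pytorch": "nvcr.io/nvidia/pytorch:99.99-py3",
      "vcs-dependencies": {
        "trt-llm": {"repo": "https://github.com/example/trt-llm", "ref": "deadbeef"},
        "megatron-lm": {"repo": "https://github.com/example/megatron-lm", "ref": "deadbeef"},
        "transformer_engine": {"repo": "https://github.com/example/te", "ref": "deadbeef"},
        "apex": {"repo": "https://github.com/example/apex", "ref": "deadbeef"}}}'
    # Same jq paths as the "Parse manifest.json" step above:
    echo "$MANIFEST" | jq -r '."ngc-pytorch"'                          # base image
    echo "$MANIFEST" | jq -r '."vcs-dependencies"."megatron-lm".ref'   # MLM_TAG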