# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
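"""Unit tests for NeMo BERT: the HuggingFace/Megatron config presets and the
HuggingFace <-> Megatron weight-conversion transforms (QKV, biases, embeddings)."""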
from nemo.collections.llm.bert.model.bert import (
    HuggingFaceBertBaseConfig,
    HuggingFaceBertLargeConfig,
    MegatronBertBaseConfig,
    MegatronBertLargeConfig,
)


def test_huggingface_bert_base_config():
    config = HuggingFaceBertBaseConfig()
    assert config.bert_type == 'huggingface'
    assert config.num_layers == 12
    assert config.hidden_size == 768
    assert config.ffn_hidden_size == 3072
    assert config.num_attention_heads == 12


def test_huggingface_bert_large_config():
    config = HuggingFaceBertLargeConfig()
    assert config.bert_type == 'huggingface'
    assert config.num_layers == 24
    assert config.hidden_size == 1024
    assert config.ffn_hidden_size == 4096
    assert config.num_attention_heads == 16


def test_megatron_bert_base_config():
    config = MegatronBertBaseConfig()
    assert config.bert_type == 'megatron'
    assert config.num_layers == 12
    assert config.hidden_size == 768
    assert config.ffn_hidden_size == 3072
    assert config.num_attention_heads == 12


def test_megatron_bert_large_config():
    config = MegatronBertLargeConfig()
    assert config.bert_type == 'megatron'
    assert config.num_layers == 24
    assert config.hidden_size == 1024
    assert config.ffn_hidden_size == 4096
    assert config.num_attention_heads == 16


from dataclasses import dataclass
from unittest.mock import MagicMock

import pytest
import torch

from nemo.collections.llm.bert.model.bert import (
    _export_embedding,
    _export_qkv,
    _export_qkv_bias,
    _import_embedding,
    _import_embedding_2,
    _import_output_bias,
    _import_qkv,
    _import_qkv_2,
    _import_qkv_bias,
    _import_qkv_bias_2,
)


@dataclass
class MockConfig:
    num_attention_heads: int = 12
    hidden_size: int = 768
    kv_channels: int = 64
    make_vocab_size_divisible_by: int = 128
    vocab_size: int = 30522


class TestBertTransforms:
    @pytest.fixture
    def mock_ctx(self):
        # Minimal stand-in for the conversion context: the transforms only read
        # the source/target model configs.
        ctx = MagicMock()
        ctx.target.config = MockConfig()
        ctx.source.config = MockConfig()
        return ctx

    def test_import_qkv(self, mock_ctx):
        hidden_size = 768
        head_size = 64
        num_heads = 12
        q = torch.randn(num_heads * head_size, hidden_size)
        k = torch.randn(num_heads * head_size, hidden_size)
        v = torch.randn(num_heads * head_size, hidden_size)
        # Test both import functions
        for transform_fn in [_import_qkv.transform, _import_qkv_2.transform]:
            result = transform_fn(mock_ctx, q, k, v)
            # Check output shape
            expected_shape = (3 * num_heads * head_size, hidden_size)
            assert result.shape == expected_shape

    def test_import_qkv_bias(self, mock_ctx):
        head_size = 64
        num_heads = 12
        qb = torch.randn(num_heads * head_size)
        kb = torch.randn(num_heads * head_size)
        vb = torch.randn(num_heads * head_size)
        # Test both bias import functions
        for transform_fn in [_import_qkv_bias.transform, _import_qkv_bias_2.transform]:
            result = transform_fn(mock_ctx, qb, kb, vb)
            # Check output shape
            expected_shape = (3 * num_heads * head_size,)
            assert result.shape == expected_shape

    def test_import_embedding(self, mock_ctx):
        vocab_size = 30000  # Not a multiple of make_vocab_size_divisible_by (128)
        hidden_size = 768
        embedding = torch.randn(vocab_size, hidden_size)
        # Test both embedding import functions
        for transform_fn in [_import_embedding.transform, _import_embedding_2.transform]:
            result = transform_fn(mock_ctx, embedding)
            # Check padding to the next multiple of 128
            expected_padded_size = int(torch.ceil(torch.tensor(vocab_size) / 128) * 128)
            assert result.shape == (expected_padded_size, hidden_size)
            # Check original values preserved
            torch.testing.assert_close(result[:vocab_size], embedding)
            # Check padding is zeros
            assert torch.all(result[vocab_size:] == 0)

    def test_import_output_bias(self, mock_ctx):
        vocab_size = 30000
        bias = torch.randn(vocab_size)
        result = _import_output_bias.transform(mock_ctx, bias)
        # Check padding to the next multiple of 128
        expected_padded_size = int(torch.ceil(torch.tensor(vocab_size) / 128) * 128)
        assert result.shape == (expected_padded_size,)
        # Check original values preserved
        torch.testing.assert_close(result[:vocab_size], bias)
        # Check padding is zeros
        assert torch.all(result[vocab_size:] == 0)

    def test_export_qkv(self, mock_ctx):
        hidden_size = 768
        head_size = 64
        num_heads = 12
        # Create input tensor with shape [3 * num_heads * head_size, hidden_size]
        linear_qkv = torch.randn(3 * num_heads * head_size, hidden_size)
        q_proj, k_proj, v_proj = _export_qkv.transform(mock_ctx, linear_qkv)
        # Check output shapes
        assert q_proj.shape == (num_heads * head_size, hidden_size)
        assert k_proj.shape == (num_heads * head_size, hidden_size)
        assert v_proj.shape == (num_heads * head_size, hidden_size)

    def test_export_qkv_bias(self, mock_ctx):
        head_size = 64
        num_heads = 12
        # Create input bias tensor
        qkv_bias = torch.randn(3 * num_heads * head_size)
        q_bias, k_bias, v_bias = _export_qkv_bias.transform(mock_ctx, qkv_bias)
        # Check output shapes
        assert q_bias.shape == (num_heads * head_size,)
        assert k_bias.shape == (num_heads * head_size,)
        assert v_bias.shape == (num_heads * head_size,)

    def test_export_embedding(self, mock_ctx):
        vocab_size = 30522
        hidden_size = 768
        padded_vocab_size = 30720  # Padded vocab size (a multiple of 128)
        # Create padded embedding tensor
        embedding = torch.randn(padded_vocab_size, hidden_size)
        result = _export_embedding.transform(mock_ctx, embedding)
        # Check output shape matches vocab_size
        assert result.shape == (vocab_size, hidden_size)
        # Check values preserved
        torch.testing.assert_close(result, embedding[:vocab_size])