Spaces:

subhankarg
/

MagpieTTS_Internal_Demo

Runtime error

App Files Files Community

MagpieTTS_Internal_Demo / nemo /export /onnx_llm_exporter.py

subhankarg

Upload folder using huggingface_hub

0558aa4 verified 11 days ago

raw

history blame contribute delete

18.9 kB

	# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.


	import warnings
	from pathlib import Path
	from typing import TYPE_CHECKING, Dict, List, Optional, Union

	import numpy as np
	import torch
	import wrapt
	from transformers import AutoModel, AutoTokenizer

	from nemo.deploy import ITritonDeployable
	from nemo.export.utils import get_example_inputs, get_model_device_type, is_nemo2_checkpoint, validate_fp8_network
	from nemo.utils import logging

	if TYPE_CHECKING:
	import tensorrt as trt


	@wrapt.decorator
	def noop_decorator(func):
	"""No op decorator"""

	def wrapper(args, *kwargs):
	return func(args, *kwargs)

	return wrapper


	use_pytriton = True
	batch = noop_decorator
	try:
	from pytriton.decorators import batch
	except Exception:
	logging.warning("PyTriton is not available.")
	use_pytriton = False


	use_onnxruntime = True
	try:
	import onnxruntime
	except Exception:
	logging.warning("onnxruntime is not available.")
	use_onnxruntime = False


	use_trt = True
	try:
	import tensorrt as trt
	except ImportError:
	logging.warning("tensorrt is not available")
	use_trt = False


	# pylint: disable=line-too-long
	class OnnxLLMExporter(ITritonDeployable):
	"""
	Exports models to ONNX and run fast inference.

	Example:
	from nemo.export.onnx_llm_exporter import OnnxLLMExporter

	onnx_llm_exporter = OnnxLLMExporter(
	onnx_model_dir="/path/for/onnx_model/files",
	model_name_or_path="/path/for/model/files",
	)

	onnx_llm_exporter.export(
	input_names=["input_ids", "attention_mask", "dimensions"],
	output_names=["embeddings"],
	)

	output = onnx_llm_exporter.forward(["Hi, how are you?", "I am good, thanks, how about you?"])
	print("output: ", output)
	"""

	def __init__(
	self,
	onnx_model_dir: str,
	model: Optional[torch.nn.Module] = None,
	tokenizer=None,
	model_name_or_path: str = None,
	load_runtime: bool = True,
	):
	"""
	Initializes the ONNX Exporter.

	Args:
	onnx_model_dir (str): path for storing the ONNX model files.
	model (Optional[torch.nn.Module]): torch model.
	tokenizer (HF or NeMo tokenizer): tokenizer class.
	model_name_or_path (str): a path for ckpt or HF model ID
	load_runtime (bool): load ONNX runtime if there is any exported model available in
	the onnx_model_dir folder.
	"""
	self.onnx_model_dir = onnx_model_dir
	self.model_name_or_path = model_name_or_path
	self.onnx_model_path = str(Path(onnx_model_dir) / "model.onnx")
	self.model = model
	self.tokenizer = tokenizer
	self.model_input_names = None
	self.model_output_names = None
	self.onnx_runtime_session = None
	self.calibration_data = None
	self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	self.quant_max_batch_size = None

	if self.model_name_or_path is not None:
	if model is not None:
	raise ValueError("A model was also passed but it will be overridden.")

	if Path(self.model_name_or_path).is_dir():
	if is_nemo2_checkpoint(self.model_name_or_path):
	raise NotImplementedError("NeMo 2.0 checkpoint will be supported later.")
	else:
	self._load_hf_model()

	if load_runtime:
	self._load_runtime()

	def _load_runtime(self):
	if use_onnxruntime:
	if Path(self.onnx_model_path).exists():
	self.onnx_runtime_session = onnxruntime.InferenceSession(self.onnx_model_path)
	self.model_input_names = [input.name for input in self.onnx_runtime_session.get_inputs()]
	self.model_output_names = [output.name for output in self.onnx_runtime_session.get_outputs()]
	self.tokenizer = AutoTokenizer.from_pretrained(
	Path(self.onnx_model_dir) / "tokenizer", trust_remote_code=True
	)

	def _load_hf_model(self):
	self.model = AutoModel.from_pretrained(
	self.model_name_or_path,
	trust_remote_code=True,
	).eval()

	self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, trust_remote_code=True)

	def export(
	self,
	input_names: list,
	output_names: list,
	example_inputs: dict = None,
	opset: int = 20,
	dynamic_axes_input: Optional[dict] = None,
	dynamic_axes_output: Optional[dict] = None,
	export_dtype: str = "fp32",
	verbose: bool = False,
	):
	"""
	Performs ONNX conversion from a PyTorch model.

	Args:
	input_names (list): input parameter names of the model that ONNX will export will use.
	output_names (list): output parameter names of the model that ONNX will export will use.
	example_inputs (dict): example input for the model to build the engine.
	opset (int): ONNX opset version. Default is 20.
	dynamic_axes_input (dict): Variable length axes for the input.
	dynamic_axes_output (dict): Variable length axes for the output.
	export_dtype (str): Export dtype, fp16 or fp32.
	verbose (bool): Enable verbose or not.
	"""

	self._export_to_onnx(
	input_names=input_names,
	example_inputs=example_inputs,
	output_names=output_names,
	opset=opset,
	dynamic_axes_input=dynamic_axes_input,
	dynamic_axes_output=dynamic_axes_output,
	export_dtype=export_dtype,
	verbose=verbose,
	)
	self._load_runtime()

	def _export_to_onnx(
	self,
	input_names: list,
	output_names: list,
	example_inputs: dict = None,
	opset: int = 20,
	dynamic_axes_input: Optional[dict] = None,
	dynamic_axes_output: Optional[dict] = None,
	export_dtype: Union[torch.dtype, str] = "fp32",
	verbose: bool = False,
	):

	if example_inputs is None:
	example_inputs = get_example_inputs(self.tokenizer)

	if "dimensions" in input_names:
	example_inputs["dimensions"] = torch.tensor([1] * example_inputs["input_ids"].shape[0])

	if isinstance(export_dtype, str):
	export_dtype = {"fp16": torch.float16, "fp32": torch.float32}[export_dtype]

	self.model.to(export_dtype)

	Path(self.onnx_model_dir).mkdir(parents=True, exist_ok=True)

	with torch.autocast(device_type=get_model_device_type(self.model), dtype=export_dtype):
	torch.onnx.export(
	model=self.model,
	args=(example_inputs,),
	f=self.onnx_model_path,
	input_names=input_names,
	output_names=output_names,
	dynamic_axes={dynamic_axes_input, dynamic_axes_output},
	verbose=verbose,
	opset_version=opset,
	)
	logging.info(f"Successfully exported PyTorch model to ONNX model {self.onnx_model_path}")

	existing_directory_path = Path(self.onnx_model_dir) / "tokenizer"
	existing_directory_path.mkdir(exist_ok=True)
	self.tokenizer.save_pretrained(existing_directory_path)

	def export_onnx_to_trt(
	self,
	trt_model_dir: str,
	profiles=None,
	override_layernorm_precision_to_fp32: bool = False,
	override_layers_to_fp32: List = None,
	trt_dtype: str = "fp16",
	profiling_verbosity: str = "layer_names_only",
	trt_builder_flags: List["trt.BuilderFlag"] = None,
	) -> None:
	"""Performs TensorRT conversion from an ONNX model.

	Args:
	trt_model_dir: path to store the TensorRT model.
	profiles: TensorRT profiles.
	override_layernorm_precision_to_fp32 (bool): whether to convert layers to fp32 or not.
	override_layers_to_fp32 (List): Layer names to be converted to fp32.
	trt_dtype (str): "fp16" or "fp32".
	profiling_verbosity (str): Profiling verbosity. Default is "layer_names_only".
	trt_builder_flags (List[trt.BuilderFlag]): TRT specific flags.
	"""
	logging.info(f"Building TRT engine from ONNX model ({self.onnx_model_path})")
	trt_logger = trt.Logger(trt.Logger.WARNING)
	builder = trt.Builder(trt_logger)
	network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
	config = builder.create_builder_config()
	parser = trt.OnnxParser(network, trt_logger)

	# we use parse_from_file() instead of parse() because it can be used for both single
	# file models as well as externally stored models (required when model >2GiB)
	if not parser.parse_from_file(self.onnx_model_path):
	logging.warning("ONNX model could not be parsed")
	for error in range(parser.num_errors):
	logging.error(parser.get_error(error))
	return

	if profiles:
	for profile in profiles:
	optimization_profile = builder.create_optimization_profile()

	for i in range(network.num_inputs):
	in_tensor = network.get_input(i)
	optimization_profile.set_shape(
	in_tensor.name,
	min=profile[in_tensor.name][0],
	opt=profile[in_tensor.name][1],
	max=profile[in_tensor.name][2],
	)

	config.add_optimization_profile(optimization_profile)

	if trt_dtype == "fp16":
	logging.info("Setting Build Flag FP16")
	config.set_flag(trt.BuilderFlag.FP16)
	elif trt_dtype == "fp8":
	# With FP8 export we want to also enable FP16 layers as a fallback instead of FP32
	logging.info("Setting Build Flag FP8 and FP16")
	config.set_flag(trt.BuilderFlag.FP8)
	config.set_flag(trt.BuilderFlag.FP16)
	validate_fp8_network(network)

	# patch network
	if override_layernorm_precision_to_fp32:
	logging.info("Overriding TensorRT network LayerNorm precision to float32.")
	self._override_layernorm_precision_to_fp32(network)

	if override_layers_to_fp32:
	logging.info("Overriding some layers to float32.")
	self._override_layers_to_fp32(network, override_layers_to_fp32)

	try:
	config.profiling_verbosity = {
	"detailed": trt.ProfilingVerbosity.DETAILED,
	"layer_names_only": trt.ProfilingVerbosity.LAYER_NAMES_ONLY,
	"none": trt.ProfilingVerbosity.NONE,
	}[profiling_verbosity]
	except KeyError:
	error_msg = "Unknown profiling verbosity value."
	raise ValueError(error_msg)
	logging.info(f"Setting Profiling Verbosity to {config.profiling_verbosity}")

	if trt_builder_flags is not None:
	for flag in trt_builder_flags:
	config.set_flag(flag)

	engine_string = builder.build_serialized_network(network, config)
	if engine_string is None:
	raise Exception("Failed to serialize the TensorRT Engine. Please check the " "TensorRT logs for details")

	trt_model_path = Path(trt_model_dir)
	trt_model_path.mkdir(parents=True, exist_ok=True)
	trt_model_path = trt_model_path / "model.plan"
	trt_model_path.write_bytes(engine_string)
	logging.info(f"Successfully exported ONNX model ({self.onnx_model_path}) " f"to TRT engine ({trt_model_path})")

	def _override_layer_precision_to_fp32(self, layer: "trt.ILayer") -> None:
	layer.precision = trt.float32
	layer.set_output_type(0, trt.float32)

	def _override_layers_to_fp32(self, network: "trt.INetworkDefinition", fp32_layer_patterns: list[str]) -> None:
	for i in range(network.num_layers):
	layer = network.get_layer(i)
	layer_name = layer.name
	if any(layer_name.startswith(pattern) for pattern in fp32_layer_patterns) and layer.precision in {
	trt.float32,
	trt.float16,
	}:
	if layer.type in {trt.LayerType.CAST}:
	logging.info(f"Skipping overriding {layer.type} layer {i} {layer_name} dtype")
	continue
	if any(
	layer.get_input(input_idx).dtype in {trt.float32, trt.float16}
	for input_idx in range(layer.num_inputs)
	):
	# Note: Assigning to layer.precision (even the same value) sets precision_is_set=True,
	# which prevents TensorRT from changing this layer's precision
	layer.precision = trt.float32
	logging.info(f"Setting layer {i} {layer_name} (type: {layer.type}) precision to FP32")
	for j in range(layer.num_outputs):
	if layer.get_output_type(j) in {trt.float32, trt.float16}:
	layer.set_output_type(j, trt.float32)
	logging.info(f"Setting layer {i} {layer_name} (type: {layer.type}) output type {j} to FP32")

	def _override_layernorm_precision_to_fp32(self, network: "trt.INetworkDefinition") -> None:
	"""Set the precision of LayerNorm subgraphs to FP32 to preserve accuracy.

	- https://nvbugs/4478448 (Mistral)
	- https://nvbugs/3802112 (T5)

	Args:
	network: tensorrt.INetworkDefinition
	"""
	# Logic originally from OSS T5 HF export script:
	# https://gitlab-master.nvidia.com/TensorRT/Public/oss/-/blob/77495ec/demo/HuggingFace/T5/export.py
	pow_ops = {}
	for layer_index, layer in enumerate(network):
	if layer.type == trt.LayerType.IDENTITY:
	all_fp32 = all(
	[
	layer.output_type_is_set(o) and layer.get_output_type(o) == trt.float32
	for o in range(layer.num_outputs)
	]
	)
	if all_fp32:
	if layer.get_input(0).dtype == trt.float32:
	layer.precision = trt.float32

	if layer.type == trt.LayerType.ELEMENTWISE:
	layer.__class__ = getattr(trt, "IElementWiseLayer")
	if layer.op == trt.ElementWiseOperation.POW:
	pow_ops[layer] = layer_index
	self._override_layer_precision_to_fp32(layer)

	for _, index in pow_ops.items():
	# Iterate from few layers before pow to include residual add and cast op.
	# Iterate till 10 layers after pow op to include all
	# operations included in layer norm.
	START_OFFSET = 4
	END_OFFSET = 12
	for i in range(index - START_OFFSET, index + END_OFFSET):
	layer = network.get_layer(i)
	if layer.type == trt.LayerType.REDUCE:
	self._override_layer_precision_to_fp32(layer)

	if layer.type == trt.LayerType.ELEMENTWISE:
	layer.__class__ = getattr(trt, "IElementWiseLayer")
	if layer.op == trt.ElementWiseOperation.SUM:
	self._override_layer_precision_to_fp32(layer)

	if layer.type == trt.LayerType.UNARY:
	layer.__class__ = getattr(trt, "IUnaryLayer")
	if layer.op == trt.UnaryOperation.SQRT:
	self._override_layer_precision_to_fp32(layer)

	if layer.type == trt.LayerType.ELEMENTWISE:
	layer.__class__ = getattr(trt, "IElementWiseLayer")
	if layer.op == trt.ElementWiseOperation.DIV:
	self._override_layer_precision_to_fp32(layer)

	if layer.type == trt.LayerType.ELEMENTWISE:
	layer.__class__ = getattr(trt, "IElementWiseLayer")
	if layer.op == trt.ElementWiseOperation.PROD:
	self._override_layer_precision_to_fp32(layer)

	def forward(self, inputs: Union[List, Dict], dimensions: Optional[List] = None):
	"""Run inference for a given input.

	Args:
	inputs (Union[List, Dict]): Input for the model. If list, it should be a list of strings.
	If dict, it should be a dictionary with keys as the model input names.
	dimensions (Optional[List]): The dimensions parameter of the model. Required if the model
	was exported to accept dimensions parameter and inputs is given as a list of strings.

	Returns:
	np.ndarray: Model output.
	"""

	if self.onnx_runtime_session is None:
	warnings.warn("ONNX Runtime is not available. Please install the onnxruntime-gpu and try again.")
	return None

	if isinstance(inputs, List):
	if "dimensions" in self.model_input_names and dimensions is None:
	raise ValueError("Dimensions should be provided for list input.")
	inputs = dict(self.tokenizer(inputs))
	inputs["dimensions"] = dimensions

	output = self.onnx_runtime_session.run(self.model_output_names, inputs)
	return output[0]

	@property
	def get_model(self):
	"""Returns the model"""

	return self.model

	@property
	def get_tokenizer(self):
	"""Returns the tokenizer"""

	return self.tokenizer

	@property
	def get_model_input_names(self):
	"""Returns the model input names"""

	return self.model_input_names

	@property
	def get_triton_input(self):
	"""Get triton input"""

	raise NotImplementedError("This function will be implemented later.")

	@property
	def get_triton_output(self):
	"""Get triton output"""

	raise NotImplementedError("This function will be implemented later.")

	@batch
	def triton_infer_fn(self, **inputs: np.ndarray):
	"""PyTriton inference function"""

	raise NotImplementedError("This function will be implemented later.")