# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from pathlib import Path
from typing import Any, Callable, Dict, Optional, TypeVar, Union

import lightning.pytorch as pl
import torch
import torch.distributed as dist
from huggingface_hub.errors import HFValidationError
from lightning.fabric.plugins import CheckpointIO
from lightning.fabric.utilities.cloud_io import get_filesystem
from lightning.fabric.utilities.types import _PATH
from torch import nn
from typing_extensions import override

from nemo.lightning.ckpt_utils import HF_ADAPTER_CONFIG_FILENAME, HF_ADAPTER_PATH, HF_WEIGHTS_PATH, WEIGHTS_PATH
from nemo.lightning.io.mixin import IOMixin
from nemo.lightning.io.pl import ckpt_to_weights_subdir

log = logging.getLogger(__name__)

LightningModuleT = TypeVar("LightningModuleT", bound=pl.LightningModule)
ModuleT = TypeVar("ModuleT", bound=nn.Module)


def is_rank_0():
    """Checks whether rank == 0, accounting for an uninitialized distributed environment."""
    return not dist.is_available() or not dist.is_initialized() or dist.get_rank() == 0


class HFAdapterKeyRenamer:
    """Utility class for renaming LoRA adapter keys between NeMo and Hugging Face (PEFT) formats."""

    @staticmethod
    def nemo_to_hf(x):
        """Converts LoRA adapter FQNs from NeMo to HF naming."""
        return (
            x.replace("model.model.", "base_model.model.model.", 1)
            .replace("lora_a.weight", "lora_A.weight", 1)
            .replace("lora_b.weight", "lora_B.weight", 1)
        )
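
    # Illustrative round trip (a sketch; the layer path below is hypothetical, following common
    # HF decoder-block naming):
    #   nemo_to_hf("model.model.layers.0.self_attn.q_proj.lora_a.weight")
    #     -> "base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight"
    #   hf_to_nemo() applies the inverse renaming.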

    @staticmethod
    def hf_to_nemo(x):
        """Converts LoRA adapter FQNs from HF to NeMo naming."""
        return (
            x.replace("lora_B.weight", "lora_b.weight", 1)
            .replace("lora_A.weight", "lora_a.weight", 1)
            .replace("base_model.model.model.", "model.model.", 1)
        )


class HFCheckpointIO(CheckpointIO, IOMixin):
    """CheckpointIO plugin that saves and loads checkpoints in Hugging Face format.

    Full model weights are written with the model's ``save_pretrained`` (or only the LoRA adapter
    weights, as safetensors, when ``adapter_only`` is enabled), while the remaining trainer state
    is stored with :func:`torch.save`.

    .. warning:: This is an :ref:`experimental <versioning:Experimental API>` feature.
    """

    def __init__(self, model=None, adapter_only=False):
        """Initializes HFCheckpointIO.

        Args:
            model (nn.Module, optional): The nn.Module that's used for training.
                This supplies the save_pretrained function.
            adapter_only (bool, optional): If True, will only save LoRA adapter weights.
                Defaults to False.
        """
        super().__init__()
        self.adapter_only = adapter_only
        self.model = model

    @override
    def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_options: Optional[Any] = None) -> None:
        """Save model/training states to a checkpoint directory.

        Note:
            This function assumes it is only called from rank 0 when executed inside a
            distributed environment.

        Args:
            checkpoint: dict containing model and trainer state.
            path: write-target path.
            storage_options: not used by ``HFCheckpointIO.save_checkpoint``.

        Raises:
            NotImplementedError: If the model does not expose ``save_pretrained`` and
                ``adapter_only`` is disabled.
        """
        assert is_rank_0(), "Expected to run only on rank=0"
        # Determine checkpoint directory & make dir
        checkpoint_dir = ckpt_to_weights_subdir(path, is_saving=True)
        assert checkpoint_dir.parts[-1] == WEIGHTS_PATH, "Expected {} to end with {}".format(
            checkpoint_dir, WEIGHTS_PATH
        )
        # remove the WEIGHTS_PATH suffix
        checkpoint_dir = checkpoint_dir.parent
        fs = get_filesystem(checkpoint_dir)
        fs.makedirs(checkpoint_dir, exist_ok=True)
        if self.adapter_only:
            # In this case the output looks like the following:
            # default--reduced_train_loss=0.0112-epoch=2-step=3/
            # ├── context
            # │   ├── 10e845ad-f676-482e-b6d1-669e911cfaf8
            # │   ├── io.json
            # │   └── model.yaml
            # ├── hf_adapter
            # │   ├── adapter_config.json
            # │   └── adapter_model.safetensors
            # └── trainer.pt
            # Where the `trainer.pt` stores trainer's state (optimizer, dataloader, etc).
            # The `hf_adapter` directory contains the adapter's state dict, in HF format.
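            # A later reload sketch outside of NeMo (illustrative; `base_model` is a hypothetical
            # transformers model matching the one that was fine-tuned):
            #   from peft import PeftModel
            #   peft_model = PeftModel.from_pretrained(base_model, "<ckpt>/hf_adapter")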
            adapter_path = checkpoint_dir / HF_ADAPTER_PATH
            adapter_path.mkdir(exist_ok=True)
            self._save_adapter_weights_only(checkpoint.pop('state_dict'), adapter_path, storage_options)
            torch.save(checkpoint, checkpoint_dir / 'trainer.pt')
        elif callable(getattr(self.model, 'save_pretrained', None)):
            # In this case the output looks like the following:
            # default--reduced_train_loss=0.0112-epoch=2-step=3/
            # ├── hf_weights
            # │   ├── config.json
            # │   ├── generation_config.json
            # │   ├── model.safetensors
            # │   ├── special_tokens_map.json
            # │   ├── tokenizer.json
            # │   └── tokenizer_config.json
            # └── trainer.pt
            # Where the `hf_weights` directory contains the model's state dict, in HF format.
            # The `trainer.pt` stores trainer's state (optimizer, dataloader, etc).
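            # A later reload sketch outside of NeMo (illustrative; assumes the checkpoint holds a
            # causal LM in the standard HF layout shown above):
            #   from transformers import AutoModelForCausalLM, AutoTokenizer
            #   model = AutoModelForCausalLM.from_pretrained("<ckpt>/hf_weights")
            #   tokenizer = AutoTokenizer.from_pretrained("<ckpt>/hf_weights")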
            hf_weights_path = checkpoint_dir / HF_WEIGHTS_PATH
            hf_weights_path.mkdir(parents=True, exist_ok=True)
            self.model.save_pretrained(hf_weights_path, state_dict=checkpoint.pop('state_dict'))
            torch.save(checkpoint, checkpoint_dir / 'trainer.pt')
        else:
            super().save_checkpoint(checkpoint, path, storage_options)
            raise NotImplementedError("Checkpoint was saved at: " + str(path))

    def _save_adapter_weights_only(
        self, state_dict: Dict[str, Any], path: Union[str, Path], storage_options: Optional[Any] = None
    ) -> None:
        """Saves only the adapter weights in safetensors format.

        Args:
            state_dict (Dict[str, Any]): The state dictionary containing model weights.
            path (Union[str, Path]): The directory path where the adapter weights should be saved.
            storage_options (Optional[Any], optional): Additional storage options, if required.

        Raises:
            OSError: If saving the file fails.
        """
        from safetensors.torch import save_file

        # Rename keys in state_dict from NeMo to HF format
        module_names = list(state_dict.keys())
        for name in module_names:
            param = state_dict.pop(name)
            state_dict[HFAdapterKeyRenamer.nemo_to_hf(name)] = param

        # Save weights to safetensors format
        try:
            save_file(state_dict, path / "adapter_model.safetensors")
        except OSError as e:
            raise OSError("Failed to save adapter weights: {}".format(e)) from e

    @staticmethod
    def _load_adapter_weights_only(path: Union[str, Path]) -> Dict[str, Any]:
        """Loads only the adapter weights from a safetensors checkpoint.

        Args:
            path (Union[str, Path]): The directory path where the adapter weights are stored.

        Returns:
            Dict[str, Any]: A dictionary containing the state dictionary of the adapter model.

        Raises:
            FileNotFoundError: If the checkpoint directory, adapter weights, or adapter config
                does not exist.
            ValueError: If the checkpoint path is not a directory.
            OSError: If loading the weights fails.
        """
        fs = get_filesystem(path)
        if not fs.exists(path):
            raise FileNotFoundError("Checkpoint directory not found: {}".format(path))
        if not fs.isdir(path):
            raise ValueError("Checkpoints should be a directory. Found: {}".format(path))

        state_dict = {}
        adapter_file = Path(path) / "adapter_model.safetensors"
        if not adapter_file.exists():
            raise FileNotFoundError("Adapter weights file not found: {}".format(adapter_file))

        config_file = Path(path) / HF_ADAPTER_CONFIG_FILENAME
        if not config_file.exists():
            raise FileNotFoundError("Adapter config file not found: {}".format(config_file))

        from safetensors import safe_open

        try:
            with safe_open(adapter_file, framework="pt", device="cpu") as f:
                for k in f.keys():
                    # Rename keys from HF back to NeMo format while loading
                    state_dict[HFAdapterKeyRenamer.hf_to_nemo(k)] = f.get_tensor(k)
        except OSError as e:
            raise OSError(f"Failed to load adapter weights: {e}") from e

        return {'state_dict': state_dict}

    @override
    def load_checkpoint(
        self,
        path: _PATH,
        sharded_state_dict=None,
        map_location: Optional[Callable] = None,
        strict: Optional['StrictHandling'] | bool = None,  # noqa: F821
    ) -> Dict[str, Any]:
        """Loads a checkpoint previously written by :meth:`save_checkpoint`.

        Trainer state is read from ``trainer.pt`` with :func:`torch.load`; model weights are
        restored either from the ``hf_adapter`` directory (when ``adapter_only`` is enabled) or
        via the model's ``load_pretrained``.

        Args:
            path: Path to the checkpoint directory.
            sharded_state_dict: unused; kept for interface compatibility.
            map_location: a function, :class:`torch.device`, string or a dict specifying how to
                remap storage locations (trainer state is currently loaded onto CPU).
            strict: unused; kept for interface compatibility.

        Returns:
            The loaded checkpoint state.

        Raises:
            FileNotFoundError: If the adapter weights or adapter config cannot be found.
            EnvironmentError: If loading the HF weights fails.
            RuntimeError: If the model does not expose ``load_pretrained`` and ``adapter_only``
                is disabled.
        """
        path = Path(path)
        assert path.parts[-1] != HF_WEIGHTS_PATH, f"Expected {path} not to end with {HF_WEIGHTS_PATH}"

        trainer_state = {}
        if not (path / 'trainer.pt').exists():
            log.info("Asked to restore from checkpoint without trainer state at {}".format(path))
        else:
            trainer_state = torch.load(
                path / 'trainer.pt',
                map_location='cpu',
                mmap=True,
                weights_only=False,
            )

        if self.adapter_only:
            adapter_path = path / HF_ADAPTER_PATH
            trainer_state |= HFCheckpointIO._load_adapter_weights_only(adapter_path)
        elif callable(getattr(self.model, 'load_pretrained', None)):
            try:
                trainer_state['state_dict'] = self.model.load_pretrained(path / HF_WEIGHTS_PATH)
            except (EnvironmentError, HFValidationError) as e:
                raise EnvironmentError(
                    f"Failed to load weights from {path}. If this is a local checkpoint, please make sure "
                    "the path exists and has the correct format. If this is a model from the HuggingFace Hub, "
                    "please provide a valid repo_id of a model on the Hub."
                ) from e
        else:
            raise RuntimeError(
                "Checkpoint load has failed: 'load_pretrained' is not defined for "
                "this model and 'adapter_only' is disabled. Please implement 'load_pretrained' or "
                "switch to 'adapter_only' mode."
            )
        return trainer_state

    @override
    def remove_checkpoint(self, path: _PATH) -> None:
        """Remove a checkpoint directory from the filesystem.

        Args:
            path: Path to checkpoint
        """
        fs = get_filesystem(path)
        if fs.exists(path):
            fs.rm(path, recursive=True)
            log.debug("Removed checkpoint: {}".format(path))