# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Example of PTQ for Llama4:
torchrun --nproc_per_node=8 \
scripts/vlm/llama4/llama4_ptq.py \
--calibration_tp 8 \
--nemo_checkpoint "path/to/nemo_checkpoint" \
--output_path "path/to/quantized_nemo_checkpoint" \
--algorithm fp8 \
--batch_size 1 \
--export_format nemo \
--legacy_ckpt \
"""
import argparse

import requests
import torch
from megatron.core.pipeline_parallel.schedules import get_forward_backward_func
from PIL import Image
from transformers import AutoProcessor

from nemo.collections.llm.modelopt import ExportConfig, QuantizationConfig
from nemo.collections.llm.modelopt.quantization.quant_cfg_choices import get_quant_cfg_choices
from nemo.collections.vlm.api import ptq


def load_image(url):
    """Load an image from a URL, returning None if the request fails."""
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        return Image.open(response.raw)
    except requests.exceptions.RequestException as e:
        print(f"Error loading image from {url}: {e}")
        return None
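
# For example, load_image(base_img_url + "000000039769.jpg") returns a PIL.Image.Image on
# success; failed downloads yield None and are skipped by the calibration loop below.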


# Define calibration dataset URLs (COCO val2017 images used for multimodal calibration)
base_img_url = "http://images.cocodataset.org/val2017/"
images = [
"000000039769.jpg",
"000000002685.jpg",
"000000004495.jpg",
"000000005001.jpg",
"000000003845.jpg",
"000000011615.jpg",
"000000010977.jpg",
"000000010764.jpg",
"000000010707.jpg",
"000000010583.jpg",
"000000010363.jpg",
"000000010092.jpg",
"000000009914.jpg",
"000000009891.jpg",
"000000009769.jpg",
"000000009590.jpg",
"000000009483.jpg",
"000000009448.jpg",
"000000009378.jpg",
"000000008899.jpg",
]
quantization_images_url = [base_img_url + img_id for img_id in images]
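
# The 20 images above form the calibration set consumed by forward_loop in main(); the
# text-dataset arguments below are still forwarded through QuantizationConfig unchanged.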


def get_args():
"""Parses PTQ arguments."""
QUANT_CFG_CHOICES_LIST = ["no_quant", *get_quant_cfg_choices()]
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="NeMo PTQ argument parser"
)
parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source NeMo 2.0 checkpoint")
parser.add_argument("--decoder_type", type=str, help="Decoder type for TensorRT-Model-Optimizer")
parser.add_argument("-ctp", "--calibration_tp", "--calib_tp", type=int, default=1)
parser.add_argument("-cpp", "--calibration_pp", "--calib_pp", type=int, default=1)
    parser.add_argument(
        "--num_layers_in_first_pipeline_stage",
        type=int,
        default=None,
        help="Number of layers in the first pipeline stage. If None, layers are split evenly across pipeline stages.",
    )
    parser.add_argument(
        "--num_layers_in_last_pipeline_stage",
        type=int,
        default=None,
        help="Number of layers in the last pipeline stage. If None, layers are split evenly across pipeline stages.",
    )
parser.add_argument(
"-itp",
"--inference_tp",
"--tensor_parallelism_size",
type=int,
default=1,
help="TRT-LLM engine TP size. (Only used when `--export_format` is 'trtllm')",
)
parser.add_argument(
"-ipp",
"--inference_pp",
"--pipeline_parallelism_size",
type=int,
default=1,
help="TRT-LLM engine PP size. (Only used when `--export_format` is 'trtllm')",
)
parser.add_argument("--devices", type=int, help="Number of GPUs to use per node")
parser.add_argument("-nodes", "--num_nodes", type=int, help="Number of nodes used")
parser.add_argument("-out", "--export_path", "--output_path", type=str, help="Path for the exported engine")
parser.add_argument(
"--export_format", default="trtllm", choices=["trtllm", "nemo", "hf"], help="Model format to export as"
)
parser.add_argument(
"-algo",
"--algorithm",
type=str,
default="fp8",
choices=QUANT_CFG_CHOICES_LIST,
help="TensorRT-Model-Optimizer quantization algorithm",
)
parser.add_argument(
"-awq_bs", "--awq_block_size", type=int, default=128, help="Block size for AWQ quantization algorithms"
)
parser.add_argument("--sq_alpha", type=float, default=0.5, help="Smooth-Quant alpha parameter")
parser.add_argument("--enable_kv_cache", help="Enables KV-cache quantization", action="store_true")
parser.add_argument("--disable_kv_cache", dest="enable_kv_cache", action="store_false")
parser.set_defaults(enable_kv_cache=None)
parser.add_argument(
"-dt", "--dtype", default="bf16", choices=["16", "bf16"], help="Default precision for non-quantized layers"
)
parser.add_argument("-bs", "--batch_size", default=64, type=int, help="Calibration batch size")
parser.add_argument("-sl", "--seq_len", default=128, type=int, help="Length of the tokenized text")
parser.add_argument(
"-calib_size", "--calibration_dataset_size", default=512, type=int, help="Size of calibration dataset"
)
parser.add_argument(
"-calib_ds",
"--calibration_dataset",
default="cnn_dailymail",
type=str,
help='Calibration dataset to be used. Should be "wikitext", "cnn_dailymail" or path to a local .json file',
)
parser.add_argument(
"--generate_sample", help="Generate sample model output after performing PTQ", action="store_true"
)
parser.add_argument(
"--trust_remote_code", help="Trust remote code when loading HuggingFace models", action="store_true"
)
parser.add_argument("--legacy_ckpt", help="Load ckpt saved with TE < 1.14", action="store_true")
parser.add_argument(
"--model_id",
type=str,
default="meta-llama/Llama-4-Scout-17B-16E-Instruct",
help="Model HuggingFace ID to use.",
)
args = parser.parse_args()
if args.export_path is None:
if args.export_format == "trtllm":
args.export_path = f"./qnemo_{args.algorithm}_tp{args.inference_tp}_pp{args.inference_pp}"
else:
args.export_path = f"./{args.export_format}_{args.algorithm}"
if args.devices is None:
args.devices = args.calibration_tp
if args.num_nodes is None:
args.num_nodes = args.calibration_pp
return args


class SingleBatchIterator:
    """Iterator that yields a single pre-built multimodal batch exactly once.

    Megatron's forward-backward function pulls microbatches from a data iterator;
    for calibration we feed it one batch at a time through this wrapper.
    """

    def __init__(self, images, input_ids, position_ids):
        self.batch = dict(
            media=images,
            tokens=input_ids,
            position_ids=position_ids,
            attention_mask=None,
        )
        self._yielded = False

    def __iter__(self):
        return self

    def __next__(self):
        if self._yielded:
            raise StopIteration
        self._yielded = True
        return self.batch
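

# A minimal sketch (illustrative only) of how Megatron's schedule consumes the iterator:
#   it = SingleBatchIterator(images, input_ids, position_ids)
#   batch = next(it)  # yields the batch dict once
#   next(it)          # raises StopIteration, ending the single-microbatch loop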


def llama4_forward_step(data_iterator, model, **kwargs) -> tuple:
    """Single forward step matching Megatron's forward_step_func contract.

    Returns the model output together with a loss function; calibration only needs
    activations, so the loss function is an identity passthrough.
    """
    batch = next(data_iterator)
    forward_args = {
        "images": batch["media"],
        "input_ids": batch["tokens"],
        "position_ids": batch["position_ids"],
        "attention_mask": batch.get("attention_mask", None),
    }

    def loss_func(x, **kwargs):
        # No loss is needed for PTQ calibration; return the output unchanged.
        return x

    return model(**forward_args), loss_func


def main():
    """Example NeMo 2.0 Post-Training Quantization (PTQ) workflow."""
    args = get_args()

    def forward_loop(model):
        """Forward loop for quantization calibration over the COCO image set."""
        # Initialize the HuggingFace processor; it wraps the tokenizer and image preprocessor
        model_id = args.model_id
        processor = AutoProcessor.from_pretrained(model_id)
for img_url in quantization_images_url:
raw_image = load_image(img_url)
if raw_image is None:
continue
messages = [
{
"role": "system",
"content": [
{"type": "text", "text": "You are a helpful visual assistant."},
],
},
{
"role": "user",
"content": [
{"type": "image", "url": img_url},
{"type": "text", "text": "Can you describe this image?"},
],
},
]
inputs = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
)
input_ids = inputs["input_ids"].cuda()
images = inputs["pixel_values"].cuda()
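            # Llama4 uses standard incremental positions; build position ids covering the prompt.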
position_ids = torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device).unsqueeze(0)
batch_iterator = SingleBatchIterator(images, input_ids, position_ids)
            fwd_bwd_function = get_forward_backward_func()

            # Run a single forward-only microbatch so the quantization observers see real
            # multimodal activations; the model output itself is discarded.
            with torch.no_grad():
                fwd_bwd_function(
                    forward_step_func=llama4_forward_step,
                    data_iterator=batch_iterator,
                    model=model,
                    num_microbatches=1,
                    forward_only=True,
                    seq_length=input_ids.size(1),
                    micro_batch_size=1,
                    collect_non_loss_data=True,
                )
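
    # Map the CLI arguments onto NeMo's quantization settings; "no_quant" maps to
    # algorithm=None, which skips weight quantization entirely.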
quantization_config = QuantizationConfig(
algorithm=None if args.algorithm == "no_quant" else args.algorithm,
awq_block_size=args.awq_block_size,
sq_alpha=args.sq_alpha,
enable_kv_cache=args.enable_kv_cache,
calibration_dataset=args.calibration_dataset,
calibration_dataset_size=args.calibration_dataset_size,
calibration_batch_size=args.batch_size,
calibration_seq_len=args.seq_len,
)
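
    # Describe how and where the calibrated model is exported: a TRT-LLM engine, a
    # quantized NeMo checkpoint, or a HuggingFace-format checkpoint.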
export_config = ExportConfig(
export_format=args.export_format,
path=args.export_path,
decoder_type=args.decoder_type,
inference_tp=args.inference_tp,
inference_pp=args.inference_pp,
dtype=args.dtype,
generate_sample=args.generate_sample,
)
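
    # Run PTQ end to end: load the checkpoint with the calibration parallelism, run
    # forward_loop to collect activation statistics, quantize, and export.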
ptq(
model_path=args.nemo_checkpoint,
export_config=export_config,
calibration_tp=args.calibration_tp,
calibration_pp=args.calibration_pp,
num_layers_in_first_pipeline_stage=args.num_layers_in_first_pipeline_stage,
num_layers_in_last_pipeline_stage=args.num_layers_in_last_pipeline_stage,
devices=args.devices,
num_nodes=args.num_nodes,
quantization_config=quantization_config,
legacy_ckpt=args.legacy_ckpt,
trust_remote_code=args.trust_remote_code,
forward_loop=forward_loop,
)


if __name__ == "__main__":
    main()