# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# flake8: noqa
# pylint: disable=C0115
# pylint: disable=C0116
# pylint: disable=C0301

import argparse
import ast
import math
from functools import partial
from itertools import islice
from pathlib import Path
from typing import Callable, Iterable

import numpy as np
import pandas as pd
from lhotse.cut import Cut
from omegaconf import OmegaConf

import nemo.collections.speechlm2.data.salm_dataset  # noqa
from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper
from nemo.collections.common.data.lhotse.cutset import read_cutset_from_config
from nemo.collections.common.data.lhotse.dataloader import LhotseDataLoadingConfig, tokenize, tokenize_with_prompt
from nemo.collections.common.data.lhotse.sampling import (
    MultimodalFixedBucketBatchSizeConstraint2D,
    MultimodalSamplingConstraint,
    TokenCountFilter,
    TokenPerTokenFilter,
)
from nemo.collections.common.prompts.formatter import PromptFormatter
from nemo.collections.common.tokenizers import AggregateTokenizer, AutoTokenizer, SentencePieceTokenizer


def parse_args():
    parser = argparse.ArgumentParser(
        description="Estimate token bins for Lhotse dynamic bucketing using a sample of the input dataset. "
        "The dataset is read from a data input configuration YAML file (which may reference one or more manifests) "
        "and supports data weighting. "
        "Unlike estimate_duration_bins.py, this script is intended for text data only. "
        "It supports 2D bucketing.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "input",
        help='Path to a data input configuration YAML file. '
        'This is the only type of input specification supported for text data.',
    )
    parser.add_argument(
        "-t",
        "--tokenizer",
        nargs="+",
        required=True,
        help="Path to one or more SPE tokenizers. More than one means we'll use AggregateTokenizer and the --langs "
        "argument must also be used. When provided, we'll estimate a 2D distribution of input and output sequence lengths.",
    )
    parser.add_argument(
        "-a", "--langs", nargs="+", help="Language names for each of the AggregateTokenizer sub-tokenizers."
    )
    parser.add_argument(
        "-b",
        "--buckets",
        type=int,
        default=30,
        help="The desired number of buckets (dim0 => covers input sequence length / audio duration).",
    )
    parser.add_argument(
        "-s",
        "--sub-buckets",
        type=int,
        default=None,
        help="The desired number of sub-buckets (dim1 => covers output sequence length / num_tokens). "
        "If not provided, we'll only perform 1D bucketing.",
    )
    parser.add_argument(
        "-n",
        "--num_examples",
        type=int,
        default=-1,
        help="The number of examples (utterances) used to estimate the bins. -1 means use all data "
        "(be careful: it could be iterated over infinitely).",
    )
    parser.add_argument(
        "-l",
        "--min_tokens",
        type=float,
        default=-float("inf"),
        help="If specified, we'll filter out examples with fewer tokens than this number.",
    )
    parser.add_argument(
        "-u",
        "--max_tokens",
        type=float,
        default=float("inf"),
        help="If specified, we'll filter out examples with more tokens than this number.",
    )
    parser.add_argument(
        "--max_tpt",
        type=float,
        default=float("inf"),
        help="If specified, we'll filter out examples with more output tokens per input token than this.",
    )
    parser.add_argument(
        "-q", "--quiet", action="store_true", help="When specified, only print the estimated token bins."
    )
    parser.add_argument(
        "-f",
        "--prompt-format",
        type=str,
        help="When specified, we'll use a prompt formatter in addition to the tokenizer for the purpose of estimating token count bins. "
        "This is useful for accurate 2D bucket estimation with models such as EncDecMultiTaskModel (Canary-1B), "
        "or any model where the label sequence consists of a user prompt and a model's response.",
    )
    parser.add_argument(
        "-p",
        "--prompt",
        type=str,
        help="Prompt slots provided as a Python list of dicts. It is used together with the --prompt-format option. "
        "For example, with Canary-1B you may use: [{'role':'user','slots':{'source_lang':'en','target_lang':'en','task':'asr','pnc':'yes'}}]",
    )
    parser.add_argument(
        "-m",
        "--measure-total-length",
        action="store_true",
        help="When specified, we'll measure the total length (context + answer, i.e. input_ids) instead of the context-only length. "
        "Total length is more suitable for decoder-only models, while context-only length is more suitable for encoder-decoder models.",
    )
    return parser.parse_args()


def estimate_token_buckets(
    cuts: Iterable[Cut],
    num_buckets: int,
    num_subbuckets: int | None,
    quiet: bool,
) -> list[tuple[float, float]]:
    """
    This function is based on lhotse.dataset.sampling.dynamic_bucketing.estimate_duration_buckets.
    It extends it to the 2D bucketing case.
    """
    assert num_buckets > 1

    is_2d = num_subbuckets is not None
    if is_2d:
        constraint = MultimodalFixedBucketBatchSizeConstraint2D([(0.0, 0.0)], [0], measure_total_length=False)
    else:
        constraint = MultimodalSamplingConstraint(measure_total_length=True)

    # Gather the input/output token count statistics for the dataset.
    num_input_tokens = []
    num_output_tokens = []
    for c in cuts:
        ans = constraint.measure_length(c)
        if is_2d:
            itoks, otoks = ans
            num_input_tokens.append(itoks)
            num_output_tokens.append(otoks)
        else:
            num_input_tokens.append(ans)
    num_input_tokens = np.array(num_input_tokens, dtype=np.int32)
    if is_2d:
        num_output_tokens = np.array(num_output_tokens, dtype=np.int32)
        # Sort jointly by (input, output) token counts so that the bin edges are monotonic.
        joint = np.rec.fromarrays([num_input_tokens, num_output_tokens])
        joint.sort()
        num_input_tokens = joint.f0
        num_output_tokens = joint.f1
    else:
        num_input_tokens.sort()

    # We are building buckets of equal total token count
    # (empirically this leads to more even bucket exhaustion over time).
    # We need to determine how many input tokens to allocate per bucket.
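    # Worked example of the equal-token-mass split below (hypothetical numbers, not taken from
    # any real dataset): if the sampled examples contain 3,000,000 input tokens in total and
    # num_buckets=30, then size_per_bucket is ~100,000 tokens, so a bucket of short examples
    # spans many more examples than a bucket of long ones.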
    size_per_bucket = num_input_tokens.sum() / num_buckets

    if not quiet:
        print("Input token count distribution:")
        print(pd.Series(num_input_tokens).describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]))
    max_input_tokens = num_input_tokens[-1]

    if is_2d:
        tpt = num_output_tokens / num_input_tokens
        if not quiet:
            print("Output tokens per input token distribution:")
            print(pd.Series(tpt).describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]))
        max_tpt = tpt.max()
        del tpt

    bins = []
    bin_indexes = [0]
    tot = 0.0

    def _estimate_output_token_buckets(max_bucket_duration):
        # Since this is 2D bucketing, apply the same bin creation logic
        # for the second dimension (i.e. output token count) as for the first dimension (input token count).
        # That means we aim to have each sub-bucket contain roughly the same number of output tokens.
        # Note that this estimation is biased towards more padding if you have
        # a lot of zero-token examples (e.g. non-speech).
        nonlocal bins
        num_tokens_bucket = num_output_tokens[bin_indexes[-1] : binidx]
        num_tokens_bucket.sort()
        tokens_per_subbucket = num_tokens_bucket.sum() / num_subbuckets
        tot_toks = 0
        # Iterate over token counts, and whenever we hit tokens_per_subbucket, create a new 2D bucket bin.
        for num_toks in num_tokens_bucket:
            # Threshold hit: we are creating a new (max_input_tokens, max_output_tokens) bin.
            if tot_toks > tokens_per_subbucket:
                bins.append((max_bucket_duration, num_toks))
                tot_toks = 0
            tot_toks += num_toks
        bins.append((size, math.ceil(size * max_tpt)))

    # Iterate over data, and whenever we hit size_per_bucket, create a new bucket bin.
    for binidx, size in enumerate(num_input_tokens):
        if tot > size_per_bucket:
            # Threshold hit: we are creating a new input-length bin (multiplied by the number of sub-buckets).
            if is_2d:
                _estimate_output_token_buckets(max_bucket_duration=size)
                bin_indexes.append(binidx)
            else:
                bins.append(size)
            tot = 0.0
        tot += size

    # Estimate an extra bin (or 2D bin set) for the global max input length.
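    # The loop above only emits a bin edge when the running total crosses size_per_bucket, so the
    # tail of the distribution still needs a closing edge at the global maximum; otherwise the
    # longest examples would not fit into any estimated bin.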
    if is_2d:
        _estimate_output_token_buckets(max_bucket_duration=max_input_tokens)
    else:
        bins.append(max_input_tokens)

    return bins


def load_tokenizer(paths: list[str], langs: list[str] = None) -> TokenizerWrapper:
    if len(paths) == 1:
        (p,) = paths
        if Path(p).exists():
            tok = SentencePieceTokenizer(p)
        else:
            # Assume it's a HuggingFace tokenizer name.
            tok = AutoTokenizer(p, use_fast=True)
    else:
        assert langs is not None and len(paths) == len(
            langs
        ), f"Cannot create AggregateTokenizer; each tokenizer must have an assigned language via the --langs option (we got --tokenizers={paths} and --langs={langs})"
        tok = AggregateTokenizer({lang: SentencePieceTokenizer(p) for lang, p in zip(langs, paths)})
    return TokenizerWrapper(tok)


def apply_tokenizer(cut, tokenizer=None, prompt: PromptFormatter = None):
    if prompt is not None:
        cut = tokenize_with_prompt(cut, tokenizer, prompt)
    elif tokenizer is not None:
        cut = tokenize(cut, tokenizer)
    return cut


class RejectionsCounter:
    def __init__(self, predicate: Callable, message: str):
        self.predicate = predicate
        self.message = message
        self.total = 0
        self.rejected = 0

    def __call__(self, example) -> bool:
        ans = self.predicate(example)
        self.total += 1
        if not ans:
            self.rejected += 1
        return ans

    def print_report(self) -> None:
        if self.rejected:
            print(f"{self.message} | Rejected {self.rejected}/{self.total} examples.")


def main():
    args = parse_args()

    if not args.quiet:
        pd.set_option('display.float_format', lambda x: '%.2f' % x)

    tokenizer = None
    prompt = None
    if args.tokenizer is not None:
        tokenizer = load_tokenizer(args.tokenizer, args.langs)
        if args.prompt_format is not None:
            prompt_defaults = None
            if args.prompt is not None:
                prompt_defaults = ast.literal_eval(args.prompt)
            prompt = PromptFormatter.resolve(args.prompt_format)(tokenizer._tokenizer, defaults=prompt_defaults)

    assert args.input.endswith(".yaml")
    config = OmegaConf.merge(
        OmegaConf.structured(LhotseDataLoadingConfig),
        OmegaConf.from_dotlist([f"input_cfg={args.input}", "force_finite=True", "metadata_only=True"]),
    )
    cuts, _ = read_cutset_from_config(config)
    cuts = cuts.map(partial(apply_tokenizer, tokenizer=tokenizer, prompt=prompt), apply_fn=None)
    if hasattr(cuts, "prefetch"):
        cuts = cuts.prefetch()  # to be released in lhotse 1.27
    token_filter = RejectionsCounter(
        TokenCountFilter(args.min_tokens, args.max_tokens, args.measure_total_length), "Token count filtering"
    )
    cuts = cuts.filter(token_filter)
    tpt_filter = RejectionsCounter(TokenPerTokenFilter(-1, args.max_tpt), "Output tokens per input token filtering")
    cuts = cuts.filter(tpt_filter)
    if (N := args.num_examples) > 0:
        cuts = islice(cuts, N)

    token_bins = estimate_token_buckets(
        cuts,
        num_buckets=args.buckets,
        num_subbuckets=args.sub_buckets,
        quiet=args.quiet,
    )
    if args.sub_buckets is not None:
        token_bins = "[" + ','.join(f"[{b:d},{sb:d}]" for b, sb in token_bins) + "]"
    else:
        token_bins = "[" + ','.join(f"{b:d}" for b in token_bins) + "]"
    if args.quiet:
        print(token_bins)
        return
    token_filter.print_report()
    tpt_filter.print_report()
    print("Use the following options in your config:")
    print(f"\tnum_buckets={args.buckets}")
    print(f"\tbucket_duration_bins={token_bins}")


if __name__ == "__main__":
    main()
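
# Example invocation (a sketch: the paths and bucket counts below are hypothetical and the
# resulting bins depend entirely on your data):
#
#   python <this_script>.py input_cfg.yaml \
#       --tokenizer /path/to/spe_tokenizer.model \
#       --buckets 30 --sub-buckets 5
#
# The printed num_buckets and bucket_duration_bins values can then be copied into the Lhotse
# dataloader configuration used for training.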