From a678b5567d31dbfa7647d137e51f309a212f5341 Mon Sep 17 00:00:00 2001 From: CHiSwsz Date: Mon, 6 Apr 2026 20:29:54 +0800 Subject: [PATCH 1/3] Add native MOSS-TTS audio encoder and decoder support --- convert_moss_audio_tokenizer_split_to_gguf.py | 427 ++++++ convert_moss_audio_tokenizer_to_gguf.py | 503 +++++++ docs/moss-tts-firstclass-e2e.md | 175 ++- docs/moss-tts-firstclass-e2e_zh.md | 175 ++- include/llama-moss-audio-tokenizer.h | 61 + include/llama.h | 13 + src/CMakeLists.txt | 5 + src/llama-arch.cpp | 54 + src/llama-arch.h | 17 + src/llama-context.cpp | 146 +- src/llama-context.h | 4 + src/llama-graph.cpp | 4 + src/llama-graph.h | 2 + src/llama-hparams.cpp | 4 + src/llama-hparams.h | 3 + src/llama-model.cpp | 172 ++- src/llama-vocab.cpp | 12 +- src/models/models.h | 8 + src/models/moss-audio-common.cpp | 508 +++++++ src/models/moss-audio-common.h | 171 +++ src/models/moss-audio-decoder.cpp | 37 + src/models/moss-audio-encoder.cpp | 49 + src/models/moss-audio-tokenizer.cpp | 1205 +++++++++++++++++ tools/tts/CMakeLists.txt | 2 +- tools/tts/moss-tts-audio-decode.py | 29 +- tools/tts/moss-tts-build-generation-ref.py | 24 +- tools/tts/moss-tts-firstclass-e2e.py | 20 +- tools/tts/moss_tts_onnx.py | 74 + tools/tts/moss_tts_processor.py | 269 ++++ .../{moss-tts.cpp => run-moss-tts-delay.cpp} | 1049 +++++++++++++- 30 files changed, 4989 insertions(+), 233 deletions(-) create mode 100644 convert_moss_audio_tokenizer_split_to_gguf.py create mode 100755 convert_moss_audio_tokenizer_to_gguf.py create mode 100644 include/llama-moss-audio-tokenizer.h create mode 100644 src/models/moss-audio-common.cpp create mode 100644 src/models/moss-audio-common.h create mode 100644 src/models/moss-audio-decoder.cpp create mode 100644 src/models/moss-audio-encoder.cpp create mode 100644 src/models/moss-audio-tokenizer.cpp create mode 100644 tools/tts/moss_tts_onnx.py create mode 100644 tools/tts/moss_tts_processor.py rename tools/tts/{moss-tts.cpp => run-moss-tts-delay.cpp} (60%) diff 
--git a/convert_moss_audio_tokenizer_split_to_gguf.py b/convert_moss_audio_tokenizer_split_to_gguf.py new file mode 100644 index 000000000..74ffe4e89 --- /dev/null +++ b/convert_moss_audio_tokenizer_split_to_gguf.py @@ -0,0 +1,427 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MIT + +from __future__ import annotations + +import argparse +import logging +from pathlib import Path +from typing import Any, Callable, Iterator + +import numpy as np + +from convert_moss_audio_tokenizer_to_gguf import SafeTensorsIndex +from convert_moss_audio_tokenizer_to_gguf import convert_tensor_dtype +from convert_moss_audio_tokenizer_to_gguf import load_config +from convert_moss_audio_tokenizer_to_gguf import map_tensor_name +from convert_moss_audio_tokenizer_to_gguf import merge_weight_norm +from convert_moss_audio_tokenizer_to_gguf import validate_config +from convert_moss_audio_tokenizer_to_gguf import write_module_list_metadata + +import sys +import types + +# Older local Python envs can ship a NumPy build without numpy.typing. 
+try: + import numpy.typing # type: ignore # noqa: F401 +except Exception: + numpy_typing = types.ModuleType("numpy.typing") + numpy_typing.DTypeLike = object + sys.modules["numpy.typing"] = numpy_typing + +sys.path.insert(0, str(Path(__file__).parent / "gguf-py")) +import gguf # noqa: E402 + + +logger = logging.getLogger("convert_moss_audio_tokenizer_split_to_gguf") + +ARCH_ENCODER = "moss-tts-audio-encoder" +ARCH_DECODER = "moss-tts-audio-decoder" + +DEFAULT_SAMPLING_RATE = 24_000 +DEFAULT_DOWNSAMPLE_RATE = 1_920 +DEFAULT_CONTEXT_DURATION = 10.0 + + +def default_encoder_outfile(model_dir: Path, outtype: str) -> Path: + return model_dir / f"{model_dir.name}-encoder-{outtype}.gguf" + + +def default_decoder_outfile(model_dir: Path, outtype: str) -> Path: + return model_dir / f"{model_dir.name}-decoder-{outtype}.gguf" + + +def build_transformer_block_index_map(module_cfgs: list[dict[str, Any]]) -> dict[int, int]: + result: dict[int, int] = {} + tensor_block = 0 + for module_idx, module_cfg in enumerate(module_cfgs): + if module_cfg.get("module_type") != "Transformer": + continue + result[module_idx] = tensor_block + tensor_block += 1 + return result + + +def map_transformer_tensor_name(tensor_block: int, tail: str) -> str | None: + if tail == "input_proj.weight": + return f"blk.{tensor_block}.input_proj.weight" + if tail == "output_proj.weight": + return f"blk.{tensor_block}.output_proj.weight" + + parts = tail.split(".") + if len(parts) < 5 or parts[0] != "transformer" or parts[1] != "layers": + return None + + layer_idx = int(parts[2]) + layer_prefix = f"blk.{tensor_block}.layer.{layer_idx}" + layer_tail = ".".join(parts[3:]) + + if layer_tail == "layer_scale_1.scale": + return f"{layer_prefix}.attn_scale.scale" + if layer_tail == "layer_scale_2.scale": + return f"{layer_prefix}.ffn_scale.scale" + if layer_tail == "linear1.weight": + return f"{layer_prefix}.ffn_up.weight" + if layer_tail == "linear2.weight": + return f"{layer_prefix}.ffn_down.weight" + if 
layer_tail == "norm1.weight": + return f"{layer_prefix}.attn_norm.weight" + if layer_tail == "norm1.bias": + return f"{layer_prefix}.attn_norm.bias" + if layer_tail == "norm2.weight": + return f"{layer_prefix}.ffn_norm.weight" + if layer_tail == "norm2.bias": + return f"{layer_prefix}.ffn_norm.bias" + if layer_tail == "self_attn.in_projs.0.weight": + return f"{layer_prefix}.attn_qkv.weight" + if layer_tail == "self_attn.out_projs.0.weight": + return f"{layer_prefix}.attn_output.weight" + + return None + + +def map_split_tensor_name( + name: str, + encoder_block_map: dict[int, int], + decoder_block_map: dict[int, int], +) -> str | None: + mapped = map_tensor_name(name) + if mapped is None: + return None + + if mapped.startswith("encoder."): + rest = mapped[len("encoder."):] + module_idx_str, tail = rest.split(".", 1) + return map_transformer_tensor_name(encoder_block_map[int(module_idx_str)], tail) + + if mapped.startswith("decoder."): + rest = mapped[len("decoder."):] + module_idx_str, tail = rest.split(".", 1) + return map_transformer_tensor_name(decoder_block_map[int(module_idx_str)], tail) + + return mapped + + +def _count_path(name: str) -> str: + parts = name.split(".") + if len(parts) >= 4 and parts[0] == "quantizer" and parts[1] == "quantizers": + return ".".join(parts[:4]) + if len(parts) >= 4 and parts[0] == "blk" and parts[2] == "layer": + return ".".join(parts[:4]) + if len(parts) >= 2: + return ".".join(parts[:2]) + return name + + +def is_encoder_tensor(name: str) -> bool: + if name.startswith("encoder."): + return True + if name.startswith("quantizer.input_proj."): + return True + if name.startswith("quantizer.quantizers.") and ( + ".in_proj." in name or ".out_proj." in name or ".codebook." in name + ): + return True + return False + + +def is_decoder_tensor(name: str) -> bool: + if name.startswith("decoder."): + return True + if name.startswith("quantizer.output_proj."): + return True + if name.startswith("quantizer.quantizers.") and ( + ".out_proj." 
in name or ".codebook." in name + ): + return True + return False + + +def count_filtered_output_tensors( + index: SafeTensorsIndex, + include_fn: Callable[[str], bool], + rename_fn: Callable[[str], str | None], +) -> int: + seen: set[str] = set() + for name in index: + mapped_name = map_tensor_name(name) + renamed_name = rename_fn(name) + if mapped_name is None or renamed_name is None or not include_fn(mapped_name): + continue + seen.add(renamed_name) + return len(seen) + + +def iter_filtered_tensors( + index: SafeTensorsIndex, + outtype: str, + include_fn: Callable[[str], bool], + rename_fn: Callable[[str], str | None], +) -> Iterator[tuple[str, np.ndarray[Any, Any]]]: + emitted: set[str] = set() + + for name in index: + mapped_name = map_tensor_name(name) + renamed_name = rename_fn(name) + if ( + mapped_name is None + or renamed_name is None + or renamed_name in emitted + or not include_fn(mapped_name) + ): + continue + + if ".parametrizations.weight.original0" in name: + prefix = name.replace(".parametrizations.weight.original0", "") + g_name = f"{prefix}.parametrizations.weight.original0" + v_name = f"{prefix}.parametrizations.weight.original1" + weight = merge_weight_norm(index.load(g_name), index.load(v_name)) + yield renamed_name, convert_tensor_dtype(weight, outtype) + emitted.add(renamed_name) + continue + + tensor = index.load(name) + yield renamed_name, convert_tensor_dtype(tensor, outtype) + emitted.add(renamed_name) + + +def add_common_metadata( + writer: gguf.GGUFWriter, + arch: str, + config: dict[str, Any], + model_name: str, +) -> None: + writer.add_type("model") + writer.add_name(model_name) + + sampling_rate = int(config.get("sampling_rate", DEFAULT_SAMPLING_RATE)) + downsample_rate = int(config.get("downsample_rate", DEFAULT_DOWNSAMPLE_RATE)) + context_duration = float(config.get("causal_transformer_context_duration", DEFAULT_CONTEXT_DURATION)) + quantizer_cfg = dict(config.get("quantizer_kwargs", {})) + quantizer_type = 
config.get("quantizer_type") or quantizer_cfg.get("quantizer_type", "rlfq") + + writer.add_uint32(f"{arch}.sampling_rate", sampling_rate) + writer.add_uint32(f"{arch}.downsample_rate", downsample_rate) + writer.add_float32(f"{arch}.causal_transformer_context_duration", context_duration) + writer.add_uint32(f"{arch}.code_dim", int(config.get("code_dim", quantizer_cfg.get("output_dim", 0)))) + writer.add_string(f"{arch}.quantizer_type", quantizer_type) + writer.add_uint32(f"{arch}.quantizer.input_dim", int(quantizer_cfg["input_dim"])) + writer.add_uint32(f"{arch}.quantizer.rvq_dim", int(quantizer_cfg.get("rvq_dim", quantizer_cfg["input_dim"]))) + writer.add_uint32(f"{arch}.quantizer.output_dim", int(quantizer_cfg.get("output_dim", quantizer_cfg["input_dim"]))) + writer.add_uint32(f"{arch}.quantizer.num_quantizers", int(quantizer_cfg["num_quantizers"])) + writer.add_uint32(f"{arch}.quantizer.codebook_size", int(quantizer_cfg["codebook_size"])) + writer.add_uint32(f"{arch}.quantizer.codebook_dim", int(quantizer_cfg["codebook_dim"])) + + +def add_encoder_metadata(writer: gguf.GGUFWriter, config: dict[str, Any], model_name: str) -> None: + add_common_metadata(writer, ARCH_ENCODER, config, model_name) + write_module_list_metadata( + writer=writer, + arch=ARCH_ENCODER, + section_name="encoder", + module_cfgs=list(config.get("encoder_kwargs", [])), + initial_frame_rate=float(config.get("sampling_rate", DEFAULT_SAMPLING_RATE)), + context_duration=float(config.get("causal_transformer_context_duration", DEFAULT_CONTEXT_DURATION)), + is_encoder=True, + ) + + +def add_decoder_metadata(writer: gguf.GGUFWriter, config: dict[str, Any], model_name: str) -> None: + add_common_metadata(writer, ARCH_DECODER, config, model_name) + + encoder_frame_rate = float(config.get("sampling_rate", DEFAULT_SAMPLING_RATE)) + for module_cfg in list(config.get("encoder_kwargs", [])): + if module_cfg.get("module_type") == "PatchedPretransform": + encoder_frame_rate /= int(module_cfg["patch_size"]) + + 
write_module_list_metadata( + writer=writer, + arch=ARCH_DECODER, + section_name="decoder", + module_cfgs=list(config.get("decoder_kwargs", [])), + initial_frame_rate=encoder_frame_rate, + context_duration=float(config.get("causal_transformer_context_duration", DEFAULT_CONTEXT_DURATION)), + is_encoder=False, + ) + + +def build_writer(outfile: Path, arch: str, outtype: str, config: dict[str, Any], model_name: str) -> gguf.GGUFWriter: + ftype_map = { + "f32": gguf.LlamaFileType.ALL_F32, + "f16": gguf.LlamaFileType.MOSTLY_F16, + } + writer = gguf.GGUFWriter(path=outfile, arch=arch) + writer.add_file_type(ftype_map[outtype]) + if arch == ARCH_ENCODER: + add_encoder_metadata(writer, config, model_name) + elif arch == ARCH_DECODER: + add_decoder_metadata(writer, config, model_name) + else: + raise ValueError(f"unexpected split arch {arch!r}") + return writer + + +def convert_one( + model_dir: Path, + outfile: Path, + outtype: str, + model_name: str, + include_fn: Callable[[str], bool], + arch: str, + dry_run: bool, +) -> None: + config = load_config(model_dir) + validate_config(config) + index = SafeTensorsIndex(model_dir) + encoder_block_map = build_transformer_block_index_map(list(config.get("encoder_kwargs", []))) + decoder_block_map = build_transformer_block_index_map(list(config.get("decoder_kwargs", []))) + rename_fn = lambda name: map_split_tensor_name(name, encoder_block_map, decoder_block_map) + total_tensors = count_filtered_output_tensors(index, include_fn, rename_fn) + logger.info( + "%s: selected %d output tensors for %s", + arch, + total_tensors, + outfile, + ) + + if dry_run: + paths: dict[str, int] = {} + for name in index: + mapped_name = map_tensor_name(name) + renamed_name = rename_fn(name) + if mapped_name is None or renamed_name is None or not include_fn(mapped_name): + continue + key = _count_path(renamed_name) + paths[key] = paths.get(key, 0) + 1 + + for key in sorted(paths): + logger.debug("%s keeps %3d tensors under %s", arch, paths[key], key) + 
logger.info("%s: dry-run only, not writing %s", arch, outfile) + return + + outfile.parent.mkdir(parents=True, exist_ok=True) + writer = build_writer(outfile, arch, outtype, config, model_name) + try: + for i, (name, tensor) in enumerate(iter_filtered_tensors(index, outtype, include_fn, rename_fn), start=1): + logger.debug("[%4d / %4d] %s %s", i, total_tensors, name, list(tensor.shape)) + writer.add_tensor(name, tensor) + + writer.write_header_to_file() + writer.write_kv_data_to_file() + writer.write_tensors_to_file(progress=False) + logger.info("%s: wrote %s", arch, outfile) + finally: + writer.close() + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=( + "Split a Hugging Face MOSS Audio Tokenizer checkpoint into " + "moss-tts-audio-encoder and moss-tts-audio-decoder GGUF files." + ) + ) + parser.add_argument( + "model_dir", + type=Path, + help="Path to a local MOSS Audio Tokenizer HF checkpoint directory.", + ) + parser.add_argument( + "--encoder-outfile", + type=Path, + default=None, + help="Output path for the moss-tts-audio-encoder GGUF.", + ) + parser.add_argument( + "--decoder-outfile", + type=Path, + default=None, + help="Output path for the moss-tts-audio-decoder GGUF.", + ) + parser.add_argument( + "--outtype", + choices=("f16", "f32"), + default="f16", + help="GGUF floating-point storage type.", + ) + parser.add_argument( + "--model-name-prefix", + type=str, + default=None, + help="Optional prefix for general.name. 
Defaults to the checkpoint directory name.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Validate config and tensor split without writing GGUF files.", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Enable per-tensor logging.", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format="%(levelname)s:%(name)s:%(message)s", + ) + + model_dir = args.model_dir.resolve() + name_prefix = args.model_name_prefix or model_dir.name + encoder_outfile = ( + args.encoder_outfile.resolve() + if args.encoder_outfile is not None + else default_encoder_outfile(model_dir, args.outtype) + ) + decoder_outfile = ( + args.decoder_outfile.resolve() + if args.decoder_outfile is not None + else default_decoder_outfile(model_dir, args.outtype) + ) + + convert_one( + model_dir=model_dir, + outfile=encoder_outfile, + outtype=args.outtype, + model_name=f"{name_prefix} Encoder", + include_fn=is_encoder_tensor, + arch=ARCH_ENCODER, + dry_run=args.dry_run, + ) + convert_one( + model_dir=model_dir, + outfile=decoder_outfile, + outtype=args.outtype, + model_name=f"{name_prefix} Decoder", + include_fn=is_decoder_tensor, + arch=ARCH_DECODER, + dry_run=args.dry_run, + ) + + +if __name__ == "__main__": + main() diff --git a/convert_moss_audio_tokenizer_to_gguf.py b/convert_moss_audio_tokenizer_to_gguf.py new file mode 100755 index 000000000..b503370b7 --- /dev/null +++ b/convert_moss_audio_tokenizer_to_gguf.py @@ -0,0 +1,503 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MIT + +from __future__ import annotations + +import argparse +import json +import logging +import struct +import sys +import types +from collections import OrderedDict +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Iterator + +import numpy as np + +# Older local Python envs can ship a NumPy build without numpy.typing. 
+try: + import numpy.typing # type: ignore # noqa: F401 +except Exception: + numpy_typing = types.ModuleType("numpy.typing") + numpy_typing.DTypeLike = object + sys.modules["numpy.typing"] = numpy_typing + +sys.path.insert(0, str(Path(__file__).parent / "gguf-py")) +import gguf # noqa: E402 + + +logger = logging.getLogger("convert_moss_audio_tokenizer_to_gguf") + +ARCH = "moss-audio-tokenizer" + +DEFAULT_SAMPLING_RATE = 24_000 +DEFAULT_DOWNSAMPLE_RATE = 1_920 +DEFAULT_CONTEXT_DURATION = 10.0 + +SUPPORTED_MODULE_TYPES = {"PatchedPretransform", "Transformer"} +SUPPORTED_GATING = {"none"} +SUPPORTED_POSITIONAL_EMBEDDINGS = {"rope"} +SUPPORTED_QUANTIZER_TYPES = {"rlfq"} + +_SAFETENSORS_DTYPES: dict[str, np.dtype[Any]] = { + "BOOL": np.dtype(np.bool_), + "U8": np.dtype(np.uint8), + "I8": np.dtype(np.int8), + "I16": np.dtype(np.int16), + "U16": np.dtype(np.uint16), + "I32": np.dtype(np.int32), + "U32": np.dtype(np.uint32), + "I64": np.dtype(np.int64), + "U64": np.dtype(np.uint64), + "F16": np.dtype(np.float16), + "F32": np.dtype(np.float32), + "F64": np.dtype(np.float64), +} + + +@dataclass(frozen=True) +class TensorLocation: + name: str + shard: Path + dtype: str + shape: tuple[int, ...] 
+ data_offsets: tuple[int, int] + data_start: int + + +class SafeTensorsIndex: + def __init__(self, model_dir: Path): + self.model_dir = model_dir + self.locations: OrderedDict[str, TensorLocation] = OrderedDict() + self._headers: dict[Path, dict[str, Any]] = {} + + index_path = model_dir / "model.safetensors.index.json" + if index_path.exists(): + index = json.loads(index_path.read_text()) + weight_map = index["weight_map"] + for tensor_name, shard_name in weight_map.items(): + shard_path = model_dir / shard_name + header, data_start = self._load_header(shard_path) + meta = header[tensor_name] + self.locations[tensor_name] = TensorLocation( + name=tensor_name, + shard=shard_path, + dtype=meta["dtype"], + shape=tuple(int(v) for v in meta["shape"]), + data_offsets=(int(meta["data_offsets"][0]), int(meta["data_offsets"][1])), + data_start=data_start, + ) + return + + shard_paths = sorted(model_dir.glob("*.safetensors")) + if not shard_paths: + raise FileNotFoundError(f"No safetensors files found under {model_dir}") + + for shard_path in shard_paths: + header, data_start = self._load_header(shard_path) + for tensor_name, meta in header.items(): + if tensor_name == "__metadata__": + continue + self.locations[tensor_name] = TensorLocation( + name=tensor_name, + shard=shard_path, + dtype=meta["dtype"], + shape=tuple(int(v) for v in meta["shape"]), + data_offsets=(int(meta["data_offsets"][0]), int(meta["data_offsets"][1])), + data_start=data_start, + ) + + def _load_header(self, shard_path: Path) -> tuple[dict[str, Any], int]: + cached = self._headers.get(shard_path) + if cached is not None: + return cached, cached["__data_start__"] + + with shard_path.open("rb") as f: + header_len = struct.unpack(" bool: + return name in self.locations + + def __iter__(self) -> Iterator[str]: + return iter(self.locations.keys()) + + def load(self, name: str) -> np.ndarray[Any, Any]: + loc = self.locations[name] + shape = tuple(loc.shape) + offset = loc.data_start + loc.data_offsets[0] + 
+ if loc.dtype == "BF16": + raw = np.memmap(loc.shard, mode="r", dtype=np.uint16, offset=offset, shape=shape) + return bf16_to_float32(raw) + + dtype = _SAFETENSORS_DTYPES.get(loc.dtype) + if dtype is None: + raise ValueError(f"Unsupported safetensors dtype {loc.dtype!r} for tensor {name!r}") + + tensor = np.memmap(loc.shard, mode="r", dtype=dtype, offset=offset, shape=shape) + return np.asarray(tensor) + + +def bf16_to_float32(raw: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]: + u32 = raw.astype(np.uint32) << 16 + return u32.view(np.float32) + + +def to_serializable_config_value(value: Any) -> Any: + if isinstance(value, (str, bool, int, float)): + return value + raise TypeError(f"Unsupported config value type: {type(value)!r}") + + +def add_config_value(writer: gguf.GGUFWriter, key: str, value: Any) -> None: + value = to_serializable_config_value(value) + if isinstance(value, bool): + writer.add_bool(key, value) + elif isinstance(value, int): + if value >= 0: + writer.add_uint32(key, value) + else: + writer.add_int32(key, value) + elif isinstance(value, float): + writer.add_float32(key, value) + elif isinstance(value, str): + writer.add_string(key, value) + else: + raise TypeError(f"Unsupported config value type for {key!r}: {type(value)!r}") + + +def load_config(model_dir: Path) -> dict[str, Any]: + config_path = model_dir / "config.json" + if not config_path.exists(): + raise FileNotFoundError(f"Missing config.json under {model_dir}") + return json.loads(config_path.read_text()) + + +def validate_config(config: dict[str, Any]) -> None: + quantizer_type = config.get("quantizer_type") or config.get("quantizer_kwargs", {}).get("quantizer_type") + if quantizer_type not in SUPPORTED_QUANTIZER_TYPES: + raise ValueError( + f"Unsupported quantizer_type {quantizer_type!r}. 
" + f"This converter currently supports: {sorted(SUPPORTED_QUANTIZER_TYPES)}" + ) + + for section in ("encoder_kwargs", "decoder_kwargs"): + for idx, module_cfg in enumerate(config.get(section, [])): + module_type = module_cfg.get("module_type") + if module_type not in SUPPORTED_MODULE_TYPES: + raise ValueError(f"Unsupported {section}[{idx}].module_type={module_type!r}") + if module_type != "Transformer": + continue + gating = module_cfg.get("gating", "none") + if gating not in SUPPORTED_GATING: + raise ValueError(f"Unsupported {section}[{idx}].gating={gating!r}") + positional_embedding = module_cfg.get("positional_embedding", "rope") + if positional_embedding not in SUPPORTED_POSITIONAL_EMBEDDINGS: + raise ValueError( + f"Unsupported {section}[{idx}].positional_embedding={positional_embedding!r}" + ) + if "weights_per_step" in module_cfg and module_cfg["weights_per_step"]: + raise ValueError(f"Unsupported {section}[{idx}].weights_per_step={module_cfg['weights_per_step']!r}") + if "weights_per_step_schedule" in module_cfg and module_cfg["weights_per_step_schedule"]: + raise ValueError( + f"Unsupported {section}[{idx}].weights_per_step_schedule=" + f"{module_cfg['weights_per_step_schedule']!r}" + ) + + +def add_metadata( + writer: gguf.GGUFWriter, + config: dict[str, Any], + model_name: str, + *, + include_general_fields: bool = True, +) -> None: + if include_general_fields: + writer.add_type("audio_tokenizer") + writer.add_name(model_name) + + sampling_rate = int(config.get("sampling_rate", DEFAULT_SAMPLING_RATE)) + downsample_rate = int(config.get("downsample_rate", DEFAULT_DOWNSAMPLE_RATE)) + context_duration = float(config.get("causal_transformer_context_duration", DEFAULT_CONTEXT_DURATION)) + + writer.add_uint32(f"{ARCH}.sampling_rate", sampling_rate) + writer.add_uint32(f"{ARCH}.downsample_rate", downsample_rate) + writer.add_float32(f"{ARCH}.causal_transformer_context_duration", context_duration) + + if "code_dim" in config: + 
writer.add_uint32(f"{ARCH}.code_dim", int(config["code_dim"])) + + quantizer_type = config.get("quantizer_type") or config.get("quantizer_kwargs", {}).get("quantizer_type", "rlfq") + writer.add_string(f"{ARCH}.quantizer_type", quantizer_type) + + quantizer_cfg = dict(config.get("quantizer_kwargs", {})) + writer.add_uint32(f"{ARCH}.quantizer.input_dim", int(quantizer_cfg["input_dim"])) + writer.add_uint32(f"{ARCH}.quantizer.rvq_dim", int(quantizer_cfg.get("rvq_dim", quantizer_cfg["input_dim"]))) + writer.add_uint32(f"{ARCH}.quantizer.output_dim", int(quantizer_cfg.get("output_dim", quantizer_cfg["input_dim"]))) + writer.add_uint32(f"{ARCH}.quantizer.num_quantizers", int(quantizer_cfg["num_quantizers"])) + writer.add_uint32(f"{ARCH}.quantizer.codebook_size", int(quantizer_cfg["codebook_size"])) + writer.add_uint32(f"{ARCH}.quantizer.codebook_dim", int(quantizer_cfg["codebook_dim"])) + + write_module_list_metadata( + writer=writer, + arch=ARCH, + section_name="encoder", + module_cfgs=list(config.get("encoder_kwargs", [])), + initial_frame_rate=float(sampling_rate), + context_duration=context_duration, + is_encoder=True, + ) + + encoder_final_frame_rate = compute_final_encoder_frame_rate( + module_cfgs=list(config.get("encoder_kwargs", [])), + sampling_rate=float(sampling_rate), + ) + write_module_list_metadata( + writer=writer, + arch=ARCH, + section_name="decoder", + module_cfgs=list(config.get("decoder_kwargs", [])), + initial_frame_rate=encoder_final_frame_rate, + context_duration=context_duration, + is_encoder=False, + ) + + +def compute_final_encoder_frame_rate(module_cfgs: list[dict[str, Any]], sampling_rate: float) -> float: + frame_rate = sampling_rate + for module_cfg in module_cfgs: + if module_cfg.get("module_type") == "PatchedPretransform": + frame_rate /= int(module_cfg["patch_size"]) + return frame_rate + + +def write_module_list_metadata( + writer: gguf.GGUFWriter, + arch: str, + section_name: str, + module_cfgs: list[dict[str, Any]], + 
initial_frame_rate: float, + context_duration: float, + is_encoder: bool, +) -> None: + writer.add_uint32(f"{arch}.{section_name}.block_count", len(module_cfgs)) + + frame_rate = initial_frame_rate + for idx, module_cfg in enumerate(module_cfgs): + prefix = f"{arch}.{section_name}.{idx}" + module_type = module_cfg["module_type"] + writer.add_string(f"{prefix}.module_type", module_type) + + if module_type == "PatchedPretransform": + patch_size = int(module_cfg["patch_size"]) + writer.add_uint32(f"{prefix}.patch_size", patch_size) + if is_encoder: + frame_rate /= patch_size + else: + frame_rate *= patch_size + continue + + context = int(frame_rate * context_duration) + add_config_value(writer, f"{prefix}.context", context) + for key in ( + "input_dimension", + "output_dimension", + "d_model", + "num_heads", + "num_layers", + "dim_feedforward", + "causal", + "norm", + "positional_embedding", + "max_period", + "layer_scale", + "conv_layout", + "gating", + ): + if key in module_cfg: + add_config_value(writer, f"{prefix}.{key}", module_cfg[key]) + + +def map_tensor_name(name: str) -> str | None: + if ".parametrizations.weight.original0" in name: + return name.replace(".parametrizations.weight.original0", ".weight") + if ".parametrizations.weight.original1" in name: + return None + return name + + +def is_float_tensor(tensor: np.ndarray[Any, Any]) -> bool: + return np.issubdtype(tensor.dtype, np.floating) + + +def choose_output_dtype(tensor: np.ndarray[Any, Any], outtype: str) -> np.dtype[Any] | None: + if not is_float_tensor(tensor): + return None + if outtype == "f32": + return np.dtype(np.float32) + if outtype == "f16": + if tensor.ndim <= 1: + return np.dtype(np.float32) + return np.dtype(np.float16) + raise ValueError(f"Unsupported outtype {outtype!r}") + + +def convert_tensor_dtype(tensor: np.ndarray[Any, Any], outtype: str) -> np.ndarray[Any, Any]: + dst_dtype = choose_output_dtype(tensor, outtype) + if dst_dtype is None: + return np.ascontiguousarray(tensor) + if 
tensor.dtype == dst_dtype: + return np.ascontiguousarray(tensor) + return np.ascontiguousarray(tensor.astype(dst_dtype, copy=False)) + + +def merge_weight_norm(g: np.ndarray[Any, Any], v: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]: + axes = tuple(range(1, v.ndim)) + norm = np.linalg.norm(v.astype(np.float32), axis=axes, keepdims=True) + norm = np.maximum(norm, np.finfo(np.float32).eps) + return g.astype(np.float32) * v.astype(np.float32) / norm + + +def iter_converted_tensors(index: SafeTensorsIndex, outtype: str) -> Iterator[tuple[str, np.ndarray[Any, Any]]]: + emitted: set[str] = set() + + for name in index: + mapped_name = map_tensor_name(name) + if mapped_name is None or mapped_name in emitted: + continue + + if ".parametrizations.weight.original0" in name: + prefix = name.replace(".parametrizations.weight.original0", "") + g_name = f"{prefix}.parametrizations.weight.original0" + v_name = f"{prefix}.parametrizations.weight.original1" + weight = merge_weight_norm(index.load(g_name), index.load(v_name)) + yield mapped_name, convert_tensor_dtype(weight, outtype) + emitted.add(mapped_name) + continue + + tensor = index.load(name) + yield mapped_name, convert_tensor_dtype(tensor, outtype) + emitted.add(mapped_name) + + +def count_output_tensors(index: SafeTensorsIndex) -> int: + seen: set[str] = set() + for name in index: + mapped_name = map_tensor_name(name) + if mapped_name is not None: + seen.add(mapped_name) + return len(seen) + + +def default_outfile(model_dir: Path, outtype: str) -> Path: + return model_dir / f"{model_dir.name}-{outtype}.gguf" + + +def build_writer(outfile: Path, outtype: str, model_name: str, config: dict[str, Any]) -> gguf.GGUFWriter: + ftype_map = { + "f32": gguf.LlamaFileType.ALL_F32, + "f16": gguf.LlamaFileType.MOSTLY_F16, + } + writer = gguf.GGUFWriter(path=outfile, arch=ARCH) + writer.add_file_type(ftype_map[outtype]) + add_metadata(writer, config, model_name) + return writer + + +def convert(model_dir: Path, outfile: Path, outtype: 
str, model_name: str, dry_run: bool) -> None: + config = load_config(model_dir) + validate_config(config) + + index = SafeTensorsIndex(model_dir) + total_tensors = count_output_tensors(index) + logger.info("Found %d input tensors, %d output tensors", len(index.locations), total_tensors) + + if dry_run: + logger.info("Dry-run only, not writing %s", outfile) + return + + outfile.parent.mkdir(parents=True, exist_ok=True) + writer = build_writer(outfile=outfile, outtype=outtype, model_name=model_name, config=config) + try: + for i, (name, tensor) in enumerate(iter_converted_tensors(index, outtype), start=1): + logger.debug("[%4d / %4d] %s %s", i, total_tensors, name, list(tensor.shape)) + writer.add_tensor(name, tensor) + + writer.write_header_to_file() + writer.write_kv_data_to_file() + writer.write_tensors_to_file(progress=False) + logger.info("Wrote %s", outfile) + finally: + writer.close() + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Convert a Hugging Face MOSS Audio Tokenizer checkpoint to GGUF without modifying convert_hf_to_gguf.py." + ) + parser.add_argument( + "model_dir", + type=Path, + help="Path to a local MOSS Audio Tokenizer HF checkpoint directory containing config.json and safetensors shards.", + ) + parser.add_argument( + "--outfile", + type=Path, + default=None, + help="Output GGUF path. Defaults to /-.gguf", + ) + parser.add_argument( + "--outtype", + choices=("f16", "f32"), + default="f16", + help="GGUF floating-point storage type. f16 keeps 1D float tensors in f32, matching MOSTLY_F16 semantics.", + ) + parser.add_argument( + "--model-name", + type=str, + default=None, + help="Optional GGUF general.name override. 
Defaults to the checkpoint directory name.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Validate the checkpoint and tensor mapping without writing the GGUF file.", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Enable per-tensor logging.", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format="%(levelname)s:%(name)s:%(message)s", + ) + + model_dir = args.model_dir.resolve() + outfile = args.outfile.resolve() if args.outfile is not None else default_outfile(model_dir, args.outtype) + model_name = args.model_name or model_dir.name + + convert( + model_dir=model_dir, + outfile=outfile, + outtype=args.outtype, + model_name=model_name, + dry_run=args.dry_run, + ) + + +if __name__ == "__main__": + main() diff --git a/docs/moss-tts-firstclass-e2e.md b/docs/moss-tts-firstclass-e2e.md index bdf9efd96..2c9e871e6 100644 --- a/docs/moss-tts-firstclass-e2e.md +++ b/docs/moss-tts-firstclass-e2e.md @@ -4,24 +4,21 @@ This document describes the **first-class** MOSS-TTS end-to-end inference pipeline in the current `llama.cpp` repository. 
-This pipeline uses: +There are currently two ways to run it: -- **llama.cpp** and `llama-moss-tts` to run the first-class MOSS-TTS-Delay GGUF model -- **ONNX Runtime** for reference-audio encoding and final waveform decoding -- **Python helper scripts** for prompt construction and end-to-end orchestration -- A local **MOSS-TTS** checkout that provides the prompt builder and ONNX tokenizer Python modules +- **Recommended native path**: all three models run inside `llama.cpp` + - `moss-tts-delay` backbone via `llama_decode()` + - `moss-tts-audio-encoder` via `llama_encode()` + - `moss-tts-audio-decoder` via `llama_encode()` +- **Hybrid wrapper path**: backbone in `llama.cpp`, audio tokenizer in ONNX, orchestrated by Python -Unlike the older `moss_tts_delay/llama_cpp` backend in the `MOSS-TTS` repository, this path moves multi-channel inputs, the transformer backbone, multi-head outputs, and delay-pattern decoding into `llama.cpp`. Python is only responsible for preparing inputs and invoking the ONNX audio tokenizer. +Unlike the older `moss_tts_delay/llama_cpp` backend in the `MOSS-TTS` repository, this path moves multi-channel inputs, the transformer backbone, multi-head outputs, and delay-pattern decoding into `llama.cpp`. ## Prerequisites 1. **llama.cpp** built from source with the `llama-moss-tts` target -2. **Python >= 3.10** -3. A local **MOSS-TTS** checkout, provided in any of the following ways: - - available at `../MOSS-TTS` relative to the repository root - - passed through `--moss-tts-dir` - - passed through `MOSS_TTS_DIR` or `MOSS_TTS_ROOT` -4. Python packages required by the helper scripts: +2. **Python >= 3.10** if you want to use the hybrid wrapper or the converter scripts +3. Python packages required by the hybrid helper scripts: - `numpy` - `soundfile` - `tokenizers` @@ -29,22 +26,37 @@ Unlike the older `moss_tts_delay/llama_cpp` backend in the `MOSS-TTS` repository ## Build +### CPU-only build + ```bash cd /path/to/llama.cpp -cmake -S . 
-B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON +cmake -S . -B build -DCMAKE_BUILD_TYPE=Release cmake --build build --target llama-moss-tts -j ``` -The resulting binary is: +Binary: - `build/bin/llama-moss-tts` -If you want to build at runtime, you can also pass `--build` to the e2e script. +### CUDA build + +```bash +cd /path/to/llama.cpp + +cmake -S . -B build-cuda -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON +cmake --build build-cuda --target llama-moss-tts -j +``` + +Binary: + +- `build-cuda/bin/llama-moss-tts` + +If you want to build the hybrid wrapper at runtime, you can also pass `--build` to the e2e script. ## Weight Preparation -### Step 1: Prepare the first-class GGUF model +### Step 1: Prepare the backbone GGUF You need a first-class MOSS-TTS-Delay GGUF model that already contains: @@ -75,7 +87,30 @@ Important: - It is **not** the same thing as a generic GGUF downloaded from `OpenMOSS/MOSS-TTS-GGUF`. - Do not point this pipeline at a file from `OpenMOSS/MOSS-TTS-GGUF` unless that file was explicitly produced as a first-class MOSS-TTS-Delay GGUF for this `llama.cpp` implementation. 
-### Step 2: Prepare the tokenizer directory +### Step 2: Prepare the native audio encoder / decoder GGUFs + +You need two additional GGUF files: + +- `moss-tts-audio-encoder` +- `moss-tts-audio-decoder` + +They can be generated from the Hugging Face `MOSS-Audio-Tokenizer` directory with: + +```bash +huggingface-cli download OpenMOSS-Team/MOSS-Audio-Tokenizer --local-dir /path/to/MOSS-Audio-Tokenizer-hf + +python convert_moss_audio_tokenizer_split_to_gguf.py \ + /path/to/MOSS-Audio-Tokenizer-hf \ + --outdir /path/to/out \ + --outtype f16 +``` + +Typical outputs: + +- `/path/to/out/moss_tts_audio_encoder_f16.gguf` +- `/path/to/out/moss_tts_audio_decoder_f16.gguf` + +### Step 3: Prepare the tokenizer directory for the hybrid wrapper You need a tokenizer directory containing at least: @@ -85,7 +120,7 @@ For example: - `weights/extracted/qwen3_backbone/` -### Step 3: Prepare the ONNX audio tokenizer +### Step 4: Prepare the ONNX audio tokenizer for the hybrid wrapper You need both ONNX files: @@ -97,34 +132,70 @@ For example: - `weights/MOSS-Audio-Tokenizer-ONNX/encoder.onnx` - `weights/MOSS-Audio-Tokenizer-ONNX/decoder.onnx` -### Step 4: Make the MOSS-TTS repository visible +## Usage -The helper scripts import: +### Current Native Runtime: Three GGUFs -- `moss_tts_delay.llama_cpp.processor` -- `moss_audio_tokenizer.onnx` +This is the current recommended path. -You can provide the repository path like this: +#### CPU ```bash -export MOSS_TTS_DIR=/path/to/MOSS-TTS +# Text-only TTS on CPU +build/bin/llama-moss-tts \ + -m /path/to/moss_delay_firstclass_f16.gguf \ + --audio-decoder-model /path/to/moss_tts_audio_decoder_f16.gguf \ + --text "Hello, world!" 
\ + --wav-out /path/to/output.wav + +# Voice cloning on CPU +build/bin/llama-moss-tts \ + -m /path/to/moss_delay_firstclass_f16.gguf \ + --audio-encoder-model /path/to/moss_tts_audio_encoder_f16.gguf \ + --audio-decoder-model /path/to/moss_tts_audio_decoder_f16.gguf \ + --text-file /path/to/text.txt \ + --reference-audio /path/to/reference_24k.wav \ + --wav-out /path/to/output.wav ``` -or: +#### GPU ```bash -python tools/tts/moss-tts-firstclass-e2e.py --moss-tts-dir /path/to/MOSS-TTS ... +# Text-only TTS on GPU +build-cuda/bin/llama-moss-tts \ + -m /path/to/moss_delay_firstclass_f16.gguf \ + --audio-decoder-model /path/to/moss_tts_audio_decoder_f16.gguf \ + --text "Hello, world!" \ + --wav-out /path/to/output.wav \ + -ngl -1 + +# Voice cloning on GPU +build-cuda/bin/llama-moss-tts \ + -m /path/to/moss_delay_firstclass_f16.gguf \ + --audio-encoder-model /path/to/moss_tts_audio_encoder_f16.gguf \ + --audio-decoder-model /path/to/moss_tts_audio_decoder_f16.gguf \ + --text-file /path/to/text.txt \ + --reference-audio /path/to/reference_24k.wav \ + --wav-out /path/to/output.wav \ + -ngl -1 ``` -## Usage +Notes: + +- `--reference-audio` must be a 24 kHz mono wav. +- `-ngl -1` means "offload all eligible layers to GPU". +- If you built `build-cuda/bin/llama-moss-tts` but want to force CPU execution, use `-ngl 0`. -### CLI +### Hybrid Wrapper: Backbone in GGUF, Audio Tokenizer in ONNX + +This path remains useful for parity checks and intermediate artifact inspection. 
+ +#### CLI ```bash # Voice cloning: text + reference audio -> wav python tools/tts/moss-tts-firstclass-e2e.py \ --model-gguf /path/to/moss_delay_firstclass.gguf \ - --moss-tts-dir /path/to/MOSS-TTS \ --tokenizer-dir /path/to/tokenizer_dir \ --onnx-encoder /path/to/encoder.onnx \ --onnx-decoder /path/to/decoder.onnx \ @@ -135,7 +206,6 @@ python tools/tts/moss-tts-firstclass-e2e.py \ # Direct generation without reference audio python tools/tts/moss-tts-firstclass-e2e.py \ --model-gguf /path/to/moss_delay_firstclass.gguf \ - --moss-tts-dir /path/to/MOSS-TTS \ --tokenizer-dir /path/to/tokenizer_dir \ --onnx-encoder /path/to/encoder.onnx \ --onnx-decoder /path/to/decoder.onnx \ @@ -146,7 +216,6 @@ python tools/tts/moss-tts-firstclass-e2e.py \ python tools/tts/moss-tts-firstclass-e2e.py \ --build \ --model-gguf /path/to/moss_delay_firstclass.gguf \ - --moss-tts-dir /path/to/MOSS-TTS \ --tokenizer-dir /path/to/tokenizer_dir \ --onnx-encoder /path/to/encoder.onnx \ --onnx-decoder /path/to/decoder.onnx \ @@ -159,7 +228,7 @@ python tools/tts/moss-tts-firstclass-e2e.py \ | Option | Values | Description | |------|------|------| | `--model-gguf` | path | First-class MOSS-TTS GGUF model | -| `--moss-tts-dir` | path | Local `MOSS-TTS` repository root | +| `--moss-tts-dir` | path | Deprecated compatibility flag; no longer required | | `--tokenizer-dir` | path | Directory containing `tokenizer.json` | | `--onnx-encoder` | path | Audio tokenizer encoder ONNX | | `--onnx-decoder` | path | Audio tokenizer decoder ONNX | @@ -174,8 +243,41 @@ python tools/tts/moss-tts-firstclass-e2e.py \ | `--cpu-audio-encode` | flag | Force ONNX reference-audio encoding on CPU | | `--build` | flag | Build `llama-moss-tts` before running | +### Native Runtime Options + +| Option | Values | Description | +|------|------|------| +| `-m` | path | Backbone `moss-tts-delay` GGUF | +| `--audio-encoder-model` | path | Native `moss-tts-audio-encoder` GGUF | +| `--audio-decoder-model` | path | Native 
`moss-tts-audio-decoder` GGUF | +| `--text` / `--text-file` | string / path | Input text, choose exactly one | +| `--reference-audio` | path | Optional 24 kHz reference wav | +| `--language` | `zh` / `en` / tag | Language tag passed to the prompt builder | +| `--max-new-tokens` | int | Maximum generation steps | +| `--gpu-layers` / `-ngl` | `-1` / `0` / `N` | GPU offload layers | +| `--wav-out` | path | Output wav path | + ## Architecture +### Native Three-GGUF Path + +```text +Input text (+ optional reference wav) + | + v +llama-moss-tts + | + |- text prompt packing + |- optional reference wav -> moss-tts-audio-encoder -> reference audio codes + |- moss-tts-delay backbone via llama_decode() + |- multi-head sampling + C++ delay-pattern decoding + |- raw audio codes -> moss-tts-audio-decoder -> waveform + v +wav +``` + +### Hybrid Wrapper Path + ```text Input text (+ optional reference wav) | @@ -184,7 +286,7 @@ moss-tts-build-generation-ref.py | |- tokenizes text with the Qwen3 tokenizer |- optionally encodes the reference wav into audio codes with ONNX - |- calls the prompt builder from the local MOSS-TTS repo + |- builds the packed prompt with the local lightweight MOSS-TTS processor v generation.ref.bin | @@ -232,11 +334,14 @@ llama.cpp/ ├── docs/ │ ├── moss-tts-firstclass-e2e.md │ └── moss-tts-firstclass-e2e_zh.md +├── convert_moss_audio_tokenizer_split_to_gguf.py ├── tools/tts/ │ ├── moss-tts-firstclass-e2e.py # End-to-end wrapper │ ├── moss-tts-build-generation-ref.py # Prompt / input builder │ ├── moss-tts-audio-decode.py # ONNX audio decode helper -│ └── moss-tts.cpp # llama-moss-tts implementation -└── build/bin/ +│ └── run-moss-tts-delay.cpp # llama-moss-tts implementation +├── build/bin/ +│ └── llama-moss-tts +└── build-cuda/bin/ └── llama-moss-tts ``` diff --git a/docs/moss-tts-firstclass-e2e_zh.md b/docs/moss-tts-firstclass-e2e_zh.md index 644a4bf4c..593e6b736 100644 --- a/docs/moss-tts-firstclass-e2e_zh.md +++ b/docs/moss-tts-firstclass-e2e_zh.md @@ 
-4,24 +4,21 @@ 本文档说明当前 `llama.cpp` 仓库中的 **first-class** MOSS-TTS 端到端推理链路。 -这条链路使用: +目前有两种运行方式: -- **llama.cpp** 和 `llama-moss-tts` 运行 first-class MOSS-TTS-Delay GGUF 模型 -- **ONNX Runtime** 完成参考音频编码和最终波形解码 -- **Python helper scripts** 负责 prompt 构建和整条链路编排 -- 本地 **MOSS-TTS** 仓库 checkout 提供 prompt builder 和 ONNX tokenizer Python 模块 +- **推荐的原生路径**:三个模型都在 `llama.cpp` 里运行 + - `moss-tts-delay` backbone 通过 `llama_decode()` + - `moss-tts-audio-encoder` 通过 `llama_encode()` + - `moss-tts-audio-decoder` 通过 `llama_encode()` +- **Hybrid wrapper 路径**:backbone 在 `llama.cpp`,音频 tokenizer 仍走 ONNX,由 Python 统一编排 -与 `MOSS-TTS` 仓库中较早的 `moss_tts_delay/llama_cpp` 后端不同,这条链路把多通道输入、transformer backbone、多头输出以及 delay-pattern decode 都放进了 `llama.cpp`。Python 只负责准备输入和调用 ONNX 音频编解码器。 +与 `MOSS-TTS` 仓库中较早的 `moss_tts_delay/llama_cpp` 后端不同,这条链路把多通道输入、transformer backbone、多头输出以及 delay-pattern decode 都放进了 `llama.cpp`。 ## 前置条件 1. **llama.cpp** 已从源码编译,并包含 `llama-moss-tts` 目标 -2. **Python >= 3.10** -3. 本地存在一个 **MOSS-TTS** checkout,可以通过以下任一方式提供: - - 位于当前仓库根目录旁边的 `../MOSS-TTS` - - 通过 `--moss-tts-dir` 指定 - - 通过 `MOSS_TTS_DIR` 或 `MOSS_TTS_ROOT` 指定 -4. helper scripts 需要的 Python 包: +2. **Python >= 3.10**,如果你要使用 hybrid wrapper 或转换脚本 +3. hybrid helper scripts 需要的 Python 包: - `numpy` - `soundfile` - `tokenizers` @@ -29,22 +26,37 @@ ## 编译 +### 仅 CPU 构建 + ```bash cd /path/to/llama.cpp -cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON +cmake -S . -B build -DCMAKE_BUILD_TYPE=Release cmake --build build --target llama-moss-tts -j ``` -编译产物为: +产物: - `build/bin/llama-moss-tts` -如果你希望在运行时自动构建,也可以在 e2e 脚本里传 `--build`。 +### CUDA 构建 + +```bash +cd /path/to/llama.cpp + +cmake -S . 
-B build-cuda -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON +cmake --build build-cuda --target llama-moss-tts -j +``` + +产物: + +- `build-cuda/bin/llama-moss-tts` + +如果你希望在 hybrid wrapper 运行时自动构建,也可以传 `--build`。 ## 权重准备 -### 第一步:准备 first-class GGUF 模型 +### 第一步:准备 backbone GGUF 需要一个已经包含以下内容的 first-class MOSS-TTS-Delay GGUF: @@ -75,7 +87,30 @@ python convert_hf_to_gguf.py \ - 它**不是** `OpenMOSS/MOSS-TTS-GGUF` 仓库里的通用 GGUF 文件。 - 除非某个文件被明确说明为适配这套 `llama.cpp` first-class 实现的 MOSS-TTS-Delay GGUF,否则不要把 `OpenMOSS/MOSS-TTS-GGUF` 里的文件直接拿来给这条 e2e 流水线使用。 -### 第二步:准备 tokenizer 目录 +### 第二步:准备原生 audio encoder / decoder GGUF + +还需要两个额外的 GGUF 文件: + +- `moss-tts-audio-encoder` +- `moss-tts-audio-decoder` + +它们可以从 Hugging Face 的 `MOSS-Audio-Tokenizer` 目录转换得到: + +```bash +huggingface-cli download OpenMOSS-Team/MOSS-Audio-Tokenizer --local-dir /path/to/MOSS-Audio-Tokenizer-hf + +python convert_moss_audio_tokenizer_split_to_gguf.py \ + /path/to/MOSS-Audio-Tokenizer-hf \ + --outdir /path/to/out \ + --outtype f16 +``` + +典型输出: + +- `/path/to/out/moss_tts_audio_encoder_f16.gguf` +- `/path/to/out/moss_tts_audio_decoder_f16.gguf` + +### 第三步:为 hybrid wrapper 准备 tokenizer 目录 需要一个至少包含以下文件的 tokenizer 目录: @@ -85,7 +120,7 @@ python convert_hf_to_gguf.py \ - `weights/extracted/qwen3_backbone/` -### 第三步:准备 ONNX 音频编解码器 +### 第四步:为 hybrid wrapper 准备 ONNX 音频编解码器 需要同时提供两个 ONNX 文件: @@ -97,34 +132,70 @@ python convert_hf_to_gguf.py \ - `weights/MOSS-Audio-Tokenizer-ONNX/encoder.onnx` - `weights/MOSS-Audio-Tokenizer-ONNX/decoder.onnx` -### 第四步:让脚本能找到 MOSS-TTS 仓库 +## 使用方式 -helper scripts 会导入: +### 当前原生运行方式:三个 GGUF -- `moss_tts_delay.llama_cpp.processor` -- `moss_audio_tokenizer.onnx` +这是当前推荐路径。 -可以通过以下方式提供 repo 路径: +#### CPU ```bash -export MOSS_TTS_DIR=/path/to/MOSS-TTS +# 纯文本 TTS,CPU 运行 +build/bin/llama-moss-tts \ + -m /path/to/moss_delay_firstclass_f16.gguf \ + --audio-decoder-model /path/to/moss_tts_audio_decoder_f16.gguf \ + --text "你好,世界!" 
\ + --wav-out /path/to/output.wav + +# 音色克隆,CPU 运行 +build/bin/llama-moss-tts \ + -m /path/to/moss_delay_firstclass_f16.gguf \ + --audio-encoder-model /path/to/moss_tts_audio_encoder_f16.gguf \ + --audio-decoder-model /path/to/moss_tts_audio_decoder_f16.gguf \ + --text-file /path/to/text.txt \ + --reference-audio /path/to/reference_24k.wav \ + --wav-out /path/to/output.wav ``` -或者: +#### GPU ```bash -python tools/tts/moss-tts-firstclass-e2e.py --moss-tts-dir /path/to/MOSS-TTS ... +# 纯文本 TTS,GPU 运行 +build-cuda/bin/llama-moss-tts \ + -m /path/to/moss_delay_firstclass_f16.gguf \ + --audio-decoder-model /path/to/moss_tts_audio_decoder_f16.gguf \ + --text "你好,世界!" \ + --wav-out /path/to/output.wav \ + -ngl -1 + +# 音色克隆,GPU 运行 +build-cuda/bin/llama-moss-tts \ + -m /path/to/moss_delay_firstclass_f16.gguf \ + --audio-encoder-model /path/to/moss_tts_audio_encoder_f16.gguf \ + --audio-decoder-model /path/to/moss_tts_audio_decoder_f16.gguf \ + --text-file /path/to/text.txt \ + --reference-audio /path/to/reference_24k.wav \ + --wav-out /path/to/output.wav \ + -ngl -1 ``` -## 使用方式 +说明: + +- `--reference-audio` 必须是 24 kHz 单声道 wav。 +- `-ngl -1` 表示尽可能把可 offload 的层全部放到 GPU。 +- 如果你使用的是 `build-cuda/bin/llama-moss-tts` 但想强制走 CPU,可以传 `-ngl 0`。 -### 命令行 +### Hybrid wrapper:backbone 走 GGUF,音频 tokenizer 走 ONNX + +这条路径仍然适合做 parity 检查和中间产物调试。 + +#### 命令行 ```bash # 音色克隆:text + reference audio -> wav python tools/tts/moss-tts-firstclass-e2e.py \ --model-gguf /path/to/moss_delay_firstclass.gguf \ - --moss-tts-dir /path/to/MOSS-TTS \ --tokenizer-dir /path/to/tokenizer_dir \ --onnx-encoder /path/to/encoder.onnx \ --onnx-decoder /path/to/decoder.onnx \ @@ -135,7 +206,6 @@ python tools/tts/moss-tts-firstclass-e2e.py \ # 不带参考音频的直接生成 python tools/tts/moss-tts-firstclass-e2e.py \ --model-gguf /path/to/moss_delay_firstclass.gguf \ - --moss-tts-dir /path/to/MOSS-TTS \ --tokenizer-dir /path/to/tokenizer_dir \ --onnx-encoder /path/to/encoder.onnx \ --onnx-decoder /path/to/decoder.onnx \ @@ -146,7 +216,6 
@@ python tools/tts/moss-tts-firstclass-e2e.py \ python tools/tts/moss-tts-firstclass-e2e.py \ --build \ --model-gguf /path/to/moss_delay_firstclass.gguf \ - --moss-tts-dir /path/to/MOSS-TTS \ --tokenizer-dir /path/to/tokenizer_dir \ --onnx-encoder /path/to/encoder.onnx \ --onnx-decoder /path/to/decoder.onnx \ @@ -160,7 +229,7 @@ python tools/tts/moss-tts-firstclass-e2e.py \ | 参数 | 取值 | 说明 | |------|------|------| | `--model-gguf` | path | first-class MOSS-TTS GGUF 模型 | -| `--moss-tts-dir` | path | 本地 `MOSS-TTS` 仓库根目录 | +| `--moss-tts-dir` | path | 已废弃的兼容参数;不再需要 | | `--tokenizer-dir` | path | 含 `tokenizer.json` 的目录 | | `--onnx-encoder` | path | 音频 tokenizer encoder ONNX | | `--onnx-decoder` | path | 音频 tokenizer decoder ONNX | @@ -175,8 +244,41 @@ python tools/tts/moss-tts-firstclass-e2e.py \ | `--cpu-audio-encode` | flag | 强制 ONNX 参考音频编码走 CPU | | `--build` | flag | 运行前构建 `llama-moss-tts` | +### 原生运行参数 + +| 参数 | 取值 | 说明 | +|------|------|------| +| `-m` | path | `moss-tts-delay` backbone GGUF | +| `--audio-encoder-model` | path | 原生 `moss-tts-audio-encoder` GGUF | +| `--audio-decoder-model` | path | 原生 `moss-tts-audio-decoder` GGUF | +| `--text` / `--text-file` | string / path | 输入文本,二选一 | +| `--reference-audio` | path | 可选 24 kHz reference wav | +| `--language` | `zh` / `en` / tag | 传给 prompt builder 的语言标签 | +| `--max-new-tokens` | int | 最大生成步数 | +| `--gpu-layers` / `-ngl` | `-1` / `0` / `N` | GPU offload 层数 | +| `--wav-out` | path | 输出 wav 路径 | + ## 架构 +### 原生三 GGUF 路径 + +```text +输入文本(+ 可选 reference wav) + | + v +llama-moss-tts + | + |- 文本 prompt 打包 + |- 可选:reference wav -> moss-tts-audio-encoder -> reference audio codes + |- moss-tts-delay backbone,经由 llama_decode() + |- 多头采样 + C++ delay-pattern decode + |- raw audio codes -> moss-tts-audio-decoder -> waveform + v +wav +``` + +### Hybrid wrapper 路径 + ```text 输入文本(+ 可选 reference wav) | @@ -185,7 +287,7 @@ moss-tts-build-generation-ref.py | |- 用 Qwen3 tokenizer 处理文本 |- 可选:用 ONNX 把 reference wav 编成 audio codes - 
|- 调用本地 MOSS-TTS repo 的 prompt builder + |- 用仓库内置的轻量 MOSS-TTS processor 构建 packed prompt v generation.ref.bin | @@ -233,11 +335,14 @@ llama.cpp/ ├── docs/ │ ├── moss-tts-firstclass-e2e.md │ └── moss-tts-firstclass-e2e_zh.md +├── convert_moss_audio_tokenizer_split_to_gguf.py ├── tools/tts/ │ ├── moss-tts-firstclass-e2e.py # 端到端 wrapper │ ├── moss-tts-build-generation-ref.py # prompt / input 构建器 │ ├── moss-tts-audio-decode.py # ONNX 音频解码 helper -│ └── moss-tts.cpp # llama-moss-tts 实现 -└── build/bin/ +│ └── run-moss-tts-delay.cpp # llama-moss-tts 实现 +├── build/bin/ +│ └── llama-moss-tts +└── build-cuda/bin/ └── llama-moss-tts ``` diff --git a/include/llama-moss-audio-tokenizer.h b/include/llama-moss-audio-tokenizer.h new file mode 100644 index 000000000..5e0326787 --- /dev/null +++ b/include/llama-moss-audio-tokenizer.h @@ -0,0 +1,61 @@ +#pragma once + +#ifndef __cplusplus +#error "This header is for C++ only" +#endif + +#include "llama.h" + +#include +#include +#include +#include +#include + +struct moss_audio_tokenizer_options { + int n_threads = -1; +}; + +class LLAMA_API moss_audio_tokenizer { +public: + explicit moss_audio_tokenizer( + const std::string & model_path, + const moss_audio_tokenizer_options & options = {}); + ~moss_audio_tokenizer(); + + moss_audio_tokenizer(const moss_audio_tokenizer &) = delete; + moss_audio_tokenizer & operator=(const moss_audio_tokenizer &) = delete; + + moss_audio_tokenizer(moss_audio_tokenizer &&) noexcept; + moss_audio_tokenizer & operator=(moss_audio_tokenizer &&) noexcept; + + int sample_rate() const; + uint32_t downsample_rate() const; + uint32_t num_quantizers() const; + + std::vector decode( + const std::vector & codes, + size_t n_frames, + uint32_t n_quantizers = 0) const; + + std::vector encode( + const std::vector & audio, + size_t * out_frames = nullptr, + uint32_t n_quantizers = 0) const; + +private: + struct impl; + std::unique_ptr impl_; +}; + +LLAMA_API int moss_audio_model_sample_rate(const struct llama_model * 
model); + +LLAMA_API uint32_t moss_audio_model_downsample_rate(const struct llama_model * model); + +LLAMA_API uint32_t moss_audio_model_num_quantizers(const struct llama_model * model); + +LLAMA_API std::vector moss_audio_model_quantizer_encode( + const struct llama_model * model, + const std::vector & input, + size_t n_frames, + uint32_t n_quantizers = 0); diff --git a/include/llama.h b/include/llama.h index c79adbaf5..3abcd000b 100644 --- a/include/llama.h +++ b/include/llama.h @@ -556,6 +556,7 @@ extern "C" { LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model); LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model); LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model); + LLAMA_API int32_t llama_model_n_out_i32 (const struct llama_model * model); LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model); LLAMA_API int32_t llama_model_n_head (const struct llama_model * model); LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); @@ -1016,6 +1017,18 @@ extern "C" { // otherwise: float[n_embd] (1-dimensional) LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id); + // Get all raw int32 outputs. + // shape: [n_outputs*n_out_i32] + // Returns NULL if the current model/graph does not expose any raw int32 outputs. + LLAMA_API int32_t * llama_get_output_i32(struct llama_context * ctx); + + // Get the raw int32 outputs for the ith token/output row. + // For positive indices, equivalent to: + // llama_get_output_i32(ctx) + ctx->output_ids[i]*n_out_i32 + // Negative indices can be used to access outputs in reverse order, -1 is the last row. + // Returns NULL for invalid ids or when no raw int32 outputs are available. 
+ LLAMA_API int32_t * llama_get_output_i32_ith(struct llama_context * ctx, int32_t i); + // // backend sampling API [EXPERIMENTAL] // note: use only if the llama_context was created with at least one llama_sampler_seq_config diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 06e6e23ed..b93054d70 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -7,6 +7,7 @@ llama_add_compile_flags() # llama add_library(llama + ../include/llama-moss-audio-tokenizer.h ../include/llama.h llama.cpp llama-adapter.cpp @@ -100,6 +101,10 @@ add_library(llama models/minicpm3.cpp models/minimax-m2.cpp models/mistral3.cpp + models/moss-audio-common.cpp + models/moss-audio-decoder.cpp + models/moss-audio-encoder.cpp + models/moss-audio-tokenizer.cpp models/moss-tts-delay.cpp models/modern-bert.cpp models/mpt.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 07800c68a..c27828ee0 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -36,6 +36,8 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_QWEN2VL, "qwen2vl" }, { LLM_ARCH_QWEN3, "qwen3" }, { LLM_ARCH_MOSS_TTS_DELAY, "moss-tts-delay" }, + { LLM_ARCH_MOSS_TTS_AUDIO_ENCODER, "moss-tts-audio-encoder" }, + { LLM_ARCH_MOSS_TTS_AUDIO_DECODER, "moss-tts-audio-decoder" }, { LLM_ARCH_QWEN3MOE, "qwen3moe" }, { LLM_ARCH_QWEN3NEXT, "qwen3next" }, { LLM_ARCH_QWEN3VL, "qwen3vl" }, @@ -548,6 +550,21 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" }, { LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" }, { LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" }, + { LLM_TENSOR_MOSS_AUDIO_BLOCK_INPUT_PROJ, "blk.%d.input_proj" }, + { LLM_TENSOR_MOSS_AUDIO_BLOCK_OUTPUT_PROJ, "blk.%d.output_proj" }, + { LLM_TENSOR_MOSS_AUDIO_ATTN_QKV, "blk.%d.layer.%d.attn_qkv" }, + { LLM_TENSOR_MOSS_AUDIO_ATTN_OUT, "blk.%d.layer.%d.attn_output" }, + { LLM_TENSOR_MOSS_AUDIO_ATTN_NORM, "blk.%d.layer.%d.attn_norm" }, + { LLM_TENSOR_MOSS_AUDIO_FFN_UP, "blk.%d.layer.%d.ffn_up" }, + { 
LLM_TENSOR_MOSS_AUDIO_FFN_DOWN, "blk.%d.layer.%d.ffn_down" }, + { LLM_TENSOR_MOSS_AUDIO_FFN_NORM, "blk.%d.layer.%d.ffn_norm" }, + { LLM_TENSOR_MOSS_AUDIO_ATTN_SCALE, "blk.%d.layer.%d.attn_scale" }, + { LLM_TENSOR_MOSS_AUDIO_FFN_SCALE, "blk.%d.layer.%d.ffn_scale" }, + { LLM_TENSOR_MOSS_AUDIO_QUANT_INPUT_PROJ, "quantizer.input_proj" }, + { LLM_TENSOR_MOSS_AUDIO_QUANT_OUTPUT_PROJ, "quantizer.output_proj" }, + { LLM_TENSOR_MOSS_AUDIO_QUANT_CODEBOOK, "quantizer.quantizers.%d.codebook" }, + { LLM_TENSOR_MOSS_AUDIO_QUANT_IN_PROJ, "quantizer.quantizers.%d.in_proj" }, + { LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ, "quantizer.quantizers.%d.out_proj" }, }; static std::set llm_get_tensor_names(llm_arch arch) { @@ -1002,6 +1019,25 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_UP, }; + case LLM_ARCH_MOSS_TTS_AUDIO_ENCODER: + case LLM_ARCH_MOSS_TTS_AUDIO_DECODER: + return { + LLM_TENSOR_MOSS_AUDIO_BLOCK_INPUT_PROJ, + LLM_TENSOR_MOSS_AUDIO_BLOCK_OUTPUT_PROJ, + LLM_TENSOR_MOSS_AUDIO_ATTN_QKV, + LLM_TENSOR_MOSS_AUDIO_ATTN_OUT, + LLM_TENSOR_MOSS_AUDIO_ATTN_NORM, + LLM_TENSOR_MOSS_AUDIO_FFN_UP, + LLM_TENSOR_MOSS_AUDIO_FFN_DOWN, + LLM_TENSOR_MOSS_AUDIO_FFN_NORM, + LLM_TENSOR_MOSS_AUDIO_ATTN_SCALE, + LLM_TENSOR_MOSS_AUDIO_FFN_SCALE, + LLM_TENSOR_MOSS_AUDIO_QUANT_INPUT_PROJ, + LLM_TENSOR_MOSS_AUDIO_QUANT_OUTPUT_PROJ, + LLM_TENSOR_MOSS_AUDIO_QUANT_CODEBOOK, + LLM_TENSOR_MOSS_AUDIO_QUANT_IN_PROJ, + LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ, + }; case LLM_ARCH_QWEN3MOE: case LLM_ARCH_QWEN3VLMOE: case LLM_ARCH_OLMOE: @@ -2792,6 +2828,21 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, + {LLM_TENSOR_MOSS_AUDIO_BLOCK_INPUT_PROJ,{LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + 
{LLM_TENSOR_MOSS_AUDIO_BLOCK_OUTPUT_PROJ,{LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_MOSS_AUDIO_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_MOSS_AUDIO_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_MOSS_AUDIO_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_MOSS_AUDIO_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_MOSS_AUDIO_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_MOSS_AUDIO_FFN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_MOSS_AUDIO_ATTN_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_MOSS_AUDIO_FFN_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_MOSS_AUDIO_QUANT_INPUT_PROJ,{LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_MOSS_AUDIO_QUANT_OUTPUT_PROJ,{LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_MOSS_AUDIO_QUANT_CODEBOOK, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}}, + {LLM_TENSOR_MOSS_AUDIO_QUANT_IN_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Nemotron 3 Super {LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, @@ -2827,6 +2878,9 @@ std::string LLM_TN_IMPL::str() const { switch (tensor) { case LLM_TENSOR_TOKEN_EMBD_AUDIO: case LLM_TENSOR_OUTPUT_AUDIO: + case LLM_TENSOR_MOSS_AUDIO_QUANT_CODEBOOK: + case LLM_TENSOR_MOSS_AUDIO_QUANT_IN_PROJ: + case LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ: name = ::format(LLM_TENSOR_NAMES.at(tensor), xid); break; default: diff --git a/src/llama-arch.h b/src/llama-arch.h index 9320b01da..4b042cb66 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -40,6 +40,8 @@ enum llm_arch { LLM_ARCH_QWEN2VL, LLM_ARCH_QWEN3, LLM_ARCH_MOSS_TTS_DELAY, + LLM_ARCH_MOSS_TTS_AUDIO_ENCODER, + LLM_ARCH_MOSS_TTS_AUDIO_DECODER, LLM_ARCH_QWEN3MOE, 
LLM_ARCH_QWEN3NEXT, LLM_ARCH_QWEN3VL, @@ -556,6 +558,21 @@ enum llm_tensor { LLM_TENSOR_NEXTN_HNORM, LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, + LLM_TENSOR_MOSS_AUDIO_BLOCK_INPUT_PROJ, + LLM_TENSOR_MOSS_AUDIO_BLOCK_OUTPUT_PROJ, + LLM_TENSOR_MOSS_AUDIO_ATTN_QKV, + LLM_TENSOR_MOSS_AUDIO_ATTN_OUT, + LLM_TENSOR_MOSS_AUDIO_ATTN_NORM, + LLM_TENSOR_MOSS_AUDIO_FFN_UP, + LLM_TENSOR_MOSS_AUDIO_FFN_DOWN, + LLM_TENSOR_MOSS_AUDIO_FFN_NORM, + LLM_TENSOR_MOSS_AUDIO_ATTN_SCALE, + LLM_TENSOR_MOSS_AUDIO_FFN_SCALE, + LLM_TENSOR_MOSS_AUDIO_QUANT_INPUT_PROJ, + LLM_TENSOR_MOSS_AUDIO_QUANT_OUTPUT_PROJ, + LLM_TENSOR_MOSS_AUDIO_QUANT_CODEBOOK, + LLM_TENSOR_MOSS_AUDIO_QUANT_IN_PROJ, + LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ, }; enum llm_tensor_layer { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 915380b26..9206aa83f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -832,6 +832,12 @@ float * llama_context::get_embeddings() { return embd.data; } +int32_t * llama_context::get_output_i32() { + output_reorder(); + + return out_i32.data; +} + llama_token * llama_context::get_sampled_tokens() const{ return sampling.sampled.data; } @@ -866,6 +872,26 @@ float * llama_context::get_embeddings_seq(llama_seq_id seq_id) { return it->second.data(); } +int32_t * llama_context::get_output_i32_ith(int32_t i) { + output_reorder(); + + try { + if (out_i32.data == nullptr) { + throw std::runtime_error("no raw int32 outputs"); + } + + const int64_t j = output_resolve_row(i); + return out_i32.data + j*out_i32_stride; + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: invalid raw i32 output id %d, reason: %s\n", __func__, i, err.what()); +#ifndef NDEBUG + GGML_ABORT("fatal error"); +#else + return nullptr; +#endif + } +} + llama_token llama_context::get_sampled_token_ith(int32_t idx) { output_reorder(); @@ -1235,6 +1261,43 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll return res; } +static uint32_t 
llama_encode_expected_outputs(const llama_model & model, uint32_t n_tokens) { + switch (model.arch) { + case LLM_ARCH_MOSS_TTS_AUDIO_ENCODER: + { + const auto key = llm_arch_name(model.arch) + std::string(".downsample_rate"); + const auto it = model.gguf_kv.find(key); + const uint32_t downsample_rate = it != model.gguf_kv.end() ? std::max(1, (uint32_t) std::stoul(it->second)) : 1u; + return std::max(1, n_tokens / downsample_rate); + } + case LLM_ARCH_MOSS_TTS_AUDIO_DECODER: + { + const auto key = llm_arch_name(model.arch) + std::string(".downsample_rate"); + const auto it = model.gguf_kv.find(key); + const uint32_t downsample_rate = it != model.gguf_kv.end() ? std::max(1, (uint32_t) std::stoul(it->second)) : 1u; + return n_tokens * downsample_rate; + } + default: + return n_tokens; + } +} + +static uint32_t llama_encode_actual_outputs(const llama_model & model, ggml_tensor * t_embd, ggml_tensor * t_out_i32, uint32_t fallback) { + switch (model.arch) { + case LLM_ARCH_MOSS_TTS_AUDIO_ENCODER: + case LLM_ARCH_MOSS_TTS_AUDIO_DECODER: + if (t_embd != nullptr) { + return (uint32_t) std::max(1, t_embd->ne[1]); + } + if (t_out_i32 != nullptr) { + return (uint32_t) std::max(1, t_out_i32->ne[1]); + } + return fallback; + default: + return fallback; + } +} + int llama_context::encode(const llama_batch & batch_inp) { GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT @@ -1274,17 +1337,19 @@ int llama_context::encode(const llama_batch & batch_inp) { n_queued_tokens += n_tokens; + const uint32_t n_outputs_expected = llama_encode_expected_outputs(model, n_tokens); + // reserve output buffer - if (output_reserve(n_tokens) < n_tokens) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); + if (output_reserve(n_outputs_expected) < n_outputs_expected) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs_expected); return -2; }; - for (uint32_t 
i = 0; i < n_tokens; ++i) { + for (uint32_t i = 0; i < n_outputs_expected; ++i) { output_ids[i] = i; } - n_outputs = n_tokens; + n_outputs = n_outputs_expected; const auto causal_attn_org = cparams.causal_attn; @@ -1309,6 +1374,13 @@ int llama_context::encode(const llama_batch & batch_inp) { auto * t_logits = res->get_logits(); auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd(); + auto * t_out_i32 = res->get_out_i32(); + + const uint32_t n_outputs_actual = llama_encode_actual_outputs(model, t_embd, t_out_i32, n_tokens); + n_outputs = n_outputs_actual; + for (uint32_t i = 0; i < n_outputs_actual; ++i) { + output_ids[i] = i; + } // extract logits if (logits.data && t_logits) { @@ -1316,7 +1388,7 @@ int llama_context::encode(const llama_batch & batch_inp) { GGML_ASSERT(backend_res != nullptr); GGML_ASSERT(logits.data != nullptr); - ggml_backend_tensor_get_async(backend_res, t_logits, logits.data, 0, n_tokens*n_logits*sizeof(float)); + ggml_backend_tensor_get_async(backend_res, t_logits, logits.data, 0, n_outputs_actual*n_logits*sizeof(float)); } // extract embeddings @@ -1331,8 +1403,8 @@ int llama_context::encode(const llama_batch & batch_inp) { GGML_ASSERT(embd.data != nullptr); const uint32_t n_embd_out = hparams.n_embd_out(); - GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd.size); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd.data, 0, n_tokens*n_embd_out*sizeof(float)); + GGML_ASSERT(n_outputs_actual*n_embd_out <= (int64_t) embd.size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd.data, 0, n_outputs_actual*n_embd_out*sizeof(float)); } break; case LLAMA_POOLING_TYPE_MEAN: case LLAMA_POOLING_TYPE_CLS: @@ -1371,6 +1443,14 @@ int llama_context::encode(const llama_batch & batch_inp) { } } + if (out_i32.data && t_out_i32) { + ggml_backend_t backend_out_i32 = ggml_backend_sched_get_tensor_backend(sched.get(), t_out_i32); + GGML_ASSERT(backend_out_i32 != nullptr); + GGML_ASSERT(out_i32.data != nullptr); + 
GGML_ASSERT(n_outputs_actual*out_i32_stride <= (int64_t) out_i32.size); + ggml_backend_tensor_get_async(backend_out_i32, t_out_i32, out_i32.data, 0, n_outputs_actual*out_i32_stride*sizeof(int32_t)); + } + // TODO: hacky solution if (model.arch == LLM_ARCH_T5 && t_embd) { //cross.t_embd = t_embd; @@ -1719,8 +1799,9 @@ int llama_context::decode(const llama_batch & batch_inp) { // ggml_graph_dump_dot(gf, NULL, "llama.dot"); //} - auto * t_logits = res->get_logits(); - auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr; + auto * t_logits = res->get_logits(); + auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr; + auto * t_out_i32 = res->get_out_i32(); if (t_embd && res->get_embd_pooled()) { t_embd = res->get_embd_pooled(); @@ -1798,6 +1879,16 @@ int llama_context::decode(const llama_batch & batch_inp) { } } + if (out_i32.data && t_out_i32 && n_outputs > 0) { + ggml_backend_t backend_out_i32 = ggml_backend_sched_get_tensor_backend(sched.get(), t_out_i32); + GGML_ASSERT(backend_out_i32 != nullptr); + + int32_t * out_i32_dst = out_i32.data + n_outputs_prev*out_i32_stride; + GGML_ASSERT(n_outputs_prev + n_outputs <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs)*out_i32_stride <= (int64_t) out_i32.size); + ggml_backend_tensor_get_async(backend_out_i32, t_out_i32, out_i32_dst, 0, n_outputs*out_i32_stride*sizeof(int32_t)); + } + // Copy backend sampling output if this ubatch produced any sampling tensors. 
if (has_samplers && (!res->t_sampled.empty() || !res->t_sampled_probs.empty() || !res->t_sampled_logits.empty())) { const auto seq_to_output_row = build_seq_to_output_row(ubatch, n_outputs_prev); @@ -1880,10 +1971,10 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { const int64_t n_outputs_max = std::max(n_outputs, n_seq_max()); - const auto n_batch = cparams.n_batch; const auto n_vocab = vocab.n_tokens(); const auto n_logits = model.n_logits(); const auto n_embd_out = hparams.n_embd_out(); + const auto n_out_i32 = hparams.n_out_i32(); bool has_logits = true; bool has_embd = cparams.embeddings; @@ -1901,6 +1992,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { logits_stride = has_logits ? n_logits : 0; logits.size = has_logits ? n_logits*n_outputs_max : 0; embd.size = has_embd ? n_embd_out*n_outputs_max : 0; + out_i32_stride = n_out_i32; + out_i32.size = n_out_i32 > 0 ? n_out_i32*n_outputs_max : 0; // Allocate backend sampling output buffers if there are backend samplers configured. const bool has_sampling = !sampling.samplers.empty(); @@ -1909,15 +2002,15 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { backend_token_count = (1 + n_vocab) * n_outputs_max; // sampled + candidates } - if (output_ids.empty()) { - // init, never resized afterwards - output_ids.resize(n_batch); + if (output_ids.size() < (size_t) n_outputs_max) { + output_ids.resize(n_outputs_max); } const size_t prev_size = buf_output ? 
ggml_backend_buffer_get_size(buf_output.get()) : 0; const size_t new_size = (logits.size + embd.size + backend_float_count) * sizeof(float) + - ( backend_token_count) * sizeof(llama_token); + (out_i32.size) * sizeof(int32_t) + + ( backend_token_count) * sizeof(llama_token); // alloc only when more than the current capacity is required // TODO: also consider shrinking the buffer @@ -1933,6 +2026,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { buf_output = nullptr; logits.data = nullptr; embd.data = nullptr; + out_i32.data = nullptr; } auto * buft = ggml_backend_cpu_buffer_type(); @@ -1960,6 +2054,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { embd = has_embd ? buffer_view{(float *) (base + offset), embd.size} : buffer_view{nullptr, 0}; offset += embd.size * sizeof(float); + out_i32 = out_i32.size > 0 ? buffer_view{(int32_t *) (base + offset), out_i32.size} : buffer_view{nullptr, 0}; + offset += out_i32.size * sizeof(int32_t); + if (has_sampling) { sampling.logits = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)}; offset += sampling.logits.size * sizeof(float); @@ -2007,7 +2104,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { void llama_context::output_reorder() { const uint64_t n_logits = logits_stride; const uint64_t n_vocab = model.vocab.n_tokens(); - const uint64_t n_embd = model.hparams.n_embd; + const uint64_t n_embd = model.hparams.n_embd_out(); + const uint64_t n_i32 = model.hparams.n_out_i32(); for (size_t s = 0; s < output_swaps.size(); ++s) { const uint64_t i0 = output_swaps[s].i0; @@ -2025,6 +2123,12 @@ void llama_context::output_reorder() { } } + if (out_i32.size > 0) { + for (uint64_t k = 0; k < n_i32; ++k) { + std::swap(out_i32.data[i0*n_i32 + k], out_i32.data[i1*n_i32 + k]); + } + } + if (!sampling.samplers.empty()) { assert(sampling.logits.size > 0); assert(sampling.probs.size > 0); @@ -3101,6 +3205,18 @@ float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) { return 
ctx->get_embeddings_seq(seq_id); } +int32_t * llama_get_output_i32(llama_context * ctx) { + ctx->synchronize(); + + return ctx->get_output_i32(); +} + +int32_t * llama_get_output_i32_ith(llama_context * ctx, int32_t i) { + ctx->synchronize(); + + return ctx->get_output_i32_ith(i); +} + bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) { return ctx->set_sampler(seq_id, smpl); } diff --git a/src/llama-context.h b/src/llama-context.h index 49c39f023..fe7e0cacd 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -78,6 +78,8 @@ struct llama_context { float * get_embeddings(); float * get_embeddings_ith(int32_t i); float * get_embeddings_seq(llama_seq_id seq_id); + int32_t * get_output_i32(); + int32_t * get_output_i32_ith(int32_t i); llama_token * get_sampled_tokens() const; llama_token get_sampled_token_ith(int32_t idx); @@ -277,6 +279,8 @@ struct llama_context { // embeddings output (2-dimensional array: [n_outputs][n_embd]) // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE buffer_view embd = {nullptr, 0}; + buffer_view out_i32 = {nullptr, 0}; + uint32_t out_i32_stride = 0; struct sampling_info { // !samplers.empty() to check if any samplers are active diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 77735daad..463dd821d 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -737,6 +737,7 @@ void llm_graph_result::reset() { t_logits = nullptr; t_embd = nullptr; t_embd_pooled = nullptr; + t_out_i32 = nullptr; t_sampled.clear(); t_sampled_probs.clear(); t_sampled_logits.clear(); @@ -775,6 +776,9 @@ void llm_graph_result::set_outputs() { if (t_embd_pooled != nullptr) { ggml_set_output(t_embd_pooled); } + if (t_out_i32 != nullptr) { + ggml_set_output(t_out_i32); + } for (auto & [seq_id, t] : t_sampled) { if (t != nullptr) { ggml_set_output(t); diff --git a/src/llama-graph.h b/src/llama-graph.h index a1362cc5a..d1feb8e48 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -638,6 +638,7 @@ 
class llm_graph_result { ggml_tensor * get_logits() const { return t_logits; } ggml_tensor * get_embd() const { return t_embd; } ggml_tensor * get_embd_pooled() const { return t_embd_pooled; } + ggml_tensor * get_out_i32() const { return t_out_i32; } ggml_cgraph * get_gf() const { return gf; } ggml_context * get_ctx() const { return ctx_compute.get(); } @@ -666,6 +667,7 @@ class llm_graph_result { ggml_tensor * t_logits = nullptr; ggml_tensor * t_embd = nullptr; ggml_tensor * t_embd_pooled = nullptr; + ggml_tensor * t_out_i32 = nullptr; std::map t_sampled_logits; std::map t_candidates; diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 002d15d41..edcc8de21 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -84,6 +84,10 @@ uint32_t llama_hparams::n_embd_out() const { return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd; } +uint32_t llama_hparams::n_out_i32() const { + return n_out_i32_impl; +} + uint32_t llama_hparams::n_embd_head_k(uint32_t il) const { if (il < n_layer) { return is_swa(il) ? 
n_embd_head_k_swa : n_embd_head_k_full; diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 0a1c76965..dd4a47986 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -187,6 +187,8 @@ struct llama_hparams { // output embedding dimension (0 = use n_embd) uint32_t n_embd_out_impl = 0; + // raw int32 output width (0 = disabled) + uint32_t n_out_i32_impl = 0; // llama4 smallthinker uint32_t n_moe_layer_step = 0; @@ -274,6 +276,7 @@ struct llama_hparams { // dimension of output embeddings uint32_t n_embd_out() const; + uint32_t n_out_i32() const; // dimension of key/value embeddings for each head (per layer) uint32_t n_embd_head_k(uint32_t il = 0) const; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f7b4bd12f..45bb9fb8d 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -367,14 +367,48 @@ void llama_model::load_hparams(llama_model_loader & ml) { return; } - ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); - ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); - ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false); - ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); - ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); - ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); - ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false); - ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false); + if (arch == LLM_ARCH_MOSS_TTS_AUDIO_ENCODER || arch == LLM_ARCH_MOSS_TTS_AUDIO_DECODER) { + const std::string arch_name = llm_arch_name(arch); + uint32_t downsample_rate = 0; + uint32_t sampling_rate = 0; + uint32_t block_count = 0; + uint32_t num_quantizers = 0; + uint32_t quantizer_input_dim = 0; + float context_duration = 0.0f; + + ml.get_key(arch_name + ".downsample_rate", downsample_rate); + ml.get_key(arch_name + ".sampling_rate", sampling_rate); + ml.get_key(arch_name + ".causal_transformer_context_duration", context_duration); + ml.get_key(arch_name + 
".quantizer.num_quantizers", num_quantizers); + ml.get_key(arch_name + ".quantizer.input_dim", quantizer_input_dim); + + if (arch == LLM_ARCH_MOSS_TTS_AUDIO_ENCODER) { + ml.get_key(arch_name + ".encoder.block_count", block_count); + hparams.n_ctx_train = std::max(1u, (uint32_t) std::lround((double) sampling_rate * context_duration)); + hparams.n_embd = 1; + hparams.n_embd_out_impl = quantizer_input_dim; + hparams.n_out_i32_impl = num_quantizers; + } else { + ml.get_key(arch_name + ".decoder.block_count", block_count); + hparams.n_ctx_train = std::max(1u, (uint32_t) std::lround((double) sampling_rate * context_duration / std::max(downsample_rate, 1u))); + hparams.n_embd = 1; + hparams.n_embd_out_impl = 1; + hparams.n_out_i32_impl = 0; + } + + hparams.n_layer = block_count; + hparams.n_vq = num_quantizers; + hparams.sampling_rate = sampling_rate; + } else { + ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); + ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); + ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false); + ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); + ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); + ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); + ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false); + ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false); + } if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd); @@ -1028,6 +1062,14 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_MOSS_TTS_AUDIO_ENCODER: + case LLM_ARCH_MOSS_TTS_AUDIO_DECODER: + { + hparams.pooling_type = LLAMA_POOLING_TYPE_NONE; + hparams.causal_attn = false; + hparams.f_norm_eps = 1e-5f; + type = LLM_TYPE_UNKNOWN; + } break; case LLM_ARCH_MAINCODER: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -3729,6 +3771,88 @@ bool
llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; + case LLM_ARCH_MOSS_TTS_AUDIO_ENCODER: + case LLM_ARCH_MOSS_TTS_AUDIO_DECODER: + { + const std::string arch_name = llm_arch_name(arch); + const char * section_name = arch == LLM_ARCH_MOSS_TTS_AUDIO_ENCODER ? "encoder" : "decoder"; + + auto get_u32 = [&](const std::string & key) -> int64_t { + uint32_t value = 0; + ml.get_key(key, value); + return value; + }; + + auto get_str = [&](const std::string & key) -> std::string { + std::string value; + ml.get_key(key, value); + return value; + }; + + const int64_t quant_input_dim = get_u32(arch_name + ".quantizer.input_dim"); + const int64_t quant_rvq_dim = get_u32(arch_name + ".quantizer.rvq_dim"); + const int64_t quant_output_dim = get_u32(arch_name + ".quantizer.output_dim"); + const int64_t quant_codebook_size = get_u32(arch_name + ".quantizer.codebook_size"); + const int64_t quant_codebook_dim = get_u32(arch_name + ".quantizer.codebook_dim"); + const int64_t num_quantizers = get_u32(arch_name + ".quantizer.num_quantizers"); + const int64_t block_count = get_u32(arch_name + "." + section_name + ".block_count"); + + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_QUANT_INPUT_PROJ, "weight"), {1, quant_input_dim, quant_rvq_dim}, + arch == LLM_ARCH_MOSS_TTS_AUDIO_ENCODER ? 0 : TENSOR_NOT_REQUIRED); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_QUANT_INPUT_PROJ, "bias"), {quant_rvq_dim}, + arch == LLM_ARCH_MOSS_TTS_AUDIO_ENCODER ? 0 : TENSOR_NOT_REQUIRED); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_QUANT_OUTPUT_PROJ, "weight"), {1, quant_rvq_dim, quant_output_dim}, + arch == LLM_ARCH_MOSS_TTS_AUDIO_DECODER ? 0 : TENSOR_NOT_REQUIRED); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_QUANT_OUTPUT_PROJ, "bias"), {quant_output_dim}, + arch == LLM_ARCH_MOSS_TTS_AUDIO_DECODER ? 
0 : TENSOR_NOT_REQUIRED); + + for (int64_t iq = 0; iq < num_quantizers; ++iq) { + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_QUANT_CODEBOOK, "weight", -1, (int) iq), {quant_codebook_dim, quant_codebook_size}, 0); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_QUANT_IN_PROJ, "weight", -1, (int) iq), {1, quant_rvq_dim, quant_codebook_dim}, + arch == LLM_ARCH_MOSS_TTS_AUDIO_ENCODER ? 0 : TENSOR_NOT_REQUIRED); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_QUANT_IN_PROJ, "bias", -1, (int) iq), {quant_codebook_dim}, + arch == LLM_ARCH_MOSS_TTS_AUDIO_ENCODER ? 0 : TENSOR_NOT_REQUIRED); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ, "weight", -1, (int) iq), {1, quant_codebook_dim, quant_rvq_dim}, 0); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ, "bias", -1, (int) iq), {quant_rvq_dim}, 0); + } + + int tensor_block = 0; + for (int64_t ib = 0; ib < block_count; ++ib) { + const std::string block_prefix = arch_name + "." + section_name + "." + std::to_string(ib); + const std::string module_type = get_str(block_prefix + ".module_type"); + + if (module_type == "PatchedPretransform") { + continue; + } + if (module_type != "Transformer") { + throw std::runtime_error("unsupported MOSS audio module type: " + module_type); + } + + const int64_t input_dimension = get_u32(block_prefix + ".input_dimension"); + const int64_t output_dimension = get_u32(block_prefix + ".output_dimension"); + const int64_t d_model = get_u32(block_prefix + ".d_model"); + const int64_t dim_feedforward = get_u32(block_prefix + ".dim_feedforward"); + const int64_t num_layers = get_u32(block_prefix + ".num_layers"); + + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_BLOCK_INPUT_PROJ, "weight", tensor_block), {input_dimension, d_model}, TENSOR_NOT_REQUIRED); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_BLOCK_OUTPUT_PROJ, "weight", tensor_block), {d_model, output_dimension}, TENSOR_NOT_REQUIRED); + + for (int64_t il = 0; il < num_layers; ++il) { + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_ATTN_QKV, "weight", tensor_block, 
(int) il), {d_model, d_model * 3}, 0); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_ATTN_OUT, "weight", tensor_block, (int) il), {d_model, d_model}, 0); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_FFN_UP, "weight", tensor_block, (int) il), {d_model, dim_feedforward}, 0); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_FFN_DOWN, "weight", tensor_block, (int) il), {dim_feedforward, d_model}, 0); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_ATTN_NORM, "weight", tensor_block, (int) il), {d_model}, 0); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_ATTN_NORM, "bias", tensor_block, (int) il), {d_model}, 0); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_FFN_NORM, "weight", tensor_block, (int) il), {d_model}, 0); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_FFN_NORM, "bias", tensor_block, (int) il), {d_model}, 0); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_ATTN_SCALE, "scale", tensor_block, (int) il), {d_model}, TENSOR_NOT_REQUIRED); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_FFN_SCALE, "scale", tensor_block, (int) il), {d_model}, TENSOR_NOT_REQUIRED); + } + + tensor_block++; + } + } break; case LLM_ARCH_QWEN3MOE: case LLM_ARCH_QWEN3VLMOE: case LLM_ARCH_RND1: @@ -8110,6 +8234,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, case LLM_ARCH_NEO_BERT: case LLM_ARCH_EUROBERT: case LLM_ARCH_WAVTOKENIZER_DEC: + case LLM_ARCH_MOSS_TTS_AUDIO_ENCODER: + case LLM_ARCH_MOSS_TTS_AUDIO_DECODER: case LLM_ARCH_MODERN_BERT: case LLM_ARCH_GEMMA_EMBEDDING: case LLM_ARCH_DREAM: @@ -8366,6 +8492,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_MOSS_TTS_AUDIO_ENCODER: + { + llm = std::make_unique(*this, params); + } break; + case LLM_ARCH_MOSS_TTS_AUDIO_DECODER: + { + llm = std::make_unique(*this, params); + } break; case LLM_ARCH_QWEN3MOE: { llm = std::make_unique(*this, params); @@ -8822,6 +8956,10 @@ int32_t llama_model_n_embd_out(const llama_model * model) { return 
model->hparams.n_embd_out(); } +int32_t llama_model_n_out_i32(const llama_model * model) { + return model->hparams.n_out_i32(); +} + int32_t llama_model_n_layer(const llama_model * model) { return model->hparams.n_layer; } @@ -8891,6 +9029,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_RWKV7: case LLM_ARCH_ARWKV7: case LLM_ARCH_WAVTOKENIZER_DEC: + case LLM_ARCH_MOSS_TTS_AUDIO_ENCODER: + case LLM_ARCH_MOSS_TTS_AUDIO_DECODER: case LLM_ARCH_NEMOTRON_H: case LLM_ARCH_NEMOTRON_H_MOE: case LLM_ARCH_KIMI_LINEAR: @@ -9112,16 +9252,22 @@ uint64_t llama_model_n_params(const llama_model * model) { bool llama_model_has_encoder(const llama_model * model) { switch (model->arch) { - case LLM_ARCH_T5: return true; - case LLM_ARCH_T5ENCODER: return true; - default: return false; + case LLM_ARCH_T5: + case LLM_ARCH_T5ENCODER: + case LLM_ARCH_MOSS_TTS_AUDIO_ENCODER: + return true; + default: + return false; } } bool llama_model_has_decoder(const llama_model * model) { switch (model->arch) { - case LLM_ARCH_T5ENCODER: return false; - default: return true; + case LLM_ARCH_T5ENCODER: + case LLM_ARCH_MOSS_TTS_AUDIO_ENCODER: + return false; + default: + return true; } } diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 68ba292d4..1ed74e0b3 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1723,7 +1723,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { // determine vocab type { - ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model); + if (!ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model, false)) { + if (kv.arch == LLM_ARCH_MOSS_TTS_AUDIO_ENCODER || kv.arch == LLM_ARCH_MOSS_TTS_AUDIO_DECODER) { + tokenizer_model = "none"; + } else { + ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model); + } + } ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false); ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, n_token_types, false); @@ -1745,6 +1751,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, 
const LLM_KV & kv) { if (ml.get_key(LLM_KV_VOCAB_SIZE, n_tokens, false)) { LLAMA_LOG_WARN("%s: adding %u dummy tokens\n", __func__, n_tokens); id_to_token.resize(n_tokens); + } else if (kv.arch == LLM_ARCH_MOSS_TTS_AUDIO_DECODER) { + LLAMA_LOG_WARN("%s: missing vocab size for %s, adding a single dummy token for auxiliary audio batches\n", + __func__, llm_arch_name(kv.arch)); + id_to_token.resize(1); } return; diff --git a/src/models/models.h b/src/models/models.h index 3f21b0102..baa1a5e70 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -699,6 +699,14 @@ struct llm_build_t5_enc : public llm_graph_context { llm_build_t5_enc(const llama_model & model, const llm_graph_params & params); }; +struct llm_build_moss_tts_audio_encoder : public llm_graph_context { + llm_build_moss_tts_audio_encoder(const llama_model & model, const llm_graph_params & params); +}; + +struct llm_build_moss_tts_audio_decoder : public llm_graph_context { + llm_build_moss_tts_audio_decoder(const llama_model & model, const llm_graph_params & params); +}; + struct llm_build_wavtokenizer_dec : public llm_graph_context { llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params); }; diff --git a/src/models/moss-audio-common.cpp b/src/models/moss-audio-common.cpp new file mode 100644 index 000000000..e61e5e667 --- /dev/null +++ b/src/models/moss-audio-common.cpp @@ -0,0 +1,508 @@ +#include "moss-audio-common.h" + +#include +#include +#include +#include + +namespace moss_audio { + +std::string unquote(std::string value) { + if (value.size() >= 2 && value.front() == '\'' && value.back() == '\'') { + return value.substr(1, value.size() - 2); + } + if (value.size() >= 2 && value.front() == '"' && value.back() == '"') { + return value.substr(1, value.size() - 2); + } + return value; +} + +std::string meta_str(const llama_model & model, const std::string & key) { + const auto it = model.gguf_kv.find(key); + if (it == model.gguf_kv.end()) { + throw 
std::runtime_error("missing GGUF key: " + key); + } + return unquote(it->second); +} + +uint32_t meta_u32(const llama_model & model, const std::string & key) { + return (uint32_t) std::stoul(meta_str(model, key)); +} + +float meta_f32(const llama_model & model, const std::string & key, float def, bool required) { + const auto it = model.gguf_kv.find(key); + if (it == model.gguf_kv.end()) { + if (!required) { + return def; + } + throw std::runtime_error("missing GGUF key: " + key); + } + return std::stof(unquote(it->second)); +} + +ggml_tensor * require_tensor(const llama_model & model, const std::string & name) { + auto * tensor = const_cast(model.get_tensor(name.c_str())); + if (tensor == nullptr) { + throw std::runtime_error("missing tensor: " + name); + } + return tensor; +} + +ggml_tensor * optional_tensor(const llama_model & model, const std::string & name) { + return const_cast(model.get_tensor(name.c_str())); +} + +ggml_tensor * as_matrix(ggml_context * ctx0, ggml_tensor * tensor) { + if (tensor == nullptr) { + return nullptr; + } + + const int n_dims = ggml_n_dims(tensor); + if (n_dims == 2) { + return tensor; + } + if (n_dims == 3 && tensor->ne[0] == 1) { + return ggml_reshape_2d(ctx0, tensor, tensor->ne[1], tensor->ne[2]); + } + if (n_dims == 4 && tensor->ne[0] == 1 && tensor->ne[1] == 1) { + return ggml_reshape_2d(ctx0, tensor, tensor->ne[2], tensor->ne[3]); + } + + throw std::runtime_error("unsupported tensor rank for linear projection: " + std::string(ggml_get_name(tensor))); +} + +ggml_tensor * as_f32_matrix(ggml_context * ctx0, ggml_tensor * tensor) { + return tensor != nullptr ? ggml_cast(ctx0, as_matrix(ctx0, tensor), GGML_TYPE_F32) : nullptr; +} + +ggml_tensor * as_f32_vector(ggml_context * ctx0, ggml_tensor * tensor) { + return tensor != nullptr ? 
ggml_cast(ctx0, tensor, GGML_TYPE_F32) : nullptr; +} + +ggml_tensor * linear_f32( + ggml_context * ctx0, + ggml_tensor * input, + ggml_tensor * weight, + ggml_tensor * bias) { + ggml_tensor * cur = ggml_cast(ctx0, input, GGML_TYPE_F32); + + if (weight != nullptr) { + cur = ggml_mul_mat(ctx0, as_f32_matrix(ctx0, weight), cur); + } + if (bias != nullptr) { + cur = ggml_add(ctx0, cur, as_f32_vector(ctx0, bias)); + } + + return cur; +} + +graph_input_embd::graph_input_embd(int64_t n_embd) : n_embd(n_embd) {} + +void graph_input_embd::set_input(const llama_ubatch * ubatch) { + GGML_ASSERT(ubatch->embd != nullptr); + GGML_ASSERT(embd != nullptr); + ggml_backend_tensor_set(embd, ubatch->embd, 0, (size_t) ubatch->n_tokens * (size_t) n_embd * sizeof(float)); +} + +bool graph_input_embd::can_reuse(const llm_graph_params & params) { + return params.ubatch.embd != nullptr && embd != nullptr && embd->ne[0] == n_embd && embd->ne[1] == params.ubatch.n_tokens; +} + +graph_input_channel::graph_input_channel(uint32_t channel, uint32_t n_channels) : channel(channel), n_channels(n_channels) {} + +void graph_input_channel::set_input(const llama_ubatch * ubatch) { + GGML_ASSERT(tokens != nullptr); + data.resize(ubatch->n_tokens, 0); + + if (ubatch->token_audio != nullptr) { + GGML_ASSERT(ubatch->n_token_audio == n_channels); + for (uint32_t i = 0; i < ubatch->n_tokens; ++i) { + data[i] = ubatch->token_audio[(size_t) i * n_channels + channel]; + } + } + + ggml_backend_tensor_set(tokens, data.data(), 0, data.size() * sizeof(int32_t)); +} + +bool graph_input_channel::can_reuse(const llm_graph_params & params) { + return tokens != nullptr && + tokens->ne[0] == params.ubatch.n_tokens && + ((params.ubatch.token_audio == nullptr && params.ubatch.n_token_audio == 0) || + (params.ubatch.token_audio != nullptr && params.ubatch.n_token_audio == n_channels)); +} + +graph_input_i32::graph_input_i32(std::vector data) : data(std::move(data)) {} + +void graph_input_i32::set_input(const llama_ubatch *) 
{ + GGML_ASSERT(tensor != nullptr); + ggml_backend_tensor_set(tensor, data.data(), 0, data.size() * sizeof(int32_t)); +} + +bool graph_input_i32::can_reuse(const llm_graph_params &) { + return tensor != nullptr && tensor->ne[0] == (int64_t) data.size(); +} + +graph_input_f32::graph_input_f32(std::vector data) : data(std::move(data)) {} + +void graph_input_f32::set_input(const llama_ubatch *) { + GGML_ASSERT(tensor != nullptr); + ggml_backend_tensor_set(tensor, data.data(), 0, data.size() * sizeof(float)); +} + +bool graph_input_f32::can_reuse(const llm_graph_params &) { + return tensor != nullptr && ggml_nelements(tensor) == (int64_t) data.size(); +} + +std::vector make_positions(size_t n_tokens) { + std::vector positions(n_tokens); + for (size_t i = 0; i < n_tokens; ++i) { + positions[i] = (int32_t) i; + } + return positions; +} + +int64_t align_up(int64_t value, int64_t multiple) { + if (multiple <= 1) { + return value; + } + return ((value + multiple - 1) / multiple) * multiple; +} + +std::vector make_causal_mask(size_t n_tokens, int context) { + std::vector mask(n_tokens * n_tokens, -std::numeric_limits::infinity()); + + for (size_t iq = 0; iq < n_tokens; ++iq) { + for (size_t ik = 0; ik < n_tokens; ++ik) { + if (ik > iq) { + continue; + } + if (context > 0 && (int) (iq - ik) >= context) { + continue; + } + mask[iq * n_tokens + ik] = 0.0f; + } + } + + return mask; +} + +ggml_tensor * build_layer_norm( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * weight, + ggml_tensor * bias) { + cur = ggml_norm(ctx0, cur, LAYER_NORM_EPS); + cur = ggml_mul(ctx0, cur, weight); + cur = ggml_add(ctx0, cur, bias); + return cur; +} + +ggml_tensor * build_attention( + ggml_context * ctx0, + ggml_tensor * wo, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale) { + ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); + ggml_tensor * v = 
ggml_permute(ctx0, v_cur, 1, 2, 0, 3); + v = ggml_cont(ctx0, v); + + ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f); + + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + ggml_tensor * cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cur = ggml_cont_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2] * cur->ne[3]); + + if (wo != nullptr) { + cur = ggml_mul_mat(ctx0, as_matrix(ctx0, wo), cur); + } + + return cur; +} + +ggml_tensor * patch_encode( + ggml_context * ctx0, + ggml_tensor * cur, + int channels, + int64_t n_frames, + int patch_size) { + GGML_ASSERT(patch_size > 0); + GGML_ASSERT(n_frames % patch_size == 0); + GGML_ASSERT(cur->ne[0] == channels); + GGML_ASSERT(cur->ne[1] == n_frames); + + cur = ggml_reshape_3d(ctx0, cur, channels, patch_size, n_frames / patch_size); + cur = ggml_permute(ctx0, cur, 1, 0, 2, 3); + cur = ggml_cont(ctx0, cur); + cur = ggml_reshape_2d(ctx0, cur, channels * patch_size, n_frames / patch_size); + return cur; +} + +ggml_tensor * patch_decode( + ggml_context * ctx0, + ggml_tensor * cur, + int channels, + int64_t n_frames, + int patch_size) { + GGML_ASSERT(patch_size > 0); + GGML_ASSERT(channels % patch_size == 0); + GGML_ASSERT(cur->ne[0] == channels); + GGML_ASSERT(cur->ne[1] == n_frames); + + const int out_channels = channels / patch_size; + cur = ggml_reshape_3d(ctx0, cur, patch_size, out_channels, n_frames); + cur = ggml_permute(ctx0, cur, 1, 0, 2, 3); + cur = ggml_cont(ctx0, cur); + cur = ggml_reshape_2d(ctx0, cur, out_channels, n_frames * patch_size); + return cur; +} + +quantizer_meta load_quantizer_meta(const llama_model & model, const std::string & arch_name) { + quantizer_meta meta; + meta.input_dim = (int) meta_u32(model, arch_name + ".quantizer.input_dim"); + meta.rvq_dim = (int) meta_u32(model, arch_name + ".quantizer.rvq_dim"); + meta.output_dim = (int) meta_u32(model, arch_name + ".quantizer.output_dim"); + meta.num_quantizers = (int) meta_u32(model, arch_name + 
".quantizer.num_quantizers"); + meta.codebook_size = (int) meta_u32(model, arch_name + ".quantizer.codebook_size"); + meta.codebook_dim = (int) meta_u32(model, arch_name + ".quantizer.codebook_dim"); + return meta; +} + +std::vector load_modules( + const llama_model & model, + ggml_context * ctx0, + const std::string & arch_name, + const std::string & section_name) { + const auto tn = LLM_TN(model.arch); + const uint32_t block_count = meta_u32(model, arch_name + "." + section_name + ".block_count"); + std::vector modules(block_count); + int tensor_block = 0; + + for (uint32_t ib = 0; ib < block_count; ++ib) { + const std::string block_prefix = arch_name + "." + section_name + "." + std::to_string(ib); + auto & block = modules[ib]; + const std::string current_type = meta_str(model, block_prefix + ".module_type"); + + if (current_type == "PatchedPretransform") { + block.type = module_type::PATCHED_PRETRANSFORM; + block.patch_size = (int) meta_u32(model, block_prefix + ".patch_size"); + continue; + } + + if (current_type != "Transformer") { + throw std::runtime_error("unsupported MOSS audio module type: " + current_type); + } + + block.type = module_type::TRANSFORMER; + + auto & tr = block.transformer; + tr.input_dimension = (int) meta_u32(model, block_prefix + ".input_dimension"); + tr.output_dimension = (int) meta_u32(model, block_prefix + ".output_dimension"); + tr.d_model = (int) meta_u32(model, block_prefix + ".d_model"); + tr.num_heads = (int) meta_u32(model, block_prefix + ".num_heads"); + tr.num_layers = (int) meta_u32(model, block_prefix + ".num_layers"); + tr.context = (int) meta_u32(model, block_prefix + ".context"); + tr.max_period = meta_f32(model, block_prefix + ".max_period", 10000.0f, false); + + tr.input_proj = as_matrix(ctx0, optional_tensor(model, + tn(LLM_TENSOR_MOSS_AUDIO_BLOCK_INPUT_PROJ, "weight", tensor_block).str())); + tr.output_proj = as_matrix(ctx0, optional_tensor(model, + tn(LLM_TENSOR_MOSS_AUDIO_BLOCK_OUTPUT_PROJ, "weight", 
tensor_block).str())); + + tr.layers.resize(tr.num_layers); + for (int il = 0; il < tr.num_layers; ++il) { + auto & layer = tr.layers[il]; + layer.attn_in = as_matrix(ctx0, require_tensor(model, + tn(LLM_TENSOR_MOSS_AUDIO_ATTN_QKV, "weight", tensor_block, il).str())); + layer.attn_out = as_matrix(ctx0, require_tensor(model, + tn(LLM_TENSOR_MOSS_AUDIO_ATTN_OUT, "weight", tensor_block, il).str())); + layer.linear1 = as_matrix(ctx0, require_tensor(model, + tn(LLM_TENSOR_MOSS_AUDIO_FFN_UP, "weight", tensor_block, il).str())); + layer.linear2 = as_matrix(ctx0, require_tensor(model, + tn(LLM_TENSOR_MOSS_AUDIO_FFN_DOWN, "weight", tensor_block, il).str())); + layer.norm1_w = require_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_ATTN_NORM, "weight", tensor_block, il).str()); + layer.norm1_b = require_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_ATTN_NORM, "bias", tensor_block, il).str()); + layer.norm2_w = require_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_FFN_NORM, "weight", tensor_block, il).str()); + layer.norm2_b = require_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_FFN_NORM, "bias", tensor_block, il).str()); + layer.scale1 = optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_ATTN_SCALE, "scale", tensor_block, il).str()); + layer.scale2 = optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_FFN_SCALE, "scale", tensor_block, il).str()); + } + + tensor_block++; + } + + return modules; +} + +ggml_tensor * build_transformer_block( + llm_graph_context & llm, + ggml_tensor * cur, + const transformer_block & block, + int64_t n_frames, + int module_index) { + GGML_ASSERT(cur->ne[1] == n_frames); + + auto inp_pos = std::make_unique(make_positions((size_t) n_frames)); + inp_pos->tensor = ggml_new_tensor_1d(llm.ctx0, GGML_TYPE_I32, n_frames); + llm.cb(inp_pos->tensor, "moss_audio_pos", module_index); + ggml_set_input(inp_pos->tensor); + ggml_tensor * positions = inp_pos->tensor; + llm.res->add_input(std::move(inp_pos)); + + auto inp_mask = std::make_unique(make_causal_mask((size_t) n_frames, block.context)); + 
inp_mask->tensor = ggml_new_tensor_4d(llm.ctx0, GGML_TYPE_F32, n_frames, n_frames, 1, 1); + llm.cb(inp_mask->tensor, "moss_audio_mask", module_index); + ggml_set_input(inp_mask->tensor); + ggml_tensor * mask = inp_mask->tensor; + llm.res->add_input(std::move(inp_mask)); + + if (block.input_proj != nullptr) { + cur = ggml_mul_mat(llm.ctx0, block.input_proj, cur); + } + + const int d_head = block.d_model / block.num_heads; + const float attn_scale = 1.0f / std::sqrt((float) d_head); + + for (int il = 0; il < block.num_layers; ++il) { + const auto & layer = block.layers[il]; + + ggml_tensor * inp_sa = cur; + ggml_tensor * x = build_layer_norm(llm.ctx0, cur, layer.norm1_w, layer.norm1_b); + ggml_tensor * qkv = ggml_mul_mat(llm.ctx0, layer.attn_in, x); + + ggml_tensor * q = ggml_view_3d(llm.ctx0, qkv, d_head, block.num_heads, n_frames, + ggml_row_size(qkv->type, d_head), qkv->nb[1], 0); + ggml_tensor * k = ggml_view_3d(llm.ctx0, qkv, d_head, block.num_heads, n_frames, + ggml_row_size(qkv->type, d_head), qkv->nb[1], ggml_row_size(qkv->type, block.d_model)); + ggml_tensor * v = ggml_view_3d(llm.ctx0, qkv, d_head, block.num_heads, n_frames, + ggml_row_size(qkv->type, d_head), qkv->nb[1], ggml_row_size(qkv->type, 2 * block.d_model)); + + q = ggml_rope_ext(llm.ctx0, q, positions, nullptr, d_head, 0, 0, + block.max_period, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); + k = ggml_rope_ext(llm.ctx0, k, positions, nullptr, d_head, 0, 0, + block.max_period, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); + + ggml_tensor * attn = build_attention(llm.ctx0, layer.attn_out, q, k, v, mask, attn_scale); + if (layer.scale1 != nullptr) { + attn = ggml_mul(llm.ctx0, attn, layer.scale1); + } + cur = ggml_add(llm.ctx0, inp_sa, attn); + + ggml_tensor * inp_ff = cur; + x = build_layer_norm(llm.ctx0, cur, layer.norm2_w, layer.norm2_b); + x = ggml_mul_mat(llm.ctx0, layer.linear1, x); + x = ggml_gelu(llm.ctx0, x); + x = ggml_mul_mat(llm.ctx0, layer.linear2, x); + if (layer.scale2 != nullptr) { + x = ggml_mul(llm.ctx0, x, 
layer.scale2); + } + cur = ggml_add(llm.ctx0, inp_ff, x); + } + + if (block.output_proj != nullptr) { + cur = ggml_mul_mat(llm.ctx0, block.output_proj, cur); + } + + return cur; +} + +ggml_tensor * build_decoder_quantizer( + llm_graph_context & llm, + const llama_model & model, + ggml_context * ctx0, + const quantizer_meta & quantizer, + int64_t n_frames) { + const auto tn = LLM_TN(model.arch); + + ggml_tensor * cur = nullptr; + for (int iq = 0; iq < quantizer.num_quantizers; ++iq) { + auto inp_code = std::make_unique((uint32_t) iq, (uint32_t) quantizer.num_quantizers); + inp_code->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_frames); + llm.cb(inp_code->tokens, "moss_audio_code", iq); + ggml_set_input(inp_code->tokens); + + ggml_tensor * codebook = ggml_cast(ctx0, + require_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_CODEBOOK, "weight", -1, iq).str()), + GGML_TYPE_F32); + ggml_tensor * emb = ggml_get_rows(ctx0, codebook, inp_code->tokens); + emb = linear_f32( + ctx0, + emb, + optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ, "weight", -1, iq).str()), + optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ, "bias", -1, iq).str())); + + cur = cur != nullptr ? 
ggml_add(ctx0, cur, emb) : emb; + llm.res->add_input(std::move(inp_code)); + } + + cur = linear_f32( + ctx0, + cur, + optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_OUTPUT_PROJ, "weight").str()), + optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_OUTPUT_PROJ, "bias").str())); + + GGML_ASSERT(cur != nullptr); + return cur; +} + +ggml_tensor * build_l2_normalize(ggml_context * ctx0, ggml_tensor * cur) { + constexpr float L2_NORM_EPS = 3.4526698e-4f; + + ggml_tensor * norm = ggml_sum_rows(ctx0, ggml_sqr(ctx0, cur)); + norm = ggml_sqrt(ctx0, norm); + norm = ggml_clamp(ctx0, norm, L2_NORM_EPS, INFINITY); + return ggml_div(ctx0, cur, ggml_repeat(ctx0, norm, cur)); +} + +ggml_tensor * build_encoder_quantizer_codes( + const llama_model & model, + ggml_context * ctx0, + const quantizer_meta & quantizer, + ggml_tensor * cur) { + const auto tn = LLM_TN(model.arch); + + cur = linear_f32( + ctx0, + cur, + optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_INPUT_PROJ, "weight").str()), + optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_INPUT_PROJ, "bias").str())); + + ggml_tensor * codes = nullptr; + ggml_tensor * residual = cur; + + for (int iq = 0; iq < quantizer.num_quantizers; ++iq) { + ggml_tensor * latent = linear_f32( + ctx0, + residual, + optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_IN_PROJ, "weight", -1, iq).str()), + optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_IN_PROJ, "bias", -1, iq).str())); + + ggml_tensor * latent_unit = build_l2_normalize(ctx0, latent); + ggml_tensor * codebook = ggml_cast(ctx0, + require_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_CODEBOOK, "weight", -1, iq).str()), + GGML_TYPE_F32); + ggml_tensor * codebook_unit = build_l2_normalize(ctx0, codebook); + ggml_tensor * scores = ggml_mul_mat(ctx0, codebook_unit, latent_unit); + ggml_tensor * code_i = ggml_argmax(ctx0, scores); + + ggml_tensor * code_i_row = ggml_reshape_2d(ctx0, code_i, 1, code_i->ne[0]); + codes = codes != nullptr ? 
ggml_concat(ctx0, codes, code_i_row, 0) : code_i_row; + + ggml_tensor * decoded = ggml_get_rows(ctx0, codebook, code_i); + decoded = linear_f32( + ctx0, + decoded, + optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ, "weight", -1, iq).str()), + optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ, "bias", -1, iq).str())); + residual = ggml_sub(ctx0, residual, decoded); + } + + GGML_ASSERT(codes != nullptr); + return codes; +} + +} // namespace moss_audio diff --git a/src/models/moss-audio-common.h b/src/models/moss-audio-common.h new file mode 100644 index 000000000..05acc6093 --- /dev/null +++ b/src/models/moss-audio-common.h @@ -0,0 +1,171 @@ +#pragma once + +#include "models.h" + +#include +#include +#include + +namespace moss_audio { + +constexpr float LAYER_NORM_EPS = 1e-5f; + +enum class module_type { + PATCHED_PRETRANSFORM, + TRANSFORMER, +}; + +struct transformer_layer { + ggml_tensor * attn_in = nullptr; + ggml_tensor * attn_out = nullptr; + ggml_tensor * linear1 = nullptr; + ggml_tensor * linear2 = nullptr; + ggml_tensor * norm1_w = nullptr; + ggml_tensor * norm1_b = nullptr; + ggml_tensor * norm2_w = nullptr; + ggml_tensor * norm2_b = nullptr; + ggml_tensor * scale1 = nullptr; + ggml_tensor * scale2 = nullptr; +}; + +struct transformer_block { + int input_dimension = 0; + int output_dimension = 0; + int d_model = 0; + int num_heads = 0; + int num_layers = 0; + int context = 0; + float max_period = 10000.0f; + + ggml_tensor * input_proj = nullptr; + ggml_tensor * output_proj = nullptr; + + std::vector layers; +}; + +struct module { + module_type type = module_type::PATCHED_PRETRANSFORM; + int patch_size = 1; + transformer_block transformer; +}; + +struct quantizer_meta { + int input_dim = 0; + int rvq_dim = 0; + int output_dim = 0; + int num_quantizers = 0; + int codebook_size = 0; + int codebook_dim = 0; +}; + +std::string unquote(std::string value); +std::string meta_str(const llama_model & model, const std::string & key); 
+uint32_t meta_u32(const llama_model & model, const std::string & key); +float meta_f32(const llama_model & model, const std::string & key, float def = 0.0f, bool required = true); + +ggml_tensor * require_tensor(const llama_model & model, const std::string & name); +ggml_tensor * optional_tensor(const llama_model & model, const std::string & name); +ggml_tensor * as_matrix(ggml_context * ctx0, ggml_tensor * tensor); +ggml_tensor * as_f32_matrix(ggml_context * ctx0, ggml_tensor * tensor); +ggml_tensor * as_f32_vector(ggml_context * ctx0, ggml_tensor * tensor); +ggml_tensor * linear_f32(ggml_context * ctx0, ggml_tensor * input, ggml_tensor * weight, ggml_tensor * bias); + +class graph_input_embd : public llm_graph_input_i { +public: + explicit graph_input_embd(int64_t n_embd); + + void set_input(const llama_ubatch * ubatch) override; + bool can_reuse(const llm_graph_params & params) override; + + ggml_tensor * embd = nullptr; + +private: + int64_t n_embd; +}; + +class graph_input_channel : public llm_graph_input_i { +public: + graph_input_channel(uint32_t channel, uint32_t n_channels); + + void set_input(const llama_ubatch * ubatch) override; + bool can_reuse(const llm_graph_params & params) override; + + ggml_tensor * tokens = nullptr; + +private: + uint32_t channel; + uint32_t n_channels; + std::vector data; +}; + +class graph_input_i32 : public llm_graph_input_i { +public: + explicit graph_input_i32(std::vector data); + + void set_input(const llama_ubatch * ubatch) override; + bool can_reuse(const llm_graph_params & params) override; + + ggml_tensor * tensor = nullptr; + +private: + std::vector data; +}; + +class graph_input_f32 : public llm_graph_input_i { +public: + explicit graph_input_f32(std::vector data); + + void set_input(const llama_ubatch * ubatch) override; + bool can_reuse(const llm_graph_params & params) override; + + ggml_tensor * tensor = nullptr; + +private: + std::vector data; +}; + +std::vector make_positions(size_t n_tokens); +int64_t 
align_up(int64_t value, int64_t multiple); +std::vector make_causal_mask(size_t n_tokens, int context); + +ggml_tensor * build_layer_norm(ggml_context * ctx0, ggml_tensor * cur, ggml_tensor * weight, ggml_tensor * bias); +ggml_tensor * build_attention( + ggml_context * ctx0, + ggml_tensor * wo, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale); +ggml_tensor * patch_encode(ggml_context * ctx0, ggml_tensor * cur, int channels, int64_t n_frames, int patch_size); +ggml_tensor * patch_decode(ggml_context * ctx0, ggml_tensor * cur, int channels, int64_t n_frames, int patch_size); + +quantizer_meta load_quantizer_meta(const llama_model & model, const std::string & arch_name); +std::vector load_modules( + const llama_model & model, + ggml_context * ctx0, + const std::string & arch_name, + const std::string & section_name); + +ggml_tensor * build_transformer_block( + llm_graph_context & llm, + ggml_tensor * cur, + const transformer_block & block, + int64_t n_frames, + int module_index); + +ggml_tensor * build_decoder_quantizer( + llm_graph_context & llm, + const llama_model & model, + ggml_context * ctx0, + const quantizer_meta & quantizer, + int64_t n_frames); + +ggml_tensor * build_l2_normalize(ggml_context * ctx0, ggml_tensor * cur); +ggml_tensor * build_encoder_quantizer_codes( + const llama_model & model, + ggml_context * ctx0, + const quantizer_meta & quantizer, + ggml_tensor * cur); + +} // namespace moss_audio diff --git a/src/models/moss-audio-decoder.cpp b/src/models/moss-audio-decoder.cpp new file mode 100644 index 000000000..0b1854ecb --- /dev/null +++ b/src/models/moss-audio-decoder.cpp @@ -0,0 +1,37 @@ +#include "moss-audio-common.h" + +using namespace moss_audio; + +llm_build_moss_tts_audio_decoder::llm_build_moss_tts_audio_decoder(const llama_model & model, const llm_graph_params & params) + : llm_graph_context(params) { + const std::string arch_name = llm_arch_name(model.arch); + const auto 
quantizer = load_quantizer_meta(model, arch_name); + const std::vector modules = load_modules(model, ctx0, arch_name, "decoder"); + + int64_t frames = ubatch.n_tokens; + ggml_tensor * cur = build_decoder_quantizer(*this, model, ctx0, quantizer, frames); + int channels = quantizer.output_dim; + int tensor_block = 0; + + for (size_t i = 0; i < modules.size(); ++i) { + const auto & current = modules[i]; + switch (current.type) { + case module_type::TRANSFORMER: + cur = build_transformer_block(*this, cur, current.transformer, frames, tensor_block); + channels = current.transformer.output_dimension; + tensor_block++; + break; + case module_type::PATCHED_PRETRANSFORM: + cur = patch_decode(ctx0, cur, channels, frames, current.patch_size); + channels /= current.patch_size; + frames *= current.patch_size; + break; + } + } + + GGML_ASSERT(channels == 1); + + cb(cur, "result_embd", -1); + res->t_embd = cur; + ggml_build_forward_expand(gf, cur); +} diff --git a/src/models/moss-audio-encoder.cpp b/src/models/moss-audio-encoder.cpp new file mode 100644 index 000000000..fc73b74f8 --- /dev/null +++ b/src/models/moss-audio-encoder.cpp @@ -0,0 +1,49 @@ +#include "moss-audio-common.h" + +using namespace moss_audio; + +llm_build_moss_tts_audio_encoder::llm_build_moss_tts_audio_encoder(const llama_model & model, const llm_graph_params & params) + : llm_graph_context(params) { + const std::string arch_name = llm_arch_name(model.arch); + const auto quantizer = load_quantizer_meta(model, arch_name); + const std::vector modules = load_modules(model, ctx0, arch_name, "encoder"); + const int64_t downsample_rate = (int64_t) meta_u32(model, arch_name + ".downsample_rate"); + const int64_t reserve_frames = ubatch.embd != nullptr ? 
ubatch.n_tokens : align_up(ubatch.n_tokens, downsample_rate); + + auto inp = std::make_unique(1); + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, reserve_frames); + cb(inp->embd, "moss_audio_waveform", -1); + ggml_set_input(inp->embd); + ggml_tensor * cur = inp->embd; + res->add_input(std::move(inp)); + + int channels = 1; + int64_t frames = reserve_frames; + int tensor_block = 0; + + for (size_t i = 0; i < modules.size(); ++i) { + const auto & current = modules[i]; + switch (current.type) { + case module_type::PATCHED_PRETRANSFORM: + cur = patch_encode(ctx0, cur, channels, frames, current.patch_size); + channels *= current.patch_size; + frames /= current.patch_size; + break; + case module_type::TRANSFORMER: + cur = build_transformer_block(*this, cur, current.transformer, frames, tensor_block); + channels = current.transformer.output_dimension; + tensor_block++; + break; + } + } + + GGML_ASSERT(channels == quantizer.input_dim); + + cb(cur, "result_embd", -1); + res->t_embd = cur; + + ggml_tensor * codes = build_encoder_quantizer_codes(model, ctx0, quantizer, cur); + cb(codes, "result_out_i32", -1); + res->t_out_i32 = codes; + ggml_build_forward_expand(gf, codes); +} diff --git a/src/models/moss-audio-tokenizer.cpp b/src/models/moss-audio-tokenizer.cpp new file mode 100644 index 000000000..a0f19415b --- /dev/null +++ b/src/models/moss-audio-tokenizer.cpp @@ -0,0 +1,1205 @@ +#include "llama-moss-audio-tokenizer.h" + +#include "ggml.h" +#include "ggml-cpp.h" +#include "ggml-backend.h" +#include "ggml-cpu.h" +#include "gguf.h" +#include "llama-impl.h" +#include "llama-model.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +constexpr char MOSS_CODEC_ARCH[] = "moss-audio-tokenizer"; +constexpr float MOSS_LAYER_NORM_EPS = 1e-5f; +constexpr size_t MOSS_CODEC_MAX_NODES_BASE = 256; +constexpr size_t MOSS_CODEC_MAX_NODES_PER_LAYER = 32; + +enum class moss_codec_module_type { + 
PATCHED_PRETRANSFORM, + TRANSFORMER, +}; + +struct moss_codec_transformer_layer { + ggml_tensor * attn_in = nullptr; + ggml_tensor * attn_out = nullptr; + ggml_tensor * linear1 = nullptr; + ggml_tensor * linear2 = nullptr; + ggml_tensor * norm1_w = nullptr; + ggml_tensor * norm1_b = nullptr; + ggml_tensor * norm2_w = nullptr; + ggml_tensor * norm2_b = nullptr; + ggml_tensor * scale1 = nullptr; + ggml_tensor * scale2 = nullptr; +}; + +struct moss_codec_transformer_block { + int input_dimension = 0; + int output_dimension = 0; + int d_model = 0; + int num_heads = 0; + int num_layers = 0; + int dim_feedforward = 0; + int context = 0; + float max_period = 10000.0f; + + ggml_tensor * input_proj = nullptr; + ggml_tensor * output_proj = nullptr; + + std::vector layers; +}; + +struct moss_codec_module { + moss_codec_module_type type = moss_codec_module_type::PATCHED_PRETRANSFORM; + int patch_size = 1; + moss_codec_transformer_block transformer; +}; + +struct moss_codec_quantizer_entry { + ggml_tensor * in_proj_w = nullptr; + ggml_tensor * in_proj_b = nullptr; + ggml_tensor * codebook = nullptr; + ggml_tensor * out_proj_w = nullptr; + ggml_tensor * out_proj_b = nullptr; +}; + +struct moss_codec_quantizer { + int input_dim = 0; + int rvq_dim = 0; + int output_dim = 0; + int num_quantizers = 0; + int codebook_size = 0; + int codebook_dim = 0; + + ggml_tensor * input_proj_w = nullptr; + ggml_tensor * input_proj_b = nullptr; + ggml_tensor * output_proj_w = nullptr; + ggml_tensor * output_proj_b = nullptr; + + std::vector quantizers; +}; + +static std::string moss_codec_module_type_to_string(const moss_codec_module_type type) { + switch (type) { + case moss_codec_module_type::PATCHED_PRETRANSFORM: + return "PatchedPretransform"; + case moss_codec_module_type::TRANSFORMER: + return "Transformer"; + } + return "Unknown"; +} + +static moss_codec_module_type moss_codec_module_type_from_string(const std::string & value) { + if (value == "PatchedPretransform") { + return 
moss_codec_module_type::PATCHED_PRETRANSFORM; + } + if (value == "Transformer") { + return moss_codec_module_type::TRANSFORMER; + } + throw std::runtime_error("unsupported codec module type: " + value); +} + +static void moss_codec_set_n_threads(ggml_backend_t backend, int n_threads) { + if (backend == nullptr || n_threads <= 0) { + return; + } + + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; + if (!reg) { + return; + } + + auto fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (fn != nullptr) { + fn(backend, n_threads); + } +} + +static std::vector moss_codec_make_positions(const size_t n_tokens) { + std::vector positions(n_tokens); + for (size_t i = 0; i < n_tokens; ++i) { + positions[i] = (int32_t) i; + } + return positions; +} + +static std::vector moss_codec_make_causal_mask(const size_t n_tokens, const int context) { + std::vector mask(n_tokens * n_tokens, -std::numeric_limits::infinity()); + + for (size_t iq = 0; iq < n_tokens; ++iq) { + for (size_t ik = 0; ik < n_tokens; ++ik) { + if (ik > iq) { + continue; + } + if (context > 0 && (int) (iq - ik) >= context) { + continue; + } + mask[iq * n_tokens + ik] = 0.0f; + } + } + + return mask; +} + +static ggml_tensor * moss_codec_build_layer_norm( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * weight, + ggml_tensor * bias) { + cur = ggml_norm(ctx0, cur, MOSS_LAYER_NORM_EPS); + cur = ggml_mul(ctx0, cur, weight); + cur = ggml_add(ctx0, cur, bias); + return cur; +} + +static ggml_tensor * moss_codec_build_attention( + ggml_context * ctx0, + ggml_tensor * wo, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale) { + ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); + ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3); + v = 
ggml_cont(ctx0, v); + + ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f); + + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + ggml_tensor * cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cur = ggml_cont_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2] * cur->ne[3]); + + if (wo != nullptr) { + cur = ggml_mul_mat(ctx0, wo, cur); + } + + return cur; +} + +static std::vector moss_codec_patch_decode( + const std::vector & input, + const int channels, + const size_t n_frames, + const int patch_size) { + if (patch_size <= 0) { + throw std::runtime_error("invalid patch size"); + } + if (channels % patch_size != 0) { + throw std::runtime_error("patch decode channels not divisible by patch size"); + } + if (input.size() != (size_t) channels * n_frames) { + throw std::runtime_error("patch decode input size mismatch"); + } + + const int out_channels = channels / patch_size; + const size_t out_frames = n_frames * (size_t) patch_size; + std::vector output((size_t) out_channels * out_frames); + + for (size_t t = 0; t < n_frames; ++t) { + for (int d = 0; d < out_channels; ++d) { + for (int i = 0; i < patch_size; ++i) { + const float value = input[(size_t) (d * patch_size + i) + t * (size_t) channels]; + output[(size_t) d + (t * (size_t) patch_size + (size_t) i) * (size_t) out_channels] = value; + } + } + } + + return output; +} + +static std::vector moss_codec_patch_encode( + const std::vector & input, + const int channels, + const size_t n_frames, + const int patch_size) { + if (patch_size <= 0) { + throw std::runtime_error("invalid patch size"); + } + if (n_frames % (size_t) patch_size != 0) { + throw std::runtime_error("patch encode frame count not divisible by patch size"); + } + if (input.size() != (size_t) channels * n_frames) { + throw std::runtime_error("patch encode input size mismatch"); + } + + const int out_channels = channels * patch_size; + const size_t out_frames = n_frames / (size_t) patch_size; + std::vector 
output((size_t) out_channels * out_frames); + + for (size_t t = 0; t < out_frames; ++t) { + for (int d = 0; d < channels; ++d) { + for (int i = 0; i < patch_size; ++i) { + const float value = input[(size_t) d + (t * (size_t) patch_size + (size_t) i) * (size_t) channels]; + output[(size_t) (d * patch_size + i) + t * (size_t) out_channels] = value; + } + } + } + + return output; +} + +static std::vector moss_codec_copy_f32_output(ggml_tensor * tensor) { + std::vector output((size_t) ggml_nelements(tensor)); + ggml_backend_tensor_get(tensor, output.data(), 0, ggml_nbytes(tensor)); + return output; +} + +struct moss_codec_linear_f32 { + int in_features = 0; + int out_features = 0; + std::vector weight; + std::vector bias; + + bool empty() const { + return weight.empty(); + } +}; + +struct moss_codec_quantizer_entry_f32 { + moss_codec_linear_f32 in_proj; + moss_codec_linear_f32 out_proj; + int codebook_size = 0; + int codebook_dim = 0; + std::vector codebook; + std::vector codebook_unit; +}; + +static std::vector moss_codec_tensor_to_f32(const ggml_tensor * tensor) { + if (tensor == nullptr) { + return {}; + } + + const size_t n_elements = (size_t) ggml_nelements(tensor); + + switch (tensor->type) { + case GGML_TYPE_F32: { + std::vector values(n_elements); + ggml_backend_tensor_get(const_cast(tensor), values.data(), 0, ggml_nbytes(tensor)); + return values; + } + case GGML_TYPE_F16: { + std::vector values_f16(n_elements); + std::vector values(n_elements); + ggml_backend_tensor_get(const_cast(tensor), values_f16.data(), 0, ggml_nbytes(tensor)); + for (size_t i = 0; i < n_elements; ++i) { + values[i] = ggml_fp16_to_fp32(values_f16[i]); + } + return values; + } + default: + throw std::runtime_error("unsupported tensor dtype for float conversion: " + std::string(ggml_type_name(tensor->type))); + } +} + +static moss_codec_linear_f32 moss_codec_linear_from_tensors(ggml_tensor * weight, ggml_tensor * bias) { + moss_codec_linear_f32 result; + if (weight == nullptr) { + return 
        result;
    }

    // GGUF may store conv-style weights with singleton leading dims; accept
    // 2D/3D/4D layouts and record the flattened [out, in] matrix shape.
    switch (ggml_n_dims(weight)) {
        case 2:
            result.in_features = (int) weight->ne[0];
            result.out_features = (int) weight->ne[1];
            break;
        case 3:
            if (weight->ne[0] != 1) {
                throw std::runtime_error("expected singleton leading dim for 3D linear weight tensor");
            }
            result.in_features = (int) weight->ne[1];
            result.out_features = (int) weight->ne[2];
            break;
        case 4:
            if (weight->ne[0] != 1 || weight->ne[1] != 1) {
                throw std::runtime_error("expected singleton leading dims for 4D linear weight tensor");
            }
            result.in_features = (int) weight->ne[2];
            result.out_features = (int) weight->ne[3];
            break;
        default:
            throw std::runtime_error("expected 2D/3D/4D linear weight tensor");
    }
    result.weight = moss_codec_tensor_to_f32(weight);
    result.bias = moss_codec_tensor_to_f32(bias);
    return result;
}

// Apply y = W x + b frame-by-frame on the CPU. An empty `linear` (no
// weight tensor) acts as the identity, mirroring optional projections
// that may be absent from the checkpoint.
static std::vector<float> moss_codec_linear_apply(
    const moss_codec_linear_f32 & linear,
    const std::vector<float> & input,
    const size_t n_frames) {
    if (linear.empty()) {
        return input;
    }
    if (input.size() != (size_t) linear.in_features * n_frames) {
        throw std::runtime_error("linear input size mismatch");
    }

    std::vector<float> output((size_t) linear.out_features * n_frames, 0.0f);
    for (size_t t = 0; t < n_frames; ++t) {
        const float * x = input.data() + t * (size_t) linear.in_features;
        float * y = output.data() + t * (size_t) linear.out_features;

        // Seed each output row with the bias (when present), then
        // accumulate the dot products on top of it.
        if (!linear.bias.empty()) {
            std::copy(linear.bias.begin(), linear.bias.end(), y);
        }

        for (int o = 0; o < linear.out_features; ++o) {
            const float * w = linear.weight.data() + (size_t) o * (size_t) linear.in_features;
            float acc = y[o];
            for (int i = 0; i < linear.in_features; ++i) {
                acc += w[i] * x[i];
            }
            y[o] = acc;
        }
    }

    return output;
}

// L2-normalize each row of a row-major matrix of `row_width`-wide rows.
static std::vector<float> moss_codec_normalize_rows(
    const std::vector<float> & input,
    const int row_width) {
    if (row_width <= 0 || input.size() % (size_t) row_width != 0) {
        throw std::runtime_error("invalid row width for 
normalization"); + } + + std::vector output = input; + const size_t n_rows = input.size() / (size_t) row_width; + for (size_t r = 0; r < n_rows; ++r) { + float norm2 = 0.0f; + for (int c = 0; c < row_width; ++c) { + const float v = output[r * (size_t) row_width + (size_t) c]; + norm2 += v * v; + } + const float inv = 1.0f / std::sqrt(std::max(norm2, std::numeric_limits::epsilon())); + for (int c = 0; c < row_width; ++c) { + output[r * (size_t) row_width + (size_t) c] *= inv; + } + } + return output; +} + +struct moss_codec_gguf_loader { + ggml_context_ptr ctx_meta; + gguf_context_ptr ctx_gguf; + ggml_context_ptr ctx_data; + ggml_backend_ptr backend; + ggml_backend_buffer_ptr buffer; + + std::string fname; + std::map tensor_offset; + std::map loaded_tensors; + std::vector tensors_to_load; + + explicit moss_codec_gguf_loader(const std::string & model_path) + : fname(model_path), + backend(ggml_backend_cpu_init()) { + if (!backend) { + throw std::runtime_error("failed to initialize CPU backend for codec"); + } + + ggml_context * meta = nullptr; + gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &meta, + }; + + ctx_gguf.reset(gguf_init_from_file(fname.c_str(), params)); + if (!ctx_gguf) { + throw std::runtime_error("failed to load codec GGUF metadata from: " + fname); + } + + ctx_meta.reset(meta); + + for (int64_t i = 0; i < gguf_get_n_tensors(ctx_gguf.get()); ++i) { + const char * name = gguf_get_tensor_name(ctx_gguf.get(), i); + tensor_offset[name] = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), i); + } + + ggml_init_params data_params = { + /*.mem_size =*/ static_cast(gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(), + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + ctx_data.reset(ggml_init(data_params)); + if (!ctx_data) { + throw std::runtime_error("failed to initialize codec tensor context"); + } + } + + int find_key(const std::string & key, const bool required = true) const { + const int 
        idx = gguf_find_key(ctx_gguf.get(), key.c_str());
        if (idx < 0 && required) {
            throw std::runtime_error("GGUF key not found: " + key);
        }
        return idx;
    }

    // True when the GGUF metadata contains `key`.
    bool has_key(const std::string & key) const {
        return gguf_find_key(ctx_gguf.get(), key.c_str()) >= 0;
    }

    // Read a u32 metadata value; throws when required and missing,
    // otherwise falls back to `fallback`.
    uint32_t get_u32(const std::string & key, const bool required = true, const uint32_t fallback = 0) const {
        const int idx = find_key(key, required);
        if (idx < 0) {
            return fallback;
        }
        return gguf_get_val_u32(ctx_gguf.get(), idx);
    }

    // Read an f32 metadata value; same required/fallback contract as get_u32.
    float get_f32(const std::string & key, const bool required = true, const float fallback = 0.0f) const {
        const int idx = find_key(key, required);
        if (idx < 0) {
            return fallback;
        }
        return gguf_get_val_f32(ctx_gguf.get(), idx);
    }

    // Read a string metadata value; same required/fallback contract as get_u32.
    std::string get_string(const std::string & key, const bool required = true, const std::string & fallback = {}) const {
        const int idx = find_key(key, required);
        if (idx < 0) {
            return fallback;
        }
        return std::string(gguf_get_val_str(ctx_gguf.get(), idx));
    }

    // Look up a tensor by name, duplicating its metadata into ctx_data and
    // queueing it for the deferred byte load in load_tensor_bytes().
    // Returns nullptr when the tensor is optional and absent.
    ggml_tensor * get_tensor(const std::string & name, const bool required = true) {
        const auto it = loaded_tensors.find(name);
        if (it != loaded_tensors.end()) {
            return it->second;  // already registered — reuse the same tensor
        }

        ggml_tensor * meta_tensor = ggml_get_tensor(ctx_meta.get(), name.c_str());
        if (!meta_tensor) {
            if (required) {
                throw std::runtime_error("codec tensor not found: " + name);
            }
            return nullptr;
        }

        ggml_tensor * data_tensor = ggml_dup_tensor(ctx_data.get(), meta_tensor);
        ggml_set_name(data_tensor, meta_tensor->name);
        loaded_tensors.emplace(name, data_tensor);
        tensors_to_load.push_back(data_tensor);
        return data_tensor;
    }

    // Allocate the weight buffer (first call only) and stream the bytes of
    // every queued tensor from the GGUF file into it.
    void load_tensor_bytes() {
        if (!buffer) {
            buffer.reset(ggml_backend_alloc_ctx_tensors(ctx_data.get(), backend.get()));
            if (!buffer) {
                throw std::runtime_error("failed to allocate codec weight buffer");
            }
            ggml_backend_buffer_set_usage(buffer.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
        }

        std::ifstream 
fin(fname, std::ios::binary); + if (!fin) { + throw std::runtime_error("failed to open codec GGUF for tensor loading: " + fname); + } + + std::vector read_buf; + for (ggml_tensor * tensor : tensors_to_load) { + const auto it = tensor_offset.find(tensor->name); + if (it == tensor_offset.end()) { + throw std::runtime_error("missing GGUF tensor offset for: " + std::string(tensor->name)); + } + + const size_t offset = it->second; + const size_t num_bytes = ggml_nbytes(tensor); + + fin.seekg(offset, std::ios::beg); + if (!fin) { + throw std::runtime_error("failed to seek codec tensor: " + std::string(tensor->name)); + } + + if (ggml_backend_buffer_is_host(buffer.get())) { + fin.read(reinterpret_cast(tensor->data), (std::streamsize) num_bytes); + } else { + read_buf.resize(num_bytes); + fin.read(reinterpret_cast(read_buf.data()), (std::streamsize) num_bytes); + ggml_backend_tensor_set(tensor, read_buf.data(), 0, num_bytes); + } + + if (!fin) { + throw std::runtime_error("failed to read codec tensor: " + std::string(tensor->name)); + } + } + } +}; + +} // namespace + +struct moss_audio_tokenizer::impl { + int sample_rate = 0; + uint32_t downsample_rate = 0; + uint32_t num_quantizers = 0; + int n_threads = -1; + + ggml_backend_ptr backend; + ggml_context_ptr ctx_meta; + gguf_context_ptr ctx_gguf; + ggml_context_ptr ctx_data; + ggml_backend_buffer_ptr weights_buffer; + + moss_codec_quantizer quantizer; + moss_codec_linear_f32 quantizer_input_proj_f32; + std::vector quantizer_entries_f32; + std::vector encoder; + std::vector decoder; + + explicit impl(const std::string & model_path, const moss_audio_tokenizer_options & options) { + moss_codec_gguf_loader loader(model_path); + + if (!loader.has_key(std::string(MOSS_CODEC_ARCH) + ".quantizer_type")) { + throw std::runtime_error("model does not contain bundled MOSS audio tokenizer metadata"); + } + + sample_rate = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".sampling_rate"); + downsample_rate = 
loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".downsample_rate"); + num_quantizers = loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.num_quantizers"); + n_threads = options.n_threads; + + quantizer.input_dim = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.input_dim"); + quantizer.rvq_dim = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.rvq_dim"); + quantizer.output_dim = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.output_dim"); + quantizer.num_quantizers = (int) num_quantizers; + quantizer.codebook_size = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.codebook_size"); + quantizer.codebook_dim = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.codebook_dim"); + quantizer.input_proj_w = loader.get_tensor("audio_tokenizer.quantizer.input_proj.weight", false); + quantizer.input_proj_b = loader.get_tensor("audio_tokenizer.quantizer.input_proj.bias", false); + quantizer.output_proj_w = loader.get_tensor("audio_tokenizer.quantizer.output_proj.weight", false); + quantizer.output_proj_b = loader.get_tensor("audio_tokenizer.quantizer.output_proj.bias", false); + quantizer.quantizers.resize(num_quantizers); + for (uint32_t iq = 0; iq < num_quantizers; ++iq) { + auto & entry = quantizer.quantizers[iq]; + const std::string prefix = "audio_tokenizer.quantizer.quantizers." + std::to_string(iq); + entry.in_proj_w = loader.get_tensor(prefix + ".in_proj.weight", false); + entry.in_proj_b = loader.get_tensor(prefix + ".in_proj.bias", false); + entry.codebook = loader.get_tensor(prefix + ".codebook.weight"); + entry.out_proj_w = loader.get_tensor(prefix + ".out_proj.weight", false); + entry.out_proj_b = loader.get_tensor(prefix + ".out_proj.bias", false); + } + + const auto load_modules = [&](const std::string & section_name, std::vector & modules) { + const uint32_t block_count = loader.get_u32(std::string(MOSS_CODEC_ARCH) + "." 
+ section_name + ".block_count"); + modules.resize(block_count); + for (uint32_t ib = 0; ib < block_count; ++ib) { + const std::string block_prefix = std::string(MOSS_CODEC_ARCH) + "." + section_name + "." + std::to_string(ib); + moss_codec_module & block = modules[ib]; + block.type = moss_codec_module_type_from_string(loader.get_string(block_prefix + ".module_type")); + + if (block.type == moss_codec_module_type::PATCHED_PRETRANSFORM) { + block.patch_size = (int) loader.get_u32(block_prefix + ".patch_size"); + continue; + } + + auto & tr = block.transformer; + tr.input_dimension = (int) loader.get_u32(block_prefix + ".input_dimension"); + tr.output_dimension = (int) loader.get_u32(block_prefix + ".output_dimension"); + tr.d_model = (int) loader.get_u32(block_prefix + ".d_model"); + tr.num_heads = (int) loader.get_u32(block_prefix + ".num_heads"); + tr.num_layers = (int) loader.get_u32(block_prefix + ".num_layers"); + tr.dim_feedforward = (int) loader.get_u32(block_prefix + ".dim_feedforward"); + tr.context = (int) loader.get_u32(block_prefix + ".context"); + tr.max_period = loader.get_f32(block_prefix + ".max_period", false, 10000.0f); + tr.input_proj = loader.get_tensor("audio_tokenizer." + section_name + "." + std::to_string(ib) + ".input_proj.weight", false); + tr.output_proj = loader.get_tensor("audio_tokenizer." + section_name + "." + std::to_string(ib) + ".output_proj.weight", false); + + tr.layers.resize(tr.num_layers); + for (int il = 0; il < tr.num_layers; ++il) { + auto & layer = tr.layers[il]; + const std::string layer_prefix = + "audio_tokenizer." + section_name + "." + std::to_string(ib) + ".transformer.layers." 
+ std::to_string(il); + layer.attn_in = loader.get_tensor(layer_prefix + ".self_attn.in_projs.0.weight"); + layer.attn_out = loader.get_tensor(layer_prefix + ".self_attn.out_projs.0.weight"); + layer.linear1 = loader.get_tensor(layer_prefix + ".linear1.weight"); + layer.linear2 = loader.get_tensor(layer_prefix + ".linear2.weight"); + layer.norm1_w = loader.get_tensor(layer_prefix + ".norm1.weight"); + layer.norm1_b = loader.get_tensor(layer_prefix + ".norm1.bias"); + layer.norm2_w = loader.get_tensor(layer_prefix + ".norm2.weight"); + layer.norm2_b = loader.get_tensor(layer_prefix + ".norm2.bias"); + layer.scale1 = loader.get_tensor(layer_prefix + ".layer_scale_1.scale", false); + layer.scale2 = loader.get_tensor(layer_prefix + ".layer_scale_2.scale", false); + } + } + }; + + load_modules("encoder", encoder); + load_modules("decoder", decoder); + + loader.load_tensor_bytes(); + + backend = std::move(loader.backend); + ctx_meta = std::move(loader.ctx_meta); + ctx_gguf = std::move(loader.ctx_gguf); + ctx_data = std::move(loader.ctx_data); + weights_buffer = std::move(loader.buffer); + + quantizer_input_proj_f32 = moss_codec_linear_from_tensors(quantizer.input_proj_w, quantizer.input_proj_b); + quantizer_entries_f32.resize(num_quantizers); + for (uint32_t iq = 0; iq < num_quantizers; ++iq) { + auto & dst = quantizer_entries_f32[iq]; + const auto & src = quantizer.quantizers[iq]; + dst.in_proj = moss_codec_linear_from_tensors(src.in_proj_w, src.in_proj_b); + dst.out_proj = moss_codec_linear_from_tensors(src.out_proj_w, src.out_proj_b); + dst.codebook_dim = (int) src.codebook->ne[0]; + dst.codebook_size = (int) src.codebook->ne[1]; + dst.codebook = moss_codec_tensor_to_f32(src.codebook); + dst.codebook_unit = moss_codec_normalize_rows(dst.codebook, dst.codebook_dim); + } + + LLAMA_LOG_INFO("%s: sample_rate=%d downsample_rate=%u num_quantizers=%u encoder_blocks=%zu decoder_blocks=%zu\n", + __func__, sample_rate, downsample_rate, num_quantizers, encoder.size(), 
decoder.size()); + } + + std::vector run_quantizer_decode( + const std::vector & codes, + const size_t n_frames, + uint32_t n_quantizers_req) const { + const uint32_t nq = n_quantizers_req == 0 ? num_quantizers : n_quantizers_req; + if (nq == 0 || nq > num_quantizers) { + throw std::runtime_error("invalid quantizer count for decode"); + } + if (codes.size() != n_frames * (size_t) nq) { + throw std::runtime_error("raw code size does not match frame count"); + } + + const size_t max_nodes = MOSS_CODEC_MAX_NODES_BASE + (size_t) nq * 8; + const size_t meta_size = max_nodes * ggml_tensor_overhead() + ggml_graph_overhead_custom(max_nodes, false); + std::vector meta_buf(meta_size); + + ggml_init_params params = { + /*.mem_size =*/ meta_size, + /*.mem_buffer =*/ meta_buf.data(), + /*.no_alloc =*/ true, + }; + ggml_context * ctx0 = ggml_init(params); + if (!ctx0) { + throw std::runtime_error("failed to init quantizer decode ggml context"); + } + + ggml_cgraph * gf = ggml_new_graph_custom(ctx0, (int) max_nodes, false); + std::vector code_inputs(nq); + + ggml_tensor * cur = nullptr; + for (uint32_t iq = 0; iq < nq; ++iq) { + const auto & entry = quantizer.quantizers[iq]; + ggml_tensor * inp = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t) n_frames); + ggml_set_input(inp); + code_inputs[iq] = inp; + + ggml_tensor * emb = ggml_get_rows(ctx0, entry.codebook, inp); + if (entry.out_proj_w) { + emb = ggml_mul_mat(ctx0, entry.out_proj_w, emb); + } + if (entry.out_proj_b) { + emb = ggml_add(ctx0, emb, entry.out_proj_b); + } + cur = cur ? 
ggml_add(ctx0, cur, emb) : emb; + } + + if (quantizer.output_proj_w) { + cur = ggml_mul_mat(ctx0, quantizer.output_proj_w, cur); + } + if (quantizer.output_proj_b) { + cur = ggml_add(ctx0, cur, quantizer.output_proj_b); + } + + ggml_build_forward_expand(gf, cur); + + ggml_gallocr_ptr allocr { ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend.get())) }; + ggml_gallocr_alloc_graph(allocr.get(), gf); + + for (uint32_t iq = 0; iq < nq; ++iq) { + std::vector gathered(n_frames); + for (size_t t = 0; t < n_frames; ++t) { + const llama_token code = codes[t * (size_t) nq + iq]; + if (code < 0 || code >= quantizer.codebook_size) { + ggml_free(ctx0); + throw std::runtime_error("audio code out of codec range during decode"); + } + gathered[t] = (int32_t) code; + } + ggml_backend_tensor_set(code_inputs[iq], gathered.data(), 0, gathered.size() * sizeof(int32_t)); + } + + moss_codec_set_n_threads(backend.get(), n_threads); + const ggml_status status = ggml_backend_graph_compute(backend.get(), gf); + if (status != GGML_STATUS_SUCCESS) { + ggml_free(ctx0); + throw std::runtime_error("quantizer decode graph compute failed"); + } + + std::vector output = moss_codec_copy_f32_output(cur); + ggml_free(ctx0); + return output; + } + + std::vector run_transformer_block( + const moss_codec_transformer_block & block, + const std::vector & input, + const size_t n_frames) const { + if (input.size() != (size_t) block.input_dimension * n_frames) { + throw std::runtime_error("transformer block input size mismatch"); + } + + const size_t max_nodes = MOSS_CODEC_MAX_NODES_BASE + (size_t) block.num_layers * MOSS_CODEC_MAX_NODES_PER_LAYER; + const size_t meta_size = max_nodes * ggml_tensor_overhead() + ggml_graph_overhead_custom(max_nodes, false); + std::vector meta_buf(meta_size); + + ggml_init_params params = { + /*.mem_size =*/ meta_size, + /*.mem_buffer =*/ meta_buf.data(), + /*.no_alloc =*/ true, + }; + ggml_context * ctx0 = ggml_init(params); + if (!ctx0) { + throw 
std::runtime_error("failed to init transformer ggml context"); + } + + ggml_cgraph * gf = ggml_new_graph_custom(ctx0, (int) max_nodes, false); + + ggml_tensor * inp = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, block.input_dimension, (int64_t) n_frames); + ggml_set_input(inp); + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t) n_frames); + ggml_set_input(positions); + ggml_tensor * mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, (int64_t) n_frames, (int64_t) n_frames, 1, 1); + ggml_set_input(mask); + + ggml_tensor * cur = inp; + if (block.input_proj) { + cur = ggml_mul_mat(ctx0, block.input_proj, cur); + } + + const int d_head = block.d_model / block.num_heads; + const float attn_scale = 1.0f / std::sqrt((float) d_head); + + for (int il = 0; il < block.num_layers; ++il) { + const auto & layer = block.layers[il]; + + ggml_tensor * inp_sa = cur; + ggml_tensor * x = moss_codec_build_layer_norm(ctx0, cur, layer.norm1_w, layer.norm1_b); + ggml_tensor * qkv = ggml_mul_mat(ctx0, layer.attn_in, x); + + ggml_tensor * q = ggml_view_3d(ctx0, qkv, d_head, block.num_heads, (int64_t) n_frames, + ggml_row_size(qkv->type, d_head), qkv->nb[1], 0); + ggml_tensor * k = ggml_view_3d(ctx0, qkv, d_head, block.num_heads, (int64_t) n_frames, + ggml_row_size(qkv->type, d_head), qkv->nb[1], ggml_row_size(qkv->type, block.d_model)); + ggml_tensor * v = ggml_view_3d(ctx0, qkv, d_head, block.num_heads, (int64_t) n_frames, + ggml_row_size(qkv->type, d_head), qkv->nb[1], ggml_row_size(qkv->type, 2 * block.d_model)); + + q = ggml_rope_ext(ctx0, q, positions, nullptr, d_head, 0, 0, + block.max_period, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); + k = ggml_rope_ext(ctx0, k, positions, nullptr, d_head, 0, 0, + block.max_period, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); + + ggml_tensor * attn = moss_codec_build_attention(ctx0, layer.attn_out, q, k, v, mask, attn_scale); + if (layer.scale1) { + attn = ggml_mul(ctx0, attn, layer.scale1); + } + cur = ggml_add(ctx0, inp_sa, attn); + + ggml_tensor * inp_ff 
= cur; + x = moss_codec_build_layer_norm(ctx0, cur, layer.norm2_w, layer.norm2_b); + x = ggml_mul_mat(ctx0, layer.linear1, x); + x = ggml_gelu(ctx0, x); + x = ggml_mul_mat(ctx0, layer.linear2, x); + if (layer.scale2) { + x = ggml_mul(ctx0, x, layer.scale2); + } + cur = ggml_add(ctx0, inp_ff, x); + } + + if (block.output_proj) { + cur = ggml_mul_mat(ctx0, block.output_proj, cur); + } + + ggml_build_forward_expand(gf, cur); + + ggml_gallocr_ptr allocr { ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend.get())) }; + ggml_gallocr_alloc_graph(allocr.get(), gf); + + const std::vector positions_data = moss_codec_make_positions(n_frames); + const std::vector mask_data = moss_codec_make_causal_mask(n_frames, block.context); + + ggml_backend_tensor_set(inp, input.data(), 0, input.size() * sizeof(float)); + ggml_backend_tensor_set(positions, positions_data.data(), 0, positions_data.size() * sizeof(int32_t)); + ggml_backend_tensor_set(mask, mask_data.data(), 0, mask_data.size() * sizeof(float)); + + moss_codec_set_n_threads(backend.get(), n_threads); + const ggml_status status = ggml_backend_graph_compute(backend.get(), gf); + if (status != GGML_STATUS_SUCCESS) { + ggml_free(ctx0); + throw std::runtime_error("transformer graph compute failed"); + } + + std::vector output = moss_codec_copy_f32_output(cur); + ggml_free(ctx0); + return output; + } + + std::vector decode( + const std::vector & codes, + const size_t n_frames, + const uint32_t n_quantizers_req) const { + uint32_t nq = n_quantizers_req == 0 ? 
num_quantizers : n_quantizers_req; + if (nq == 0 || nq > num_quantizers) { + throw std::runtime_error("invalid quantizer count"); + } + + std::vector cur = run_quantizer_decode(codes, n_frames, nq); + int channels = quantizer.output_dim; + size_t frames = n_frames; + + for (const auto & module : decoder) { + switch (module.type) { + case moss_codec_module_type::TRANSFORMER: + cur = run_transformer_block(module.transformer, cur, frames); + channels = module.transformer.output_dimension; + break; + case moss_codec_module_type::PATCHED_PRETRANSFORM: + cur = moss_codec_patch_decode(cur, channels, frames, module.patch_size); + channels /= module.patch_size; + frames *= (size_t) module.patch_size; + break; + } + } + + if (channels != 1) { + throw std::runtime_error("codec decoder did not end with a mono waveform channel"); + } + + return cur; + } + + std::vector run_quantizer_encode( + const std::vector & input, + const size_t n_frames, + const uint32_t n_quantizers_req) const { + const uint32_t nq = n_quantizers_req == 0 ? 
num_quantizers : n_quantizers_req; + if (nq == 0 || nq > num_quantizers) { + throw std::runtime_error("invalid quantizer count for encode"); + } + + std::vector residual = moss_codec_linear_apply(quantizer_input_proj_f32, input, n_frames); + if (residual.size() != (size_t) quantizer.rvq_dim * n_frames) { + throw std::runtime_error("quantizer input projection size mismatch"); + } + + std::vector codes(n_frames * (size_t) nq, 0); + std::vector latents; + std::vector latents_unit; + std::vector decoded; + + for (uint32_t iq = 0; iq < nq; ++iq) { + const auto & entry = quantizer_entries_f32[iq]; + latents = moss_codec_linear_apply(entry.in_proj, residual, n_frames); + if (latents.size() != (size_t) entry.codebook_dim * n_frames) { + throw std::runtime_error("quantizer latent projection size mismatch"); + } + + latents_unit.resize(latents.size()); + for (size_t t = 0; t < n_frames; ++t) { + const float * in_ptr = latents.data() + t * (size_t) entry.codebook_dim; + float * out_ptr = latents_unit.data() + t * (size_t) entry.codebook_dim; + + float norm2 = 0.0f; + for (int d = 0; d < entry.codebook_dim; ++d) { + norm2 += in_ptr[d] * in_ptr[d]; + } + const float inv = 1.0f / std::sqrt(std::max(norm2, std::numeric_limits::epsilon())); + for (int d = 0; d < entry.codebook_dim; ++d) { + out_ptr[d] = in_ptr[d] * inv; + } + } + + std::vector codebook_emb((size_t) entry.codebook_dim * n_frames, 0.0f); + for (size_t t = 0; t < n_frames; ++t) { + const float * latent = latents_unit.data() + t * (size_t) entry.codebook_dim; + + float best_score = -std::numeric_limits::infinity(); + int best_index = 0; + for (int code = 0; code < entry.codebook_size; ++code) { + const float * row = entry.codebook_unit.data() + (size_t) code * (size_t) entry.codebook_dim; + float score = 0.0f; + for (int d = 0; d < entry.codebook_dim; ++d) { + score += row[d] * latent[d]; + } + if (score > best_score) { + best_score = score; + best_index = code; + } + } + + codes[t * (size_t) nq + iq] = best_index; + 
const float * row = entry.codebook.data() + (size_t) best_index * (size_t) entry.codebook_dim; + std::copy(row, row + entry.codebook_dim, codebook_emb.begin() + (ptrdiff_t) (t * (size_t) entry.codebook_dim)); + } + + decoded = moss_codec_linear_apply(entry.out_proj, codebook_emb, n_frames); + if (decoded.size() != residual.size()) { + throw std::runtime_error("quantizer decoded embedding size mismatch"); + } + + for (size_t i = 0; i < residual.size(); ++i) { + residual[i] -= decoded[i]; + } + } + + return codes; + } + + std::vector encode( + const std::vector & audio, + size_t * out_frames, + const uint32_t n_quantizers_req) const { + const uint32_t nq = n_quantizers_req == 0 ? num_quantizers : n_quantizers_req; + if (nq == 0 || nq > num_quantizers) { + throw std::runtime_error("invalid quantizer count"); + } + + const size_t padded_samples = + ((audio.size() + (size_t) downsample_rate - 1) / (size_t) downsample_rate) * (size_t) downsample_rate; + const size_t valid_frames = audio.size() / (size_t) downsample_rate; + + std::vector cur(padded_samples, 0.0f); + std::copy(audio.begin(), audio.end(), cur.begin()); + + int channels = 1; + size_t frames = padded_samples; + + for (const auto & module : encoder) { + switch (module.type) { + case moss_codec_module_type::PATCHED_PRETRANSFORM: + cur = moss_codec_patch_encode(cur, channels, frames, module.patch_size); + channels *= module.patch_size; + frames /= (size_t) module.patch_size; + break; + case moss_codec_module_type::TRANSFORMER: + cur = run_transformer_block(module.transformer, cur, frames); + channels = module.transformer.output_dimension; + break; + } + } + + if (channels != quantizer.input_dim) { + throw std::runtime_error("codec encoder output dimension does not match quantizer input dimension"); + } + + std::vector codes = run_quantizer_encode(cur, frames, nq); + if (out_frames) { + *out_frames = valid_frames; + } + + if (valid_frames >= frames) { + return codes; + } + + std::vector trimmed(valid_frames * 
(size_t) nq); + for (size_t t = 0; t < valid_frames; ++t) { + std::copy_n(codes.data() + t * (size_t) nq, nq, trimmed.data() + t * (size_t) nq); + } + return trimmed; + } +}; + +moss_audio_tokenizer::moss_audio_tokenizer( + const std::string & model_path, + const moss_audio_tokenizer_options & options) + : impl_(std::make_unique(model_path, options)) { +} + +moss_audio_tokenizer::~moss_audio_tokenizer() = default; + +moss_audio_tokenizer::moss_audio_tokenizer(moss_audio_tokenizer &&) noexcept = default; + +moss_audio_tokenizer & moss_audio_tokenizer::operator=(moss_audio_tokenizer &&) noexcept = default; + +int moss_audio_tokenizer::sample_rate() const { + return impl_->sample_rate; +} + +uint32_t moss_audio_tokenizer::downsample_rate() const { + return impl_->downsample_rate; +} + +uint32_t moss_audio_tokenizer::num_quantizers() const { + return impl_->num_quantizers; +} + +std::vector moss_audio_tokenizer::decode( + const std::vector & codes, + const size_t n_frames, + const uint32_t n_quantizers) const { + return impl_->decode(codes, n_frames, n_quantizers); +} + +std::vector moss_audio_tokenizer::encode( + const std::vector & audio, + size_t * out_frames, + const uint32_t n_quantizers) const { + return impl_->encode(audio, out_frames, n_quantizers); +} + +static std::string moss_codec_model_meta_str(const llama_model * model, const std::string & key) { + const auto it = model->gguf_kv.find(key); + if (it == model->gguf_kv.end()) { + throw std::runtime_error("missing GGUF key: " + key); + } + + std::string value = it->second; + if (value.size() >= 2 && ((value.front() == '\'' && value.back() == '\'') || (value.front() == '"' && value.back() == '"'))) { + value = value.substr(1, value.size() - 2); + } + return value; +} + +static uint32_t moss_codec_model_meta_u32(const llama_model * model, const std::string & key) { + return (uint32_t) std::stoul(moss_codec_model_meta_str(model, key)); +} + +static const ggml_tensor * moss_codec_model_require_tensor(const 
llama_model * model, const std::string & name) { + const ggml_tensor * tensor = model->get_tensor(name.c_str()); + if (tensor == nullptr) { + throw std::runtime_error("missing tensor: " + name); + } + return tensor; +} + +static const ggml_tensor * moss_codec_model_optional_tensor(const llama_model * model, const std::string & name) { + return model->get_tensor(name.c_str()); +} + +int moss_audio_model_sample_rate(const llama_model * model) { + const std::string arch_name = llm_arch_name(model->arch); + return (int) moss_codec_model_meta_u32(model, arch_name + ".sampling_rate"); +} + +uint32_t moss_audio_model_downsample_rate(const llama_model * model) { + const std::string arch_name = llm_arch_name(model->arch); + return moss_codec_model_meta_u32(model, arch_name + ".downsample_rate"); +} + +uint32_t moss_audio_model_num_quantizers(const llama_model * model) { + const std::string arch_name = llm_arch_name(model->arch); + return moss_codec_model_meta_u32(model, arch_name + ".quantizer.num_quantizers"); +} + +std::vector moss_audio_model_quantizer_encode( + const llama_model * model, + const std::vector & input, + size_t n_frames, + uint32_t n_quantizers_req) { + if (model->arch != LLM_ARCH_MOSS_TTS_AUDIO_ENCODER) { + throw std::runtime_error("quantizer encode expects a moss-tts-audio-encoder model"); + } + + const std::string arch_name = llm_arch_name(model->arch); + const uint32_t num_quantizers = moss_codec_model_meta_u32(model, arch_name + ".quantizer.num_quantizers"); + const uint32_t nq = n_quantizers_req == 0 ? 
num_quantizers : n_quantizers_req; + if (nq == 0 || nq > num_quantizers) { + throw std::runtime_error("invalid quantizer count"); + } + + moss_codec_linear_f32 quantizer_input_proj = moss_codec_linear_from_tensors( + const_cast(moss_codec_model_require_tensor(model, "quantizer.input_proj.weight")), + const_cast(moss_codec_model_optional_tensor(model, "quantizer.input_proj.bias"))); + + std::vector quantizers(nq); + for (uint32_t iq = 0; iq < nq; ++iq) { + auto & entry = quantizers[iq]; + const std::string prefix = "quantizer.quantizers." + std::to_string(iq); + entry.in_proj = moss_codec_linear_from_tensors( + const_cast(moss_codec_model_require_tensor(model, prefix + ".in_proj.weight")), + const_cast(moss_codec_model_optional_tensor(model, prefix + ".in_proj.bias"))); + entry.out_proj = moss_codec_linear_from_tensors( + const_cast(moss_codec_model_require_tensor(model, prefix + ".out_proj.weight")), + const_cast(moss_codec_model_optional_tensor(model, prefix + ".out_proj.bias"))); + const ggml_tensor * codebook = moss_codec_model_require_tensor(model, prefix + ".codebook.weight"); + entry.codebook_dim = (int) codebook->ne[0]; + entry.codebook_size = (int) codebook->ne[1]; + entry.codebook = moss_codec_tensor_to_f32(codebook); + entry.codebook_unit = moss_codec_normalize_rows(entry.codebook, entry.codebook_dim); + } + + std::vector residual = moss_codec_linear_apply(quantizer_input_proj, input, n_frames); + std::vector codes(n_frames * (size_t) nq, 0); + std::vector latents; + std::vector latents_unit; + std::vector decoded; + + for (uint32_t iq = 0; iq < nq; ++iq) { + const auto & entry = quantizers[iq]; + latents = moss_codec_linear_apply(entry.in_proj, residual, n_frames); + if (latents.size() != (size_t) entry.codebook_dim * n_frames) { + throw std::runtime_error("quantizer latent projection size mismatch"); + } + + latents_unit.resize(latents.size()); + for (size_t t = 0; t < n_frames; ++t) { + const float * in_ptr = latents.data() + t * (size_t) 
entry.codebook_dim; + float * out_ptr = latents_unit.data() + t * (size_t) entry.codebook_dim; + + float norm2 = 0.0f; + for (int d = 0; d < entry.codebook_dim; ++d) { + norm2 += in_ptr[d] * in_ptr[d]; + } + const float inv = 1.0f / std::sqrt(std::max(norm2, std::numeric_limits::epsilon())); + for (int d = 0; d < entry.codebook_dim; ++d) { + out_ptr[d] = in_ptr[d] * inv; + } + } + + std::vector codebook_emb((size_t) entry.codebook_dim * n_frames, 0.0f); + for (size_t t = 0; t < n_frames; ++t) { + const float * latent = latents_unit.data() + t * (size_t) entry.codebook_dim; + + float best_score = -std::numeric_limits::infinity(); + int best_index = 0; + for (int code = 0; code < entry.codebook_size; ++code) { + const float * row = entry.codebook_unit.data() + (size_t) code * (size_t) entry.codebook_dim; + float score = 0.0f; + for (int d = 0; d < entry.codebook_dim; ++d) { + score += row[d] * latent[d]; + } + if (score > best_score) { + best_score = score; + best_index = code; + } + } + + codes[t * (size_t) nq + iq] = best_index; + const float * row = entry.codebook.data() + (size_t) best_index * (size_t) entry.codebook_dim; + std::copy(row, row + entry.codebook_dim, codebook_emb.begin() + (ptrdiff_t) (t * (size_t) entry.codebook_dim)); + } + + decoded = moss_codec_linear_apply(entry.out_proj, codebook_emb, n_frames); + if (decoded.size() != residual.size()) { + throw std::runtime_error("quantizer decoded embedding size mismatch"); + } + + for (size_t i = 0; i < residual.size(); ++i) { + residual[i] -= decoded[i]; + } + } + + return codes; +} diff --git a/tools/tts/CMakeLists.txt b/tools/tts/CMakeLists.txt index b91a84759..838ad3b42 100644 --- a/tools/tts/CMakeLists.txt +++ b/tools/tts/CMakeLists.txt @@ -8,7 +8,7 @@ if(LLAMA_TOOLS_INSTALL) endif() set(TARGET llama-moss-tts) -add_executable(${TARGET} moss-tts.cpp) +add_executable(${TARGET} run-moss-tts-delay.cpp) target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) 
target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/tts/moss-tts-audio-decode.py b/tools/tts/moss-tts-audio-decode.py index 160579149..bf7e3ebe2 100755 --- a/tools/tts/moss-tts-audio-decode.py +++ b/tools/tts/moss-tts-audio-decode.py @@ -3,32 +3,14 @@ from __future__ import annotations import argparse -import os import struct -import sys import wave from pathlib import Path import numpy as np - -def resolve_moss_tts_dir() -> Path: - env_dir = os.getenv("MOSS_TTS_DIR") or os.getenv("MOSS_TTS_ROOT") - if env_dir: - path = Path(env_dir).expanduser().resolve() - else: - path = Path(__file__).resolve().parents[3] / "MOSS-TTS" - - if not path.is_dir(): - raise FileNotFoundError( - f"MOSS-TTS repo not found: {path}. Set MOSS_TTS_DIR to the MOSS-TTS checkout root." - ) - return path - - -sys.path.insert(0, str(resolve_moss_tts_dir())) - -from moss_tts_delay.llama_cpp._constants import N_VQ, SAMPLE_RATE # noqa: E402 +from moss_tts_onnx import OnnxAudioTokenizer +from moss_tts_processor import N_VQ, SAMPLE_RATE CODES_MAGIC = 0x53444F43 # "CODS" @@ -73,13 +55,6 @@ def main() -> int: ap.add_argument("--cpu", action="store_true") args = ap.parse_args() - try: - from moss_audio_tokenizer.onnx import OnnxAudioTokenizer - except Exception as exc: - raise RuntimeError( - "moss_audio_tokenizer.onnx is unavailable; initialize the submodule/package and install ONNX deps" - ) from exc - codes = read_codes(Path(args.codes_bin)) if codes.ndim != 2 or codes.shape[1] != N_VQ: raise RuntimeError(f"expected raw codes with shape (T, {N_VQ}), got {codes.shape}") diff --git a/tools/tts/moss-tts-build-generation-ref.py b/tools/tts/moss-tts-build-generation-ref.py index 48a784673..aa01d5e81 100755 --- a/tools/tts/moss-tts-build-generation-ref.py +++ b/tools/tts/moss-tts-build-generation-ref.py @@ -3,31 +3,19 @@ from __future__ import annotations import argparse -import os import struct import sys from pathlib import Path import numpy as np +from moss_tts_onnx import 
OnnxAudioTokenizer +from moss_tts_processor import AUDIO_PAD_CODE, Tokenizer, build_generation_prompt + REF_MAGIC = 0x4652474D # "MGRF" REF_VERSION = 1 -def resolve_moss_tts_dir() -> Path: - env_dir = os.getenv("MOSS_TTS_DIR") or os.getenv("MOSS_TTS_ROOT") - if env_dir: - path = Path(env_dir).expanduser().resolve() - else: - path = Path(__file__).resolve().parents[3] / "MOSS-TTS" - - if not path.is_dir(): - raise FileNotFoundError( - f"MOSS-TTS repo not found: {path}. Set MOSS_TTS_DIR to the MOSS-TTS checkout root." - ) - return path - - def parse_args() -> argparse.Namespace: ap = argparse.ArgumentParser( description="Build first-class MOSS-TTS generation input (.bin) from text (+ optional reference audio)." @@ -59,7 +47,6 @@ def _read_reference_codes(args: argparse.Namespace) -> np.ndarray | None: raise ValueError("--encoder-onnx and --decoder-onnx are required when --reference-audio is set") import soundfile as sf - from moss_audio_tokenizer.onnx import OnnxAudioTokenizer wav, sr = sf.read(args.reference_audio, dtype="float32") if wav.ndim > 1: @@ -79,11 +66,6 @@ def _read_reference_codes(args: argparse.Namespace) -> np.ndarray | None: def main() -> int: args = parse_args() - sys.path.insert(0, str(resolve_moss_tts_dir())) - - from moss_tts_delay.llama_cpp._constants import AUDIO_PAD_CODE - from moss_tts_delay.llama_cpp.processor import Tokenizer, build_generation_prompt - text = _load_text(args) reference_codes = _read_reference_codes(args) diff --git a/tools/tts/moss-tts-firstclass-e2e.py b/tools/tts/moss-tts-firstclass-e2e.py index 21647373f..bf112597e 100755 --- a/tools/tts/moss-tts-firstclass-e2e.py +++ b/tools/tts/moss-tts-firstclass-e2e.py @@ -13,8 +13,9 @@ def run_cmd(cmd: list[str], env: dict[str, str] | None = None) -> subprocess.CompletedProcess: - print("+", shlex.join(cmd), flush=True) - return subprocess.run(cmd, env=env, check=False) + cmd_str = shlex.join(cmd) + print("+", cmd_str, flush=True) + return subprocess.run(cmd_str, env=env, 
check=False, shell=True) def need_file(path: Path, name: str) -> None: @@ -32,7 +33,11 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument("--model-gguf", default=os.getenv("MODEL_GGUF", "")) - parser.add_argument("--moss-tts-dir", default=os.getenv("MOSS_TTS_DIR", os.getenv("MOSS_TTS_ROOT", ""))) + parser.add_argument( + "--moss-tts-dir", + default=os.getenv("MOSS_TTS_DIR", os.getenv("MOSS_TTS_ROOT", "")), + help="Deprecated compatibility flag; the first-class helpers no longer require a MOSS-TTS checkout.", + ) parser.add_argument("--tokenizer-dir", default=os.getenv("TOKENIZER_DIR", "")) parser.add_argument("--onnx-encoder", default=os.getenv("ONNX_ENCODER", "")) parser.add_argument("--onnx-decoder", default=os.getenv("ONNX_DECODER", "")) @@ -82,7 +87,6 @@ def main() -> int: onnx_decoder = Path(args.onnx_decoder).expanduser().resolve() python_bin = Path(args.python_bin).expanduser().resolve() output_wav = Path(args.output_wav).expanduser().resolve() - moss_tts_dir = Path(args.moss_tts_dir).expanduser().resolve() if args.moss_tts_dir else None need_file(python_bin, "python binary") need_file(model_gguf, "first-class model gguf") @@ -91,8 +95,6 @@ def main() -> int: need_file(onnx_decoder, "ONNX decoder") need_file(build_ref_script, "generation-ref builder") need_file(decode_script, "audio decode helper") - if moss_tts_dir is not None and not moss_tts_dir.is_dir(): - raise FileNotFoundError(f"missing MOSS-TTS repo: {moss_tts_dir}") if args.text_file: need_file(Path(args.text_file).expanduser().resolve(), "text file") if args.reference_audio: @@ -119,12 +121,6 @@ def main() -> int: need_file(llama_bin, "llama-moss-tts binary") output_wav.parent.mkdir(parents=True, exist_ok=True) shared_env = os.environ.copy() - if moss_tts_dir is not None: - shared_env["MOSS_TTS_DIR"] = str(moss_tts_dir) - old_pythonpath = shared_env.get("PYTHONPATH") - shared_env["PYTHONPATH"] = ( - f"{moss_tts_dir}{os.pathsep}{old_pythonpath}" if old_pythonpath else str(moss_tts_dir) 
- ) with tempfile.TemporaryDirectory(prefix="moss-tts-firstclass-") as tmpdir: tmpdir_path = Path(tmpdir) diff --git a/tools/tts/moss_tts_onnx.py b/tools/tts/moss_tts_onnx.py new file mode 100644 index 000000000..6235bc2f6 --- /dev/null +++ b/tools/tts/moss_tts_onnx.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +from pathlib import Path + +import numpy as np + +N_QUANTIZERS = 32 +DOWNSAMPLE_RATE = 1920 + + +def _load_ort_session(model_path: str | Path, use_gpu: bool): + try: + import onnxruntime as ort + except ImportError as exc: + raise RuntimeError("onnxruntime is required for MOSS audio tokenizer ONNX inference") from exc + + providers = ["CPUExecutionProvider"] + if use_gpu: + available = set(ort.get_available_providers()) + if "CUDAExecutionProvider" in available: + providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] + + session_options = ort.SessionOptions() + return ort.InferenceSession(str(model_path), sess_options=session_options, providers=providers) + + +class OnnxAudioTokenizer: + """Minimal ONNX wrapper for the MOSS audio tokenizer.""" + + def __init__(self, encoder_path: str | Path, decoder_path: str | Path, use_gpu: bool = True): + self.encoder_session = _load_ort_session(encoder_path, use_gpu) + self.decoder_session = _load_ort_session(decoder_path, use_gpu) + self.encoder_inputs = [item.name for item in self.encoder_session.get_inputs()] + self.encoder_outputs = [item.name for item in self.encoder_session.get_outputs()] + self.decoder_inputs = [item.name for item in self.decoder_session.get_inputs()] + self.decoder_outputs = [item.name for item in self.decoder_session.get_outputs()] + + def encode(self, waveform: np.ndarray, n_quantizers: int = N_QUANTIZERS) -> np.ndarray: + if waveform.ndim == 1: + waveform = waveform[np.newaxis, np.newaxis, :] + elif waveform.ndim == 2: + waveform = waveform[np.newaxis, :] + + t = waveform.shape[-1] + padded = ((t + DOWNSAMPLE_RATE - 1) // DOWNSAMPLE_RATE) * DOWNSAMPLE_RATE + if padded != 
t: + waveform = np.concatenate( + [waveform, np.zeros((waveform.shape[0], waveform.shape[1], padded - t), dtype=np.float32)], + axis=-1, + ) + + result = self.encoder_session.run( + self.encoder_outputs, + { + self.encoder_inputs[0]: waveform.astype(np.float32), + self.encoder_inputs[1]: np.array(n_quantizers, dtype=np.int64), + }, + ) + return result[0][:, 0, :int(result[1][0])].T.astype(np.int64) + + def decode(self, audio_codes: np.ndarray, n_quantizers: int = N_QUANTIZERS) -> np.ndarray: + if audio_codes.ndim == 2: + if audio_codes.shape[1] == N_QUANTIZERS and audio_codes.shape[0] != N_QUANTIZERS: + audio_codes = audio_codes.T + audio_codes = audio_codes[:, np.newaxis, :] + + result = self.decoder_session.run( + self.decoder_outputs, + { + self.decoder_inputs[0]: audio_codes.astype(np.int64), + self.decoder_inputs[1]: np.array(n_quantizers, dtype=np.int64), + }, + ) + return result[0][0, 0, :int(result[1][0])].astype(np.float32) diff --git a/tools/tts/moss_tts_processor.py b/tools/tts/moss_tts_processor.py new file mode 100644 index 000000000..6f1b456e5 --- /dev/null +++ b/tools/tts/moss_tts_processor.py @@ -0,0 +1,269 @@ +from __future__ import annotations + +import logging +import re +from pathlib import Path + +import numpy as np + +log = logging.getLogger(__name__) + +AUDIO_PLACEHOLDER = "<|audio|>" + +N_VQ = 32 +PAD_TOKEN_ID = 151643 +IM_START_TOKEN_ID = 151644 +IM_END_TOKEN_ID = 151645 +AUDIO_START_TOKEN_ID = 151652 +AUDIO_END_TOKEN_ID = 151653 +AUDIO_USER_SLOT_TOKEN_ID = 151654 +AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID = 151656 +AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID = 151662 +AUDIO_PAD_CODE = 1024 +AUDIO_VOCAB_SIZE = 1024 +SAMPLE_RATE = 24000 + + +class Tokenizer: + """Thin wrapper around the Hugging Face `tokenizers` library.""" + + def __init__(self, tokenizer_dir: str | Path): + from tokenizers import Tokenizer as HFTokenizer + + tokenizer_path = Path(tokenizer_dir) / "tokenizer.json" + if not tokenizer_path.exists(): + raise FileNotFoundError(f"tokenizer.json 
not found in {tokenizer_dir}") + self._tok = HFTokenizer.from_file(str(tokenizer_path)) + log.info("Tokenizer loaded from %s (vocab=%d)", tokenizer_path, self._tok.get_vocab_size()) + + def encode(self, text: str) -> list[int]: + return self._tok.encode(text).ids + + def decode(self, ids: list[int]) -> str: + return self._tok.decode(ids) + + @property + def vocab_size(self) -> int: + return self._tok.get_vocab_size() + + def id_to_token(self, token_id: int) -> str | None: + return self._tok.id_to_token(token_id) + + +def _get_special_token_str(tokenizer: Tokenizer, token_id: int) -> str: + token = tokenizer.id_to_token(token_id) + if token is None: + raise ValueError(f"Token ID {token_id} not in vocabulary") + return token + + +def build_generation_prompt( + tokenizer: Tokenizer, + text: str, + reference_codes: np.ndarray | None = None, + instruction: str | None = None, + tokens: int | None = None, + quality: str | None = None, + language: str | None = None, + sound_event: str | None = None, + ambient_sound: str | None = None, +) -> np.ndarray: + """Build the packed multi-channel prompt as (T, 1 + N_VQ).""" + + audio_start_tok = _get_special_token_str(tokenizer, AUDIO_START_TOKEN_ID) + audio_end_tok = _get_special_token_str(tokenizer, AUDIO_END_TOKEN_ID) + gen_slot_tok = _get_special_token_str(tokenizer, AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID) + delay_slot_tok = _get_special_token_str(tokenizer, AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID) + user_slot_tok = _get_special_token_str(tokenizer, AUDIO_USER_SLOT_TOKEN_ID) + im_start_tok = _get_special_token_str(tokenizer, IM_START_TOKEN_ID) + im_end_tok = _get_special_token_str(tokenizer, IM_END_TOKEN_ID) + + has_ref = reference_codes is not None and reference_codes.shape[0] > 0 + ref_str = f"[S1]:\n{AUDIO_PLACEHOLDER}" if has_ref else "None" + + user_content = ( + f"\n" + f"- Reference(s):\n{ref_str}\n" + f"- Instruction:\n{instruction}\n" + f"- Tokens:\n{tokens}\n" + f"- Quality:\n{quality}\n" + f"- Sound Event:\n{sound_event}\n" + 
f"- Ambient Sound:\n{ambient_sound}\n" + f"- Language:\n{language}\n" + f"- Text:\n{text}\n" + f"" + ) + + ref_lengths = [reference_codes.shape[0]] if has_ref else [] + user_content = _replace_audio_placeholders( + user_content, + ref_lengths, + n_vq=N_VQ, + gen_slot_token=user_slot_tok, + delay_slot_token=user_slot_tok, + audio_start_token=audio_start_tok, + audio_end_token=audio_end_tok, + ) + + full_text = f"{im_start_tok}user\n{user_content}{im_end_tok}\n{im_start_tok}assistant\n" + ref_audio_list = [reference_codes] if has_ref else [] + unified_codes = _get_unified_codes(tokenizer, full_text, ref_audio_list) + + assistant_gen = f"{audio_start_tok}" + gen_ids = np.array(tokenizer.encode(assistant_gen), dtype=np.int64) + gen_multi = np.full((len(gen_ids), 1 + N_VQ), AUDIO_PAD_CODE, dtype=np.int64) + gen_multi[:, 0] = gen_ids + + return np.concatenate([unified_codes, gen_multi], axis=0) + + +def parse_generation_output( + tokenizer: Tokenizer, + generation_ids: np.ndarray, + prompt_len: int, +) -> tuple[str, np.ndarray]: + """Parse generated packed IDs into text and raw audio codes.""" + + gen_part = generation_ids[prompt_len:] + text_channel = gen_part[:, 0].tolist() + audio_channels = gen_part[:, 1:] + + audio_start_tok = _get_special_token_str(tokenizer, AUDIO_START_TOKEN_ID) + gen_slot_tok = _get_special_token_str(tokenizer, AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID) + delay_slot_tok = _get_special_token_str(tokenizer, AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID) + audio_end_tok = _get_special_token_str(tokenizer, AUDIO_END_TOKEN_ID) + + raw_text = tokenizer.decode(text_channel) + pattern = re.compile( + rf"(?:{re.escape(audio_start_tok)})?" 
+ rf"(?:{re.escape(gen_slot_tok)})*" + rf"(?:{re.escape(delay_slot_tok)})*" + rf"{re.escape(audio_end_tok)}" + ) + + def repl(match: re.Match[str]) -> str: + segment = match.group(0) + if gen_slot_tok in segment: + return AUDIO_PLACEHOLDER + return "" + + text = pattern.sub(repl, raw_text) + segments = extract_audio_segments(audio_channels) + if segments: + audio_codes = np.concatenate(segments, axis=0) + else: + audio_codes = np.zeros((0, N_VQ), dtype=np.int64) + + return text, audio_codes + + +def _replace_audio_placeholders( + content: str, + lengths: list[int], + n_vq: int, + gen_slot_token: str, + delay_slot_token: str, + audio_start_token: str, + audio_end_token: str, +) -> str: + num_placeholders = content.count(AUDIO_PLACEHOLDER) + if num_placeholders != len(lengths): + raise ValueError(f"Placeholder count ({num_placeholders}) != lengths count ({len(lengths)})") + + lengths_iter = iter(lengths) + + def _build_block(length: int) -> str: + if length == 0: + return f"{audio_start_token}{audio_end_token}" + step_tokens = gen_slot_token * length + delay_slot_token * (n_vq - 1) + return f"{audio_start_token}{step_tokens}{audio_end_token}" + + def replacer(match: re.Match[str]) -> str: + return _build_block(next(lengths_iter)) + + return re.sub(re.escape(AUDIO_PLACEHOLDER), replacer, content) + + +def _get_unified_codes( + tokenizer: Tokenizer, + content: str, + audio_codes_list: list[np.ndarray], + truncation: bool = False, +) -> np.ndarray: + text_ids = np.array(tokenizer.encode(content), dtype=np.int64) + + if len(audio_codes_list) == 0: + audio_channel = np.full((len(text_ids), N_VQ), AUDIO_PAD_CODE, dtype=np.int64) + return np.concatenate([text_ids[:, np.newaxis], audio_channel], axis=1) + + audio_start_indices = np.where(text_ids == AUDIO_START_TOKEN_ID)[0] + audio_end_indices = np.where(text_ids == AUDIO_END_TOKEN_ID)[0] + + if len(audio_start_indices) != len(audio_codes_list) or len(audio_end_indices) != len(audio_codes_list): + raise ValueError( + f"Audio 
def apply_de_delay_pattern(delay_codes: np.ndarray) -> np.ndarray:
    """Invert the per-channel delay pattern.

    Channel ``c`` of the delayed stream is shifted forward by ``c`` steps;
    this slices each channel back into alignment, returning a (T, n_vq)
    matrix (empty when the input is shorter than the delay span).
    """
    total_len, n_vq = delay_codes.shape
    n_frames = total_len - n_vq + 1
    if n_frames <= 0:
        return np.zeros((0, n_vq), dtype=delay_codes.dtype)
    columns = [delay_codes[ch:ch + n_frames, ch] for ch in range(n_vq)]
    return np.stack(columns, axis=1)
// Canonical 44-byte RIFF/WAVE header for mono 16-bit integer PCM.
// Fixed fields are given defaults here; save_wav16() fills in the
// size- and rate-dependent fields before writing the struct verbatim.
// NOTE(review): written with memcpy/write of the whole struct — assumes
// no padding between members (true for this layout on common ABIs).
struct wav_header {
    char riff[4] = {'R', 'I', 'F', 'F'};  // RIFF chunk id
    uint32_t chunk_size;                  // 36 + data_size
    char wave[4] = {'W', 'A', 'V', 'E'};  // RIFF form type
    char fmt[4] = {'f', 'm', 't', ' '};   // "fmt " sub-chunk id
    uint32_t fmt_chunk_size = 16;         // PCM fmt chunk payload size
    uint16_t audio_format = 1;            // 1 = integer PCM
    uint16_t num_channels = 1;            // mono output
    uint32_t sample_rate;                 // filled by the writer
    uint32_t byte_rate;                   // sample_rate * channels * bytes/sample
    uint16_t block_align;                 // channels * bytes/sample
    uint16_t bits_per_sample = 16;
    char data[4] = {'d', 'a', 't', 'a'};  // data sub-chunk id
    uint32_t data_size;                   // PCM payload size in bytes
};
std::string & audio_decoder_model_path, + const std::string & python_bin, + const std::string & helper_script, + const std::string & encoder_onnx, + const std::string & decoder_onnx, + const std::string & wav_out, + bool use_gpu_audio); +static void moss_generate_from_prompt( + const std::string & model_path, + const std::vector & prompt_packed, + size_t prompt_frames, + size_t reference_frames, + int32_t n_gpu_layers, + int32_t max_new_tokens, + const moss_sampling_config & sampling_cfg, + uint32_t seed, + const std::string & dump_raw_codes_path, + const std::string & audio_decoder_model_path, + const std::string & python_bin, + const std::string & helper_script, + const std::string & encoder_onnx, + const std::string & decoder_onnx, + const std::string & wav_out, + bool use_gpu_audio); + +static bool save_wav16(const std::string & fname, const std::vector & data, int sample_rate); +static std::vector moss_encode_audio_llama( + const std::string & audio_encoder_model_path, + const std::string & wav_path, + int32_t n_gpu_layers, + uint32_t n_quantizers, + size_t * out_frames); +static void moss_decode_audio_llama( + const std::string & audio_decoder_model_path, + const std::vector & raw_codes, + size_t raw_frames, + const moss_delay_config & cfg, + int32_t n_gpu_layers, + const std::string & wav_out_path); +static void moss_decode_audio_native( + const std::string & model_path, + const std::vector & raw_codes, + size_t raw_frames, + const moss_delay_config & cfg, + const std::string & wav_out_path); +static std::vector moss_read_wav_f32_mono(const std::string & path, int expected_sample_rate); +static moss_prompt_input moss_build_prompt_input( + const llama_vocab * vocab, + const moss_delay_config & cfg, + const std::string & text, + const std::string & language, + const std::vector & reference_codes, + size_t reference_frames); +static void moss_generate_from_text( + const std::string & model_path, + const std::string & text, + const std::string & language, + 
const std::string & reference_audio_path, + int32_t n_gpu_layers, + int32_t max_new_tokens, + const moss_sampling_config & sampling_cfg, + uint32_t seed, + const std::string & dump_raw_codes_path, + const std::string & audio_encoder_model_path, + const std::string & audio_decoder_model_path, const std::string & python_bin, const std::string & helper_script, const std::string & encoder_onnx, @@ -182,15 +272,25 @@ static void moss_generate_from_ref( struct llama_backend_scope { llama_backend_scope() { - llama_backend_init(); + if (refcount().fetch_add(1, std::memory_order_acq_rel) == 0) { + llama_backend_init(); + } } ~llama_backend_scope() { - llama_backend_free(); + if (refcount().fetch_sub(1, std::memory_order_acq_rel) == 1) { + llama_backend_free(); + } } llama_backend_scope(const llama_backend_scope &) = delete; llama_backend_scope & operator=(const llama_backend_scope &) = delete; + +private: + static std::atomic & refcount() { + static std::atomic value{0}; + return value; + } }; struct moss_owned_batch { @@ -256,9 +356,13 @@ static void print_usage(int argc, char ** argv) { LOG("\nexample usage:\n"); LOG(" %s -m model.gguf --print-delay-config\n", argv[0]); LOG(" %s -m model.gguf --generation-input generation.input.bin -ngl -1\n", argv[0]); + LOG(" %s -m model.gguf --audio-decoder-model audio_decoder.gguf --text \"你好,世界。\" --wav-out out.wav -ngl -1\n", argv[0]); + LOG(" %s -m model.gguf --audio-encoder-model audio_encoder.gguf --audio-decoder-model audio_decoder.gguf --text \"你好,世界。\" --reference-audio ref.wav --wav-out out.wav -ngl -1\n", argv[0]); LOG(" %s --decode-parity-ref decode.ref.bin\n", argv[0]); LOG("\noptions:\n"); LOG(" -ngl, --gpu-layers, --n-gpu-layers N number of layers to offload to GPU (default: -1)\n"); + LOG(" --audio-encoder-model PATH native moss-tts-audio-encoder GGUF for reference wav -> codes\n"); + LOG(" --audio-decoder-model PATH native moss-tts-audio-decoder GGUF for codes -> wav\n"); LOG("\n"); } @@ -347,6 +451,77 @@ static size_t 
moss_audio_vocab_with_pad(const moss_delay_config & cfg) { return std::max(cfg.audio_vocab_size + 1u, (size_t) cfg.audio_pad_code + 1u); } +static std::string moss_model_architecture(const llama_model * model) { + char buf[128]; + const int32_t n = llama_model_meta_val_str(model, "general.architecture", buf, sizeof(buf)); + if (n <= 0) { + throw std::runtime_error("missing general.architecture in GGUF metadata"); + } + return std::string(buf); +} + +struct moss_audio_runtime { + llama_model_ptr model; + llama_context_ptr ctx; +}; + +static llama_model_ptr moss_load_audio_model( + const std::string & model_path, + const char * expected_arch, + int32_t n_gpu_layers) { + llama_model_params mparams = llama_model_default_params(); + mparams.use_mmap = true; + mparams.n_gpu_layers = n_gpu_layers; + + llama_model_ptr model(llama_model_load_from_file(model_path.c_str(), mparams)); + if (!model) { + throw std::runtime_error("failed to load audio model: " + model_path); + } + + const std::string arch = moss_model_architecture(model.get()); + if (arch != expected_arch) { + throw std::runtime_error( + "unexpected audio model architecture for " + model_path + + ": expected " + expected_arch + ", got " + arch); + } + + return model; +} + +static llama_context_ptr moss_init_audio_context( + llama_model * model, + uint32_t n_ctx) { + llama_context_params cparams = llama_context_default_params(); + cparams.n_ctx = std::max(n_ctx, 1u); + cparams.n_batch = std::max(n_ctx, 1u); + cparams.n_ubatch = cparams.n_batch; + cparams.n_seq_max = 1; + cparams.embeddings = true; + cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; + + llama_context_ptr ctx(llama_init_from_model(model, cparams)); + if (!ctx) { + throw std::runtime_error("failed to create audio context"); + } + + llama_set_warmup(ctx.get(), false); + llama_set_causal_attn(ctx.get(), false); + llama_set_embeddings(ctx.get(), true); + + return ctx; +} + +static moss_audio_runtime moss_load_audio_runtime( + const std::string & 
model_path, + const char * expected_arch, + int32_t n_gpu_layers, + uint32_t n_ctx) { + moss_audio_runtime runtime; + runtime.model = moss_load_audio_model(model_path, expected_arch, n_gpu_layers); + runtime.ctx = moss_init_audio_context(runtime.model.get(), n_ctx); + return runtime; +} + static int64_t moss_find_last_equal(const std::vector & values, llama_token target) { for (int64_t i = (int64_t) values.size() - 1; i >= 0; --i) { if (values[(size_t) i] == target) { @@ -948,6 +1123,472 @@ static void moss_write_codes_file( moss_write_exact(out, raw_codes.data(), raw_codes.size(), "codes payload"); } +static bool save_wav16(const std::string & fname, const std::vector & data, int sample_rate) { + std::ofstream file(fname, std::ios::binary); + if (!file) { + LOG_ERR("%s: failed to open '%s' for writing\n", __func__, fname.c_str()); + return false; + } + + wav_header header; + header.sample_rate = (uint32_t) sample_rate; + header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8); + header.block_align = header.num_channels * (header.bits_per_sample / 8); + header.data_size = (uint32_t) (data.size() * (header.bits_per_sample / 8)); + header.chunk_size = 36 + header.data_size; + + file.write(reinterpret_cast(&header), sizeof(header)); + + for (const float sample : data) { + const int16_t pcm = (int16_t) std::clamp(sample * 32767.0f, -32768.0f, 32767.0f); + file.write(reinterpret_cast(&pcm), sizeof(pcm)); + } + + return file.good(); +} + +static moss_owned_batch moss_batch_from_audio_waveform(const std::vector & audio) { + moss_owned_batch owned_batch((int32_t) audio.size(), 1, 1); + llama_batch & batch = owned_batch.batch; + batch.n_tokens = (int32_t) audio.size(); + + for (size_t i = 0; i < audio.size(); ++i) { + batch.embd[i] = audio[i]; + batch.pos[i] = (llama_pos) i; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = 0; + batch.logits[i] = 1; + } + + return owned_batch; +} + +static moss_owned_batch moss_batch_from_audio_codes( + 
const std::vector & raw_codes, + size_t raw_frames, + uint32_t n_quantizers) { + GGML_ASSERT(raw_codes.size() == raw_frames * (size_t) n_quantizers); + + moss_owned_batch owned_batch((int32_t) raw_frames, 0, 1); + llama_batch & batch = owned_batch.batch; + batch.n_tokens = (int32_t) raw_frames; + batch.n_token_audio = (int32_t) n_quantizers; + owned_batch.token_audio = raw_codes; + owned_batch.refresh_token_audio_ptr(); + + for (size_t i = 0; i < raw_frames; ++i) { + batch.token[i] = 0; + batch.pos[i] = (llama_pos) i; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = 0; + batch.logits[i] = 1; + } + + return owned_batch; +} + +static std::vector moss_encode_audio_llama( + const std::string & audio_encoder_model_path, + const std::string & wav_path, + int32_t n_gpu_layers, + uint32_t n_quantizers, + size_t * out_frames) { + moss_audio_runtime runtime; + runtime.model = moss_load_audio_model( + audio_encoder_model_path, + "moss-tts-audio-encoder", + n_gpu_layers); + const int sample_rate = moss_audio_model_sample_rate(runtime.model.get()); + const uint32_t downsample_rate = moss_audio_model_downsample_rate(runtime.model.get()); + const uint32_t model_quantizers = moss_audio_model_num_quantizers(runtime.model.get()); + const uint32_t nq = n_quantizers == 0 ? 
model_quantizers : n_quantizers; + if (nq == 0 || nq > model_quantizers) { + throw std::runtime_error("invalid audio encoder quantizer count"); + } + + const std::vector wav = moss_read_wav_f32_mono(wav_path, sample_rate); + const size_t padded_samples = + ((wav.size() + (size_t) downsample_rate - 1) / (size_t) downsample_rate) * (size_t) downsample_rate; + const size_t valid_frames = wav.size() / (size_t) downsample_rate; + + if (padded_samples == 0) { + if (out_frames != nullptr) { + *out_frames = 0; + } + return {}; + } + + std::vector padded_wav(padded_samples, 0.0f); + std::copy(wav.begin(), wav.end(), padded_wav.begin()); + + runtime.ctx = moss_init_audio_context(runtime.model.get(), (uint32_t) padded_samples); + + moss_owned_batch batch = moss_batch_from_audio_waveform(padded_wav); + const int ret = llama_encode(runtime.ctx.get(), batch.batch); + if (ret != 0) { + throw std::runtime_error("audio encoder llama_encode failed: " + std::to_string(ret)); + } + + const int32_t n_out_i32 = llama_model_n_out_i32(runtime.model.get()); + const size_t padded_frames = padded_samples / (size_t) downsample_rate; + const int32_t * codes_i32 = llama_get_output_i32(runtime.ctx.get()); + if (codes_i32 == nullptr) { + throw std::runtime_error("audio encoder returned null raw i32 outputs"); + } + + if (n_out_i32 != (int32_t) nq) { + throw std::runtime_error("audio encoder raw i32 width does not match quantizer count"); + } + + std::vector codes(padded_frames * (size_t) nq); + for (size_t t = 0; t < padded_frames; ++t) { + const int32_t * row = codes_i32 + t * (size_t) n_out_i32; + std::copy_n(row, nq, codes.data() + t * (size_t) nq); + } + + if (out_frames != nullptr) { + *out_frames = valid_frames; + } + if (valid_frames >= padded_frames) { + return codes; + } + + std::vector trimmed(valid_frames * (size_t) nq); + for (size_t t = 0; t < valid_frames; ++t) { + std::copy_n( + codes.data() + t * (size_t) nq, + nq, + trimmed.data() + t * (size_t) nq); + } + return trimmed; +} + 
// Decode raw audio codes to PCM by running the standalone
// moss-tts-audio-decoder GGUF through llama_encode(), then write the
// result as a mono 16-bit WAV.
//
// `raw_codes` is a row-major (raw_frames x cfg.n_vq) matrix of codebook
// indices. Throws std::runtime_error on quantizer-count mismatch, payload
// size mismatch, inference failure, or WAV write failure. An empty input
// (raw_frames == 0) still writes a valid zero-sample WAV.
static void moss_decode_audio_llama(
    const std::string & audio_decoder_model_path,
    const std::vector<int32_t> & raw_codes,
    size_t raw_frames,
    const moss_delay_config & cfg,
    int32_t n_gpu_layers,
    const std::string & wav_out_path) {
    // context sized to take every frame in one batch (min 1 so init succeeds)
    moss_audio_runtime runtime = moss_load_audio_runtime(
        audio_decoder_model_path,
        "moss-tts-audio-decoder",
        n_gpu_layers,
        std::max((uint32_t) raw_frames, 1u));

    const int sample_rate = moss_audio_model_sample_rate(runtime.model.get());
    const uint32_t downsample_rate = moss_audio_model_downsample_rate(runtime.model.get());
    const uint32_t model_quantizers = moss_audio_model_num_quantizers(runtime.model.get());
    // the caller's delay config must agree with the codec's quantizer count
    if (cfg.n_vq != model_quantizers) {
        throw std::runtime_error(
            "audio decoder quantizer count mismatch: model expects " +
            std::to_string(model_quantizers) + ", got " + std::to_string(cfg.n_vq));
    }
    if (raw_codes.size() != raw_frames * (size_t) cfg.n_vq) {
        throw std::runtime_error("audio decoder raw code payload size mismatch");
    }

    std::vector<float> audio;
    if (raw_frames > 0) {
        moss_owned_batch batch = moss_batch_from_audio_codes(raw_codes, raw_frames, cfg.n_vq);
        const int ret = llama_encode(runtime.ctx.get(), batch.batch);
        if (ret != 0) {
            throw std::runtime_error("audio decoder llama_encode failed: " + std::to_string(ret));
        }

        // the decoder's "embedding" output is the waveform itself, one
        // float per sample, so its embedding width must be exactly 1
        const int32_t n_embd_out = llama_model_n_embd_out(runtime.model.get());
        if (n_embd_out != 1) {
            throw std::runtime_error("audio decoder output dimension must be 1");
        }

        // each code frame expands to downsample_rate PCM samples
        const size_t n_samples = raw_frames * (size_t) downsample_rate;
        const float * embd = llama_get_embeddings(runtime.ctx.get());
        if (embd == nullptr) {
            throw std::runtime_error("audio decoder returned null embeddings");
        }
        audio.assign(embd, embd + n_samples);
    }

    if (!save_wav16(wav_out_path, audio, sample_rate)) {
        throw std::runtime_error("failed to write WAV file: " + wav_out_path);
    }
}
std::vector & raw_codes, + size_t raw_frames, + const moss_delay_config & cfg, + const std::string & wav_out_path) { + moss_audio_tokenizer_options codec_opts; + codec_opts.n_threads = cpu_get_num_math(); + + moss_audio_tokenizer codec(model_path, codec_opts); + const std::vector audio = codec.decode(raw_codes, raw_frames, cfg.n_vq); + + if (!save_wav16(wav_out_path, audio, codec.sample_rate())) { + throw std::runtime_error("failed to write WAV file: " + wav_out_path); + } +} + +static std::vector moss_read_wav_f32_mono(const std::string & path, int expected_sample_rate) { + std::ifstream in(path, std::ios::binary); + if (!in) { + throw std::runtime_error("failed to open wav file: " + path); + } + + auto read_u16 = [&](uint16_t & value) { + in.read(reinterpret_cast(&value), sizeof(value)); + if (!in) { + throw std::runtime_error("failed to read wav u16 field"); + } + }; + auto read_u32 = [&](uint32_t & value) { + in.read(reinterpret_cast(&value), sizeof(value)); + if (!in) { + throw std::runtime_error("failed to read wav u32 field"); + } + }; + + char riff[4]; + char wave[4]; + uint32_t chunk_size = 0; + in.read(riff, 4); + read_u32(chunk_size); + in.read(wave, 4); + if (!in || std::memcmp(riff, "RIFF", 4) != 0 || std::memcmp(wave, "WAVE", 4) != 0) { + throw std::runtime_error("unsupported wav header: " + path); + } + + uint16_t audio_format = 0; + uint16_t num_channels = 0; + uint32_t sample_rate = 0; + uint16_t bits_per_sample = 0; + std::vector data_chunk; + + while (in) { + char chunk_id[4]; + uint32_t chunk_bytes = 0; + in.read(chunk_id, 4); + if (!in) { + break; + } + read_u32(chunk_bytes); + + if (std::memcmp(chunk_id, "fmt ", 4) == 0) { + uint32_t byte_rate = 0; + uint16_t block_align = 0; + read_u16(audio_format); + read_u16(num_channels); + read_u32(sample_rate); + read_u32(byte_rate); + read_u16(block_align); + read_u16(bits_per_sample); + + const size_t fmt_extra = chunk_bytes > 16 ? 
chunk_bytes - 16 : 0; + if (fmt_extra > 0) { + in.seekg((std::streamoff) fmt_extra, std::ios::cur); + } + } else if (std::memcmp(chunk_id, "data", 4) == 0) { + data_chunk.resize(chunk_bytes); + in.read(reinterpret_cast(data_chunk.data()), (std::streamsize) chunk_bytes); + } else { + in.seekg((std::streamoff) chunk_bytes, std::ios::cur); + } + + if (chunk_bytes & 1u) { + in.seekg(1, std::ios::cur); + } + } + + if (audio_format == 0 || num_channels == 0 || sample_rate == 0 || bits_per_sample == 0 || data_chunk.empty()) { + throw std::runtime_error("incomplete wav metadata: " + path); + } + if ((int) sample_rate != expected_sample_rate) { + throw std::runtime_error("reference wav sample rate must be " + std::to_string(expected_sample_rate)); + } + if (audio_format != 1 && audio_format != 3) { + throw std::runtime_error("only PCM16/PCM32-float wav is supported: " + path); + } + + const size_t bytes_per_sample = bits_per_sample / 8u; + if (bytes_per_sample == 0 || data_chunk.size() % (bytes_per_sample * num_channels) != 0) { + throw std::runtime_error("invalid wav data chunk size: " + path); + } + + const size_t n_frames = data_chunk.size() / (bytes_per_sample * num_channels); + std::vector mono(n_frames, 0.0f); + + for (size_t i = 0; i < n_frames; ++i) { + float acc = 0.0f; + for (uint16_t ch = 0; ch < num_channels; ++ch) { + const uint8_t * src = data_chunk.data() + (i * (size_t) num_channels + ch) * bytes_per_sample; + float sample = 0.0f; + if (audio_format == 1 && bits_per_sample == 16) { + int16_t v = 0; + std::memcpy(&v, src, sizeof(v)); + sample = (float) v / 32768.0f; + } else if (audio_format == 3 && bits_per_sample == 32) { + std::memcpy(&sample, src, sizeof(sample)); + } else { + throw std::runtime_error("unsupported wav sample encoding: " + path); + } + acc += sample; + } + mono[i] = acc / (float) num_channels; + } + + return mono; +} + +static moss_prompt_input moss_build_prompt_input( + const llama_vocab * vocab, + const moss_delay_config & cfg, + const 
std::string & text, + const std::string & language, + const std::vector & reference_codes, + size_t reference_frames) { + const std::string audio_start_tok = common_token_to_piece(vocab, cfg.audio_start_token_id, true); + const std::string audio_end_tok = common_token_to_piece(vocab, cfg.audio_end_token_id, true); + const std::string user_slot_tok = common_token_to_piece(vocab, cfg.audio_user_slot_token_id, true); + const std::string im_start_tok = common_token_to_piece(vocab, cfg.im_start_token_id, true); + const std::string im_end_tok = common_token_to_piece(vocab, cfg.im_end_token_id, true); + + const auto replace_audio_placeholders = [&]( + const std::string & content, + const std::vector & lengths) -> std::string { + size_t pos = 0; + size_t length_idx = 0; + std::string out; + + while (true) { + const size_t ph = content.find(MOSS_AUDIO_PLACEHOLDER, pos); + if (ph == std::string::npos) { + out.append(content, pos, std::string::npos); + break; + } + + out.append(content, pos, ph - pos); + if (length_idx >= lengths.size()) { + throw std::runtime_error("audio placeholder count does not match reference length count"); + } + + const size_t length = lengths[length_idx++]; + out += audio_start_tok; + if (length > 0) { + for (size_t i = 0; i < length; ++i) { + out += user_slot_tok; + } + for (size_t i = 1; i < cfg.n_vq; ++i) { + out += user_slot_tok; + } + } + out += audio_end_tok; + pos = ph + std::strlen(MOSS_AUDIO_PLACEHOLDER); + } + + if (length_idx != lengths.size()) { + throw std::runtime_error("unused reference audio lengths while replacing placeholders"); + } + + return out; + }; + + const auto build_unified_codes = [&]( + const std::string & content, + const std::vector> & audio_codes_list, + const std::vector & audio_frames_list) -> std::vector { + const std::vector text_ids = common_tokenize(vocab, content, false, true); + if (audio_codes_list.empty()) { + std::vector packed(text_ids.size() * cfg.packed_stride(), cfg.audio_pad_code); + for (size_t i = 0; i 
< text_ids.size(); ++i) { + packed[i * cfg.packed_stride()] = text_ids[i]; + } + return packed; + } + + std::vector audio_start_indices; + std::vector audio_end_indices; + for (size_t i = 0; i < text_ids.size(); ++i) { + if (text_ids[i] == cfg.audio_start_token_id) { + audio_start_indices.push_back(i); + } + if (text_ids[i] == cfg.audio_end_token_id) { + audio_end_indices.push_back(i); + } + } + + if (audio_start_indices.size() != audio_codes_list.size() || audio_end_indices.size() != audio_codes_list.size()) { + throw std::runtime_error("audio marker count does not match reference audio count"); + } + + std::vector delay_audio; + size_t prefix_idx = 0; + for (size_t i = 0; i < audio_codes_list.size(); ++i) { + const size_t start_idx = audio_start_indices[i]; + const size_t end_idx = audio_end_indices[i]; + const std::vector delayed = moss_apply_delay_pattern(audio_codes_list[i], audio_frames_list[i], cfg); + + const size_t pad_before_rows = start_idx - prefix_idx + 1; + delay_audio.insert(delay_audio.end(), pad_before_rows * cfg.n_vq, cfg.audio_pad_code); + delay_audio.insert(delay_audio.end(), delayed.begin(), delayed.end()); + prefix_idx = end_idx; + } + + const size_t last_end = audio_end_indices.back(); + const size_t pad_after_rows = text_ids.size() - last_end; + delay_audio.insert(delay_audio.end(), pad_after_rows * cfg.n_vq, cfg.audio_pad_code); + + const size_t delay_rows = delay_audio.size() / cfg.n_vq; + const size_t text_rows = std::min(text_ids.size(), delay_rows); + std::vector packed(text_rows * cfg.packed_stride(), cfg.audio_pad_code); + for (size_t row = 0; row < text_rows; ++row) { + packed[row * cfg.packed_stride()] = text_ids[row]; + std::copy_n( + delay_audio.data() + row * cfg.n_vq, + cfg.n_vq, + packed.data() + row * cfg.packed_stride() + 1); + } + return packed; + }; + + const bool has_ref = reference_frames > 0; + const std::string ref_str = has_ref ? 
"[S1]:\n<|audio|>" : "None"; + const std::string user_content = + "\n" + "- Reference(s):\n" + ref_str + "\n" + "- Instruction:\nNone\n" + "- Tokens:\nNone\n" + "- Quality:\nNone\n" + "- Sound Event:\nNone\n" + "- Ambient Sound:\nNone\n" + "- Language:\n" + language + "\n" + "- Text:\n" + text + "\n" + ""; + + const std::vector ref_lengths = has_ref ? std::vector { reference_frames } : std::vector {}; + const std::string replaced = replace_audio_placeholders(user_content, ref_lengths); + const std::string full_text = im_start_tok + "user\n" + replaced + im_end_tok + "\n" + im_start_tok + "assistant\n"; + + std::vector> ref_list; + std::vector ref_frames_list; + if (has_ref) { + ref_list.push_back(reference_codes); + ref_frames_list.push_back(reference_frames); + } + + moss_prompt_input out; + out.packed_ids = build_unified_codes(full_text, ref_list, ref_frames_list); + out.prompt_frames = out.packed_ids.size() / cfg.packed_stride(); + out.reference_frames = reference_frames; + + out.packed_ids.push_back(cfg.audio_start_token_id); + out.packed_ids.insert(out.packed_ids.end(), cfg.n_vq, cfg.audio_pad_code); + out.prompt_frames += 1; + + return out; +} + static int moss_run_audio_decoder_helper( const std::string & python_bin, const std::string & helper_script, @@ -973,8 +1614,11 @@ static int moss_run_audio_decoder_helper( } static bool moss_decode_parity( + const std::string & model_path, const std::string & ref_path, const std::string & dump_codes_path, + const std::string & audio_decoder_model_path, + int32_t n_gpu_layers, const std::string & python_bin, const std::string & helper_script, const std::string & encoder_onnx, @@ -1026,7 +1670,36 @@ static bool moss_decode_parity( moss_write_codes_file(dump_codes_path, decoded.raw_codes, decoded.raw_frames, cfg); } - if (!helper_script.empty()) { + if (!wav_out.empty()) { + if (!helper_script.empty()) { + if (dump_codes_path.empty()) { + throw std::runtime_error("--audio-decoder-script requires --dump-raw-codes"); + } 
+ if (encoder_onnx.empty() || decoder_onnx.empty()) { + throw std::runtime_error("--audio-decoder-script requires both --audio-encoder-onnx and --audio-decoder-onnx"); + } + + const int rc = moss_run_audio_decoder_helper( + python_bin, helper_script, dump_codes_path, wav_out, + encoder_onnx, decoder_onnx, use_gpu_audio); + if (rc != 0) { + throw std::runtime_error("audio decoder helper failed with exit code " + std::to_string(rc)); + } + } else if (!audio_decoder_model_path.empty()) { + moss_decode_audio_llama( + audio_decoder_model_path, + decoded.raw_codes, + decoded.raw_frames, + cfg, + n_gpu_layers, + wav_out); + } else { + if (model_path.empty()) { + throw std::runtime_error("--wav-out requires either --audio-decoder-model, --audio-decoder-script, or -m with bundled codec"); + } + moss_decode_audio_native(model_path, decoded.raw_codes, decoded.raw_frames, cfg, wav_out); + } + } else if (!helper_script.empty()) { if (dump_codes_path.empty()) { throw std::runtime_error("--audio-decoder-script requires --dump-raw-codes"); } @@ -1090,6 +1763,7 @@ static void moss_generate_from_ref( const moss_sampling_config & sampling_cfg, uint32_t seed, const std::string & dump_raw_codes_path, + const std::string & audio_decoder_model_path, const std::string & python_bin, const std::string & helper_script, const std::string & encoder_onnx, @@ -1116,100 +1790,171 @@ static void moss_generate_from_ref( moss_read_exact(in, prompt_packed.data(), prompt_packed.size(), "prompt packed ids"); moss_read_exact(in, ignored_ref_raw_codes.data(), ignored_ref_raw_codes.size(), "reference raw codes"); - llama_backend_scope backend_scope; + moss_generate_from_prompt( + model_path, + prompt_packed, + hdr.prompt_frames, + hdr.raw_frames, + n_gpu_layers, + max_new_tokens, + sampling_cfg, + seed, + dump_raw_codes_path, + audio_decoder_model_path, + python_bin, + helper_script, + encoder_onnx, + decoder_onnx, + wav_out, + use_gpu_audio); +} - llama_model_params mparams = 
llama_model_default_params(); - mparams.use_mmap = true; - mparams.n_gpu_layers = n_gpu_layers; +static void moss_generate_from_prompt( + const std::string & model_path, + const std::vector & prompt_packed, + size_t prompt_frames, + size_t reference_frames, + int32_t n_gpu_layers, + int32_t max_new_tokens, + const moss_sampling_config & sampling_cfg, + uint32_t seed, + const std::string & dump_raw_codes_path, + const std::string & audio_decoder_model_path, + const std::string & python_bin, + const std::string & helper_script, + const std::string & encoder_onnx, + const std::string & decoder_onnx, + const std::string & wav_out, + bool use_gpu_audio) { + moss_delay_config cfg; + cfg.n_vq = MOSS_DELAY_DEFAULT_N_VQ; + cfg.audio_pad_code = MOSS_DELAY_DEFAULT_AUDIO_PAD_CODE; - llama_model_ptr model(llama_model_load_from_file(model_path.c_str(), mparams)); - if (!model) { - throw std::runtime_error("failed to load model: " + model_path); - } + llama_backend_scope backend_scope; + moss_generation_audio decoded; + size_t generated_frames = 0; - const llama_vocab * vocab = llama_model_get_vocab(model.get()); - const int32_t text_vocab = llama_vocab_n_tokens(vocab); - const moss_delay_config model_cfg = moss_delay_config_from_model(model.get()); + { + llama_model_params mparams = llama_model_default_params(); + mparams.use_mmap = true; + mparams.n_gpu_layers = n_gpu_layers; - if (model_cfg.n_vq != cfg.n_vq) { - throw std::runtime_error("generation reference n_vq does not match model metadata"); - } - cfg.audio_vocab_size = model_cfg.audio_vocab_size; + llama_model_ptr model(llama_model_load_from_file(model_path.c_str(), mparams)); + if (!model) { + throw std::runtime_error("failed to load model: " + model_path); + } - llama_context_params cparams = llama_context_default_params(); - cparams.n_ctx = std::max((uint32_t) hdr.prompt_frames + (uint32_t) max_new_tokens + 8u, 64u); - cparams.n_batch = std::max((uint32_t) hdr.prompt_frames, 1u); - cparams.n_ubatch = cparams.n_batch; - 
cparams.n_seq_max = 1; - cparams.embeddings = false; + const llama_vocab * vocab = llama_model_get_vocab(model.get()); + const int32_t text_vocab = llama_vocab_n_tokens(vocab); + const moss_delay_config model_cfg = moss_delay_config_from_model(model.get()); - llama_context_ptr ctx(llama_init_from_model(model.get(), cparams)); - if (!ctx) { - throw std::runtime_error("failed to create context"); - } + cfg = model_cfg; + if (prompt_packed.size() % cfg.packed_stride() != 0) { + throw std::runtime_error("prompt packed input does not match model n_vq"); + } - llama_set_warmup(ctx.get(), false); - llama_set_causal_attn(ctx.get(), true); - llama_set_embeddings(ctx.get(), false); + llama_context_params cparams = llama_context_default_params(); + cparams.n_ctx = std::max((uint32_t) prompt_frames + (uint32_t) max_new_tokens + 8u, 64u); + cparams.n_batch = std::max((uint32_t) prompt_frames, 1u); + cparams.n_ubatch = cparams.n_batch; + cparams.n_seq_max = 1; + cparams.embeddings = false; - { - moss_owned_batch batch = moss_batch_from_packed_rows( - prompt_packed, 0, hdr.prompt_frames, cfg, 0, true); - const int ret = llama_decode(ctx.get(), batch.batch); - if (ret != 0) { - throw std::runtime_error("prefill llama_decode failed: " + std::to_string(ret)); + llama_context_ptr ctx(llama_init_from_model(model.get(), cparams)); + if (!ctx) { + throw std::runtime_error("failed to create context"); } - } - moss_delay_state state = moss_init_delay_state(prompt_packed, cfg); + llama_set_warmup(ctx.get(), false); + llama_set_causal_attn(ctx.get(), true); + llama_set_embeddings(ctx.get(), false); - std::vector generated_packed; - generated_packed.reserve((size_t) max_new_tokens * cfg.packed_stride()); + { + moss_owned_batch batch = moss_batch_from_packed_rows( + prompt_packed, 0, prompt_frames, cfg, 0, true); + const int ret = llama_decode(ctx.get(), batch.batch); + if (ret != 0) { + throw std::runtime_error("prefill llama_decode failed: " + std::to_string(ret)); + } + } - const size_t 
audio_vocab = moss_audio_vocab_with_pad(cfg); - moss_rng rng(seed); + moss_delay_state state = moss_init_delay_state(prompt_packed, cfg); - for (int32_t step = 0; step < max_new_tokens; ++step) { - const float * logits = llama_get_logits_ith(ctx.get(), -1); - if (logits == nullptr) { - throw std::runtime_error("llama_get_logits_ith returned null"); - } + std::vector generated_packed; + generated_packed.reserve((size_t) max_new_tokens * cfg.packed_stride()); - std::vector text_logits(logits, logits + text_vocab); - std::vector audio_logits( - logits + text_vocab, - logits + text_vocab + cfg.n_vq * audio_vocab); + const size_t audio_vocab = moss_audio_vocab_with_pad(cfg); + moss_rng rng(seed); - const std::vector next = moss_delay_step( - state, text_logits, audio_logits, sampling_cfg, cfg, rng); - generated_packed.insert(generated_packed.end(), next.begin(), next.end()); + for (int32_t step = 0; step < max_new_tokens; ++step) { + const float * logits = llama_get_logits_ith(ctx.get(), -1); + if (logits == nullptr) { + throw std::runtime_error("llama_get_logits_ith returned null"); + } - moss_owned_batch batch = moss_batch_from_packed_rows( - generated_packed, generated_packed.size() / cfg.packed_stride() - 1, 1, cfg, - hdr.prompt_frames + (size_t) step, true); - const int ret = llama_decode(ctx.get(), batch.batch); - if (ret != 0) { - throw std::runtime_error("generation llama_decode failed: " + std::to_string(ret)); - } + std::vector text_logits(logits, logits + text_vocab); + std::vector audio_logits( + logits + text_vocab, + logits + text_vocab + cfg.n_vq * audio_vocab); + + const std::vector next = moss_delay_step( + state, text_logits, audio_logits, sampling_cfg, cfg, rng); + generated_packed.insert(generated_packed.end(), next.begin(), next.end()); + + moss_owned_batch batch = moss_batch_from_packed_rows( + generated_packed, generated_packed.size() / cfg.packed_stride() - 1, 1, cfg, + prompt_frames + (size_t) step, true); + const int ret = 
llama_decode(ctx.get(), batch.batch); + if (ret != 0) { + throw std::runtime_error("generation llama_decode failed: " + std::to_string(ret)); + } - if (state.is_stopping) { - break; + if (state.is_stopping) { + break; + } } - } - const moss_generation_audio decoded = moss_decode_generation_audio(state, hdr.prompt_frames, cfg); + generated_frames = generated_packed.size() / cfg.packed_stride(); + decoded = moss_decode_generation_audio(state, prompt_frames, cfg); + } LOG("moss-tts first-class generation: prompt_frames=%u generated_frames=%zu raw_frames=%zu input_ref_raw_frames=%u\n", - hdr.prompt_frames, - generated_packed.size() / cfg.packed_stride(), + (uint32_t) prompt_frames, + generated_frames, decoded.raw_frames, - hdr.raw_frames); + (uint32_t) reference_frames); if (!dump_raw_codes_path.empty()) { moss_write_codes_file(dump_raw_codes_path, decoded.raw_codes, decoded.raw_frames, cfg); } - if (!helper_script.empty()) { + if (!wav_out.empty()) { + if (!helper_script.empty()) { + if (dump_raw_codes_path.empty()) { + throw std::runtime_error("--audio-decoder-script requires --dump-raw-codes"); + } + if (encoder_onnx.empty() || decoder_onnx.empty()) { + throw std::runtime_error("--audio-decoder-script requires both ONNX paths"); + } + + const int rc = moss_run_audio_decoder_helper( + python_bin, helper_script, dump_raw_codes_path, wav_out, + encoder_onnx, decoder_onnx, use_gpu_audio); + if (rc != 0) { + throw std::runtime_error("audio decoder helper failed with exit code " + std::to_string(rc)); + } + } else if (!audio_decoder_model_path.empty()) { + moss_decode_audio_llama( + audio_decoder_model_path, + decoded.raw_codes, + decoded.raw_frames, + cfg, + n_gpu_layers, + wav_out); + } else { + moss_decode_audio_native(model_path, decoded.raw_codes, decoded.raw_frames, cfg, wav_out); + } + } else if (!helper_script.empty()) { if (dump_raw_codes_path.empty()) { throw std::runtime_error("--audio-decoder-script requires --dump-raw-codes"); } @@ -1229,6 +1974,83 @@ static 
void moss_generate_from_ref( } } +static void moss_generate_from_text( + const std::string & model_path, + const std::string & text, + const std::string & language, + const std::string & reference_audio_path, + int32_t n_gpu_layers, + int32_t max_new_tokens, + const moss_sampling_config & sampling_cfg, + uint32_t seed, + const std::string & dump_raw_codes_path, + const std::string & audio_encoder_model_path, + const std::string & audio_decoder_model_path, + const std::string & python_bin, + const std::string & helper_script, + const std::string & encoder_onnx, + const std::string & decoder_onnx, + const std::string & wav_out, + bool use_gpu_audio) { + std::vector reference_codes; + size_t reference_frames = 0; + moss_prompt_input prompt; + + { + llama_backend_scope backend_scope; + + llama_model_params mparams = llama_model_default_params(); + mparams.use_mmap = true; + mparams.vocab_only = true; + + llama_model_ptr model(llama_model_load_from_file(model_path.c_str(), mparams)); + if (!model) { + throw std::runtime_error("failed to load vocab-only model: " + model_path); + } + + const llama_vocab * vocab = llama_model_get_vocab(model.get()); + const moss_delay_config cfg = moss_delay_config_from_model(model.get()); + + if (!reference_audio_path.empty()) { + if (!audio_encoder_model_path.empty()) { + reference_codes = moss_encode_audio_llama( + audio_encoder_model_path, + reference_audio_path, + n_gpu_layers, + cfg.n_vq, + &reference_frames); + } else { + moss_audio_tokenizer_options codec_opts; + codec_opts.n_threads = cpu_get_num_math(); + moss_audio_tokenizer codec(model_path, codec_opts); + const std::vector wav = moss_read_wav_f32_mono(reference_audio_path, codec.sample_rate()); + reference_codes = codec.encode(wav, &reference_frames, cfg.n_vq); + } + } + + prompt = moss_build_prompt_input( + vocab, cfg, text, language, reference_codes, reference_frames); + } + + moss_generate_from_prompt( + model_path, + prompt.packed_ids, + prompt.prompt_frames, + 
prompt.reference_frames, + n_gpu_layers, + max_new_tokens, + sampling_cfg, + seed, + dump_raw_codes_path, + audio_decoder_model_path, + python_bin, + helper_script, + encoder_onnx, + decoder_onnx, + wav_out, + use_gpu_audio); +} + static std::vector moss_audio_history_slice( const moss_delay_state & state, size_t start_frame, @@ -1530,7 +2352,13 @@ int main(int argc, char ** argv) { std::string model_path; std::string decode_parity_ref_path; std::string generation_input_path; + std::string text; + std::string text_file_path; + std::string reference_audio_path; + std::string language = "zh"; std::string dump_raw_codes_path; + std::string audio_encoder_model_path; + std::string audio_decoder_model_path; std::string audio_decoder_script; std::string audio_encoder_onnx; std::string audio_decoder_onnx; @@ -1554,6 +2382,22 @@ int main(int argc, char ** argv) { generation_input_path = argv[++i]; continue; } + if (arg == "--text" && i + 1 < argc) { + text = argv[++i]; + continue; + } + if (arg == "--text-file" && i + 1 < argc) { + text_file_path = argv[++i]; + continue; + } + if (arg == "--reference-audio" && i + 1 < argc) { + reference_audio_path = argv[++i]; + continue; + } + if (arg == "--language" && i + 1 < argc) { + language = argv[++i]; + continue; + } if (arg == "--generation-ref" && i + 1 < argc) { generation_input_path = argv[++i]; LOG("warning: --generation-ref is deprecated; use --generation-input instead.\n"); @@ -1579,6 +2423,14 @@ int main(int argc, char ** argv) { dump_raw_codes_path = argv[++i]; continue; } + if (arg == "--audio-encoder-model" && i + 1 < argc) { + audio_encoder_model_path = argv[++i]; + continue; + } + if (arg == "--audio-decoder-model" && i + 1 < argc) { + audio_decoder_model_path = argv[++i]; + continue; + } if (arg == "--audio-decoder-script" && i + 1 < argc) { audio_decoder_script = argv[++i]; continue; @@ -1657,11 +2509,17 @@ int main(int argc, char ** argv) { LOG("moss delay state self-test: ok\n"); } + llama_backend_scope 
backend_scope; + if (!generation_input_path.empty()) { if (model_path.empty()) { LOG_ERR("--generation-input requires -m \n"); return EXIT_FAILURE; } + if (!text.empty() || !text_file_path.empty()) { + LOG_ERR("--generation-input cannot be combined with --text/--text-file\n"); + return EXIT_FAILURE; + } try { moss_generate_from_ref( model_path, @@ -1671,6 +2529,7 @@ int main(int argc, char ** argv) { sampling_cfg, seed, dump_raw_codes_path, + audio_decoder_model_path, python_bin, audio_decoder_script, audio_encoder_onnx, @@ -1684,11 +2543,55 @@ int main(int argc, char ** argv) { } } + if (!text.empty() || !text_file_path.empty()) { + if (model_path.empty()) { + LOG_ERR("--text/--text-file requires -m \n"); + return EXIT_FAILURE; + } + try { + std::string input_text = text; + if (!text_file_path.empty()) { + std::ifstream in(text_file_path); + if (!in) { + throw std::runtime_error("failed to open text file: " + text_file_path); + } + std::ostringstream ss; + ss << in.rdbuf(); + input_text = ss.str(); + } + moss_generate_from_text( + model_path, + input_text, + language, + reference_audio_path, + n_gpu_layers, + max_new_tokens, + sampling_cfg, + seed, + dump_raw_codes_path, + audio_encoder_model_path, + audio_decoder_model_path, + python_bin, + audio_decoder_script, + audio_encoder_onnx, + audio_decoder_onnx, + wav_out_path, + use_gpu_audio); + return EXIT_SUCCESS; + } catch (const std::exception & err) { + LOG_ERR("text generation failed: %s\n", err.what()); + return EXIT_FAILURE; + } + } + if (!decode_parity_ref_path.empty()) { try { const bool ok = moss_decode_parity( + model_path, decode_parity_ref_path, dump_raw_codes_path, + audio_decoder_model_path, + n_gpu_layers, python_bin, audio_decoder_script, audio_encoder_onnx, @@ -1706,9 +2609,11 @@ int main(int argc, char ** argv) { if (self_test) { return EXIT_SUCCESS; } - LOG("moss delay state, multi-head sampler, and raw-code decode are in place; audio decode is available via the external Python/ONNX helper.\n"); + 
LOG("moss delay state, multi-head sampler, raw-code decode, and native audio encode/decode helpers are available.\n"); LOG("use --print-delay-config with -m to inspect model metadata.\n"); LOG("use --decode-parity-ref to verify C++ de-delay/raw-code extraction against Python.\n"); + LOG("use --text -m --audio-decoder-model --wav-out out.wav for native generation.\n"); + LOG("use --text --reference-audio ref.wav -m --audio-encoder-model --audio-decoder-model --wav-out out.wav for native voice cloning.\n"); LOG("use --generation-input -m for first-class generation.\n"); return EXIT_SUCCESS; } @@ -1718,8 +2623,6 @@ int main(int argc, char ** argv) { return EXIT_FAILURE; } - llama_backend_scope backend_scope; - llama_model_params mparams = llama_model_default_params(); mparams.use_mmap = true; mparams.n_gpu_layers = n_gpu_layers; From b59fdb941a846bb3e9904682ab8cc974cca8b0b8 Mon Sep 17 00:00:00 2001 From: CHiSwsz Date: Tue, 7 Apr 2026 21:18:00 +0800 Subject: [PATCH 2/3] Fix MOSS TTS processor typing --- tools/tts/moss_tts_processor.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/tts/moss_tts_processor.py b/tools/tts/moss_tts_processor.py index 6f1b456e5..d8f264680 100644 --- a/tools/tts/moss_tts_processor.py +++ b/tools/tts/moss_tts_processor.py @@ -78,7 +78,8 @@ def build_generation_prompt( im_start_tok = _get_special_token_str(tokenizer, IM_START_TOKEN_ID) im_end_tok = _get_special_token_str(tokenizer, IM_END_TOKEN_ID) - has_ref = reference_codes is not None and reference_codes.shape[0] > 0 + ref_frame_count = int(reference_codes.shape[0]) if reference_codes is not None else 0 + has_ref = ref_frame_count > 0 ref_str = f"[S1]:\n{AUDIO_PLACEHOLDER}" if has_ref else "None" user_content = ( @@ -94,7 +95,7 @@ def build_generation_prompt( f"" ) - ref_lengths = [reference_codes.shape[0]] if has_ref else [] + ref_lengths = [ref_frame_count] if has_ref else [] user_content = _replace_audio_placeholders( user_content, ref_lengths, @@ 
-106,7 +107,10 @@ def build_generation_prompt( ) full_text = f"{im_start_tok}user\n{user_content}{im_end_tok}\n{im_start_tok}assistant\n" - ref_audio_list = [reference_codes] if has_ref else [] + ref_audio_list: list[np.ndarray] = [] + if has_ref: + assert reference_codes is not None + ref_audio_list.append(reference_codes) unified_codes = _get_unified_codes(tokenizer, full_text, ref_audio_list) assistant_gen = f"{audio_start_tok}" From b785003ba497794ecfa337c3e47f01af79489888 Mon Sep 17 00:00:00 2001 From: CHiSwsz Date: Wed, 8 Apr 2026 13:06:34 +0800 Subject: [PATCH 3/3] Remove bundled MOSS audio tokenizer fallback --- include/llama-moss-audio-tokenizer.h | 61 -- src/CMakeLists.txt | 2 - src/models/moss-audio-tokenizer.cpp | 1205 -------------------------- tools/tts/run-moss-tts-delay.cpp | 84 +- 4 files changed, 41 insertions(+), 1311 deletions(-) delete mode 100644 include/llama-moss-audio-tokenizer.h delete mode 100644 src/models/moss-audio-tokenizer.cpp diff --git a/include/llama-moss-audio-tokenizer.h b/include/llama-moss-audio-tokenizer.h deleted file mode 100644 index 5e0326787..000000000 --- a/include/llama-moss-audio-tokenizer.h +++ /dev/null @@ -1,61 +0,0 @@ -#pragma once - -#ifndef __cplusplus -#error "This header is for C++ only" -#endif - -#include "llama.h" - -#include -#include -#include -#include -#include - -struct moss_audio_tokenizer_options { - int n_threads = -1; -}; - -class LLAMA_API moss_audio_tokenizer { -public: - explicit moss_audio_tokenizer( - const std::string & model_path, - const moss_audio_tokenizer_options & options = {}); - ~moss_audio_tokenizer(); - - moss_audio_tokenizer(const moss_audio_tokenizer &) = delete; - moss_audio_tokenizer & operator=(const moss_audio_tokenizer &) = delete; - - moss_audio_tokenizer(moss_audio_tokenizer &&) noexcept; - moss_audio_tokenizer & operator=(moss_audio_tokenizer &&) noexcept; - - int sample_rate() const; - uint32_t downsample_rate() const; - uint32_t num_quantizers() const; - - std::vector 
decode( - const std::vector & codes, - size_t n_frames, - uint32_t n_quantizers = 0) const; - - std::vector encode( - const std::vector & audio, - size_t * out_frames = nullptr, - uint32_t n_quantizers = 0) const; - -private: - struct impl; - std::unique_ptr impl_; -}; - -LLAMA_API int moss_audio_model_sample_rate(const struct llama_model * model); - -LLAMA_API uint32_t moss_audio_model_downsample_rate(const struct llama_model * model); - -LLAMA_API uint32_t moss_audio_model_num_quantizers(const struct llama_model * model); - -LLAMA_API std::vector moss_audio_model_quantizer_encode( - const struct llama_model * model, - const std::vector & input, - size_t n_frames, - uint32_t n_quantizers = 0); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b93054d70..8a0f29646 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -7,7 +7,6 @@ llama_add_compile_flags() # llama add_library(llama - ../include/llama-moss-audio-tokenizer.h ../include/llama.h llama.cpp llama-adapter.cpp @@ -104,7 +103,6 @@ add_library(llama models/moss-audio-common.cpp models/moss-audio-decoder.cpp models/moss-audio-encoder.cpp - models/moss-audio-tokenizer.cpp models/moss-tts-delay.cpp models/modern-bert.cpp models/mpt.cpp diff --git a/src/models/moss-audio-tokenizer.cpp b/src/models/moss-audio-tokenizer.cpp deleted file mode 100644 index a0f19415b..000000000 --- a/src/models/moss-audio-tokenizer.cpp +++ /dev/null @@ -1,1205 +0,0 @@ -#include "llama-moss-audio-tokenizer.h" - -#include "ggml.h" -#include "ggml-cpp.h" -#include "ggml-backend.h" -#include "ggml-cpu.h" -#include "gguf.h" -#include "llama-impl.h" -#include "llama-model.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace { - -constexpr char MOSS_CODEC_ARCH[] = "moss-audio-tokenizer"; -constexpr float MOSS_LAYER_NORM_EPS = 1e-5f; -constexpr size_t MOSS_CODEC_MAX_NODES_BASE = 256; -constexpr size_t MOSS_CODEC_MAX_NODES_PER_LAYER = 32; - -enum class 
moss_codec_module_type { - PATCHED_PRETRANSFORM, - TRANSFORMER, -}; - -struct moss_codec_transformer_layer { - ggml_tensor * attn_in = nullptr; - ggml_tensor * attn_out = nullptr; - ggml_tensor * linear1 = nullptr; - ggml_tensor * linear2 = nullptr; - ggml_tensor * norm1_w = nullptr; - ggml_tensor * norm1_b = nullptr; - ggml_tensor * norm2_w = nullptr; - ggml_tensor * norm2_b = nullptr; - ggml_tensor * scale1 = nullptr; - ggml_tensor * scale2 = nullptr; -}; - -struct moss_codec_transformer_block { - int input_dimension = 0; - int output_dimension = 0; - int d_model = 0; - int num_heads = 0; - int num_layers = 0; - int dim_feedforward = 0; - int context = 0; - float max_period = 10000.0f; - - ggml_tensor * input_proj = nullptr; - ggml_tensor * output_proj = nullptr; - - std::vector layers; -}; - -struct moss_codec_module { - moss_codec_module_type type = moss_codec_module_type::PATCHED_PRETRANSFORM; - int patch_size = 1; - moss_codec_transformer_block transformer; -}; - -struct moss_codec_quantizer_entry { - ggml_tensor * in_proj_w = nullptr; - ggml_tensor * in_proj_b = nullptr; - ggml_tensor * codebook = nullptr; - ggml_tensor * out_proj_w = nullptr; - ggml_tensor * out_proj_b = nullptr; -}; - -struct moss_codec_quantizer { - int input_dim = 0; - int rvq_dim = 0; - int output_dim = 0; - int num_quantizers = 0; - int codebook_size = 0; - int codebook_dim = 0; - - ggml_tensor * input_proj_w = nullptr; - ggml_tensor * input_proj_b = nullptr; - ggml_tensor * output_proj_w = nullptr; - ggml_tensor * output_proj_b = nullptr; - - std::vector quantizers; -}; - -static std::string moss_codec_module_type_to_string(const moss_codec_module_type type) { - switch (type) { - case moss_codec_module_type::PATCHED_PRETRANSFORM: - return "PatchedPretransform"; - case moss_codec_module_type::TRANSFORMER: - return "Transformer"; - } - return "Unknown"; -} - -static moss_codec_module_type moss_codec_module_type_from_string(const std::string & value) { - if (value == 
"PatchedPretransform") { - return moss_codec_module_type::PATCHED_PRETRANSFORM; - } - if (value == "Transformer") { - return moss_codec_module_type::TRANSFORMER; - } - throw std::runtime_error("unsupported codec module type: " + value); -} - -static void moss_codec_set_n_threads(ggml_backend_t backend, int n_threads) { - if (backend == nullptr || n_threads <= 0) { - return; - } - - ggml_backend_dev_t dev = ggml_backend_get_device(backend); - ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; - if (!reg) { - return; - } - - auto fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); - if (fn != nullptr) { - fn(backend, n_threads); - } -} - -static std::vector moss_codec_make_positions(const size_t n_tokens) { - std::vector positions(n_tokens); - for (size_t i = 0; i < n_tokens; ++i) { - positions[i] = (int32_t) i; - } - return positions; -} - -static std::vector moss_codec_make_causal_mask(const size_t n_tokens, const int context) { - std::vector mask(n_tokens * n_tokens, -std::numeric_limits::infinity()); - - for (size_t iq = 0; iq < n_tokens; ++iq) { - for (size_t ik = 0; ik < n_tokens; ++ik) { - if (ik > iq) { - continue; - } - if (context > 0 && (int) (iq - ik) >= context) { - continue; - } - mask[iq * n_tokens + ik] = 0.0f; - } - } - - return mask; -} - -static ggml_tensor * moss_codec_build_layer_norm( - ggml_context * ctx0, - ggml_tensor * cur, - ggml_tensor * weight, - ggml_tensor * bias) { - cur = ggml_norm(ctx0, cur, MOSS_LAYER_NORM_EPS); - cur = ggml_mul(ctx0, cur, weight); - cur = ggml_add(ctx0, cur, bias); - return cur; -} - -static ggml_tensor * moss_codec_build_attention( - ggml_context * ctx0, - ggml_tensor * wo, - ggml_tensor * q_cur, - ggml_tensor * k_cur, - ggml_tensor * v_cur, - ggml_tensor * kq_mask, - float kq_scale) { - ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); - ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); - ggml_tensor * v = 
ggml_permute(ctx0, v_cur, 1, 2, 0, 3); - v = ggml_cont(ctx0, v); - - ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f); - - ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - ggml_tensor * cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cur = ggml_cont_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2] * cur->ne[3]); - - if (wo != nullptr) { - cur = ggml_mul_mat(ctx0, wo, cur); - } - - return cur; -} - -static std::vector moss_codec_patch_decode( - const std::vector & input, - const int channels, - const size_t n_frames, - const int patch_size) { - if (patch_size <= 0) { - throw std::runtime_error("invalid patch size"); - } - if (channels % patch_size != 0) { - throw std::runtime_error("patch decode channels not divisible by patch size"); - } - if (input.size() != (size_t) channels * n_frames) { - throw std::runtime_error("patch decode input size mismatch"); - } - - const int out_channels = channels / patch_size; - const size_t out_frames = n_frames * (size_t) patch_size; - std::vector output((size_t) out_channels * out_frames); - - for (size_t t = 0; t < n_frames; ++t) { - for (int d = 0; d < out_channels; ++d) { - for (int i = 0; i < patch_size; ++i) { - const float value = input[(size_t) (d * patch_size + i) + t * (size_t) channels]; - output[(size_t) d + (t * (size_t) patch_size + (size_t) i) * (size_t) out_channels] = value; - } - } - } - - return output; -} - -static std::vector moss_codec_patch_encode( - const std::vector & input, - const int channels, - const size_t n_frames, - const int patch_size) { - if (patch_size <= 0) { - throw std::runtime_error("invalid patch size"); - } - if (n_frames % (size_t) patch_size != 0) { - throw std::runtime_error("patch encode frame count not divisible by patch size"); - } - if (input.size() != (size_t) channels * n_frames) { - throw std::runtime_error("patch encode input size mismatch"); - } - - const int out_channels = channels * patch_size; - const size_t out_frames = 
n_frames / (size_t) patch_size; - std::vector output((size_t) out_channels * out_frames); - - for (size_t t = 0; t < out_frames; ++t) { - for (int d = 0; d < channels; ++d) { - for (int i = 0; i < patch_size; ++i) { - const float value = input[(size_t) d + (t * (size_t) patch_size + (size_t) i) * (size_t) channels]; - output[(size_t) (d * patch_size + i) + t * (size_t) out_channels] = value; - } - } - } - - return output; -} - -static std::vector moss_codec_copy_f32_output(ggml_tensor * tensor) { - std::vector output((size_t) ggml_nelements(tensor)); - ggml_backend_tensor_get(tensor, output.data(), 0, ggml_nbytes(tensor)); - return output; -} - -struct moss_codec_linear_f32 { - int in_features = 0; - int out_features = 0; - std::vector weight; - std::vector bias; - - bool empty() const { - return weight.empty(); - } -}; - -struct moss_codec_quantizer_entry_f32 { - moss_codec_linear_f32 in_proj; - moss_codec_linear_f32 out_proj; - int codebook_size = 0; - int codebook_dim = 0; - std::vector codebook; - std::vector codebook_unit; -}; - -static std::vector moss_codec_tensor_to_f32(const ggml_tensor * tensor) { - if (tensor == nullptr) { - return {}; - } - - const size_t n_elements = (size_t) ggml_nelements(tensor); - - switch (tensor->type) { - case GGML_TYPE_F32: { - std::vector values(n_elements); - ggml_backend_tensor_get(const_cast(tensor), values.data(), 0, ggml_nbytes(tensor)); - return values; - } - case GGML_TYPE_F16: { - std::vector values_f16(n_elements); - std::vector values(n_elements); - ggml_backend_tensor_get(const_cast(tensor), values_f16.data(), 0, ggml_nbytes(tensor)); - for (size_t i = 0; i < n_elements; ++i) { - values[i] = ggml_fp16_to_fp32(values_f16[i]); - } - return values; - } - default: - throw std::runtime_error("unsupported tensor dtype for float conversion: " + std::string(ggml_type_name(tensor->type))); - } -} - -static moss_codec_linear_f32 moss_codec_linear_from_tensors(ggml_tensor * weight, ggml_tensor * bias) { - moss_codec_linear_f32 
result; - if (weight == nullptr) { - return result; - } - - switch (ggml_n_dims(weight)) { - case 2: - result.in_features = (int) weight->ne[0]; - result.out_features = (int) weight->ne[1]; - break; - case 3: - if (weight->ne[0] != 1) { - throw std::runtime_error("expected singleton leading dim for 3D linear weight tensor"); - } - result.in_features = (int) weight->ne[1]; - result.out_features = (int) weight->ne[2]; - break; - case 4: - if (weight->ne[0] != 1 || weight->ne[1] != 1) { - throw std::runtime_error("expected singleton leading dims for 4D linear weight tensor"); - } - result.in_features = (int) weight->ne[2]; - result.out_features = (int) weight->ne[3]; - break; - default: - throw std::runtime_error("expected 2D/3D/4D linear weight tensor"); - } - result.weight = moss_codec_tensor_to_f32(weight); - result.bias = moss_codec_tensor_to_f32(bias); - return result; -} - -static std::vector moss_codec_linear_apply( - const moss_codec_linear_f32 & linear, - const std::vector & input, - const size_t n_frames) { - if (linear.empty()) { - return input; - } - if (input.size() != (size_t) linear.in_features * n_frames) { - throw std::runtime_error("linear input size mismatch"); - } - - std::vector output((size_t) linear.out_features * n_frames, 0.0f); - for (size_t t = 0; t < n_frames; ++t) { - const float * x = input.data() + t * (size_t) linear.in_features; - float * y = output.data() + t * (size_t) linear.out_features; - - if (!linear.bias.empty()) { - std::copy(linear.bias.begin(), linear.bias.end(), y); - } - - for (int o = 0; o < linear.out_features; ++o) { - const float * w = linear.weight.data() + (size_t) o * (size_t) linear.in_features; - float acc = y[o]; - for (int i = 0; i < linear.in_features; ++i) { - acc += w[i] * x[i]; - } - y[o] = acc; - } - } - - return output; -} - -static std::vector moss_codec_normalize_rows( - const std::vector & input, - const int row_width) { - if (row_width <= 0 || input.size() % (size_t) row_width != 0) { - throw 
std::runtime_error("invalid row width for normalization"); - } - - std::vector output = input; - const size_t n_rows = input.size() / (size_t) row_width; - for (size_t r = 0; r < n_rows; ++r) { - float norm2 = 0.0f; - for (int c = 0; c < row_width; ++c) { - const float v = output[r * (size_t) row_width + (size_t) c]; - norm2 += v * v; - } - const float inv = 1.0f / std::sqrt(std::max(norm2, std::numeric_limits::epsilon())); - for (int c = 0; c < row_width; ++c) { - output[r * (size_t) row_width + (size_t) c] *= inv; - } - } - return output; -} - -struct moss_codec_gguf_loader { - ggml_context_ptr ctx_meta; - gguf_context_ptr ctx_gguf; - ggml_context_ptr ctx_data; - ggml_backend_ptr backend; - ggml_backend_buffer_ptr buffer; - - std::string fname; - std::map tensor_offset; - std::map loaded_tensors; - std::vector tensors_to_load; - - explicit moss_codec_gguf_loader(const std::string & model_path) - : fname(model_path), - backend(ggml_backend_cpu_init()) { - if (!backend) { - throw std::runtime_error("failed to initialize CPU backend for codec"); - } - - ggml_context * meta = nullptr; - gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &meta, - }; - - ctx_gguf.reset(gguf_init_from_file(fname.c_str(), params)); - if (!ctx_gguf) { - throw std::runtime_error("failed to load codec GGUF metadata from: " + fname); - } - - ctx_meta.reset(meta); - - for (int64_t i = 0; i < gguf_get_n_tensors(ctx_gguf.get()); ++i) { - const char * name = gguf_get_tensor_name(ctx_gguf.get(), i); - tensor_offset[name] = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), i); - } - - ggml_init_params data_params = { - /*.mem_size =*/ static_cast(gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(), - /*.mem_buffer =*/ nullptr, - /*.no_alloc =*/ true, - }; - ctx_data.reset(ggml_init(data_params)); - if (!ctx_data) { - throw std::runtime_error("failed to initialize codec tensor context"); - } - } - - int find_key(const std::string & key, const 
bool required = true) const { - const int idx = gguf_find_key(ctx_gguf.get(), key.c_str()); - if (idx < 0 && required) { - throw std::runtime_error("GGUF key not found: " + key); - } - return idx; - } - - bool has_key(const std::string & key) const { - return gguf_find_key(ctx_gguf.get(), key.c_str()) >= 0; - } - - uint32_t get_u32(const std::string & key, const bool required = true, const uint32_t fallback = 0) const { - const int idx = find_key(key, required); - if (idx < 0) { - return fallback; - } - return gguf_get_val_u32(ctx_gguf.get(), idx); - } - - float get_f32(const std::string & key, const bool required = true, const float fallback = 0.0f) const { - const int idx = find_key(key, required); - if (idx < 0) { - return fallback; - } - return gguf_get_val_f32(ctx_gguf.get(), idx); - } - - std::string get_string(const std::string & key, const bool required = true, const std::string & fallback = {}) const { - const int idx = find_key(key, required); - if (idx < 0) { - return fallback; - } - return std::string(gguf_get_val_str(ctx_gguf.get(), idx)); - } - - ggml_tensor * get_tensor(const std::string & name, const bool required = true) { - const auto it = loaded_tensors.find(name); - if (it != loaded_tensors.end()) { - return it->second; - } - - ggml_tensor * meta_tensor = ggml_get_tensor(ctx_meta.get(), name.c_str()); - if (!meta_tensor) { - if (required) { - throw std::runtime_error("codec tensor not found: " + name); - } - return nullptr; - } - - ggml_tensor * data_tensor = ggml_dup_tensor(ctx_data.get(), meta_tensor); - ggml_set_name(data_tensor, meta_tensor->name); - loaded_tensors.emplace(name, data_tensor); - tensors_to_load.push_back(data_tensor); - return data_tensor; - } - - void load_tensor_bytes() { - if (!buffer) { - buffer.reset(ggml_backend_alloc_ctx_tensors(ctx_data.get(), backend.get())); - if (!buffer) { - throw std::runtime_error("failed to allocate codec weight buffer"); - } - ggml_backend_buffer_set_usage(buffer.get(), 
GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - } - - std::ifstream fin(fname, std::ios::binary); - if (!fin) { - throw std::runtime_error("failed to open codec GGUF for tensor loading: " + fname); - } - - std::vector read_buf; - for (ggml_tensor * tensor : tensors_to_load) { - const auto it = tensor_offset.find(tensor->name); - if (it == tensor_offset.end()) { - throw std::runtime_error("missing GGUF tensor offset for: " + std::string(tensor->name)); - } - - const size_t offset = it->second; - const size_t num_bytes = ggml_nbytes(tensor); - - fin.seekg(offset, std::ios::beg); - if (!fin) { - throw std::runtime_error("failed to seek codec tensor: " + std::string(tensor->name)); - } - - if (ggml_backend_buffer_is_host(buffer.get())) { - fin.read(reinterpret_cast(tensor->data), (std::streamsize) num_bytes); - } else { - read_buf.resize(num_bytes); - fin.read(reinterpret_cast(read_buf.data()), (std::streamsize) num_bytes); - ggml_backend_tensor_set(tensor, read_buf.data(), 0, num_bytes); - } - - if (!fin) { - throw std::runtime_error("failed to read codec tensor: " + std::string(tensor->name)); - } - } - } -}; - -} // namespace - -struct moss_audio_tokenizer::impl { - int sample_rate = 0; - uint32_t downsample_rate = 0; - uint32_t num_quantizers = 0; - int n_threads = -1; - - ggml_backend_ptr backend; - ggml_context_ptr ctx_meta; - gguf_context_ptr ctx_gguf; - ggml_context_ptr ctx_data; - ggml_backend_buffer_ptr weights_buffer; - - moss_codec_quantizer quantizer; - moss_codec_linear_f32 quantizer_input_proj_f32; - std::vector quantizer_entries_f32; - std::vector encoder; - std::vector decoder; - - explicit impl(const std::string & model_path, const moss_audio_tokenizer_options & options) { - moss_codec_gguf_loader loader(model_path); - - if (!loader.has_key(std::string(MOSS_CODEC_ARCH) + ".quantizer_type")) { - throw std::runtime_error("model does not contain bundled MOSS audio tokenizer metadata"); - } - - sample_rate = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + 
".sampling_rate"); - downsample_rate = loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".downsample_rate"); - num_quantizers = loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.num_quantizers"); - n_threads = options.n_threads; - - quantizer.input_dim = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.input_dim"); - quantizer.rvq_dim = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.rvq_dim"); - quantizer.output_dim = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.output_dim"); - quantizer.num_quantizers = (int) num_quantizers; - quantizer.codebook_size = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.codebook_size"); - quantizer.codebook_dim = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.codebook_dim"); - quantizer.input_proj_w = loader.get_tensor("audio_tokenizer.quantizer.input_proj.weight", false); - quantizer.input_proj_b = loader.get_tensor("audio_tokenizer.quantizer.input_proj.bias", false); - quantizer.output_proj_w = loader.get_tensor("audio_tokenizer.quantizer.output_proj.weight", false); - quantizer.output_proj_b = loader.get_tensor("audio_tokenizer.quantizer.output_proj.bias", false); - quantizer.quantizers.resize(num_quantizers); - for (uint32_t iq = 0; iq < num_quantizers; ++iq) { - auto & entry = quantizer.quantizers[iq]; - const std::string prefix = "audio_tokenizer.quantizer.quantizers." + std::to_string(iq); - entry.in_proj_w = loader.get_tensor(prefix + ".in_proj.weight", false); - entry.in_proj_b = loader.get_tensor(prefix + ".in_proj.bias", false); - entry.codebook = loader.get_tensor(prefix + ".codebook.weight"); - entry.out_proj_w = loader.get_tensor(prefix + ".out_proj.weight", false); - entry.out_proj_b = loader.get_tensor(prefix + ".out_proj.bias", false); - } - - const auto load_modules = [&](const std::string & section_name, std::vector & modules) { - const uint32_t block_count = loader.get_u32(std::string(MOSS_CODEC_ARCH) + "." 
+ section_name + ".block_count"); - modules.resize(block_count); - for (uint32_t ib = 0; ib < block_count; ++ib) { - const std::string block_prefix = std::string(MOSS_CODEC_ARCH) + "." + section_name + "." + std::to_string(ib); - moss_codec_module & block = modules[ib]; - block.type = moss_codec_module_type_from_string(loader.get_string(block_prefix + ".module_type")); - - if (block.type == moss_codec_module_type::PATCHED_PRETRANSFORM) { - block.patch_size = (int) loader.get_u32(block_prefix + ".patch_size"); - continue; - } - - auto & tr = block.transformer; - tr.input_dimension = (int) loader.get_u32(block_prefix + ".input_dimension"); - tr.output_dimension = (int) loader.get_u32(block_prefix + ".output_dimension"); - tr.d_model = (int) loader.get_u32(block_prefix + ".d_model"); - tr.num_heads = (int) loader.get_u32(block_prefix + ".num_heads"); - tr.num_layers = (int) loader.get_u32(block_prefix + ".num_layers"); - tr.dim_feedforward = (int) loader.get_u32(block_prefix + ".dim_feedforward"); - tr.context = (int) loader.get_u32(block_prefix + ".context"); - tr.max_period = loader.get_f32(block_prefix + ".max_period", false, 10000.0f); - tr.input_proj = loader.get_tensor("audio_tokenizer." + section_name + "." + std::to_string(ib) + ".input_proj.weight", false); - tr.output_proj = loader.get_tensor("audio_tokenizer." + section_name + "." + std::to_string(ib) + ".output_proj.weight", false); - - tr.layers.resize(tr.num_layers); - for (int il = 0; il < tr.num_layers; ++il) { - auto & layer = tr.layers[il]; - const std::string layer_prefix = - "audio_tokenizer." + section_name + "." + std::to_string(ib) + ".transformer.layers." 
+ std::to_string(il); - layer.attn_in = loader.get_tensor(layer_prefix + ".self_attn.in_projs.0.weight"); - layer.attn_out = loader.get_tensor(layer_prefix + ".self_attn.out_projs.0.weight"); - layer.linear1 = loader.get_tensor(layer_prefix + ".linear1.weight"); - layer.linear2 = loader.get_tensor(layer_prefix + ".linear2.weight"); - layer.norm1_w = loader.get_tensor(layer_prefix + ".norm1.weight"); - layer.norm1_b = loader.get_tensor(layer_prefix + ".norm1.bias"); - layer.norm2_w = loader.get_tensor(layer_prefix + ".norm2.weight"); - layer.norm2_b = loader.get_tensor(layer_prefix + ".norm2.bias"); - layer.scale1 = loader.get_tensor(layer_prefix + ".layer_scale_1.scale", false); - layer.scale2 = loader.get_tensor(layer_prefix + ".layer_scale_2.scale", false); - } - } - }; - - load_modules("encoder", encoder); - load_modules("decoder", decoder); - - loader.load_tensor_bytes(); - - backend = std::move(loader.backend); - ctx_meta = std::move(loader.ctx_meta); - ctx_gguf = std::move(loader.ctx_gguf); - ctx_data = std::move(loader.ctx_data); - weights_buffer = std::move(loader.buffer); - - quantizer_input_proj_f32 = moss_codec_linear_from_tensors(quantizer.input_proj_w, quantizer.input_proj_b); - quantizer_entries_f32.resize(num_quantizers); - for (uint32_t iq = 0; iq < num_quantizers; ++iq) { - auto & dst = quantizer_entries_f32[iq]; - const auto & src = quantizer.quantizers[iq]; - dst.in_proj = moss_codec_linear_from_tensors(src.in_proj_w, src.in_proj_b); - dst.out_proj = moss_codec_linear_from_tensors(src.out_proj_w, src.out_proj_b); - dst.codebook_dim = (int) src.codebook->ne[0]; - dst.codebook_size = (int) src.codebook->ne[1]; - dst.codebook = moss_codec_tensor_to_f32(src.codebook); - dst.codebook_unit = moss_codec_normalize_rows(dst.codebook, dst.codebook_dim); - } - - LLAMA_LOG_INFO("%s: sample_rate=%d downsample_rate=%u num_quantizers=%u encoder_blocks=%zu decoder_blocks=%zu\n", - __func__, sample_rate, downsample_rate, num_quantizers, encoder.size(), 
decoder.size()); - } - - std::vector run_quantizer_decode( - const std::vector & codes, - const size_t n_frames, - uint32_t n_quantizers_req) const { - const uint32_t nq = n_quantizers_req == 0 ? num_quantizers : n_quantizers_req; - if (nq == 0 || nq > num_quantizers) { - throw std::runtime_error("invalid quantizer count for decode"); - } - if (codes.size() != n_frames * (size_t) nq) { - throw std::runtime_error("raw code size does not match frame count"); - } - - const size_t max_nodes = MOSS_CODEC_MAX_NODES_BASE + (size_t) nq * 8; - const size_t meta_size = max_nodes * ggml_tensor_overhead() + ggml_graph_overhead_custom(max_nodes, false); - std::vector meta_buf(meta_size); - - ggml_init_params params = { - /*.mem_size =*/ meta_size, - /*.mem_buffer =*/ meta_buf.data(), - /*.no_alloc =*/ true, - }; - ggml_context * ctx0 = ggml_init(params); - if (!ctx0) { - throw std::runtime_error("failed to init quantizer decode ggml context"); - } - - ggml_cgraph * gf = ggml_new_graph_custom(ctx0, (int) max_nodes, false); - std::vector code_inputs(nq); - - ggml_tensor * cur = nullptr; - for (uint32_t iq = 0; iq < nq; ++iq) { - const auto & entry = quantizer.quantizers[iq]; - ggml_tensor * inp = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t) n_frames); - ggml_set_input(inp); - code_inputs[iq] = inp; - - ggml_tensor * emb = ggml_get_rows(ctx0, entry.codebook, inp); - if (entry.out_proj_w) { - emb = ggml_mul_mat(ctx0, entry.out_proj_w, emb); - } - if (entry.out_proj_b) { - emb = ggml_add(ctx0, emb, entry.out_proj_b); - } - cur = cur ? 
ggml_add(ctx0, cur, emb) : emb; - } - - if (quantizer.output_proj_w) { - cur = ggml_mul_mat(ctx0, quantizer.output_proj_w, cur); - } - if (quantizer.output_proj_b) { - cur = ggml_add(ctx0, cur, quantizer.output_proj_b); - } - - ggml_build_forward_expand(gf, cur); - - ggml_gallocr_ptr allocr { ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend.get())) }; - ggml_gallocr_alloc_graph(allocr.get(), gf); - - for (uint32_t iq = 0; iq < nq; ++iq) { - std::vector gathered(n_frames); - for (size_t t = 0; t < n_frames; ++t) { - const llama_token code = codes[t * (size_t) nq + iq]; - if (code < 0 || code >= quantizer.codebook_size) { - ggml_free(ctx0); - throw std::runtime_error("audio code out of codec range during decode"); - } - gathered[t] = (int32_t) code; - } - ggml_backend_tensor_set(code_inputs[iq], gathered.data(), 0, gathered.size() * sizeof(int32_t)); - } - - moss_codec_set_n_threads(backend.get(), n_threads); - const ggml_status status = ggml_backend_graph_compute(backend.get(), gf); - if (status != GGML_STATUS_SUCCESS) { - ggml_free(ctx0); - throw std::runtime_error("quantizer decode graph compute failed"); - } - - std::vector output = moss_codec_copy_f32_output(cur); - ggml_free(ctx0); - return output; - } - - std::vector run_transformer_block( - const moss_codec_transformer_block & block, - const std::vector & input, - const size_t n_frames) const { - if (input.size() != (size_t) block.input_dimension * n_frames) { - throw std::runtime_error("transformer block input size mismatch"); - } - - const size_t max_nodes = MOSS_CODEC_MAX_NODES_BASE + (size_t) block.num_layers * MOSS_CODEC_MAX_NODES_PER_LAYER; - const size_t meta_size = max_nodes * ggml_tensor_overhead() + ggml_graph_overhead_custom(max_nodes, false); - std::vector meta_buf(meta_size); - - ggml_init_params params = { - /*.mem_size =*/ meta_size, - /*.mem_buffer =*/ meta_buf.data(), - /*.no_alloc =*/ true, - }; - ggml_context * ctx0 = ggml_init(params); - if (!ctx0) { - throw 
std::runtime_error("failed to init transformer ggml context"); - } - - ggml_cgraph * gf = ggml_new_graph_custom(ctx0, (int) max_nodes, false); - - ggml_tensor * inp = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, block.input_dimension, (int64_t) n_frames); - ggml_set_input(inp); - ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t) n_frames); - ggml_set_input(positions); - ggml_tensor * mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, (int64_t) n_frames, (int64_t) n_frames, 1, 1); - ggml_set_input(mask); - - ggml_tensor * cur = inp; - if (block.input_proj) { - cur = ggml_mul_mat(ctx0, block.input_proj, cur); - } - - const int d_head = block.d_model / block.num_heads; - const float attn_scale = 1.0f / std::sqrt((float) d_head); - - for (int il = 0; il < block.num_layers; ++il) { - const auto & layer = block.layers[il]; - - ggml_tensor * inp_sa = cur; - ggml_tensor * x = moss_codec_build_layer_norm(ctx0, cur, layer.norm1_w, layer.norm1_b); - ggml_tensor * qkv = ggml_mul_mat(ctx0, layer.attn_in, x); - - ggml_tensor * q = ggml_view_3d(ctx0, qkv, d_head, block.num_heads, (int64_t) n_frames, - ggml_row_size(qkv->type, d_head), qkv->nb[1], 0); - ggml_tensor * k = ggml_view_3d(ctx0, qkv, d_head, block.num_heads, (int64_t) n_frames, - ggml_row_size(qkv->type, d_head), qkv->nb[1], ggml_row_size(qkv->type, block.d_model)); - ggml_tensor * v = ggml_view_3d(ctx0, qkv, d_head, block.num_heads, (int64_t) n_frames, - ggml_row_size(qkv->type, d_head), qkv->nb[1], ggml_row_size(qkv->type, 2 * block.d_model)); - - q = ggml_rope_ext(ctx0, q, positions, nullptr, d_head, 0, 0, - block.max_period, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); - k = ggml_rope_ext(ctx0, k, positions, nullptr, d_head, 0, 0, - block.max_period, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); - - ggml_tensor * attn = moss_codec_build_attention(ctx0, layer.attn_out, q, k, v, mask, attn_scale); - if (layer.scale1) { - attn = ggml_mul(ctx0, attn, layer.scale1); - } - cur = ggml_add(ctx0, inp_sa, attn); - - ggml_tensor * inp_ff 
= cur; - x = moss_codec_build_layer_norm(ctx0, cur, layer.norm2_w, layer.norm2_b); - x = ggml_mul_mat(ctx0, layer.linear1, x); - x = ggml_gelu(ctx0, x); - x = ggml_mul_mat(ctx0, layer.linear2, x); - if (layer.scale2) { - x = ggml_mul(ctx0, x, layer.scale2); - } - cur = ggml_add(ctx0, inp_ff, x); - } - - if (block.output_proj) { - cur = ggml_mul_mat(ctx0, block.output_proj, cur); - } - - ggml_build_forward_expand(gf, cur); - - ggml_gallocr_ptr allocr { ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend.get())) }; - ggml_gallocr_alloc_graph(allocr.get(), gf); - - const std::vector positions_data = moss_codec_make_positions(n_frames); - const std::vector mask_data = moss_codec_make_causal_mask(n_frames, block.context); - - ggml_backend_tensor_set(inp, input.data(), 0, input.size() * sizeof(float)); - ggml_backend_tensor_set(positions, positions_data.data(), 0, positions_data.size() * sizeof(int32_t)); - ggml_backend_tensor_set(mask, mask_data.data(), 0, mask_data.size() * sizeof(float)); - - moss_codec_set_n_threads(backend.get(), n_threads); - const ggml_status status = ggml_backend_graph_compute(backend.get(), gf); - if (status != GGML_STATUS_SUCCESS) { - ggml_free(ctx0); - throw std::runtime_error("transformer graph compute failed"); - } - - std::vector output = moss_codec_copy_f32_output(cur); - ggml_free(ctx0); - return output; - } - - std::vector decode( - const std::vector & codes, - const size_t n_frames, - const uint32_t n_quantizers_req) const { - uint32_t nq = n_quantizers_req == 0 ? 
num_quantizers : n_quantizers_req; - if (nq == 0 || nq > num_quantizers) { - throw std::runtime_error("invalid quantizer count"); - } - - std::vector cur = run_quantizer_decode(codes, n_frames, nq); - int channels = quantizer.output_dim; - size_t frames = n_frames; - - for (const auto & module : decoder) { - switch (module.type) { - case moss_codec_module_type::TRANSFORMER: - cur = run_transformer_block(module.transformer, cur, frames); - channels = module.transformer.output_dimension; - break; - case moss_codec_module_type::PATCHED_PRETRANSFORM: - cur = moss_codec_patch_decode(cur, channels, frames, module.patch_size); - channels /= module.patch_size; - frames *= (size_t) module.patch_size; - break; - } - } - - if (channels != 1) { - throw std::runtime_error("codec decoder did not end with a mono waveform channel"); - } - - return cur; - } - - std::vector run_quantizer_encode( - const std::vector & input, - const size_t n_frames, - const uint32_t n_quantizers_req) const { - const uint32_t nq = n_quantizers_req == 0 ? 
num_quantizers : n_quantizers_req; - if (nq == 0 || nq > num_quantizers) { - throw std::runtime_error("invalid quantizer count for encode"); - } - - std::vector residual = moss_codec_linear_apply(quantizer_input_proj_f32, input, n_frames); - if (residual.size() != (size_t) quantizer.rvq_dim * n_frames) { - throw std::runtime_error("quantizer input projection size mismatch"); - } - - std::vector codes(n_frames * (size_t) nq, 0); - std::vector latents; - std::vector latents_unit; - std::vector decoded; - - for (uint32_t iq = 0; iq < nq; ++iq) { - const auto & entry = quantizer_entries_f32[iq]; - latents = moss_codec_linear_apply(entry.in_proj, residual, n_frames); - if (latents.size() != (size_t) entry.codebook_dim * n_frames) { - throw std::runtime_error("quantizer latent projection size mismatch"); - } - - latents_unit.resize(latents.size()); - for (size_t t = 0; t < n_frames; ++t) { - const float * in_ptr = latents.data() + t * (size_t) entry.codebook_dim; - float * out_ptr = latents_unit.data() + t * (size_t) entry.codebook_dim; - - float norm2 = 0.0f; - for (int d = 0; d < entry.codebook_dim; ++d) { - norm2 += in_ptr[d] * in_ptr[d]; - } - const float inv = 1.0f / std::sqrt(std::max(norm2, std::numeric_limits::epsilon())); - for (int d = 0; d < entry.codebook_dim; ++d) { - out_ptr[d] = in_ptr[d] * inv; - } - } - - std::vector codebook_emb((size_t) entry.codebook_dim * n_frames, 0.0f); - for (size_t t = 0; t < n_frames; ++t) { - const float * latent = latents_unit.data() + t * (size_t) entry.codebook_dim; - - float best_score = -std::numeric_limits::infinity(); - int best_index = 0; - for (int code = 0; code < entry.codebook_size; ++code) { - const float * row = entry.codebook_unit.data() + (size_t) code * (size_t) entry.codebook_dim; - float score = 0.0f; - for (int d = 0; d < entry.codebook_dim; ++d) { - score += row[d] * latent[d]; - } - if (score > best_score) { - best_score = score; - best_index = code; - } - } - - codes[t * (size_t) nq + iq] = best_index; - 
const float * row = entry.codebook.data() + (size_t) best_index * (size_t) entry.codebook_dim; - std::copy(row, row + entry.codebook_dim, codebook_emb.begin() + (ptrdiff_t) (t * (size_t) entry.codebook_dim)); - } - - decoded = moss_codec_linear_apply(entry.out_proj, codebook_emb, n_frames); - if (decoded.size() != residual.size()) { - throw std::runtime_error("quantizer decoded embedding size mismatch"); - } - - for (size_t i = 0; i < residual.size(); ++i) { - residual[i] -= decoded[i]; - } - } - - return codes; - } - - std::vector encode( - const std::vector & audio, - size_t * out_frames, - const uint32_t n_quantizers_req) const { - const uint32_t nq = n_quantizers_req == 0 ? num_quantizers : n_quantizers_req; - if (nq == 0 || nq > num_quantizers) { - throw std::runtime_error("invalid quantizer count"); - } - - const size_t padded_samples = - ((audio.size() + (size_t) downsample_rate - 1) / (size_t) downsample_rate) * (size_t) downsample_rate; - const size_t valid_frames = audio.size() / (size_t) downsample_rate; - - std::vector cur(padded_samples, 0.0f); - std::copy(audio.begin(), audio.end(), cur.begin()); - - int channels = 1; - size_t frames = padded_samples; - - for (const auto & module : encoder) { - switch (module.type) { - case moss_codec_module_type::PATCHED_PRETRANSFORM: - cur = moss_codec_patch_encode(cur, channels, frames, module.patch_size); - channels *= module.patch_size; - frames /= (size_t) module.patch_size; - break; - case moss_codec_module_type::TRANSFORMER: - cur = run_transformer_block(module.transformer, cur, frames); - channels = module.transformer.output_dimension; - break; - } - } - - if (channels != quantizer.input_dim) { - throw std::runtime_error("codec encoder output dimension does not match quantizer input dimension"); - } - - std::vector codes = run_quantizer_encode(cur, frames, nq); - if (out_frames) { - *out_frames = valid_frames; - } - - if (valid_frames >= frames) { - return codes; - } - - std::vector trimmed(valid_frames * 
(size_t) nq); - for (size_t t = 0; t < valid_frames; ++t) { - std::copy_n(codes.data() + t * (size_t) nq, nq, trimmed.data() + t * (size_t) nq); - } - return trimmed; - } -}; - -moss_audio_tokenizer::moss_audio_tokenizer( - const std::string & model_path, - const moss_audio_tokenizer_options & options) - : impl_(std::make_unique(model_path, options)) { -} - -moss_audio_tokenizer::~moss_audio_tokenizer() = default; - -moss_audio_tokenizer::moss_audio_tokenizer(moss_audio_tokenizer &&) noexcept = default; - -moss_audio_tokenizer & moss_audio_tokenizer::operator=(moss_audio_tokenizer &&) noexcept = default; - -int moss_audio_tokenizer::sample_rate() const { - return impl_->sample_rate; -} - -uint32_t moss_audio_tokenizer::downsample_rate() const { - return impl_->downsample_rate; -} - -uint32_t moss_audio_tokenizer::num_quantizers() const { - return impl_->num_quantizers; -} - -std::vector moss_audio_tokenizer::decode( - const std::vector & codes, - const size_t n_frames, - const uint32_t n_quantizers) const { - return impl_->decode(codes, n_frames, n_quantizers); -} - -std::vector moss_audio_tokenizer::encode( - const std::vector & audio, - size_t * out_frames, - const uint32_t n_quantizers) const { - return impl_->encode(audio, out_frames, n_quantizers); -} - -static std::string moss_codec_model_meta_str(const llama_model * model, const std::string & key) { - const auto it = model->gguf_kv.find(key); - if (it == model->gguf_kv.end()) { - throw std::runtime_error("missing GGUF key: " + key); - } - - std::string value = it->second; - if (value.size() >= 2 && ((value.front() == '\'' && value.back() == '\'') || (value.front() == '"' && value.back() == '"'))) { - value = value.substr(1, value.size() - 2); - } - return value; -} - -static uint32_t moss_codec_model_meta_u32(const llama_model * model, const std::string & key) { - return (uint32_t) std::stoul(moss_codec_model_meta_str(model, key)); -} - -static const ggml_tensor * moss_codec_model_require_tensor(const 
llama_model * model, const std::string & name) { - const ggml_tensor * tensor = model->get_tensor(name.c_str()); - if (tensor == nullptr) { - throw std::runtime_error("missing tensor: " + name); - } - return tensor; -} - -static const ggml_tensor * moss_codec_model_optional_tensor(const llama_model * model, const std::string & name) { - return model->get_tensor(name.c_str()); -} - -int moss_audio_model_sample_rate(const llama_model * model) { - const std::string arch_name = llm_arch_name(model->arch); - return (int) moss_codec_model_meta_u32(model, arch_name + ".sampling_rate"); -} - -uint32_t moss_audio_model_downsample_rate(const llama_model * model) { - const std::string arch_name = llm_arch_name(model->arch); - return moss_codec_model_meta_u32(model, arch_name + ".downsample_rate"); -} - -uint32_t moss_audio_model_num_quantizers(const llama_model * model) { - const std::string arch_name = llm_arch_name(model->arch); - return moss_codec_model_meta_u32(model, arch_name + ".quantizer.num_quantizers"); -} - -std::vector moss_audio_model_quantizer_encode( - const llama_model * model, - const std::vector & input, - size_t n_frames, - uint32_t n_quantizers_req) { - if (model->arch != LLM_ARCH_MOSS_TTS_AUDIO_ENCODER) { - throw std::runtime_error("quantizer encode expects a moss-tts-audio-encoder model"); - } - - const std::string arch_name = llm_arch_name(model->arch); - const uint32_t num_quantizers = moss_codec_model_meta_u32(model, arch_name + ".quantizer.num_quantizers"); - const uint32_t nq = n_quantizers_req == 0 ? 
num_quantizers : n_quantizers_req; - if (nq == 0 || nq > num_quantizers) { - throw std::runtime_error("invalid quantizer count"); - } - - moss_codec_linear_f32 quantizer_input_proj = moss_codec_linear_from_tensors( - const_cast(moss_codec_model_require_tensor(model, "quantizer.input_proj.weight")), - const_cast(moss_codec_model_optional_tensor(model, "quantizer.input_proj.bias"))); - - std::vector quantizers(nq); - for (uint32_t iq = 0; iq < nq; ++iq) { - auto & entry = quantizers[iq]; - const std::string prefix = "quantizer.quantizers." + std::to_string(iq); - entry.in_proj = moss_codec_linear_from_tensors( - const_cast(moss_codec_model_require_tensor(model, prefix + ".in_proj.weight")), - const_cast(moss_codec_model_optional_tensor(model, prefix + ".in_proj.bias"))); - entry.out_proj = moss_codec_linear_from_tensors( - const_cast(moss_codec_model_require_tensor(model, prefix + ".out_proj.weight")), - const_cast(moss_codec_model_optional_tensor(model, prefix + ".out_proj.bias"))); - const ggml_tensor * codebook = moss_codec_model_require_tensor(model, prefix + ".codebook.weight"); - entry.codebook_dim = (int) codebook->ne[0]; - entry.codebook_size = (int) codebook->ne[1]; - entry.codebook = moss_codec_tensor_to_f32(codebook); - entry.codebook_unit = moss_codec_normalize_rows(entry.codebook, entry.codebook_dim); - } - - std::vector residual = moss_codec_linear_apply(quantizer_input_proj, input, n_frames); - std::vector codes(n_frames * (size_t) nq, 0); - std::vector latents; - std::vector latents_unit; - std::vector decoded; - - for (uint32_t iq = 0; iq < nq; ++iq) { - const auto & entry = quantizers[iq]; - latents = moss_codec_linear_apply(entry.in_proj, residual, n_frames); - if (latents.size() != (size_t) entry.codebook_dim * n_frames) { - throw std::runtime_error("quantizer latent projection size mismatch"); - } - - latents_unit.resize(latents.size()); - for (size_t t = 0; t < n_frames; ++t) { - const float * in_ptr = latents.data() + t * (size_t) 
entry.codebook_dim; - float * out_ptr = latents_unit.data() + t * (size_t) entry.codebook_dim; - - float norm2 = 0.0f; - for (int d = 0; d < entry.codebook_dim; ++d) { - norm2 += in_ptr[d] * in_ptr[d]; - } - const float inv = 1.0f / std::sqrt(std::max(norm2, std::numeric_limits::epsilon())); - for (int d = 0; d < entry.codebook_dim; ++d) { - out_ptr[d] = in_ptr[d] * inv; - } - } - - std::vector codebook_emb((size_t) entry.codebook_dim * n_frames, 0.0f); - for (size_t t = 0; t < n_frames; ++t) { - const float * latent = latents_unit.data() + t * (size_t) entry.codebook_dim; - - float best_score = -std::numeric_limits::infinity(); - int best_index = 0; - for (int code = 0; code < entry.codebook_size; ++code) { - const float * row = entry.codebook_unit.data() + (size_t) code * (size_t) entry.codebook_dim; - float score = 0.0f; - for (int d = 0; d < entry.codebook_dim; ++d) { - score += row[d] * latent[d]; - } - if (score > best_score) { - best_score = score; - best_index = code; - } - } - - codes[t * (size_t) nq + iq] = best_index; - const float * row = entry.codebook.data() + (size_t) best_index * (size_t) entry.codebook_dim; - std::copy(row, row + entry.codebook_dim, codebook_emb.begin() + (ptrdiff_t) (t * (size_t) entry.codebook_dim)); - } - - decoded = moss_codec_linear_apply(entry.out_proj, codebook_emb, n_frames); - if (decoded.size() != residual.size()) { - throw std::runtime_error("quantizer decoded embedding size mismatch"); - } - - for (size_t i = 0; i < residual.size(); ++i) { - residual[i] -= decoded[i]; - } - } - - return codes; -} diff --git a/tools/tts/run-moss-tts-delay.cpp b/tools/tts/run-moss-tts-delay.cpp index e98aab0ad..2df1e5f00 100644 --- a/tools/tts/run-moss-tts-delay.cpp +++ b/tools/tts/run-moss-tts-delay.cpp @@ -3,7 +3,6 @@ #include "log.h" #include "llama.h" #include "llama-cpp.h" -#include "llama-moss-audio-tokenizer.h" #include #include @@ -237,12 +236,6 @@ static void moss_decode_audio_llama( const moss_delay_config & cfg, int32_t 
n_gpu_layers, const std::string & wav_out_path); -static void moss_decode_audio_native( - const std::string & model_path, - const std::vector & raw_codes, - size_t raw_frames, - const moss_delay_config & cfg, - const std::string & wav_out_path); static std::vector moss_read_wav_f32_mono(const std::string & path, int expected_sample_rate); static moss_prompt_input moss_build_prompt_input( const llama_vocab * vocab, @@ -460,6 +453,37 @@ static std::string moss_model_architecture(const llama_model * model) { return std::string(buf); } +static uint32_t moss_audio_model_meta_u32( + const llama_model * model, + const char * expected_arch, + const char * suffix) { + const std::string arch = moss_model_architecture(model); + if (arch != expected_arch) { + throw std::runtime_error( + "unexpected audio model architecture: expected " + + std::string(expected_arch) + ", got " + arch); + } + + uint32_t value = 0; + const std::string key = arch + "." + suffix; + if (!parse_meta_u32(model, key.c_str(), value)) { + throw std::runtime_error("missing audio model metadata key: " + key); + } + return value; +} + +static uint32_t moss_audio_model_sampling_rate(const llama_model * model, const char * expected_arch) { + return moss_audio_model_meta_u32(model, expected_arch, "sampling_rate"); +} + +static uint32_t moss_audio_model_downsample_rate(const llama_model * model, const char * expected_arch) { + return moss_audio_model_meta_u32(model, expected_arch, "downsample_rate"); +} + +static uint32_t moss_audio_model_num_quantizers(const llama_model * model, const char * expected_arch) { + return moss_audio_model_meta_u32(model, expected_arch, "quantizer.num_quantizers"); +} + struct moss_audio_runtime { llama_model_ptr model; llama_context_ptr ctx; @@ -1198,9 +1222,9 @@ static std::vector moss_encode_audio_llama( audio_encoder_model_path, "moss-tts-audio-encoder", n_gpu_layers); - const int sample_rate = moss_audio_model_sample_rate(runtime.model.get()); - const uint32_t downsample_rate = 
moss_audio_model_downsample_rate(runtime.model.get()); - const uint32_t model_quantizers = moss_audio_model_num_quantizers(runtime.model.get()); + const int sample_rate = (int) moss_audio_model_sampling_rate(runtime.model.get(), "moss-tts-audio-encoder"); + const uint32_t downsample_rate = moss_audio_model_downsample_rate(runtime.model.get(), "moss-tts-audio-encoder"); + const uint32_t model_quantizers = moss_audio_model_num_quantizers(runtime.model.get(), "moss-tts-audio-encoder"); const uint32_t nq = n_quantizers == 0 ? model_quantizers : n_quantizers; if (nq == 0 || nq > model_quantizers) { throw std::runtime_error("invalid audio encoder quantizer count"); @@ -1276,9 +1300,9 @@ static void moss_decode_audio_llama( n_gpu_layers, std::max((uint32_t) raw_frames, 1u)); - const int sample_rate = moss_audio_model_sample_rate(runtime.model.get()); - const uint32_t downsample_rate = moss_audio_model_downsample_rate(runtime.model.get()); - const uint32_t model_quantizers = moss_audio_model_num_quantizers(runtime.model.get()); + const int sample_rate = (int) moss_audio_model_sampling_rate(runtime.model.get(), "moss-tts-audio-decoder"); + const uint32_t downsample_rate = moss_audio_model_downsample_rate(runtime.model.get(), "moss-tts-audio-decoder"); + const uint32_t model_quantizers = moss_audio_model_num_quantizers(runtime.model.get(), "moss-tts-audio-decoder"); if (cfg.n_vq != model_quantizers) { throw std::runtime_error( "audio decoder quantizer count mismatch: model expects " + @@ -1314,23 +1338,6 @@ static void moss_decode_audio_llama( } } -static void moss_decode_audio_native( - const std::string & model_path, - const std::vector & raw_codes, - size_t raw_frames, - const moss_delay_config & cfg, - const std::string & wav_out_path) { - moss_audio_tokenizer_options codec_opts; - codec_opts.n_threads = cpu_get_num_math(); - - moss_audio_tokenizer codec(model_path, codec_opts); - const std::vector audio = codec.decode(raw_codes, raw_frames, cfg.n_vq); - - if 
(!save_wav16(wav_out_path, audio, codec.sample_rate())) { - throw std::runtime_error("failed to write WAV file: " + wav_out_path); - } -} - static std::vector moss_read_wav_f32_mono(const std::string & path, int expected_sample_rate) { std::ifstream in(path, std::ios::binary); if (!in) { @@ -1614,7 +1621,6 @@ static int moss_run_audio_decoder_helper( } static bool moss_decode_parity( - const std::string & model_path, const std::string & ref_path, const std::string & dump_codes_path, const std::string & audio_decoder_model_path, @@ -1694,10 +1700,7 @@ static bool moss_decode_parity( n_gpu_layers, wav_out); } else { - if (model_path.empty()) { - throw std::runtime_error("--wav-out requires either --audio-decoder-model, --audio-decoder-script, or -m with bundled codec"); - } - moss_decode_audio_native(model_path, decoded.raw_codes, decoded.raw_frames, cfg, wav_out); + throw std::runtime_error("--wav-out requires either --audio-decoder-model or --audio-decoder-script"); } } else if (!helper_script.empty()) { if (dump_codes_path.empty()) { @@ -1952,7 +1955,7 @@ static void moss_generate_from_prompt( n_gpu_layers, wav_out); } else { - moss_decode_audio_native(model_path, decoded.raw_codes, decoded.raw_frames, cfg, wav_out); + throw std::runtime_error("--wav-out requires either --audio-decoder-model or --audio-decoder-script"); } } else if (!helper_script.empty()) { if (dump_raw_codes_path.empty()) { @@ -2020,11 +2023,7 @@ static void moss_generate_from_text( cfg.n_vq, &reference_frames); } else { - moss_audio_tokenizer_options codec_opts; - codec_opts.n_threads = cpu_get_num_math(); - moss_audio_tokenizer codec(model_path, codec_opts); - const std::vector wav = moss_read_wav_f32_mono(reference_audio_path, codec.sample_rate()); - reference_codes = codec.encode(wav, &reference_frames, cfg.n_vq); + throw std::runtime_error("--reference-audio requires --audio-encoder-model"); } } @@ -2587,7 +2586,6 @@ int main(int argc, char ** argv) { if (!decode_parity_ref_path.empty()) { 
try { const bool ok = moss_decode_parity( - model_path, decode_parity_ref_path, dump_raw_codes_path, audio_decoder_model_path, @@ -2609,7 +2607,7 @@ int main(int argc, char ** argv) { if (self_test) { return EXIT_SUCCESS; } - LOG("moss delay state, multi-head sampler, raw-code decode, and native audio encode/decode helpers are available.\n"); + LOG("moss delay state, multi-head sampler, raw-code decode, and native three-GGUF audio encode/decode are available.\n"); LOG("use --print-delay-config with -m to inspect model metadata.\n"); LOG("use --decode-parity-ref to verify C++ de-delay/raw-code extraction against Python.\n"); LOG("use --text -m --audio-decoder-model --wav-out out.wav for native generation.\n");