From a678b5567d31dbfa7647d137e51f309a212f5341 Mon Sep 17 00:00:00 2001 From: CHiSwsz Date: Mon, 6 Apr 2026 20:29:54 +0800 Subject: [PATCH 1/3] Add native MOSS-TTS audio encoder and decoder support --- convert_moss_audio_tokenizer_split_to_gguf.py | 427 ++++++ convert_moss_audio_tokenizer_to_gguf.py | 503 +++++++ docs/moss-tts-firstclass-e2e.md | 175 ++- docs/moss-tts-firstclass-e2e_zh.md | 175 ++- include/llama-moss-audio-tokenizer.h | 61 + include/llama.h | 13 + src/CMakeLists.txt | 5 + src/llama-arch.cpp | 54 + src/llama-arch.h | 17 + src/llama-context.cpp | 146 +- src/llama-context.h | 4 + src/llama-graph.cpp | 4 + src/llama-graph.h | 2 + src/llama-hparams.cpp | 4 + src/llama-hparams.h | 3 + src/llama-model.cpp | 172 ++- src/llama-vocab.cpp | 12 +- src/models/models.h | 8 + src/models/moss-audio-common.cpp | 508 +++++++ src/models/moss-audio-common.h | 171 +++ src/models/moss-audio-decoder.cpp | 37 + src/models/moss-audio-encoder.cpp | 49 + src/models/moss-audio-tokenizer.cpp | 1205 +++++++++++++++++ tools/tts/CMakeLists.txt | 2 +- tools/tts/moss-tts-audio-decode.py | 29 +- tools/tts/moss-tts-build-generation-ref.py | 24 +- tools/tts/moss-tts-firstclass-e2e.py | 20 +- tools/tts/moss_tts_onnx.py | 74 + tools/tts/moss_tts_processor.py | 269 ++++ .../{moss-tts.cpp => run-moss-tts-delay.cpp} | 1049 +++++++++++++- 30 files changed, 4989 insertions(+), 233 deletions(-) create mode 100644 convert_moss_audio_tokenizer_split_to_gguf.py create mode 100755 convert_moss_audio_tokenizer_to_gguf.py create mode 100644 include/llama-moss-audio-tokenizer.h create mode 100644 src/models/moss-audio-common.cpp create mode 100644 src/models/moss-audio-common.h create mode 100644 src/models/moss-audio-decoder.cpp create mode 100644 src/models/moss-audio-encoder.cpp create mode 100644 src/models/moss-audio-tokenizer.cpp create mode 100644 tools/tts/moss_tts_onnx.py create mode 100644 tools/tts/moss_tts_processor.py rename tools/tts/{moss-tts.cpp => run-moss-tts-delay.cpp} (60%) diff 
--git a/convert_moss_audio_tokenizer_split_to_gguf.py b/convert_moss_audio_tokenizer_split_to_gguf.py new file mode 100644 index 000000000..74ffe4e89 --- /dev/null +++ b/convert_moss_audio_tokenizer_split_to_gguf.py @@ -0,0 +1,427 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MIT + +from __future__ import annotations + +import argparse +import logging +from pathlib import Path +from typing import Any, Callable, Iterator + +import numpy as np + +from convert_moss_audio_tokenizer_to_gguf import SafeTensorsIndex +from convert_moss_audio_tokenizer_to_gguf import convert_tensor_dtype +from convert_moss_audio_tokenizer_to_gguf import load_config +from convert_moss_audio_tokenizer_to_gguf import map_tensor_name +from convert_moss_audio_tokenizer_to_gguf import merge_weight_norm +from convert_moss_audio_tokenizer_to_gguf import validate_config +from convert_moss_audio_tokenizer_to_gguf import write_module_list_metadata + +import sys +import types + +# Older local Python envs can ship a NumPy build without numpy.typing. 
+try: + import numpy.typing # type: ignore # noqa: F401 +except Exception: + numpy_typing = types.ModuleType("numpy.typing") + numpy_typing.DTypeLike = object + sys.modules["numpy.typing"] = numpy_typing + +sys.path.insert(0, str(Path(__file__).parent / "gguf-py")) +import gguf # noqa: E402 + + +logger = logging.getLogger("convert_moss_audio_tokenizer_split_to_gguf") + +ARCH_ENCODER = "moss-tts-audio-encoder" +ARCH_DECODER = "moss-tts-audio-decoder" + +DEFAULT_SAMPLING_RATE = 24_000 +DEFAULT_DOWNSAMPLE_RATE = 1_920 +DEFAULT_CONTEXT_DURATION = 10.0 + + +def default_encoder_outfile(model_dir: Path, outtype: str) -> Path: + return model_dir / f"{model_dir.name}-encoder-{outtype}.gguf" + + +def default_decoder_outfile(model_dir: Path, outtype: str) -> Path: + return model_dir / f"{model_dir.name}-decoder-{outtype}.gguf" + + +def build_transformer_block_index_map(module_cfgs: list[dict[str, Any]]) -> dict[int, int]: + result: dict[int, int] = {} + tensor_block = 0 + for module_idx, module_cfg in enumerate(module_cfgs): + if module_cfg.get("module_type") != "Transformer": + continue + result[module_idx] = tensor_block + tensor_block += 1 + return result + + +def map_transformer_tensor_name(tensor_block: int, tail: str) -> str | None: + if tail == "input_proj.weight": + return f"blk.{tensor_block}.input_proj.weight" + if tail == "output_proj.weight": + return f"blk.{tensor_block}.output_proj.weight" + + parts = tail.split(".") + if len(parts) < 5 or parts[0] != "transformer" or parts[1] != "layers": + return None + + layer_idx = int(parts[2]) + layer_prefix = f"blk.{tensor_block}.layer.{layer_idx}" + layer_tail = ".".join(parts[3:]) + + if layer_tail == "layer_scale_1.scale": + return f"{layer_prefix}.attn_scale.scale" + if layer_tail == "layer_scale_2.scale": + return f"{layer_prefix}.ffn_scale.scale" + if layer_tail == "linear1.weight": + return f"{layer_prefix}.ffn_up.weight" + if layer_tail == "linear2.weight": + return f"{layer_prefix}.ffn_down.weight" + if 
layer_tail == "norm1.weight": + return f"{layer_prefix}.attn_norm.weight" + if layer_tail == "norm1.bias": + return f"{layer_prefix}.attn_norm.bias" + if layer_tail == "norm2.weight": + return f"{layer_prefix}.ffn_norm.weight" + if layer_tail == "norm2.bias": + return f"{layer_prefix}.ffn_norm.bias" + if layer_tail == "self_attn.in_projs.0.weight": + return f"{layer_prefix}.attn_qkv.weight" + if layer_tail == "self_attn.out_projs.0.weight": + return f"{layer_prefix}.attn_output.weight" + + return None + + +def map_split_tensor_name( + name: str, + encoder_block_map: dict[int, int], + decoder_block_map: dict[int, int], +) -> str | None: + mapped = map_tensor_name(name) + if mapped is None: + return None + + if mapped.startswith("encoder."): + rest = mapped[len("encoder."):] + module_idx_str, tail = rest.split(".", 1) + return map_transformer_tensor_name(encoder_block_map[int(module_idx_str)], tail) + + if mapped.startswith("decoder."): + rest = mapped[len("decoder."):] + module_idx_str, tail = rest.split(".", 1) + return map_transformer_tensor_name(decoder_block_map[int(module_idx_str)], tail) + + return mapped + + +def _count_path(name: str) -> str: + parts = name.split(".") + if len(parts) >= 4 and parts[0] == "quantizer" and parts[1] == "quantizers": + return ".".join(parts[:4]) + if len(parts) >= 4 and parts[0] == "blk" and parts[2] == "layer": + return ".".join(parts[:4]) + if len(parts) >= 2: + return ".".join(parts[:2]) + return name + + +def is_encoder_tensor(name: str) -> bool: + if name.startswith("encoder."): + return True + if name.startswith("quantizer.input_proj."): + return True + if name.startswith("quantizer.quantizers.") and ( + ".in_proj." in name or ".out_proj." in name or ".codebook." in name + ): + return True + return False + + +def is_decoder_tensor(name: str) -> bool: + if name.startswith("decoder."): + return True + if name.startswith("quantizer.output_proj."): + return True + if name.startswith("quantizer.quantizers.") and ( + ".out_proj." 
in name or ".codebook." in name + ): + return True + return False + + +def count_filtered_output_tensors( + index: SafeTensorsIndex, + include_fn: Callable[[str], bool], + rename_fn: Callable[[str], str | None], +) -> int: + seen: set[str] = set() + for name in index: + mapped_name = map_tensor_name(name) + renamed_name = rename_fn(name) + if mapped_name is None or renamed_name is None or not include_fn(mapped_name): + continue + seen.add(renamed_name) + return len(seen) + + +def iter_filtered_tensors( + index: SafeTensorsIndex, + outtype: str, + include_fn: Callable[[str], bool], + rename_fn: Callable[[str], str | None], +) -> Iterator[tuple[str, np.ndarray[Any, Any]]]: + emitted: set[str] = set() + + for name in index: + mapped_name = map_tensor_name(name) + renamed_name = rename_fn(name) + if ( + mapped_name is None + or renamed_name is None + or renamed_name in emitted + or not include_fn(mapped_name) + ): + continue + + if ".parametrizations.weight.original0" in name: + prefix = name.replace(".parametrizations.weight.original0", "") + g_name = f"{prefix}.parametrizations.weight.original0" + v_name = f"{prefix}.parametrizations.weight.original1" + weight = merge_weight_norm(index.load(g_name), index.load(v_name)) + yield renamed_name, convert_tensor_dtype(weight, outtype) + emitted.add(renamed_name) + continue + + tensor = index.load(name) + yield renamed_name, convert_tensor_dtype(tensor, outtype) + emitted.add(renamed_name) + + +def add_common_metadata( + writer: gguf.GGUFWriter, + arch: str, + config: dict[str, Any], + model_name: str, +) -> None: + writer.add_type("model") + writer.add_name(model_name) + + sampling_rate = int(config.get("sampling_rate", DEFAULT_SAMPLING_RATE)) + downsample_rate = int(config.get("downsample_rate", DEFAULT_DOWNSAMPLE_RATE)) + context_duration = float(config.get("causal_transformer_context_duration", DEFAULT_CONTEXT_DURATION)) + quantizer_cfg = dict(config.get("quantizer_kwargs", {})) + quantizer_type = 
config.get("quantizer_type") or quantizer_cfg.get("quantizer_type", "rlfq") + + writer.add_uint32(f"{arch}.sampling_rate", sampling_rate) + writer.add_uint32(f"{arch}.downsample_rate", downsample_rate) + writer.add_float32(f"{arch}.causal_transformer_context_duration", context_duration) + writer.add_uint32(f"{arch}.code_dim", int(config.get("code_dim", quantizer_cfg.get("output_dim", 0)))) + writer.add_string(f"{arch}.quantizer_type", quantizer_type) + writer.add_uint32(f"{arch}.quantizer.input_dim", int(quantizer_cfg["input_dim"])) + writer.add_uint32(f"{arch}.quantizer.rvq_dim", int(quantizer_cfg.get("rvq_dim", quantizer_cfg["input_dim"]))) + writer.add_uint32(f"{arch}.quantizer.output_dim", int(quantizer_cfg.get("output_dim", quantizer_cfg["input_dim"]))) + writer.add_uint32(f"{arch}.quantizer.num_quantizers", int(quantizer_cfg["num_quantizers"])) + writer.add_uint32(f"{arch}.quantizer.codebook_size", int(quantizer_cfg["codebook_size"])) + writer.add_uint32(f"{arch}.quantizer.codebook_dim", int(quantizer_cfg["codebook_dim"])) + + +def add_encoder_metadata(writer: gguf.GGUFWriter, config: dict[str, Any], model_name: str) -> None: + add_common_metadata(writer, ARCH_ENCODER, config, model_name) + write_module_list_metadata( + writer=writer, + arch=ARCH_ENCODER, + section_name="encoder", + module_cfgs=list(config.get("encoder_kwargs", [])), + initial_frame_rate=float(config.get("sampling_rate", DEFAULT_SAMPLING_RATE)), + context_duration=float(config.get("causal_transformer_context_duration", DEFAULT_CONTEXT_DURATION)), + is_encoder=True, + ) + + +def add_decoder_metadata(writer: gguf.GGUFWriter, config: dict[str, Any], model_name: str) -> None: + add_common_metadata(writer, ARCH_DECODER, config, model_name) + + encoder_frame_rate = float(config.get("sampling_rate", DEFAULT_SAMPLING_RATE)) + for module_cfg in list(config.get("encoder_kwargs", [])): + if module_cfg.get("module_type") == "PatchedPretransform": + encoder_frame_rate /= int(module_cfg["patch_size"]) + + 
write_module_list_metadata( + writer=writer, + arch=ARCH_DECODER, + section_name="decoder", + module_cfgs=list(config.get("decoder_kwargs", [])), + initial_frame_rate=encoder_frame_rate, + context_duration=float(config.get("causal_transformer_context_duration", DEFAULT_CONTEXT_DURATION)), + is_encoder=False, + ) + + +def build_writer(outfile: Path, arch: str, outtype: str, config: dict[str, Any], model_name: str) -> gguf.GGUFWriter: + ftype_map = { + "f32": gguf.LlamaFileType.ALL_F32, + "f16": gguf.LlamaFileType.MOSTLY_F16, + } + writer = gguf.GGUFWriter(path=outfile, arch=arch) + writer.add_file_type(ftype_map[outtype]) + if arch == ARCH_ENCODER: + add_encoder_metadata(writer, config, model_name) + elif arch == ARCH_DECODER: + add_decoder_metadata(writer, config, model_name) + else: + raise ValueError(f"unexpected split arch {arch!r}") + return writer + + +def convert_one( + model_dir: Path, + outfile: Path, + outtype: str, + model_name: str, + include_fn: Callable[[str], bool], + arch: str, + dry_run: bool, +) -> None: + config = load_config(model_dir) + validate_config(config) + index = SafeTensorsIndex(model_dir) + encoder_block_map = build_transformer_block_index_map(list(config.get("encoder_kwargs", []))) + decoder_block_map = build_transformer_block_index_map(list(config.get("decoder_kwargs", []))) + rename_fn = lambda name: map_split_tensor_name(name, encoder_block_map, decoder_block_map) + total_tensors = count_filtered_output_tensors(index, include_fn, rename_fn) + logger.info( + "%s: selected %d output tensors for %s", + arch, + total_tensors, + outfile, + ) + + if dry_run: + paths: dict[str, int] = {} + for name in index: + mapped_name = map_tensor_name(name) + renamed_name = rename_fn(name) + if mapped_name is None or renamed_name is None or not include_fn(mapped_name): + continue + key = _count_path(renamed_name) + paths[key] = paths.get(key, 0) + 1 + + for key in sorted(paths): + logger.debug("%s keeps %3d tensors under %s", arch, paths[key], key) + 
logger.info("%s: dry-run only, not writing %s", arch, outfile) + return + + outfile.parent.mkdir(parents=True, exist_ok=True) + writer = build_writer(outfile, arch, outtype, config, model_name) + try: + for i, (name, tensor) in enumerate(iter_filtered_tensors(index, outtype, include_fn, rename_fn), start=1): + logger.debug("[%4d / %4d] %s %s", i, total_tensors, name, list(tensor.shape)) + writer.add_tensor(name, tensor) + + writer.write_header_to_file() + writer.write_kv_data_to_file() + writer.write_tensors_to_file(progress=False) + logger.info("%s: wrote %s", arch, outfile) + finally: + writer.close() + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=( + "Split a Hugging Face MOSS Audio Tokenizer checkpoint into " + "moss-tts-audio-encoder and moss-tts-audio-decoder GGUF files." + ) + ) + parser.add_argument( + "model_dir", + type=Path, + help="Path to a local MOSS Audio Tokenizer HF checkpoint directory.", + ) + parser.add_argument( + "--encoder-outfile", + type=Path, + default=None, + help="Output path for the moss-tts-audio-encoder GGUF.", + ) + parser.add_argument( + "--decoder-outfile", + type=Path, + default=None, + help="Output path for the moss-tts-audio-decoder GGUF.", + ) + parser.add_argument( + "--outtype", + choices=("f16", "f32"), + default="f16", + help="GGUF floating-point storage type.", + ) + parser.add_argument( + "--model-name-prefix", + type=str, + default=None, + help="Optional prefix for general.name. 
Defaults to the checkpoint directory name.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Validate config and tensor split without writing GGUF files.", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Enable per-tensor logging.", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format="%(levelname)s:%(name)s:%(message)s", + ) + + model_dir = args.model_dir.resolve() + name_prefix = args.model_name_prefix or model_dir.name + encoder_outfile = ( + args.encoder_outfile.resolve() + if args.encoder_outfile is not None + else default_encoder_outfile(model_dir, args.outtype) + ) + decoder_outfile = ( + args.decoder_outfile.resolve() + if args.decoder_outfile is not None + else default_decoder_outfile(model_dir, args.outtype) + ) + + convert_one( + model_dir=model_dir, + outfile=encoder_outfile, + outtype=args.outtype, + model_name=f"{name_prefix} Encoder", + include_fn=is_encoder_tensor, + arch=ARCH_ENCODER, + dry_run=args.dry_run, + ) + convert_one( + model_dir=model_dir, + outfile=decoder_outfile, + outtype=args.outtype, + model_name=f"{name_prefix} Decoder", + include_fn=is_decoder_tensor, + arch=ARCH_DECODER, + dry_run=args.dry_run, + ) + + +if __name__ == "__main__": + main() diff --git a/convert_moss_audio_tokenizer_to_gguf.py b/convert_moss_audio_tokenizer_to_gguf.py new file mode 100755 index 000000000..b503370b7 --- /dev/null +++ b/convert_moss_audio_tokenizer_to_gguf.py @@ -0,0 +1,503 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MIT + +from __future__ import annotations + +import argparse +import json +import logging +import struct +import sys +import types +from collections import OrderedDict +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Iterator + +import numpy as np + +# Older local Python envs can ship a NumPy build without numpy.typing. 
+try: + import numpy.typing # type: ignore # noqa: F401 +except Exception: + numpy_typing = types.ModuleType("numpy.typing") + numpy_typing.DTypeLike = object + sys.modules["numpy.typing"] = numpy_typing + +sys.path.insert(0, str(Path(__file__).parent / "gguf-py")) +import gguf # noqa: E402 + + +logger = logging.getLogger("convert_moss_audio_tokenizer_to_gguf") + +ARCH = "moss-audio-tokenizer" + +DEFAULT_SAMPLING_RATE = 24_000 +DEFAULT_DOWNSAMPLE_RATE = 1_920 +DEFAULT_CONTEXT_DURATION = 10.0 + +SUPPORTED_MODULE_TYPES = {"PatchedPretransform", "Transformer"} +SUPPORTED_GATING = {"none"} +SUPPORTED_POSITIONAL_EMBEDDINGS = {"rope"} +SUPPORTED_QUANTIZER_TYPES = {"rlfq"} + +_SAFETENSORS_DTYPES: dict[str, np.dtype[Any]] = { + "BOOL": np.dtype(np.bool_), + "U8": np.dtype(np.uint8), + "I8": np.dtype(np.int8), + "I16": np.dtype(np.int16), + "U16": np.dtype(np.uint16), + "I32": np.dtype(np.int32), + "U32": np.dtype(np.uint32), + "I64": np.dtype(np.int64), + "U64": np.dtype(np.uint64), + "F16": np.dtype(np.float16), + "F32": np.dtype(np.float32), + "F64": np.dtype(np.float64), +} + + +@dataclass(frozen=True) +class TensorLocation: + name: str + shard: Path + dtype: str + shape: tuple[int, ...] 
+ data_offsets: tuple[int, int] + data_start: int + + +class SafeTensorsIndex: + def __init__(self, model_dir: Path): + self.model_dir = model_dir + self.locations: OrderedDict[str, TensorLocation] = OrderedDict() + self._headers: dict[Path, dict[str, Any]] = {} + + index_path = model_dir / "model.safetensors.index.json" + if index_path.exists(): + index = json.loads(index_path.read_text()) + weight_map = index["weight_map"] + for tensor_name, shard_name in weight_map.items(): + shard_path = model_dir / shard_name + header, data_start = self._load_header(shard_path) + meta = header[tensor_name] + self.locations[tensor_name] = TensorLocation( + name=tensor_name, + shard=shard_path, + dtype=meta["dtype"], + shape=tuple(int(v) for v in meta["shape"]), + data_offsets=(int(meta["data_offsets"][0]), int(meta["data_offsets"][1])), + data_start=data_start, + ) + return + + shard_paths = sorted(model_dir.glob("*.safetensors")) + if not shard_paths: + raise FileNotFoundError(f"No safetensors files found under {model_dir}") + + for shard_path in shard_paths: + header, data_start = self._load_header(shard_path) + for tensor_name, meta in header.items(): + if tensor_name == "__metadata__": + continue + self.locations[tensor_name] = TensorLocation( + name=tensor_name, + shard=shard_path, + dtype=meta["dtype"], + shape=tuple(int(v) for v in meta["shape"]), + data_offsets=(int(meta["data_offsets"][0]), int(meta["data_offsets"][1])), + data_start=data_start, + ) + + def _load_header(self, shard_path: Path) -> tuple[dict[str, Any], int]: + cached = self._headers.get(shard_path) + if cached is not None: + return cached, cached["__data_start__"] + + with shard_path.open("rb") as f: + header_len = struct.unpack(" bool: + return name in self.locations + + def __iter__(self) -> Iterator[str]: + return iter(self.locations.keys()) + + def load(self, name: str) -> np.ndarray[Any, Any]: + loc = self.locations[name] + shape = tuple(loc.shape) + offset = loc.data_start + loc.data_offsets[0] + 
+ if loc.dtype == "BF16": + raw = np.memmap(loc.shard, mode="r", dtype=np.uint16, offset=offset, shape=shape) + return bf16_to_float32(raw) + + dtype = _SAFETENSORS_DTYPES.get(loc.dtype) + if dtype is None: + raise ValueError(f"Unsupported safetensors dtype {loc.dtype!r} for tensor {name!r}") + + tensor = np.memmap(loc.shard, mode="r", dtype=dtype, offset=offset, shape=shape) + return np.asarray(tensor) + + +def bf16_to_float32(raw: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]: + u32 = raw.astype(np.uint32) << 16 + return u32.view(np.float32) + + +def to_serializable_config_value(value: Any) -> Any: + if isinstance(value, (str, bool, int, float)): + return value + raise TypeError(f"Unsupported config value type: {type(value)!r}") + + +def add_config_value(writer: gguf.GGUFWriter, key: str, value: Any) -> None: + value = to_serializable_config_value(value) + if isinstance(value, bool): + writer.add_bool(key, value) + elif isinstance(value, int): + if value >= 0: + writer.add_uint32(key, value) + else: + writer.add_int32(key, value) + elif isinstance(value, float): + writer.add_float32(key, value) + elif isinstance(value, str): + writer.add_string(key, value) + else: + raise TypeError(f"Unsupported config value type for {key!r}: {type(value)!r}") + + +def load_config(model_dir: Path) -> dict[str, Any]: + config_path = model_dir / "config.json" + if not config_path.exists(): + raise FileNotFoundError(f"Missing config.json under {model_dir}") + return json.loads(config_path.read_text()) + + +def validate_config(config: dict[str, Any]) -> None: + quantizer_type = config.get("quantizer_type") or config.get("quantizer_kwargs", {}).get("quantizer_type") + if quantizer_type not in SUPPORTED_QUANTIZER_TYPES: + raise ValueError( + f"Unsupported quantizer_type {quantizer_type!r}. 
" + f"This converter currently supports: {sorted(SUPPORTED_QUANTIZER_TYPES)}" + ) + + for section in ("encoder_kwargs", "decoder_kwargs"): + for idx, module_cfg in enumerate(config.get(section, [])): + module_type = module_cfg.get("module_type") + if module_type not in SUPPORTED_MODULE_TYPES: + raise ValueError(f"Unsupported {section}[{idx}].module_type={module_type!r}") + if module_type != "Transformer": + continue + gating = module_cfg.get("gating", "none") + if gating not in SUPPORTED_GATING: + raise ValueError(f"Unsupported {section}[{idx}].gating={gating!r}") + positional_embedding = module_cfg.get("positional_embedding", "rope") + if positional_embedding not in SUPPORTED_POSITIONAL_EMBEDDINGS: + raise ValueError( + f"Unsupported {section}[{idx}].positional_embedding={positional_embedding!r}" + ) + if "weights_per_step" in module_cfg and module_cfg["weights_per_step"]: + raise ValueError(f"Unsupported {section}[{idx}].weights_per_step={module_cfg['weights_per_step']!r}") + if "weights_per_step_schedule" in module_cfg and module_cfg["weights_per_step_schedule"]: + raise ValueError( + f"Unsupported {section}[{idx}].weights_per_step_schedule=" + f"{module_cfg['weights_per_step_schedule']!r}" + ) + + +def add_metadata( + writer: gguf.GGUFWriter, + config: dict[str, Any], + model_name: str, + *, + include_general_fields: bool = True, +) -> None: + if include_general_fields: + writer.add_type("audio_tokenizer") + writer.add_name(model_name) + + sampling_rate = int(config.get("sampling_rate", DEFAULT_SAMPLING_RATE)) + downsample_rate = int(config.get("downsample_rate", DEFAULT_DOWNSAMPLE_RATE)) + context_duration = float(config.get("causal_transformer_context_duration", DEFAULT_CONTEXT_DURATION)) + + writer.add_uint32(f"{ARCH}.sampling_rate", sampling_rate) + writer.add_uint32(f"{ARCH}.downsample_rate", downsample_rate) + writer.add_float32(f"{ARCH}.causal_transformer_context_duration", context_duration) + + if "code_dim" in config: + 
writer.add_uint32(f"{ARCH}.code_dim", int(config["code_dim"])) + + quantizer_type = config.get("quantizer_type") or config.get("quantizer_kwargs", {}).get("quantizer_type", "rlfq") + writer.add_string(f"{ARCH}.quantizer_type", quantizer_type) + + quantizer_cfg = dict(config.get("quantizer_kwargs", {})) + writer.add_uint32(f"{ARCH}.quantizer.input_dim", int(quantizer_cfg["input_dim"])) + writer.add_uint32(f"{ARCH}.quantizer.rvq_dim", int(quantizer_cfg.get("rvq_dim", quantizer_cfg["input_dim"]))) + writer.add_uint32(f"{ARCH}.quantizer.output_dim", int(quantizer_cfg.get("output_dim", quantizer_cfg["input_dim"]))) + writer.add_uint32(f"{ARCH}.quantizer.num_quantizers", int(quantizer_cfg["num_quantizers"])) + writer.add_uint32(f"{ARCH}.quantizer.codebook_size", int(quantizer_cfg["codebook_size"])) + writer.add_uint32(f"{ARCH}.quantizer.codebook_dim", int(quantizer_cfg["codebook_dim"])) + + write_module_list_metadata( + writer=writer, + arch=ARCH, + section_name="encoder", + module_cfgs=list(config.get("encoder_kwargs", [])), + initial_frame_rate=float(sampling_rate), + context_duration=context_duration, + is_encoder=True, + ) + + encoder_final_frame_rate = compute_final_encoder_frame_rate( + module_cfgs=list(config.get("encoder_kwargs", [])), + sampling_rate=float(sampling_rate), + ) + write_module_list_metadata( + writer=writer, + arch=ARCH, + section_name="decoder", + module_cfgs=list(config.get("decoder_kwargs", [])), + initial_frame_rate=encoder_final_frame_rate, + context_duration=context_duration, + is_encoder=False, + ) + + +def compute_final_encoder_frame_rate(module_cfgs: list[dict[str, Any]], sampling_rate: float) -> float: + frame_rate = sampling_rate + for module_cfg in module_cfgs: + if module_cfg.get("module_type") == "PatchedPretransform": + frame_rate /= int(module_cfg["patch_size"]) + return frame_rate + + +def write_module_list_metadata( + writer: gguf.GGUFWriter, + arch: str, + section_name: str, + module_cfgs: list[dict[str, Any]], + 
initial_frame_rate: float, + context_duration: float, + is_encoder: bool, +) -> None: + writer.add_uint32(f"{arch}.{section_name}.block_count", len(module_cfgs)) + + frame_rate = initial_frame_rate + for idx, module_cfg in enumerate(module_cfgs): + prefix = f"{arch}.{section_name}.{idx}" + module_type = module_cfg["module_type"] + writer.add_string(f"{prefix}.module_type", module_type) + + if module_type == "PatchedPretransform": + patch_size = int(module_cfg["patch_size"]) + writer.add_uint32(f"{prefix}.patch_size", patch_size) + if is_encoder: + frame_rate /= patch_size + else: + frame_rate *= patch_size + continue + + context = int(frame_rate * context_duration) + add_config_value(writer, f"{prefix}.context", context) + for key in ( + "input_dimension", + "output_dimension", + "d_model", + "num_heads", + "num_layers", + "dim_feedforward", + "causal", + "norm", + "positional_embedding", + "max_period", + "layer_scale", + "conv_layout", + "gating", + ): + if key in module_cfg: + add_config_value(writer, f"{prefix}.{key}", module_cfg[key]) + + +def map_tensor_name(name: str) -> str | None: + if ".parametrizations.weight.original0" in name: + return name.replace(".parametrizations.weight.original0", ".weight") + if ".parametrizations.weight.original1" in name: + return None + return name + + +def is_float_tensor(tensor: np.ndarray[Any, Any]) -> bool: + return np.issubdtype(tensor.dtype, np.floating) + + +def choose_output_dtype(tensor: np.ndarray[Any, Any], outtype: str) -> np.dtype[Any] | None: + if not is_float_tensor(tensor): + return None + if outtype == "f32": + return np.dtype(np.float32) + if outtype == "f16": + if tensor.ndim <= 1: + return np.dtype(np.float32) + return np.dtype(np.float16) + raise ValueError(f"Unsupported outtype {outtype!r}") + + +def convert_tensor_dtype(tensor: np.ndarray[Any, Any], outtype: str) -> np.ndarray[Any, Any]: + dst_dtype = choose_output_dtype(tensor, outtype) + if dst_dtype is None: + return np.ascontiguousarray(tensor) + if 
tensor.dtype == dst_dtype: + return np.ascontiguousarray(tensor) + return np.ascontiguousarray(tensor.astype(dst_dtype, copy=False)) + + +def merge_weight_norm(g: np.ndarray[Any, Any], v: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]: + axes = tuple(range(1, v.ndim)) + norm = np.linalg.norm(v.astype(np.float32), axis=axes, keepdims=True) + norm = np.maximum(norm, np.finfo(np.float32).eps) + return g.astype(np.float32) * v.astype(np.float32) / norm + + +def iter_converted_tensors(index: SafeTensorsIndex, outtype: str) -> Iterator[tuple[str, np.ndarray[Any, Any]]]: + emitted: set[str] = set() + + for name in index: + mapped_name = map_tensor_name(name) + if mapped_name is None or mapped_name in emitted: + continue + + if ".parametrizations.weight.original0" in name: + prefix = name.replace(".parametrizations.weight.original0", "") + g_name = f"{prefix}.parametrizations.weight.original0" + v_name = f"{prefix}.parametrizations.weight.original1" + weight = merge_weight_norm(index.load(g_name), index.load(v_name)) + yield mapped_name, convert_tensor_dtype(weight, outtype) + emitted.add(mapped_name) + continue + + tensor = index.load(name) + yield mapped_name, convert_tensor_dtype(tensor, outtype) + emitted.add(mapped_name) + + +def count_output_tensors(index: SafeTensorsIndex) -> int: + seen: set[str] = set() + for name in index: + mapped_name = map_tensor_name(name) + if mapped_name is not None: + seen.add(mapped_name) + return len(seen) + + +def default_outfile(model_dir: Path, outtype: str) -> Path: + return model_dir / f"{model_dir.name}-{outtype}.gguf" + + +def build_writer(outfile: Path, outtype: str, model_name: str, config: dict[str, Any]) -> gguf.GGUFWriter: + ftype_map = { + "f32": gguf.LlamaFileType.ALL_F32, + "f16": gguf.LlamaFileType.MOSTLY_F16, + } + writer = gguf.GGUFWriter(path=outfile, arch=ARCH) + writer.add_file_type(ftype_map[outtype]) + add_metadata(writer, config, model_name) + return writer + + +def convert(model_dir: Path, outfile: Path, outtype: 
str, model_name: str, dry_run: bool) -> None: + config = load_config(model_dir) + validate_config(config) + + index = SafeTensorsIndex(model_dir) + total_tensors = count_output_tensors(index) + logger.info("Found %d input tensors, %d output tensors", len(index.locations), total_tensors) + + if dry_run: + logger.info("Dry-run only, not writing %s", outfile) + return + + outfile.parent.mkdir(parents=True, exist_ok=True) + writer = build_writer(outfile=outfile, outtype=outtype, model_name=model_name, config=config) + try: + for i, (name, tensor) in enumerate(iter_converted_tensors(index, outtype), start=1): + logger.debug("[%4d / %4d] %s %s", i, total_tensors, name, list(tensor.shape)) + writer.add_tensor(name, tensor) + + writer.write_header_to_file() + writer.write_kv_data_to_file() + writer.write_tensors_to_file(progress=False) + logger.info("Wrote %s", outfile) + finally: + writer.close() + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Convert a Hugging Face MOSS Audio Tokenizer checkpoint to GGUF without modifying convert_hf_to_gguf.py." + ) + parser.add_argument( + "model_dir", + type=Path, + help="Path to a local MOSS Audio Tokenizer HF checkpoint directory containing config.json and safetensors shards.", + ) + parser.add_argument( + "--outfile", + type=Path, + default=None, + help="Output GGUF path. Defaults to /-.gguf", + ) + parser.add_argument( + "--outtype", + choices=("f16", "f32"), + default="f16", + help="GGUF floating-point storage type. f16 keeps 1D float tensors in f32, matching MOSTLY_F16 semantics.", + ) + parser.add_argument( + "--model-name", + type=str, + default=None, + help="Optional GGUF general.name override. 
Defaults to the checkpoint directory name.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Validate the checkpoint and tensor mapping without writing the GGUF file.", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Enable per-tensor logging.", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format="%(levelname)s:%(name)s:%(message)s", + ) + + model_dir = args.model_dir.resolve() + outfile = args.outfile.resolve() if args.outfile is not None else default_outfile(model_dir, args.outtype) + model_name = args.model_name or model_dir.name + + convert( + model_dir=model_dir, + outfile=outfile, + outtype=args.outtype, + model_name=model_name, + dry_run=args.dry_run, + ) + + +if __name__ == "__main__": + main() diff --git a/docs/moss-tts-firstclass-e2e.md b/docs/moss-tts-firstclass-e2e.md index bdf9efd96..2c9e871e6 100644 --- a/docs/moss-tts-firstclass-e2e.md +++ b/docs/moss-tts-firstclass-e2e.md @@ -4,24 +4,21 @@ This document describes the **first-class** MOSS-TTS end-to-end inference pipeline in the current `llama.cpp` repository. 
-This pipeline uses: +There are currently two ways to run it: -- **llama.cpp** and `llama-moss-tts` to run the first-class MOSS-TTS-Delay GGUF model -- **ONNX Runtime** for reference-audio encoding and final waveform decoding -- **Python helper scripts** for prompt construction and end-to-end orchestration -- A local **MOSS-TTS** checkout that provides the prompt builder and ONNX tokenizer Python modules +- **Recommended native path**: all three models run inside `llama.cpp` + - `moss-tts-delay` backbone via `llama_decode()` + - `moss-tts-audio-encoder` via `llama_encode()` + - `moss-tts-audio-decoder` via `llama_encode()` +- **Hybrid wrapper path**: backbone in `llama.cpp`, audio tokenizer in ONNX, orchestrated by Python -Unlike the older `moss_tts_delay/llama_cpp` backend in the `MOSS-TTS` repository, this path moves multi-channel inputs, the transformer backbone, multi-head outputs, and delay-pattern decoding into `llama.cpp`. Python is only responsible for preparing inputs and invoking the ONNX audio tokenizer. +Unlike the older `moss_tts_delay/llama_cpp` backend in the `MOSS-TTS` repository, this path moves multi-channel inputs, the transformer backbone, multi-head outputs, and delay-pattern decoding into `llama.cpp`. ## Prerequisites 1. **llama.cpp** built from source with the `llama-moss-tts` target -2. **Python >= 3.10** -3. A local **MOSS-TTS** checkout, provided in any of the following ways: - - available at `../MOSS-TTS` relative to the repository root - - passed through `--moss-tts-dir` - - passed through `MOSS_TTS_DIR` or `MOSS_TTS_ROOT` -4. Python packages required by the helper scripts: +2. **Python >= 3.10** if you want to use the hybrid wrapper or the converter scripts +3. Python packages required by the hybrid helper scripts: - `numpy` - `soundfile` - `tokenizers` @@ -29,22 +26,37 @@ Unlike the older `moss_tts_delay/llama_cpp` backend in the `MOSS-TTS` repository ## Build +### CPU-only build + ```bash cd /path/to/llama.cpp -cmake -S . 
-B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON +cmake -S . -B build -DCMAKE_BUILD_TYPE=Release cmake --build build --target llama-moss-tts -j ``` -The resulting binary is: +Binary: - `build/bin/llama-moss-tts` -If you want to build at runtime, you can also pass `--build` to the e2e script. +### CUDA build + +```bash +cd /path/to/llama.cpp + +cmake -S . -B build-cuda -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON +cmake --build build-cuda --target llama-moss-tts -j +``` + +Binary: + +- `build-cuda/bin/llama-moss-tts` + +If you want to build the hybrid wrapper at runtime, you can also pass `--build` to the e2e script. ## Weight Preparation -### Step 1: Prepare the first-class GGUF model +### Step 1: Prepare the backbone GGUF You need a first-class MOSS-TTS-Delay GGUF model that already contains: @@ -75,7 +87,30 @@ Important: - It is **not** the same thing as a generic GGUF downloaded from `OpenMOSS/MOSS-TTS-GGUF`. - Do not point this pipeline at a file from `OpenMOSS/MOSS-TTS-GGUF` unless that file was explicitly produced as a first-class MOSS-TTS-Delay GGUF for this `llama.cpp` implementation. 
-### Step 2: Prepare the tokenizer directory +### Step 2: Prepare the native audio encoder / decoder GGUFs + +You need two additional GGUF files: + +- `moss-tts-audio-encoder` +- `moss-tts-audio-decoder` + +They can be generated from the Hugging Face `MOSS-Audio-Tokenizer` directory with: + +```bash +huggingface-cli download OpenMOSS-Team/MOSS-Audio-Tokenizer --local-dir /path/to/MOSS-Audio-Tokenizer-hf + +python convert_moss_audio_tokenizer_split_to_gguf.py \ + /path/to/MOSS-Audio-Tokenizer-hf \ + --outdir /path/to/out \ + --outtype f16 +``` + +Typical outputs: + +- `/path/to/out/moss_tts_audio_encoder_f16.gguf` +- `/path/to/out/moss_tts_audio_decoder_f16.gguf` + +### Step 3: Prepare the tokenizer directory for the hybrid wrapper You need a tokenizer directory containing at least: @@ -85,7 +120,7 @@ For example: - `weights/extracted/qwen3_backbone/` -### Step 3: Prepare the ONNX audio tokenizer +### Step 4: Prepare the ONNX audio tokenizer for the hybrid wrapper You need both ONNX files: @@ -97,34 +132,70 @@ For example: - `weights/MOSS-Audio-Tokenizer-ONNX/encoder.onnx` - `weights/MOSS-Audio-Tokenizer-ONNX/decoder.onnx` -### Step 4: Make the MOSS-TTS repository visible +## Usage -The helper scripts import: +### Current Native Runtime: Three GGUFs -- `moss_tts_delay.llama_cpp.processor` -- `moss_audio_tokenizer.onnx` +This is the current recommended path. -You can provide the repository path like this: +#### CPU ```bash -export MOSS_TTS_DIR=/path/to/MOSS-TTS +# Text-only TTS on CPU +build/bin/llama-moss-tts \ + -m /path/to/moss_delay_firstclass_f16.gguf \ + --audio-decoder-model /path/to/moss_tts_audio_decoder_f16.gguf \ + --text "Hello, world!" 
\ + --wav-out /path/to/output.wav + +# Voice cloning on CPU +build/bin/llama-moss-tts \ + -m /path/to/moss_delay_firstclass_f16.gguf \ + --audio-encoder-model /path/to/moss_tts_audio_encoder_f16.gguf \ + --audio-decoder-model /path/to/moss_tts_audio_decoder_f16.gguf \ + --text-file /path/to/text.txt \ + --reference-audio /path/to/reference_24k.wav \ + --wav-out /path/to/output.wav ``` -or: +#### GPU ```bash -python tools/tts/moss-tts-firstclass-e2e.py --moss-tts-dir /path/to/MOSS-TTS ... +# Text-only TTS on GPU +build-cuda/bin/llama-moss-tts \ + -m /path/to/moss_delay_firstclass_f16.gguf \ + --audio-decoder-model /path/to/moss_tts_audio_decoder_f16.gguf \ + --text "Hello, world!" \ + --wav-out /path/to/output.wav \ + -ngl -1 + +# Voice cloning on GPU +build-cuda/bin/llama-moss-tts \ + -m /path/to/moss_delay_firstclass_f16.gguf \ + --audio-encoder-model /path/to/moss_tts_audio_encoder_f16.gguf \ + --audio-decoder-model /path/to/moss_tts_audio_decoder_f16.gguf \ + --text-file /path/to/text.txt \ + --reference-audio /path/to/reference_24k.wav \ + --wav-out /path/to/output.wav \ + -ngl -1 ``` -## Usage +Notes: + +- `--reference-audio` must be a 24 kHz mono wav. +- `-ngl -1` means "offload all eligible layers to GPU". +- If you built `build-cuda/bin/llama-moss-tts` but want to force CPU execution, use `-ngl 0`. -### CLI +### Hybrid Wrapper: Backbone in GGUF, Audio Tokenizer in ONNX + +This path remains useful for parity checks and intermediate artifact inspection. 
+ +#### CLI ```bash # Voice cloning: text + reference audio -> wav python tools/tts/moss-tts-firstclass-e2e.py \ --model-gguf /path/to/moss_delay_firstclass.gguf \ - --moss-tts-dir /path/to/MOSS-TTS \ --tokenizer-dir /path/to/tokenizer_dir \ --onnx-encoder /path/to/encoder.onnx \ --onnx-decoder /path/to/decoder.onnx \ @@ -135,7 +206,6 @@ python tools/tts/moss-tts-firstclass-e2e.py \ # Direct generation without reference audio python tools/tts/moss-tts-firstclass-e2e.py \ --model-gguf /path/to/moss_delay_firstclass.gguf \ - --moss-tts-dir /path/to/MOSS-TTS \ --tokenizer-dir /path/to/tokenizer_dir \ --onnx-encoder /path/to/encoder.onnx \ --onnx-decoder /path/to/decoder.onnx \ @@ -146,7 +216,6 @@ python tools/tts/moss-tts-firstclass-e2e.py \ python tools/tts/moss-tts-firstclass-e2e.py \ --build \ --model-gguf /path/to/moss_delay_firstclass.gguf \ - --moss-tts-dir /path/to/MOSS-TTS \ --tokenizer-dir /path/to/tokenizer_dir \ --onnx-encoder /path/to/encoder.onnx \ --onnx-decoder /path/to/decoder.onnx \ @@ -159,7 +228,7 @@ python tools/tts/moss-tts-firstclass-e2e.py \ | Option | Values | Description | |------|------|------| | `--model-gguf` | path | First-class MOSS-TTS GGUF model | -| `--moss-tts-dir` | path | Local `MOSS-TTS` repository root | +| `--moss-tts-dir` | path | Deprecated compatibility flag; no longer required | | `--tokenizer-dir` | path | Directory containing `tokenizer.json` | | `--onnx-encoder` | path | Audio tokenizer encoder ONNX | | `--onnx-decoder` | path | Audio tokenizer decoder ONNX | @@ -174,8 +243,41 @@ python tools/tts/moss-tts-firstclass-e2e.py \ | `--cpu-audio-encode` | flag | Force ONNX reference-audio encoding on CPU | | `--build` | flag | Build `llama-moss-tts` before running | +### Native Runtime Options + +| Option | Values | Description | +|------|------|------| +| `-m` | path | Backbone `moss-tts-delay` GGUF | +| `--audio-encoder-model` | path | Native `moss-tts-audio-encoder` GGUF | +| `--audio-decoder-model` | path | Native 
`moss-tts-audio-decoder` GGUF | +| `--text` / `--text-file` | string / path | Input text, choose exactly one | +| `--reference-audio` | path | Optional 24 kHz reference wav | +| `--language` | `zh` / `en` / tag | Language tag passed to the prompt builder | +| `--max-new-tokens` | int | Maximum generation steps | +| `--gpu-layers` / `-ngl` | `-1` / `0` / `N` | GPU offload layers | +| `--wav-out` | path | Output wav path | + ## Architecture +### Native Three-GGUF Path + +```text +Input text (+ optional reference wav) + | + v +llama-moss-tts + | + |- text prompt packing + |- optional reference wav -> moss-tts-audio-encoder -> reference audio codes + |- moss-tts-delay backbone via llama_decode() + |- multi-head sampling + C++ delay-pattern decoding + |- raw audio codes -> moss-tts-audio-decoder -> waveform + v +wav +``` + +### Hybrid Wrapper Path + ```text Input text (+ optional reference wav) | @@ -184,7 +286,7 @@ moss-tts-build-generation-ref.py | |- tokenizes text with the Qwen3 tokenizer |- optionally encodes the reference wav into audio codes with ONNX - |- calls the prompt builder from the local MOSS-TTS repo + |- builds the packed prompt with the local lightweight MOSS-TTS processor v generation.ref.bin | @@ -232,11 +334,14 @@ llama.cpp/ ├── docs/ │ ├── moss-tts-firstclass-e2e.md │ └── moss-tts-firstclass-e2e_zh.md +├── convert_moss_audio_tokenizer_split_to_gguf.py ├── tools/tts/ │ ├── moss-tts-firstclass-e2e.py # End-to-end wrapper │ ├── moss-tts-build-generation-ref.py # Prompt / input builder │ ├── moss-tts-audio-decode.py # ONNX audio decode helper -│ └── moss-tts.cpp # llama-moss-tts implementation -└── build/bin/ +│ └── run-moss-tts-delay.cpp # llama-moss-tts implementation +├── build/bin/ +│ └── llama-moss-tts +└── build-cuda/bin/ └── llama-moss-tts ``` diff --git a/docs/moss-tts-firstclass-e2e_zh.md b/docs/moss-tts-firstclass-e2e_zh.md index 644a4bf4c..593e6b736 100644 --- a/docs/moss-tts-firstclass-e2e_zh.md +++ b/docs/moss-tts-firstclass-e2e_zh.md @@ 
-4,24 +4,21 @@ 本文档说明当前 `llama.cpp` 仓库中的 **first-class** MOSS-TTS 端到端推理链路。 -这条链路使用: +目前有两种运行方式: -- **llama.cpp** 和 `llama-moss-tts` 运行 first-class MOSS-TTS-Delay GGUF 模型 -- **ONNX Runtime** 完成参考音频编码和最终波形解码 -- **Python helper scripts** 负责 prompt 构建和整条链路编排 -- 本地 **MOSS-TTS** 仓库 checkout 提供 prompt builder 和 ONNX tokenizer Python 模块 +- **推荐的原生路径**:三个模型都在 `llama.cpp` 里运行 + - `moss-tts-delay` backbone 通过 `llama_decode()` + - `moss-tts-audio-encoder` 通过 `llama_encode()` + - `moss-tts-audio-decoder` 通过 `llama_encode()` +- **Hybrid wrapper 路径**:backbone 在 `llama.cpp`,音频 tokenizer 仍走 ONNX,由 Python 统一编排 -与 `MOSS-TTS` 仓库中较早的 `moss_tts_delay/llama_cpp` 后端不同,这条链路把多通道输入、transformer backbone、多头输出以及 delay-pattern decode 都放进了 `llama.cpp`。Python 只负责准备输入和调用 ONNX 音频编解码器。 +与 `MOSS-TTS` 仓库中较早的 `moss_tts_delay/llama_cpp` 后端不同,这条链路把多通道输入、transformer backbone、多头输出以及 delay-pattern decode 都放进了 `llama.cpp`。 ## 前置条件 1. **llama.cpp** 已从源码编译,并包含 `llama-moss-tts` 目标 -2. **Python >= 3.10** -3. 本地存在一个 **MOSS-TTS** checkout,可以通过以下任一方式提供: - - 位于当前仓库根目录旁边的 `../MOSS-TTS` - - 通过 `--moss-tts-dir` 指定 - - 通过 `MOSS_TTS_DIR` 或 `MOSS_TTS_ROOT` 指定 -4. helper scripts 需要的 Python 包: +2. **Python >= 3.10**,如果你要使用 hybrid wrapper 或转换脚本 +3. hybrid helper scripts 需要的 Python 包: - `numpy` - `soundfile` - `tokenizers` @@ -29,22 +26,37 @@ ## 编译 +### 仅 CPU 构建 + ```bash cd /path/to/llama.cpp -cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON +cmake -S . -B build -DCMAKE_BUILD_TYPE=Release cmake --build build --target llama-moss-tts -j ``` -编译产物为: +产物: - `build/bin/llama-moss-tts` -如果你希望在运行时自动构建,也可以在 e2e 脚本里传 `--build`。 +### CUDA 构建 + +```bash +cd /path/to/llama.cpp + +cmake -S . 
-B build-cuda -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON +cmake --build build-cuda --target llama-moss-tts -j +``` + +产物: + +- `build-cuda/bin/llama-moss-tts` + +如果你希望在 hybrid wrapper 运行时自动构建,也可以传 `--build`。 ## 权重准备 -### 第一步:准备 first-class GGUF 模型 +### 第一步:准备 backbone GGUF 需要一个已经包含以下内容的 first-class MOSS-TTS-Delay GGUF: @@ -75,7 +87,30 @@ python convert_hf_to_gguf.py \ - 它**不是** `OpenMOSS/MOSS-TTS-GGUF` 仓库里的通用 GGUF 文件。 - 除非某个文件被明确说明为适配这套 `llama.cpp` first-class 实现的 MOSS-TTS-Delay GGUF,否则不要把 `OpenMOSS/MOSS-TTS-GGUF` 里的文件直接拿来给这条 e2e 流水线使用。 -### 第二步:准备 tokenizer 目录 +### 第二步:准备原生 audio encoder / decoder GGUF + +还需要两个额外的 GGUF 文件: + +- `moss-tts-audio-encoder` +- `moss-tts-audio-decoder` + +它们可以从 Hugging Face 的 `MOSS-Audio-Tokenizer` 目录转换得到: + +```bash +huggingface-cli download OpenMOSS-Team/MOSS-Audio-Tokenizer --local-dir /path/to/MOSS-Audio-Tokenizer-hf + +python convert_moss_audio_tokenizer_split_to_gguf.py \ + /path/to/MOSS-Audio-Tokenizer-hf \ + --outdir /path/to/out \ + --outtype f16 +``` + +典型输出: + +- `/path/to/out/moss_tts_audio_encoder_f16.gguf` +- `/path/to/out/moss_tts_audio_decoder_f16.gguf` + +### 第三步:为 hybrid wrapper 准备 tokenizer 目录 需要一个至少包含以下文件的 tokenizer 目录: @@ -85,7 +120,7 @@ python convert_hf_to_gguf.py \ - `weights/extracted/qwen3_backbone/` -### 第三步:准备 ONNX 音频编解码器 +### 第四步:为 hybrid wrapper 准备 ONNX 音频编解码器 需要同时提供两个 ONNX 文件: @@ -97,34 +132,70 @@ python convert_hf_to_gguf.py \ - `weights/MOSS-Audio-Tokenizer-ONNX/encoder.onnx` - `weights/MOSS-Audio-Tokenizer-ONNX/decoder.onnx` -### 第四步:让脚本能找到 MOSS-TTS 仓库 +## 使用方式 -helper scripts 会导入: +### 当前原生运行方式:三个 GGUF -- `moss_tts_delay.llama_cpp.processor` -- `moss_audio_tokenizer.onnx` +这是当前推荐路径。 -可以通过以下方式提供 repo 路径: +#### CPU ```bash -export MOSS_TTS_DIR=/path/to/MOSS-TTS +# 纯文本 TTS,CPU 运行 +build/bin/llama-moss-tts \ + -m /path/to/moss_delay_firstclass_f16.gguf \ + --audio-decoder-model /path/to/moss_tts_audio_decoder_f16.gguf \ + --text "你好,世界!" 
\ + --wav-out /path/to/output.wav + +# 音色克隆,CPU 运行 +build/bin/llama-moss-tts \ + -m /path/to/moss_delay_firstclass_f16.gguf \ + --audio-encoder-model /path/to/moss_tts_audio_encoder_f16.gguf \ + --audio-decoder-model /path/to/moss_tts_audio_decoder_f16.gguf \ + --text-file /path/to/text.txt \ + --reference-audio /path/to/reference_24k.wav \ + --wav-out /path/to/output.wav ``` -或者: +#### GPU ```bash -python tools/tts/moss-tts-firstclass-e2e.py --moss-tts-dir /path/to/MOSS-TTS ... +# 纯文本 TTS,GPU 运行 +build-cuda/bin/llama-moss-tts \ + -m /path/to/moss_delay_firstclass_f16.gguf \ + --audio-decoder-model /path/to/moss_tts_audio_decoder_f16.gguf \ + --text "你好,世界!" \ + --wav-out /path/to/output.wav \ + -ngl -1 + +# 音色克隆,GPU 运行 +build-cuda/bin/llama-moss-tts \ + -m /path/to/moss_delay_firstclass_f16.gguf \ + --audio-encoder-model /path/to/moss_tts_audio_encoder_f16.gguf \ + --audio-decoder-model /path/to/moss_tts_audio_decoder_f16.gguf \ + --text-file /path/to/text.txt \ + --reference-audio /path/to/reference_24k.wav \ + --wav-out /path/to/output.wav \ + -ngl -1 ``` -## 使用方式 +说明: + +- `--reference-audio` 必须是 24 kHz 单声道 wav。 +- `-ngl -1` 表示尽可能把可 offload 的层全部放到 GPU。 +- 如果你使用的是 `build-cuda/bin/llama-moss-tts` 但想强制走 CPU,可以传 `-ngl 0`。 -### 命令行 +### Hybrid wrapper:backbone 走 GGUF,音频 tokenizer 走 ONNX + +这条路径仍然适合做 parity 检查和中间产物调试。 + +#### 命令行 ```bash # 音色克隆:text + reference audio -> wav python tools/tts/moss-tts-firstclass-e2e.py \ --model-gguf /path/to/moss_delay_firstclass.gguf \ - --moss-tts-dir /path/to/MOSS-TTS \ --tokenizer-dir /path/to/tokenizer_dir \ --onnx-encoder /path/to/encoder.onnx \ --onnx-decoder /path/to/decoder.onnx \ @@ -135,7 +206,6 @@ python tools/tts/moss-tts-firstclass-e2e.py \ # 不带参考音频的直接生成 python tools/tts/moss-tts-firstclass-e2e.py \ --model-gguf /path/to/moss_delay_firstclass.gguf \ - --moss-tts-dir /path/to/MOSS-TTS \ --tokenizer-dir /path/to/tokenizer_dir \ --onnx-encoder /path/to/encoder.onnx \ --onnx-decoder /path/to/decoder.onnx \ @@ -146,7 +216,6 
@@ python tools/tts/moss-tts-firstclass-e2e.py \ python tools/tts/moss-tts-firstclass-e2e.py \ --build \ --model-gguf /path/to/moss_delay_firstclass.gguf \ - --moss-tts-dir /path/to/MOSS-TTS \ --tokenizer-dir /path/to/tokenizer_dir \ --onnx-encoder /path/to/encoder.onnx \ --onnx-decoder /path/to/decoder.onnx \ @@ -160,7 +229,7 @@ python tools/tts/moss-tts-firstclass-e2e.py \ | 参数 | 取值 | 说明 | |------|------|------| | `--model-gguf` | path | first-class MOSS-TTS GGUF 模型 | -| `--moss-tts-dir` | path | 本地 `MOSS-TTS` 仓库根目录 | +| `--moss-tts-dir` | path | 已废弃的兼容参数;不再需要 | | `--tokenizer-dir` | path | 含 `tokenizer.json` 的目录 | | `--onnx-encoder` | path | 音频 tokenizer encoder ONNX | | `--onnx-decoder` | path | 音频 tokenizer decoder ONNX | @@ -175,8 +244,41 @@ python tools/tts/moss-tts-firstclass-e2e.py \ | `--cpu-audio-encode` | flag | 强制 ONNX 参考音频编码走 CPU | | `--build` | flag | 运行前构建 `llama-moss-tts` | +### 原生运行参数 + +| 参数 | 取值 | 说明 | +|------|------|------| +| `-m` | path | `moss-tts-delay` backbone GGUF | +| `--audio-encoder-model` | path | 原生 `moss-tts-audio-encoder` GGUF | +| `--audio-decoder-model` | path | 原生 `moss-tts-audio-decoder` GGUF | +| `--text` / `--text-file` | string / path | 输入文本,二选一 | +| `--reference-audio` | path | 可选 24 kHz reference wav | +| `--language` | `zh` / `en` / tag | 传给 prompt builder 的语言标签 | +| `--max-new-tokens` | int | 最大生成步数 | +| `--gpu-layers` / `-ngl` | `-1` / `0` / `N` | GPU offload 层数 | +| `--wav-out` | path | 输出 wav 路径 | + ## 架构 +### 原生三 GGUF 路径 + +```text +输入文本(+ 可选 reference wav) + | + v +llama-moss-tts + | + |- 文本 prompt 打包 + |- 可选:reference wav -> moss-tts-audio-encoder -> reference audio codes + |- moss-tts-delay backbone,经由 llama_decode() + |- 多头采样 + C++ delay-pattern decode + |- raw audio codes -> moss-tts-audio-decoder -> waveform + v +wav +``` + +### Hybrid wrapper 路径 + ```text 输入文本(+ 可选 reference wav) | @@ -185,7 +287,7 @@ moss-tts-build-generation-ref.py | |- 用 Qwen3 tokenizer 处理文本 |- 可选:用 ONNX 把 reference wav 编成 audio codes - 
|- 调用本地 MOSS-TTS repo 的 prompt builder + |- 用仓库内置的轻量 MOSS-TTS processor 构建 packed prompt v generation.ref.bin | @@ -233,11 +335,14 @@ llama.cpp/ ├── docs/ │ ├── moss-tts-firstclass-e2e.md │ └── moss-tts-firstclass-e2e_zh.md +├── convert_moss_audio_tokenizer_split_to_gguf.py ├── tools/tts/ │ ├── moss-tts-firstclass-e2e.py # 端到端 wrapper │ ├── moss-tts-build-generation-ref.py # prompt / input 构建器 │ ├── moss-tts-audio-decode.py # ONNX 音频解码 helper -│ └── moss-tts.cpp # llama-moss-tts 实现 -└── build/bin/ +│ └── run-moss-tts-delay.cpp # llama-moss-tts 实现 +├── build/bin/ +│ └── llama-moss-tts +└── build-cuda/bin/ └── llama-moss-tts ``` diff --git a/include/llama-moss-audio-tokenizer.h b/include/llama-moss-audio-tokenizer.h new file mode 100644 index 000000000..5e0326787 --- /dev/null +++ b/include/llama-moss-audio-tokenizer.h @@ -0,0 +1,61 @@ +#pragma once + +#ifndef __cplusplus +#error "This header is for C++ only" +#endif + +#include "llama.h" + +#include +#include +#include +#include +#include + +struct moss_audio_tokenizer_options { + int n_threads = -1; +}; + +class LLAMA_API moss_audio_tokenizer { +public: + explicit moss_audio_tokenizer( + const std::string & model_path, + const moss_audio_tokenizer_options & options = {}); + ~moss_audio_tokenizer(); + + moss_audio_tokenizer(const moss_audio_tokenizer &) = delete; + moss_audio_tokenizer & operator=(const moss_audio_tokenizer &) = delete; + + moss_audio_tokenizer(moss_audio_tokenizer &&) noexcept; + moss_audio_tokenizer & operator=(moss_audio_tokenizer &&) noexcept; + + int sample_rate() const; + uint32_t downsample_rate() const; + uint32_t num_quantizers() const; + + std::vector decode( + const std::vector & codes, + size_t n_frames, + uint32_t n_quantizers = 0) const; + + std::vector encode( + const std::vector & audio, + size_t * out_frames = nullptr, + uint32_t n_quantizers = 0) const; + +private: + struct impl; + std::unique_ptr impl_; +}; + +LLAMA_API int moss_audio_model_sample_rate(const struct llama_model * 
model); + +LLAMA_API uint32_t moss_audio_model_downsample_rate(const struct llama_model * model); + +LLAMA_API uint32_t moss_audio_model_num_quantizers(const struct llama_model * model); + +LLAMA_API std::vector moss_audio_model_quantizer_encode( + const struct llama_model * model, + const std::vector & input, + size_t n_frames, + uint32_t n_quantizers = 0); diff --git a/include/llama.h b/include/llama.h index c79adbaf5..3abcd000b 100644 --- a/include/llama.h +++ b/include/llama.h @@ -556,6 +556,7 @@ extern "C" { LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model); LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model); LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model); + LLAMA_API int32_t llama_model_n_out_i32 (const struct llama_model * model); LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model); LLAMA_API int32_t llama_model_n_head (const struct llama_model * model); LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); @@ -1016,6 +1017,18 @@ extern "C" { // otherwise: float[n_embd] (1-dimensional) LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id); + // Get all raw int32 outputs. + // shape: [n_outputs*n_out_i32] + // Returns NULL if the current model/graph does not expose any raw int32 outputs. + LLAMA_API int32_t * llama_get_output_i32(struct llama_context * ctx); + + // Get the raw int32 outputs for the ith token/output row. + // For positive indices, equivalent to: + // llama_get_output_i32(ctx) + ctx->output_ids[i]*n_out_i32 + // Negative indices can be used to access outputs in reverse order, -1 is the last row. + // Returns NULL for invalid ids or when no raw int32 outputs are available. 
+ LLAMA_API int32_t * llama_get_output_i32_ith(struct llama_context * ctx, int32_t i); + // // backend sampling API [EXPERIMENTAL] // note: use only if the llama_context was created with at least one llama_sampler_seq_config diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 06e6e23ed..b93054d70 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -7,6 +7,7 @@ llama_add_compile_flags() # llama add_library(llama + ../include/llama-moss-audio-tokenizer.h ../include/llama.h llama.cpp llama-adapter.cpp @@ -100,6 +101,10 @@ add_library(llama models/minicpm3.cpp models/minimax-m2.cpp models/mistral3.cpp + models/moss-audio-common.cpp + models/moss-audio-decoder.cpp + models/moss-audio-encoder.cpp + models/moss-audio-tokenizer.cpp models/moss-tts-delay.cpp models/modern-bert.cpp models/mpt.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 07800c68a..c27828ee0 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -36,6 +36,8 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_QWEN2VL, "qwen2vl" }, { LLM_ARCH_QWEN3, "qwen3" }, { LLM_ARCH_MOSS_TTS_DELAY, "moss-tts-delay" }, + { LLM_ARCH_MOSS_TTS_AUDIO_ENCODER, "moss-tts-audio-encoder" }, + { LLM_ARCH_MOSS_TTS_AUDIO_DECODER, "moss-tts-audio-decoder" }, { LLM_ARCH_QWEN3MOE, "qwen3moe" }, { LLM_ARCH_QWEN3NEXT, "qwen3next" }, { LLM_ARCH_QWEN3VL, "qwen3vl" }, @@ -548,6 +550,21 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" }, { LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" }, { LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" }, + { LLM_TENSOR_MOSS_AUDIO_BLOCK_INPUT_PROJ, "blk.%d.input_proj" }, + { LLM_TENSOR_MOSS_AUDIO_BLOCK_OUTPUT_PROJ, "blk.%d.output_proj" }, + { LLM_TENSOR_MOSS_AUDIO_ATTN_QKV, "blk.%d.layer.%d.attn_qkv" }, + { LLM_TENSOR_MOSS_AUDIO_ATTN_OUT, "blk.%d.layer.%d.attn_output" }, + { LLM_TENSOR_MOSS_AUDIO_ATTN_NORM, "blk.%d.layer.%d.attn_norm" }, + { LLM_TENSOR_MOSS_AUDIO_FFN_UP, "blk.%d.layer.%d.ffn_up" }, + { 
LLM_TENSOR_MOSS_AUDIO_FFN_DOWN, "blk.%d.layer.%d.ffn_down" }, + { LLM_TENSOR_MOSS_AUDIO_FFN_NORM, "blk.%d.layer.%d.ffn_norm" }, + { LLM_TENSOR_MOSS_AUDIO_ATTN_SCALE, "blk.%d.layer.%d.attn_scale" }, + { LLM_TENSOR_MOSS_AUDIO_FFN_SCALE, "blk.%d.layer.%d.ffn_scale" }, + { LLM_TENSOR_MOSS_AUDIO_QUANT_INPUT_PROJ, "quantizer.input_proj" }, + { LLM_TENSOR_MOSS_AUDIO_QUANT_OUTPUT_PROJ, "quantizer.output_proj" }, + { LLM_TENSOR_MOSS_AUDIO_QUANT_CODEBOOK, "quantizer.quantizers.%d.codebook" }, + { LLM_TENSOR_MOSS_AUDIO_QUANT_IN_PROJ, "quantizer.quantizers.%d.in_proj" }, + { LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ, "quantizer.quantizers.%d.out_proj" }, }; static std::set llm_get_tensor_names(llm_arch arch) { @@ -1002,6 +1019,25 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_UP, }; + case LLM_ARCH_MOSS_TTS_AUDIO_ENCODER: + case LLM_ARCH_MOSS_TTS_AUDIO_DECODER: + return { + LLM_TENSOR_MOSS_AUDIO_BLOCK_INPUT_PROJ, + LLM_TENSOR_MOSS_AUDIO_BLOCK_OUTPUT_PROJ, + LLM_TENSOR_MOSS_AUDIO_ATTN_QKV, + LLM_TENSOR_MOSS_AUDIO_ATTN_OUT, + LLM_TENSOR_MOSS_AUDIO_ATTN_NORM, + LLM_TENSOR_MOSS_AUDIO_FFN_UP, + LLM_TENSOR_MOSS_AUDIO_FFN_DOWN, + LLM_TENSOR_MOSS_AUDIO_FFN_NORM, + LLM_TENSOR_MOSS_AUDIO_ATTN_SCALE, + LLM_TENSOR_MOSS_AUDIO_FFN_SCALE, + LLM_TENSOR_MOSS_AUDIO_QUANT_INPUT_PROJ, + LLM_TENSOR_MOSS_AUDIO_QUANT_OUTPUT_PROJ, + LLM_TENSOR_MOSS_AUDIO_QUANT_CODEBOOK, + LLM_TENSOR_MOSS_AUDIO_QUANT_IN_PROJ, + LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ, + }; case LLM_ARCH_QWEN3MOE: case LLM_ARCH_QWEN3VLMOE: case LLM_ARCH_OLMOE: @@ -2792,6 +2828,21 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, + {LLM_TENSOR_MOSS_AUDIO_BLOCK_INPUT_PROJ,{LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + 
{LLM_TENSOR_MOSS_AUDIO_BLOCK_OUTPUT_PROJ,{LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_MOSS_AUDIO_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_MOSS_AUDIO_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_MOSS_AUDIO_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_MOSS_AUDIO_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_MOSS_AUDIO_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_MOSS_AUDIO_FFN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_MOSS_AUDIO_ATTN_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_MOSS_AUDIO_FFN_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_MOSS_AUDIO_QUANT_INPUT_PROJ,{LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_MOSS_AUDIO_QUANT_OUTPUT_PROJ,{LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_MOSS_AUDIO_QUANT_CODEBOOK, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}}, + {LLM_TENSOR_MOSS_AUDIO_QUANT_IN_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // Nemotron 3 Super {LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, @@ -2827,6 +2878,9 @@ std::string LLM_TN_IMPL::str() const { switch (tensor) { case LLM_TENSOR_TOKEN_EMBD_AUDIO: case LLM_TENSOR_OUTPUT_AUDIO: + case LLM_TENSOR_MOSS_AUDIO_QUANT_CODEBOOK: + case LLM_TENSOR_MOSS_AUDIO_QUANT_IN_PROJ: + case LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ: name = ::format(LLM_TENSOR_NAMES.at(tensor), xid); break; default: diff --git a/src/llama-arch.h b/src/llama-arch.h index 9320b01da..4b042cb66 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -40,6 +40,8 @@ enum llm_arch { LLM_ARCH_QWEN2VL, LLM_ARCH_QWEN3, LLM_ARCH_MOSS_TTS_DELAY, + LLM_ARCH_MOSS_TTS_AUDIO_ENCODER, + LLM_ARCH_MOSS_TTS_AUDIO_DECODER, LLM_ARCH_QWEN3MOE, 
LLM_ARCH_QWEN3NEXT, LLM_ARCH_QWEN3VL, @@ -556,6 +558,21 @@ enum llm_tensor { LLM_TENSOR_NEXTN_HNORM, LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, + LLM_TENSOR_MOSS_AUDIO_BLOCK_INPUT_PROJ, + LLM_TENSOR_MOSS_AUDIO_BLOCK_OUTPUT_PROJ, + LLM_TENSOR_MOSS_AUDIO_ATTN_QKV, + LLM_TENSOR_MOSS_AUDIO_ATTN_OUT, + LLM_TENSOR_MOSS_AUDIO_ATTN_NORM, + LLM_TENSOR_MOSS_AUDIO_FFN_UP, + LLM_TENSOR_MOSS_AUDIO_FFN_DOWN, + LLM_TENSOR_MOSS_AUDIO_FFN_NORM, + LLM_TENSOR_MOSS_AUDIO_ATTN_SCALE, + LLM_TENSOR_MOSS_AUDIO_FFN_SCALE, + LLM_TENSOR_MOSS_AUDIO_QUANT_INPUT_PROJ, + LLM_TENSOR_MOSS_AUDIO_QUANT_OUTPUT_PROJ, + LLM_TENSOR_MOSS_AUDIO_QUANT_CODEBOOK, + LLM_TENSOR_MOSS_AUDIO_QUANT_IN_PROJ, + LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ, }; enum llm_tensor_layer { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 915380b26..9206aa83f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -832,6 +832,12 @@ float * llama_context::get_embeddings() { return embd.data; } +int32_t * llama_context::get_output_i32() { + output_reorder(); + + return out_i32.data; +} + llama_token * llama_context::get_sampled_tokens() const{ return sampling.sampled.data; } @@ -866,6 +872,26 @@ float * llama_context::get_embeddings_seq(llama_seq_id seq_id) { return it->second.data(); } +int32_t * llama_context::get_output_i32_ith(int32_t i) { + output_reorder(); + + try { + if (out_i32.data == nullptr) { + throw std::runtime_error("no raw int32 outputs"); + } + + const int64_t j = output_resolve_row(i); + return out_i32.data + j*out_i32_stride; + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: invalid raw i32 output id %d, reason: %s\n", __func__, i, err.what()); +#ifndef NDEBUG + GGML_ABORT("fatal error"); +#else + return nullptr; +#endif + } +} + llama_token llama_context::get_sampled_token_ith(int32_t idx) { output_reorder(); @@ -1235,6 +1261,43 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll return res; } +static uint32_t 
llama_encode_expected_outputs(const llama_model & model, uint32_t n_tokens) { + switch (model.arch) { + case LLM_ARCH_MOSS_TTS_AUDIO_ENCODER: + { + const auto key = llm_arch_name(model.arch) + std::string(".downsample_rate"); + const auto it = model.gguf_kv.find(key); + const uint32_t downsample_rate = it != model.gguf_kv.end() ? std::max(1, (uint32_t) std::stoul(it->second)) : 1u; + return std::max(1, n_tokens / downsample_rate); + } + case LLM_ARCH_MOSS_TTS_AUDIO_DECODER: + { + const auto key = llm_arch_name(model.arch) + std::string(".downsample_rate"); + const auto it = model.gguf_kv.find(key); + const uint32_t downsample_rate = it != model.gguf_kv.end() ? std::max(1, (uint32_t) std::stoul(it->second)) : 1u; + return n_tokens * downsample_rate; + } + default: + return n_tokens; + } +} + +static uint32_t llama_encode_actual_outputs(const llama_model & model, ggml_tensor * t_embd, ggml_tensor * t_out_i32, uint32_t fallback) { + switch (model.arch) { + case LLM_ARCH_MOSS_TTS_AUDIO_ENCODER: + case LLM_ARCH_MOSS_TTS_AUDIO_DECODER: + if (t_embd != nullptr) { + return (uint32_t) std::max(1, t_embd->ne[1]); + } + if (t_out_i32 != nullptr) { + return (uint32_t) std::max(1, t_out_i32->ne[1]); + } + return fallback; + default: + return fallback; + } +} + int llama_context::encode(const llama_batch & batch_inp) { GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT @@ -1274,17 +1337,19 @@ int llama_context::encode(const llama_batch & batch_inp) { n_queued_tokens += n_tokens; + const uint32_t n_outputs_expected = llama_encode_expected_outputs(model, n_tokens); + // reserve output buffer - if (output_reserve(n_tokens) < n_tokens) { - LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens); + if (output_reserve(n_outputs_expected) < n_outputs_expected) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs_expected); return -2; }; - for (uint32_t 
i = 0; i < n_tokens; ++i) { + for (uint32_t i = 0; i < n_outputs_expected; ++i) { output_ids[i] = i; } - n_outputs = n_tokens; + n_outputs = n_outputs_expected; const auto causal_attn_org = cparams.causal_attn; @@ -1309,6 +1374,13 @@ int llama_context::encode(const llama_batch & batch_inp) { auto * t_logits = res->get_logits(); auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd(); + auto * t_out_i32 = res->get_out_i32(); + + const uint32_t n_outputs_actual = llama_encode_actual_outputs(model, t_embd, t_out_i32, n_tokens); + n_outputs = n_outputs_actual; + for (uint32_t i = 0; i < n_outputs_actual; ++i) { + output_ids[i] = i; + } // extract logits if (logits.data && t_logits) { @@ -1316,7 +1388,7 @@ int llama_context::encode(const llama_batch & batch_inp) { GGML_ASSERT(backend_res != nullptr); GGML_ASSERT(logits.data != nullptr); - ggml_backend_tensor_get_async(backend_res, t_logits, logits.data, 0, n_tokens*n_logits*sizeof(float)); + ggml_backend_tensor_get_async(backend_res, t_logits, logits.data, 0, n_outputs_actual*n_logits*sizeof(float)); } // extract embeddings @@ -1331,8 +1403,8 @@ int llama_context::encode(const llama_batch & batch_inp) { GGML_ASSERT(embd.data != nullptr); const uint32_t n_embd_out = hparams.n_embd_out(); - GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd.size); - ggml_backend_tensor_get_async(backend_embd, t_embd, embd.data, 0, n_tokens*n_embd_out*sizeof(float)); + GGML_ASSERT(n_outputs_actual*n_embd_out <= (int64_t) embd.size); + ggml_backend_tensor_get_async(backend_embd, t_embd, embd.data, 0, n_outputs_actual*n_embd_out*sizeof(float)); } break; case LLAMA_POOLING_TYPE_MEAN: case LLAMA_POOLING_TYPE_CLS: @@ -1371,6 +1443,14 @@ int llama_context::encode(const llama_batch & batch_inp) { } } + if (out_i32.data && t_out_i32) { + ggml_backend_t backend_out_i32 = ggml_backend_sched_get_tensor_backend(sched.get(), t_out_i32); + GGML_ASSERT(backend_out_i32 != nullptr); + GGML_ASSERT(out_i32.data != nullptr); + 
GGML_ASSERT(n_outputs_actual*out_i32_stride <= (int64_t) out_i32.size); + ggml_backend_tensor_get_async(backend_out_i32, t_out_i32, out_i32.data, 0, n_outputs_actual*out_i32_stride*sizeof(int32_t)); + } + // TODO: hacky solution if (model.arch == LLM_ARCH_T5 && t_embd) { //cross.t_embd = t_embd; @@ -1719,8 +1799,9 @@ int llama_context::decode(const llama_batch & batch_inp) { // ggml_graph_dump_dot(gf, NULL, "llama.dot"); //} - auto * t_logits = res->get_logits(); - auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr; + auto * t_logits = res->get_logits(); + auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr; + auto * t_out_i32 = res->get_out_i32(); if (t_embd && res->get_embd_pooled()) { t_embd = res->get_embd_pooled(); @@ -1798,6 +1879,16 @@ int llama_context::decode(const llama_batch & batch_inp) { } } + if (out_i32.data && t_out_i32 && n_outputs > 0) { + ggml_backend_t backend_out_i32 = ggml_backend_sched_get_tensor_backend(sched.get(), t_out_i32); + GGML_ASSERT(backend_out_i32 != nullptr); + + int32_t * out_i32_dst = out_i32.data + n_outputs_prev*out_i32_stride; + GGML_ASSERT(n_outputs_prev + n_outputs <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs)*out_i32_stride <= (int64_t) out_i32.size); + ggml_backend_tensor_get_async(backend_out_i32, t_out_i32, out_i32_dst, 0, n_outputs*out_i32_stride*sizeof(int32_t)); + } + // Copy backend sampling output if this ubatch produced any sampling tensors. 
if (has_samplers && (!res->t_sampled.empty() || !res->t_sampled_probs.empty() || !res->t_sampled_logits.empty())) { const auto seq_to_output_row = build_seq_to_output_row(ubatch, n_outputs_prev); @@ -1880,10 +1971,10 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { const int64_t n_outputs_max = std::max(n_outputs, n_seq_max()); - const auto n_batch = cparams.n_batch; const auto n_vocab = vocab.n_tokens(); const auto n_logits = model.n_logits(); const auto n_embd_out = hparams.n_embd_out(); + const auto n_out_i32 = hparams.n_out_i32(); bool has_logits = true; bool has_embd = cparams.embeddings; @@ -1901,6 +1992,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { logits_stride = has_logits ? n_logits : 0; logits.size = has_logits ? n_logits*n_outputs_max : 0; embd.size = has_embd ? n_embd_out*n_outputs_max : 0; + out_i32_stride = n_out_i32; + out_i32.size = n_out_i32 > 0 ? n_out_i32*n_outputs_max : 0; // Allocate backend sampling output buffers if there are backend samplers configured. const bool has_sampling = !sampling.samplers.empty(); @@ -1909,15 +2002,15 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { backend_token_count = (1 + n_vocab) * n_outputs_max; // sampled + candidates } - if (output_ids.empty()) { - // init, never resized afterwards - output_ids.resize(n_batch); + if (output_ids.size() < (size_t) n_outputs_max) { + output_ids.resize(n_outputs_max); } const size_t prev_size = buf_output ? 
ggml_backend_buffer_get_size(buf_output.get()) : 0; const size_t new_size = (logits.size + embd.size + backend_float_count) * sizeof(float) + - ( backend_token_count) * sizeof(llama_token); + (out_i32.size) * sizeof(int32_t) + + ( backend_token_count) * sizeof(llama_token); // alloc only when more than the current capacity is required // TODO: also consider shrinking the buffer @@ -1933,6 +2026,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { buf_output = nullptr; logits.data = nullptr; embd.data = nullptr; + out_i32.data = nullptr; } auto * buft = ggml_backend_cpu_buffer_type(); @@ -1960,6 +2054,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { embd = has_embd ? buffer_view{(float *) (base + offset), embd.size} : buffer_view{nullptr, 0}; offset += embd.size * sizeof(float); + out_i32 = out_i32.size > 0 ? buffer_view{(int32_t *) (base + offset), out_i32.size} : buffer_view{nullptr, 0}; + offset += out_i32.size * sizeof(int32_t); + if (has_sampling) { sampling.logits = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)}; offset += sampling.logits.size * sizeof(float); @@ -2007,7 +2104,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { void llama_context::output_reorder() { const uint64_t n_logits = logits_stride; const uint64_t n_vocab = model.vocab.n_tokens(); - const uint64_t n_embd = model.hparams.n_embd; + const uint64_t n_embd = model.hparams.n_embd_out(); + const uint64_t n_i32 = model.hparams.n_out_i32(); for (size_t s = 0; s < output_swaps.size(); ++s) { const uint64_t i0 = output_swaps[s].i0; @@ -2025,6 +2123,12 @@ void llama_context::output_reorder() { } } + if (out_i32.size > 0) { + for (uint64_t k = 0; k < n_i32; ++k) { + std::swap(out_i32.data[i0*n_i32 + k], out_i32.data[i1*n_i32 + k]); + } + } + if (!sampling.samplers.empty()) { assert(sampling.logits.size > 0); assert(sampling.probs.size > 0); @@ -3101,6 +3205,18 @@ float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) { return 
ctx->get_embeddings_seq(seq_id); } +int32_t * llama_get_output_i32(llama_context * ctx) { + ctx->synchronize(); + + return ctx->get_output_i32(); +} + +int32_t * llama_get_output_i32_ith(llama_context * ctx, int32_t i) { + ctx->synchronize(); + + return ctx->get_output_i32_ith(i); +} + bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) { return ctx->set_sampler(seq_id, smpl); } diff --git a/src/llama-context.h b/src/llama-context.h index 49c39f023..fe7e0cacd 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -78,6 +78,8 @@ struct llama_context { float * get_embeddings(); float * get_embeddings_ith(int32_t i); float * get_embeddings_seq(llama_seq_id seq_id); + int32_t * get_output_i32(); + int32_t * get_output_i32_ith(int32_t i); llama_token * get_sampled_tokens() const; llama_token get_sampled_token_ith(int32_t idx); @@ -277,6 +279,8 @@ struct llama_context { // embeddings output (2-dimensional array: [n_outputs][n_embd]) // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE buffer_view embd = {nullptr, 0}; + buffer_view out_i32 = {nullptr, 0}; + uint32_t out_i32_stride = 0; struct sampling_info { // !samplers.empty() to check if any samplers are active diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 77735daad..463dd821d 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -737,6 +737,7 @@ void llm_graph_result::reset() { t_logits = nullptr; t_embd = nullptr; t_embd_pooled = nullptr; + t_out_i32 = nullptr; t_sampled.clear(); t_sampled_probs.clear(); t_sampled_logits.clear(); @@ -775,6 +776,9 @@ void llm_graph_result::set_outputs() { if (t_embd_pooled != nullptr) { ggml_set_output(t_embd_pooled); } + if (t_out_i32 != nullptr) { + ggml_set_output(t_out_i32); + } for (auto & [seq_id, t] : t_sampled) { if (t != nullptr) { ggml_set_output(t); diff --git a/src/llama-graph.h b/src/llama-graph.h index a1362cc5a..d1feb8e48 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -638,6 +638,7 @@ 
class llm_graph_result { ggml_tensor * get_logits() const { return t_logits; } ggml_tensor * get_embd() const { return t_embd; } ggml_tensor * get_embd_pooled() const { return t_embd_pooled; } + ggml_tensor * get_out_i32() const { return t_out_i32; } ggml_cgraph * get_gf() const { return gf; } ggml_context * get_ctx() const { return ctx_compute.get(); } @@ -666,6 +667,7 @@ class llm_graph_result { ggml_tensor * t_logits = nullptr; ggml_tensor * t_embd = nullptr; ggml_tensor * t_embd_pooled = nullptr; + ggml_tensor * t_out_i32 = nullptr; std::map t_sampled_logits; std::map t_candidates; diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 002d15d41..edcc8de21 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -84,6 +84,10 @@ uint32_t llama_hparams::n_embd_out() const { return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd; } +uint32_t llama_hparams::n_out_i32() const { + return n_out_i32_impl; +} + uint32_t llama_hparams::n_embd_head_k(uint32_t il) const { if (il < n_layer) { return is_swa(il) ? 
n_embd_head_k_swa : n_embd_head_k_full; diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 0a1c76965..dd4a47986 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -187,6 +187,8 @@ struct llama_hparams { // output embedding dimension (0 = use n_embd) uint32_t n_embd_out_impl = 0; + // raw int32 output width (0 = disabled) + uint32_t n_out_i32_impl = 0; // llama4 smallthinker uint32_t n_moe_layer_step = 0; @@ -274,6 +276,7 @@ struct llama_hparams { // dimension of output embeddings uint32_t n_embd_out() const; + uint32_t n_out_i32() const; // dimension of key/value embeddings for each head (per layer) uint32_t n_embd_head_k(uint32_t il = 0) const; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f7b4bd12f..45bb9fb8d 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -367,14 +367,48 @@ void llama_model::load_hparams(llama_model_loader & ml) { return; } - ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); - ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); - ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false); - ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); - ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); - ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); - ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false); - ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false); + if (arch == LLM_ARCH_MOSS_TTS_AUDIO_ENCODER || arch == LLM_ARCH_MOSS_TTS_AUDIO_DECODER) { + const std::string arch_name = llm_arch_name(arch); + uint32_t downsample_rate = 0; + uint32_t sampling_rate = 0; + uint32_t block_count = 0; + uint32_t num_quantizers = 0; + uint32_t quantizer_input_dim = 0; + float context_duration = 0.0f; + + ml.get_key(arch_name + ".downsample_rate", downsample_rate); + ml.get_key(arch_name + ".sampling_rate", sampling_rate); + ml.get_key(arch_name + ".causal_transformer_context_duration", context_duration); + ml.get_key(arch_name + 
".quantizer.num_quantizers", num_quantizers); + ml.get_key(arch_name + ".quantizer.input_dim", quantizer_input_dim); + + if (arch == LLM_ARCH_MOSS_TTS_AUDIO_ENCODER) { + ml.get_key(arch_name + ".encoder.block_count", block_count); + hparams.n_ctx_train = std::max(1u, (uint32_t) std::lround((double) sampling_rate * context_duration)); + hparams.n_embd = 1; + hparams.n_embd_out_impl = quantizer_input_dim; + hparams.n_out_i32_impl = num_quantizers; + } else { + ml.get_key(arch_name + ".decoder.block_count", block_count); + hparams.n_ctx_train = std::max(1u, (uint32_t) std::lround((double) sampling_rate * context_duration / std::max(downsample_rate, 1u))); + hparams.n_embd = 1; + hparams.n_embd_out_impl = 1; + hparams.n_out_i32_impl = 0; + } + + hparams.n_layer = block_count; + hparams.n_vq = num_quantizers; + hparams.sampling_rate = sampling_rate; + } else { + ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train); + ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd); + ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false); + ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); + ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); + ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); + ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false); + ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false); + } if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd); @@ -1028,6 +1062,14 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_MOSS_TTS_AUDIO_ENCODER: + case LLM_ARCH_MOSS_TTS_AUDIO_DECODER: + { + hparams.pooling_type = LLAMA_POOLING_TYPE_NONE; + hparams.causal_attn = false; + hparams.f_norm_eps = 1e-5f; + type = LLM_TYPE_UNKNOWN; + } break; case LLM_ARCH_MAINCODER: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -3729,6 +3771,88 @@ bool
llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; + case LLM_ARCH_MOSS_TTS_AUDIO_ENCODER: + case LLM_ARCH_MOSS_TTS_AUDIO_DECODER: + { + const std::string arch_name = llm_arch_name(arch); + const char * section_name = arch == LLM_ARCH_MOSS_TTS_AUDIO_ENCODER ? "encoder" : "decoder"; + + auto get_u32 = [&](const std::string & key) -> int64_t { + uint32_t value = 0; + ml.get_key(key, value); + return value; + }; + + auto get_str = [&](const std::string & key) -> std::string { + std::string value; + ml.get_key(key, value); + return value; + }; + + const int64_t quant_input_dim = get_u32(arch_name + ".quantizer.input_dim"); + const int64_t quant_rvq_dim = get_u32(arch_name + ".quantizer.rvq_dim"); + const int64_t quant_output_dim = get_u32(arch_name + ".quantizer.output_dim"); + const int64_t quant_codebook_size = get_u32(arch_name + ".quantizer.codebook_size"); + const int64_t quant_codebook_dim = get_u32(arch_name + ".quantizer.codebook_dim"); + const int64_t num_quantizers = get_u32(arch_name + ".quantizer.num_quantizers"); + const int64_t block_count = get_u32(arch_name + "." + section_name + ".block_count"); + + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_QUANT_INPUT_PROJ, "weight"), {1, quant_input_dim, quant_rvq_dim}, + arch == LLM_ARCH_MOSS_TTS_AUDIO_ENCODER ? 0 : TENSOR_NOT_REQUIRED); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_QUANT_INPUT_PROJ, "bias"), {quant_rvq_dim}, + arch == LLM_ARCH_MOSS_TTS_AUDIO_ENCODER ? 0 : TENSOR_NOT_REQUIRED); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_QUANT_OUTPUT_PROJ, "weight"), {1, quant_rvq_dim, quant_output_dim}, + arch == LLM_ARCH_MOSS_TTS_AUDIO_DECODER ? 0 : TENSOR_NOT_REQUIRED); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_QUANT_OUTPUT_PROJ, "bias"), {quant_output_dim}, + arch == LLM_ARCH_MOSS_TTS_AUDIO_DECODER ? 
0 : TENSOR_NOT_REQUIRED); + + for (int64_t iq = 0; iq < num_quantizers; ++iq) { + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_QUANT_CODEBOOK, "weight", -1, (int) iq), {quant_codebook_dim, quant_codebook_size}, 0); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_QUANT_IN_PROJ, "weight", -1, (int) iq), {1, quant_rvq_dim, quant_codebook_dim}, + arch == LLM_ARCH_MOSS_TTS_AUDIO_ENCODER ? 0 : TENSOR_NOT_REQUIRED); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_QUANT_IN_PROJ, "bias", -1, (int) iq), {quant_codebook_dim}, + arch == LLM_ARCH_MOSS_TTS_AUDIO_ENCODER ? 0 : TENSOR_NOT_REQUIRED); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ, "weight", -1, (int) iq), {1, quant_codebook_dim, quant_rvq_dim}, 0); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ, "bias", -1, (int) iq), {quant_rvq_dim}, 0); + } + + int tensor_block = 0; + for (int64_t ib = 0; ib < block_count; ++ib) { + const std::string block_prefix = arch_name + "." + section_name + "." + std::to_string(ib); + const std::string module_type = get_str(block_prefix + ".module_type"); + + if (module_type == "PatchedPretransform") { + continue; + } + if (module_type != "Transformer") { + throw std::runtime_error("unsupported MOSS audio module type: " + module_type); + } + + const int64_t input_dimension = get_u32(block_prefix + ".input_dimension"); + const int64_t output_dimension = get_u32(block_prefix + ".output_dimension"); + const int64_t d_model = get_u32(block_prefix + ".d_model"); + const int64_t dim_feedforward = get_u32(block_prefix + ".dim_feedforward"); + const int64_t num_layers = get_u32(block_prefix + ".num_layers"); + + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_BLOCK_INPUT_PROJ, "weight", tensor_block), {input_dimension, d_model}, TENSOR_NOT_REQUIRED); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_BLOCK_OUTPUT_PROJ, "weight", tensor_block), {d_model, output_dimension}, TENSOR_NOT_REQUIRED); + + for (int64_t il = 0; il < num_layers; ++il) { + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_ATTN_QKV, "weight", tensor_block, 
(int) il), {d_model, d_model * 3}, 0); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_ATTN_OUT, "weight", tensor_block, (int) il), {d_model, d_model}, 0); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_FFN_UP, "weight", tensor_block, (int) il), {d_model, dim_feedforward}, 0); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_FFN_DOWN, "weight", tensor_block, (int) il), {dim_feedforward, d_model}, 0); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_ATTN_NORM, "weight", tensor_block, (int) il), {d_model}, 0); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_ATTN_NORM, "bias", tensor_block, (int) il), {d_model}, 0); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_FFN_NORM, "weight", tensor_block, (int) il), {d_model}, 0); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_FFN_NORM, "bias", tensor_block, (int) il), {d_model}, 0); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_ATTN_SCALE, "scale", tensor_block, (int) il), {d_model}, TENSOR_NOT_REQUIRED); + create_tensor(tn(LLM_TENSOR_MOSS_AUDIO_FFN_SCALE, "scale", tensor_block, (int) il), {d_model}, TENSOR_NOT_REQUIRED); + } + + tensor_block++; + } + } break; case LLM_ARCH_QWEN3MOE: case LLM_ARCH_QWEN3VLMOE: case LLM_ARCH_RND1: @@ -8110,6 +8234,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, case LLM_ARCH_NEO_BERT: case LLM_ARCH_EUROBERT: case LLM_ARCH_WAVTOKENIZER_DEC: + case LLM_ARCH_MOSS_TTS_AUDIO_ENCODER: + case LLM_ARCH_MOSS_TTS_AUDIO_DECODER: case LLM_ARCH_MODERN_BERT: case LLM_ARCH_GEMMA_EMBEDDING: case LLM_ARCH_DREAM: @@ -8366,6 +8492,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_MOSS_TTS_AUDIO_ENCODER: + { + llm = std::make_unique(*this, params); + } break; + case LLM_ARCH_MOSS_TTS_AUDIO_DECODER: + { + llm = std::make_unique(*this, params); + } break; case LLM_ARCH_QWEN3MOE: { llm = std::make_unique(*this, params); @@ -8822,6 +8956,10 @@ int32_t llama_model_n_embd_out(const llama_model * model) { return 
model->hparams.n_embd_out(); } +int32_t llama_model_n_out_i32(const llama_model * model) { + return model->hparams.n_out_i32(); +} + int32_t llama_model_n_layer(const llama_model * model) { return model->hparams.n_layer; } @@ -8891,6 +9029,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_RWKV7: case LLM_ARCH_ARWKV7: case LLM_ARCH_WAVTOKENIZER_DEC: + case LLM_ARCH_MOSS_TTS_AUDIO_ENCODER: + case LLM_ARCH_MOSS_TTS_AUDIO_DECODER: case LLM_ARCH_NEMOTRON_H: case LLM_ARCH_NEMOTRON_H_MOE: case LLM_ARCH_KIMI_LINEAR: @@ -9112,16 +9252,22 @@ uint64_t llama_model_n_params(const llama_model * model) { bool llama_model_has_encoder(const llama_model * model) { switch (model->arch) { - case LLM_ARCH_T5: return true; - case LLM_ARCH_T5ENCODER: return true; - default: return false; + case LLM_ARCH_T5: + case LLM_ARCH_T5ENCODER: + case LLM_ARCH_MOSS_TTS_AUDIO_ENCODER: + return true; + default: + return false; } } bool llama_model_has_decoder(const llama_model * model) { switch (model->arch) { - case LLM_ARCH_T5ENCODER: return false; - default: return true; + case LLM_ARCH_T5ENCODER: + case LLM_ARCH_MOSS_TTS_AUDIO_ENCODER: + return false; + default: + return true; } } diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 68ba292d4..1ed74e0b3 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1723,7 +1723,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { // determine vocab type { - ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model); + if (!ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model, false)) { + if (kv.arch == LLM_ARCH_MOSS_TTS_AUDIO_ENCODER || kv.arch == LLM_ARCH_MOSS_TTS_AUDIO_DECODER) { + tokenizer_model = "none"; + } else { + ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model); + } + } ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false); ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, n_token_types, false); @@ -1745,6 +1751,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, 
const LLM_KV & kv) { if (ml.get_key(LLM_KV_VOCAB_SIZE, n_tokens, false)) { LLAMA_LOG_WARN("%s: adding %u dummy tokens\n", __func__, n_tokens); id_to_token.resize(n_tokens); + } else if (kv.arch == LLM_ARCH_MOSS_TTS_AUDIO_DECODER) { + LLAMA_LOG_WARN("%s: missing vocab size for %s, adding a single dummy token for auxiliary audio batches\n", + __func__, llm_arch_name(kv.arch)); + id_to_token.resize(1); } return; diff --git a/src/models/models.h b/src/models/models.h index 3f21b0102..baa1a5e70 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -699,6 +699,14 @@ struct llm_build_t5_enc : public llm_graph_context { llm_build_t5_enc(const llama_model & model, const llm_graph_params & params); }; +struct llm_build_moss_tts_audio_encoder : public llm_graph_context { + llm_build_moss_tts_audio_encoder(const llama_model & model, const llm_graph_params & params); +}; + +struct llm_build_moss_tts_audio_decoder : public llm_graph_context { + llm_build_moss_tts_audio_decoder(const llama_model & model, const llm_graph_params & params); +}; + struct llm_build_wavtokenizer_dec : public llm_graph_context { llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params); }; diff --git a/src/models/moss-audio-common.cpp b/src/models/moss-audio-common.cpp new file mode 100644 index 000000000..e61e5e667 --- /dev/null +++ b/src/models/moss-audio-common.cpp @@ -0,0 +1,508 @@ +#include "moss-audio-common.h" + +#include +#include +#include +#include + +namespace moss_audio { + +std::string unquote(std::string value) { + if (value.size() >= 2 && value.front() == '\'' && value.back() == '\'') { + return value.substr(1, value.size() - 2); + } + if (value.size() >= 2 && value.front() == '"' && value.back() == '"') { + return value.substr(1, value.size() - 2); + } + return value; +} + +std::string meta_str(const llama_model & model, const std::string & key) { + const auto it = model.gguf_kv.find(key); + if (it == model.gguf_kv.end()) { + throw 
std::runtime_error("missing GGUF key: " + key); + } + return unquote(it->second); +} + +uint32_t meta_u32(const llama_model & model, const std::string & key) { + return (uint32_t) std::stoul(meta_str(model, key)); +} + +float meta_f32(const llama_model & model, const std::string & key, float def, bool required) { + const auto it = model.gguf_kv.find(key); + if (it == model.gguf_kv.end()) { + if (!required) { + return def; + } + throw std::runtime_error("missing GGUF key: " + key); + } + return std::stof(unquote(it->second)); +} + +ggml_tensor * require_tensor(const llama_model & model, const std::string & name) { + auto * tensor = const_cast(model.get_tensor(name.c_str())); + if (tensor == nullptr) { + throw std::runtime_error("missing tensor: " + name); + } + return tensor; +} + +ggml_tensor * optional_tensor(const llama_model & model, const std::string & name) { + return const_cast(model.get_tensor(name.c_str())); +} + +ggml_tensor * as_matrix(ggml_context * ctx0, ggml_tensor * tensor) { + if (tensor == nullptr) { + return nullptr; + } + + const int n_dims = ggml_n_dims(tensor); + if (n_dims == 2) { + return tensor; + } + if (n_dims == 3 && tensor->ne[0] == 1) { + return ggml_reshape_2d(ctx0, tensor, tensor->ne[1], tensor->ne[2]); + } + if (n_dims == 4 && tensor->ne[0] == 1 && tensor->ne[1] == 1) { + return ggml_reshape_2d(ctx0, tensor, tensor->ne[2], tensor->ne[3]); + } + + throw std::runtime_error("unsupported tensor rank for linear projection: " + std::string(ggml_get_name(tensor))); +} + +ggml_tensor * as_f32_matrix(ggml_context * ctx0, ggml_tensor * tensor) { + return tensor != nullptr ? ggml_cast(ctx0, as_matrix(ctx0, tensor), GGML_TYPE_F32) : nullptr; +} + +ggml_tensor * as_f32_vector(ggml_context * ctx0, ggml_tensor * tensor) { + return tensor != nullptr ? 
ggml_cast(ctx0, tensor, GGML_TYPE_F32) : nullptr; +} + +ggml_tensor * linear_f32( + ggml_context * ctx0, + ggml_tensor * input, + ggml_tensor * weight, + ggml_tensor * bias) { + ggml_tensor * cur = ggml_cast(ctx0, input, GGML_TYPE_F32); + + if (weight != nullptr) { + cur = ggml_mul_mat(ctx0, as_f32_matrix(ctx0, weight), cur); + } + if (bias != nullptr) { + cur = ggml_add(ctx0, cur, as_f32_vector(ctx0, bias)); + } + + return cur; +} + +graph_input_embd::graph_input_embd(int64_t n_embd) : n_embd(n_embd) {} + +void graph_input_embd::set_input(const llama_ubatch * ubatch) { + GGML_ASSERT(ubatch->embd != nullptr); + GGML_ASSERT(embd != nullptr); + ggml_backend_tensor_set(embd, ubatch->embd, 0, (size_t) ubatch->n_tokens * (size_t) n_embd * sizeof(float)); +} + +bool graph_input_embd::can_reuse(const llm_graph_params & params) { + return params.ubatch.embd != nullptr && embd != nullptr && embd->ne[0] == n_embd && embd->ne[1] == params.ubatch.n_tokens; +} + +graph_input_channel::graph_input_channel(uint32_t channel, uint32_t n_channels) : channel(channel), n_channels(n_channels) {} + +void graph_input_channel::set_input(const llama_ubatch * ubatch) { + GGML_ASSERT(tokens != nullptr); + data.resize(ubatch->n_tokens, 0); + + if (ubatch->token_audio != nullptr) { + GGML_ASSERT(ubatch->n_token_audio == n_channels); + for (uint32_t i = 0; i < ubatch->n_tokens; ++i) { + data[i] = ubatch->token_audio[(size_t) i * n_channels + channel]; + } + } + + ggml_backend_tensor_set(tokens, data.data(), 0, data.size() * sizeof(int32_t)); +} + +bool graph_input_channel::can_reuse(const llm_graph_params & params) { + return tokens != nullptr && + tokens->ne[0] == params.ubatch.n_tokens && + ((params.ubatch.token_audio == nullptr && params.ubatch.n_token_audio == 0) || + (params.ubatch.token_audio != nullptr && params.ubatch.n_token_audio == n_channels)); +} + +graph_input_i32::graph_input_i32(std::vector data) : data(std::move(data)) {} + +void graph_input_i32::set_input(const llama_ubatch *) 
{ + GGML_ASSERT(tensor != nullptr); + ggml_backend_tensor_set(tensor, data.data(), 0, data.size() * sizeof(int32_t)); +} + +bool graph_input_i32::can_reuse(const llm_graph_params &) { + return tensor != nullptr && tensor->ne[0] == (int64_t) data.size(); +} + +graph_input_f32::graph_input_f32(std::vector data) : data(std::move(data)) {} + +void graph_input_f32::set_input(const llama_ubatch *) { + GGML_ASSERT(tensor != nullptr); + ggml_backend_tensor_set(tensor, data.data(), 0, data.size() * sizeof(float)); +} + +bool graph_input_f32::can_reuse(const llm_graph_params &) { + return tensor != nullptr && ggml_nelements(tensor) == (int64_t) data.size(); +} + +std::vector make_positions(size_t n_tokens) { + std::vector positions(n_tokens); + for (size_t i = 0; i < n_tokens; ++i) { + positions[i] = (int32_t) i; + } + return positions; +} + +int64_t align_up(int64_t value, int64_t multiple) { + if (multiple <= 1) { + return value; + } + return ((value + multiple - 1) / multiple) * multiple; +} + +std::vector make_causal_mask(size_t n_tokens, int context) { + std::vector mask(n_tokens * n_tokens, -std::numeric_limits::infinity()); + + for (size_t iq = 0; iq < n_tokens; ++iq) { + for (size_t ik = 0; ik < n_tokens; ++ik) { + if (ik > iq) { + continue; + } + if (context > 0 && (int) (iq - ik) >= context) { + continue; + } + mask[iq * n_tokens + ik] = 0.0f; + } + } + + return mask; +} + +ggml_tensor * build_layer_norm( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * weight, + ggml_tensor * bias) { + cur = ggml_norm(ctx0, cur, LAYER_NORM_EPS); + cur = ggml_mul(ctx0, cur, weight); + cur = ggml_add(ctx0, cur, bias); + return cur; +} + +ggml_tensor * build_attention( + ggml_context * ctx0, + ggml_tensor * wo, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale) { + ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); + ggml_tensor * v = 
ggml_permute(ctx0, v_cur, 1, 2, 0, 3); + v = ggml_cont(ctx0, v); + + ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f); + + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + ggml_tensor * cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cur = ggml_cont_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2] * cur->ne[3]); + + if (wo != nullptr) { + cur = ggml_mul_mat(ctx0, as_matrix(ctx0, wo), cur); + } + + return cur; +} + +ggml_tensor * patch_encode( + ggml_context * ctx0, + ggml_tensor * cur, + int channels, + int64_t n_frames, + int patch_size) { + GGML_ASSERT(patch_size > 0); + GGML_ASSERT(n_frames % patch_size == 0); + GGML_ASSERT(cur->ne[0] == channels); + GGML_ASSERT(cur->ne[1] == n_frames); + + cur = ggml_reshape_3d(ctx0, cur, channels, patch_size, n_frames / patch_size); + cur = ggml_permute(ctx0, cur, 1, 0, 2, 3); + cur = ggml_cont(ctx0, cur); + cur = ggml_reshape_2d(ctx0, cur, channels * patch_size, n_frames / patch_size); + return cur; +} + +ggml_tensor * patch_decode( + ggml_context * ctx0, + ggml_tensor * cur, + int channels, + int64_t n_frames, + int patch_size) { + GGML_ASSERT(patch_size > 0); + GGML_ASSERT(channels % patch_size == 0); + GGML_ASSERT(cur->ne[0] == channels); + GGML_ASSERT(cur->ne[1] == n_frames); + + const int out_channels = channels / patch_size; + cur = ggml_reshape_3d(ctx0, cur, patch_size, out_channels, n_frames); + cur = ggml_permute(ctx0, cur, 1, 0, 2, 3); + cur = ggml_cont(ctx0, cur); + cur = ggml_reshape_2d(ctx0, cur, out_channels, n_frames * patch_size); + return cur; +} + +quantizer_meta load_quantizer_meta(const llama_model & model, const std::string & arch_name) { + quantizer_meta meta; + meta.input_dim = (int) meta_u32(model, arch_name + ".quantizer.input_dim"); + meta.rvq_dim = (int) meta_u32(model, arch_name + ".quantizer.rvq_dim"); + meta.output_dim = (int) meta_u32(model, arch_name + ".quantizer.output_dim"); + meta.num_quantizers = (int) meta_u32(model, arch_name + 
".quantizer.num_quantizers"); + meta.codebook_size = (int) meta_u32(model, arch_name + ".quantizer.codebook_size"); + meta.codebook_dim = (int) meta_u32(model, arch_name + ".quantizer.codebook_dim"); + return meta; +} + +std::vector load_modules( + const llama_model & model, + ggml_context * ctx0, + const std::string & arch_name, + const std::string & section_name) { + const auto tn = LLM_TN(model.arch); + const uint32_t block_count = meta_u32(model, arch_name + "." + section_name + ".block_count"); + std::vector modules(block_count); + int tensor_block = 0; + + for (uint32_t ib = 0; ib < block_count; ++ib) { + const std::string block_prefix = arch_name + "." + section_name + "." + std::to_string(ib); + auto & block = modules[ib]; + const std::string current_type = meta_str(model, block_prefix + ".module_type"); + + if (current_type == "PatchedPretransform") { + block.type = module_type::PATCHED_PRETRANSFORM; + block.patch_size = (int) meta_u32(model, block_prefix + ".patch_size"); + continue; + } + + if (current_type != "Transformer") { + throw std::runtime_error("unsupported MOSS audio module type: " + current_type); + } + + block.type = module_type::TRANSFORMER; + + auto & tr = block.transformer; + tr.input_dimension = (int) meta_u32(model, block_prefix + ".input_dimension"); + tr.output_dimension = (int) meta_u32(model, block_prefix + ".output_dimension"); + tr.d_model = (int) meta_u32(model, block_prefix + ".d_model"); + tr.num_heads = (int) meta_u32(model, block_prefix + ".num_heads"); + tr.num_layers = (int) meta_u32(model, block_prefix + ".num_layers"); + tr.context = (int) meta_u32(model, block_prefix + ".context"); + tr.max_period = meta_f32(model, block_prefix + ".max_period", 10000.0f, false); + + tr.input_proj = as_matrix(ctx0, optional_tensor(model, + tn(LLM_TENSOR_MOSS_AUDIO_BLOCK_INPUT_PROJ, "weight", tensor_block).str())); + tr.output_proj = as_matrix(ctx0, optional_tensor(model, + tn(LLM_TENSOR_MOSS_AUDIO_BLOCK_OUTPUT_PROJ, "weight", 
tensor_block).str())); + + tr.layers.resize(tr.num_layers); + for (int il = 0; il < tr.num_layers; ++il) { + auto & layer = tr.layers[il]; + layer.attn_in = as_matrix(ctx0, require_tensor(model, + tn(LLM_TENSOR_MOSS_AUDIO_ATTN_QKV, "weight", tensor_block, il).str())); + layer.attn_out = as_matrix(ctx0, require_tensor(model, + tn(LLM_TENSOR_MOSS_AUDIO_ATTN_OUT, "weight", tensor_block, il).str())); + layer.linear1 = as_matrix(ctx0, require_tensor(model, + tn(LLM_TENSOR_MOSS_AUDIO_FFN_UP, "weight", tensor_block, il).str())); + layer.linear2 = as_matrix(ctx0, require_tensor(model, + tn(LLM_TENSOR_MOSS_AUDIO_FFN_DOWN, "weight", tensor_block, il).str())); + layer.norm1_w = require_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_ATTN_NORM, "weight", tensor_block, il).str()); + layer.norm1_b = require_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_ATTN_NORM, "bias", tensor_block, il).str()); + layer.norm2_w = require_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_FFN_NORM, "weight", tensor_block, il).str()); + layer.norm2_b = require_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_FFN_NORM, "bias", tensor_block, il).str()); + layer.scale1 = optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_ATTN_SCALE, "scale", tensor_block, il).str()); + layer.scale2 = optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_FFN_SCALE, "scale", tensor_block, il).str()); + } + + tensor_block++; + } + + return modules; +} + +ggml_tensor * build_transformer_block( + llm_graph_context & llm, + ggml_tensor * cur, + const transformer_block & block, + int64_t n_frames, + int module_index) { + GGML_ASSERT(cur->ne[1] == n_frames); + + auto inp_pos = std::make_unique(make_positions((size_t) n_frames)); + inp_pos->tensor = ggml_new_tensor_1d(llm.ctx0, GGML_TYPE_I32, n_frames); + llm.cb(inp_pos->tensor, "moss_audio_pos", module_index); + ggml_set_input(inp_pos->tensor); + ggml_tensor * positions = inp_pos->tensor; + llm.res->add_input(std::move(inp_pos)); + + auto inp_mask = std::make_unique(make_causal_mask((size_t) n_frames, block.context)); + 
inp_mask->tensor = ggml_new_tensor_4d(llm.ctx0, GGML_TYPE_F32, n_frames, n_frames, 1, 1); + llm.cb(inp_mask->tensor, "moss_audio_mask", module_index); + ggml_set_input(inp_mask->tensor); + ggml_tensor * mask = inp_mask->tensor; + llm.res->add_input(std::move(inp_mask)); + + if (block.input_proj != nullptr) { + cur = ggml_mul_mat(llm.ctx0, block.input_proj, cur); + } + + const int d_head = block.d_model / block.num_heads; + const float attn_scale = 1.0f / std::sqrt((float) d_head); + + for (int il = 0; il < block.num_layers; ++il) { + const auto & layer = block.layers[il]; + + ggml_tensor * inp_sa = cur; + ggml_tensor * x = build_layer_norm(llm.ctx0, cur, layer.norm1_w, layer.norm1_b); + ggml_tensor * qkv = ggml_mul_mat(llm.ctx0, layer.attn_in, x); + + ggml_tensor * q = ggml_view_3d(llm.ctx0, qkv, d_head, block.num_heads, n_frames, + ggml_row_size(qkv->type, d_head), qkv->nb[1], 0); + ggml_tensor * k = ggml_view_3d(llm.ctx0, qkv, d_head, block.num_heads, n_frames, + ggml_row_size(qkv->type, d_head), qkv->nb[1], ggml_row_size(qkv->type, block.d_model)); + ggml_tensor * v = ggml_view_3d(llm.ctx0, qkv, d_head, block.num_heads, n_frames, + ggml_row_size(qkv->type, d_head), qkv->nb[1], ggml_row_size(qkv->type, 2 * block.d_model)); + + q = ggml_rope_ext(llm.ctx0, q, positions, nullptr, d_head, 0, 0, + block.max_period, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); + k = ggml_rope_ext(llm.ctx0, k, positions, nullptr, d_head, 0, 0, + block.max_period, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); + + ggml_tensor * attn = build_attention(llm.ctx0, layer.attn_out, q, k, v, mask, attn_scale); + if (layer.scale1 != nullptr) { + attn = ggml_mul(llm.ctx0, attn, layer.scale1); + } + cur = ggml_add(llm.ctx0, inp_sa, attn); + + ggml_tensor * inp_ff = cur; + x = build_layer_norm(llm.ctx0, cur, layer.norm2_w, layer.norm2_b); + x = ggml_mul_mat(llm.ctx0, layer.linear1, x); + x = ggml_gelu(llm.ctx0, x); + x = ggml_mul_mat(llm.ctx0, layer.linear2, x); + if (layer.scale2 != nullptr) { + x = ggml_mul(llm.ctx0, x, 
layer.scale2); + } + cur = ggml_add(llm.ctx0, inp_ff, x); + } + + if (block.output_proj != nullptr) { + cur = ggml_mul_mat(llm.ctx0, block.output_proj, cur); + } + + return cur; +} + +ggml_tensor * build_decoder_quantizer( + llm_graph_context & llm, + const llama_model & model, + ggml_context * ctx0, + const quantizer_meta & quantizer, + int64_t n_frames) { + const auto tn = LLM_TN(model.arch); + + ggml_tensor * cur = nullptr; + for (int iq = 0; iq < quantizer.num_quantizers; ++iq) { + auto inp_code = std::make_unique((uint32_t) iq, (uint32_t) quantizer.num_quantizers); + inp_code->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_frames); + llm.cb(inp_code->tokens, "moss_audio_code", iq); + ggml_set_input(inp_code->tokens); + + ggml_tensor * codebook = ggml_cast(ctx0, + require_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_CODEBOOK, "weight", -1, iq).str()), + GGML_TYPE_F32); + ggml_tensor * emb = ggml_get_rows(ctx0, codebook, inp_code->tokens); + emb = linear_f32( + ctx0, + emb, + optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ, "weight", -1, iq).str()), + optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ, "bias", -1, iq).str())); + + cur = cur != nullptr ? 
ggml_add(ctx0, cur, emb) : emb; + llm.res->add_input(std::move(inp_code)); + } + + cur = linear_f32( + ctx0, + cur, + optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_OUTPUT_PROJ, "weight").str()), + optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_OUTPUT_PROJ, "bias").str())); + + GGML_ASSERT(cur != nullptr); + return cur; +} + +ggml_tensor * build_l2_normalize(ggml_context * ctx0, ggml_tensor * cur) { + constexpr float L2_NORM_EPS = 3.4526698e-4f; + + ggml_tensor * norm = ggml_sum_rows(ctx0, ggml_sqr(ctx0, cur)); + norm = ggml_sqrt(ctx0, norm); + norm = ggml_clamp(ctx0, norm, L2_NORM_EPS, INFINITY); + return ggml_div(ctx0, cur, ggml_repeat(ctx0, norm, cur)); +} + +ggml_tensor * build_encoder_quantizer_codes( + const llama_model & model, + ggml_context * ctx0, + const quantizer_meta & quantizer, + ggml_tensor * cur) { + const auto tn = LLM_TN(model.arch); + + cur = linear_f32( + ctx0, + cur, + optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_INPUT_PROJ, "weight").str()), + optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_INPUT_PROJ, "bias").str())); + + ggml_tensor * codes = nullptr; + ggml_tensor * residual = cur; + + for (int iq = 0; iq < quantizer.num_quantizers; ++iq) { + ggml_tensor * latent = linear_f32( + ctx0, + residual, + optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_IN_PROJ, "weight", -1, iq).str()), + optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_IN_PROJ, "bias", -1, iq).str())); + + ggml_tensor * latent_unit = build_l2_normalize(ctx0, latent); + ggml_tensor * codebook = ggml_cast(ctx0, + require_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_CODEBOOK, "weight", -1, iq).str()), + GGML_TYPE_F32); + ggml_tensor * codebook_unit = build_l2_normalize(ctx0, codebook); + ggml_tensor * scores = ggml_mul_mat(ctx0, codebook_unit, latent_unit); + ggml_tensor * code_i = ggml_argmax(ctx0, scores); + + ggml_tensor * code_i_row = ggml_reshape_2d(ctx0, code_i, 1, code_i->ne[0]); + codes = codes != nullptr ? 
ggml_concat(ctx0, codes, code_i_row, 0) : code_i_row; + + ggml_tensor * decoded = ggml_get_rows(ctx0, codebook, code_i); + decoded = linear_f32( + ctx0, + decoded, + optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ, "weight", -1, iq).str()), + optional_tensor(model, tn(LLM_TENSOR_MOSS_AUDIO_QUANT_OUT_PROJ, "bias", -1, iq).str())); + residual = ggml_sub(ctx0, residual, decoded); + } + + GGML_ASSERT(codes != nullptr); + return codes; +} + +} // namespace moss_audio diff --git a/src/models/moss-audio-common.h b/src/models/moss-audio-common.h new file mode 100644 index 000000000..05acc6093 --- /dev/null +++ b/src/models/moss-audio-common.h @@ -0,0 +1,171 @@ +#pragma once + +#include "models.h" + +#include +#include +#include + +namespace moss_audio { + +constexpr float LAYER_NORM_EPS = 1e-5f; + +enum class module_type { + PATCHED_PRETRANSFORM, + TRANSFORMER, +}; + +struct transformer_layer { + ggml_tensor * attn_in = nullptr; + ggml_tensor * attn_out = nullptr; + ggml_tensor * linear1 = nullptr; + ggml_tensor * linear2 = nullptr; + ggml_tensor * norm1_w = nullptr; + ggml_tensor * norm1_b = nullptr; + ggml_tensor * norm2_w = nullptr; + ggml_tensor * norm2_b = nullptr; + ggml_tensor * scale1 = nullptr; + ggml_tensor * scale2 = nullptr; +}; + +struct transformer_block { + int input_dimension = 0; + int output_dimension = 0; + int d_model = 0; + int num_heads = 0; + int num_layers = 0; + int context = 0; + float max_period = 10000.0f; + + ggml_tensor * input_proj = nullptr; + ggml_tensor * output_proj = nullptr; + + std::vector layers; +}; + +struct module { + module_type type = module_type::PATCHED_PRETRANSFORM; + int patch_size = 1; + transformer_block transformer; +}; + +struct quantizer_meta { + int input_dim = 0; + int rvq_dim = 0; + int output_dim = 0; + int num_quantizers = 0; + int codebook_size = 0; + int codebook_dim = 0; +}; + +std::string unquote(std::string value); +std::string meta_str(const llama_model & model, const std::string & key); 
+uint32_t meta_u32(const llama_model & model, const std::string & key); +float meta_f32(const llama_model & model, const std::string & key, float def = 0.0f, bool required = true); + +ggml_tensor * require_tensor(const llama_model & model, const std::string & name); +ggml_tensor * optional_tensor(const llama_model & model, const std::string & name); +ggml_tensor * as_matrix(ggml_context * ctx0, ggml_tensor * tensor); +ggml_tensor * as_f32_matrix(ggml_context * ctx0, ggml_tensor * tensor); +ggml_tensor * as_f32_vector(ggml_context * ctx0, ggml_tensor * tensor); +ggml_tensor * linear_f32(ggml_context * ctx0, ggml_tensor * input, ggml_tensor * weight, ggml_tensor * bias); + +class graph_input_embd : public llm_graph_input_i { +public: + explicit graph_input_embd(int64_t n_embd); + + void set_input(const llama_ubatch * ubatch) override; + bool can_reuse(const llm_graph_params & params) override; + + ggml_tensor * embd = nullptr; + +private: + int64_t n_embd; +}; + +class graph_input_channel : public llm_graph_input_i { +public: + graph_input_channel(uint32_t channel, uint32_t n_channels); + + void set_input(const llama_ubatch * ubatch) override; + bool can_reuse(const llm_graph_params & params) override; + + ggml_tensor * tokens = nullptr; + +private: + uint32_t channel; + uint32_t n_channels; + std::vector data; +}; + +class graph_input_i32 : public llm_graph_input_i { +public: + explicit graph_input_i32(std::vector data); + + void set_input(const llama_ubatch * ubatch) override; + bool can_reuse(const llm_graph_params & params) override; + + ggml_tensor * tensor = nullptr; + +private: + std::vector data; +}; + +class graph_input_f32 : public llm_graph_input_i { +public: + explicit graph_input_f32(std::vector data); + + void set_input(const llama_ubatch * ubatch) override; + bool can_reuse(const llm_graph_params & params) override; + + ggml_tensor * tensor = nullptr; + +private: + std::vector data; +}; + +std::vector make_positions(size_t n_tokens); +int64_t 
align_up(int64_t value, int64_t multiple); +std::vector make_causal_mask(size_t n_tokens, int context); + +ggml_tensor * build_layer_norm(ggml_context * ctx0, ggml_tensor * cur, ggml_tensor * weight, ggml_tensor * bias); +ggml_tensor * build_attention( + ggml_context * ctx0, + ggml_tensor * wo, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale); +ggml_tensor * patch_encode(ggml_context * ctx0, ggml_tensor * cur, int channels, int64_t n_frames, int patch_size); +ggml_tensor * patch_decode(ggml_context * ctx0, ggml_tensor * cur, int channels, int64_t n_frames, int patch_size); + +quantizer_meta load_quantizer_meta(const llama_model & model, const std::string & arch_name); +std::vector load_modules( + const llama_model & model, + ggml_context * ctx0, + const std::string & arch_name, + const std::string & section_name); + +ggml_tensor * build_transformer_block( + llm_graph_context & llm, + ggml_tensor * cur, + const transformer_block & block, + int64_t n_frames, + int module_index); + +ggml_tensor * build_decoder_quantizer( + llm_graph_context & llm, + const llama_model & model, + ggml_context * ctx0, + const quantizer_meta & quantizer, + int64_t n_frames); + +ggml_tensor * build_l2_normalize(ggml_context * ctx0, ggml_tensor * cur); +ggml_tensor * build_encoder_quantizer_codes( + const llama_model & model, + ggml_context * ctx0, + const quantizer_meta & quantizer, + ggml_tensor * cur); + +} // namespace moss_audio diff --git a/src/models/moss-audio-decoder.cpp b/src/models/moss-audio-decoder.cpp new file mode 100644 index 000000000..0b1854ecb --- /dev/null +++ b/src/models/moss-audio-decoder.cpp @@ -0,0 +1,37 @@ +#include "moss-audio-common.h" + +using namespace moss_audio; + +llm_build_moss_tts_audio_decoder::llm_build_moss_tts_audio_decoder(const llama_model & model, const llm_graph_params & params) + : llm_graph_context(params) { + const std::string arch_name = llm_arch_name(model.arch); + const auto 
quantizer = load_quantizer_meta(model, arch_name); + const std::vector modules = load_modules(model, ctx0, arch_name, "decoder"); + + int64_t frames = ubatch.n_tokens; + ggml_tensor * cur = build_decoder_quantizer(*this, model, ctx0, quantizer, frames); + int channels = quantizer.output_dim; + int tensor_block = 0; + + for (size_t i = 0; i < modules.size(); ++i) { + const auto & current = modules[i]; + switch (current.type) { + case module_type::TRANSFORMER: + cur = build_transformer_block(*this, cur, current.transformer, frames, tensor_block); + channels = current.transformer.output_dimension; + tensor_block++; + break; + case module_type::PATCHED_PRETRANSFORM: + cur = patch_decode(ctx0, cur, channels, frames, current.patch_size); + channels /= current.patch_size; + frames *= current.patch_size; + break; + } + } + + GGML_ASSERT(channels == 1); + + cb(cur, "result_embd", -1); + res->t_embd = cur; + ggml_build_forward_expand(gf, cur); +} diff --git a/src/models/moss-audio-encoder.cpp b/src/models/moss-audio-encoder.cpp new file mode 100644 index 000000000..fc73b74f8 --- /dev/null +++ b/src/models/moss-audio-encoder.cpp @@ -0,0 +1,49 @@ +#include "moss-audio-common.h" + +using namespace moss_audio; + +llm_build_moss_tts_audio_encoder::llm_build_moss_tts_audio_encoder(const llama_model & model, const llm_graph_params & params) + : llm_graph_context(params) { + const std::string arch_name = llm_arch_name(model.arch); + const auto quantizer = load_quantizer_meta(model, arch_name); + const std::vector modules = load_modules(model, ctx0, arch_name, "encoder"); + const int64_t downsample_rate = (int64_t) meta_u32(model, arch_name + ".downsample_rate"); + const int64_t reserve_frames = ubatch.embd != nullptr ? 
ubatch.n_tokens : align_up(ubatch.n_tokens, downsample_rate); + + auto inp = std::make_unique(1); + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, reserve_frames); + cb(inp->embd, "moss_audio_waveform", -1); + ggml_set_input(inp->embd); + ggml_tensor * cur = inp->embd; + res->add_input(std::move(inp)); + + int channels = 1; + int64_t frames = reserve_frames; + int tensor_block = 0; + + for (size_t i = 0; i < modules.size(); ++i) { + const auto & current = modules[i]; + switch (current.type) { + case module_type::PATCHED_PRETRANSFORM: + cur = patch_encode(ctx0, cur, channels, frames, current.patch_size); + channels *= current.patch_size; + frames /= current.patch_size; + break; + case module_type::TRANSFORMER: + cur = build_transformer_block(*this, cur, current.transformer, frames, tensor_block); + channels = current.transformer.output_dimension; + tensor_block++; + break; + } + } + + GGML_ASSERT(channels == quantizer.input_dim); + + cb(cur, "result_embd", -1); + res->t_embd = cur; + + ggml_tensor * codes = build_encoder_quantizer_codes(model, ctx0, quantizer, cur); + cb(codes, "result_out_i32", -1); + res->t_out_i32 = codes; + ggml_build_forward_expand(gf, codes); +} diff --git a/src/models/moss-audio-tokenizer.cpp b/src/models/moss-audio-tokenizer.cpp new file mode 100644 index 000000000..a0f19415b --- /dev/null +++ b/src/models/moss-audio-tokenizer.cpp @@ -0,0 +1,1205 @@ +#include "llama-moss-audio-tokenizer.h" + +#include "ggml.h" +#include "ggml-cpp.h" +#include "ggml-backend.h" +#include "ggml-cpu.h" +#include "gguf.h" +#include "llama-impl.h" +#include "llama-model.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +constexpr char MOSS_CODEC_ARCH[] = "moss-audio-tokenizer"; +constexpr float MOSS_LAYER_NORM_EPS = 1e-5f; +constexpr size_t MOSS_CODEC_MAX_NODES_BASE = 256; +constexpr size_t MOSS_CODEC_MAX_NODES_PER_LAYER = 32; + +enum class moss_codec_module_type { + 
PATCHED_PRETRANSFORM, + TRANSFORMER, +}; + +struct moss_codec_transformer_layer { + ggml_tensor * attn_in = nullptr; + ggml_tensor * attn_out = nullptr; + ggml_tensor * linear1 = nullptr; + ggml_tensor * linear2 = nullptr; + ggml_tensor * norm1_w = nullptr; + ggml_tensor * norm1_b = nullptr; + ggml_tensor * norm2_w = nullptr; + ggml_tensor * norm2_b = nullptr; + ggml_tensor * scale1 = nullptr; + ggml_tensor * scale2 = nullptr; +}; + +struct moss_codec_transformer_block { + int input_dimension = 0; + int output_dimension = 0; + int d_model = 0; + int num_heads = 0; + int num_layers = 0; + int dim_feedforward = 0; + int context = 0; + float max_period = 10000.0f; + + ggml_tensor * input_proj = nullptr; + ggml_tensor * output_proj = nullptr; + + std::vector layers; +}; + +struct moss_codec_module { + moss_codec_module_type type = moss_codec_module_type::PATCHED_PRETRANSFORM; + int patch_size = 1; + moss_codec_transformer_block transformer; +}; + +struct moss_codec_quantizer_entry { + ggml_tensor * in_proj_w = nullptr; + ggml_tensor * in_proj_b = nullptr; + ggml_tensor * codebook = nullptr; + ggml_tensor * out_proj_w = nullptr; + ggml_tensor * out_proj_b = nullptr; +}; + +struct moss_codec_quantizer { + int input_dim = 0; + int rvq_dim = 0; + int output_dim = 0; + int num_quantizers = 0; + int codebook_size = 0; + int codebook_dim = 0; + + ggml_tensor * input_proj_w = nullptr; + ggml_tensor * input_proj_b = nullptr; + ggml_tensor * output_proj_w = nullptr; + ggml_tensor * output_proj_b = nullptr; + + std::vector quantizers; +}; + +static std::string moss_codec_module_type_to_string(const moss_codec_module_type type) { + switch (type) { + case moss_codec_module_type::PATCHED_PRETRANSFORM: + return "PatchedPretransform"; + case moss_codec_module_type::TRANSFORMER: + return "Transformer"; + } + return "Unknown"; +} + +static moss_codec_module_type moss_codec_module_type_from_string(const std::string & value) { + if (value == "PatchedPretransform") { + return 
moss_codec_module_type::PATCHED_PRETRANSFORM; + } + if (value == "Transformer") { + return moss_codec_module_type::TRANSFORMER; + } + throw std::runtime_error("unsupported codec module type: " + value); +} + +static void moss_codec_set_n_threads(ggml_backend_t backend, int n_threads) { + if (backend == nullptr || n_threads <= 0) { + return; + } + + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; + if (!reg) { + return; + } + + auto fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (fn != nullptr) { + fn(backend, n_threads); + } +} + +static std::vector moss_codec_make_positions(const size_t n_tokens) { + std::vector positions(n_tokens); + for (size_t i = 0; i < n_tokens; ++i) { + positions[i] = (int32_t) i; + } + return positions; +} + +static std::vector moss_codec_make_causal_mask(const size_t n_tokens, const int context) { + std::vector mask(n_tokens * n_tokens, -std::numeric_limits::infinity()); + + for (size_t iq = 0; iq < n_tokens; ++iq) { + for (size_t ik = 0; ik < n_tokens; ++ik) { + if (ik > iq) { + continue; + } + if (context > 0 && (int) (iq - ik) >= context) { + continue; + } + mask[iq * n_tokens + ik] = 0.0f; + } + } + + return mask; +} + +static ggml_tensor * moss_codec_build_layer_norm( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * weight, + ggml_tensor * bias) { + cur = ggml_norm(ctx0, cur, MOSS_LAYER_NORM_EPS); + cur = ggml_mul(ctx0, cur, weight); + cur = ggml_add(ctx0, cur, bias); + return cur; +} + +static ggml_tensor * moss_codec_build_attention( + ggml_context * ctx0, + ggml_tensor * wo, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale) { + ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); + ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3); + v = 
ggml_cont(ctx0, v); + + ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f); + + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + ggml_tensor * cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cur = ggml_cont_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2] * cur->ne[3]); + + if (wo != nullptr) { + cur = ggml_mul_mat(ctx0, wo, cur); + } + + return cur; +} + +static std::vector moss_codec_patch_decode( + const std::vector & input, + const int channels, + const size_t n_frames, + const int patch_size) { + if (patch_size <= 0) { + throw std::runtime_error("invalid patch size"); + } + if (channels % patch_size != 0) { + throw std::runtime_error("patch decode channels not divisible by patch size"); + } + if (input.size() != (size_t) channels * n_frames) { + throw std::runtime_error("patch decode input size mismatch"); + } + + const int out_channels = channels / patch_size; + const size_t out_frames = n_frames * (size_t) patch_size; + std::vector output((size_t) out_channels * out_frames); + + for (size_t t = 0; t < n_frames; ++t) { + for (int d = 0; d < out_channels; ++d) { + for (int i = 0; i < patch_size; ++i) { + const float value = input[(size_t) (d * patch_size + i) + t * (size_t) channels]; + output[(size_t) d + (t * (size_t) patch_size + (size_t) i) * (size_t) out_channels] = value; + } + } + } + + return output; +} + +static std::vector moss_codec_patch_encode( + const std::vector & input, + const int channels, + const size_t n_frames, + const int patch_size) { + if (patch_size <= 0) { + throw std::runtime_error("invalid patch size"); + } + if (n_frames % (size_t) patch_size != 0) { + throw std::runtime_error("patch encode frame count not divisible by patch size"); + } + if (input.size() != (size_t) channels * n_frames) { + throw std::runtime_error("patch encode input size mismatch"); + } + + const int out_channels = channels * patch_size; + const size_t out_frames = n_frames / (size_t) patch_size; + std::vector 
output((size_t) out_channels * out_frames); + + for (size_t t = 0; t < out_frames; ++t) { + for (int d = 0; d < channels; ++d) { + for (int i = 0; i < patch_size; ++i) { + const float value = input[(size_t) d + (t * (size_t) patch_size + (size_t) i) * (size_t) channels]; + output[(size_t) (d * patch_size + i) + t * (size_t) out_channels] = value; + } + } + } + + return output; +} + +static std::vector moss_codec_copy_f32_output(ggml_tensor * tensor) { + std::vector output((size_t) ggml_nelements(tensor)); + ggml_backend_tensor_get(tensor, output.data(), 0, ggml_nbytes(tensor)); + return output; +} + +struct moss_codec_linear_f32 { + int in_features = 0; + int out_features = 0; + std::vector weight; + std::vector bias; + + bool empty() const { + return weight.empty(); + } +}; + +struct moss_codec_quantizer_entry_f32 { + moss_codec_linear_f32 in_proj; + moss_codec_linear_f32 out_proj; + int codebook_size = 0; + int codebook_dim = 0; + std::vector codebook; + std::vector codebook_unit; +}; + +static std::vector moss_codec_tensor_to_f32(const ggml_tensor * tensor) { + if (tensor == nullptr) { + return {}; + } + + const size_t n_elements = (size_t) ggml_nelements(tensor); + + switch (tensor->type) { + case GGML_TYPE_F32: { + std::vector values(n_elements); + ggml_backend_tensor_get(const_cast(tensor), values.data(), 0, ggml_nbytes(tensor)); + return values; + } + case GGML_TYPE_F16: { + std::vector values_f16(n_elements); + std::vector values(n_elements); + ggml_backend_tensor_get(const_cast(tensor), values_f16.data(), 0, ggml_nbytes(tensor)); + for (size_t i = 0; i < n_elements; ++i) { + values[i] = ggml_fp16_to_fp32(values_f16[i]); + } + return values; + } + default: + throw std::runtime_error("unsupported tensor dtype for float conversion: " + std::string(ggml_type_name(tensor->type))); + } +} + +static moss_codec_linear_f32 moss_codec_linear_from_tensors(ggml_tensor * weight, ggml_tensor * bias) { + moss_codec_linear_f32 result; + if (weight == nullptr) { + return 
        result;
    }

    // GGUF may store conv-style weights with singleton leading dims; accept
    // 2D/3D/4D layouts and record the flattened [out, in] matrix shape.
    switch (ggml_n_dims(weight)) {
        case 2:
            result.in_features = (int) weight->ne[0];
            result.out_features = (int) weight->ne[1];
            break;
        case 3:
            if (weight->ne[0] != 1) {
                throw std::runtime_error("expected singleton leading dim for 3D linear weight tensor");
            }
            result.in_features = (int) weight->ne[1];
            result.out_features = (int) weight->ne[2];
            break;
        case 4:
            if (weight->ne[0] != 1 || weight->ne[1] != 1) {
                throw std::runtime_error("expected singleton leading dims for 4D linear weight tensor");
            }
            result.in_features = (int) weight->ne[2];
            result.out_features = (int) weight->ne[3];
            break;
        default:
            throw std::runtime_error("expected 2D/3D/4D linear weight tensor");
    }
    result.weight = moss_codec_tensor_to_f32(weight);
    result.bias = moss_codec_tensor_to_f32(bias);
    return result;
}

// Apply y = W x + b frame-by-frame on the CPU. An empty `linear` (no
// weight tensor) acts as the identity, mirroring optional projections
// that may be absent from the checkpoint.
static std::vector<float> moss_codec_linear_apply(
    const moss_codec_linear_f32 & linear,
    const std::vector<float> & input,
    const size_t n_frames) {
    if (linear.empty()) {
        return input;
    }
    if (input.size() != (size_t) linear.in_features * n_frames) {
        throw std::runtime_error("linear input size mismatch");
    }

    std::vector<float> output((size_t) linear.out_features * n_frames, 0.0f);
    for (size_t t = 0; t < n_frames; ++t) {
        const float * x = input.data() + t * (size_t) linear.in_features;
        float * y = output.data() + t * (size_t) linear.out_features;

        // Seed each output row with the bias (when present), then
        // accumulate the dot products on top of it.
        if (!linear.bias.empty()) {
            std::copy(linear.bias.begin(), linear.bias.end(), y);
        }

        for (int o = 0; o < linear.out_features; ++o) {
            const float * w = linear.weight.data() + (size_t) o * (size_t) linear.in_features;
            float acc = y[o];
            for (int i = 0; i < linear.in_features; ++i) {
                acc += w[i] * x[i];
            }
            y[o] = acc;
        }
    }

    return output;
}

// L2-normalize each row of a row-major matrix of `row_width`-wide rows.
static std::vector<float> moss_codec_normalize_rows(
    const std::vector<float> & input,
    const int row_width) {
    if (row_width <= 0 || input.size() % (size_t) row_width != 0) {
        throw std::runtime_error("invalid row width for 
normalization"); + } + + std::vector output = input; + const size_t n_rows = input.size() / (size_t) row_width; + for (size_t r = 0; r < n_rows; ++r) { + float norm2 = 0.0f; + for (int c = 0; c < row_width; ++c) { + const float v = output[r * (size_t) row_width + (size_t) c]; + norm2 += v * v; + } + const float inv = 1.0f / std::sqrt(std::max(norm2, std::numeric_limits::epsilon())); + for (int c = 0; c < row_width; ++c) { + output[r * (size_t) row_width + (size_t) c] *= inv; + } + } + return output; +} + +struct moss_codec_gguf_loader { + ggml_context_ptr ctx_meta; + gguf_context_ptr ctx_gguf; + ggml_context_ptr ctx_data; + ggml_backend_ptr backend; + ggml_backend_buffer_ptr buffer; + + std::string fname; + std::map tensor_offset; + std::map loaded_tensors; + std::vector tensors_to_load; + + explicit moss_codec_gguf_loader(const std::string & model_path) + : fname(model_path), + backend(ggml_backend_cpu_init()) { + if (!backend) { + throw std::runtime_error("failed to initialize CPU backend for codec"); + } + + ggml_context * meta = nullptr; + gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &meta, + }; + + ctx_gguf.reset(gguf_init_from_file(fname.c_str(), params)); + if (!ctx_gguf) { + throw std::runtime_error("failed to load codec GGUF metadata from: " + fname); + } + + ctx_meta.reset(meta); + + for (int64_t i = 0; i < gguf_get_n_tensors(ctx_gguf.get()); ++i) { + const char * name = gguf_get_tensor_name(ctx_gguf.get(), i); + tensor_offset[name] = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), i); + } + + ggml_init_params data_params = { + /*.mem_size =*/ static_cast(gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(), + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + ctx_data.reset(ggml_init(data_params)); + if (!ctx_data) { + throw std::runtime_error("failed to initialize codec tensor context"); + } + } + + int find_key(const std::string & key, const bool required = true) const { + const int 
        idx = gguf_find_key(ctx_gguf.get(), key.c_str());
        if (idx < 0 && required) {
            throw std::runtime_error("GGUF key not found: " + key);
        }
        return idx;
    }

    // True when the GGUF metadata contains `key`.
    bool has_key(const std::string & key) const {
        return gguf_find_key(ctx_gguf.get(), key.c_str()) >= 0;
    }

    // Read a u32 metadata value; throws when required and missing,
    // otherwise falls back to `fallback`.
    uint32_t get_u32(const std::string & key, const bool required = true, const uint32_t fallback = 0) const {
        const int idx = find_key(key, required);
        if (idx < 0) {
            return fallback;
        }
        return gguf_get_val_u32(ctx_gguf.get(), idx);
    }

    // Read an f32 metadata value; same required/fallback contract as get_u32.
    float get_f32(const std::string & key, const bool required = true, const float fallback = 0.0f) const {
        const int idx = find_key(key, required);
        if (idx < 0) {
            return fallback;
        }
        return gguf_get_val_f32(ctx_gguf.get(), idx);
    }

    // Read a string metadata value; same required/fallback contract as get_u32.
    std::string get_string(const std::string & key, const bool required = true, const std::string & fallback = {}) const {
        const int idx = find_key(key, required);
        if (idx < 0) {
            return fallback;
        }
        return std::string(gguf_get_val_str(ctx_gguf.get(), idx));
    }

    // Look up a tensor by name, duplicating its metadata into ctx_data and
    // queueing it for the deferred byte load in load_tensor_bytes().
    // Returns nullptr when the tensor is optional and absent.
    ggml_tensor * get_tensor(const std::string & name, const bool required = true) {
        const auto it = loaded_tensors.find(name);
        if (it != loaded_tensors.end()) {
            return it->second;  // already registered — reuse the same tensor
        }

        ggml_tensor * meta_tensor = ggml_get_tensor(ctx_meta.get(), name.c_str());
        if (!meta_tensor) {
            if (required) {
                throw std::runtime_error("codec tensor not found: " + name);
            }
            return nullptr;
        }

        ggml_tensor * data_tensor = ggml_dup_tensor(ctx_data.get(), meta_tensor);
        ggml_set_name(data_tensor, meta_tensor->name);
        loaded_tensors.emplace(name, data_tensor);
        tensors_to_load.push_back(data_tensor);
        return data_tensor;
    }

    // Allocate the weight buffer (first call only) and stream the bytes of
    // every queued tensor from the GGUF file into it.
    void load_tensor_bytes() {
        if (!buffer) {
            buffer.reset(ggml_backend_alloc_ctx_tensors(ctx_data.get(), backend.get()));
            if (!buffer) {
                throw std::runtime_error("failed to allocate codec weight buffer");
            }
            ggml_backend_buffer_set_usage(buffer.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
        }

        std::ifstream 
fin(fname, std::ios::binary); + if (!fin) { + throw std::runtime_error("failed to open codec GGUF for tensor loading: " + fname); + } + + std::vector read_buf; + for (ggml_tensor * tensor : tensors_to_load) { + const auto it = tensor_offset.find(tensor->name); + if (it == tensor_offset.end()) { + throw std::runtime_error("missing GGUF tensor offset for: " + std::string(tensor->name)); + } + + const size_t offset = it->second; + const size_t num_bytes = ggml_nbytes(tensor); + + fin.seekg(offset, std::ios::beg); + if (!fin) { + throw std::runtime_error("failed to seek codec tensor: " + std::string(tensor->name)); + } + + if (ggml_backend_buffer_is_host(buffer.get())) { + fin.read(reinterpret_cast(tensor->data), (std::streamsize) num_bytes); + } else { + read_buf.resize(num_bytes); + fin.read(reinterpret_cast(read_buf.data()), (std::streamsize) num_bytes); + ggml_backend_tensor_set(tensor, read_buf.data(), 0, num_bytes); + } + + if (!fin) { + throw std::runtime_error("failed to read codec tensor: " + std::string(tensor->name)); + } + } + } +}; + +} // namespace + +struct moss_audio_tokenizer::impl { + int sample_rate = 0; + uint32_t downsample_rate = 0; + uint32_t num_quantizers = 0; + int n_threads = -1; + + ggml_backend_ptr backend; + ggml_context_ptr ctx_meta; + gguf_context_ptr ctx_gguf; + ggml_context_ptr ctx_data; + ggml_backend_buffer_ptr weights_buffer; + + moss_codec_quantizer quantizer; + moss_codec_linear_f32 quantizer_input_proj_f32; + std::vector quantizer_entries_f32; + std::vector encoder; + std::vector decoder; + + explicit impl(const std::string & model_path, const moss_audio_tokenizer_options & options) { + moss_codec_gguf_loader loader(model_path); + + if (!loader.has_key(std::string(MOSS_CODEC_ARCH) + ".quantizer_type")) { + throw std::runtime_error("model does not contain bundled MOSS audio tokenizer metadata"); + } + + sample_rate = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".sampling_rate"); + downsample_rate = 
loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".downsample_rate"); + num_quantizers = loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.num_quantizers"); + n_threads = options.n_threads; + + quantizer.input_dim = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.input_dim"); + quantizer.rvq_dim = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.rvq_dim"); + quantizer.output_dim = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.output_dim"); + quantizer.num_quantizers = (int) num_quantizers; + quantizer.codebook_size = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.codebook_size"); + quantizer.codebook_dim = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.codebook_dim"); + quantizer.input_proj_w = loader.get_tensor("audio_tokenizer.quantizer.input_proj.weight", false); + quantizer.input_proj_b = loader.get_tensor("audio_tokenizer.quantizer.input_proj.bias", false); + quantizer.output_proj_w = loader.get_tensor("audio_tokenizer.quantizer.output_proj.weight", false); + quantizer.output_proj_b = loader.get_tensor("audio_tokenizer.quantizer.output_proj.bias", false); + quantizer.quantizers.resize(num_quantizers); + for (uint32_t iq = 0; iq < num_quantizers; ++iq) { + auto & entry = quantizer.quantizers[iq]; + const std::string prefix = "audio_tokenizer.quantizer.quantizers." + std::to_string(iq); + entry.in_proj_w = loader.get_tensor(prefix + ".in_proj.weight", false); + entry.in_proj_b = loader.get_tensor(prefix + ".in_proj.bias", false); + entry.codebook = loader.get_tensor(prefix + ".codebook.weight"); + entry.out_proj_w = loader.get_tensor(prefix + ".out_proj.weight", false); + entry.out_proj_b = loader.get_tensor(prefix + ".out_proj.bias", false); + } + + const auto load_modules = [&](const std::string & section_name, std::vector & modules) { + const uint32_t block_count = loader.get_u32(std::string(MOSS_CODEC_ARCH) + "." 
+ section_name + ".block_count"); + modules.resize(block_count); + for (uint32_t ib = 0; ib < block_count; ++ib) { + const std::string block_prefix = std::string(MOSS_CODEC_ARCH) + "." + section_name + "." + std::to_string(ib); + moss_codec_module & block = modules[ib]; + block.type = moss_codec_module_type_from_string(loader.get_string(block_prefix + ".module_type")); + + if (block.type == moss_codec_module_type::PATCHED_PRETRANSFORM) { + block.patch_size = (int) loader.get_u32(block_prefix + ".patch_size"); + continue; + } + + auto & tr = block.transformer; + tr.input_dimension = (int) loader.get_u32(block_prefix + ".input_dimension"); + tr.output_dimension = (int) loader.get_u32(block_prefix + ".output_dimension"); + tr.d_model = (int) loader.get_u32(block_prefix + ".d_model"); + tr.num_heads = (int) loader.get_u32(block_prefix + ".num_heads"); + tr.num_layers = (int) loader.get_u32(block_prefix + ".num_layers"); + tr.dim_feedforward = (int) loader.get_u32(block_prefix + ".dim_feedforward"); + tr.context = (int) loader.get_u32(block_prefix + ".context"); + tr.max_period = loader.get_f32(block_prefix + ".max_period", false, 10000.0f); + tr.input_proj = loader.get_tensor("audio_tokenizer." + section_name + "." + std::to_string(ib) + ".input_proj.weight", false); + tr.output_proj = loader.get_tensor("audio_tokenizer." + section_name + "." + std::to_string(ib) + ".output_proj.weight", false); + + tr.layers.resize(tr.num_layers); + for (int il = 0; il < tr.num_layers; ++il) { + auto & layer = tr.layers[il]; + const std::string layer_prefix = + "audio_tokenizer." + section_name + "." + std::to_string(ib) + ".transformer.layers." 
+ std::to_string(il); + layer.attn_in = loader.get_tensor(layer_prefix + ".self_attn.in_projs.0.weight"); + layer.attn_out = loader.get_tensor(layer_prefix + ".self_attn.out_projs.0.weight"); + layer.linear1 = loader.get_tensor(layer_prefix + ".linear1.weight"); + layer.linear2 = loader.get_tensor(layer_prefix + ".linear2.weight"); + layer.norm1_w = loader.get_tensor(layer_prefix + ".norm1.weight"); + layer.norm1_b = loader.get_tensor(layer_prefix + ".norm1.bias"); + layer.norm2_w = loader.get_tensor(layer_prefix + ".norm2.weight"); + layer.norm2_b = loader.get_tensor(layer_prefix + ".norm2.bias"); + layer.scale1 = loader.get_tensor(layer_prefix + ".layer_scale_1.scale", false); + layer.scale2 = loader.get_tensor(layer_prefix + ".layer_scale_2.scale", false); + } + } + }; + + load_modules("encoder", encoder); + load_modules("decoder", decoder); + + loader.load_tensor_bytes(); + + backend = std::move(loader.backend); + ctx_meta = std::move(loader.ctx_meta); + ctx_gguf = std::move(loader.ctx_gguf); + ctx_data = std::move(loader.ctx_data); + weights_buffer = std::move(loader.buffer); + + quantizer_input_proj_f32 = moss_codec_linear_from_tensors(quantizer.input_proj_w, quantizer.input_proj_b); + quantizer_entries_f32.resize(num_quantizers); + for (uint32_t iq = 0; iq < num_quantizers; ++iq) { + auto & dst = quantizer_entries_f32[iq]; + const auto & src = quantizer.quantizers[iq]; + dst.in_proj = moss_codec_linear_from_tensors(src.in_proj_w, src.in_proj_b); + dst.out_proj = moss_codec_linear_from_tensors(src.out_proj_w, src.out_proj_b); + dst.codebook_dim = (int) src.codebook->ne[0]; + dst.codebook_size = (int) src.codebook->ne[1]; + dst.codebook = moss_codec_tensor_to_f32(src.codebook); + dst.codebook_unit = moss_codec_normalize_rows(dst.codebook, dst.codebook_dim); + } + + LLAMA_LOG_INFO("%s: sample_rate=%d downsample_rate=%u num_quantizers=%u encoder_blocks=%zu decoder_blocks=%zu\n", + __func__, sample_rate, downsample_rate, num_quantizers, encoder.size(), 
decoder.size()); + } + + std::vector run_quantizer_decode( + const std::vector & codes, + const size_t n_frames, + uint32_t n_quantizers_req) const { + const uint32_t nq = n_quantizers_req == 0 ? num_quantizers : n_quantizers_req; + if (nq == 0 || nq > num_quantizers) { + throw std::runtime_error("invalid quantizer count for decode"); + } + if (codes.size() != n_frames * (size_t) nq) { + throw std::runtime_error("raw code size does not match frame count"); + } + + const size_t max_nodes = MOSS_CODEC_MAX_NODES_BASE + (size_t) nq * 8; + const size_t meta_size = max_nodes * ggml_tensor_overhead() + ggml_graph_overhead_custom(max_nodes, false); + std::vector meta_buf(meta_size); + + ggml_init_params params = { + /*.mem_size =*/ meta_size, + /*.mem_buffer =*/ meta_buf.data(), + /*.no_alloc =*/ true, + }; + ggml_context * ctx0 = ggml_init(params); + if (!ctx0) { + throw std::runtime_error("failed to init quantizer decode ggml context"); + } + + ggml_cgraph * gf = ggml_new_graph_custom(ctx0, (int) max_nodes, false); + std::vector code_inputs(nq); + + ggml_tensor * cur = nullptr; + for (uint32_t iq = 0; iq < nq; ++iq) { + const auto & entry = quantizer.quantizers[iq]; + ggml_tensor * inp = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t) n_frames); + ggml_set_input(inp); + code_inputs[iq] = inp; + + ggml_tensor * emb = ggml_get_rows(ctx0, entry.codebook, inp); + if (entry.out_proj_w) { + emb = ggml_mul_mat(ctx0, entry.out_proj_w, emb); + } + if (entry.out_proj_b) { + emb = ggml_add(ctx0, emb, entry.out_proj_b); + } + cur = cur ? 
ggml_add(ctx0, cur, emb) : emb; + } + + if (quantizer.output_proj_w) { + cur = ggml_mul_mat(ctx0, quantizer.output_proj_w, cur); + } + if (quantizer.output_proj_b) { + cur = ggml_add(ctx0, cur, quantizer.output_proj_b); + } + + ggml_build_forward_expand(gf, cur); + + ggml_gallocr_ptr allocr { ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend.get())) }; + ggml_gallocr_alloc_graph(allocr.get(), gf); + + for (uint32_t iq = 0; iq < nq; ++iq) { + std::vector gathered(n_frames); + for (size_t t = 0; t < n_frames; ++t) { + const llama_token code = codes[t * (size_t) nq + iq]; + if (code < 0 || code >= quantizer.codebook_size) { + ggml_free(ctx0); + throw std::runtime_error("audio code out of codec range during decode"); + } + gathered[t] = (int32_t) code; + } + ggml_backend_tensor_set(code_inputs[iq], gathered.data(), 0, gathered.size() * sizeof(int32_t)); + } + + moss_codec_set_n_threads(backend.get(), n_threads); + const ggml_status status = ggml_backend_graph_compute(backend.get(), gf); + if (status != GGML_STATUS_SUCCESS) { + ggml_free(ctx0); + throw std::runtime_error("quantizer decode graph compute failed"); + } + + std::vector output = moss_codec_copy_f32_output(cur); + ggml_free(ctx0); + return output; + } + + std::vector run_transformer_block( + const moss_codec_transformer_block & block, + const std::vector & input, + const size_t n_frames) const { + if (input.size() != (size_t) block.input_dimension * n_frames) { + throw std::runtime_error("transformer block input size mismatch"); + } + + const size_t max_nodes = MOSS_CODEC_MAX_NODES_BASE + (size_t) block.num_layers * MOSS_CODEC_MAX_NODES_PER_LAYER; + const size_t meta_size = max_nodes * ggml_tensor_overhead() + ggml_graph_overhead_custom(max_nodes, false); + std::vector meta_buf(meta_size); + + ggml_init_params params = { + /*.mem_size =*/ meta_size, + /*.mem_buffer =*/ meta_buf.data(), + /*.no_alloc =*/ true, + }; + ggml_context * ctx0 = ggml_init(params); + if (!ctx0) { + throw 
std::runtime_error("failed to init transformer ggml context"); + } + + ggml_cgraph * gf = ggml_new_graph_custom(ctx0, (int) max_nodes, false); + + ggml_tensor * inp = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, block.input_dimension, (int64_t) n_frames); + ggml_set_input(inp); + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t) n_frames); + ggml_set_input(positions); + ggml_tensor * mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, (int64_t) n_frames, (int64_t) n_frames, 1, 1); + ggml_set_input(mask); + + ggml_tensor * cur = inp; + if (block.input_proj) { + cur = ggml_mul_mat(ctx0, block.input_proj, cur); + } + + const int d_head = block.d_model / block.num_heads; + const float attn_scale = 1.0f / std::sqrt((float) d_head); + + for (int il = 0; il < block.num_layers; ++il) { + const auto & layer = block.layers[il]; + + ggml_tensor * inp_sa = cur; + ggml_tensor * x = moss_codec_build_layer_norm(ctx0, cur, layer.norm1_w, layer.norm1_b); + ggml_tensor * qkv = ggml_mul_mat(ctx0, layer.attn_in, x); + + ggml_tensor * q = ggml_view_3d(ctx0, qkv, d_head, block.num_heads, (int64_t) n_frames, + ggml_row_size(qkv->type, d_head), qkv->nb[1], 0); + ggml_tensor * k = ggml_view_3d(ctx0, qkv, d_head, block.num_heads, (int64_t) n_frames, + ggml_row_size(qkv->type, d_head), qkv->nb[1], ggml_row_size(qkv->type, block.d_model)); + ggml_tensor * v = ggml_view_3d(ctx0, qkv, d_head, block.num_heads, (int64_t) n_frames, + ggml_row_size(qkv->type, d_head), qkv->nb[1], ggml_row_size(qkv->type, 2 * block.d_model)); + + q = ggml_rope_ext(ctx0, q, positions, nullptr, d_head, 0, 0, + block.max_period, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); + k = ggml_rope_ext(ctx0, k, positions, nullptr, d_head, 0, 0, + block.max_period, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); + + ggml_tensor * attn = moss_codec_build_attention(ctx0, layer.attn_out, q, k, v, mask, attn_scale); + if (layer.scale1) { + attn = ggml_mul(ctx0, attn, layer.scale1); + } + cur = ggml_add(ctx0, inp_sa, attn); + + ggml_tensor * inp_ff 
= cur; + x = moss_codec_build_layer_norm(ctx0, cur, layer.norm2_w, layer.norm2_b); + x = ggml_mul_mat(ctx0, layer.linear1, x); + x = ggml_gelu(ctx0, x); + x = ggml_mul_mat(ctx0, layer.linear2, x); + if (layer.scale2) { + x = ggml_mul(ctx0, x, layer.scale2); + } + cur = ggml_add(ctx0, inp_ff, x); + } + + if (block.output_proj) { + cur = ggml_mul_mat(ctx0, block.output_proj, cur); + } + + ggml_build_forward_expand(gf, cur); + + ggml_gallocr_ptr allocr { ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend.get())) }; + ggml_gallocr_alloc_graph(allocr.get(), gf); + + const std::vector positions_data = moss_codec_make_positions(n_frames); + const std::vector mask_data = moss_codec_make_causal_mask(n_frames, block.context); + + ggml_backend_tensor_set(inp, input.data(), 0, input.size() * sizeof(float)); + ggml_backend_tensor_set(positions, positions_data.data(), 0, positions_data.size() * sizeof(int32_t)); + ggml_backend_tensor_set(mask, mask_data.data(), 0, mask_data.size() * sizeof(float)); + + moss_codec_set_n_threads(backend.get(), n_threads); + const ggml_status status = ggml_backend_graph_compute(backend.get(), gf); + if (status != GGML_STATUS_SUCCESS) { + ggml_free(ctx0); + throw std::runtime_error("transformer graph compute failed"); + } + + std::vector output = moss_codec_copy_f32_output(cur); + ggml_free(ctx0); + return output; + } + + std::vector decode( + const std::vector & codes, + const size_t n_frames, + const uint32_t n_quantizers_req) const { + uint32_t nq = n_quantizers_req == 0 ? 
num_quantizers : n_quantizers_req; + if (nq == 0 || nq > num_quantizers) { + throw std::runtime_error("invalid quantizer count"); + } + + std::vector cur = run_quantizer_decode(codes, n_frames, nq); + int channels = quantizer.output_dim; + size_t frames = n_frames; + + for (const auto & module : decoder) { + switch (module.type) { + case moss_codec_module_type::TRANSFORMER: + cur = run_transformer_block(module.transformer, cur, frames); + channels = module.transformer.output_dimension; + break; + case moss_codec_module_type::PATCHED_PRETRANSFORM: + cur = moss_codec_patch_decode(cur, channels, frames, module.patch_size); + channels /= module.patch_size; + frames *= (size_t) module.patch_size; + break; + } + } + + if (channels != 1) { + throw std::runtime_error("codec decoder did not end with a mono waveform channel"); + } + + return cur; + } + + std::vector run_quantizer_encode( + const std::vector & input, + const size_t n_frames, + const uint32_t n_quantizers_req) const { + const uint32_t nq = n_quantizers_req == 0 ? 
num_quantizers : n_quantizers_req; + if (nq == 0 || nq > num_quantizers) { + throw std::runtime_error("invalid quantizer count for encode"); + } + + std::vector residual = moss_codec_linear_apply(quantizer_input_proj_f32, input, n_frames); + if (residual.size() != (size_t) quantizer.rvq_dim * n_frames) { + throw std::runtime_error("quantizer input projection size mismatch"); + } + + std::vector codes(n_frames * (size_t) nq, 0); + std::vector latents; + std::vector latents_unit; + std::vector decoded; + + for (uint32_t iq = 0; iq < nq; ++iq) { + const auto & entry = quantizer_entries_f32[iq]; + latents = moss_codec_linear_apply(entry.in_proj, residual, n_frames); + if (latents.size() != (size_t) entry.codebook_dim * n_frames) { + throw std::runtime_error("quantizer latent projection size mismatch"); + } + + latents_unit.resize(latents.size()); + for (size_t t = 0; t < n_frames; ++t) { + const float * in_ptr = latents.data() + t * (size_t) entry.codebook_dim; + float * out_ptr = latents_unit.data() + t * (size_t) entry.codebook_dim; + + float norm2 = 0.0f; + for (int d = 0; d < entry.codebook_dim; ++d) { + norm2 += in_ptr[d] * in_ptr[d]; + } + const float inv = 1.0f / std::sqrt(std::max(norm2, std::numeric_limits::epsilon())); + for (int d = 0; d < entry.codebook_dim; ++d) { + out_ptr[d] = in_ptr[d] * inv; + } + } + + std::vector codebook_emb((size_t) entry.codebook_dim * n_frames, 0.0f); + for (size_t t = 0; t < n_frames; ++t) { + const float * latent = latents_unit.data() + t * (size_t) entry.codebook_dim; + + float best_score = -std::numeric_limits::infinity(); + int best_index = 0; + for (int code = 0; code < entry.codebook_size; ++code) { + const float * row = entry.codebook_unit.data() + (size_t) code * (size_t) entry.codebook_dim; + float score = 0.0f; + for (int d = 0; d < entry.codebook_dim; ++d) { + score += row[d] * latent[d]; + } + if (score > best_score) { + best_score = score; + best_index = code; + } + } + + codes[t * (size_t) nq + iq] = best_index; + 
const float * row = entry.codebook.data() + (size_t) best_index * (size_t) entry.codebook_dim; + std::copy(row, row + entry.codebook_dim, codebook_emb.begin() + (ptrdiff_t) (t * (size_t) entry.codebook_dim)); + } + + decoded = moss_codec_linear_apply(entry.out_proj, codebook_emb, n_frames); + if (decoded.size() != residual.size()) { + throw std::runtime_error("quantizer decoded embedding size mismatch"); + } + + for (size_t i = 0; i < residual.size(); ++i) { + residual[i] -= decoded[i]; + } + } + + return codes; + } + + std::vector encode( + const std::vector & audio, + size_t * out_frames, + const uint32_t n_quantizers_req) const { + const uint32_t nq = n_quantizers_req == 0 ? num_quantizers : n_quantizers_req; + if (nq == 0 || nq > num_quantizers) { + throw std::runtime_error("invalid quantizer count"); + } + + const size_t padded_samples = + ((audio.size() + (size_t) downsample_rate - 1) / (size_t) downsample_rate) * (size_t) downsample_rate; + const size_t valid_frames = audio.size() / (size_t) downsample_rate; + + std::vector cur(padded_samples, 0.0f); + std::copy(audio.begin(), audio.end(), cur.begin()); + + int channels = 1; + size_t frames = padded_samples; + + for (const auto & module : encoder) { + switch (module.type) { + case moss_codec_module_type::PATCHED_PRETRANSFORM: + cur = moss_codec_patch_encode(cur, channels, frames, module.patch_size); + channels *= module.patch_size; + frames /= (size_t) module.patch_size; + break; + case moss_codec_module_type::TRANSFORMER: + cur = run_transformer_block(module.transformer, cur, frames); + channels = module.transformer.output_dimension; + break; + } + } + + if (channels != quantizer.input_dim) { + throw std::runtime_error("codec encoder output dimension does not match quantizer input dimension"); + } + + std::vector codes = run_quantizer_encode(cur, frames, nq); + if (out_frames) { + *out_frames = valid_frames; + } + + if (valid_frames >= frames) { + return codes; + } + + std::vector trimmed(valid_frames * 
(size_t) nq); + for (size_t t = 0; t < valid_frames; ++t) { + std::copy_n(codes.data() + t * (size_t) nq, nq, trimmed.data() + t * (size_t) nq); + } + return trimmed; + } +}; + +moss_audio_tokenizer::moss_audio_tokenizer( + const std::string & model_path, + const moss_audio_tokenizer_options & options) + : impl_(std::make_unique(model_path, options)) { +} + +moss_audio_tokenizer::~moss_audio_tokenizer() = default; + +moss_audio_tokenizer::moss_audio_tokenizer(moss_audio_tokenizer &&) noexcept = default; + +moss_audio_tokenizer & moss_audio_tokenizer::operator=(moss_audio_tokenizer &&) noexcept = default; + +int moss_audio_tokenizer::sample_rate() const { + return impl_->sample_rate; +} + +uint32_t moss_audio_tokenizer::downsample_rate() const { + return impl_->downsample_rate; +} + +uint32_t moss_audio_tokenizer::num_quantizers() const { + return impl_->num_quantizers; +} + +std::vector moss_audio_tokenizer::decode( + const std::vector & codes, + const size_t n_frames, + const uint32_t n_quantizers) const { + return impl_->decode(codes, n_frames, n_quantizers); +} + +std::vector moss_audio_tokenizer::encode( + const std::vector & audio, + size_t * out_frames, + const uint32_t n_quantizers) const { + return impl_->encode(audio, out_frames, n_quantizers); +} + +static std::string moss_codec_model_meta_str(const llama_model * model, const std::string & key) { + const auto it = model->gguf_kv.find(key); + if (it == model->gguf_kv.end()) { + throw std::runtime_error("missing GGUF key: " + key); + } + + std::string value = it->second; + if (value.size() >= 2 && ((value.front() == '\'' && value.back() == '\'') || (value.front() == '"' && value.back() == '"'))) { + value = value.substr(1, value.size() - 2); + } + return value; +} + +static uint32_t moss_codec_model_meta_u32(const llama_model * model, const std::string & key) { + return (uint32_t) std::stoul(moss_codec_model_meta_str(model, key)); +} + +static const ggml_tensor * moss_codec_model_require_tensor(const 
llama_model * model, const std::string & name) { + const ggml_tensor * tensor = model->get_tensor(name.c_str()); + if (tensor == nullptr) { + throw std::runtime_error("missing tensor: " + name); + } + return tensor; +} + +static const ggml_tensor * moss_codec_model_optional_tensor(const llama_model * model, const std::string & name) { + return model->get_tensor(name.c_str()); +} + +int moss_audio_model_sample_rate(const llama_model * model) { + const std::string arch_name = llm_arch_name(model->arch); + return (int) moss_codec_model_meta_u32(model, arch_name + ".sampling_rate"); +} + +uint32_t moss_audio_model_downsample_rate(const llama_model * model) { + const std::string arch_name = llm_arch_name(model->arch); + return moss_codec_model_meta_u32(model, arch_name + ".downsample_rate"); +} + +uint32_t moss_audio_model_num_quantizers(const llama_model * model) { + const std::string arch_name = llm_arch_name(model->arch); + return moss_codec_model_meta_u32(model, arch_name + ".quantizer.num_quantizers"); +} + +std::vector moss_audio_model_quantizer_encode( + const llama_model * model, + const std::vector & input, + size_t n_frames, + uint32_t n_quantizers_req) { + if (model->arch != LLM_ARCH_MOSS_TTS_AUDIO_ENCODER) { + throw std::runtime_error("quantizer encode expects a moss-tts-audio-encoder model"); + } + + const std::string arch_name = llm_arch_name(model->arch); + const uint32_t num_quantizers = moss_codec_model_meta_u32(model, arch_name + ".quantizer.num_quantizers"); + const uint32_t nq = n_quantizers_req == 0 ? 
num_quantizers : n_quantizers_req; + if (nq == 0 || nq > num_quantizers) { + throw std::runtime_error("invalid quantizer count"); + } + + moss_codec_linear_f32 quantizer_input_proj = moss_codec_linear_from_tensors( + const_cast(moss_codec_model_require_tensor(model, "quantizer.input_proj.weight")), + const_cast(moss_codec_model_optional_tensor(model, "quantizer.input_proj.bias"))); + + std::vector quantizers(nq); + for (uint32_t iq = 0; iq < nq; ++iq) { + auto & entry = quantizers[iq]; + const std::string prefix = "quantizer.quantizers." + std::to_string(iq); + entry.in_proj = moss_codec_linear_from_tensors( + const_cast(moss_codec_model_require_tensor(model, prefix + ".in_proj.weight")), + const_cast(moss_codec_model_optional_tensor(model, prefix + ".in_proj.bias"))); + entry.out_proj = moss_codec_linear_from_tensors( + const_cast(moss_codec_model_require_tensor(model, prefix + ".out_proj.weight")), + const_cast(moss_codec_model_optional_tensor(model, prefix + ".out_proj.bias"))); + const ggml_tensor * codebook = moss_codec_model_require_tensor(model, prefix + ".codebook.weight"); + entry.codebook_dim = (int) codebook->ne[0]; + entry.codebook_size = (int) codebook->ne[1]; + entry.codebook = moss_codec_tensor_to_f32(codebook); + entry.codebook_unit = moss_codec_normalize_rows(entry.codebook, entry.codebook_dim); + } + + std::vector residual = moss_codec_linear_apply(quantizer_input_proj, input, n_frames); + std::vector codes(n_frames * (size_t) nq, 0); + std::vector latents; + std::vector latents_unit; + std::vector decoded; + + for (uint32_t iq = 0; iq < nq; ++iq) { + const auto & entry = quantizers[iq]; + latents = moss_codec_linear_apply(entry.in_proj, residual, n_frames); + if (latents.size() != (size_t) entry.codebook_dim * n_frames) { + throw std::runtime_error("quantizer latent projection size mismatch"); + } + + latents_unit.resize(latents.size()); + for (size_t t = 0; t < n_frames; ++t) { + const float * in_ptr = latents.data() + t * (size_t) 
entry.codebook_dim; + float * out_ptr = latents_unit.data() + t * (size_t) entry.codebook_dim; + + float norm2 = 0.0f; + for (int d = 0; d < entry.codebook_dim; ++d) { + norm2 += in_ptr[d] * in_ptr[d]; + } + const float inv = 1.0f / std::sqrt(std::max(norm2, std::numeric_limits::epsilon())); + for (int d = 0; d < entry.codebook_dim; ++d) { + out_ptr[d] = in_ptr[d] * inv; + } + } + + std::vector codebook_emb((size_t) entry.codebook_dim * n_frames, 0.0f); + for (size_t t = 0; t < n_frames; ++t) { + const float * latent = latents_unit.data() + t * (size_t) entry.codebook_dim; + + float best_score = -std::numeric_limits::infinity(); + int best_index = 0; + for (int code = 0; code < entry.codebook_size; ++code) { + const float * row = entry.codebook_unit.data() + (size_t) code * (size_t) entry.codebook_dim; + float score = 0.0f; + for (int d = 0; d < entry.codebook_dim; ++d) { + score += row[d] * latent[d]; + } + if (score > best_score) { + best_score = score; + best_index = code; + } + } + + codes[t * (size_t) nq + iq] = best_index; + const float * row = entry.codebook.data() + (size_t) best_index * (size_t) entry.codebook_dim; + std::copy(row, row + entry.codebook_dim, codebook_emb.begin() + (ptrdiff_t) (t * (size_t) entry.codebook_dim)); + } + + decoded = moss_codec_linear_apply(entry.out_proj, codebook_emb, n_frames); + if (decoded.size() != residual.size()) { + throw std::runtime_error("quantizer decoded embedding size mismatch"); + } + + for (size_t i = 0; i < residual.size(); ++i) { + residual[i] -= decoded[i]; + } + } + + return codes; +} diff --git a/tools/tts/CMakeLists.txt b/tools/tts/CMakeLists.txt index b91a84759..838ad3b42 100644 --- a/tools/tts/CMakeLists.txt +++ b/tools/tts/CMakeLists.txt @@ -8,7 +8,7 @@ if(LLAMA_TOOLS_INSTALL) endif() set(TARGET llama-moss-tts) -add_executable(${TARGET} moss-tts.cpp) +add_executable(${TARGET} run-moss-tts-delay.cpp) target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) 
target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/tts/moss-tts-audio-decode.py b/tools/tts/moss-tts-audio-decode.py index 160579149..bf7e3ebe2 100755 --- a/tools/tts/moss-tts-audio-decode.py +++ b/tools/tts/moss-tts-audio-decode.py @@ -3,32 +3,14 @@ from __future__ import annotations import argparse -import os import struct -import sys import wave from pathlib import Path import numpy as np - -def resolve_moss_tts_dir() -> Path: - env_dir = os.getenv("MOSS_TTS_DIR") or os.getenv("MOSS_TTS_ROOT") - if env_dir: - path = Path(env_dir).expanduser().resolve() - else: - path = Path(__file__).resolve().parents[3] / "MOSS-TTS" - - if not path.is_dir(): - raise FileNotFoundError( - f"MOSS-TTS repo not found: {path}. Set MOSS_TTS_DIR to the MOSS-TTS checkout root." - ) - return path - - -sys.path.insert(0, str(resolve_moss_tts_dir())) - -from moss_tts_delay.llama_cpp._constants import N_VQ, SAMPLE_RATE # noqa: E402 +from moss_tts_onnx import OnnxAudioTokenizer +from moss_tts_processor import N_VQ, SAMPLE_RATE CODES_MAGIC = 0x53444F43 # "CODS" @@ -73,13 +55,6 @@ def main() -> int: ap.add_argument("--cpu", action="store_true") args = ap.parse_args() - try: - from moss_audio_tokenizer.onnx import OnnxAudioTokenizer - except Exception as exc: - raise RuntimeError( - "moss_audio_tokenizer.onnx is unavailable; initialize the submodule/package and install ONNX deps" - ) from exc - codes = read_codes(Path(args.codes_bin)) if codes.ndim != 2 or codes.shape[1] != N_VQ: raise RuntimeError(f"expected raw codes with shape (T, {N_VQ}), got {codes.shape}") diff --git a/tools/tts/moss-tts-build-generation-ref.py b/tools/tts/moss-tts-build-generation-ref.py index 48a784673..aa01d5e81 100755 --- a/tools/tts/moss-tts-build-generation-ref.py +++ b/tools/tts/moss-tts-build-generation-ref.py @@ -3,31 +3,19 @@ from __future__ import annotations import argparse -import os import struct import sys from pathlib import Path import numpy as np +from moss_tts_onnx import 
OnnxAudioTokenizer +from moss_tts_processor import AUDIO_PAD_CODE, Tokenizer, build_generation_prompt + REF_MAGIC = 0x4652474D # "MGRF" REF_VERSION = 1 -def resolve_moss_tts_dir() -> Path: - env_dir = os.getenv("MOSS_TTS_DIR") or os.getenv("MOSS_TTS_ROOT") - if env_dir: - path = Path(env_dir).expanduser().resolve() - else: - path = Path(__file__).resolve().parents[3] / "MOSS-TTS" - - if not path.is_dir(): - raise FileNotFoundError( - f"MOSS-TTS repo not found: {path}. Set MOSS_TTS_DIR to the MOSS-TTS checkout root." - ) - return path - - def parse_args() -> argparse.Namespace: ap = argparse.ArgumentParser( description="Build first-class MOSS-TTS generation input (.bin) from text (+ optional reference audio)." @@ -59,7 +47,6 @@ def _read_reference_codes(args: argparse.Namespace) -> np.ndarray | None: raise ValueError("--encoder-onnx and --decoder-onnx are required when --reference-audio is set") import soundfile as sf - from moss_audio_tokenizer.onnx import OnnxAudioTokenizer wav, sr = sf.read(args.reference_audio, dtype="float32") if wav.ndim > 1: @@ -79,11 +66,6 @@ def _read_reference_codes(args: argparse.Namespace) -> np.ndarray | None: def main() -> int: args = parse_args() - sys.path.insert(0, str(resolve_moss_tts_dir())) - - from moss_tts_delay.llama_cpp._constants import AUDIO_PAD_CODE - from moss_tts_delay.llama_cpp.processor import Tokenizer, build_generation_prompt - text = _load_text(args) reference_codes = _read_reference_codes(args) diff --git a/tools/tts/moss-tts-firstclass-e2e.py b/tools/tts/moss-tts-firstclass-e2e.py index 21647373f..bf112597e 100755 --- a/tools/tts/moss-tts-firstclass-e2e.py +++ b/tools/tts/moss-tts-firstclass-e2e.py @@ -13,8 +13,9 @@ def run_cmd(cmd: list[str], env: dict[str, str] | None = None) -> subprocess.CompletedProcess: - print("+", shlex.join(cmd), flush=True) - return subprocess.run(cmd, env=env, check=False) + cmd_str = shlex.join(cmd) + print("+", cmd_str, flush=True) + return subprocess.run(cmd_str, env=env, 
check=False, shell=True) def need_file(path: Path, name: str) -> None: @@ -32,7 +33,11 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument("--model-gguf", default=os.getenv("MODEL_GGUF", "")) - parser.add_argument("--moss-tts-dir", default=os.getenv("MOSS_TTS_DIR", os.getenv("MOSS_TTS_ROOT", ""))) + parser.add_argument( + "--moss-tts-dir", + default=os.getenv("MOSS_TTS_DIR", os.getenv("MOSS_TTS_ROOT", "")), + help="Deprecated compatibility flag; the first-class helpers no longer require a MOSS-TTS checkout.", + ) parser.add_argument("--tokenizer-dir", default=os.getenv("TOKENIZER_DIR", "")) parser.add_argument("--onnx-encoder", default=os.getenv("ONNX_ENCODER", "")) parser.add_argument("--onnx-decoder", default=os.getenv("ONNX_DECODER", "")) @@ -82,7 +87,6 @@ def main() -> int: onnx_decoder = Path(args.onnx_decoder).expanduser().resolve() python_bin = Path(args.python_bin).expanduser().resolve() output_wav = Path(args.output_wav).expanduser().resolve() - moss_tts_dir = Path(args.moss_tts_dir).expanduser().resolve() if args.moss_tts_dir else None need_file(python_bin, "python binary") need_file(model_gguf, "first-class model gguf") @@ -91,8 +95,6 @@ def main() -> int: need_file(onnx_decoder, "ONNX decoder") need_file(build_ref_script, "generation-ref builder") need_file(decode_script, "audio decode helper") - if moss_tts_dir is not None and not moss_tts_dir.is_dir(): - raise FileNotFoundError(f"missing MOSS-TTS repo: {moss_tts_dir}") if args.text_file: need_file(Path(args.text_file).expanduser().resolve(), "text file") if args.reference_audio: @@ -119,12 +121,6 @@ def main() -> int: need_file(llama_bin, "llama-moss-tts binary") output_wav.parent.mkdir(parents=True, exist_ok=True) shared_env = os.environ.copy() - if moss_tts_dir is not None: - shared_env["MOSS_TTS_DIR"] = str(moss_tts_dir) - old_pythonpath = shared_env.get("PYTHONPATH") - shared_env["PYTHONPATH"] = ( - f"{moss_tts_dir}{os.pathsep}{old_pythonpath}" if old_pythonpath else str(moss_tts_dir) 
- ) with tempfile.TemporaryDirectory(prefix="moss-tts-firstclass-") as tmpdir: tmpdir_path = Path(tmpdir) diff --git a/tools/tts/moss_tts_onnx.py b/tools/tts/moss_tts_onnx.py new file mode 100644 index 000000000..6235bc2f6 --- /dev/null +++ b/tools/tts/moss_tts_onnx.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +from pathlib import Path + +import numpy as np + +N_QUANTIZERS = 32 +DOWNSAMPLE_RATE = 1920 + + +def _load_ort_session(model_path: str | Path, use_gpu: bool): + try: + import onnxruntime as ort + except ImportError as exc: + raise RuntimeError("onnxruntime is required for MOSS audio tokenizer ONNX inference") from exc + + providers = ["CPUExecutionProvider"] + if use_gpu: + available = set(ort.get_available_providers()) + if "CUDAExecutionProvider" in available: + providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] + + session_options = ort.SessionOptions() + return ort.InferenceSession(str(model_path), sess_options=session_options, providers=providers) + + +class OnnxAudioTokenizer: + """Minimal ONNX wrapper for the MOSS audio tokenizer.""" + + def __init__(self, encoder_path: str | Path, decoder_path: str | Path, use_gpu: bool = True): + self.encoder_session = _load_ort_session(encoder_path, use_gpu) + self.decoder_session = _load_ort_session(decoder_path, use_gpu) + self.encoder_inputs = [item.name for item in self.encoder_session.get_inputs()] + self.encoder_outputs = [item.name for item in self.encoder_session.get_outputs()] + self.decoder_inputs = [item.name for item in self.decoder_session.get_inputs()] + self.decoder_outputs = [item.name for item in self.decoder_session.get_outputs()] + + def encode(self, waveform: np.ndarray, n_quantizers: int = N_QUANTIZERS) -> np.ndarray: + if waveform.ndim == 1: + waveform = waveform[np.newaxis, np.newaxis, :] + elif waveform.ndim == 2: + waveform = waveform[np.newaxis, :] + + t = waveform.shape[-1] + padded = ((t + DOWNSAMPLE_RATE - 1) // DOWNSAMPLE_RATE) * DOWNSAMPLE_RATE + if padded != 
t: + waveform = np.concatenate( + [waveform, np.zeros((waveform.shape[0], waveform.shape[1], padded - t), dtype=np.float32)], + axis=-1, + ) + + result = self.encoder_session.run( + self.encoder_outputs, + { + self.encoder_inputs[0]: waveform.astype(np.float32), + self.encoder_inputs[1]: np.array(n_quantizers, dtype=np.int64), + }, + ) + return result[0][:, 0, :int(result[1][0])].T.astype(np.int64) + + def decode(self, audio_codes: np.ndarray, n_quantizers: int = N_QUANTIZERS) -> np.ndarray: + if audio_codes.ndim == 2: + if audio_codes.shape[1] == N_QUANTIZERS and audio_codes.shape[0] != N_QUANTIZERS: + audio_codes = audio_codes.T + audio_codes = audio_codes[:, np.newaxis, :] + + result = self.decoder_session.run( + self.decoder_outputs, + { + self.decoder_inputs[0]: audio_codes.astype(np.int64), + self.decoder_inputs[1]: np.array(n_quantizers, dtype=np.int64), + }, + ) + return result[0][0, 0, :int(result[1][0])].astype(np.float32) diff --git a/tools/tts/moss_tts_processor.py b/tools/tts/moss_tts_processor.py new file mode 100644 index 000000000..6f1b456e5 --- /dev/null +++ b/tools/tts/moss_tts_processor.py @@ -0,0 +1,269 @@ +from __future__ import annotations + +import logging +import re +from pathlib import Path + +import numpy as np + +log = logging.getLogger(__name__) + +AUDIO_PLACEHOLDER = "<|audio|>" + +N_VQ = 32 +PAD_TOKEN_ID = 151643 +IM_START_TOKEN_ID = 151644 +IM_END_TOKEN_ID = 151645 +AUDIO_START_TOKEN_ID = 151652 +AUDIO_END_TOKEN_ID = 151653 +AUDIO_USER_SLOT_TOKEN_ID = 151654 +AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID = 151656 +AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID = 151662 +AUDIO_PAD_CODE = 1024 +AUDIO_VOCAB_SIZE = 1024 +SAMPLE_RATE = 24000 + + +class Tokenizer: + """Thin wrapper around the Hugging Face `tokenizers` library.""" + + def __init__(self, tokenizer_dir: str | Path): + from tokenizers import Tokenizer as HFTokenizer + + tokenizer_path = Path(tokenizer_dir) / "tokenizer.json" + if not tokenizer_path.exists(): + raise FileNotFoundError(f"tokenizer.json 
not found in {tokenizer_dir}") + self._tok = HFTokenizer.from_file(str(tokenizer_path)) + log.info("Tokenizer loaded from %s (vocab=%d)", tokenizer_path, self._tok.get_vocab_size()) + + def encode(self, text: str) -> list[int]: + return self._tok.encode(text).ids + + def decode(self, ids: list[int]) -> str: + return self._tok.decode(ids) + + @property + def vocab_size(self) -> int: + return self._tok.get_vocab_size() + + def id_to_token(self, token_id: int) -> str | None: + return self._tok.id_to_token(token_id) + + +def _get_special_token_str(tokenizer: Tokenizer, token_id: int) -> str: + token = tokenizer.id_to_token(token_id) + if token is None: + raise ValueError(f"Token ID {token_id} not in vocabulary") + return token + + +def build_generation_prompt( + tokenizer: Tokenizer, + text: str, + reference_codes: np.ndarray | None = None, + instruction: str | None = None, + tokens: int | None = None, + quality: str | None = None, + language: str | None = None, + sound_event: str | None = None, + ambient_sound: str | None = None, +) -> np.ndarray: + """Build the packed multi-channel prompt as (T, 1 + N_VQ).""" + + audio_start_tok = _get_special_token_str(tokenizer, AUDIO_START_TOKEN_ID) + audio_end_tok = _get_special_token_str(tokenizer, AUDIO_END_TOKEN_ID) + gen_slot_tok = _get_special_token_str(tokenizer, AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID) + delay_slot_tok = _get_special_token_str(tokenizer, AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID) + user_slot_tok = _get_special_token_str(tokenizer, AUDIO_USER_SLOT_TOKEN_ID) + im_start_tok = _get_special_token_str(tokenizer, IM_START_TOKEN_ID) + im_end_tok = _get_special_token_str(tokenizer, IM_END_TOKEN_ID) + + has_ref = reference_codes is not None and reference_codes.shape[0] > 0 + ref_str = f"[S1]:\n{AUDIO_PLACEHOLDER}" if has_ref else "None" + + user_content = ( + f"\n" + f"- Reference(s):\n{ref_str}\n" + f"- Instruction:\n{instruction}\n" + f"- Tokens:\n{tokens}\n" + f"- Quality:\n{quality}\n" + f"- Sound Event:\n{sound_event}\n" + 
f"- Ambient Sound:\n{ambient_sound}\n" + f"- Language:\n{language}\n" + f"- Text:\n{text}\n" + f"" + ) + + ref_lengths = [reference_codes.shape[0]] if has_ref else [] + user_content = _replace_audio_placeholders( + user_content, + ref_lengths, + n_vq=N_VQ, + gen_slot_token=user_slot_tok, + delay_slot_token=user_slot_tok, + audio_start_token=audio_start_tok, + audio_end_token=audio_end_tok, + ) + + full_text = f"{im_start_tok}user\n{user_content}{im_end_tok}\n{im_start_tok}assistant\n" + ref_audio_list = [reference_codes] if has_ref else [] + unified_codes = _get_unified_codes(tokenizer, full_text, ref_audio_list) + + assistant_gen = f"{audio_start_tok}" + gen_ids = np.array(tokenizer.encode(assistant_gen), dtype=np.int64) + gen_multi = np.full((len(gen_ids), 1 + N_VQ), AUDIO_PAD_CODE, dtype=np.int64) + gen_multi[:, 0] = gen_ids + + return np.concatenate([unified_codes, gen_multi], axis=0) + + +def parse_generation_output( + tokenizer: Tokenizer, + generation_ids: np.ndarray, + prompt_len: int, +) -> tuple[str, np.ndarray]: + """Parse generated packed IDs into text and raw audio codes.""" + + gen_part = generation_ids[prompt_len:] + text_channel = gen_part[:, 0].tolist() + audio_channels = gen_part[:, 1:] + + audio_start_tok = _get_special_token_str(tokenizer, AUDIO_START_TOKEN_ID) + gen_slot_tok = _get_special_token_str(tokenizer, AUDIO_ASSISTANT_GEN_SLOT_TOKEN_ID) + delay_slot_tok = _get_special_token_str(tokenizer, AUDIO_ASSISTANT_DELAY_SLOT_TOKEN_ID) + audio_end_tok = _get_special_token_str(tokenizer, AUDIO_END_TOKEN_ID) + + raw_text = tokenizer.decode(text_channel) + pattern = re.compile( + rf"(?:{re.escape(audio_start_tok)})?" 
+ rf"(?:{re.escape(gen_slot_tok)})*" + rf"(?:{re.escape(delay_slot_tok)})*" + rf"{re.escape(audio_end_tok)}" + ) + + def repl(match: re.Match[str]) -> str: + segment = match.group(0) + if gen_slot_tok in segment: + return AUDIO_PLACEHOLDER + return "" + + text = pattern.sub(repl, raw_text) + segments = extract_audio_segments(audio_channels) + if segments: + audio_codes = np.concatenate(segments, axis=0) + else: + audio_codes = np.zeros((0, N_VQ), dtype=np.int64) + + return text, audio_codes + + +def _replace_audio_placeholders( + content: str, + lengths: list[int], + n_vq: int, + gen_slot_token: str, + delay_slot_token: str, + audio_start_token: str, + audio_end_token: str, +) -> str: + num_placeholders = content.count(AUDIO_PLACEHOLDER) + if num_placeholders != len(lengths): + raise ValueError(f"Placeholder count ({num_placeholders}) != lengths count ({len(lengths)})") + + lengths_iter = iter(lengths) + + def _build_block(length: int) -> str: + if length == 0: + return f"{audio_start_token}{audio_end_token}" + step_tokens = gen_slot_token * length + delay_slot_token * (n_vq - 1) + return f"{audio_start_token}{step_tokens}{audio_end_token}" + + def replacer(match: re.Match[str]) -> str: + return _build_block(next(lengths_iter)) + + return re.sub(re.escape(AUDIO_PLACEHOLDER), replacer, content) + + +def _get_unified_codes( + tokenizer: Tokenizer, + content: str, + audio_codes_list: list[np.ndarray], + truncation: bool = False, +) -> np.ndarray: + text_ids = np.array(tokenizer.encode(content), dtype=np.int64) + + if len(audio_codes_list) == 0: + audio_channel = np.full((len(text_ids), N_VQ), AUDIO_PAD_CODE, dtype=np.int64) + return np.concatenate([text_ids[:, np.newaxis], audio_channel], axis=1) + + audio_start_indices = np.where(text_ids == AUDIO_START_TOKEN_ID)[0] + audio_end_indices = np.where(text_ids == AUDIO_END_TOKEN_ID)[0] + + if len(audio_start_indices) != len(audio_codes_list) or len(audio_end_indices) != len(audio_codes_list): + raise ValueError( + f"Audio 
def apply_de_delay_pattern(delay_codes: np.ndarray) -> np.ndarray:
    """Invert the per-channel delay pattern.

    Channel ``c`` of the delayed stream is shifted forward by ``c`` steps;
    this slices each channel back into alignment, returning a (T, n_vq)
    matrix (empty when the input is shorter than the delay span).
    """
    total_len, n_vq = delay_codes.shape
    n_frames = total_len - n_vq + 1
    if n_frames <= 0:
        return np.zeros((0, n_vq), dtype=delay_codes.dtype)
    columns = [delay_codes[ch:ch + n_frames, ch] for ch in range(n_vq)]
    return np.stack(columns, axis=1)
// Canonical 44-byte RIFF/WAVE header for mono 16-bit integer PCM.
// Fixed fields are given defaults here; save_wav16() fills in the
// size- and rate-dependent fields before writing the struct verbatim.
// NOTE(review): written with memcpy/write of the whole struct — assumes
// no padding between members (true for this layout on common ABIs).
struct wav_header {
    char riff[4] = {'R', 'I', 'F', 'F'};  // RIFF chunk id
    uint32_t chunk_size;                  // 36 + data_size
    char wave[4] = {'W', 'A', 'V', 'E'};  // RIFF form type
    char fmt[4] = {'f', 'm', 't', ' '};   // "fmt " sub-chunk id
    uint32_t fmt_chunk_size = 16;         // PCM fmt chunk payload size
    uint16_t audio_format = 1;            // 1 = integer PCM
    uint16_t num_channels = 1;            // mono output
    uint32_t sample_rate;                 // filled by the writer
    uint32_t byte_rate;                   // sample_rate * channels * bytes/sample
    uint16_t block_align;                 // channels * bytes/sample
    uint16_t bits_per_sample = 16;
    char data[4] = {'d', 'a', 't', 'a'};  // data sub-chunk id
    uint32_t data_size;                   // PCM payload size in bytes
};
std::string & audio_decoder_model_path, + const std::string & python_bin, + const std::string & helper_script, + const std::string & encoder_onnx, + const std::string & decoder_onnx, + const std::string & wav_out, + bool use_gpu_audio); +static void moss_generate_from_prompt( + const std::string & model_path, + const std::vector & prompt_packed, + size_t prompt_frames, + size_t reference_frames, + int32_t n_gpu_layers, + int32_t max_new_tokens, + const moss_sampling_config & sampling_cfg, + uint32_t seed, + const std::string & dump_raw_codes_path, + const std::string & audio_decoder_model_path, + const std::string & python_bin, + const std::string & helper_script, + const std::string & encoder_onnx, + const std::string & decoder_onnx, + const std::string & wav_out, + bool use_gpu_audio); + +static bool save_wav16(const std::string & fname, const std::vector & data, int sample_rate); +static std::vector moss_encode_audio_llama( + const std::string & audio_encoder_model_path, + const std::string & wav_path, + int32_t n_gpu_layers, + uint32_t n_quantizers, + size_t * out_frames); +static void moss_decode_audio_llama( + const std::string & audio_decoder_model_path, + const std::vector & raw_codes, + size_t raw_frames, + const moss_delay_config & cfg, + int32_t n_gpu_layers, + const std::string & wav_out_path); +static void moss_decode_audio_native( + const std::string & model_path, + const std::vector & raw_codes, + size_t raw_frames, + const moss_delay_config & cfg, + const std::string & wav_out_path); +static std::vector moss_read_wav_f32_mono(const std::string & path, int expected_sample_rate); +static moss_prompt_input moss_build_prompt_input( + const llama_vocab * vocab, + const moss_delay_config & cfg, + const std::string & text, + const std::string & language, + const std::vector & reference_codes, + size_t reference_frames); +static void moss_generate_from_text( + const std::string & model_path, + const std::string & text, + const std::string & language, + 
const std::string & reference_audio_path, + int32_t n_gpu_layers, + int32_t max_new_tokens, + const moss_sampling_config & sampling_cfg, + uint32_t seed, + const std::string & dump_raw_codes_path, + const std::string & audio_encoder_model_path, + const std::string & audio_decoder_model_path, const std::string & python_bin, const std::string & helper_script, const std::string & encoder_onnx, @@ -182,15 +272,25 @@ static void moss_generate_from_ref( struct llama_backend_scope { llama_backend_scope() { - llama_backend_init(); + if (refcount().fetch_add(1, std::memory_order_acq_rel) == 0) { + llama_backend_init(); + } } ~llama_backend_scope() { - llama_backend_free(); + if (refcount().fetch_sub(1, std::memory_order_acq_rel) == 1) { + llama_backend_free(); + } } llama_backend_scope(const llama_backend_scope &) = delete; llama_backend_scope & operator=(const llama_backend_scope &) = delete; + +private: + static std::atomic & refcount() { + static std::atomic value{0}; + return value; + } }; struct moss_owned_batch { @@ -256,9 +356,13 @@ static void print_usage(int argc, char ** argv) { LOG("\nexample usage:\n"); LOG(" %s -m model.gguf --print-delay-config\n", argv[0]); LOG(" %s -m model.gguf --generation-input generation.input.bin -ngl -1\n", argv[0]); + LOG(" %s -m model.gguf --audio-decoder-model audio_decoder.gguf --text \"你好,世界。\" --wav-out out.wav -ngl -1\n", argv[0]); + LOG(" %s -m model.gguf --audio-encoder-model audio_encoder.gguf --audio-decoder-model audio_decoder.gguf --text \"你好,世界。\" --reference-audio ref.wav --wav-out out.wav -ngl -1\n", argv[0]); LOG(" %s --decode-parity-ref decode.ref.bin\n", argv[0]); LOG("\noptions:\n"); LOG(" -ngl, --gpu-layers, --n-gpu-layers N number of layers to offload to GPU (default: -1)\n"); + LOG(" --audio-encoder-model PATH native moss-tts-audio-encoder GGUF for reference wav -> codes\n"); + LOG(" --audio-decoder-model PATH native moss-tts-audio-decoder GGUF for codes -> wav\n"); LOG("\n"); } @@ -347,6 +451,77 @@ static size_t 
moss_audio_vocab_with_pad(const moss_delay_config & cfg) { return std::max(cfg.audio_vocab_size + 1u, (size_t) cfg.audio_pad_code + 1u); } +static std::string moss_model_architecture(const llama_model * model) { + char buf[128]; + const int32_t n = llama_model_meta_val_str(model, "general.architecture", buf, sizeof(buf)); + if (n <= 0) { + throw std::runtime_error("missing general.architecture in GGUF metadata"); + } + return std::string(buf); +} + +struct moss_audio_runtime { + llama_model_ptr model; + llama_context_ptr ctx; +}; + +static llama_model_ptr moss_load_audio_model( + const std::string & model_path, + const char * expected_arch, + int32_t n_gpu_layers) { + llama_model_params mparams = llama_model_default_params(); + mparams.use_mmap = true; + mparams.n_gpu_layers = n_gpu_layers; + + llama_model_ptr model(llama_model_load_from_file(model_path.c_str(), mparams)); + if (!model) { + throw std::runtime_error("failed to load audio model: " + model_path); + } + + const std::string arch = moss_model_architecture(model.get()); + if (arch != expected_arch) { + throw std::runtime_error( + "unexpected audio model architecture for " + model_path + + ": expected " + expected_arch + ", got " + arch); + } + + return model; +} + +static llama_context_ptr moss_init_audio_context( + llama_model * model, + uint32_t n_ctx) { + llama_context_params cparams = llama_context_default_params(); + cparams.n_ctx = std::max(n_ctx, 1u); + cparams.n_batch = std::max(n_ctx, 1u); + cparams.n_ubatch = cparams.n_batch; + cparams.n_seq_max = 1; + cparams.embeddings = true; + cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; + + llama_context_ptr ctx(llama_init_from_model(model, cparams)); + if (!ctx) { + throw std::runtime_error("failed to create audio context"); + } + + llama_set_warmup(ctx.get(), false); + llama_set_causal_attn(ctx.get(), false); + llama_set_embeddings(ctx.get(), true); + + return ctx; +} + +static moss_audio_runtime moss_load_audio_runtime( + const std::string & 
model_path, + const char * expected_arch, + int32_t n_gpu_layers, + uint32_t n_ctx) { + moss_audio_runtime runtime; + runtime.model = moss_load_audio_model(model_path, expected_arch, n_gpu_layers); + runtime.ctx = moss_init_audio_context(runtime.model.get(), n_ctx); + return runtime; +} + static int64_t moss_find_last_equal(const std::vector & values, llama_token target) { for (int64_t i = (int64_t) values.size() - 1; i >= 0; --i) { if (values[(size_t) i] == target) { @@ -948,6 +1123,472 @@ static void moss_write_codes_file( moss_write_exact(out, raw_codes.data(), raw_codes.size(), "codes payload"); } +static bool save_wav16(const std::string & fname, const std::vector & data, int sample_rate) { + std::ofstream file(fname, std::ios::binary); + if (!file) { + LOG_ERR("%s: failed to open '%s' for writing\n", __func__, fname.c_str()); + return false; + } + + wav_header header; + header.sample_rate = (uint32_t) sample_rate; + header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8); + header.block_align = header.num_channels * (header.bits_per_sample / 8); + header.data_size = (uint32_t) (data.size() * (header.bits_per_sample / 8)); + header.chunk_size = 36 + header.data_size; + + file.write(reinterpret_cast(&header), sizeof(header)); + + for (const float sample : data) { + const int16_t pcm = (int16_t) std::clamp(sample * 32767.0f, -32768.0f, 32767.0f); + file.write(reinterpret_cast(&pcm), sizeof(pcm)); + } + + return file.good(); +} + +static moss_owned_batch moss_batch_from_audio_waveform(const std::vector & audio) { + moss_owned_batch owned_batch((int32_t) audio.size(), 1, 1); + llama_batch & batch = owned_batch.batch; + batch.n_tokens = (int32_t) audio.size(); + + for (size_t i = 0; i < audio.size(); ++i) { + batch.embd[i] = audio[i]; + batch.pos[i] = (llama_pos) i; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = 0; + batch.logits[i] = 1; + } + + return owned_batch; +} + +static moss_owned_batch moss_batch_from_audio_codes( + 
const std::vector & raw_codes, + size_t raw_frames, + uint32_t n_quantizers) { + GGML_ASSERT(raw_codes.size() == raw_frames * (size_t) n_quantizers); + + moss_owned_batch owned_batch((int32_t) raw_frames, 0, 1); + llama_batch & batch = owned_batch.batch; + batch.n_tokens = (int32_t) raw_frames; + batch.n_token_audio = (int32_t) n_quantizers; + owned_batch.token_audio = raw_codes; + owned_batch.refresh_token_audio_ptr(); + + for (size_t i = 0; i < raw_frames; ++i) { + batch.token[i] = 0; + batch.pos[i] = (llama_pos) i; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = 0; + batch.logits[i] = 1; + } + + return owned_batch; +} + +static std::vector moss_encode_audio_llama( + const std::string & audio_encoder_model_path, + const std::string & wav_path, + int32_t n_gpu_layers, + uint32_t n_quantizers, + size_t * out_frames) { + moss_audio_runtime runtime; + runtime.model = moss_load_audio_model( + audio_encoder_model_path, + "moss-tts-audio-encoder", + n_gpu_layers); + const int sample_rate = moss_audio_model_sample_rate(runtime.model.get()); + const uint32_t downsample_rate = moss_audio_model_downsample_rate(runtime.model.get()); + const uint32_t model_quantizers = moss_audio_model_num_quantizers(runtime.model.get()); + const uint32_t nq = n_quantizers == 0 ? 
model_quantizers : n_quantizers; + if (nq == 0 || nq > model_quantizers) { + throw std::runtime_error("invalid audio encoder quantizer count"); + } + + const std::vector wav = moss_read_wav_f32_mono(wav_path, sample_rate); + const size_t padded_samples = + ((wav.size() + (size_t) downsample_rate - 1) / (size_t) downsample_rate) * (size_t) downsample_rate; + const size_t valid_frames = wav.size() / (size_t) downsample_rate; + + if (padded_samples == 0) { + if (out_frames != nullptr) { + *out_frames = 0; + } + return {}; + } + + std::vector padded_wav(padded_samples, 0.0f); + std::copy(wav.begin(), wav.end(), padded_wav.begin()); + + runtime.ctx = moss_init_audio_context(runtime.model.get(), (uint32_t) padded_samples); + + moss_owned_batch batch = moss_batch_from_audio_waveform(padded_wav); + const int ret = llama_encode(runtime.ctx.get(), batch.batch); + if (ret != 0) { + throw std::runtime_error("audio encoder llama_encode failed: " + std::to_string(ret)); + } + + const int32_t n_out_i32 = llama_model_n_out_i32(runtime.model.get()); + const size_t padded_frames = padded_samples / (size_t) downsample_rate; + const int32_t * codes_i32 = llama_get_output_i32(runtime.ctx.get()); + if (codes_i32 == nullptr) { + throw std::runtime_error("audio encoder returned null raw i32 outputs"); + } + + if (n_out_i32 != (int32_t) nq) { + throw std::runtime_error("audio encoder raw i32 width does not match quantizer count"); + } + + std::vector codes(padded_frames * (size_t) nq); + for (size_t t = 0; t < padded_frames; ++t) { + const int32_t * row = codes_i32 + t * (size_t) n_out_i32; + std::copy_n(row, nq, codes.data() + t * (size_t) nq); + } + + if (out_frames != nullptr) { + *out_frames = valid_frames; + } + if (valid_frames >= padded_frames) { + return codes; + } + + std::vector trimmed(valid_frames * (size_t) nq); + for (size_t t = 0; t < valid_frames; ++t) { + std::copy_n( + codes.data() + t * (size_t) nq, + nq, + trimmed.data() + t * (size_t) nq); + } + return trimmed; +} + 
// Decode raw audio codes to PCM by running the standalone
// moss-tts-audio-decoder GGUF through llama_encode(), then write the
// result as a mono 16-bit WAV.
//
// `raw_codes` is a row-major (raw_frames x cfg.n_vq) matrix of codebook
// indices. Throws std::runtime_error on quantizer-count mismatch, payload
// size mismatch, inference failure, or WAV write failure. An empty input
// (raw_frames == 0) still writes a valid zero-sample WAV.
static void moss_decode_audio_llama(
    const std::string & audio_decoder_model_path,
    const std::vector<int32_t> & raw_codes,
    size_t raw_frames,
    const moss_delay_config & cfg,
    int32_t n_gpu_layers,
    const std::string & wav_out_path) {
    // context sized to take every frame in one batch (min 1 so init succeeds)
    moss_audio_runtime runtime = moss_load_audio_runtime(
        audio_decoder_model_path,
        "moss-tts-audio-decoder",
        n_gpu_layers,
        std::max((uint32_t) raw_frames, 1u));

    const int sample_rate = moss_audio_model_sample_rate(runtime.model.get());
    const uint32_t downsample_rate = moss_audio_model_downsample_rate(runtime.model.get());
    const uint32_t model_quantizers = moss_audio_model_num_quantizers(runtime.model.get());
    // the caller's delay config must agree with the codec's quantizer count
    if (cfg.n_vq != model_quantizers) {
        throw std::runtime_error(
            "audio decoder quantizer count mismatch: model expects " +
            std::to_string(model_quantizers) + ", got " + std::to_string(cfg.n_vq));
    }
    if (raw_codes.size() != raw_frames * (size_t) cfg.n_vq) {
        throw std::runtime_error("audio decoder raw code payload size mismatch");
    }

    std::vector<float> audio;
    if (raw_frames > 0) {
        moss_owned_batch batch = moss_batch_from_audio_codes(raw_codes, raw_frames, cfg.n_vq);
        const int ret = llama_encode(runtime.ctx.get(), batch.batch);
        if (ret != 0) {
            throw std::runtime_error("audio decoder llama_encode failed: " + std::to_string(ret));
        }

        // the decoder's "embedding" output is the waveform itself, one
        // float per sample, so its embedding width must be exactly 1
        const int32_t n_embd_out = llama_model_n_embd_out(runtime.model.get());
        if (n_embd_out != 1) {
            throw std::runtime_error("audio decoder output dimension must be 1");
        }

        // each code frame expands to downsample_rate PCM samples
        const size_t n_samples = raw_frames * (size_t) downsample_rate;
        const float * embd = llama_get_embeddings(runtime.ctx.get());
        if (embd == nullptr) {
            throw std::runtime_error("audio decoder returned null embeddings");
        }
        audio.assign(embd, embd + n_samples);
    }

    if (!save_wav16(wav_out_path, audio, sample_rate)) {
        throw std::runtime_error("failed to write WAV file: " + wav_out_path);
    }
}
std::vector & raw_codes, + size_t raw_frames, + const moss_delay_config & cfg, + const std::string & wav_out_path) { + moss_audio_tokenizer_options codec_opts; + codec_opts.n_threads = cpu_get_num_math(); + + moss_audio_tokenizer codec(model_path, codec_opts); + const std::vector audio = codec.decode(raw_codes, raw_frames, cfg.n_vq); + + if (!save_wav16(wav_out_path, audio, codec.sample_rate())) { + throw std::runtime_error("failed to write WAV file: " + wav_out_path); + } +} + +static std::vector moss_read_wav_f32_mono(const std::string & path, int expected_sample_rate) { + std::ifstream in(path, std::ios::binary); + if (!in) { + throw std::runtime_error("failed to open wav file: " + path); + } + + auto read_u16 = [&](uint16_t & value) { + in.read(reinterpret_cast(&value), sizeof(value)); + if (!in) { + throw std::runtime_error("failed to read wav u16 field"); + } + }; + auto read_u32 = [&](uint32_t & value) { + in.read(reinterpret_cast(&value), sizeof(value)); + if (!in) { + throw std::runtime_error("failed to read wav u32 field"); + } + }; + + char riff[4]; + char wave[4]; + uint32_t chunk_size = 0; + in.read(riff, 4); + read_u32(chunk_size); + in.read(wave, 4); + if (!in || std::memcmp(riff, "RIFF", 4) != 0 || std::memcmp(wave, "WAVE", 4) != 0) { + throw std::runtime_error("unsupported wav header: " + path); + } + + uint16_t audio_format = 0; + uint16_t num_channels = 0; + uint32_t sample_rate = 0; + uint16_t bits_per_sample = 0; + std::vector data_chunk; + + while (in) { + char chunk_id[4]; + uint32_t chunk_bytes = 0; + in.read(chunk_id, 4); + if (!in) { + break; + } + read_u32(chunk_bytes); + + if (std::memcmp(chunk_id, "fmt ", 4) == 0) { + uint32_t byte_rate = 0; + uint16_t block_align = 0; + read_u16(audio_format); + read_u16(num_channels); + read_u32(sample_rate); + read_u32(byte_rate); + read_u16(block_align); + read_u16(bits_per_sample); + + const size_t fmt_extra = chunk_bytes > 16 ? 
chunk_bytes - 16 : 0; + if (fmt_extra > 0) { + in.seekg((std::streamoff) fmt_extra, std::ios::cur); + } + } else if (std::memcmp(chunk_id, "data", 4) == 0) { + data_chunk.resize(chunk_bytes); + in.read(reinterpret_cast(data_chunk.data()), (std::streamsize) chunk_bytes); + } else { + in.seekg((std::streamoff) chunk_bytes, std::ios::cur); + } + + if (chunk_bytes & 1u) { + in.seekg(1, std::ios::cur); + } + } + + if (audio_format == 0 || num_channels == 0 || sample_rate == 0 || bits_per_sample == 0 || data_chunk.empty()) { + throw std::runtime_error("incomplete wav metadata: " + path); + } + if ((int) sample_rate != expected_sample_rate) { + throw std::runtime_error("reference wav sample rate must be " + std::to_string(expected_sample_rate)); + } + if (audio_format != 1 && audio_format != 3) { + throw std::runtime_error("only PCM16/PCM32-float wav is supported: " + path); + } + + const size_t bytes_per_sample = bits_per_sample / 8u; + if (bytes_per_sample == 0 || data_chunk.size() % (bytes_per_sample * num_channels) != 0) { + throw std::runtime_error("invalid wav data chunk size: " + path); + } + + const size_t n_frames = data_chunk.size() / (bytes_per_sample * num_channels); + std::vector mono(n_frames, 0.0f); + + for (size_t i = 0; i < n_frames; ++i) { + float acc = 0.0f; + for (uint16_t ch = 0; ch < num_channels; ++ch) { + const uint8_t * src = data_chunk.data() + (i * (size_t) num_channels + ch) * bytes_per_sample; + float sample = 0.0f; + if (audio_format == 1 && bits_per_sample == 16) { + int16_t v = 0; + std::memcpy(&v, src, sizeof(v)); + sample = (float) v / 32768.0f; + } else if (audio_format == 3 && bits_per_sample == 32) { + std::memcpy(&sample, src, sizeof(sample)); + } else { + throw std::runtime_error("unsupported wav sample encoding: " + path); + } + acc += sample; + } + mono[i] = acc / (float) num_channels; + } + + return mono; +} + +static moss_prompt_input moss_build_prompt_input( + const llama_vocab * vocab, + const moss_delay_config & cfg, + const 
std::string & text, + const std::string & language, + const std::vector & reference_codes, + size_t reference_frames) { + const std::string audio_start_tok = common_token_to_piece(vocab, cfg.audio_start_token_id, true); + const std::string audio_end_tok = common_token_to_piece(vocab, cfg.audio_end_token_id, true); + const std::string user_slot_tok = common_token_to_piece(vocab, cfg.audio_user_slot_token_id, true); + const std::string im_start_tok = common_token_to_piece(vocab, cfg.im_start_token_id, true); + const std::string im_end_tok = common_token_to_piece(vocab, cfg.im_end_token_id, true); + + const auto replace_audio_placeholders = [&]( + const std::string & content, + const std::vector & lengths) -> std::string { + size_t pos = 0; + size_t length_idx = 0; + std::string out; + + while (true) { + const size_t ph = content.find(MOSS_AUDIO_PLACEHOLDER, pos); + if (ph == std::string::npos) { + out.append(content, pos, std::string::npos); + break; + } + + out.append(content, pos, ph - pos); + if (length_idx >= lengths.size()) { + throw std::runtime_error("audio placeholder count does not match reference length count"); + } + + const size_t length = lengths[length_idx++]; + out += audio_start_tok; + if (length > 0) { + for (size_t i = 0; i < length; ++i) { + out += user_slot_tok; + } + for (size_t i = 1; i < cfg.n_vq; ++i) { + out += user_slot_tok; + } + } + out += audio_end_tok; + pos = ph + std::strlen(MOSS_AUDIO_PLACEHOLDER); + } + + if (length_idx != lengths.size()) { + throw std::runtime_error("unused reference audio lengths while replacing placeholders"); + } + + return out; + }; + + const auto build_unified_codes = [&]( + const std::string & content, + const std::vector> & audio_codes_list, + const std::vector & audio_frames_list) -> std::vector { + const std::vector text_ids = common_tokenize(vocab, content, false, true); + if (audio_codes_list.empty()) { + std::vector packed(text_ids.size() * cfg.packed_stride(), cfg.audio_pad_code); + for (size_t i = 0; i 
< text_ids.size(); ++i) { + packed[i * cfg.packed_stride()] = text_ids[i]; + } + return packed; + } + + std::vector audio_start_indices; + std::vector audio_end_indices; + for (size_t i = 0; i < text_ids.size(); ++i) { + if (text_ids[i] == cfg.audio_start_token_id) { + audio_start_indices.push_back(i); + } + if (text_ids[i] == cfg.audio_end_token_id) { + audio_end_indices.push_back(i); + } + } + + if (audio_start_indices.size() != audio_codes_list.size() || audio_end_indices.size() != audio_codes_list.size()) { + throw std::runtime_error("audio marker count does not match reference audio count"); + } + + std::vector delay_audio; + size_t prefix_idx = 0; + for (size_t i = 0; i < audio_codes_list.size(); ++i) { + const size_t start_idx = audio_start_indices[i]; + const size_t end_idx = audio_end_indices[i]; + const std::vector delayed = moss_apply_delay_pattern(audio_codes_list[i], audio_frames_list[i], cfg); + + const size_t pad_before_rows = start_idx - prefix_idx + 1; + delay_audio.insert(delay_audio.end(), pad_before_rows * cfg.n_vq, cfg.audio_pad_code); + delay_audio.insert(delay_audio.end(), delayed.begin(), delayed.end()); + prefix_idx = end_idx; + } + + const size_t last_end = audio_end_indices.back(); + const size_t pad_after_rows = text_ids.size() - last_end; + delay_audio.insert(delay_audio.end(), pad_after_rows * cfg.n_vq, cfg.audio_pad_code); + + const size_t delay_rows = delay_audio.size() / cfg.n_vq; + const size_t text_rows = std::min(text_ids.size(), delay_rows); + std::vector packed(text_rows * cfg.packed_stride(), cfg.audio_pad_code); + for (size_t row = 0; row < text_rows; ++row) { + packed[row * cfg.packed_stride()] = text_ids[row]; + std::copy_n( + delay_audio.data() + row * cfg.n_vq, + cfg.n_vq, + packed.data() + row * cfg.packed_stride() + 1); + } + return packed; + }; + + const bool has_ref = reference_frames > 0; + const std::string ref_str = has_ref ? 
"[S1]:\n<|audio|>" : "None"; + const std::string user_content = + "\n" + "- Reference(s):\n" + ref_str + "\n" + "- Instruction:\nNone\n" + "- Tokens:\nNone\n" + "- Quality:\nNone\n" + "- Sound Event:\nNone\n" + "- Ambient Sound:\nNone\n" + "- Language:\n" + language + "\n" + "- Text:\n" + text + "\n" + ""; + + const std::vector ref_lengths = has_ref ? std::vector { reference_frames } : std::vector {}; + const std::string replaced = replace_audio_placeholders(user_content, ref_lengths); + const std::string full_text = im_start_tok + "user\n" + replaced + im_end_tok + "\n" + im_start_tok + "assistant\n"; + + std::vector> ref_list; + std::vector ref_frames_list; + if (has_ref) { + ref_list.push_back(reference_codes); + ref_frames_list.push_back(reference_frames); + } + + moss_prompt_input out; + out.packed_ids = build_unified_codes(full_text, ref_list, ref_frames_list); + out.prompt_frames = out.packed_ids.size() / cfg.packed_stride(); + out.reference_frames = reference_frames; + + out.packed_ids.push_back(cfg.audio_start_token_id); + out.packed_ids.insert(out.packed_ids.end(), cfg.n_vq, cfg.audio_pad_code); + out.prompt_frames += 1; + + return out; +} + static int moss_run_audio_decoder_helper( const std::string & python_bin, const std::string & helper_script, @@ -973,8 +1614,11 @@ static int moss_run_audio_decoder_helper( } static bool moss_decode_parity( + const std::string & model_path, const std::string & ref_path, const std::string & dump_codes_path, + const std::string & audio_decoder_model_path, + int32_t n_gpu_layers, const std::string & python_bin, const std::string & helper_script, const std::string & encoder_onnx, @@ -1026,7 +1670,36 @@ static bool moss_decode_parity( moss_write_codes_file(dump_codes_path, decoded.raw_codes, decoded.raw_frames, cfg); } - if (!helper_script.empty()) { + if (!wav_out.empty()) { + if (!helper_script.empty()) { + if (dump_codes_path.empty()) { + throw std::runtime_error("--audio-decoder-script requires --dump-raw-codes"); + } 
+ if (encoder_onnx.empty() || decoder_onnx.empty()) { + throw std::runtime_error("--audio-decoder-script requires both --audio-encoder-onnx and --audio-decoder-onnx"); + } + + const int rc = moss_run_audio_decoder_helper( + python_bin, helper_script, dump_codes_path, wav_out, + encoder_onnx, decoder_onnx, use_gpu_audio); + if (rc != 0) { + throw std::runtime_error("audio decoder helper failed with exit code " + std::to_string(rc)); + } + } else if (!audio_decoder_model_path.empty()) { + moss_decode_audio_llama( + audio_decoder_model_path, + decoded.raw_codes, + decoded.raw_frames, + cfg, + n_gpu_layers, + wav_out); + } else { + if (model_path.empty()) { + throw std::runtime_error("--wav-out requires either --audio-decoder-model, --audio-decoder-script, or -m with bundled codec"); + } + moss_decode_audio_native(model_path, decoded.raw_codes, decoded.raw_frames, cfg, wav_out); + } + } else if (!helper_script.empty()) { if (dump_codes_path.empty()) { throw std::runtime_error("--audio-decoder-script requires --dump-raw-codes"); } @@ -1090,6 +1763,7 @@ static void moss_generate_from_ref( const moss_sampling_config & sampling_cfg, uint32_t seed, const std::string & dump_raw_codes_path, + const std::string & audio_decoder_model_path, const std::string & python_bin, const std::string & helper_script, const std::string & encoder_onnx, @@ -1116,100 +1790,171 @@ static void moss_generate_from_ref( moss_read_exact(in, prompt_packed.data(), prompt_packed.size(), "prompt packed ids"); moss_read_exact(in, ignored_ref_raw_codes.data(), ignored_ref_raw_codes.size(), "reference raw codes"); - llama_backend_scope backend_scope; + moss_generate_from_prompt( + model_path, + prompt_packed, + hdr.prompt_frames, + hdr.raw_frames, + n_gpu_layers, + max_new_tokens, + sampling_cfg, + seed, + dump_raw_codes_path, + audio_decoder_model_path, + python_bin, + helper_script, + encoder_onnx, + decoder_onnx, + wav_out, + use_gpu_audio); +} - llama_model_params mparams = 
llama_model_default_params(); - mparams.use_mmap = true; - mparams.n_gpu_layers = n_gpu_layers; +static void moss_generate_from_prompt( + const std::string & model_path, + const std::vector & prompt_packed, + size_t prompt_frames, + size_t reference_frames, + int32_t n_gpu_layers, + int32_t max_new_tokens, + const moss_sampling_config & sampling_cfg, + uint32_t seed, + const std::string & dump_raw_codes_path, + const std::string & audio_decoder_model_path, + const std::string & python_bin, + const std::string & helper_script, + const std::string & encoder_onnx, + const std::string & decoder_onnx, + const std::string & wav_out, + bool use_gpu_audio) { + moss_delay_config cfg; + cfg.n_vq = MOSS_DELAY_DEFAULT_N_VQ; + cfg.audio_pad_code = MOSS_DELAY_DEFAULT_AUDIO_PAD_CODE; - llama_model_ptr model(llama_model_load_from_file(model_path.c_str(), mparams)); - if (!model) { - throw std::runtime_error("failed to load model: " + model_path); - } + llama_backend_scope backend_scope; + moss_generation_audio decoded; + size_t generated_frames = 0; - const llama_vocab * vocab = llama_model_get_vocab(model.get()); - const int32_t text_vocab = llama_vocab_n_tokens(vocab); - const moss_delay_config model_cfg = moss_delay_config_from_model(model.get()); + { + llama_model_params mparams = llama_model_default_params(); + mparams.use_mmap = true; + mparams.n_gpu_layers = n_gpu_layers; - if (model_cfg.n_vq != cfg.n_vq) { - throw std::runtime_error("generation reference n_vq does not match model metadata"); - } - cfg.audio_vocab_size = model_cfg.audio_vocab_size; + llama_model_ptr model(llama_model_load_from_file(model_path.c_str(), mparams)); + if (!model) { + throw std::runtime_error("failed to load model: " + model_path); + } - llama_context_params cparams = llama_context_default_params(); - cparams.n_ctx = std::max((uint32_t) hdr.prompt_frames + (uint32_t) max_new_tokens + 8u, 64u); - cparams.n_batch = std::max((uint32_t) hdr.prompt_frames, 1u); - cparams.n_ubatch = cparams.n_batch; - 
cparams.n_seq_max = 1; - cparams.embeddings = false; + const llama_vocab * vocab = llama_model_get_vocab(model.get()); + const int32_t text_vocab = llama_vocab_n_tokens(vocab); + const moss_delay_config model_cfg = moss_delay_config_from_model(model.get()); - llama_context_ptr ctx(llama_init_from_model(model.get(), cparams)); - if (!ctx) { - throw std::runtime_error("failed to create context"); - } + cfg = model_cfg; + if (prompt_packed.size() % cfg.packed_stride() != 0) { + throw std::runtime_error("prompt packed input does not match model n_vq"); + } - llama_set_warmup(ctx.get(), false); - llama_set_causal_attn(ctx.get(), true); - llama_set_embeddings(ctx.get(), false); + llama_context_params cparams = llama_context_default_params(); + cparams.n_ctx = std::max((uint32_t) prompt_frames + (uint32_t) max_new_tokens + 8u, 64u); + cparams.n_batch = std::max((uint32_t) prompt_frames, 1u); + cparams.n_ubatch = cparams.n_batch; + cparams.n_seq_max = 1; + cparams.embeddings = false; - { - moss_owned_batch batch = moss_batch_from_packed_rows( - prompt_packed, 0, hdr.prompt_frames, cfg, 0, true); - const int ret = llama_decode(ctx.get(), batch.batch); - if (ret != 0) { - throw std::runtime_error("prefill llama_decode failed: " + std::to_string(ret)); + llama_context_ptr ctx(llama_init_from_model(model.get(), cparams)); + if (!ctx) { + throw std::runtime_error("failed to create context"); } - } - moss_delay_state state = moss_init_delay_state(prompt_packed, cfg); + llama_set_warmup(ctx.get(), false); + llama_set_causal_attn(ctx.get(), true); + llama_set_embeddings(ctx.get(), false); - std::vector generated_packed; - generated_packed.reserve((size_t) max_new_tokens * cfg.packed_stride()); + { + moss_owned_batch batch = moss_batch_from_packed_rows( + prompt_packed, 0, prompt_frames, cfg, 0, true); + const int ret = llama_decode(ctx.get(), batch.batch); + if (ret != 0) { + throw std::runtime_error("prefill llama_decode failed: " + std::to_string(ret)); + } + } - const size_t 
audio_vocab = moss_audio_vocab_with_pad(cfg); - moss_rng rng(seed); + moss_delay_state state = moss_init_delay_state(prompt_packed, cfg); - for (int32_t step = 0; step < max_new_tokens; ++step) { - const float * logits = llama_get_logits_ith(ctx.get(), -1); - if (logits == nullptr) { - throw std::runtime_error("llama_get_logits_ith returned null"); - } + std::vector generated_packed; + generated_packed.reserve((size_t) max_new_tokens * cfg.packed_stride()); - std::vector text_logits(logits, logits + text_vocab); - std::vector audio_logits( - logits + text_vocab, - logits + text_vocab + cfg.n_vq * audio_vocab); + const size_t audio_vocab = moss_audio_vocab_with_pad(cfg); + moss_rng rng(seed); - const std::vector next = moss_delay_step( - state, text_logits, audio_logits, sampling_cfg, cfg, rng); - generated_packed.insert(generated_packed.end(), next.begin(), next.end()); + for (int32_t step = 0; step < max_new_tokens; ++step) { + const float * logits = llama_get_logits_ith(ctx.get(), -1); + if (logits == nullptr) { + throw std::runtime_error("llama_get_logits_ith returned null"); + } - moss_owned_batch batch = moss_batch_from_packed_rows( - generated_packed, generated_packed.size() / cfg.packed_stride() - 1, 1, cfg, - hdr.prompt_frames + (size_t) step, true); - const int ret = llama_decode(ctx.get(), batch.batch); - if (ret != 0) { - throw std::runtime_error("generation llama_decode failed: " + std::to_string(ret)); - } + std::vector text_logits(logits, logits + text_vocab); + std::vector audio_logits( + logits + text_vocab, + logits + text_vocab + cfg.n_vq * audio_vocab); + + const std::vector next = moss_delay_step( + state, text_logits, audio_logits, sampling_cfg, cfg, rng); + generated_packed.insert(generated_packed.end(), next.begin(), next.end()); + + moss_owned_batch batch = moss_batch_from_packed_rows( + generated_packed, generated_packed.size() / cfg.packed_stride() - 1, 1, cfg, + prompt_frames + (size_t) step, true); + const int ret = 
llama_decode(ctx.get(), batch.batch); + if (ret != 0) { + throw std::runtime_error("generation llama_decode failed: " + std::to_string(ret)); + } - if (state.is_stopping) { - break; + if (state.is_stopping) { + break; + } } - } - const moss_generation_audio decoded = moss_decode_generation_audio(state, hdr.prompt_frames, cfg); + generated_frames = generated_packed.size() / cfg.packed_stride(); + decoded = moss_decode_generation_audio(state, prompt_frames, cfg); + } LOG("moss-tts first-class generation: prompt_frames=%u generated_frames=%zu raw_frames=%zu input_ref_raw_frames=%u\n", - hdr.prompt_frames, - generated_packed.size() / cfg.packed_stride(), + (uint32_t) prompt_frames, + generated_frames, decoded.raw_frames, - hdr.raw_frames); + (uint32_t) reference_frames); if (!dump_raw_codes_path.empty()) { moss_write_codes_file(dump_raw_codes_path, decoded.raw_codes, decoded.raw_frames, cfg); } - if (!helper_script.empty()) { + if (!wav_out.empty()) { + if (!helper_script.empty()) { + if (dump_raw_codes_path.empty()) { + throw std::runtime_error("--audio-decoder-script requires --dump-raw-codes"); + } + if (encoder_onnx.empty() || decoder_onnx.empty()) { + throw std::runtime_error("--audio-decoder-script requires both ONNX paths"); + } + + const int rc = moss_run_audio_decoder_helper( + python_bin, helper_script, dump_raw_codes_path, wav_out, + encoder_onnx, decoder_onnx, use_gpu_audio); + if (rc != 0) { + throw std::runtime_error("audio decoder helper failed with exit code " + std::to_string(rc)); + } + } else if (!audio_decoder_model_path.empty()) { + moss_decode_audio_llama( + audio_decoder_model_path, + decoded.raw_codes, + decoded.raw_frames, + cfg, + n_gpu_layers, + wav_out); + } else { + moss_decode_audio_native(model_path, decoded.raw_codes, decoded.raw_frames, cfg, wav_out); + } + } else if (!helper_script.empty()) { if (dump_raw_codes_path.empty()) { throw std::runtime_error("--audio-decoder-script requires --dump-raw-codes"); } @@ -1229,6 +1974,83 @@ static 
void moss_generate_from_ref( } } +static void moss_generate_from_text( + const std::string & model_path, + const std::string & text, + const std::string & language, + const std::string & reference_audio_path, + int32_t n_gpu_layers, + int32_t max_new_tokens, + const moss_sampling_config & sampling_cfg, + uint32_t seed, + const std::string & dump_raw_codes_path, + const std::string & audio_encoder_model_path, + const std::string & audio_decoder_model_path, + const std::string & python_bin, + const std::string & helper_script, + const std::string & encoder_onnx, + const std::string & decoder_onnx, + const std::string & wav_out, + bool use_gpu_audio) { + std::vector reference_codes; + size_t reference_frames = 0; + moss_prompt_input prompt; + + { + llama_backend_scope backend_scope; + + llama_model_params mparams = llama_model_default_params(); + mparams.use_mmap = true; + mparams.vocab_only = true; + + llama_model_ptr model(llama_model_load_from_file(model_path.c_str(), mparams)); + if (!model) { + throw std::runtime_error("failed to load vocab-only model: " + model_path); + } + + const llama_vocab * vocab = llama_model_get_vocab(model.get()); + const moss_delay_config cfg = moss_delay_config_from_model(model.get()); + + if (!reference_audio_path.empty()) { + if (!audio_encoder_model_path.empty()) { + reference_codes = moss_encode_audio_llama( + audio_encoder_model_path, + reference_audio_path, + n_gpu_layers, + cfg.n_vq, + &reference_frames); + } else { + moss_audio_tokenizer_options codec_opts; + codec_opts.n_threads = cpu_get_num_math(); + moss_audio_tokenizer codec(model_path, codec_opts); + const std::vector wav = moss_read_wav_f32_mono(reference_audio_path, codec.sample_rate()); + reference_codes = codec.encode(wav, &reference_frames, cfg.n_vq); + } + } + + prompt = moss_build_prompt_input( + vocab, cfg, text, language, reference_codes, reference_frames); + } + + moss_generate_from_prompt( + model_path, + prompt.packed_ids, + prompt.prompt_frames, + 
prompt.reference_frames, + n_gpu_layers, + max_new_tokens, + sampling_cfg, + seed, + dump_raw_codes_path, + audio_decoder_model_path, + python_bin, + helper_script, + encoder_onnx, + decoder_onnx, + wav_out, + use_gpu_audio); +} + static std::vector moss_audio_history_slice( const moss_delay_state & state, size_t start_frame, @@ -1530,7 +2352,13 @@ int main(int argc, char ** argv) { std::string model_path; std::string decode_parity_ref_path; std::string generation_input_path; + std::string text; + std::string text_file_path; + std::string reference_audio_path; + std::string language = "zh"; std::string dump_raw_codes_path; + std::string audio_encoder_model_path; + std::string audio_decoder_model_path; std::string audio_decoder_script; std::string audio_encoder_onnx; std::string audio_decoder_onnx; @@ -1554,6 +2382,22 @@ int main(int argc, char ** argv) { generation_input_path = argv[++i]; continue; } + if (arg == "--text" && i + 1 < argc) { + text = argv[++i]; + continue; + } + if (arg == "--text-file" && i + 1 < argc) { + text_file_path = argv[++i]; + continue; + } + if (arg == "--reference-audio" && i + 1 < argc) { + reference_audio_path = argv[++i]; + continue; + } + if (arg == "--language" && i + 1 < argc) { + language = argv[++i]; + continue; + } if (arg == "--generation-ref" && i + 1 < argc) { generation_input_path = argv[++i]; LOG("warning: --generation-ref is deprecated; use --generation-input instead.\n"); @@ -1579,6 +2423,14 @@ int main(int argc, char ** argv) { dump_raw_codes_path = argv[++i]; continue; } + if (arg == "--audio-encoder-model" && i + 1 < argc) { + audio_encoder_model_path = argv[++i]; + continue; + } + if (arg == "--audio-decoder-model" && i + 1 < argc) { + audio_decoder_model_path = argv[++i]; + continue; + } if (arg == "--audio-decoder-script" && i + 1 < argc) { audio_decoder_script = argv[++i]; continue; @@ -1657,11 +2509,17 @@ int main(int argc, char ** argv) { LOG("moss delay state self-test: ok\n"); } + llama_backend_scope 
backend_scope; + if (!generation_input_path.empty()) { if (model_path.empty()) { LOG_ERR("--generation-input requires -m \n"); return EXIT_FAILURE; } + if (!text.empty() || !text_file_path.empty()) { + LOG_ERR("--generation-input cannot be combined with --text/--text-file\n"); + return EXIT_FAILURE; + } try { moss_generate_from_ref( model_path, @@ -1671,6 +2529,7 @@ int main(int argc, char ** argv) { sampling_cfg, seed, dump_raw_codes_path, + audio_decoder_model_path, python_bin, audio_decoder_script, audio_encoder_onnx, @@ -1684,11 +2543,55 @@ int main(int argc, char ** argv) { } } + if (!text.empty() || !text_file_path.empty()) { + if (model_path.empty()) { + LOG_ERR("--text/--text-file requires -m \n"); + return EXIT_FAILURE; + } + try { + std::string input_text = text; + if (!text_file_path.empty()) { + std::ifstream in(text_file_path); + if (!in) { + throw std::runtime_error("failed to open text file: " + text_file_path); + } + std::ostringstream ss; + ss << in.rdbuf(); + input_text = ss.str(); + } + moss_generate_from_text( + model_path, + input_text, + language, + reference_audio_path, + n_gpu_layers, + max_new_tokens, + sampling_cfg, + seed, + dump_raw_codes_path, + audio_encoder_model_path, + audio_decoder_model_path, + python_bin, + audio_decoder_script, + audio_encoder_onnx, + audio_decoder_onnx, + wav_out_path, + use_gpu_audio); + return EXIT_SUCCESS; + } catch (const std::exception & err) { + LOG_ERR("text generation failed: %s\n", err.what()); + return EXIT_FAILURE; + } + } + if (!decode_parity_ref_path.empty()) { try { const bool ok = moss_decode_parity( + model_path, decode_parity_ref_path, dump_raw_codes_path, + audio_decoder_model_path, + n_gpu_layers, python_bin, audio_decoder_script, audio_encoder_onnx, @@ -1706,9 +2609,11 @@ int main(int argc, char ** argv) { if (self_test) { return EXIT_SUCCESS; } - LOG("moss delay state, multi-head sampler, and raw-code decode are in place; audio decode is available via the external Python/ONNX helper.\n"); + 
LOG("moss delay state, multi-head sampler, raw-code decode, and native audio encode/decode helpers are available.\n"); LOG("use --print-delay-config with -m to inspect model metadata.\n"); LOG("use --decode-parity-ref to verify C++ de-delay/raw-code extraction against Python.\n"); + LOG("use --text -m --audio-decoder-model --wav-out out.wav for native generation.\n"); + LOG("use --text --reference-audio ref.wav -m --audio-encoder-model --audio-decoder-model --wav-out out.wav for native voice cloning.\n"); LOG("use --generation-input -m for first-class generation.\n"); return EXIT_SUCCESS; } @@ -1718,8 +2623,6 @@ int main(int argc, char ** argv) { return EXIT_FAILURE; } - llama_backend_scope backend_scope; - llama_model_params mparams = llama_model_default_params(); mparams.use_mmap = true; mparams.n_gpu_layers = n_gpu_layers; From b59fdb941a846bb3e9904682ab8cc974cca8b0b8 Mon Sep 17 00:00:00 2001 From: CHiSwsz Date: Tue, 7 Apr 2026 21:18:00 +0800 Subject: [PATCH 2/3] Fix MOSS TTS processor typing --- tools/tts/moss_tts_processor.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/tts/moss_tts_processor.py b/tools/tts/moss_tts_processor.py index 6f1b456e5..d8f264680 100644 --- a/tools/tts/moss_tts_processor.py +++ b/tools/tts/moss_tts_processor.py @@ -78,7 +78,8 @@ def build_generation_prompt( im_start_tok = _get_special_token_str(tokenizer, IM_START_TOKEN_ID) im_end_tok = _get_special_token_str(tokenizer, IM_END_TOKEN_ID) - has_ref = reference_codes is not None and reference_codes.shape[0] > 0 + ref_frame_count = int(reference_codes.shape[0]) if reference_codes is not None else 0 + has_ref = ref_frame_count > 0 ref_str = f"[S1]:\n{AUDIO_PLACEHOLDER}" if has_ref else "None" user_content = ( @@ -94,7 +95,7 @@ def build_generation_prompt( f"" ) - ref_lengths = [reference_codes.shape[0]] if has_ref else [] + ref_lengths = [ref_frame_count] if has_ref else [] user_content = _replace_audio_placeholders( user_content, ref_lengths, @@ 
-106,7 +107,10 @@ def build_generation_prompt( ) full_text = f"{im_start_tok}user\n{user_content}{im_end_tok}\n{im_start_tok}assistant\n" - ref_audio_list = [reference_codes] if has_ref else [] + ref_audio_list: list[np.ndarray] = [] + if has_ref: + assert reference_codes is not None + ref_audio_list.append(reference_codes) unified_codes = _get_unified_codes(tokenizer, full_text, ref_audio_list) assistant_gen = f"{audio_start_tok}" From b785003ba497794ecfa337c3e47f01af79489888 Mon Sep 17 00:00:00 2001 From: CHiSwsz Date: Wed, 8 Apr 2026 13:06:34 +0800 Subject: [PATCH 3/3] Remove bundled MOSS audio tokenizer fallback --- include/llama-moss-audio-tokenizer.h | 61 -- src/CMakeLists.txt | 2 - src/models/moss-audio-tokenizer.cpp | 1205 -------------------------- tools/tts/run-moss-tts-delay.cpp | 84 +- 4 files changed, 41 insertions(+), 1311 deletions(-) delete mode 100644 include/llama-moss-audio-tokenizer.h delete mode 100644 src/models/moss-audio-tokenizer.cpp diff --git a/include/llama-moss-audio-tokenizer.h b/include/llama-moss-audio-tokenizer.h deleted file mode 100644 index 5e0326787..000000000 --- a/include/llama-moss-audio-tokenizer.h +++ /dev/null @@ -1,61 +0,0 @@ -#pragma once - -#ifndef __cplusplus -#error "This header is for C++ only" -#endif - -#include "llama.h" - -#include -#include -#include -#include -#include - -struct moss_audio_tokenizer_options { - int n_threads = -1; -}; - -class LLAMA_API moss_audio_tokenizer { -public: - explicit moss_audio_tokenizer( - const std::string & model_path, - const moss_audio_tokenizer_options & options = {}); - ~moss_audio_tokenizer(); - - moss_audio_tokenizer(const moss_audio_tokenizer &) = delete; - moss_audio_tokenizer & operator=(const moss_audio_tokenizer &) = delete; - - moss_audio_tokenizer(moss_audio_tokenizer &&) noexcept; - moss_audio_tokenizer & operator=(moss_audio_tokenizer &&) noexcept; - - int sample_rate() const; - uint32_t downsample_rate() const; - uint32_t num_quantizers() const; - - std::vector 
decode( - const std::vector & codes, - size_t n_frames, - uint32_t n_quantizers = 0) const; - - std::vector encode( - const std::vector & audio, - size_t * out_frames = nullptr, - uint32_t n_quantizers = 0) const; - -private: - struct impl; - std::unique_ptr impl_; -}; - -LLAMA_API int moss_audio_model_sample_rate(const struct llama_model * model); - -LLAMA_API uint32_t moss_audio_model_downsample_rate(const struct llama_model * model); - -LLAMA_API uint32_t moss_audio_model_num_quantizers(const struct llama_model * model); - -LLAMA_API std::vector moss_audio_model_quantizer_encode( - const struct llama_model * model, - const std::vector & input, - size_t n_frames, - uint32_t n_quantizers = 0); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b93054d70..8a0f29646 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -7,7 +7,6 @@ llama_add_compile_flags() # llama add_library(llama - ../include/llama-moss-audio-tokenizer.h ../include/llama.h llama.cpp llama-adapter.cpp @@ -104,7 +103,6 @@ add_library(llama models/moss-audio-common.cpp models/moss-audio-decoder.cpp models/moss-audio-encoder.cpp - models/moss-audio-tokenizer.cpp models/moss-tts-delay.cpp models/modern-bert.cpp models/mpt.cpp diff --git a/src/models/moss-audio-tokenizer.cpp b/src/models/moss-audio-tokenizer.cpp deleted file mode 100644 index a0f19415b..000000000 --- a/src/models/moss-audio-tokenizer.cpp +++ /dev/null @@ -1,1205 +0,0 @@ -#include "llama-moss-audio-tokenizer.h" - -#include "ggml.h" -#include "ggml-cpp.h" -#include "ggml-backend.h" -#include "ggml-cpu.h" -#include "gguf.h" -#include "llama-impl.h" -#include "llama-model.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace { - -constexpr char MOSS_CODEC_ARCH[] = "moss-audio-tokenizer"; -constexpr float MOSS_LAYER_NORM_EPS = 1e-5f; -constexpr size_t MOSS_CODEC_MAX_NODES_BASE = 256; -constexpr size_t MOSS_CODEC_MAX_NODES_PER_LAYER = 32; - -enum class 
moss_codec_module_type { - PATCHED_PRETRANSFORM, - TRANSFORMER, -}; - -struct moss_codec_transformer_layer { - ggml_tensor * attn_in = nullptr; - ggml_tensor * attn_out = nullptr; - ggml_tensor * linear1 = nullptr; - ggml_tensor * linear2 = nullptr; - ggml_tensor * norm1_w = nullptr; - ggml_tensor * norm1_b = nullptr; - ggml_tensor * norm2_w = nullptr; - ggml_tensor * norm2_b = nullptr; - ggml_tensor * scale1 = nullptr; - ggml_tensor * scale2 = nullptr; -}; - -struct moss_codec_transformer_block { - int input_dimension = 0; - int output_dimension = 0; - int d_model = 0; - int num_heads = 0; - int num_layers = 0; - int dim_feedforward = 0; - int context = 0; - float max_period = 10000.0f; - - ggml_tensor * input_proj = nullptr; - ggml_tensor * output_proj = nullptr; - - std::vector layers; -}; - -struct moss_codec_module { - moss_codec_module_type type = moss_codec_module_type::PATCHED_PRETRANSFORM; - int patch_size = 1; - moss_codec_transformer_block transformer; -}; - -struct moss_codec_quantizer_entry { - ggml_tensor * in_proj_w = nullptr; - ggml_tensor * in_proj_b = nullptr; - ggml_tensor * codebook = nullptr; - ggml_tensor * out_proj_w = nullptr; - ggml_tensor * out_proj_b = nullptr; -}; - -struct moss_codec_quantizer { - int input_dim = 0; - int rvq_dim = 0; - int output_dim = 0; - int num_quantizers = 0; - int codebook_size = 0; - int codebook_dim = 0; - - ggml_tensor * input_proj_w = nullptr; - ggml_tensor * input_proj_b = nullptr; - ggml_tensor * output_proj_w = nullptr; - ggml_tensor * output_proj_b = nullptr; - - std::vector quantizers; -}; - -static std::string moss_codec_module_type_to_string(const moss_codec_module_type type) { - switch (type) { - case moss_codec_module_type::PATCHED_PRETRANSFORM: - return "PatchedPretransform"; - case moss_codec_module_type::TRANSFORMER: - return "Transformer"; - } - return "Unknown"; -} - -static moss_codec_module_type moss_codec_module_type_from_string(const std::string & value) { - if (value == 
"PatchedPretransform") { - return moss_codec_module_type::PATCHED_PRETRANSFORM; - } - if (value == "Transformer") { - return moss_codec_module_type::TRANSFORMER; - } - throw std::runtime_error("unsupported codec module type: " + value); -} - -static void moss_codec_set_n_threads(ggml_backend_t backend, int n_threads) { - if (backend == nullptr || n_threads <= 0) { - return; - } - - ggml_backend_dev_t dev = ggml_backend_get_device(backend); - ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; - if (!reg) { - return; - } - - auto fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); - if (fn != nullptr) { - fn(backend, n_threads); - } -} - -static std::vector moss_codec_make_positions(const size_t n_tokens) { - std::vector positions(n_tokens); - for (size_t i = 0; i < n_tokens; ++i) { - positions[i] = (int32_t) i; - } - return positions; -} - -static std::vector moss_codec_make_causal_mask(const size_t n_tokens, const int context) { - std::vector mask(n_tokens * n_tokens, -std::numeric_limits::infinity()); - - for (size_t iq = 0; iq < n_tokens; ++iq) { - for (size_t ik = 0; ik < n_tokens; ++ik) { - if (ik > iq) { - continue; - } - if (context > 0 && (int) (iq - ik) >= context) { - continue; - } - mask[iq * n_tokens + ik] = 0.0f; - } - } - - return mask; -} - -static ggml_tensor * moss_codec_build_layer_norm( - ggml_context * ctx0, - ggml_tensor * cur, - ggml_tensor * weight, - ggml_tensor * bias) { - cur = ggml_norm(ctx0, cur, MOSS_LAYER_NORM_EPS); - cur = ggml_mul(ctx0, cur, weight); - cur = ggml_add(ctx0, cur, bias); - return cur; -} - -static ggml_tensor * moss_codec_build_attention( - ggml_context * ctx0, - ggml_tensor * wo, - ggml_tensor * q_cur, - ggml_tensor * k_cur, - ggml_tensor * v_cur, - ggml_tensor * kq_mask, - float kq_scale) { - ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); - ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); - ggml_tensor * v = 
ggml_permute(ctx0, v_cur, 1, 2, 0, 3); - v = ggml_cont(ctx0, v); - - ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f); - - ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - ggml_tensor * cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cur = ggml_cont_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2] * cur->ne[3]); - - if (wo != nullptr) { - cur = ggml_mul_mat(ctx0, wo, cur); - } - - return cur; -} - -static std::vector moss_codec_patch_decode( - const std::vector & input, - const int channels, - const size_t n_frames, - const int patch_size) { - if (patch_size <= 0) { - throw std::runtime_error("invalid patch size"); - } - if (channels % patch_size != 0) { - throw std::runtime_error("patch decode channels not divisible by patch size"); - } - if (input.size() != (size_t) channels * n_frames) { - throw std::runtime_error("patch decode input size mismatch"); - } - - const int out_channels = channels / patch_size; - const size_t out_frames = n_frames * (size_t) patch_size; - std::vector output((size_t) out_channels * out_frames); - - for (size_t t = 0; t < n_frames; ++t) { - for (int d = 0; d < out_channels; ++d) { - for (int i = 0; i < patch_size; ++i) { - const float value = input[(size_t) (d * patch_size + i) + t * (size_t) channels]; - output[(size_t) d + (t * (size_t) patch_size + (size_t) i) * (size_t) out_channels] = value; - } - } - } - - return output; -} - -static std::vector moss_codec_patch_encode( - const std::vector & input, - const int channels, - const size_t n_frames, - const int patch_size) { - if (patch_size <= 0) { - throw std::runtime_error("invalid patch size"); - } - if (n_frames % (size_t) patch_size != 0) { - throw std::runtime_error("patch encode frame count not divisible by patch size"); - } - if (input.size() != (size_t) channels * n_frames) { - throw std::runtime_error("patch encode input size mismatch"); - } - - const int out_channels = channels * patch_size; - const size_t out_frames = 
n_frames / (size_t) patch_size; - std::vector output((size_t) out_channels * out_frames); - - for (size_t t = 0; t < out_frames; ++t) { - for (int d = 0; d < channels; ++d) { - for (int i = 0; i < patch_size; ++i) { - const float value = input[(size_t) d + (t * (size_t) patch_size + (size_t) i) * (size_t) channels]; - output[(size_t) (d * patch_size + i) + t * (size_t) out_channels] = value; - } - } - } - - return output; -} - -static std::vector moss_codec_copy_f32_output(ggml_tensor * tensor) { - std::vector output((size_t) ggml_nelements(tensor)); - ggml_backend_tensor_get(tensor, output.data(), 0, ggml_nbytes(tensor)); - return output; -} - -struct moss_codec_linear_f32 { - int in_features = 0; - int out_features = 0; - std::vector weight; - std::vector bias; - - bool empty() const { - return weight.empty(); - } -}; - -struct moss_codec_quantizer_entry_f32 { - moss_codec_linear_f32 in_proj; - moss_codec_linear_f32 out_proj; - int codebook_size = 0; - int codebook_dim = 0; - std::vector codebook; - std::vector codebook_unit; -}; - -static std::vector moss_codec_tensor_to_f32(const ggml_tensor * tensor) { - if (tensor == nullptr) { - return {}; - } - - const size_t n_elements = (size_t) ggml_nelements(tensor); - - switch (tensor->type) { - case GGML_TYPE_F32: { - std::vector values(n_elements); - ggml_backend_tensor_get(const_cast(tensor), values.data(), 0, ggml_nbytes(tensor)); - return values; - } - case GGML_TYPE_F16: { - std::vector values_f16(n_elements); - std::vector values(n_elements); - ggml_backend_tensor_get(const_cast(tensor), values_f16.data(), 0, ggml_nbytes(tensor)); - for (size_t i = 0; i < n_elements; ++i) { - values[i] = ggml_fp16_to_fp32(values_f16[i]); - } - return values; - } - default: - throw std::runtime_error("unsupported tensor dtype for float conversion: " + std::string(ggml_type_name(tensor->type))); - } -} - -static moss_codec_linear_f32 moss_codec_linear_from_tensors(ggml_tensor * weight, ggml_tensor * bias) { - moss_codec_linear_f32 
result; - if (weight == nullptr) { - return result; - } - - switch (ggml_n_dims(weight)) { - case 2: - result.in_features = (int) weight->ne[0]; - result.out_features = (int) weight->ne[1]; - break; - case 3: - if (weight->ne[0] != 1) { - throw std::runtime_error("expected singleton leading dim for 3D linear weight tensor"); - } - result.in_features = (int) weight->ne[1]; - result.out_features = (int) weight->ne[2]; - break; - case 4: - if (weight->ne[0] != 1 || weight->ne[1] != 1) { - throw std::runtime_error("expected singleton leading dims for 4D linear weight tensor"); - } - result.in_features = (int) weight->ne[2]; - result.out_features = (int) weight->ne[3]; - break; - default: - throw std::runtime_error("expected 2D/3D/4D linear weight tensor"); - } - result.weight = moss_codec_tensor_to_f32(weight); - result.bias = moss_codec_tensor_to_f32(bias); - return result; -} - -static std::vector moss_codec_linear_apply( - const moss_codec_linear_f32 & linear, - const std::vector & input, - const size_t n_frames) { - if (linear.empty()) { - return input; - } - if (input.size() != (size_t) linear.in_features * n_frames) { - throw std::runtime_error("linear input size mismatch"); - } - - std::vector output((size_t) linear.out_features * n_frames, 0.0f); - for (size_t t = 0; t < n_frames; ++t) { - const float * x = input.data() + t * (size_t) linear.in_features; - float * y = output.data() + t * (size_t) linear.out_features; - - if (!linear.bias.empty()) { - std::copy(linear.bias.begin(), linear.bias.end(), y); - } - - for (int o = 0; o < linear.out_features; ++o) { - const float * w = linear.weight.data() + (size_t) o * (size_t) linear.in_features; - float acc = y[o]; - for (int i = 0; i < linear.in_features; ++i) { - acc += w[i] * x[i]; - } - y[o] = acc; - } - } - - return output; -} - -static std::vector moss_codec_normalize_rows( - const std::vector & input, - const int row_width) { - if (row_width <= 0 || input.size() % (size_t) row_width != 0) { - throw 
std::runtime_error("invalid row width for normalization"); - } - - std::vector output = input; - const size_t n_rows = input.size() / (size_t) row_width; - for (size_t r = 0; r < n_rows; ++r) { - float norm2 = 0.0f; - for (int c = 0; c < row_width; ++c) { - const float v = output[r * (size_t) row_width + (size_t) c]; - norm2 += v * v; - } - const float inv = 1.0f / std::sqrt(std::max(norm2, std::numeric_limits::epsilon())); - for (int c = 0; c < row_width; ++c) { - output[r * (size_t) row_width + (size_t) c] *= inv; - } - } - return output; -} - -struct moss_codec_gguf_loader { - ggml_context_ptr ctx_meta; - gguf_context_ptr ctx_gguf; - ggml_context_ptr ctx_data; - ggml_backend_ptr backend; - ggml_backend_buffer_ptr buffer; - - std::string fname; - std::map tensor_offset; - std::map loaded_tensors; - std::vector tensors_to_load; - - explicit moss_codec_gguf_loader(const std::string & model_path) - : fname(model_path), - backend(ggml_backend_cpu_init()) { - if (!backend) { - throw std::runtime_error("failed to initialize CPU backend for codec"); - } - - ggml_context * meta = nullptr; - gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &meta, - }; - - ctx_gguf.reset(gguf_init_from_file(fname.c_str(), params)); - if (!ctx_gguf) { - throw std::runtime_error("failed to load codec GGUF metadata from: " + fname); - } - - ctx_meta.reset(meta); - - for (int64_t i = 0; i < gguf_get_n_tensors(ctx_gguf.get()); ++i) { - const char * name = gguf_get_tensor_name(ctx_gguf.get(), i); - tensor_offset[name] = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), i); - } - - ggml_init_params data_params = { - /*.mem_size =*/ static_cast(gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(), - /*.mem_buffer =*/ nullptr, - /*.no_alloc =*/ true, - }; - ctx_data.reset(ggml_init(data_params)); - if (!ctx_data) { - throw std::runtime_error("failed to initialize codec tensor context"); - } - } - - int find_key(const std::string & key, const 
bool required = true) const { - const int idx = gguf_find_key(ctx_gguf.get(), key.c_str()); - if (idx < 0 && required) { - throw std::runtime_error("GGUF key not found: " + key); - } - return idx; - } - - bool has_key(const std::string & key) const { - return gguf_find_key(ctx_gguf.get(), key.c_str()) >= 0; - } - - uint32_t get_u32(const std::string & key, const bool required = true, const uint32_t fallback = 0) const { - const int idx = find_key(key, required); - if (idx < 0) { - return fallback; - } - return gguf_get_val_u32(ctx_gguf.get(), idx); - } - - float get_f32(const std::string & key, const bool required = true, const float fallback = 0.0f) const { - const int idx = find_key(key, required); - if (idx < 0) { - return fallback; - } - return gguf_get_val_f32(ctx_gguf.get(), idx); - } - - std::string get_string(const std::string & key, const bool required = true, const std::string & fallback = {}) const { - const int idx = find_key(key, required); - if (idx < 0) { - return fallback; - } - return std::string(gguf_get_val_str(ctx_gguf.get(), idx)); - } - - ggml_tensor * get_tensor(const std::string & name, const bool required = true) { - const auto it = loaded_tensors.find(name); - if (it != loaded_tensors.end()) { - return it->second; - } - - ggml_tensor * meta_tensor = ggml_get_tensor(ctx_meta.get(), name.c_str()); - if (!meta_tensor) { - if (required) { - throw std::runtime_error("codec tensor not found: " + name); - } - return nullptr; - } - - ggml_tensor * data_tensor = ggml_dup_tensor(ctx_data.get(), meta_tensor); - ggml_set_name(data_tensor, meta_tensor->name); - loaded_tensors.emplace(name, data_tensor); - tensors_to_load.push_back(data_tensor); - return data_tensor; - } - - void load_tensor_bytes() { - if (!buffer) { - buffer.reset(ggml_backend_alloc_ctx_tensors(ctx_data.get(), backend.get())); - if (!buffer) { - throw std::runtime_error("failed to allocate codec weight buffer"); - } - ggml_backend_buffer_set_usage(buffer.get(), 
GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - } - - std::ifstream fin(fname, std::ios::binary); - if (!fin) { - throw std::runtime_error("failed to open codec GGUF for tensor loading: " + fname); - } - - std::vector read_buf; - for (ggml_tensor * tensor : tensors_to_load) { - const auto it = tensor_offset.find(tensor->name); - if (it == tensor_offset.end()) { - throw std::runtime_error("missing GGUF tensor offset for: " + std::string(tensor->name)); - } - - const size_t offset = it->second; - const size_t num_bytes = ggml_nbytes(tensor); - - fin.seekg(offset, std::ios::beg); - if (!fin) { - throw std::runtime_error("failed to seek codec tensor: " + std::string(tensor->name)); - } - - if (ggml_backend_buffer_is_host(buffer.get())) { - fin.read(reinterpret_cast(tensor->data), (std::streamsize) num_bytes); - } else { - read_buf.resize(num_bytes); - fin.read(reinterpret_cast(read_buf.data()), (std::streamsize) num_bytes); - ggml_backend_tensor_set(tensor, read_buf.data(), 0, num_bytes); - } - - if (!fin) { - throw std::runtime_error("failed to read codec tensor: " + std::string(tensor->name)); - } - } - } -}; - -} // namespace - -struct moss_audio_tokenizer::impl { - int sample_rate = 0; - uint32_t downsample_rate = 0; - uint32_t num_quantizers = 0; - int n_threads = -1; - - ggml_backend_ptr backend; - ggml_context_ptr ctx_meta; - gguf_context_ptr ctx_gguf; - ggml_context_ptr ctx_data; - ggml_backend_buffer_ptr weights_buffer; - - moss_codec_quantizer quantizer; - moss_codec_linear_f32 quantizer_input_proj_f32; - std::vector quantizer_entries_f32; - std::vector encoder; - std::vector decoder; - - explicit impl(const std::string & model_path, const moss_audio_tokenizer_options & options) { - moss_codec_gguf_loader loader(model_path); - - if (!loader.has_key(std::string(MOSS_CODEC_ARCH) + ".quantizer_type")) { - throw std::runtime_error("model does not contain bundled MOSS audio tokenizer metadata"); - } - - sample_rate = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + 
".sampling_rate"); - downsample_rate = loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".downsample_rate"); - num_quantizers = loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.num_quantizers"); - n_threads = options.n_threads; - - quantizer.input_dim = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.input_dim"); - quantizer.rvq_dim = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.rvq_dim"); - quantizer.output_dim = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.output_dim"); - quantizer.num_quantizers = (int) num_quantizers; - quantizer.codebook_size = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.codebook_size"); - quantizer.codebook_dim = (int) loader.get_u32(std::string(MOSS_CODEC_ARCH) + ".quantizer.codebook_dim"); - quantizer.input_proj_w = loader.get_tensor("audio_tokenizer.quantizer.input_proj.weight", false); - quantizer.input_proj_b = loader.get_tensor("audio_tokenizer.quantizer.input_proj.bias", false); - quantizer.output_proj_w = loader.get_tensor("audio_tokenizer.quantizer.output_proj.weight", false); - quantizer.output_proj_b = loader.get_tensor("audio_tokenizer.quantizer.output_proj.bias", false); - quantizer.quantizers.resize(num_quantizers); - for (uint32_t iq = 0; iq < num_quantizers; ++iq) { - auto & entry = quantizer.quantizers[iq]; - const std::string prefix = "audio_tokenizer.quantizer.quantizers." + std::to_string(iq); - entry.in_proj_w = loader.get_tensor(prefix + ".in_proj.weight", false); - entry.in_proj_b = loader.get_tensor(prefix + ".in_proj.bias", false); - entry.codebook = loader.get_tensor(prefix + ".codebook.weight"); - entry.out_proj_w = loader.get_tensor(prefix + ".out_proj.weight", false); - entry.out_proj_b = loader.get_tensor(prefix + ".out_proj.bias", false); - } - - const auto load_modules = [&](const std::string & section_name, std::vector & modules) { - const uint32_t block_count = loader.get_u32(std::string(MOSS_CODEC_ARCH) + "." 
+ section_name + ".block_count"); - modules.resize(block_count); - for (uint32_t ib = 0; ib < block_count; ++ib) { - const std::string block_prefix = std::string(MOSS_CODEC_ARCH) + "." + section_name + "." + std::to_string(ib); - moss_codec_module & block = modules[ib]; - block.type = moss_codec_module_type_from_string(loader.get_string(block_prefix + ".module_type")); - - if (block.type == moss_codec_module_type::PATCHED_PRETRANSFORM) { - block.patch_size = (int) loader.get_u32(block_prefix + ".patch_size"); - continue; - } - - auto & tr = block.transformer; - tr.input_dimension = (int) loader.get_u32(block_prefix + ".input_dimension"); - tr.output_dimension = (int) loader.get_u32(block_prefix + ".output_dimension"); - tr.d_model = (int) loader.get_u32(block_prefix + ".d_model"); - tr.num_heads = (int) loader.get_u32(block_prefix + ".num_heads"); - tr.num_layers = (int) loader.get_u32(block_prefix + ".num_layers"); - tr.dim_feedforward = (int) loader.get_u32(block_prefix + ".dim_feedforward"); - tr.context = (int) loader.get_u32(block_prefix + ".context"); - tr.max_period = loader.get_f32(block_prefix + ".max_period", false, 10000.0f); - tr.input_proj = loader.get_tensor("audio_tokenizer." + section_name + "." + std::to_string(ib) + ".input_proj.weight", false); - tr.output_proj = loader.get_tensor("audio_tokenizer." + section_name + "." + std::to_string(ib) + ".output_proj.weight", false); - - tr.layers.resize(tr.num_layers); - for (int il = 0; il < tr.num_layers; ++il) { - auto & layer = tr.layers[il]; - const std::string layer_prefix = - "audio_tokenizer." + section_name + "." + std::to_string(ib) + ".transformer.layers." 
+ std::to_string(il); - layer.attn_in = loader.get_tensor(layer_prefix + ".self_attn.in_projs.0.weight"); - layer.attn_out = loader.get_tensor(layer_prefix + ".self_attn.out_projs.0.weight"); - layer.linear1 = loader.get_tensor(layer_prefix + ".linear1.weight"); - layer.linear2 = loader.get_tensor(layer_prefix + ".linear2.weight"); - layer.norm1_w = loader.get_tensor(layer_prefix + ".norm1.weight"); - layer.norm1_b = loader.get_tensor(layer_prefix + ".norm1.bias"); - layer.norm2_w = loader.get_tensor(layer_prefix + ".norm2.weight"); - layer.norm2_b = loader.get_tensor(layer_prefix + ".norm2.bias"); - layer.scale1 = loader.get_tensor(layer_prefix + ".layer_scale_1.scale", false); - layer.scale2 = loader.get_tensor(layer_prefix + ".layer_scale_2.scale", false); - } - } - }; - - load_modules("encoder", encoder); - load_modules("decoder", decoder); - - loader.load_tensor_bytes(); - - backend = std::move(loader.backend); - ctx_meta = std::move(loader.ctx_meta); - ctx_gguf = std::move(loader.ctx_gguf); - ctx_data = std::move(loader.ctx_data); - weights_buffer = std::move(loader.buffer); - - quantizer_input_proj_f32 = moss_codec_linear_from_tensors(quantizer.input_proj_w, quantizer.input_proj_b); - quantizer_entries_f32.resize(num_quantizers); - for (uint32_t iq = 0; iq < num_quantizers; ++iq) { - auto & dst = quantizer_entries_f32[iq]; - const auto & src = quantizer.quantizers[iq]; - dst.in_proj = moss_codec_linear_from_tensors(src.in_proj_w, src.in_proj_b); - dst.out_proj = moss_codec_linear_from_tensors(src.out_proj_w, src.out_proj_b); - dst.codebook_dim = (int) src.codebook->ne[0]; - dst.codebook_size = (int) src.codebook->ne[1]; - dst.codebook = moss_codec_tensor_to_f32(src.codebook); - dst.codebook_unit = moss_codec_normalize_rows(dst.codebook, dst.codebook_dim); - } - - LLAMA_LOG_INFO("%s: sample_rate=%d downsample_rate=%u num_quantizers=%u encoder_blocks=%zu decoder_blocks=%zu\n", - __func__, sample_rate, downsample_rate, num_quantizers, encoder.size(), 
decoder.size()); - } - - std::vector run_quantizer_decode( - const std::vector & codes, - const size_t n_frames, - uint32_t n_quantizers_req) const { - const uint32_t nq = n_quantizers_req == 0 ? num_quantizers : n_quantizers_req; - if (nq == 0 || nq > num_quantizers) { - throw std::runtime_error("invalid quantizer count for decode"); - } - if (codes.size() != n_frames * (size_t) nq) { - throw std::runtime_error("raw code size does not match frame count"); - } - - const size_t max_nodes = MOSS_CODEC_MAX_NODES_BASE + (size_t) nq * 8; - const size_t meta_size = max_nodes * ggml_tensor_overhead() + ggml_graph_overhead_custom(max_nodes, false); - std::vector meta_buf(meta_size); - - ggml_init_params params = { - /*.mem_size =*/ meta_size, - /*.mem_buffer =*/ meta_buf.data(), - /*.no_alloc =*/ true, - }; - ggml_context * ctx0 = ggml_init(params); - if (!ctx0) { - throw std::runtime_error("failed to init quantizer decode ggml context"); - } - - ggml_cgraph * gf = ggml_new_graph_custom(ctx0, (int) max_nodes, false); - std::vector code_inputs(nq); - - ggml_tensor * cur = nullptr; - for (uint32_t iq = 0; iq < nq; ++iq) { - const auto & entry = quantizer.quantizers[iq]; - ggml_tensor * inp = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t) n_frames); - ggml_set_input(inp); - code_inputs[iq] = inp; - - ggml_tensor * emb = ggml_get_rows(ctx0, entry.codebook, inp); - if (entry.out_proj_w) { - emb = ggml_mul_mat(ctx0, entry.out_proj_w, emb); - } - if (entry.out_proj_b) { - emb = ggml_add(ctx0, emb, entry.out_proj_b); - } - cur = cur ? 
ggml_add(ctx0, cur, emb) : emb; - } - - if (quantizer.output_proj_w) { - cur = ggml_mul_mat(ctx0, quantizer.output_proj_w, cur); - } - if (quantizer.output_proj_b) { - cur = ggml_add(ctx0, cur, quantizer.output_proj_b); - } - - ggml_build_forward_expand(gf, cur); - - ggml_gallocr_ptr allocr { ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend.get())) }; - ggml_gallocr_alloc_graph(allocr.get(), gf); - - for (uint32_t iq = 0; iq < nq; ++iq) { - std::vector gathered(n_frames); - for (size_t t = 0; t < n_frames; ++t) { - const llama_token code = codes[t * (size_t) nq + iq]; - if (code < 0 || code >= quantizer.codebook_size) { - ggml_free(ctx0); - throw std::runtime_error("audio code out of codec range during decode"); - } - gathered[t] = (int32_t) code; - } - ggml_backend_tensor_set(code_inputs[iq], gathered.data(), 0, gathered.size() * sizeof(int32_t)); - } - - moss_codec_set_n_threads(backend.get(), n_threads); - const ggml_status status = ggml_backend_graph_compute(backend.get(), gf); - if (status != GGML_STATUS_SUCCESS) { - ggml_free(ctx0); - throw std::runtime_error("quantizer decode graph compute failed"); - } - - std::vector output = moss_codec_copy_f32_output(cur); - ggml_free(ctx0); - return output; - } - - std::vector run_transformer_block( - const moss_codec_transformer_block & block, - const std::vector & input, - const size_t n_frames) const { - if (input.size() != (size_t) block.input_dimension * n_frames) { - throw std::runtime_error("transformer block input size mismatch"); - } - - const size_t max_nodes = MOSS_CODEC_MAX_NODES_BASE + (size_t) block.num_layers * MOSS_CODEC_MAX_NODES_PER_LAYER; - const size_t meta_size = max_nodes * ggml_tensor_overhead() + ggml_graph_overhead_custom(max_nodes, false); - std::vector meta_buf(meta_size); - - ggml_init_params params = { - /*.mem_size =*/ meta_size, - /*.mem_buffer =*/ meta_buf.data(), - /*.no_alloc =*/ true, - }; - ggml_context * ctx0 = ggml_init(params); - if (!ctx0) { - throw 
std::runtime_error("failed to init transformer ggml context"); - } - - ggml_cgraph * gf = ggml_new_graph_custom(ctx0, (int) max_nodes, false); - - ggml_tensor * inp = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, block.input_dimension, (int64_t) n_frames); - ggml_set_input(inp); - ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t) n_frames); - ggml_set_input(positions); - ggml_tensor * mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, (int64_t) n_frames, (int64_t) n_frames, 1, 1); - ggml_set_input(mask); - - ggml_tensor * cur = inp; - if (block.input_proj) { - cur = ggml_mul_mat(ctx0, block.input_proj, cur); - } - - const int d_head = block.d_model / block.num_heads; - const float attn_scale = 1.0f / std::sqrt((float) d_head); - - for (int il = 0; il < block.num_layers; ++il) { - const auto & layer = block.layers[il]; - - ggml_tensor * inp_sa = cur; - ggml_tensor * x = moss_codec_build_layer_norm(ctx0, cur, layer.norm1_w, layer.norm1_b); - ggml_tensor * qkv = ggml_mul_mat(ctx0, layer.attn_in, x); - - ggml_tensor * q = ggml_view_3d(ctx0, qkv, d_head, block.num_heads, (int64_t) n_frames, - ggml_row_size(qkv->type, d_head), qkv->nb[1], 0); - ggml_tensor * k = ggml_view_3d(ctx0, qkv, d_head, block.num_heads, (int64_t) n_frames, - ggml_row_size(qkv->type, d_head), qkv->nb[1], ggml_row_size(qkv->type, block.d_model)); - ggml_tensor * v = ggml_view_3d(ctx0, qkv, d_head, block.num_heads, (int64_t) n_frames, - ggml_row_size(qkv->type, d_head), qkv->nb[1], ggml_row_size(qkv->type, 2 * block.d_model)); - - q = ggml_rope_ext(ctx0, q, positions, nullptr, d_head, 0, 0, - block.max_period, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); - k = ggml_rope_ext(ctx0, k, positions, nullptr, d_head, 0, 0, - block.max_period, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); - - ggml_tensor * attn = moss_codec_build_attention(ctx0, layer.attn_out, q, k, v, mask, attn_scale); - if (layer.scale1) { - attn = ggml_mul(ctx0, attn, layer.scale1); - } - cur = ggml_add(ctx0, inp_sa, attn); - - ggml_tensor * inp_ff 
= cur; - x = moss_codec_build_layer_norm(ctx0, cur, layer.norm2_w, layer.norm2_b); - x = ggml_mul_mat(ctx0, layer.linear1, x); - x = ggml_gelu(ctx0, x); - x = ggml_mul_mat(ctx0, layer.linear2, x); - if (layer.scale2) { - x = ggml_mul(ctx0, x, layer.scale2); - } - cur = ggml_add(ctx0, inp_ff, x); - } - - if (block.output_proj) { - cur = ggml_mul_mat(ctx0, block.output_proj, cur); - } - - ggml_build_forward_expand(gf, cur); - - ggml_gallocr_ptr allocr { ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend.get())) }; - ggml_gallocr_alloc_graph(allocr.get(), gf); - - const std::vector positions_data = moss_codec_make_positions(n_frames); - const std::vector mask_data = moss_codec_make_causal_mask(n_frames, block.context); - - ggml_backend_tensor_set(inp, input.data(), 0, input.size() * sizeof(float)); - ggml_backend_tensor_set(positions, positions_data.data(), 0, positions_data.size() * sizeof(int32_t)); - ggml_backend_tensor_set(mask, mask_data.data(), 0, mask_data.size() * sizeof(float)); - - moss_codec_set_n_threads(backend.get(), n_threads); - const ggml_status status = ggml_backend_graph_compute(backend.get(), gf); - if (status != GGML_STATUS_SUCCESS) { - ggml_free(ctx0); - throw std::runtime_error("transformer graph compute failed"); - } - - std::vector output = moss_codec_copy_f32_output(cur); - ggml_free(ctx0); - return output; - } - - std::vector decode( - const std::vector & codes, - const size_t n_frames, - const uint32_t n_quantizers_req) const { - uint32_t nq = n_quantizers_req == 0 ? 
num_quantizers : n_quantizers_req; - if (nq == 0 || nq > num_quantizers) { - throw std::runtime_error("invalid quantizer count"); - } - - std::vector cur = run_quantizer_decode(codes, n_frames, nq); - int channels = quantizer.output_dim; - size_t frames = n_frames; - - for (const auto & module : decoder) { - switch (module.type) { - case moss_codec_module_type::TRANSFORMER: - cur = run_transformer_block(module.transformer, cur, frames); - channels = module.transformer.output_dimension; - break; - case moss_codec_module_type::PATCHED_PRETRANSFORM: - cur = moss_codec_patch_decode(cur, channels, frames, module.patch_size); - channels /= module.patch_size; - frames *= (size_t) module.patch_size; - break; - } - } - - if (channels != 1) { - throw std::runtime_error("codec decoder did not end with a mono waveform channel"); - } - - return cur; - } - - std::vector run_quantizer_encode( - const std::vector & input, - const size_t n_frames, - const uint32_t n_quantizers_req) const { - const uint32_t nq = n_quantizers_req == 0 ? 
num_quantizers : n_quantizers_req; - if (nq == 0 || nq > num_quantizers) { - throw std::runtime_error("invalid quantizer count for encode"); - } - - std::vector residual = moss_codec_linear_apply(quantizer_input_proj_f32, input, n_frames); - if (residual.size() != (size_t) quantizer.rvq_dim * n_frames) { - throw std::runtime_error("quantizer input projection size mismatch"); - } - - std::vector codes(n_frames * (size_t) nq, 0); - std::vector latents; - std::vector latents_unit; - std::vector decoded; - - for (uint32_t iq = 0; iq < nq; ++iq) { - const auto & entry = quantizer_entries_f32[iq]; - latents = moss_codec_linear_apply(entry.in_proj, residual, n_frames); - if (latents.size() != (size_t) entry.codebook_dim * n_frames) { - throw std::runtime_error("quantizer latent projection size mismatch"); - } - - latents_unit.resize(latents.size()); - for (size_t t = 0; t < n_frames; ++t) { - const float * in_ptr = latents.data() + t * (size_t) entry.codebook_dim; - float * out_ptr = latents_unit.data() + t * (size_t) entry.codebook_dim; - - float norm2 = 0.0f; - for (int d = 0; d < entry.codebook_dim; ++d) { - norm2 += in_ptr[d] * in_ptr[d]; - } - const float inv = 1.0f / std::sqrt(std::max(norm2, std::numeric_limits::epsilon())); - for (int d = 0; d < entry.codebook_dim; ++d) { - out_ptr[d] = in_ptr[d] * inv; - } - } - - std::vector codebook_emb((size_t) entry.codebook_dim * n_frames, 0.0f); - for (size_t t = 0; t < n_frames; ++t) { - const float * latent = latents_unit.data() + t * (size_t) entry.codebook_dim; - - float best_score = -std::numeric_limits::infinity(); - int best_index = 0; - for (int code = 0; code < entry.codebook_size; ++code) { - const float * row = entry.codebook_unit.data() + (size_t) code * (size_t) entry.codebook_dim; - float score = 0.0f; - for (int d = 0; d < entry.codebook_dim; ++d) { - score += row[d] * latent[d]; - } - if (score > best_score) { - best_score = score; - best_index = code; - } - } - - codes[t * (size_t) nq + iq] = best_index; - 
const float * row = entry.codebook.data() + (size_t) best_index * (size_t) entry.codebook_dim; - std::copy(row, row + entry.codebook_dim, codebook_emb.begin() + (ptrdiff_t) (t * (size_t) entry.codebook_dim)); - } - - decoded = moss_codec_linear_apply(entry.out_proj, codebook_emb, n_frames); - if (decoded.size() != residual.size()) { - throw std::runtime_error("quantizer decoded embedding size mismatch"); - } - - for (size_t i = 0; i < residual.size(); ++i) { - residual[i] -= decoded[i]; - } - } - - return codes; - } - - std::vector encode( - const std::vector & audio, - size_t * out_frames, - const uint32_t n_quantizers_req) const { - const uint32_t nq = n_quantizers_req == 0 ? num_quantizers : n_quantizers_req; - if (nq == 0 || nq > num_quantizers) { - throw std::runtime_error("invalid quantizer count"); - } - - const size_t padded_samples = - ((audio.size() + (size_t) downsample_rate - 1) / (size_t) downsample_rate) * (size_t) downsample_rate; - const size_t valid_frames = audio.size() / (size_t) downsample_rate; - - std::vector cur(padded_samples, 0.0f); - std::copy(audio.begin(), audio.end(), cur.begin()); - - int channels = 1; - size_t frames = padded_samples; - - for (const auto & module : encoder) { - switch (module.type) { - case moss_codec_module_type::PATCHED_PRETRANSFORM: - cur = moss_codec_patch_encode(cur, channels, frames, module.patch_size); - channels *= module.patch_size; - frames /= (size_t) module.patch_size; - break; - case moss_codec_module_type::TRANSFORMER: - cur = run_transformer_block(module.transformer, cur, frames); - channels = module.transformer.output_dimension; - break; - } - } - - if (channels != quantizer.input_dim) { - throw std::runtime_error("codec encoder output dimension does not match quantizer input dimension"); - } - - std::vector codes = run_quantizer_encode(cur, frames, nq); - if (out_frames) { - *out_frames = valid_frames; - } - - if (valid_frames >= frames) { - return codes; - } - - std::vector trimmed(valid_frames * 
(size_t) nq); - for (size_t t = 0; t < valid_frames; ++t) { - std::copy_n(codes.data() + t * (size_t) nq, nq, trimmed.data() + t * (size_t) nq); - } - return trimmed; - } -}; - -moss_audio_tokenizer::moss_audio_tokenizer( - const std::string & model_path, - const moss_audio_tokenizer_options & options) - : impl_(std::make_unique(model_path, options)) { -} - -moss_audio_tokenizer::~moss_audio_tokenizer() = default; - -moss_audio_tokenizer::moss_audio_tokenizer(moss_audio_tokenizer &&) noexcept = default; - -moss_audio_tokenizer & moss_audio_tokenizer::operator=(moss_audio_tokenizer &&) noexcept = default; - -int moss_audio_tokenizer::sample_rate() const { - return impl_->sample_rate; -} - -uint32_t moss_audio_tokenizer::downsample_rate() const { - return impl_->downsample_rate; -} - -uint32_t moss_audio_tokenizer::num_quantizers() const { - return impl_->num_quantizers; -} - -std::vector moss_audio_tokenizer::decode( - const std::vector & codes, - const size_t n_frames, - const uint32_t n_quantizers) const { - return impl_->decode(codes, n_frames, n_quantizers); -} - -std::vector moss_audio_tokenizer::encode( - const std::vector & audio, - size_t * out_frames, - const uint32_t n_quantizers) const { - return impl_->encode(audio, out_frames, n_quantizers); -} - -static std::string moss_codec_model_meta_str(const llama_model * model, const std::string & key) { - const auto it = model->gguf_kv.find(key); - if (it == model->gguf_kv.end()) { - throw std::runtime_error("missing GGUF key: " + key); - } - - std::string value = it->second; - if (value.size() >= 2 && ((value.front() == '\'' && value.back() == '\'') || (value.front() == '"' && value.back() == '"'))) { - value = value.substr(1, value.size() - 2); - } - return value; -} - -static uint32_t moss_codec_model_meta_u32(const llama_model * model, const std::string & key) { - return (uint32_t) std::stoul(moss_codec_model_meta_str(model, key)); -} - -static const ggml_tensor * moss_codec_model_require_tensor(const 
llama_model * model, const std::string & name) { - const ggml_tensor * tensor = model->get_tensor(name.c_str()); - if (tensor == nullptr) { - throw std::runtime_error("missing tensor: " + name); - } - return tensor; -} - -static const ggml_tensor * moss_codec_model_optional_tensor(const llama_model * model, const std::string & name) { - return model->get_tensor(name.c_str()); -} - -int moss_audio_model_sample_rate(const llama_model * model) { - const std::string arch_name = llm_arch_name(model->arch); - return (int) moss_codec_model_meta_u32(model, arch_name + ".sampling_rate"); -} - -uint32_t moss_audio_model_downsample_rate(const llama_model * model) { - const std::string arch_name = llm_arch_name(model->arch); - return moss_codec_model_meta_u32(model, arch_name + ".downsample_rate"); -} - -uint32_t moss_audio_model_num_quantizers(const llama_model * model) { - const std::string arch_name = llm_arch_name(model->arch); - return moss_codec_model_meta_u32(model, arch_name + ".quantizer.num_quantizers"); -} - -std::vector moss_audio_model_quantizer_encode( - const llama_model * model, - const std::vector & input, - size_t n_frames, - uint32_t n_quantizers_req) { - if (model->arch != LLM_ARCH_MOSS_TTS_AUDIO_ENCODER) { - throw std::runtime_error("quantizer encode expects a moss-tts-audio-encoder model"); - } - - const std::string arch_name = llm_arch_name(model->arch); - const uint32_t num_quantizers = moss_codec_model_meta_u32(model, arch_name + ".quantizer.num_quantizers"); - const uint32_t nq = n_quantizers_req == 0 ? 
num_quantizers : n_quantizers_req; - if (nq == 0 || nq > num_quantizers) { - throw std::runtime_error("invalid quantizer count"); - } - - moss_codec_linear_f32 quantizer_input_proj = moss_codec_linear_from_tensors( - const_cast(moss_codec_model_require_tensor(model, "quantizer.input_proj.weight")), - const_cast(moss_codec_model_optional_tensor(model, "quantizer.input_proj.bias"))); - - std::vector quantizers(nq); - for (uint32_t iq = 0; iq < nq; ++iq) { - auto & entry = quantizers[iq]; - const std::string prefix = "quantizer.quantizers." + std::to_string(iq); - entry.in_proj = moss_codec_linear_from_tensors( - const_cast(moss_codec_model_require_tensor(model, prefix + ".in_proj.weight")), - const_cast(moss_codec_model_optional_tensor(model, prefix + ".in_proj.bias"))); - entry.out_proj = moss_codec_linear_from_tensors( - const_cast(moss_codec_model_require_tensor(model, prefix + ".out_proj.weight")), - const_cast(moss_codec_model_optional_tensor(model, prefix + ".out_proj.bias"))); - const ggml_tensor * codebook = moss_codec_model_require_tensor(model, prefix + ".codebook.weight"); - entry.codebook_dim = (int) codebook->ne[0]; - entry.codebook_size = (int) codebook->ne[1]; - entry.codebook = moss_codec_tensor_to_f32(codebook); - entry.codebook_unit = moss_codec_normalize_rows(entry.codebook, entry.codebook_dim); - } - - std::vector residual = moss_codec_linear_apply(quantizer_input_proj, input, n_frames); - std::vector codes(n_frames * (size_t) nq, 0); - std::vector latents; - std::vector latents_unit; - std::vector decoded; - - for (uint32_t iq = 0; iq < nq; ++iq) { - const auto & entry = quantizers[iq]; - latents = moss_codec_linear_apply(entry.in_proj, residual, n_frames); - if (latents.size() != (size_t) entry.codebook_dim * n_frames) { - throw std::runtime_error("quantizer latent projection size mismatch"); - } - - latents_unit.resize(latents.size()); - for (size_t t = 0; t < n_frames; ++t) { - const float * in_ptr = latents.data() + t * (size_t) 
entry.codebook_dim; - float * out_ptr = latents_unit.data() + t * (size_t) entry.codebook_dim; - - float norm2 = 0.0f; - for (int d = 0; d < entry.codebook_dim; ++d) { - norm2 += in_ptr[d] * in_ptr[d]; - } - const float inv = 1.0f / std::sqrt(std::max(norm2, std::numeric_limits::epsilon())); - for (int d = 0; d < entry.codebook_dim; ++d) { - out_ptr[d] = in_ptr[d] * inv; - } - } - - std::vector codebook_emb((size_t) entry.codebook_dim * n_frames, 0.0f); - for (size_t t = 0; t < n_frames; ++t) { - const float * latent = latents_unit.data() + t * (size_t) entry.codebook_dim; - - float best_score = -std::numeric_limits::infinity(); - int best_index = 0; - for (int code = 0; code < entry.codebook_size; ++code) { - const float * row = entry.codebook_unit.data() + (size_t) code * (size_t) entry.codebook_dim; - float score = 0.0f; - for (int d = 0; d < entry.codebook_dim; ++d) { - score += row[d] * latent[d]; - } - if (score > best_score) { - best_score = score; - best_index = code; - } - } - - codes[t * (size_t) nq + iq] = best_index; - const float * row = entry.codebook.data() + (size_t) best_index * (size_t) entry.codebook_dim; - std::copy(row, row + entry.codebook_dim, codebook_emb.begin() + (ptrdiff_t) (t * (size_t) entry.codebook_dim)); - } - - decoded = moss_codec_linear_apply(entry.out_proj, codebook_emb, n_frames); - if (decoded.size() != residual.size()) { - throw std::runtime_error("quantizer decoded embedding size mismatch"); - } - - for (size_t i = 0; i < residual.size(); ++i) { - residual[i] -= decoded[i]; - } - } - - return codes; -} diff --git a/tools/tts/run-moss-tts-delay.cpp b/tools/tts/run-moss-tts-delay.cpp index e98aab0ad..2df1e5f00 100644 --- a/tools/tts/run-moss-tts-delay.cpp +++ b/tools/tts/run-moss-tts-delay.cpp @@ -3,7 +3,6 @@ #include "log.h" #include "llama.h" #include "llama-cpp.h" -#include "llama-moss-audio-tokenizer.h" #include #include @@ -237,12 +236,6 @@ static void moss_decode_audio_llama( const moss_delay_config & cfg, int32_t 
n_gpu_layers, const std::string & wav_out_path); -static void moss_decode_audio_native( - const std::string & model_path, - const std::vector & raw_codes, - size_t raw_frames, - const moss_delay_config & cfg, - const std::string & wav_out_path); static std::vector moss_read_wav_f32_mono(const std::string & path, int expected_sample_rate); static moss_prompt_input moss_build_prompt_input( const llama_vocab * vocab, @@ -460,6 +453,37 @@ static std::string moss_model_architecture(const llama_model * model) { return std::string(buf); } +static uint32_t moss_audio_model_meta_u32( + const llama_model * model, + const char * expected_arch, + const char * suffix) { + const std::string arch = moss_model_architecture(model); + if (arch != expected_arch) { + throw std::runtime_error( + "unexpected audio model architecture: expected " + + std::string(expected_arch) + ", got " + arch); + } + + uint32_t value = 0; + const std::string key = arch + "." + suffix; + if (!parse_meta_u32(model, key.c_str(), value)) { + throw std::runtime_error("missing audio model metadata key: " + key); + } + return value; +} + +static uint32_t moss_audio_model_sampling_rate(const llama_model * model, const char * expected_arch) { + return moss_audio_model_meta_u32(model, expected_arch, "sampling_rate"); +} + +static uint32_t moss_audio_model_downsample_rate(const llama_model * model, const char * expected_arch) { + return moss_audio_model_meta_u32(model, expected_arch, "downsample_rate"); +} + +static uint32_t moss_audio_model_num_quantizers(const llama_model * model, const char * expected_arch) { + return moss_audio_model_meta_u32(model, expected_arch, "quantizer.num_quantizers"); +} + struct moss_audio_runtime { llama_model_ptr model; llama_context_ptr ctx; @@ -1198,9 +1222,9 @@ static std::vector moss_encode_audio_llama( audio_encoder_model_path, "moss-tts-audio-encoder", n_gpu_layers); - const int sample_rate = moss_audio_model_sample_rate(runtime.model.get()); - const uint32_t downsample_rate = 
moss_audio_model_downsample_rate(runtime.model.get()); - const uint32_t model_quantizers = moss_audio_model_num_quantizers(runtime.model.get()); + const int sample_rate = (int) moss_audio_model_sampling_rate(runtime.model.get(), "moss-tts-audio-encoder"); + const uint32_t downsample_rate = moss_audio_model_downsample_rate(runtime.model.get(), "moss-tts-audio-encoder"); + const uint32_t model_quantizers = moss_audio_model_num_quantizers(runtime.model.get(), "moss-tts-audio-encoder"); const uint32_t nq = n_quantizers == 0 ? model_quantizers : n_quantizers; if (nq == 0 || nq > model_quantizers) { throw std::runtime_error("invalid audio encoder quantizer count"); @@ -1276,9 +1300,9 @@ static void moss_decode_audio_llama( n_gpu_layers, std::max((uint32_t) raw_frames, 1u)); - const int sample_rate = moss_audio_model_sample_rate(runtime.model.get()); - const uint32_t downsample_rate = moss_audio_model_downsample_rate(runtime.model.get()); - const uint32_t model_quantizers = moss_audio_model_num_quantizers(runtime.model.get()); + const int sample_rate = (int) moss_audio_model_sampling_rate(runtime.model.get(), "moss-tts-audio-decoder"); + const uint32_t downsample_rate = moss_audio_model_downsample_rate(runtime.model.get(), "moss-tts-audio-decoder"); + const uint32_t model_quantizers = moss_audio_model_num_quantizers(runtime.model.get(), "moss-tts-audio-decoder"); if (cfg.n_vq != model_quantizers) { throw std::runtime_error( "audio decoder quantizer count mismatch: model expects " + @@ -1314,23 +1338,6 @@ static void moss_decode_audio_llama( } } -static void moss_decode_audio_native( - const std::string & model_path, - const std::vector & raw_codes, - size_t raw_frames, - const moss_delay_config & cfg, - const std::string & wav_out_path) { - moss_audio_tokenizer_options codec_opts; - codec_opts.n_threads = cpu_get_num_math(); - - moss_audio_tokenizer codec(model_path, codec_opts); - const std::vector audio = codec.decode(raw_codes, raw_frames, cfg.n_vq); - - if 
(!save_wav16(wav_out_path, audio, codec.sample_rate())) { - throw std::runtime_error("failed to write WAV file: " + wav_out_path); - } -} - static std::vector moss_read_wav_f32_mono(const std::string & path, int expected_sample_rate) { std::ifstream in(path, std::ios::binary); if (!in) { @@ -1614,7 +1621,6 @@ static int moss_run_audio_decoder_helper( } static bool moss_decode_parity( - const std::string & model_path, const std::string & ref_path, const std::string & dump_codes_path, const std::string & audio_decoder_model_path, @@ -1694,10 +1700,7 @@ static bool moss_decode_parity( n_gpu_layers, wav_out); } else { - if (model_path.empty()) { - throw std::runtime_error("--wav-out requires either --audio-decoder-model, --audio-decoder-script, or -m with bundled codec"); - } - moss_decode_audio_native(model_path, decoded.raw_codes, decoded.raw_frames, cfg, wav_out); + throw std::runtime_error("--wav-out requires either --audio-decoder-model or --audio-decoder-script"); } } else if (!helper_script.empty()) { if (dump_codes_path.empty()) { @@ -1952,7 +1955,7 @@ static void moss_generate_from_prompt( n_gpu_layers, wav_out); } else { - moss_decode_audio_native(model_path, decoded.raw_codes, decoded.raw_frames, cfg, wav_out); + throw std::runtime_error("--wav-out requires either --audio-decoder-model or --audio-decoder-script"); } } else if (!helper_script.empty()) { if (dump_raw_codes_path.empty()) { @@ -2020,11 +2023,7 @@ static void moss_generate_from_text( cfg.n_vq, &reference_frames); } else { - moss_audio_tokenizer_options codec_opts; - codec_opts.n_threads = cpu_get_num_math(); - moss_audio_tokenizer codec(model_path, codec_opts); - const std::vector wav = moss_read_wav_f32_mono(reference_audio_path, codec.sample_rate()); - reference_codes = codec.encode(wav, &reference_frames, cfg.n_vq); + throw std::runtime_error("--reference-audio requires --audio-encoder-model"); } } @@ -2587,7 +2586,6 @@ int main(int argc, char ** argv) { if (!decode_parity_ref_path.empty()) { 
try { const bool ok = moss_decode_parity( - model_path, decode_parity_ref_path, dump_raw_codes_path, audio_decoder_model_path, @@ -2609,7 +2607,7 @@ int main(int argc, char ** argv) { if (self_test) { return EXIT_SUCCESS; } - LOG("moss delay state, multi-head sampler, raw-code decode, and native audio encode/decode helpers are available.\n"); + LOG("moss delay state, multi-head sampler, raw-code decode, and native three-GGUF audio encode/decode are available.\n"); LOG("use --print-delay-config with -m to inspect model metadata.\n"); LOG("use --decode-parity-ref to verify C++ de-delay/raw-code extraction against Python.\n"); LOG("use --text -m --audio-decoder-model --wav-out out.wav for native generation.\n");