> ## Documentation Index
> Fetch the complete documentation index at: https://docs.baseten.co/llms.txt
> Use this file to discover all available pages before exploring further.

# Quantization guide

> FP8 and FP4 trade-offs and hardware requirements for all engines

export const QuantizationMatrix = () => {
  const ref = React.useRef(null);
  const init = React.useRef(false);
  React.useEffect(() => {
    if (!ref.current || init.current) return;
    init.current = true;
    const W = 620, H = 198, padL = 16, padR = 16;
    const isDark = () => document.documentElement.classList.contains("dark");
    const C = () => isDark() ? {
      sub: "#869089",
      body: "#dee4de",
      brd: "#344339",
      dim: "#46514a",
      track: "rgba(255,255,255,0.06)",
      stripBg: "#0C1D13",
      stripBrd: "#203026",
      ok: ["#17D465", "rgba(23,212,101,0.22)"],
      bar: "#5b9dff",
      barF: "rgba(91,157,255,0.22)"
    } : {
      sub: "#869089",
      body: "#021309",
      brd: "#dee4de",
      dim: "#c2c9c2",
      track: "rgba(0,0,0,0.05)",
      stripBg: "#f4f9f3",
      stripBrd: "#dee4de",
      ok: ["#0e863f", "rgba(178,247,207,0.7)"],
      bar: "#1960d3",
      barF: "rgba(25,96,211,0.16)"
    };
    function setRich(el, s) {
      el.replaceChildren();
      const parts = s.split("`");
      for (let i = 0; i < parts.length; i++) {
        if (i % 2 === 0) {
          if (parts[i]) el.appendChild(document.createTextNode(parts[i]));
        } else {
          const c = document.createElement("code");
          c.textContent = parts[i];
          c.style.cssText = "font-family:ui-monospace,Menlo,monospace;font-size:0.92em;background:" + (isDark() ? "rgba(255,255,255,0.08)" : "rgba(0,0,0,0.05)") + ";padding:1px 4px;border-radius:3px";
          el.appendChild(c);
        }
      }
    }
    const GPUS = ["L4", "H100", "H200", "B200"];
    const ROWS = [{
      label: "FP16 (no_quant)",
      gpu: [1, 1, 1, 1],
      frac: 1.0,
      val: "1.0x",
      key: "fp16"
    }, {
      label: "FP8 / FP8_KV",
      gpu: [1, 1, 1, 1],
      frac: 0.5,
      val: "0.5x",
      key: "fp8"
    }, {
      label: "FP4 (MLP only)",
      gpu: [0, 0, 0, 1],
      frac: 0.38,
      val: "mixed",
      key: "fp4mlp"
    }, {
      label: "FP4 / FP4_KV",
      gpu: [0, 0, 0, 1],
      frac: 0.25,
      val: "0.25x",
      key: "fp4"
    }];
    const EXPL = {
      fp16: ["Full precision, the baseline", "Weights stay at 16-bit. This is the reference footprint and runs on every GPU; the quantized formats are measured against it."],
      fp8: ["`FP8` and `FP8_KV`", "8-bit weights, about half the footprint, supported on every GPU family here. `FP8_KV` also quantizes the KV cache."],
      fp4mlp: ["`FP4_MLP_ONLY`", "Mixed precision: 4-bit MLP layers with higher-precision attention, so the footprint sits between `FP8` and `FP4`. Requires a B200."],
      fp4: ["`FP4` and `FP4_KV`", "4-bit weights, about a quarter of the footprint and the smallest shown. Requires a B200. `FP4_KV` also quantizes the KV cache."]
    };
    let hl = 0, cyc = 0, hover = null, grow = 0, visible = true, dirty = true, raf = 0, last = 0;
    const cv = document.createElement("canvas");
    cv.style.cssText = "display:block;width:100%;max-width:" + W + "px;touch-action:pan-y";
    const ctx = cv.getContext("2d");
    const dpr = window.devicePixelRatio || 1;
    cv.width = W * dpr;
    cv.height = H * dpr;
    cv.style.height = H + "px";
    ctx.scale(dpr, dpr);
    const strip = document.createElement("div");
    const sDot = document.createElement("span");
    sDot.style.cssText = "flex:0 0 auto;width:8px;height:8px;border-radius:50%;margin-top:6px";
    const sTxt = document.createElement("div");
    sTxt.style.cssText = "flex:1;min-width:0";
    const sTit = document.createElement("div");
    sTit.style.cssText = "font:500 11px ui-monospace,Menlo,monospace;letter-spacing:-0.28px;color:#869089;margin:0 0 2px";
    const sBod = document.createElement("div");
    sBod.style.cssText = "font:400 13px/1.4 system-ui,-apple-system,sans-serif;margin:0";
    sTxt.appendChild(sTit);
    sTxt.appendChild(sBod);
    strip.appendChild(sDot);
    strip.appendChild(sTxt);
    ref.current.appendChild(cv);
    ref.current.appendChild(strip);
    const labelX = padL, chipX0 = 150, chipW = 38, chipGap = 6, barX0 = 372, barX1 = W - padR - 38;
    const headY = 34, row0 = 64, rowH = 32;
    const chipX = j => chipX0 + j * (chipW + chipGap);
    const rowY = i => row0 + i * rowH;
    function rowAt(py) {
      const i = Math.floor((py - (row0 - rowH / 2)) / rowH);
      return i >= 0 && i < ROWS.length ? i : null;
    }
    function draw() {
      const col = C();
      ctx.clearRect(0, 0, W, H);
      ctx.font = "500 9px ui-monospace,Menlo,monospace";
      ctx.fillStyle = col.sub;
      ctx.textBaseline = "middle";
      for (let j = 0; j < GPUS.length; j++) {
        ctx.textAlign = "center";
        ctx.fillText(GPUS[j], chipX(j) + chipW / 2, headY);
      }
      ctx.textAlign = "left";
      ctx.fillText("weight memory vs FP16", barX0, headY);
      const sel = hover !== null ? hover : hl;
      for (let i = 0; i < ROWS.length; i++) {
        const r = ROWS[i], y = rowY(i), on = i === sel;
        if (on) {
          ctx.beginPath();
          ctx.roundRect(padL - 6, y - rowH / 2 + 3, W - padL - padR + 12, rowH - 6, 5);
          ctx.fillStyle = col.track;
          ctx.fill();
        }
        ctx.fillStyle = on ? col.body : col.sub;
        ctx.font = (on ? "600 " : "500 ") + "11px ui-monospace,Menlo,monospace";
        ctx.textAlign = "left";
        ctx.textBaseline = "middle";
        ctx.fillText(r.label, labelX, y);
        for (let j = 0; j < GPUS.length; j++) {
          const sup = r.gpu[j];
          ctx.beginPath();
          ctx.roundRect(chipX(j), y - 10, chipW, 20, 4);
          ctx.fillStyle = sup ? col.ok[1] : "transparent";
          ctx.fill();
          ctx.strokeStyle = sup ? col.ok[0] : col.dim;
          ctx.lineWidth = 1;
          ctx.globalAlpha = sup ? 1 : 0.6;
          ctx.stroke();
          if (sup) {
            ctx.fillStyle = col.ok[0];
            ctx.font = "600 11px ui-monospace,Menlo,monospace";
            ctx.textAlign = "center";
            ctx.fillText("✓", chipX(j) + chipW / 2, y);
          }
          ctx.globalAlpha = 1;
        }
        ctx.beginPath();
        ctx.roundRect(barX0, y - 7, barX1 - barX0, 14, 3);
        ctx.fillStyle = col.track;
        ctx.fill();
        const bw = (barX1 - barX0) * r.frac * grow;
        ctx.beginPath();
        ctx.roundRect(barX0, y - 7, Math.max(2, bw), 14, 3);
        ctx.fillStyle = col.barF;
        ctx.fill();
        ctx.strokeStyle = col.bar;
        ctx.lineWidth = 1;
        ctx.stroke();
        ctx.fillStyle = col.bar;
        ctx.font = "500 10px ui-monospace,Menlo,monospace";
        ctx.textAlign = "left";
        ctx.fillText(r.val, barX1 + 6, y);
      }
      ctx.fillStyle = col.sub;
      ctx.font = "500 9px ui-monospace,Menlo,monospace";
      ctx.textAlign = "left";
      ctx.textBaseline = "alphabetic";
      ctx.fillText("✓ = supported; bar = weight footprint from bit width, excludes KV cache and activations", padL, H - 6);
      const e = EXPL[ROWS[sel].key];
      sDot.style.background = col.ok[0];
      setRich(sTit, e[0]);
      setRich(sBod, e[1]);
      sBod.style.color = col.body;
      strip.style.cssText = "display:flex;align-items:flex-start;gap:10px;padding:10px 14px;margin:10px 0 0;border-radius:6px;height:76px;overflow:hidden;background:" + col.stripBg + ";border:1px solid " + col.stripBrd;
    }
    function setHover(cx, cy) {
      const r = cv.getBoundingClientRect();
      hover = rowAt((cy - r.top) / r.height * H);
      dirty = true;
    }
    cv.addEventListener("mousemove", e => setHover(e.clientX, e.clientY));
    cv.addEventListener("mouseleave", () => {
      hover = null;
      dirty = true;
    });
    cv.addEventListener("touchstart", e => {
      if (e.touches[0]) setHover(e.touches[0].clientX, e.touches[0].clientY);
    }, {
      passive: true
    });
    cv.addEventListener("touchend", () => {
      hover = null;
      dirty = true;
    });
    const io = new IntersectionObserver(en => visible = en[0].isIntersecting, {
      threshold: 0.1
    });
    io.observe(cv);
    const themeObs = new MutationObserver(() => {
      dirty = true;
    });
    themeObs.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ["class"]
    });
    function loop(ts) {
      raf = requestAnimationFrame(loop);
      if (!visible) {
        last = ts;
        return;
      }
      const dt = last ? Math.min(0.05, (ts - last) / 1000) : 0;
      last = ts;
      if (grow < 1) {
        grow = Math.min(1, grow + dt / 0.6);
        dirty = true;
      }
      if (hover === null) {
        cyc += dt;
        if (cyc > 2.1) {
          cyc = 0;
          hl = (hl + 1) % ROWS.length;
          dirty = true;
        }
      }
      if (dirty) {
        dirty = false;
        draw();
      }
    }
    draw();
    raf = requestAnimationFrame(loop);
    return () => {
      cancelAnimationFrame(raf);
      io.disconnect();
      themeObs.disconnect();
      cv.remove();
      strip.remove();
    };
  }, []);
  return <div ref={ref} />;
};

export const EnginesSymbolLegend = () => <Note>
  Gated (🔒) features are enabled through Enterprise plans. <a href="mailto:support@baseten.co">Contact us</a> for more information.
</Note>;

*Quantization* trades precision for speed and memory efficiency. This guide covers Baseten's supported formats, hardware requirements, and model-specific recommendations.

Two facts bound a format choice: which GPU families run it, and how much weight memory it saves. The matrix below shows both at once. Each row is a format, the columns mark which GPUs support it, and the bar on the right is its weight footprint measured against `FP16`. `FP8` runs everywhere and halves the footprint, while the `FP4` formats reach a quarter but require a B200.

<QuantizationMatrix />

A ✓ marks the GPU families that run a format, so you can rule out the ones your hardware cannot run before weighing memory. Each bar divides the format's bit width by the 16-bit `FP16` baseline: 8-bit formats land at half, 4-bit formats at a quarter. The bar measures model weights only. It excludes KV cache and activation memory, and it shows neither end-to-end memory savings nor any accuracy trade-off, both of which depend on the model and workload. `FP4_MLP_ONLY` is mixed precision, so its bar sits between `FP8` and `FP4` rather than at a single clean ratio.

## Quantization options

Quantization type availability depends on the engine and GPU.

<EnginesSymbolLegend />

### Engine support

| **Quantization**       | [**BIS-LLM**](/engines/bis-llm/overview) | [**Engine-Builder-LLM**](/engines/engine-builder-llm/overview) | [**BEI**](/engines/bei/overview) |
| ---------------------- | ---------------------------------------- | -------------------------------------------------------------- | -------------------------------- |
| `FP8`                  | ✅                                        | ✅                                                              | ✅                                |
| `FP8_KV`               | ✅                                        | ✅                                                              | ⚠️                               |
| `FP4`                  | ✅                                        | ✅                                                              | ⚠️                               |
| `FP4_KV`               | ✅                                        | ✅                                                              | ⚠️                               |
| `FP4_MLP_ONLY`         | ✅                                        | ✅                                                              | ✅                                |
| `no_quant`             | ✅                                        | ✅                                                              | ✅                                |
| `INT8` / `SmoothQuant` | ❌                                        | ✅                                                              | ❌                                |

`_KV` quantization formats (`FP8_KV`, `FP4_KV`) store compressed KV cache state. Encoder models (BEI, BEI-Bert) do not use a decoder-style KV cache, so these formats are not applicable. The ⚠️ cells above mark that limitation, not partial support.

`INT8` and `SmoothQuant` quantization types are supported on Engine-Builder-LLM (v1) but rejected on BIS-LLM (v2). The v2 build raises an error at build time: use `FP8` or `FP4` instead, which provide better accuracy-to-compression ratios on modern GPUs.

### `no_quant` and pre-quantized checkpoints

Setting `quantization_type: no_quant` tells the engine to skip post-training quantization and use the checkpoint's native precision. This is the right choice in two scenarios:

1. **Unquantized FP16/BF16 checkpoints.** The engine uses the model's native dtype without any calibration step. This is the default for development and accuracy-critical deployments.

2. **Pre-quantized ModelOpt checkpoints.** Some Hugging Face repos ship with NVIDIA ModelOpt quantization already applied (indicated by an `hf_quant_config.json` file in the repo). For these checkpoints, set `quantization_type: no_quant`. The engine detects the ModelOpt config and applies the pre-baked quantization automatically. Attempting to re-quantize a ModelOpt checkpoint with a different `quantization_type` causes a build error.

**Example: deploying a pre-quantized ModelOpt checkpoint on BIS-LLM**

```yaml theme={"system"}
trt_llm:
  inference_stack: v2
  build:
    checkpoint_repository:
      source: HF
      repo: "nvidia/DeepSeek-V3.1-NVFP4"
    quantization_type: no_quant  # ModelOpt quantization detected from hf_quant_config.json
```

Non-ModelOpt pre-quantized checkpoints (for example, GPTQ or AWQ safetensors) are not supported. The build rejects them with an error.

### GPU support

| **GPU type** | `FP8` | `FP8_KV` | `FP4` | `FP4_KV` | `FP4_MLP_ONLY` |
| ------------ | ----- | -------- | ----- | -------- | -------------- |
| **L4**       | ✅     | ✅        | ❌     | ❌        | ❌              |
| **H100**     | ✅     | ✅        | ❌     | ❌        | ❌              |
| **H200**     | ✅     | ✅        | ❌     | ❌        | ❌              |
| **B200**     | ✅     | ✅        | ✅     | ✅        | ✅              |

## Model recommendations

Some model families have specific quantization requirements that affect accuracy.

### Qwen2 models

Qwen2 retains QKV projection bias (attention bias), while Qwen3, Llama3, Llama2, and most other models remove it. This makes Qwen2 sensitive to symmetric KV cache quantization, so `FP8_KV` causes quality degradation. Use regular `FP8` instead and increase calibration size to 1024 or greater for better accuracy.

### Llama models

Llama variants work well with `FP8_KV` and standard calibration sizes (1024-1536). For B200 deployments, use `FP4_MLP_ONLY` for the best balance of speed and quality.

### BEI models (embeddings)

Use `FP8` for causal embedding models. Skip quantization for smaller models, since the overhead isn't worth the minimal benefit. BEI-Bert models don't support quantization. BEI doesn't support `FP8_KV` or other `_KV` formats because encoder models have no KV cache to quantize.

## Calibration

Quantization requires calibration data to determine optimal scaling factors. Larger models generally need more calibration samples.

### Calibration datasets

The default dataset is `cnn_dailymail` (general news text). For specialized models, or fine-tunes specific to a chat template, use domain-specific datasets when available.
To use a custom dataset, reference its Hugging Face name under `calib_dataset`, and make sure the dataset has a `train` split with a `text` or `messages` column.

When using the `messages` column, your model's tokenizer must provide an `apply_chat_template()` function, which is applied to every row as `apply_chat_template(row["messages"]) for row in rows`.
If you want to use a dataset without preprocessing, you can provide a `text` column.

For chat-based calibration with thinking, we have open-sourced [`baseten/quant_calibration_dataset_v1`](https://huggingface.co/datasets/baseten/quant_calibration_dataset_v1) as an example.

### Calibration configuration

```yaml theme={"system"}
quantization_config:
  calib_size: 768                    # Number of samples
  calib_dataset: "abisee/cnn_dailymail"  # Dataset name
  calib_max_seq_length: 1024          # Max sequence length
```

Increase `calib_size` for larger models. Use domain-specific datasets when available for better accuracy on specialized tasks.

## Hardware requirements

`FP4` quantization requires B200 GPUs. `FP8` runs on L4 and above.

| **Quantization** | **Minimum GPU** | **Recommended GPU** | **Memory reduction** |
| ---------------- | --------------- | ------------------- | -------------------- |
| `FP16`/`BF16`    | A100            | H100                | None                 |
| `FP8`            | L4              | H100                | \~50%                |
| `FP8_KV`         | L4              | H100                | \~60%                |
| `FP4`            | B200            | B200                | \~75%                |
| `FP4_KV`         | B200            | B200                | \~80%                |

### Configuration examples

**Engine-Builder-LLM:**

```yaml theme={"system"}
trt_llm:
  build:
    base_model: decoder
    quantization_type: fp8
    quantization_config:
      calib_size: 1024
```

**BIS-LLM:**

```yaml theme={"system"}
trt_llm:
  inference_stack: v2
  build:
    quantization_type: fp8
    quantization_config:
      calib_size: 1024
  runtime:
    max_seq_len: 32768
```

**BEI:**

```yaml theme={"system"}
trt_llm:
  build:
    base_model: encoder
    quantization_type: fp8
    max_num_tokens: 16384
```

Set `quantization_type` in the build section and add `quantization_config` to customize calibration. BIS-LLM uses `inference_stack: v2` while Engine-Builder-LLM uses `base_model: decoder`.

## Best practices

### When to use quantization

Use `FP8` for production deployments to achieve cost-effective scaling. For memory-constrained environments, `FP8_KV` or `FP4` variants provide additional memory reduction. Quantization becomes essential for models over 15B parameters where memory and cost savings are significant.

### When to avoid quantization

Skip quantization when maximum accuracy is critical. Use `FP16`/`BF16` instead. Small models under 8B parameters see minimal benefit from quantization. BEI-Bert models don't support quantization at all. During research and development, `FP16` provides faster iteration without calibration overhead.

### Optimization tips

Use calibration datasets that match your domain for best accuracy. Test quantized models with your specific data before production deployment. Monitor the accuracy vs. performance trade-off and consider your hardware constraints when selecting quantization type.

## Related

* [Configure Engine-Builder-LLM quantization](/engines/engine-builder-llm/engine-builder-config): Dense model build options.
* [Configure BIS-LLM quantization](/engines/bis-llm/bis-llm-config): MoE model build options.
* [Configure BEI quantization](/engines/bei/bei-reference): Embedding model build options.
