> ## Documentation Index
> Fetch the complete documentation index at: https://docs.baseten.co/llms.txt
> Use this file to discover all available pages before exploring further.

# Advanced features for BIS-LLM

> KV-aware routing, disaggregated serving, and speculative decoding

export const SpeculativeDecoding = () => {
  const ref = React.useRef(null);
  const init = React.useRef(false);
  React.useEffect(() => {
    if (!ref.current || init.current) return;
    init.current = true;
    const W = 640, H = 196, padL = 16, padR = 16, labelW = 112;
    const N = 12, RUN = 3, cycleDur = 1.15, holdDur = 1.6;
    const TOK = ["The", "cat", "sat", "on", "the", "mat", "and", "watched", "the", "rain", "fall", "."];
    const MODES = [{
      name: "NGram",
      drafter: "n-gram guess",
      blurb: "N-gram automata propose the next run from pattern matches in the context. No model runs to draft."
    }, {
      name: "Eagle",
      drafter: "Eagle head",
      blurb: "A lightweight Eagle head drafts the next run from the model's hidden state."
    }, {
      name: "MTP",
      drafter: "MTP layers",
      blurb: "The model's own multi-token-prediction layers draft several tokens per step."
    }];
    const isDark = () => document.documentElement.classList.contains("dark");
    const C = () => isDark() ? {
      sub: "#869089",
      body: "#dee4de",
      brd: "#344339",
      box: "#0C1D13",
      stripBg: "#0C1D13",
      stripBrd: "#203026",
      ok: ["#17D465", "rgba(23,212,101,0.22)"],
      draft: ["#d6a52a", "rgba(214,165,42,0.16)"]
    } : {
      sub: "#869089",
      body: "#021309",
      brd: "#dee4de",
      box: "#ffffff",
      stripBg: "#f4f9f3",
      stripBrd: "#dee4de",
      ok: ["#0e863f", "rgba(178,247,207,0.7)"],
      draft: ["#9c7400", "rgba(156,116,0,0.14)"]
    };
    function setRich(el, s) {
      el.replaceChildren();
      const parts = s.split("`");
      for (let i = 0; i < parts.length; i++) {
        if (i % 2 === 0) {
          if (parts[i]) el.appendChild(document.createTextNode(parts[i]));
        } else {
          const c = document.createElement("code");
          c.textContent = parts[i];
          c.style.cssText = "font-family:ui-monospace,Menlo,monospace;font-size:0.92em;background:" + (isDark() ? "rgba(255,255,255,0.08)" : "rgba(0,0,0,0.05)") + ";padding:1px 4px;border-radius:3px";
          el.appendChild(c);
        }
      }
    }
    let mode = 0, t = 0, hold = 0, visible = true, raf = 0, last = 0, col;
    const ctrl = document.createElement("div");
    ctrl.style.cssText = "display:flex;flex-wrap:wrap;gap:6px;margin:0 0 10px";
    const btns = MODES.map((m, i) => {
      const b = document.createElement("button");
      b.textContent = m.name;
      b.type = "button";
      b.addEventListener("click", () => {
        mode = i;
        t = 0;
        hold = 0;
        style();
      });
      ctrl.appendChild(b);
      return b;
    });
    const cv = document.createElement("canvas");
    cv.style.cssText = "display:block;width:100%;max-width:" + W + "px;touch-action:pan-y";
    const ctx = cv.getContext("2d");
    const dpr = window.devicePixelRatio || 1;
    cv.width = W * dpr;
    cv.height = H * dpr;
    cv.style.height = H + "px";
    ctx.scale(dpr, dpr);
    const strip = document.createElement("div");
    const sDot = document.createElement("span");
    sDot.style.cssText = "flex:0 0 auto;width:8px;height:8px;border-radius:50%;margin-top:6px";
    const sTxt = document.createElement("div");
    sTxt.style.cssText = "flex:1;min-width:0";
    const sTit = document.createElement("div");
    sTit.style.cssText = "font:500 11px ui-monospace,Menlo,monospace;letter-spacing:-0.28px;color:#869089;margin:0 0 2px";
    const sBod = document.createElement("div");
    sBod.style.cssText = "font:400 13px/1.4 system-ui,-apple-system,sans-serif;margin:0";
    sTxt.appendChild(sTit);
    sTxt.appendChild(sBod);
    strip.appendChild(sDot);
    strip.appendChild(sTxt);
    ref.current.appendChild(ctrl);
    ref.current.appendChild(cv);
    ref.current.appendChild(strip);
    const trackX0 = padL + labelW, trackX1 = W - padR, ch = 22, yGuess = 62, yOut = 130;
    ctx.font = "600 9px ui-monospace,Menlo,monospace";
    const avail = trackX1 - trackX0, tokPad = 10;
    const natural = TOK.map(x => ctx.measureText(x).width + tokPad);
    const scale = avail / natural.reduce((a, b) => a + b, 0);
    const tokPx = Math.max(7, 9 * Math.min(1, scale));
    const cellX = [], cellW = [];
    {
      let acc = trackX0;
      for (const n of natural) {
        const w = n * scale;
        cellX.push(acc);
        cellW.push(w);
        acc += w;
      }
    }
    function rr(x, y, w, h, r) {
      ctx.beginPath();
      ctx.roundRect(x, y, w, h, r);
    }
    function tokText(i, cx, cy, color, alpha) {
      ctx.globalAlpha = alpha;
      ctx.fillStyle = color;
      ctx.font = "600 " + tokPx + "px ui-monospace,Menlo,monospace";
      ctx.textAlign = "center";
      ctx.textBaseline = "middle";
      ctx.fillText(TOK[i], cx, cy + 0.5);
      ctx.globalAlpha = 1;
    }
    function cellOutline(i, y) {
      const x = cellX[i] + 1, w = cellW[i] - 2, top = y - ch / 2;
      rr(x, top, w, ch, 3);
      ctx.strokeStyle = col.brd;
      ctx.lineWidth = 1;
      ctx.stroke();
    }
    function draw() {
      col = C();
      ctx.clearRect(0, 0, W, H);
      const m = MODES[mode];
      const total = Math.ceil(N / RUN);
      const done = t >= total * cycleDur;
      const cf = t / cycleDur;
      const cycle = done ? total : Math.floor(cf);
      const phase = done ? 1 : cf - cycle;
      const accepted = Math.min(N, cycle * RUN);
      const runN = !done && accepted < N ? Math.min(RUN, N - accepted) : 0;
      const passes = done ? total : Math.min(total, cycle + 1);
      const drafting = phase < 0.42, verifying = phase >= 0.42 && phase < 0.74, accepting = phase >= 0.74;
      ctx.font = "500 9px ui-monospace,Menlo,monospace";
      ctx.fillStyle = col.sub;
      ctx.textAlign = "left";
      ctx.textBaseline = "alphabetic";
      ctx.fillText("each verify is one big-model pass", padL, 18);
      ctx.textAlign = "right";
      ctx.textBaseline = "middle";
      ctx.font = "600 11px ui-monospace,Menlo,monospace";
      ctx.fillStyle = col.draft[0];
      ctx.fillText(m.drafter, trackX0 - 10, yGuess);
      ctx.font = "600 11px ui-monospace,Menlo,monospace";
      ctx.fillStyle = col.ok[0];
      ctx.fillText("output", trackX0 - 10, yOut - 4);
      ctx.font = "500 9px ui-monospace,Menlo,monospace";
      ctx.fillStyle = col.sub;
      ctx.fillText(passes + (passes === 1 ? " pass" : " passes"), trackX0 - 10, yOut + 9);
      for (let i = 0; i < runN; i++) {
        const idx = accepted + i;
        let a = 1;
        if (drafting) a = Math.min(1, phase / 0.42); else if (accepting) a = Math.max(0, 1 - (phase - 0.74) / 0.26);
        const x = cellX[idx] + 1, w = cellW[idx] - 2, top = yGuess - ch / 2;
        ctx.globalAlpha = a;
        ctx.setLineDash([3, 2]);
        rr(x, top, w, ch, 3);
        ctx.fillStyle = col.draft[1];
        ctx.fill();
        ctx.strokeStyle = col.draft[0];
        ctx.lineWidth = 1;
        ctx.stroke();
        ctx.setLineDash([]);
        ctx.globalAlpha = 1;
        tokText(idx, cellX[idx] + cellW[idx] / 2, yGuess, col.draft[0], a * 0.95);
      }
      if (runN > 0) {
        const cx = (cellX[accepted] + cellX[accepted + runN - 1] + cellW[accepted + runN - 1]) / 2;
        const y1 = yGuess + ch / 2 + 2, y2 = yOut - ch / 2 - 2;
        const hot = verifying || accepting;
        ctx.strokeStyle = hot ? col.ok[0] : col.brd;
        ctx.lineWidth = hot ? 1.5 : 1;
        ctx.beginPath();
        ctx.moveTo(cx, y1);
        ctx.lineTo(cx, y2);
        ctx.stroke();
        ctx.fillStyle = hot ? col.ok[0] : col.brd;
        ctx.beginPath();
        ctx.moveTo(cx, y2 + 1);
        ctx.lineTo(cx - 3.5, y2 - 4);
        ctx.lineTo(cx + 3.5, y2 - 4);
        ctx.fill();
        ctx.font = "600 9px ui-monospace,Menlo,monospace";
        const tag = "big model", tw = ctx.measureText(tag).width + 14;
        rr(cx - tw / 2, (y1 + y2) / 2 - 8, tw, 16, 4);
        ctx.fillStyle = col.box;
        ctx.fill();
        ctx.strokeStyle = hot ? col.ok[0] : col.sub;
        ctx.lineWidth = 1;
        ctx.stroke();
        ctx.fillStyle = hot ? col.ok[0] : col.sub;
        ctx.textAlign = "center";
        ctx.textBaseline = "middle";
        ctx.fillText(tag, cx, (y1 + y2) / 2);
      }
      for (let i = 0; i < N; i++) {
        if (i < accepted) {
          const x = cellX[i] + 1, w = cellW[i] - 2, top = yOut - ch / 2;
          rr(x, top, w, ch, 3);
          ctx.fillStyle = col.ok[1];
          ctx.fill();
          ctx.strokeStyle = col.ok[0];
          ctx.lineWidth = 1;
          ctx.stroke();
          tokText(i, cellX[i] + cellW[i] / 2, yOut, col.ok[0], 1);
        } else if (runN > 0 && i >= accepted && i < accepted + runN && accepting) {
          const a = (phase - 0.74) / 0.26, x = cellX[i] + 1, w = cellW[i] - 2, top = yOut - ch / 2;
          ctx.globalAlpha = a;
          rr(x, top, w, ch, 3);
          ctx.fillStyle = col.ok[1];
          ctx.fill();
          ctx.strokeStyle = col.ok[0];
          ctx.lineWidth = 1;
          ctx.stroke();
          ctx.globalAlpha = 1;
          tokText(i, cellX[i] + cellW[i] / 2, yOut, col.ok[0], a);
        } else {
          cellOutline(i, yOut);
        }
      }
      const e = done ? ["Same output, " + total + " model passes", "The big model ran once per verified run instead of once per token, so " + N + " tokens cleared in " + total + " passes. Only the draft source changes between " + MODES.map(x => x.name).join(", ") + "."] : ["Draft with " + m.name, m.blurb + " The big model verifies the whole run in one forward pass and accepts the tokens that match, so several tokens clear per pass."];
      sDot.style.background = col.ok[0];
      setRich(sTit, e[0]);
      setRich(sBod, e[1]);
      sBod.style.color = col.body;
      strip.style.cssText = "display:flex;align-items:flex-start;gap:10px;padding:10px 14px;margin:10px 0 0;border-radius:6px;height:88px;overflow:hidden;background:" + col.stripBg + ";border:1px solid " + col.stripBrd;
    }
    function style() {
      col = C();
      btns.forEach((b, i) => {
        const on = i === mode;
        b.style.cssText = "font:500 12px system-ui,-apple-system,sans-serif;padding:4px 11px;border-radius:6px;cursor:pointer;border:1px solid " + (on ? col.ok[0] : col.brd) + ";background:" + (on ? col.ok[0] : "transparent") + ";color:" + (on ? isDark() ? "#021309" : "#fff" : col.sub);
      });
    }
    const io = new IntersectionObserver(en => visible = en[0].isIntersecting, {
      threshold: 0.1
    });
    io.observe(cv);
    const themeObs = new MutationObserver(() => {
      style();
      draw();
    });
    themeObs.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ["class"]
    });
    function loop(ts) {
      raf = requestAnimationFrame(loop);
      if (!visible) {
        last = ts;
        return;
      }
      const dt = last ? Math.min(0.05, (ts - last) / 1000) : 0;
      last = ts;
      const total = Math.ceil(N / RUN);
      if (t >= total * cycleDur) {
        hold += dt;
        if (hold > holdDur) {
          t = 0;
          hold = 0;
        }
      } else {
        t += dt;
      }
      draw();
    }
    style();
    draw();
    raf = requestAnimationFrame(loop);
    return () => {
      cancelAnimationFrame(raf);
      io.disconnect();
      themeObs.disconnect();
      ctrl.remove();
      cv.remove();
      strip.remove();
      init.current = false;
    };
  }, []);
  return <div ref={ref} />;
};

BIS-LLM ships three Enterprise-gated production features that target distinct bottlenecks in large-scale LLM serving: KV-aware routing reduces time-to-first-token on repeated prefixes, disaggregated serving prevents long prefills from blocking decode latency, and speculative decoding raises throughput on a single replica. Each section uses the same shape: how it works, configuration, when to use it, and the metric to watch.

All three are configured through the BIS-LLM Management API (`POST /v1/llm_models`) under the `llm_config` block, not through Truss `config.yaml`. To enable any of them on your deployment, [contact your Baseten representative](mailto:support@baseten.co).

## KV-aware routing

Long prompts repeat context across requests. Without cache-aware routing, each worker rebuilds KV state from scratch on every request, even when another worker in the deployment already has the prefix cached. The KV-aware router maintains a real-time index of every worker's KV cache contents and picks the worker most likely to serve a request from cache.

### How it works

The router runs as a stateful service in front of the BIS-LLM worker pool. For each incoming request:

1. The frontend tokenizes the prompt and calls the router for a worker assignment.
2. The router scores each worker against the prompt's tokens using a radix tree that indexes every worker's KV cache.
3. The router returns the worker most likely to serve the request from cache, balanced against current worker load.
4. The frontend sends the request directly to that worker.

Workers publish KV cache block events as blocks are added or evicted; the router consumes those events to keep its index in sync. The router periodically writes index snapshots to persistent storage so it can recover state on restart without replaying every event.

### Configuration

Settings live under `b10_routing_config`. Defaults match production Model APIs and rarely need to change.

```json theme={"system"}
{
  "b10_routing_config": {
    "router_queue_policy": "fcfs",
    "router_overlap_score_weight": 3.5,
    "router_temperature": 0.05
  }
}
```

<ParamField body="router_queue_policy" type="string" default="fcfs">
  How queued requests are ordered when all workers are saturated. Queueing rarely triggers under normal load.

  * `fcfs`: First-come, first-served with priority bumps. Optimizes tail TTFT and provides fairness.
  * `wspt`: Weighted shortest processing time. Prioritizes cheaper requests (high cache hit, short prompts). Risks starving costly requests; use when average TTFT matters more than tail TTFT.
</ParamField>

<ParamField body="router_overlap_score_weight" type="number" default="3.5">
  Bias toward cache hits versus load balance. Higher values bias toward cache hits at the cost of balance; lower values bias toward balance at the cost of hits. [Contact us](mailto:support@baseten.co) before changing in production.
</ParamField>

<ParamField body="router_temperature" type="number" default="0.05">
  Randomness in worker selection. Higher values spread load across more workers; lower values concentrate hits on fewer workers. [Contact us](mailto:support@baseten.co) before changing in production.
</ParamField>

A single active router becomes a bottleneck above roughly 50 workers. For larger deployments, the router can run as multiple active replicas that share in-flight request state. [Contact us](mailto:support@baseten.co) to add router replicas.

### When to use

KV-aware routing is on by default for BIS-LLM deployments and pays off whenever prompts share prefixes: agent loops, chat with long system messages, RAG pipelines reusing retrieved context, and code completion. Workloads with no prefix overlap (unique single-turn prompts) see only the load-balancing benefit.

### Monitoring

| Metric                          | What it measures                                              | What to look for                                                                                             |
| ------------------------------- | ------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------ |
| `kv_cache_hit_rate`             | Actual KV cache hit rate observed by workers.                 | Baseline varies by model and traffic. Track changes over time, not absolute values.                          |
| `kv_cache_hit_rate_skew`        | Router's estimated hit rate minus actual hit rate.            | Typically slightly positive (\~+10%). Large positive: high cache churn. Large negative: missed event stream. |
| `kv_cache_best_prefix_hit_rate` | Best hit rate the router could have selected given its index. | Upper bound of routing quality for the current index.                                                        |
| `kv_cache_hit_rate_efficiency`  | Ratio of actual hit rate to best possible.                    | Typically 90-100%. Lower values mean the router is trading hits for balance.                                 |

## Disaggregated serving

In a standard deployment, each replica handles both prefill (prompt processing) and decode (token generation). When a long prompt arrives, the replica must finish prefill before it can decode any tokens, blocking shorter requests queued behind it.

### How it works

Disaggregated serving splits prefill and decode into separate replica groups:

* **Prefill replicas** process input prompts and transfer the resulting KV cache to decode replicas.
* **Decode replicas** receive KV cache from prefill replicas and generate output tokens.

Each phase scales independently based on its own load. A long prefill never blocks decode latency on other replicas.

### Configuration

Set `is_disaggregated` and `b10_disagg_config` in the `llm_config` block:

```json theme={"system"}
{
  "is_disaggregated": true,
  "b10_disagg_config": {
    "prefill_workers_per_replica": 1,
    "decode_workers_per_replica": 2
  }
}
```

<ParamField body="is_disaggregated" type="boolean" required>
  Enables disaggregated serving. Must be `true` for `b10_disagg_config` to take effect. Setting `b10_disagg_config` without `is_disaggregated: true` fails validation.
</ParamField>

<ParamField body="b10_disagg_config.prefill_workers_per_replica" type="integer" required>
  Prefill worker pods per replication unit. Must be an integer >= 1.
</ParamField>

<ParamField body="b10_disagg_config.decode_workers_per_replica" type="integer" required>
  Decode worker pods per replication unit. Must be an integer >= 1.
</ParamField>

The two worker counts define a **replication unit**: the smallest independently scalable group. A `prefill: 1, decode: 2` configuration means each unit has one prefill pod and two decode pods. The autoscaler scales the number of units, not individual pods.

The backend rejects deployments where `is_disaggregated` is `false` or absent but `b10_disagg_config` is set, and rejects deployments where `is_disaggregated` is `true` but either worker count is missing or less than one.

### When to use

Disaggregated serving fits deployments with at least one of these traits:

* **Mismatched prefill and decode resource profiles.** Long-context models (128K+ tokens) have compute-heavy prefills and memory-bound decodes. Separate scaling right-sizes each phase.
* **Strict TTFT targets.** Isolating prefill on dedicated replicas prevents decode requests from queuing behind long prompts.
* **Variable prompt lengths.** Mixed short/long workloads benefit more than uniform traffic.

For consistent prompt lengths or workloads where TTFT is not a bottleneck, aggregated serving is simpler and sufficient.

### Monitoring

Watch [BIS-LLM autoscaling metrics](/engines/performance-concepts/autoscaling-engines#bis-llm) on each replica group. Token-based autoscaling sizes prefill and decode independently using their own in-flight token counts.

## Speculative decoding

Speculative decoding accelerates inference by drafting several future tokens cheaply, then verifying them against the main model in a single forward pass. Accepted tokens advance the output; rejected tokens are discarded and the model resumes autoregressive decoding from the last accepted token.

### How it works

BIS-LLM speculative decoding uses a fast draft mechanism (a lightweight Eagle head, the model's own MTP layers, or n-gram automata) to generate candidate tokens. The main model then verifies these candidates in a single batched forward pass. Higher acceptance rates yield more tokens per forward pass and lower latency.

This is a different system from [lookahead decoding](/engines/engine-builder-llm/lookahead-decoding) in Engine-Builder-LLM (v1), which uses n-gram patterns within a single model and is configured with `trt_llm.build.speculator`. BIS-LLM (the v2 stack) rejects `trt_llm.build.speculator`; use `speculative_config` instead.

| Decoding type | How it works                                                                    | Best for                                                                   |
| ------------- | ------------------------------------------------------------------------------- | -------------------------------------------------------------------------- |
| `Eagle`       | Separate Eagle head drafts tokens from a hidden-state representation.           | Models with trained Eagle checkpoints.                                     |
| `MTP`         | The model's own multi-token-prediction layers draft multiple tokens per step.   | Models with MTP heads built in (DeepSeek-V3).                              |
| `NGram`       | N-gram automata predict tokens from pattern matching without model computation. | High-throughput workloads where latency matters more than acceptance rate. |

All three share the same loop: the draft mechanism proposes a run of tokens, the model verifies the whole run in one forward pass, and the matching tokens are accepted together. Only the draft source changes. Use the buttons below to switch between draft sources and compare:

<SpeculativeDecoding />

### Configuration

Set `speculative_config` in the `llm_config` block. The required fields depend on `decoding_type`.

<ParamField body="speculative_config.decoding_type" type="string" required>
  Speculative strategy. One of `Eagle`, `MTP`, or `NGram` (case-insensitive).
</ParamField>

<ParamField body="speculative_config.speculative_model_dir" type="string">
  Required when `decoding_type` is `Eagle`. Path to the Eagle head weights directory. BDN mirrors this as a standalone weight volume, separate from the main model weights.
</ParamField>

<ParamField body="speculative_config.num_nextn_predict_layers" type="integer">
  Required when `decoding_type` is `MTP`. Number of next-token prediction layers in the model architecture.
</ParamField>

<ParamField body="speculative_config.max_draft_len" type="integer">
  Optional. Maximum number of tokens the draft proposes per step. Raise it for more aggressive speculation; lower it if acceptance is poor.
</ParamField>

<ParamField body="speculative_config.eagle3_one_model" type="boolean">
  Optional, `Eagle` only. Run the Eagle3 draft head and the target model as a single fused model. Set to `true` for Eagle3 checkpoints that support it.
</ParamField>

Eagle example:

```json theme={"system"}
{
  "speculative_config": {
    "decoding_type": "Eagle",
    "speculative_model_dir": "/models/eagle",
    "max_draft_len": 3,
    "eagle3_one_model": true
  }
}
```

MTP example:

```json theme={"system"}
{
  "speculative_config": {
    "decoding_type": "MTP",
    "num_nextn_predict_layers": 1
  }
}
```

NGram example:

```json theme={"system"}
{
  "speculative_config": {
    "decoding_type": "NGram"
  }
}
```

### When to use

Pick by model architecture, not preference. Use `MTP` for DeepSeek-V3 and other models that ship MTP heads. Use `Eagle` when you have a trained Eagle head for the target model. Use `NGram` for high-throughput workloads where any acceleration helps and no draft model is available.

### Monitoring

The BIS-LLM dashboard exposes `speculation_rate` when speculative decoding is active: the percentage of draft tokens accepted by the main model.

* **Above 80%**: Draft is well-aligned with the main model. Speculation is effective.
* **40-80%**: Some rejections. Consider tuning the draft model or switching decoding types.
* **Below 40%**: Speculation likely costs more than it saves. Disable it or reduce draft length.

## Related

* [BIS-LLM overview](/engines/bis-llm/overview): Engine fundamentals and supported model families.
* [BIS-LLM configuration](/engines/bis-llm/bis-llm-config): Truss `config.yaml` reference for the build step.
* [Autoscaling BIS-LLM](/engines/performance-concepts/autoscaling-engines#bis-llm): Token-based autoscaling for prefill, decode, and aggregated replicas.
* [Lookahead decoding (v1)](/engines/engine-builder-llm/lookahead-decoding): N-gram speculation for Engine-Builder-LLM, when you need the v1 path.
