> ## Documentation Index
> Fetch the complete documentation index at: https://docs.baseten.co/llms.txt
> Use this file to discover all available pages before exploring further.

# Autoscaling engines

> Engine-specific autoscaling settings for BEI, Engine-Builder-LLM, and BIS-LLM

export const MiniTokenScaling = () => {
  const ref = React.useRef(null);
  const init = React.useRef(false);
  React.useEffect(() => {
    if (!ref.current || init.current) return;
    init.current = true;
    const PER = 7000;
    const REQS = [{
      at: 0.05,
      tok: 256
    }, {
      at: 0.18,
      tok: 256
    }, {
      at: 0.32,
      tok: 512
    }, {
      at: 0.45,
      tok: 256
    }, {
      at: 0.58,
      tok: 10000
    }];
    const TARGET_CONC = 10;
    const TARGET_UTIL = 0.7;
    const THRESHOLD_REQ = TARGET_CONC * TARGET_UTIL;
    const TARGET_TOK = 8000;
    const MAX_TOK = 12000;
    const opts = {
      title: "`target_in_flight_tokens`",
      desc: "A request-based autoscaler counts how many requests are open. The token-based autoscaler counts the work each request implies. Same five requests; very different decisions.",
      formula: "BIS-LLM: desired_replicas = ⌈in_flight_tokens / `target_in_flight_tokens`⌉",
      W: 580,
      H: 240,
      draw: function (ctx, t, p) {
        const h = window._mdHelpers;
        const ph = t % PER / PER;
        const x0 = 40, x1 = 540, w = x1 - x0;
        let inflightReq = 0;
        let inflightTok = 0;
        for (const r of REQS) if (ph > r.at) {
          inflightReq++;
          inflightTok += r.tok;
        }
        const overReq = inflightReq > THRESHOLD_REQ;
        const overTok = inflightTok > TARGET_TOK;
        ctx.font = "500 10px ui-monospace,Menlo,monospace";
        ctx.fillStyle = p.sub;
        ctx.textAlign = "left";
        ctx.textBaseline = "middle";
        ctx.fillText("Request-based view (standard autoscaler)", x0, 14);
        ctx.textAlign = "right";
        ctx.fillStyle = p.txt;
        ctx.fillText(inflightReq + " of " + TARGET_CONC + " slots", x1, 14);
        const slotW = 36, slotGap = 6, slotH = 22, slotY = 30;
        const rowW = TARGET_CONC * slotW + (TARGET_CONC - 1) * slotGap;
        const rowX = x0 + (w - rowW) / 2;
        for (let i = 0; i < TARGET_CONC; i++) {
          const sx = rowX + i * (slotW + slotGap);
          ctx.strokeStyle = p.brd;
          ctx.lineWidth = 0.8;
          ctx.globalAlpha = 0.7;
          ctx.beginPath();
          ctx.roundRect(sx, slotY, slotW, slotH, 3);
          ctx.stroke();
          ctx.globalAlpha = 1;
          if (i < inflightReq) {
            const arrAlpha = h.fade(ph, REQS[i].at, REQS[i].at + 0.06);
            ctx.fillStyle = overReq ? p.w : p.q;
            ctx.globalAlpha = 0.85 * arrAlpha;
            ctx.fillRect(sx + 1, slotY + 1, slotW - 2, slotH - 2);
            ctx.globalAlpha = 1;
          }
        }
        const thrX = rowX + THRESHOLD_REQ * (slotW + slotGap) - slotGap / 2;
        ctx.strokeStyle = p.sub;
        ctx.setLineDash([3, 3]);
        ctx.lineWidth = 1.1;
        ctx.globalAlpha = 0.9;
        ctx.beginPath();
        ctx.moveTo(thrX, slotY - 8);
        ctx.lineTo(thrX, slotY + slotH + 4);
        ctx.stroke();
        ctx.setLineDash([]);
        ctx.globalAlpha = 1;
        ctx.font = "500 9px ui-monospace,Menlo,monospace";
        ctx.fillStyle = p.sub;
        ctx.textAlign = "left";
        ctx.textBaseline = "bottom";
        ctx.fillText("threshold = 7", thrX + 5, slotY - 6);
        ctx.font = "500 11px ui-monospace,Menlo,monospace";
        ctx.textBaseline = "middle";
        ctx.fillStyle = overReq ? p.w : p.sub;
        ctx.textAlign = "left";
        const reqStatus = overReq ? "Above threshold. Scale up." : "Below threshold. No scale-up.";
        ctx.fillText(reqStatus, x0, 82);
        ctx.strokeStyle = p.brdM;
        ctx.lineWidth = 1;
        ctx.beginPath();
        ctx.moveTo(x0, 108);
        ctx.lineTo(x1, 108);
        ctx.stroke();
        ctx.font = "500 10px ui-monospace,Menlo,monospace";
        ctx.fillStyle = p.sub;
        ctx.textAlign = "left";
        ctx.textBaseline = "middle";
        ctx.fillText("Token-based view (BIS-LLM autoscaler)", x0, 122);
        ctx.textAlign = "right";
        ctx.fillStyle = overTok ? p.w : p.txt;
        const pct = Math.round(inflightTok / TARGET_TOK * 100);
        ctx.fillText(inflightTok.toLocaleString() + " of " + TARGET_TOK.toLocaleString() + " tokens (" + pct + "%)", x1, 122);
        const tokY = 140, tokH = 26;
        const PX_PER_TOK = w / MAX_TOK;
        const targetX = x0 + TARGET_TOK * PX_PER_TOK;
        ctx.strokeStyle = p.brd;
        ctx.lineWidth = 0.8;
        ctx.beginPath();
        ctx.roundRect(x0, tokY, w, tokH, 3);
        ctx.stroke();
        let cx = x0;
        for (let i = 0; i < REQS.length; i++) {
          const r = REQS[i];
          if (ph <= r.at) continue;
          const arrAlpha = h.fade(ph, r.at, r.at + 0.06);
          const segW = r.tok * PX_PER_TOK;
          const drawW = Math.min(segW, x1 - cx - 1);
          if (drawW > 0) {
            ctx.globalAlpha = 0.85 * arrAlpha;
            ctx.fillStyle = overTok ? p.w : p.q;
            ctx.fillRect(cx + 1, tokY + 1, drawW - 1, tokH - 2);
            ctx.globalAlpha = 1;
          }
          cx += segW;
        }
        ctx.strokeStyle = p.sub;
        ctx.setLineDash([3, 3]);
        ctx.lineWidth = 1.1;
        ctx.globalAlpha = 0.9;
        ctx.beginPath();
        ctx.moveTo(targetX, tokY - 8);
        ctx.lineTo(targetX, tokY + tokH + 4);
        ctx.stroke();
        ctx.setLineDash([]);
        ctx.globalAlpha = 1;
        ctx.font = "500 9px ui-monospace,Menlo,monospace";
        ctx.fillStyle = p.sub;
        ctx.textAlign = "left";
        ctx.textBaseline = "bottom";
        ctx.fillText("target = 8,000", targetX + 5, tokY - 6);
        ctx.font = "500 11px ui-monospace,Menlo,monospace";
        ctx.textBaseline = "middle";
        ctx.fillStyle = overTok ? p.w : p.sub;
        ctx.textAlign = "left";
        let tokStatus;
        if (!overTok) tokStatus = "Under target. No scale-up."; else {
          const reps = Math.ceil(inflightTok / TARGET_TOK);
          tokStatus = "Above target. Scale to " + reps + " replicas.";
        }
        ctx.fillText(tokStatus, x0, 188);
        ctx.font = "500 10px ui-monospace,Menlo,monospace";
        ctx.fillStyle = p.sub;
        ctx.textAlign = "left";
        ctx.textBaseline = "middle";
        let cap;
        if (ph < REQS[4].at) cap = "Four small requests arrive (256 to 512 tokens each). Both views agree: the deployment is fine."; else cap = "One 10,000-token request lands. The request view doesn't notice. The token view scales up.";
        ctx.fillText(cap, x0, 220);
      }
    };
    let cleanup = null, destroyed = false, timer = null, retries = 0;
    const tryMount = () => {
      if (destroyed || !ref.current) return;
      if (window._mdMount) {
        cleanup = window._mdMount(ref.current, opts);
      } else if (retries++ < 60) {
        timer = setTimeout(tryMount, 30);
      }
    };
    tryMount();
    return () => {
      destroyed = true;
      if (timer) clearTimeout(timer);
      if (cleanup) cleanup();
      init.current = false;
    };
  }, []);
  return <div ref={ref} />;
};

BEI, Engine-Builder-LLM, and BIS-LLM batch requests for throughput, so they need different autoscaling settings than standard models. BEI and Engine-Builder-LLM scale on **request concurrency** with engine-tuned targets. BIS-LLM scales on **target in-flight tokens** to account for the wide variance in LLM request size.

## Quick reference

| Setting                    | BEI                                             | Engine-Builder-LLM            |
| -------------------------- | ----------------------------------------------- | ----------------------------- |
| **Target utilization**     | 25%                                             | 40-50%                        |
| **Concurrency target**     | 96+ (min >= 8)                                  | 32-256                        |
| **Special considerations** | Use Performance client for multi-payload routes | Never exceed max\_batch\_size |

BIS-LLM uses a token-aware metric instead of request concurrency. See the [BIS-LLM](#bis-llm) section.

For general autoscaling concepts, see [Autoscaling](/deployment/autoscaling/overview).

***

## BEI

BEI provides millisecond-range inference times and scales differently than other models. With too few replicas, backpressure can build up quickly.

### Recommendations

| Setting            | Value              | Why                                             |
| ------------------ | ------------------ | ----------------------------------------------- |
| Target utilization | **25%**            | Low target provides headroom for traffic spikes |
| Concurrency target | **96+** (min >= 8) | High concurrency allows maximum throughput      |
| Autoscaling        | **Enabled**        | Required for variable traffic                   |

### Multi-payload routes

The `/rerank` and `/v1/embeddings` routes can send multiple items per request, which challenges request-based autoscaling. Each API call counts as one request regardless of how many items it contains.

Use the [Performance client](/inference/performance-client) for optimal scaling with multi-payload routes.

***

## Engine-Builder-LLM

Engine-Builder-LLM uses dynamic batching similar to BEI but doesn't face the multi-payload challenge.

### Recommendations

| Setting            | Value      | Why                                    |
| ------------------ | ---------- | -------------------------------------- |
| Target utilization | **40-50%** | Accommodates dynamic batching behavior |
| Concurrency target | **32-256** | Match or stay below max\_batch\_size   |
| Min concurrency    | **>= 8**   | Optimal performance floor              |

### Concurrency target vs `max_batch_size`

`concurrency_target` tells the autoscaler how many concurrent requests each replica should handle. `max_batch_size` tells the engine how many sequences to batch in a single forward pass. They measure different things: concurrency is a scaling signal, batch size is an engine limit.

Setting `concurrency_target` higher than `max_batch_size` causes on-replica queueing. Each replica receives more requests than the engine can batch, and the excess requests wait in its queue instead of triggering a scale-up to a new replica. Always keep `concurrency_target` at or below `max_batch_size`.

### Lookahead decoding

If using lookahead decoding, set the concurrency target equal to or slightly below `max_batch_size`. This allows lookahead to perform optimizations. This guidance applies to all Engine-Builder-LLM deployments, not just those using lookahead.

***

## BIS-LLM

BIS-LLM autoscales differently from Baseten's other engines. The [standard Baseten autoscaler](/deployment/autoscaling/overview) divides **in-flight requests** by a per-replica concurrency target to decide how many replicas to run. That works when requests cost about the same to serve, but LLM requests don't. One prompt might decode 50 tokens; the next might decode 10,000. Counting them as equal load over-provisions on short prompts and under-provisions on long ones.

The BIS-LLM engine scales on **target in-flight tokens** instead. An in-flight token is any token a replica is currently working on. The deployment API rejects `concurrency_target` and `target_utilization_percentage`. Configure scaling with `target_in_flight_tokens` only (replica bounds in the table below).

<MiniTokenScaling />

### How in-flight tokens are counted

The Planner's load measure is the sum of two per-worker counts:

* **Prefill tokens:** the uncached input tokens currently being processed across active requests. Tokens served from KV cache reuse do not count.
* **Decode tokens:** the full sequence length (input plus tokens generated so far) for every request currently decoding.

This is why request count alone misses the real load: a long-context decode with a 100K-token KV cache contributes 100K to the load measure even though it is "just one request."

The total across the deployment roughly equals `active_requests × average_tokens_per_request`, which makes targets easy to derive from request-based intuition.

### What you configure

You configure four standard fields plus a token target. All five are editable from the deployment's autoscaling settings in the Baseten UI.

```yaml config.yaml theme={"system"}
autoscaling_settings:
  min_replica: 1
  max_replica: 4
  autoscaling_window: 300   # seconds; recommended 300 (5 minutes)
  scale_down_delay: 300     # seconds; recommended 300 (5 minutes)

additional_autoscaling_config:
  metrics:
    - name: in_flight_tokens
      target: 40000
```

| Setting                       | What it controls                                                                                                                                                                                                       |
| ----------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `min_replica` / `max_replica` | Replica bounds. Scale-to-zero is not supported. `min_replica` defaults to `1` when omitted. Set `max_replica` to cap scale-up during cold starts.                                                                      |
| `autoscaling_window`          | Sliding window (in seconds) used to average in-flight tokens before making a scaling decision. Longer windows smooth out short spikes; shorter windows react faster. A 5-minute (300s) window is a reasonable default. |
| `scale_down_delay`            | Waiting period (in seconds) before removing replicas after load drops.                                                                                                                                                 |
| `metrics.target`              | Target in-flight tokens per replica. This is the primary knob to tune.                                                                                                                                                 |

### Set target in-flight tokens

For most LLMs, a target in the **50,000 to 150,000** range is a sensible starting point. From there:

* **Lower target:** more replicas at a given load. More headroom, higher cost.
* **Higher target:** fewer replicas at a given load. Less headroom, lower cost.

If you're coming from another engine and already have a request concurrency target in mind, convert it directly. The in-flight token count roughly equals `active_requests × average_tokens_per_request`, so:

```math theme={"system"}
target = concurrency\_target × average\_tokens\_per\_request
```

`average_tokens_per_request` is approximately `average_input_tokens + average_output_tokens`. For a model averaging 4K input and 1K output tokens at a concurrency of 10:

```math theme={"system"}
target = 5{,}000 × 10 = 50{,}000
```

Once a target is set, the autoscaler computes desired replicas as:

```math theme={"system"}
desired\_replicas = avg\_in\_flight\_tokens / target\_in\_flight\_tokens
```

Start conservatively and adjust based on observed latency.

### Graceful scale-down with `scale_down_half_life_seconds`

Kubernetes (through Knative) allows scale-down of up to **50% of replicas per step**. For most services this is fine, but BIS-LLM deployments hold KV cache state on each worker. A sudden 50% drop in replica count means a 50% loss of KV cache space, which causes a wave of cache misses and TTFT spikes for cache-sensitive workloads (long shared system prompts, multi-turn conversations).

`scale_down_half_life_seconds` applies **exponential decay** to the current replica count, lowering it gradually over the configured half-life rather than allowing a single large drop. The default is **900 seconds (15 minutes)**, which keeps KV cache erosion gradual. Set it shorter to release capacity faster and shed KV cache state more abruptly; set it longer to keep replicas (and their cache) around for more reuse.

This setting lives in `b10_autoscaling_config` in the `llm_config` block of the Management API (`POST /v1/llm_models`), not in Truss `config.yaml`. It is not configurable from the UI.

```json theme={"system"}
{
  "b10_autoscaling_config": {
    "scale_down_half_life_seconds": 900
  }
}
```

Recommended range: 600-1800 seconds. Setting it shorter risks the same abrupt KV cache loss the setting exists to prevent. Setting it longer keeps idle replicas running and wastes GPU spend.

### Known issues

Two failure modes are structural to the autoscaling loop, not configuration mistakes.

**Scale-up overshoot during rapid load increase.** Workers take time to start (model loading and warmup). Until they are healthy, they are not counted in the autoscaler's worker pool, so the Planner continues to see high per-worker load and keeps requesting more replicas. By the time all the new workers are healthy, the deployment may be over-provisioned.

Mitigation: set `max_replica` to cap the overshoot. Cold start time is the underlying constraint; there is no way to fully prevent this without reducing it.

**Scale-down thrashing and KV cache loss.** When workers scale down, their KV cache disappears with them. Aggressive or frequent scale-down forces full prefill on requests that would otherwise have hit cache (higher TTFT), and if many replicas drop at once a large fraction of total KV cache space vanishes simultaneously.

Mitigation: set `scale_down_half_life_seconds` to 600-1800 seconds and keep `scale_down_delay` modest. The half-life exists specifically to prevent abrupt large-scale downscales.

### Monitoring

The Planner emits autoscaler metrics directly. Start with `autoscaler_in_flight_tokens` to see what the autoscaler is currently observing, then reach for the averaged and policy-applied metrics when tuning.

| Metric                             | Type  | What it measures                                                                                                                                 |
| ---------------------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
| `autoscaler_in_flight_tokens`      | Gauge | Instantaneous in-flight tokens across all workers. The primary product-visible metric, labeled with `exported_namespace` and `model_version_id`. |
| `autoscaler_avg_in_flight_tokens`  | Gauge | Sliding-window average used for scaling decisions.                                                                                               |
| `autoscaler_avg_num_requests`      | Gauge | Sliding-window average request count across all workers.                                                                                         |
| `autoscaler_avg_num_workers`       | Gauge | Sliding-window average healthy worker count. The denominator for per-worker load.                                                                |
| `autoscaler_desired_scale`         | Gauge | Raw desired scale from the token-based autoscaler, before policy.                                                                                |
| `autoscaler_policy_desired_scale`  | Gauge | Desired scale after policy is applied.                                                                                                           |
| `autoscaler_rounded_desired_scale` | Gauge | Final integer scale sent to Kubernetes.                                                                                                          |

What to watch:

* `autoscaler_rounded_desired_scale` pinned at `max_replica` for extended periods means the deployment is capacity-constrained. Raise the cap or the target.
* A large persistent gap between `autoscaler_desired_scale` and the actual replica count means scaling is too slow in one direction. Tune `autoscaling_window` for scale-up or `scale_down_half_life_seconds` for scale-down.

***

## Related

* [Configure autoscaling parameters](/deployment/autoscaling/overview): Full parameter reference.
* [Match autoscaling to your traffic pattern](/deployment/autoscaling/traffic-patterns): Pattern-specific settings.
* [Deploy BEI embedding models](/engines/bei/overview): General BEI documentation.
* [Deploy Engine-Builder-LLM models](/engines/engine-builder-llm/overview): Generation model details.
* [Deploy BIS-LLM models](/engines/bis-llm/overview): MoE and advanced LLM engine details.
* [Maximize throughput with the Performance Client](/inference/performance-client): Client usage for batch processing.