> ## Documentation Index
> Fetch the complete documentation index at: https://docs.baseten.co/llms.txt
> Use this file to discover all available pages before exploring further.

# Cold starts

> Learn what makes a cold start slow and how to shrink it for your model.

export const MiniColdStart = () => {
  const ref = React.useRef(null);
  const init = React.useRef(false);
  React.useEffect(() => {
    if (!ref.current || init.current) return;
    init.current = true;
    const PER = 9000;
    const STAGES = [["Scaled to zero", 0], ["Waking up", 1 / 3], ["Loading model", 2 / 3], ["Active", 1]];
    const STAGE_PH = {
      "Scaled to zero": 0.01,
      "Waking up": 0.12,
      "Loading model": 0.26,
      "Active": 0.65
    };
    const SEGS = [["container pull", 0.06, 0.18, 74], ["weight load", 0.18, 0.34, 100], ["engine init", 0.34, 0.44, 62]];
    const QX = 40, QY = 84, QW = 110, QH = 32;
    const RX = 190, RY = 84, RW = 240, RH = 32;
    const BX = 190, BY = 132, BH = 8;
    let dotRadii = [2.5, 2.5, 2.5, 2.5];
    let frozen = null;
    const opts = {
      title: "Cold start",
      desc: "Click a status to freeze the loop there. Click anywhere else to resume.",
      formula: "cold start = container pull + weight load + engine initialization",
      W: 580,
      H: 215,
      onClick: function (x, y) {
        if (y > 10 && y < 48) {
          let nearest = null, nearestDist = Infinity;
          STAGES.forEach(s => {
            const sx = 40 + s[1] * 500;
            const d = Math.abs(x - sx);
            if (d < nearestDist) {
              nearestDist = d;
              nearest = s[0];
            }
          });
          if (nearestDist < 90) frozen = frozen === nearest ? null : nearest;
        } else {
          frozen = null;
        }
      },
      draw: function (ctx, t, p) {
        const h = window._mdHelpers;
        const ph = frozen != null ? STAGE_PH[frozen] : t % PER / PER;
        let visState = "stopped", status = "Scaled to zero";
        if (ph < 0.06) {
          visState = "stopped";
          status = "Scaled to zero";
        } else if (ph < 0.18) {
          visState = "starting";
          status = "Waking up";
        } else if (ph < 0.44) {
          visState = "starting";
          status = "Loading model";
        } else if (ph < 0.50) {
          visState = "ready";
          status = "Active";
        } else if (ph < 0.84) {
          visState = "busy";
          status = "Active";
        } else if (ph < 0.94) {
          visState = "stopping";
          status = "Active";
        } else {
          visState = "stopped";
          status = "Scaled to zero";
        }
        const prog = h.fade(ph, 0.06, 0.44);
        ctx.strokeStyle = p.brd;
        ctx.lineWidth = 0.8;
        ctx.beginPath();
        ctx.moveTo(40, 36);
        ctx.lineTo(540, 36);
        ctx.stroke();
        STAGES.forEach((s, i) => {
          const sx = h.lerp(40, 540, s[1]);
          const cur = status === s[0];
          dotRadii[i] += ((cur ? 4.2 : 2.5) - dotRadii[i]) * 0.18;
          ctx.fillStyle = cur ? p.p : p.brd;
          ctx.beginPath();
          ctx.arc(sx, 36, dotRadii[i], 0, Math.PI * 2);
          ctx.fill();
          ctx.font = (cur ? "600 " : "500 ") + "9.5px ui-monospace,Menlo,monospace";
          ctx.fillStyle = cur ? p.txt : p.sub;
          ctx.textAlign = "center";
          ctx.textBaseline = "middle";
          ctx.fillText(s[0], sx, 22);
        });
        ctx.globalAlpha = 0.8;
        ctx.setLineDash([3, 3]);
        ctx.beginPath();
        ctx.roundRect(QX, QY, QW, QH, 6);
        ctx.strokeStyle = p.brd;
        ctx.lineWidth = 1.3;
        ctx.stroke();
        ctx.setLineDash([]);
        ctx.font = "500 9.5px ui-monospace,Menlo,monospace";
        ctx.fillStyle = p.sub;
        ctx.textAlign = "left";
        ctx.textBaseline = "middle";
        ctx.fillText("Queue", QX + 10, QY + QH / 2);
        ctx.globalAlpha = 1;
        h.repBox(ctx, RX, RY, RW, RH, visState, "Replica", prog, p, status);
        const barA = h.fade(ph, 0.04, 0.07) * (1 - h.fade(ph, 0.84, 0.92)) * h.lerp(1, 0.55, h.fade(ph, 0.44, 0.52));
        if (barA > 0.01) {
          ctx.globalAlpha = barA;
          let sx = BX;
          SEGS.forEach(seg => {
            const frac = h.fade(ph, seg[1], seg[2]);
            const active = ph >= seg[1] && ph < seg[2];
            ctx.strokeStyle = p.brd;
            ctx.lineWidth = 1;
            ctx.beginPath();
            ctx.roundRect(sx, BY, seg[3], BH, 2);
            ctx.stroke();
            if (frac > 0) {
              ctx.fillStyle = frac >= 1 ? p.p : p.w;
              ctx.beginPath();
              ctx.roundRect(sx, BY, seg[3] * frac, BH, 2);
              ctx.fill();
            }
            ctx.font = (active ? "600 " : "500 ") + "8.5px ui-monospace,Menlo,monospace";
            ctx.fillStyle = active ? p.w : p.sub;
            ctx.textAlign = "center";
            ctx.textBaseline = "middle";
            ctx.fillText(seg[0], sx + seg[3] / 2, BY + BH + 11);
            sx += seg[3] + 2;
          });
          ctx.globalAlpha = 1;
        }
        const reqVis = ph > 0.03 && ph < 0.84;
        let rx = 128;
        const ry = QY + QH / 2;
        if (ph < 0.06) {
          rx = h.lerp(14, 128, h.fade(ph, 0.03, 0.06));
        } else if (ph < 0.44) {
          rx = 128 + Math.sin(t * 0.005) * 2;
        } else {
          rx = h.lerp(128, RX + RW / 2, h.fade(ph, 0.44, 0.50));
        }
        if (reqVis) {
          const a = h.fade(ph, 0.03, 0.05) * (1 - h.fade(ph, 0.80, 0.84));
          ctx.globalAlpha = a;
          ctx.fillStyle = ph < 0.44 ? p.w : p.p;
          ctx.beginPath();
          ctx.arc(rx, ry, 4.5, 0, Math.PI * 2);
          ctx.fill();
          ctx.globalAlpha = 1;
        }
        ctx.font = "500 10px ui-monospace,Menlo,monospace";
        ctx.fillStyle = p.sub;
        ctx.textAlign = "left";
        ctx.textBaseline = "middle";
        let cap = "";
        if (ph < 0.03) cap = "No replicas running, deployment is Scaled to zero"; else if (ph < 0.06) cap = "Request arrives and triggers a scale-up"; else if (ph < 0.18) cap = "Container pull: image layers stream to the node, request waits"; else if (ph < 0.34) cap = "Weight load: weights move into GPU memory, request waits"; else if (ph < 0.44) cap = "Engine initialization: CUDA graphs and kernel compilation"; else if (ph < 0.50) cap = "Replica is Active and admits the queued request"; else if (ph < 0.84) cap = "Replica serves the request"; else if (ph < 0.94) cap = "Idle past scale_down_delay, the replica winds down"; else cap = "No replicas running, deployment is Scaled to zero";
        ctx.fillText(cap, 40, 188);
      }
    };
    let cleanup = null, destroyed = false, timer = null, retries = 0;
    const tryMount = () => {
      if (destroyed || !ref.current) return;
      if (window._mdMount) {
        cleanup = window._mdMount(ref.current, opts);
      } else if (retries++ < 60) {
        timer = setTimeout(tryMount, 30);
      }
    };
    tryMount();
    return () => {
      destroyed = true;
      if (timer) clearTimeout(timer);
      if (cleanup) cleanup();
      init.current = false;
    };
  }, []);
  return <div ref={ref} />;
};

export const MiniDiagramEngine = () => {
  React.useEffect(() => {
    if (window._mdMount) return;
    const isDark = () => document.documentElement.classList.contains("dark");
    const lerp = (a, b, t) => a + (b - a) * Math.min(1, Math.max(0, t));
    const fade = (p, a, b) => Math.min(1, Math.max(0, (p - a) / (b - a)));
    function P() {
      const d = isDark();
      return {
        bg: d ? "#021309" : "#fff",
        sub: "#869089",
        brd: d ? "#344339" : "#dee4de",
        brdM: d ? "#203026" : "#f4f9f3",
        q: d ? "#4a90ff" : "#2176ff",
        qFill: d ? "rgba(74,144,255,0.18)" : "rgba(199,220,255,0.7)",
        qDark: "#114aa6",
        w: d ? "#f7c42f" : "#9c7400",
        p: d ? "#19E76E" : "#0e863f",
        rb: "#005934",
        rbf: d ? "rgba(25,231,110,0.22)" : "rgba(178,247,207,0.55)",
        rsf: d ? "rgba(247,196,47,0.18)" : "rgba(253,237,188,0.55)",
        txt: d ? "#dee4de" : "#0c1d13"
      };
    }
    function repBox(ctx, x, y, w, h, state, label, prog, p, displayState) {
      let fl = p.bg, st = p.p, tc = p.p, ds = [];
      if (state === "stopped") {
        fl = "transparent";
        st = p.brd;
        tc = p.sub;
        ds = [3, 3];
      } else if (state === "starting") {
        fl = p.rsf;
        st = p.w;
        tc = p.w;
      } else if (state === "busy") {
        fl = p.rbf;
        st = p.rb;
        tc = p.rb;
      } else if (state === "stopping") {
        st = p.sub;
        tc = p.sub;
      }
      ctx.globalAlpha = state === "stopped" || state === "stopping" ? 0.55 : 1;
      ctx.setLineDash(ds);
      ctx.beginPath();
      ctx.roundRect(x, y, w, h, 6);
      ctx.fillStyle = fl;
      ctx.fill();
      ctx.strokeStyle = st;
      ctx.lineWidth = 1.3;
      ctx.stroke();
      ctx.setLineDash([]);
      ctx.globalAlpha = 1;
      ctx.font = "500 11px ui-monospace,Menlo,monospace";
      ctx.fillStyle = tc;
      ctx.textAlign = "left";
      ctx.textBaseline = "middle";
      ctx.fillText(label, x + 10, y + h / 2);
      ctx.font = "500 9.5px ui-monospace,Menlo,monospace";
      ctx.textAlign = "right";
      ctx.fillText(displayState || state, x + w - 10, y + h / 2);
      if (state === "starting" && prog > 0) {
        ctx.fillStyle = p.w;
        ctx.fillRect(x, y + h - 2, w * prog, 2);
      }
    }
    function setRich(el, s) {
      el.replaceChildren();
      const parts = s.split("`");
      for (let i = 0; i < parts.length; i++) {
        if (i % 2 === 0) {
          if (parts[i]) el.appendChild(document.createTextNode(parts[i]));
        } else {
          const code = document.createElement("code");
          code.textContent = parts[i];
          code.style.cssText = "font-family:ui-monospace,Menlo,monospace;font-size:0.92em;background:" + (isDark() ? "rgba(255,255,255,0.08)" : "rgba(0,0,0,0.05)") + ";padding:1px 4px;border-radius:3px";
          el.appendChild(code);
        }
      }
    }
    window._mdHelpers = {
      lerp,
      fade,
      repBox
    };
    window._mdMount = function (root, opts) {
      const W = opts.W || 580, H = opts.H || 200;
      const card = document.createElement("div");
      const tit = document.createElement("div");
      tit.style.cssText = "font:500 12px ui-monospace,Menlo,monospace;letter-spacing:-0.28px;margin:0 0 4px";
      const desc = document.createElement("p");
      desc.style.cssText = "margin:0 0 10px;font-size:12px;line-height:1.45;font-family:system-ui,-apple-system,sans-serif";
      const cv = document.createElement("canvas");
      cv.style.cssText = "display:block;width:100%;max-width:" + W + "px;touch-action:pan-y";
      const formula = document.createElement("div");
      formula.style.cssText = "margin:8px 0 0;font:500 11px ui-monospace,Menlo,monospace;letter-spacing:-0.28px;border-radius:4px;padding:4px 8px;display:inline-block";
      setRich(tit, opts.title);
      setRich(desc, opts.desc);
      if (opts.formula) setRich(formula, opts.formula);
      card.appendChild(tit);
      card.appendChild(desc);
      card.appendChild(cv);
      if (opts.formula) card.appendChild(formula);
      root.appendChild(card);
      const ctx = cv.getContext("2d");
      const dpr = window.devicePixelRatio || 1;
      cv.width = W * dpr;
      cv.height = H * dpr;
      cv.style.height = H + "px";
      ctx.scale(dpr, dpr);
      if (opts.onClick) {
        cv.style.cursor = "pointer";
        cv.addEventListener("click", e => {
          const r = cv.getBoundingClientRect();
          opts.onClick((e.clientX - r.left) / r.width * W, (e.clientY - r.top) / r.height * H);
        });
      }
      function applyTheme() {
        const d = isDark();
        card.style.cssText = "border:1px solid " + (d ? "#344339" : "#f4f9f3") + ";border-radius:8px;padding:16px 18px;margin:12px 0;background:" + (d ? "#021309" : "#fff") + ";max-width:" + W + "px";
        tit.style.color = d ? "#dee4de" : "#0c1d13";
        desc.style.color = d ? "#9CA59E" : "#5a675e";
        formula.style.background = d ? "#0C1D13" : "#f4f9f3";
        formula.style.borderColor = d ? "#203026" : "#dee4de";
        formula.style.border = "1px solid " + (d ? "#203026" : "#dee4de");
        formula.style.color = d ? "#19E76E" : "#0e863f";
        if (opts.title) setRich(tit, opts.title);
        if (opts.desc) setRich(desc, opts.desc);
        if (opts.formula) setRich(formula, opts.formula);
      }
      applyTheme();
      let visible = true, raf = 0, t0 = performance.now(), dirty = true;
      const obs = new IntersectionObserver(e => visible = e[0].isIntersecting, {
        threshold: 0.15
      });
      obs.observe(cv);
      const themeObs = new MutationObserver(() => {
        dirty = true;
        applyTheme();
      });
      themeObs.observe(document.documentElement, {
        attributes: true,
        attributeFilter: ["class"]
      });
      function loop(ts) {
        raf = requestAnimationFrame(loop);
        if (!visible) {
          dirty = true;
          return;
        }
        const t = ts - t0;
        ctx.clearRect(0, 0, W, H);
        opts.draw(ctx, t, P());
        dirty = false;
      }
      raf = requestAnimationFrame(loop);
      return () => {
        cancelAnimationFrame(raf);
        obs.disconnect();
        themeObs.disconnect();
        card.remove();
      };
    };
    return () => {
      delete window._mdMount;
      delete window._mdHelpers;
    };
  }, []);
  return <span />;
};

<MiniDiagramEngine />

A *cold start* is the time a fresh replica spends starting up before it can accept traffic. A request that triggers one waits in the queue until the replica is ready, so the cold-start duration sets the latency floor for that request. The following diagram traces a deployment through that cycle, from **Scaled to zero** to **Active** and back, with the startup steps that add up to the wait.

<MiniColdStart />

## Cold start triggers

Every new replica cold-starts before it can serve traffic, no matter why it was created.

*Scale-from-zero* applies when a deployment's `min_replica` is 0. Once traffic stays at zero for the full [`scale_down_delay`](/deployment/autoscaling/overview#how-autoscaling-works), the autoscaler shuts down every replica. The next request finds nothing running and waits for a full startup, so users feel this cold start directly.

*Scaling events* happen while a deployment is already serving traffic. When load crosses the scaling threshold, the autoscaler adds replicas, and each one cold-starts before it can serve traffic. The replicas already running keep serving in the meantime, so users notice only when load grows faster than new replicas can start up.

## Contributing factors

A new replica works through these steps in order, and their durations add up to the cold-start time:

| Step                  | What happens                                                                                                                                                                       |
| --------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Container pull        | The replica downloads your Docker image layers.                                                                                                                                    |
| Weight load           | Model weights (often 10s to 100s of GB) move from storage into GPU memory.                                                                                                         |
| Engine initialization | Your model's setup code runs. For inference engines like vLLM and SGLang, this includes capturing CUDA graphs, compiling kernels with `torch.compile`, and profiling the KV cache. |

Baseten provides the [Baseten Delivery Network (BDN)](/development/model/bdn), which speeds up weight load by mirroring your weights and caching them next to your replicas. Each scale-up then reads them from a nearby cache instead of re-downloading hundreds of gigabytes from the source. Baseten also streams your container image in the background, so container pull rarely dominates.

That leaves engine initialization as the step you usually own. It dominates for small models (a few billion parameters or fewer), where CUDA graph capture and `torch.compile` can run well over a minute, and Baseten doesn't cache those artifacts unless you opt in. For the largest models (70B+ parameters or large mixture-of-experts), even BDN can't make hundreds of gigabytes instant, so weight load stays the dominant step.

Cold start time isn't a fixed number. It varies with model size and the GPU you run on, so benchmark your own model rather than relying on a single figure.

## Reduce cold starts

The biggest win comes from shrinking whichever step dominates startup. When that isn't enough, keep replicas warm so requests skip the cold start entirely.

### Faster weight loading

BDN runs automatically on engine-builder deployments. On any other deployment, turn it on by adding a [`weights`](/development/model/bdn) block to your config.

### Compilation caching

`torch.compile` and CUDA graph capture rerun on every fresh replica unless their output is cached. [Torch compile caching](/development/model/runtime-caching#torch-compile-caching), built on [b10cache](/development/model/runtime-caching), persists those artifacts so a new replica loads them instead of recompiling, which cuts compilation from minutes to roughly 5 to 20 seconds.

### Warm replicas

`min_replica` sets a floor on running replicas. Keep it at 1 or higher so a replica stays warm to serve the first request. You pay for that replica while it's idle, but the request no longer waits for a startup. Set it in the dashboard or through the [autoscaling settings API](/reference/management-api/deployments/autoscaling/updates-a-deployments-autoscaling-settings):

```json Autoscaling settings theme={"system"}
{
  "min_replica": 1
}
```

For production redundancy, set `min_replica` to 2 or higher. If one replica fails or is taken down for maintenance, another warm replica keeps serving, so requests don't wait on a cold start.

Your replica floor trades cost against latency:

| Approach                         | Cost                                                                      | Latency                                                       | Best for                                              |
| -------------------------------- | ------------------------------------------------------------------------- | ------------------------------------------------------------- | ----------------------------------------------------- |
| Scale to zero (`min_replica: 0`) | No charge while idle; wake-up minutes are [billed](/organization/billing) | First request waits for a full cold start                     | Batch jobs, development, and spiky low-volume traffic |
| Always on (`min_replica` ≥ 1)    | Pay for idle replicas                                                     | No cold start from idle, though new replicas still cold-start | Latency-sensitive production traffic                  |

Start warm for production, and scale to zero only when an occasional slow first request is acceptable.

### Pre-warming

For predictable traffic spikes, raise `min_replica` ahead of the expected load:

```bash Terminal theme={"system"}
# 10-15 minutes before expected spike
curl -X PATCH \
  https://api.baseten.co/v1/models/{model_id}/deployments/{deployment_id}/autoscaling_settings \
  -H "Authorization: Bearer $BASETEN_API_KEY" \
  -d '{"min_replica": 5}'
```

After traffic stabilizes, reset to your normal minimum.

### Scale-down delay

A longer scale-down delay keeps replicas warm through brief traffic dips. The default is 15 minutes (900 seconds); this example doubles it to 30 minutes:

```json Autoscaling settings theme={"system"}
{
  "scale_down_delay": 1800
}
```

A replica that's still warm when traffic returns serves immediately, with no cold start.

## Next steps

* [Request lifecycle](/deployment/autoscaling/request-lifecycle): What happens to requests during cold starts, including queuing and timeout behavior.
* [Autoscaling](/deployment/autoscaling/overview): Configure `min_replica`, `scale_down_delay`, and the rest of the scaling settings.
* [Traffic patterns](/deployment/autoscaling/traffic-patterns): Pre-warming strategies for different traffic types.
* [Billing and usage](/organization/billing): How cold-start time is metered.
* [Troubleshooting](/troubleshooting/deployments#autoscaling-issues): Diagnose cold start issues.
