> ## Documentation Index
> Fetch the complete documentation index at: https://docs.baseten.co/llms.txt
> Use this file to discover all available pages before exploring further.

# Cold starts

> Understand cold starts and how to minimize their impact on your deployments.

export const MiniColdStart = () => {
  const ref = React.useRef(null);
  const init = React.useRef(false);
  React.useEffect(() => {
    if (!ref.current || init.current) return;
    init.current = true;
    const PER = 6500;
    const STAGES = [["Scaled to zero", 0], ["Waking up", 1 / 3], ["Loading model", 2 / 3], ["Active", 1]];
    const STAGE_PH = {
      "Scaled to zero": 0.02,
      "Waking up": 0.12,
      "Loading model": 0.30,
      "Active": 0.65
    };
    let dotRadii = [2.5, 2.5, 2.5, 2.5];
    let frozen = null;
    const opts = {
      title: "Cold start",
      desc: "Click any stage below to freeze on it. Click again to resume the loop.",
      formula: "Request waits in queue while replica boots",
      W: 580,
      H: 210,
      onClick: function (x, y) {
        if (y > 10 && y < 48) {
          let nearest = null, nearestDist = Infinity;
          STAGES.forEach(s => {
            const sx = 40 + s[1] * 500;
            const d = Math.abs(x - sx);
            if (d < nearestDist) {
              nearestDist = d;
              nearest = s[0];
            }
          });
          if (nearestDist < 90) frozen = frozen === nearest ? null : nearest;
        } else {
          frozen = null;
        }
      },
      draw: function (ctx, t, p) {
        const h = window._mdHelpers;
        const ph = frozen != null ? STAGE_PH[frozen] : t % PER / PER;
        let visState = "stopped", status = "Scaled to zero", prog = 0;
        if (ph < 0.05) {
          visState = "stopped";
          status = "Scaled to zero";
        } else if (ph < 0.20) {
          visState = "starting";
          status = "Waking up";
          prog = (ph - 0.05) / 0.35;
        } else if (ph < 0.40) {
          visState = "starting";
          status = "Loading model";
          prog = (ph - 0.05) / 0.35;
        } else if (ph < 0.50) {
          visState = "ready";
          status = "Active";
        } else if (ph < 0.85) {
          visState = "busy";
          status = "Active";
        } else if (ph < 0.95) {
          visState = "stopping";
          status = "Active";
        } else {
          visState = "stopped";
          status = "Scaled to zero";
        }
        ctx.strokeStyle = p.brd;
        ctx.lineWidth = 0.8;
        ctx.beginPath();
        ctx.moveTo(40, 36);
        ctx.lineTo(540, 36);
        ctx.stroke();
        STAGES.forEach((s, i) => {
          const sx = h.lerp(40, 540, s[1]);
          const cur = status === s[0];
          dotRadii[i] += ((cur ? 4.2 : 2.5) - dotRadii[i]) * 0.18;
          ctx.fillStyle = cur ? p.p : p.brd;
          ctx.beginPath();
          ctx.arc(sx, 36, dotRadii[i], 0, Math.PI * 2);
          ctx.fill();
          ctx.font = (cur ? "600 " : "500 ") + "9.5px ui-monospace,Menlo,monospace";
          ctx.fillStyle = cur ? p.txt : p.sub;
          ctx.textAlign = "center";
          ctx.textBaseline = "middle";
          ctx.fillText(s[0], sx, 22);
        });
        h.repBox(ctx, 180, 84, 220, 32, visState, "Replica", prog, p, status);
        const reqVis = ph > 0.02 && ph < 0.85;
        let rx = 40, ry = 100;
        if (ph < 0.05) {
          rx = h.lerp(20, 140, ph / 0.05);
        } else if (ph < 0.50) {
          rx = 140 + Math.sin(t * 0.005) * 2;
        } else if (ph < 0.85) {
          rx = h.lerp(140, 290, (ph - 0.50) / 0.05);
        }
        const reqColor = ph < 0.50 ? p.w : p.p;
        if (reqVis) {
          const a = h.fade(ph, 0.02, 0.04) * (1 - h.fade(ph, 0.82, 0.85));
          ctx.globalAlpha = a;
          ctx.fillStyle = reqColor;
          ctx.beginPath();
          ctx.arc(rx, ry, 4.5, 0, Math.PI * 2);
          ctx.fill();
          ctx.globalAlpha = 1;
        }
        ctx.font = "500 10px ui-monospace,Menlo,monospace";
        ctx.fillStyle = p.sub;
        ctx.textAlign = "left";
        ctx.textBaseline = "middle";
        let cap = "";
        if (status === "Waking up") cap = "Container starting, request waits in queue"; else if (status === "Loading model") cap = "Model loading " + Math.round((ph - 0.20) / 0.20 * 100) + "%, request waits"; else if (visState === "ready") cap = "Replica is now Active and admits the queued request"; else if (visState === "busy") cap = "Replica processing the request"; else if (visState === "stopping") cap = "Idle past scale_down_delay, replica reclaimed"; else cap = "No replicas running, deployment is Scaled to zero";
        ctx.fillText(cap, 40, 180);
      }
    };
    let cleanup = null, destroyed = false, timer = null, retries = 0;
    const tryMount = () => {
      if (destroyed || !ref.current) return;
      if (window._mdMount) {
        cleanup = window._mdMount(ref.current, opts);
      } else if (retries++ < 60) {
        timer = setTimeout(tryMount, 30);
      }
    };
    tryMount();
    return () => {
      destroyed = true;
      if (timer) clearTimeout(timer);
      if (cleanup) cleanup();
      init.current = false;
    };
  }, []);
  return <div ref={ref} />;
};

export const MiniDiagramEngine = () => {
  React.useEffect(() => {
    if (window._mdMount) return;
    const isDark = () => document.documentElement.classList.contains("dark");
    const lerp = (a, b, t) => a + (b - a) * Math.min(1, Math.max(0, t));
    const fade = (p, a, b) => Math.min(1, Math.max(0, (p - a) / (b - a)));
    function P() {
      const d = isDark();
      return {
        bg: d ? "#021309" : "#fff",
        sub: "#869089",
        brd: d ? "#344339" : "#dee4de",
        brdM: d ? "#203026" : "#f4f9f3",
        q: d ? "#4a90ff" : "#2176ff",
        qFill: d ? "rgba(74,144,255,0.18)" : "rgba(199,220,255,0.7)",
        qDark: "#114aa6",
        w: d ? "#f7c42f" : "#9c7400",
        p: d ? "#19E76E" : "#0e863f",
        rb: "#005934",
        rbf: d ? "rgba(25,231,110,0.22)" : "rgba(178,247,207,0.55)",
        rsf: d ? "rgba(247,196,47,0.18)" : "rgba(253,237,188,0.55)",
        txt: d ? "#dee4de" : "#0c1d13"
      };
    }
    function repBox(ctx, x, y, w, h, state, label, prog, p, displayState) {
      let fl = p.bg, st = p.p, tc = p.p, ds = [];
      if (state === "stopped") {
        fl = "transparent";
        st = p.brd;
        tc = p.sub;
        ds = [3, 3];
      } else if (state === "starting") {
        fl = p.rsf;
        st = p.w;
        tc = p.w;
      } else if (state === "busy") {
        fl = p.rbf;
        st = p.rb;
        tc = p.rb;
      } else if (state === "stopping") {
        st = p.sub;
        tc = p.sub;
      }
      ctx.globalAlpha = state === "stopped" || state === "stopping" ? 0.55 : 1;
      ctx.setLineDash(ds);
      ctx.beginPath();
      ctx.roundRect(x, y, w, h, 6);
      ctx.fillStyle = fl;
      ctx.fill();
      ctx.strokeStyle = st;
      ctx.lineWidth = 1.3;
      ctx.stroke();
      ctx.setLineDash([]);
      ctx.globalAlpha = 1;
      ctx.font = "500 11px ui-monospace,Menlo,monospace";
      ctx.fillStyle = tc;
      ctx.textAlign = "left";
      ctx.textBaseline = "middle";
      ctx.fillText(label, x + 10, y + h / 2);
      ctx.font = "500 9.5px ui-monospace,Menlo,monospace";
      ctx.textAlign = "right";
      ctx.fillText(displayState || state, x + w - 10, y + h / 2);
      if (state === "starting" && prog > 0) {
        ctx.fillStyle = p.w;
        ctx.fillRect(x, y + h - 2, w * prog, 2);
      }
    }
    function setRich(el, s) {
      el.replaceChildren();
      const parts = s.split("`");
      for (let i = 0; i < parts.length; i++) {
        if (i % 2 === 0) {
          if (parts[i]) el.appendChild(document.createTextNode(parts[i]));
        } else {
          const code = document.createElement("code");
          code.textContent = parts[i];
          code.style.cssText = "font-family:ui-monospace,Menlo,monospace;font-size:0.92em;background:" + (isDark() ? "rgba(255,255,255,0.08)" : "rgba(0,0,0,0.05)") + ";padding:1px 4px;border-radius:3px";
          el.appendChild(code);
        }
      }
    }
    window._mdHelpers = {
      lerp,
      fade,
      repBox
    };
    window._mdMount = function (root, opts) {
      const W = opts.W || 580, H = opts.H || 200;
      const card = document.createElement("div");
      const tit = document.createElement("div");
      tit.style.cssText = "font:500 12px ui-monospace,Menlo,monospace;letter-spacing:-0.28px;margin:0 0 4px";
      const desc = document.createElement("p");
      desc.style.cssText = "margin:0 0 10px;font-size:12px;line-height:1.45;font-family:system-ui,-apple-system,sans-serif";
      const cv = document.createElement("canvas");
      cv.style.cssText = "display:block;width:100%;max-width:" + W + "px";
      const formula = document.createElement("div");
      formula.style.cssText = "margin:8px 0 0;font:500 11px ui-monospace,Menlo,monospace;letter-spacing:-0.28px;border-radius:4px;padding:4px 8px;display:inline-block";
      setRich(tit, opts.title);
      setRich(desc, opts.desc);
      if (opts.formula) setRich(formula, opts.formula);
      card.appendChild(tit);
      card.appendChild(desc);
      card.appendChild(cv);
      if (opts.formula) card.appendChild(formula);
      root.appendChild(card);
      const ctx = cv.getContext("2d");
      const dpr = window.devicePixelRatio || 1;
      cv.width = W * dpr;
      cv.height = H * dpr;
      cv.style.height = H + "px";
      ctx.scale(dpr, dpr);
      if (opts.onClick) {
        cv.style.cursor = "pointer";
        cv.addEventListener("click", e => {
          const r = cv.getBoundingClientRect();
          opts.onClick((e.clientX - r.left) / r.width * W, (e.clientY - r.top) / r.height * H);
        });
      }
      function applyTheme() {
        const d = isDark();
        card.style.cssText = "border:1px solid " + (d ? "#344339" : "#f4f9f3") + ";border-radius:8px;padding:16px 18px;margin:12px 0;background:" + (d ? "#021309" : "#fff") + ";max-width:" + W + "px";
        tit.style.color = d ? "#dee4de" : "#0c1d13";
        desc.style.color = d ? "#9CA59E" : "#5a675e";
        formula.style.background = d ? "#0C1D13" : "#f4f9f3";
        formula.style.borderColor = d ? "#203026" : "#dee4de";
        formula.style.border = "1px solid " + (d ? "#203026" : "#dee4de");
        formula.style.color = d ? "#19E76E" : "#0e863f";
        if (opts.title) setRich(tit, opts.title);
        if (opts.desc) setRich(desc, opts.desc);
        if (opts.formula) setRich(formula, opts.formula);
      }
      applyTheme();
      let visible = true, raf = 0, t0 = performance.now(), dirty = true;
      const obs = new IntersectionObserver(e => visible = e[0].isIntersecting, {
        threshold: 0.15
      });
      obs.observe(cv);
      const themeObs = new MutationObserver(() => {
        dirty = true;
        applyTheme();
      });
      themeObs.observe(document.documentElement, {
        attributes: true,
        attributeFilter: ["class"]
      });
      function loop(ts) {
        raf = requestAnimationFrame(loop);
        if (!visible) {
          dirty = true;
          return;
        }
        const t = ts - t0;
        ctx.clearRect(0, 0, W, H);
        opts.draw(ctx, t, P());
        dirty = false;
      }
      raf = requestAnimationFrame(loop);
      return () => {
        cancelAnimationFrame(raf);
        obs.disconnect();
        themeObs.disconnect();
        card.remove();
      };
    };
  }, []);
  return <span />;
};

<MiniDiagramEngine />

A *cold start* is the time a fresh replica spends booting before it can accept traffic. During that boot, the deployment moves through the statuses you see in the dashboard: it leaves **Scaled to zero**, enters **Waking up** while the container is scheduled, then **Loading model** while weights move into GPU memory and the model's setup code runs. Any request that triggered the scale-up sits in the queue until the deployment reaches **Active**, so the cold-start duration becomes the latency floor for that request. The diagram below traces a deployment cycling from **Scaled to zero** through **Waking up** and **Loading model** to **Active** and back, with a queued request waiting through the boot bar before being admitted.

<MiniColdStart />

***

## When cold starts happen

Cold starts show up in two situations. The first is *scale-from-zero*: a deployment with `min_replica` set to 0 has shut all its replicas down to save cost, so the next request triggers a fresh boot before anything can serve it. The second is during ordinary scaling events: when load crosses the autoscaler's threshold, every new replica it provisions has to cold-start before joining the pool. The first pattern is the more visible one because the scale-from-zero request is the one waiting on the boot. The second is usually masked by the existing replicas absorbing load while the new ones come online, so users only feel it when load grows faster than the autoscaler can keep up.

***

## What contributes to cold start time

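Cold start duration is the sum of three steps. The table lists them by typical impact rather than in the order they run (the container image is pulled first, then weights are loaded and your setup code runs):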

| Factor         | Impact                                                                 |
| -------------- | ---------------------------------------------------------------------- |
| Model loading  | Loading model weights (10s–100s of GB), typically the dominant factor. |
| Container pull | Downloading Docker image layers.                                       |
| Initialization | Running your model's setup code.                                       |

For large models, cold starts can take minutes, and model weight downloads are usually the bottleneck. Even with caching in place, the physics of moving hundreds of gigabytes from storage into GPU memory creates inherent lag, which is why Baseten's [platform optimizations](#platform-optimizations) focus on shrinking that hop.

***

## Minimizing cold starts

### Keep replicas warm

Set [`min_replica`](/reference/management-api/deployments/autoscaling/updates-a-deployments-autoscaling-settings) to always have at least one replica ready to serve requests. This eliminates cold starts for the first request but increases cost.

```json theme={"system"}
{
  "min_replica": 1
}
```

For production redundancy, set `min_replica` to 2 or higher so that losing one replica to a failure or to maintenance doesn't force requests to wait on a cold start.

### Pre-warm before expected traffic

For predictable traffic spikes, increase min replicas before the expected load:

```bash theme={"system"}
# 10-15 minutes before expected spike
curl -X PATCH \
  https://api.baseten.co/v1/models/{model_id}/deployments/{deployment_id}/autoscaling_settings \
  -H "Authorization: Api-Key $BASETEN_API_KEY" \
  -d '{"min_replica": 5}'
```

After traffic stabilizes, reset to your normal minimum.

### Use longer scale-down delay

A longer scale-down delay keeps replicas warm during temporary traffic dips:

```json theme={"system"}
{
  "scale_down_delay": 900
}
```

This prevents cold starts when traffic returns within the delay window.

***

## Platform optimizations

Baseten automatically applies several optimizations to reduce cold start times:

**Baseten Delivery Network**: The [`weights`](/development/model/bdn) configuration optimizes cold starts by mirroring weights to Baseten's infrastructure and caching them close to your model pods. See [Baseten Delivery Network (BDN)](/development/model/bdn) for full configuration options.

**Image streaming**: Optimized images stream into nodes, allowing model loading to begin before the full download completes:

```
Successfully pulled streaming-enabled image in 15.851s. Image size: 32 GB.
```

These optimizations are applied automatically.

***

## The tradeoff

Cold starts create a fundamental tradeoff between **cost** and **latency**:

| Approach                         | Cost                          | Latency                                    |
| -------------------------------- | ----------------------------- | ------------------------------------------ |
| Scale to zero (`min_replica: 0`) | Lower: no cost when idle      | Higher: first request waits for cold start |
| Always on (`min_replica: ≥1`)    | Higher: pay for idle replicas | Lower: no cold starts                      |

For latency-sensitive production workloads, the cost of keeping replicas warm is usually justified. For batch workloads or development, scale-to-zero often makes sense.

***

## Next steps

* [Request lifecycle](/deployment/autoscaling/request-lifecycle): What happens to requests during cold starts, including queuing and timeout behavior.
* [Autoscaling](/deployment/autoscaling/overview): Configure min replicas and scale-down delay.
* [Traffic patterns](/deployment/autoscaling/traffic-patterns): Pre-warming strategies for different traffic types.
* [Troubleshooting](/troubleshooting/deployments#autoscaling-issues): Diagnose cold start issues.
