> ## Documentation Index
> Fetch the complete documentation index at: https://docs.baseten.co/llms.txt
> Use this file to discover all available pages before exploring further.

# Traffic patterns

> Identify your traffic pattern and configure autoscaling settings to match.

export const AutoscaleSteady = () => {
  const ref = React.useRef(null);
  const init = React.useRef(false);
  React.useEffect(() => {
    if (!ref.current || init.current) return;
    init.current = true;
    function mr(s) {
      return function () {
        s |= 0;
        s = s + 0x6D2B79F5 | 0;
        let t = Math.imul(s ^ s >>> 15, 1 | s);
        t = t + Math.imul(t ^ t >>> 7, 61 | t) ^ t;
        return ((t ^ t >>> 14) >>> 0) / 4294967296;
      };
    }
    function sm(a, w) {
      const o = new Array(a.length);
      let s = 0;
      for (let i = 0; i < a.length; i++) {
        s += a[i];
        if (i >= w) s -= a[i - w];
        o[i] = s / Math.min(i + 1, w);
      }
      return o;
    }
    const N = 240, r = mr(23);
    const raw = new Array(N).fill(0);
    for (let i = 0; i < N; i++) {
      const t = i / N, env = 0.5 - 0.5 * Math.cos(t * 2 * Math.PI);
      raw[i] = 12 + env * 28 + (r() - 0.5) * 1.2;
    }
    const sd = sm(raw, 10);
    const rep = sd.map(v => Math.max(2, Math.ceil(v / 8)));
    const data = {
      raw,
      sd,
      rep,
      yT: 48,
      yR: 8,
      sw: 10,
      ann: [[10, 45, "Morning, gradual ramp", "Load rises smoothly. The autoscaler sees no sudden jumps, so replicas step up one at a time as utilization crosses the threshold.", "g"], [90, 135, "Peak, running hot", "A `target_utilization_percentage` of 75 to 80% lets replicas pack tightly. It is cost-efficient because the traffic shape gives the autoscaler time to react.", "g"], [175, 215, "Evening, gradual drain", "Load falls just as smoothly. Replicas step down one by one after each `scale_down_delay` timer completes. No oscillation.", "g"], [220, 235, "Overnight floor", "A `min_replica` of 2 or more holds the floor for redundancy. Traffic stays light, but the deployment never goes below the safety net.", "m"]]
    };
    let cleanup = null, destroyed = false, timer = null, retries = 0;
    const tryMount = () => {
      if (destroyed || !ref.current) return;
      if (window._aceMount) {
        cleanup = window._aceMount(ref.current, data);
      } else if (retries++ < 60) {
        timer = setTimeout(tryMount, 30);
      }
    };
    tryMount();
    return () => {
      destroyed = true;
      if (timer) clearTimeout(timer);
      if (cleanup) cleanup();
      init.current = false;
    };
  }, []);
  return <div ref={ref} />;
};

export const AutoscaleScheduled = () => {
  const ref = React.useRef(null);
  const init = React.useRef(false);
  React.useEffect(() => {
    if (!ref.current || init.current) return;
    init.current = true;
    function mr(s) {
      return function () {
        s |= 0;
        s = s + 0x6D2B79F5 | 0;
        let t = Math.imul(s ^ s >>> 15, 1 | s);
        t = t + Math.imul(t ^ t >>> 7, 61 | t) ^ t;
        return ((t ^ t >>> 14) >>> 0) / 4294967296;
      };
    }
    const N = 240, r = mr(19);
    const raw = new Array(N).fill(0);
    function burst(s, e, p) {
      for (let i = s; i < e; i++) {
        const t = (i - s) / (e - s);
        raw[i] = Math.max(0, p * Math.sin(t * Math.PI) + (r() - 0.5) * 1.0);
      }
    }
    burst(70, 100, 36);
    burst(170, 200, 42);
    const rep = new Array(N).fill(0);
    for (let i = 60; i < 70; i++) rep[i] = 3;
    for (let i = 70; i < 110; i++) rep[i] = 5;
    for (let i = 110; i < 130; i++) rep[i] = 3;
    for (let i = 160; i < 170; i++) rep[i] = 3;
    for (let i = 170; i < 210; i++) rep[i] = 6;
    for (let i = 210; i < 230; i++) rep[i] = 3;
    const data = {
      raw,
      rep,
      yT: 48,
      yR: 8,
      sw: 10,
      reg: [[60, 70], [160, 170]],
      ann: [[10, 55, "Scaled to zero, $0 / min", "No traffic, `min_replica` is 0. Replicas shut down entirely. Billing pauses until the next request, or, here, the cron pre-warm.", "f"], [60, 69, "Pre-warm window (cron)", "Five minutes before the hourly job, a cron PATCHes `min_replica` to 3. Replicas boot and warm up while traffic is still zero.", "y"], [75, 100, "Burst absorbed, no cold start", "The batch hits warm replicas. No first-request latency penalty. That is why pre-warm is worth the brief cost.", "g"], [115, 155, "Cooldown, then back to zero", "Job done. Cron resets `min_replica` to 0, then `scale_down_delay` drains replicas. Compute returns to free.", "f"], [162, 168, "Next pre-warm", "Same playbook. Predictable cadence with cron means latency-sensitive batches never see a cold start.", "y"], [175, 210, "Second burst, same shape", "The pattern repeats. Cost stays low (idle is far greater than burst), but every job hits warm capacity.", "g"]]
    };
    let cleanup = null, destroyed = false, timer = null, retries = 0;
    const tryMount = () => {
      if (destroyed || !ref.current) return;
      if (window._aceMount) {
        cleanup = window._aceMount(ref.current, data);
      } else if (retries++ < 60) {
        timer = setTimeout(tryMount, 30);
      }
    };
    tryMount();
    return () => {
      destroyed = true;
      if (timer) clearTimeout(timer);
      if (cleanup) cleanup();
      init.current = false;
    };
  }, []);
  return <div ref={ref} />;
};

export const AutoscaleBursty = () => {
  const ref = React.useRef(null);
  const init = React.useRef(false);
  React.useEffect(() => {
    if (!ref.current || init.current) return;
    init.current = true;
    function mr(s) {
      return function () {
        s |= 0;
        s = s + 0x6D2B79F5 | 0;
        let t = Math.imul(s ^ s >>> 15, 1 | s);
        t = t + Math.imul(t ^ t >>> 7, 61 | t) ^ t;
        return ((t ^ t >>> 14) >>> 0) / 4294967296;
      };
    }
    const N = 240, r = mr(11);
    const raw = new Array(N).fill(0);
    for (let i = 0; i < N; i++) {
      let v = 8;
      if (i >= 50 && i < 60) v = 8 + (i - 50) * 2.4; else if (i >= 60 && i < 110) v = 32; else if (i >= 110 && i < 120) v = 32 - (i - 110) * 2.5; else if (i >= 120 && i < 165) v = 8; else if (i >= 165 && i < 175) v = 8 + (i - 165) * 2.6; else if (i >= 175 && i < 220) v = 34; else if (i >= 220 && i < 230) v = 34 - (i - 220) * 2.6;
      raw[i] = Math.max(0, v + (r() - 0.5) * 1.6);
    }
    const rep = new Array(N).fill(2);
    for (let i = 55; i < 120; i++) rep[i] = 6;
    for (let i = 120; i < 165; i++) rep[i] = 5;
    for (let i = 165; i < 230; i++) rep[i] = 6;
    for (let i = 230; i < N; i++) rep[i] = 5;
    const data = {
      raw,
      rep,
      yT: 40,
      yR: 8,
      sw: 8,
      reg: [[110, 165]],
      ann: [[30, 50, "Baseline, 2 replicas", "8 concurrent requests sit well under the `concurrency_target` threshold. Two replicas absorb the load with headroom.", "m"], [52, 70, "Window detects the ramp, scale up", "A short `autoscaling_window` (30 to 60s) catches the increase fast. ceil(32 / (8 × 0.6)) = 7 replicas, and the autoscaler provisions them.", "g"], [75, 105, "Stable at 6 replicas", "Headroom (50 to 60% `target_utilization_percentage`) absorbs the burst while new replicas finish booting. p95 latency holds.", "g"], [122, 162, "`scale_down_delay` holds replicas warm", "Traffic dropped, but the 900s `scale_down_delay` keeps capacity. If a second wave hits before the timer expires, no cold starts.", "b"], [168, 218, "Second wave, no cold start", "Warm replicas are already there. Latency stays flat. That is exactly what the long delay buys you.", "g"]]
    };
    let cleanup = null, destroyed = false, timer = null, retries = 0;
    const tryMount = () => {
      if (destroyed || !ref.current) return;
      if (window._aceMount) {
        cleanup = window._aceMount(ref.current, data);
      } else if (retries++ < 60) {
        timer = setTimeout(tryMount, 30);
      }
    };
    tryMount();
    return () => {
      destroyed = true;
      if (timer) clearTimeout(timer);
      if (cleanup) cleanup();
      init.current = false;
    };
  }, []);
  return <div ref={ref} />;
};

export const AutoscaleJittery = () => {
  const ref = React.useRef(null);
  const init = React.useRef(false);
  React.useEffect(() => {
    if (!ref.current || init.current) return;
    init.current = true;
    function mr(s) {
      return function () {
        s |= 0;
        s = s + 0x6D2B79F5 | 0;
        let t = Math.imul(s ^ s >>> 15, 1 | s);
        t = t + Math.imul(t ^ t >>> 7, 61 | t) ^ t;
        return ((t ^ t >>> 14) >>> 0) / 4294967296;
      };
    }
    const N = 240, r = mr(7);
    const raw = new Array(N).fill(0);
    const sp = [22, 58, 85, 118, 145, 172, 198, 222];
    for (let i = 0; i < N; i++) {
      let v = 10 + (r() - 0.5) * 1.4;
      for (const c of sp) {
        const d = i - c;
        if (Math.abs(d) < 5) v += 16 * Math.exp(-(d * d) / 4);
      }
      raw[i] = Math.max(0, v);
    }
    const data = {
      raw,
      rep: new Array(N).fill(2),
      yT: 32,
      yR: 4,
      sw: 28,
      ann: [[18, 30, "Spike to ~26 concurrent", "A spike to ~26 concurrent requests lasts ~10 seconds. The 3-minute `autoscaling_window` averages it away, so the smoothed value barely moves and no scale event fires.", "g"], [80, 95, "Window absorbs the noise", "Back-to-back spikes still average to ~12 concurrent over the `autoscaling_window`. The autoscaler sees calm.", "g"], [138, 165, "Replicas held steady", "Replica count stays at 2. A long `autoscaling_window` (180s+) trades reaction speed for stability, which is acceptable when spikes are not sustained.", "m"], [195, 230, "Still no scale-up", "A short `autoscaling_window` would chase every spike. The longer window keeps replica count flat: no oscillation, no thrash.", "g"]]
    };
    let cleanup = null, destroyed = false, timer = null, retries = 0;
    const tryMount = () => {
      if (destroyed || !ref.current) return;
      if (window._aceMount) {
        cleanup = window._aceMount(ref.current, data);
      } else if (retries++ < 60) {
        timer = setTimeout(tryMount, 30);
      }
    };
    tryMount();
    return () => {
      destroyed = true;
      if (timer) clearTimeout(timer);
      if (cleanup) cleanup();
      init.current = false;
    };
  }, []);
  return <div ref={ref} />;
};

export const AutoscaleChartEngine = () => {
  React.useEffect(() => {
    if (window._aceMount) return;
    const W = 620, H = 240;
    const padL = 38, padR = 14, padT = 18, padB = 28;
    const iW = W - padL - padR, iH = H - padT - padB;
    function sm(a, w) {
      const o = new Array(a.length);
      let s = 0;
      for (let i = 0; i < a.length; i++) {
        s += a[i];
        if (i >= w) s -= a[i - w];
        o[i] = s / Math.min(i + 1, w);
      }
      return o;
    }
    const isDark = () => document.documentElement.classList.contains("dark");
    const C = () => isDark() ? {
      line: "#17D465",
      area: "rgba(23,212,101,0.20)",
      raw: "#19E76E",
      rep: "#9CA59E",
      grid: "#203026",
      sub: "#869089",
      chip: "#021309",
      brd: "#344339",
      hl: "rgba(180,186,179,0.10)",
      reg: "rgba(33,118,255,0.10)",
      body: "#dee4de"
    } : {
      line: "#0e863f",
      area: "rgba(178,247,207,0.55)",
      raw: "#19E76E",
      rep: "#5a675e",
      grid: "#dee4de",
      sub: "#869089",
      chip: "#ffffff",
      brd: "#dee4de",
      hl: "rgba(200,207,200,0.40)",
      reg: "#f0f6ff",
      body: "#021309"
    };
    const DC = {
      g: "#0e863f",
      b: "#1960d3",
      y: "#9c7400",
      m: "#5a675e",
      f: "#869089"
    };
    window._aceMount = function (root, opts) {
      const N = opts.raw.length;
      const smv = opts.sd || sm(opts.raw, opts.sw || 10);
      const ann = opts.ann || [];
      const cv = document.createElement("canvas");
      cv.style.cssText = "display:block;width:100%;max-width:" + W + "px;touch-action:pan-y";
      root.appendChild(cv);
      const ctx = cv.getContext("2d");
      const dpr = window.devicePixelRatio || 1;
      cv.width = W * dpr;
      cv.height = H * dpr;
      cv.style.height = H + "px";
      ctx.scale(dpr, dpr);
      const strip = document.createElement("div");
      const sDot = document.createElement("span");
      sDot.style.cssText = "flex:0 0 auto;width:8px;height:8px;border-radius:50%;margin-top:6px;background:#9CA59E";
      const sTxt = document.createElement("div");
      sTxt.style.cssText = "flex:1;min-width:0";
      const sTit = document.createElement("div");
      sTit.style.cssText = "font:500 11px ui-monospace,Menlo,monospace;letter-spacing:-0.28px;color:#869089;margin:0 0 2px";
      const sBod = document.createElement("div");
      sBod.style.cssText = "font:400 13px/1.4 system-ui,-apple-system,sans-serif;margin:0";
      function setRich(el, s) {
        el.replaceChildren();
        const parts = s.split("`");
        for (let i = 0; i < parts.length; i++) {
          if (i % 2 === 0) {
            if (parts[i]) el.appendChild(document.createTextNode(parts[i]));
          } else {
            const code = document.createElement("code");
            code.textContent = parts[i];
            code.style.cssText = "font-family:ui-monospace,Menlo,monospace;font-size:0.92em;background:" + (isDark() ? "rgba(255,255,255,0.08)" : "rgba(0,0,0,0.05)") + ";padding:1px 4px;border-radius:3px";
            el.appendChild(code);
          }
        }
      }
      sTxt.appendChild(sTit);
      sTxt.appendChild(sBod);
      strip.appendChild(sDot);
      strip.appendChild(sTxt);
      root.appendChild(strip);
      const xFor = i => padL + i / (N - 1) * iW;
      const yL = v => padT + iH - v / opts.yT * iH;
      const yR = v => padT + iH - v / opts.yR * iH;
      let hover = null, visible = true, raf = 0, dirty = true;
      const obs = new IntersectionObserver(e => visible = e[0].isIntersecting, {
        threshold: 0.15
      });
      obs.observe(cv);
      const themeObs = new MutationObserver(() => {
        dirty = true;
      });
      themeObs.observe(document.documentElement, {
        attributes: true,
        attributeFilter: ["class"]
      });
      function setHoverFromX(clientX) {
        const r = cv.getBoundingClientRect();
        const x = (clientX - r.left) / r.width * W;
        hover = Math.max(0, Math.min(N - 1, Math.round((x - padL) / iW * (N - 1))));
      }
      const onMouseMove = e => setHoverFromX(e.clientX);
      const onMouseLeave = () => {
        hover = null;
      };
      const onTouch = e => {
        if (e.touches[0]) setHoverFromX(e.touches[0].clientX);
      };
      const onTouchEnd = () => {
        hover = null;
      };
      cv.addEventListener("mousemove", onMouseMove);
      cv.addEventListener("mouseleave", onMouseLeave);
      cv.addEventListener("touchstart", onTouch, {
        passive: true
      });
      cv.addEventListener("touchmove", onTouch, {
        passive: true
      });
      cv.addEventListener("touchend", onTouchEnd);
      cv.addEventListener("touchcancel", onTouchEnd);
      function pL(arr, yFn) {
        ctx.beginPath();
        for (let i = 0; i < arr.length; i++) {
          const x = xFor(i), y = yFn(arr[i]);
          if (i === 0) ctx.moveTo(x, y); else ctx.lineTo(x, y);
        }
      }
      function pA(arr, yFn) {
        ctx.beginPath();
        ctx.moveTo(xFor(0), padT + iH);
        for (let i = 0; i < arr.length; i++) ctx.lineTo(xFor(i), yFn(arr[i]));
        ctx.lineTo(xFor(arr.length - 1), padT + iH);
        ctx.closePath();
      }
      function pS(arr, yFn) {
        ctx.beginPath();
        for (let i = 0; i < arr.length; i++) {
          const x = xFor(i), y = yFn(arr[i]);
          if (i === 0) ctx.moveTo(x, y); else {
            ctx.lineTo(x, yFn(arr[i - 1]));
            ctx.lineTo(x, y);
          }
        }
      }
      function tx(s, x, y, c, a, sz) {
        ctx.font = "500 " + (sz || 9) + "px ui-monospace,Menlo,monospace";
        ctx.fillStyle = c;
        ctx.textAlign = a || "left";
        ctx.textBaseline = "middle";
        ctx.fillText(s, x, y);
      }
      function draw() {
        const col = C();
        const active = hover;
        ctx.clearRect(0, 0, W, H);
        ctx.strokeStyle = col.grid;
        ctx.lineWidth = 1;
        for (let i = 0; i <= 4; i++) {
          const y = padT + iH - i / 4 * iH;
          ctx.beginPath();
          ctx.setLineDash(i ? [2, 3] : []);
          ctx.moveTo(padL, y);
          ctx.lineTo(padL + iW, y);
          ctx.stroke();
        }
        ctx.setLineDash([]);
        if (opts.reg) for (const r of opts.reg) {
          ctx.fillStyle = col.reg;
          ctx.fillRect(xFor(r[0]), padT, xFor(r[1]) - xFor(r[0]), iH);
        }
        const aA = active !== null ? ann.find(a => active >= a[0] && active <= a[1]) : null;
        if (aA) {
          ctx.fillStyle = col.hl;
          ctx.fillRect(xFor(aA[0]), padT, xFor(aA[1]) - xFor(aA[0]), iH);
        }
        pA(smv, yL);
        ctx.fillStyle = col.area;
        ctx.fill();
        pL(opts.raw, yL);
        ctx.strokeStyle = col.raw;
        ctx.globalAlpha = 0.7;
        ctx.lineWidth = 1;
        ctx.stroke();
        ctx.globalAlpha = 1;
        pL(smv, yL);
        ctx.strokeStyle = col.line;
        ctx.lineWidth = 2;
        ctx.stroke();
        ctx.setLineDash([4, 3]);
        pS(opts.rep, yR);
        ctx.strokeStyle = col.rep;
        ctx.lineWidth = 1.5;
        ctx.globalAlpha = 0.85;
        ctx.stroke();
        ctx.globalAlpha = 1;
        ctx.setLineDash([]);
        for (let i = 0; i <= 4; i++) {
          const v = Math.round(opts.yT / 4 * i);
          tx(v.toString(), padL - 5, yL(v), col.sub, "end");
        }
        const steps = Math.min(4, opts.yR);
        for (let i = 0; i <= steps; i++) {
          const v = Math.round(opts.yR / steps * i);
          tx(v.toString(), padL + iW + 5, yR(v), col.sub);
        }
        tx("concurrent", padL, padT - 8, col.sub);
        tx("replicas", padL + iW, padT - 8, col.sub, "end");
        ctx.strokeStyle = col.brd;
        ctx.lineWidth = 1;
        ctx.beginPath();
        ctx.moveTo(padL, padT + iH);
        ctx.lineTo(padL + iW, padT + iH);
        ctx.stroke();
        if (active !== null) {
          const px = xFor(active);
          ctx.strokeStyle = col.rep;
          ctx.globalAlpha = 0.5;
          ctx.beginPath();
          ctx.moveTo(px, padT);
          ctx.lineTo(px, padT + iH);
          ctx.stroke();
          ctx.globalAlpha = 1;
          ctx.fillStyle = col.line;
          ctx.beginPath();
          ctx.arc(px, yL(smv[active]), 3.5, 0, Math.PI * 2);
          ctx.fill();
          ctx.strokeStyle = col.chip;
          ctx.lineWidth = 1.5;
          ctx.stroke();
          ctx.fillStyle = col.rep;
          ctx.beginPath();
          ctx.arc(px, yR(opts.rep[active]), 3, 0, Math.PI * 2);
          ctx.fill();
          ctx.strokeStyle = col.chip;
          ctx.stroke();
          const cx = Math.min(W - 96, Math.max(padL, px + 8));
          ctx.fillStyle = col.chip;
          ctx.strokeStyle = col.brd;
          ctx.lineWidth = 1;
          ctx.beginPath();
          ctx.roundRect(cx, padT + 4, 88, 32, 4);
          ctx.fill();
          ctx.stroke();
          tx(Math.round(smv[active]) + " concurrent", cx + 6, padT + 14, col.line, null, 10);
          tx(opts.rep[active] + " replicas", cx + 6, padT + 27, col.rep, null, 10);
        }
        if (aA) {
          sDot.style.background = DC[aA[4]] || "#869089";
          setRich(sTit, aA[2]);
          setRich(sBod, aA[3]);
          sBod.style.color = col.body;
        } else if (active !== null) {
          const r = opts.rep[active], s = Math.round(smv[active]);
          let dot = DC.m, tt, bd;
          if (r === 0 && s === 0) {
            dot = DC.f;
            tt = "Scaled to zero";
            bd = "No replicas, no traffic. Compute is free until the next request.";
          } else if (s === 0) {
            dot = DC.b;
            tt = "Idle, waiting on `scale_down_delay`";
            bd = r + " replica" + (r === 1 ? "" : "s") + " held warm. They are removed once idle exceeds `scale_down_delay`.";
          } else {
            tt = "No scale event";
            bd = "Smoothed traffic " + s + " concurrent requests, " + r + " replica" + (r === 1 ? "" : "s") + ". The autoscaler is steady: load is within current capacity.";
          }
          sDot.style.background = dot;
          setRich(sTit, tt);
          setRich(sBod, bd);
          sBod.style.color = col.body;
        } else {
          sDot.style.background = "#9CA59E";
          setRich(sTit, "Hover to inspect");
          setRich(sBod, "Move your cursor across the chart to see what the autoscaler is doing at any point in time.");
          sBod.style.color = col.body;
        }
        strip.style.cssText = "display:flex;align-items:flex-start;gap:10px;padding:10px 14px;margin:8px 0 0;border-radius:6px;min-height:56px;" + (isDark() ? "background:#0C1D13;border:1px solid #203026" : "background:#f4f9f3;border:1px solid #dee4de");
      }
      let prevHover = null;
      function loop() {
        raf = requestAnimationFrame(loop);
        if (!visible) return;
        if (dirty || hover !== prevHover) {
          prevHover = hover;
          dirty = false;
          draw();
        }
      }
      raf = requestAnimationFrame(loop);
      return () => {
        cancelAnimationFrame(raf);
        obs.disconnect();
        themeObs.disconnect();
        cv.remove();
        strip.remove();
      };
    };
  }, []);
  return <span />;
};

<AutoscaleChartEngine />

Different traffic patterns require different autoscaling configurations.
Identify your pattern below for recommended starting settings.

<Note>
  These are **starting points**, not final answers. Monitor your
  deployment's performance and adjust based on observed behavior. See
  [Autoscaling](/deployment/autoscaling/overview) for parameter details.
</Note>

***

## Identifying your pattern

Not sure which pattern you have? Check your metrics:

1. Go to your model's **Metrics** tab in the Baseten dashboard.
2. Look at **Inference volume** and **Replicas** over the past week.
3. Compare to the patterns below.

| You see...                                            | Your pattern is...              |
| ----------------------------------------------------- | ------------------------------- |
| Frequent small spikes that quickly return to baseline | [Jittery](#jittery-traffic)     |
| Sharp jumps that stay high for a while                | [Bursty](#bursty-traffic)       |
| Long flat periods with occasional large bursts        | [Scheduled](#scheduled-traffic) |
| Gradual rises and falls, smooth curves                | [Steady](#steady-traffic)       |

<Note>
  Some workloads are a mix of patterns. If your traffic has both smooth diurnal patterns AND occasional bursts, optimize for the bursts (they cause the most pain) and accept slightly higher cost during steady periods.
</Note>

***

## Jittery traffic

Small, frequent spikes that quickly return to baseline.

<AutoscaleJittery />

### Characteristics

* Baseline replica count is steady, but **spikes up by 2x several times per hour**.
* Spikes are short-lived and return to baseline quickly.
* Often not real load growth, just temporary surges causing overreaction.

### Common causes

* Consumer products with intermittent usage bursts.
* Traffic splitting or A/B testing with low percentages.
* Polling clients with synchronized intervals.

### Recommended settings

| Parameter          | Value             | Why                                             |
| ------------------ | ----------------- | ----------------------------------------------- |
| Autoscaling window | **2-5 minutes**   | Smooth out noise, avoid reacting to every spike |
| Scale-down delay   | **300-600s**      | Moderate stability                              |
| Target utilization | **70%**           | Default is fine                                 |
| Concurrency target | Benchmarked value | Start conservative                              |

A longer autoscaling window averages out the jitter so the autoscaler doesn't chase every small spike. You're trading reaction speed for stability, which is acceptable when the spikes aren't sustained load increases.

<Tip>
  If you're still seeing oscillation with these settings, increase the scale-down delay before lowering target utilization.
</Tip>

***

## Bursty traffic

<AutoscaleBursty />

### Characteristics

* Traffic **jumps sharply** (2x+ within 60 seconds).
* Stays high for a sustained period before dropping.
* The "pain" is queueing and latency spikes while new replicas start.

### Common causes

* Daily morning ramp-up (users starting their day).
* Marketing events, product launches, viral moments.
* Top-of-hour scheduled jobs or cron-triggered traffic.

### Recommended settings

| Parameter          | Value      | Why                                           |
| ------------------ | ---------- | --------------------------------------------- |
| Autoscaling window | **30-60s** | React quickly to genuine load increases       |
| Scale-down delay   | **900s+**  | Handle back-to-back waves without thrashing   |
| Target utilization | **50-60%** | More headroom absorbs the burst while scaling |
| Min replicas       | **≥2**     | Redundancy + reduces cold start impact        |

Short window means fast reaction. Long delay prevents scaling down between waves. Lower utilization gives you buffer capacity while new replicas start.

### Pre-warming for predictable bursts

If your bursts are predictable (morning ramp, scheduled events), pre-warm by bumping min replicas before the expected spike:

```bash theme={"system"}
curl -X PATCH \
  https://api.baseten.co/v1/models/{model_id}/deployments/{deployment_id}/autoscaling_settings \
  -H "Authorization: Api-Key $BASETEN_API_KEY" \
  -d '{"min_replica": 5}'
```

After the burst subsides, reset to your normal minimum:

```bash theme={"system"}
curl -X PATCH \
  https://api.baseten.co/v1/models/{model_id}/deployments/{deployment_id}/autoscaling_settings \
  -H "Authorization: Api-Key $BASETEN_API_KEY" \
  -d '{"min_replica": 2}'
```

<Tip>
  Automate pre-warming with cron jobs or your orchestration system.
  Bumping min replicas 10-15 minutes before known peaks avoids cold starts for the first requests of the spike.
</Tip>

***

## Scheduled traffic

<AutoscaleScheduled />

### Characteristics

* **Long periods of low or zero traffic**.
* Large bursts tied to job schedules (hourly, daily, weekly).
* Traffic patterns are predictable but infrequent.

### Common causes

* ETL pipelines and data processing jobs.
* Embedding backfills and batch inference.
* Periodic evaluation or testing jobs.
* Document processing triggered by user uploads.

### Recommended settings

| Parameter          | Value                                                           | Why                                       |
| ------------------ | --------------------------------------------------------------- | ----------------------------------------- |
| Min replicas       | **0** (if cold starts acceptable) or **1** (during job windows) | Cost savings when idle                    |
| Scale-down delay   | **Moderate to high**                                            | Jobs often come in waves                  |
| Autoscaling window | **60-120s**                                                     | Don't overreact to the first few requests |
| Target utilization | **70%**                                                         | Default is fine                           |

Scale-to-zero saves significant cost during idle periods. The moderate window prevents overreacting to the initial requests of a batch. If jobs come in waves, a longer delay keeps replicas warm between them.

### Scheduled pre-warming

For predictable batch jobs, use cron + API to pre-warm.

5 minutes before the hourly job, scale up:

```bash theme={"system"}
55 * * * * curl -X PATCH \
  https://api.baseten.co/v1/models/{model_id}/deployments/{deployment_id}/autoscaling_settings \
  -H "Authorization: Api-Key $BASETEN_API_KEY" \
  -d '{"min_replica": 3}'
```

At 30 minutes past the hour, once the job has finished, scale back down:

```bash theme={"system"}
30 * * * * curl -X PATCH \
  https://api.baseten.co/v1/models/{model_id}/deployments/{deployment_id}/autoscaling_settings \
  -H "Authorization: Api-Key $BASETEN_API_KEY" \
  -d '{"min_replica": 0}'
```

<Warning>
  If you use scale-to-zero, the first request of each batch will experience a [cold start](/deployment/autoscaling/cold-starts). For latency-sensitive batch jobs, keep min replicas at 1 during expected job windows.
</Warning>

***

## Steady traffic

<AutoscaleSteady />

### Characteristics

* Traffic **rises and falls gradually** over the day.
* Classic diurnal pattern with no sharp edges.
* Predictable, cyclical behavior.

### Common causes

* Always-on inference APIs with consistent user base.
* B2B applications with business-hours usage.
* Production workloads with stable, mature traffic.

### Recommended settings

| Parameter          | Value        | Why                            |
| ------------------ | ------------ | ------------------------------ |
| Target utilization | **70-80%**   | Can run replicas hotter safely |
| Autoscaling window | **60-120s**  | Moderate reaction speed        |
| Scale-down delay   | **300-600s** | Moderate                       |
| Min replicas       | **≥2**       | Redundancy for production      |

Without sudden spikes, you don't need as much headroom. You can run replicas at higher utilization (lower cost) because load changes are gradual and predictable. The autoscaler has time to react.

<Tip>
  Smooth traffic is the easiest to tune. Start with defaults, monitor for a week, then optimize for cost by gradually raising target utilization while watching p95 latency.
</Tip>

***

## Next steps

* [Autoscaling](/deployment/autoscaling/overview): Full parameter documentation.
* [Troubleshooting autoscaling](/troubleshooting/deployments#autoscaling-issues): Diagnose and fix common problems.
* [Truss configuration reference](/reference/truss-configuration): Configure predict\_concurrency in your model.
