> ## Documentation Index
> Fetch the complete documentation index at: https://docs.baseten.co/llms.txt
> Use this file to discover all available pages before exploring further.

# Model APIs

> OpenAI- and Anthropic-compatible endpoints for high-performance LLMs

export const FeatureSupportTable = () => {
  const rows = [{
    model: "DeepSeek V3.1",
    tools: "✓",
    structured: "✓",
    json: "✓",
    reasoning: "Enabled by default",
    vision: "–"
  }, {
    model: "DeepSeek V4 Pro",
    tools: "✓",
    structured: "✓",
    json: "✓",
    reasoning: "Enabled by default",
    vision: "–"
  }, {
    model: "GLM 4.7",
    tools: "✓",
    structured: "✓",
    json: "✓",
    reasoning: "Opt-in",
    vision: "–"
  }, {
    model: "GLM 5",
    tools: "✓",
    structured: "✓",
    json: "✓",
    reasoning: "Opt-in",
    vision: "–"
  }, {
    model: "GLM 5.1",
    tools: "✓",
    structured: "✓",
    json: "✓",
    reasoning: "Opt-in",
    vision: "–"
  }, {
    model: "Kimi K2.5",
    tools: "✓",
    structured: "✓",
    json: "✓",
    reasoning: "Opt-in",
    vision: "✓"
  }, {
    model: "Kimi K2.6",
    tools: "✓",
    structured: "✓",
    json: "✓",
    reasoning: "Opt-in",
    vision: "✓"
  }, {
    model: "Minimax M2.5",
    tools: "✓",
    structured: "✓",
    json: "✓",
    reasoning: "Enabled by default",
    vision: "–"
  }, {
    model: "Nemotron Super",
    tools: "✓",
    structured: "✓",
    json: "✓",
    reasoning: "Enabled by default",
    vision: "–"
  }, {
    model: "OpenAI GPT 120B",
    tools: "✓",
    structured: "✓",
    json: "✓",
    reasoning: "Enabled by default",
    vision: "–"
  }];
  const [sortKey, setSortKey] = useState("model");
  const [sortDir, setSortDir] = useState("asc");
  const toggle = key => {
    if (key === sortKey) {
      setSortDir(d => d === "asc" ? "desc" : "asc");
    } else {
      setSortKey(key);
      setSortDir("asc");
    }
  };
  const sorted = [...rows].sort((a, b) => {
    const cmp = a[sortKey].localeCompare(b[sortKey]);
    return sortDir === "asc" ? cmp : -cmp;
  });
  const cols = [{
    key: "model",
    label: "Model"
  }, {
    key: "tools",
    label: "Tool calling"
  }, {
    key: "structured",
    label: "Structured outputs"
  }, {
    key: "json",
    label: "JSON mode"
  }, {
    key: "reasoning",
    label: "Reasoning"
  }, {
    key: "vision",
    label: "Vision"
  }];
  return <div className="not-prose overflow-x-auto">
      <table className="w-full text-sm border-collapse">
        <thead>
          <tr>
            {cols.map(({key, label}) => <th key={key} onClick={() => toggle(key)} className="cursor-pointer select-none border-b border-zinc-200 dark:border-zinc-700 px-4 py-2 font-medium text-left text-zinc-600 dark:text-zinc-400 hover:text-zinc-900 dark:hover:text-zinc-100">
                {label}
              </th>)}
          </tr>
        </thead>
        <tbody>
          {sorted.map(row => <tr key={row.model} className="border-b border-zinc-100 dark:border-zinc-800 hover:bg-zinc-50 dark:hover:bg-zinc-800/50">
              <td className="px-4 py-2 text-zinc-900 dark:text-zinc-100">{row.model}</td>
              <td className="px-4 py-2 text-zinc-700 dark:text-zinc-300">{row.tools}</td>
              <td className="px-4 py-2 text-zinc-700 dark:text-zinc-300">{row.structured}</td>
              <td className="px-4 py-2 text-zinc-700 dark:text-zinc-300">{row.json}</td>
              <td className="px-4 py-2 text-zinc-700 dark:text-zinc-300">{row.reasoning}</td>
              <td className="px-4 py-2 text-zinc-700 dark:text-zinc-300">{row.vision}</td>
            </tr>)}
        </tbody>
      </table>
    </div>;
};

export const SupportedModelsTable = () => {
  const rows = [{
    model: "DeepSeek V3.1",
    slug: "deepseek-ai/DeepSeek-V3.1",
    context: 163,
    maxOutput: 131
  }, {
    model: "DeepSeek V4 Pro",
    slug: "deepseek-ai/DeepSeek-V4-Pro",
    context: 131,
    maxOutput: 131
  }, {
    model: "GLM 4.7",
    slug: "zai-org/GLM-4.7",
    context: 200,
    maxOutput: 200
  }, {
    model: "GLM 5",
    slug: "zai-org/GLM-5",
    context: 202,
    maxOutput: 202
  }, {
    model: "GLM 5.1",
    slug: "zai-org/GLM-5.1",
    context: 202,
    maxOutput: 202
  }, {
    model: "Kimi K2.5",
    slug: "moonshotai/Kimi-K2.5",
    context: 262,
    maxOutput: 262
  }, {
    model: "Kimi K2.6",
    slug: "moonshotai/Kimi-K2.6",
    context: 262,
    maxOutput: 262
  }, {
    model: "Minimax M2.5",
    slug: "MiniMaxAI/MiniMax-M2.5",
    context: 204,
    maxOutput: 204
  }, {
    model: "Nemotron Super",
    slug: "nvidia/Nemotron-120B-A12B",
    context: 202,
    maxOutput: 202
  }, {
    model: "OpenAI GPT 120B",
    slug: "openai/gpt-oss-120b",
    context: 128,
    maxOutput: 128
  }];
  const [sortKey, setSortKey] = useState("model");
  const [sortDir, setSortDir] = useState("asc");
  const toggle = key => {
    if (key === sortKey) {
      setSortDir(d => d === "asc" ? "desc" : "asc");
    } else {
      setSortKey(key);
      setSortDir("asc");
    }
  };
  const sorted = [...rows].sort((a, b) => {
    const av = a[sortKey], bv = b[sortKey];
    const cmp = typeof av === "string" ? av.localeCompare(bv) : av - bv;
    return sortDir === "asc" ? cmp : -cmp;
  });
  const cols = [{
    key: "model",
    label: "Model",
    right: false
  }, {
    key: "slug",
    label: "Slug",
    right: false
  }, {
    key: "context",
    label: "Context",
    right: true
  }, {
    key: "maxOutput",
    label: "Max output",
    right: true
  }];
  return <div className="not-prose overflow-x-auto">
      <table className="w-full text-sm border-collapse">
        <thead>
          <tr>
            {cols.map(({key, label, right}) => <th key={key} onClick={() => toggle(key)} className={`cursor-pointer select-none border-b border-zinc-200 dark:border-zinc-700 px-4 py-2 font-medium text-zinc-600 dark:text-zinc-400 hover:text-zinc-900 dark:hover:text-zinc-100 ${right ? "text-right" : "text-left"}`}>
                {label}
              </th>)}
          </tr>
        </thead>
        <tbody>
          {sorted.map(row => <tr key={row.model} className="border-b border-zinc-100 dark:border-zinc-800 hover:bg-zinc-50 dark:hover:bg-zinc-800/50">
              <td className="px-4 py-2 text-zinc-900 dark:text-zinc-100">{row.model}</td>
              <td className="px-4 py-2 text-zinc-700 dark:text-zinc-300"><code className="text-xs">{row.slug}</code></td>
              <td className="px-4 py-2 text-right tabular-nums text-zinc-700 dark:text-zinc-300">{row.context}k</td>
              <td className="px-4 py-2 text-right tabular-nums text-zinc-700 dark:text-zinc-300">{row.maxOutput}k</td>
            </tr>)}
        </tbody>
      </table>
    </div>;
};

Model APIs provide instant access to high-performance LLMs through endpoints that are compatible with both the [OpenAI Chat Completions API](/reference/inference-api/chat-completions) and the [Anthropic Messages API](/reference/inference-api/messages). Point your existing OpenAI or Anthropic SDK at Baseten's inference endpoint and start making calls, no model deployment required.

Unlike [dedicated deployments](/development/model/build-your-first-model), where you'd configure hardware, engines, and scaling yourself, Model APIs run on shared infrastructure that Baseten manages. You get a fixed set of popular models with optimized serving out of the box. When you need a model that isn't in the supported list, or want dedicated GPUs with custom scaling, deploy your own with [Truss](/development/model/overview).

## Supported models

[Run inference](#run-inference) against any Model API to get started.

<SupportedModelsTable />

## Pricing

Model APIs bill per million tokens.
For current per-model rates, see the [Model APIs pricing page](https://www.baseten.co/pricing).

Cached input tokens are prompt tokens served from the KV cache, billed at a discounted rate.
Every request participates in caching automatically, with no flags or opt-in steps.

## Feature support

All models support [tool calling](/inference/function-calling) (also known as function calling), [structured outputs](/inference/structured-outputs), and [JSON mode](/inference/json-mode). See the table below for per-model coverage of reasoning and vision. For reasoning-specific configuration, see [Reasoning](/inference/model-apis/reasoning). For image and video inputs, see [Vision](/inference/model-apis/vision).

<FeatureSupportTable />

<Note>GLM models and Nemotron Super also support `top_p` and `top_k` sampling parameters.</Note>

## Run inference

Model APIs support both OpenAI's Chat Completions and Anthropic's Messages APIs. Set your base URL, API key, and [model name](#supported-models) to start making requests.

### Use the OpenAI SDK

Call supported models using the [OpenAI Chat Completions API](/reference/inference-api/chat-completions) at `https://inference.baseten.co/v1/chat/completions`.

<Tabs>
  <Tab title="Python">
    ```python theme={"system"}
    from openai import OpenAI
    import os

    client = OpenAI(
        base_url="https://inference.baseten.co/v1",
        api_key=os.environ["BASETEN_API_KEY"],
    )

    response = client.chat.completions.create(
        model="deepseek-ai/DeepSeek-V4-Pro",
        messages=[
            {"role": "system", "content": "You are a concise technical writer."},
            {"role": "user", "content": "What is gradient descent?"},
            {"role": "assistant", "content": "An optimization algorithm that iteratively adjusts model parameters by moving in the direction of steepest decrease in the loss function."},
            {"role": "user", "content": "How does the learning rate affect it?"}
        ],
    )

    print(response.choices[0].message.content)
    ```
  </Tab>

  <Tab title="JavaScript">
    ```javascript theme={"system"}
    import OpenAI from "openai";

    const client = new OpenAI({
        baseURL: "https://inference.baseten.co/v1",
        apiKey: process.env.BASETEN_API_KEY,
    });

    const response = await client.chat.completions.create({
        model: "deepseek-ai/DeepSeek-V4-Pro",
        messages: [
            { role: "system", content: "You are a concise technical writer." },
            { role: "user", content: "What is gradient descent?" },
            { role: "assistant", content: "An optimization algorithm that iteratively adjusts model parameters by moving in the direction of steepest decrease in the loss function." },
            { role: "user", content: "How does the learning rate affect it?" }
        ],
    });

    console.log(response.choices[0].message.content);
    ```
  </Tab>

  <Tab title="cURL">
    ```bash theme={"system"}
    curl https://inference.baseten.co/v1/chat/completions \
      -H "Content-Type: application/json" \
      -H "Authorization: Bearer $BASETEN_API_KEY" \
      -d '{
        "model": "deepseek-ai/DeepSeek-V4-Pro",
        "messages": [
          {"role": "system", "content": "You are a concise technical writer."},
          {"role": "user", "content": "What is gradient descent?"},
          {"role": "assistant", "content": "An optimization algorithm that iteratively adjusts model parameters by moving in the direction of steepest decrease in the loss function."},
          {"role": "user", "content": "How does the learning rate affect it?"}
        ]
      }'
    ```
  </Tab>
</Tabs>

Replace the model slug with any slug from the [supported models](#supported-models) table.

### Use the Anthropic SDK

Call supported models using the [Anthropic Messages API](/reference/inference-api/messages) at `https://inference.baseten.co/v1/messages`.

<Tabs>
  <Tab title="Python">
    ```python theme={"system"}
    import anthropic
    import os

    API_KEY = os.environ["BASETEN_API_KEY"]

    client = anthropic.Anthropic(
        base_url="https://inference.baseten.co",
        api_key=API_KEY,
        default_headers={"Authorization": f"Bearer {API_KEY}"},
    )

    response = client.messages.create(
        model="deepseek-ai/DeepSeek-V4-Pro",
        max_tokens=4096,
        system="You are a concise technical writer.",
        messages=[
            {"role": "user", "content": "What is gradient descent?"},
            {"role": "assistant", "content": "An optimization algorithm that iteratively adjusts model parameters by moving in the direction of steepest decrease in the loss function."},
            {"role": "user", "content": "How does the learning rate affect it?"}
        ],
    )

    for block in response.content:
        if block.type == "text":
            print(block.text)
    ```
  </Tab>

  <Tab title="JavaScript">
    ```javascript theme={"system"}
    import Anthropic from "@anthropic-ai/sdk";

    const apiKey = process.env.BASETEN_API_KEY;

    const client = new Anthropic({
        baseURL: "https://inference.baseten.co",
        apiKey: apiKey,
        defaultHeaders: { Authorization: `Bearer ${apiKey}` },
    });

    const response = await client.messages.create({
        model: "deepseek-ai/DeepSeek-V4-Pro",
        max_tokens: 4096,
        system: "You are a concise technical writer.",
        messages: [
            { role: "user", content: "What is gradient descent?" },
            { role: "assistant", content: "An optimization algorithm that iteratively adjusts model parameters by moving in the direction of steepest decrease in the loss function." },
            { role: "user", content: "How does the learning rate affect it?" }
        ],
    });

    for (const block of response.content) {
        if (block.type === "text") console.log(block.text);
    }
    ```
  </Tab>

  <Tab title="cURL">
    ```bash theme={"system"}
    curl https://inference.baseten.co/v1/messages \
      -H "Content-Type: application/json" \
      -H "Authorization: Bearer $BASETEN_API_KEY" \
      -d '{
        "model": "deepseek-ai/DeepSeek-V4-Pro",
        "max_tokens": 4096,
        "system": "You are a concise technical writer.",
        "messages": [
          {"role": "user", "content": "What is gradient descent?"},
          {"role": "assistant", "content": "An optimization algorithm that iteratively adjusts model parameters by moving in the direction of steepest decrease in the loss function."},
          {"role": "user", "content": "How does the learning rate affect it?"}
        ]
      }'
    ```
  </Tab>
</Tabs>

The Anthropic SDK sends the API key in the `x-api-key` header by default. Baseten reads the `Authorization` header instead, so override `default_headers` (Python) or `defaultHeaders` (JavaScript) as shown.

## List available models

Query the `/v1/models` endpoint for the current list of models with metadata including pricing, context lengths, and supported features.

```bash theme={"system"}
curl https://inference.baseten.co/v1/models \
  -H "Authorization: Bearer $BASETEN_API_KEY"
```

## Migrate

To migrate to Baseten, change the base URL, API key, and model name.

<Tabs>
  <Tab title="OpenAI SDK">
    1. Replace your OpenAI API key with a [Baseten API key](https://app.baseten.co/settings/api_keys).
    2. Change the base URL to `https://inference.baseten.co/v1`.
    3. Update the model name to a [supported Baseten model slug](#supported-models).

    ```python theme={"system"}
    from openai import OpenAI
    import os

    client = OpenAI(
        base_url="https://inference.baseten.co/v1",  # [!code ++]
        api_key=os.environ["BASETEN_API_KEY"]  # [!code ++]
    )

    response = client.chat.completions.create(
        model="deepseek-ai/DeepSeek-V4-Pro",  # [!code ++]
        messages=[{"role": "user", "content": "Hello"}]
    )
    ```
  </Tab>

  <Tab title="Anthropic SDK">
    1. Replace your Anthropic API key with a [Baseten API key](https://app.baseten.co/settings/api_keys).
    2. Change the base URL to `https://inference.baseten.co`.
    3. Override `default_headers` so the SDK sends `Authorization` instead of `x-api-key`.
    4. Update the model name to a [supported Baseten model slug](#supported-models).

    ```python theme={"system"}
    import anthropic
    import os

    API_KEY = os.environ["BASETEN_API_KEY"]

    client = anthropic.Anthropic(
        base_url="https://inference.baseten.co",  # [!code ++]
        api_key=API_KEY,  # [!code ++]
        default_headers={"Authorization": f"Bearer {API_KEY}"},  # [!code ++]
    )

    response = client.messages.create(
        model="deepseek-ai/DeepSeek-V4-Pro",  # [!code ++]
        max_tokens=1024,
        messages=[{"role": "user", "content": "Hello"}]
    )
    ```
  </Tab>
</Tabs>

## Handle errors

Model APIs return standard HTTP error codes:

| Code | Meaning                                 |
| ---- | --------------------------------------- |
| 400  | Invalid request (check your parameters) |
| 401  | Invalid or missing API key              |
| 402  | Payment required                        |
| 404  | Model not found                         |
| 429  | Rate limit exceeded                     |
| 500  | Internal server error                   |

Each error response includes a JSON body with details about the issue and suggested resolutions.

## Next steps

<CardGroup cols={2}>
  <Card title="Reasoning" icon="brain" href="/inference/model-apis/reasoning">
    Control extended thinking for complex tasks
  </Card>

  <Card title="Vision" icon="image" href="/inference/model-apis/vision">
    Send images and videos alongside text
  </Card>

  <Card title="Rate limits" icon="gauge" href="/inference/model-apis/rate-limits-and-budgets">
    Understand and configure rate limits
  </Card>

  <Card title="API reference" icon="code" href="/reference/inference-api/chat-completions">
    Complete parameter documentation
  </Card>
</CardGroup>
