> ## Documentation Index > Fetch the complete documentation index at: https://docs.baseten.co/llms.txt > Use this file to discover all available pages before exploring further. # Model APIs > OpenAI-compatible endpoints for high-performance LLMs export const FeatureSupportTable = () => { /* Sortable table component listing, per model, support for tool calling, structured outputs, JSON mode, reasoning, and vision. Tracks the sort column and direction with useState. */ /* One entry per model; every field is a string, which is what lets the comparator below call localeCompare on any column. */ const rows = [{ model: "DeepSeek V4 Pro", tools: "✓", structured: "✓", json: "✓", reasoning: "Enabled by default", vision: "–" }, { model: "GLM 4.7", tools: "✓", structured: "✓", json: "✓", reasoning: "Opt-in", vision: "–" }, { model: "GLM 5", tools: "✓", structured: "✓", json: "✓", reasoning: "Opt-in", vision: "–" }, { model: "GLM 5.1", tools: "✓", structured: "✓", json: "✓", reasoning: "Opt-in", vision: "–" }, { model: "GLM 5.2", tools: "✓", structured: "✓", json: "✓", reasoning: "Opt-in", vision: "–" }, { model: "Kimi K2.5", tools: "✓", structured: "✓", json: "✓", reasoning: "Opt-in", vision: "✓" }, { model: "Kimi K2.6", tools: "✓", structured: "✓", json: "✓", reasoning: "Opt-in", vision: "✓" }, { model: "Kimi K2.7 Code", tools: "✓", structured: "✓", json: "✓", reasoning: "Opt-in", vision: "–" }, { model: "Nemotron Super", tools: "✓", structured: "✓", json: "✓", reasoning: "Opt-in", vision: "–" }, { model: "Nemotron Ultra", tools: "✓", structured: "✓", json: "✓", reasoning: "Opt-in", vision: "–" }, { model: "OpenAI GPT 120B", tools: "✓", structured: "✓", json: "✓", reasoning: "Enabled by default", vision: "–" }]; /* Sort state: starts on the "model" column, ascending. */ const [sortKey, setSortKey] = useState("model"); const [sortDir, setSortDir] = useState("asc"); /* Selecting the already-active column flips the direction; selecting a different column switches to it and resets to ascending. */ const toggle = key => { if (key === sortKey) { setSortDir(d => d === "asc" ? "desc" : "asc"); } else { setSortKey(key); setSortDir("asc"); } }; /* Sort a copy so `rows` itself is never mutated; the comparison result is negated for descending order. */ const sorted = [...rows].sort((a, b) => { const cmp = a[sortKey].localeCompare(b[sortKey]); return sortDir === "asc" ? 
cmp : -cmp; }); /* Column descriptors: `key` names the row field (and is the value passed to toggle), `label` is the header text. */ const cols = [{ key: "model", label: "Model" }, { key: "tools", label: "Tool calling" }, { key: "structured", label: "Structured outputs" }, { key: "json", label: "JSON mode" }, { key: "reasoning", label: "Reasoning" }, { key: "vision", label: "Vision" }]; /* NOTE(review): the markup that follows `return` looks truncated in this copy (no element tags are visible) — confirm against the original source. */ return

{cols.map(({key, label}) => )} {sorted.map(row => )}

toggle(key)} className="cursor-pointer select-none border-b border-zinc-200 dark:border-zinc-700 px-4 py-2 font-medium text-left text-zinc-600 dark:text-zinc-400 hover:text-zinc-900 dark:hover:text-zinc-100"> {label}
{row.model}	{row.tools}	{row.structured}	{row.json}	{row.reasoning}	{row.vision}

; }; export const SupportedModelsTable = () => { /* Sortable table component listing each model's display name, API slug, context size, and max output size. Tracks the sort column and direction with useState. */ /* `context` and `maxOutput` are plain numbers; the row markup appends a "k" suffix when rendering them (presumably thousands of tokens — confirm). */ const rows = [{ model: "DeepSeek V4 Pro", slug: "deepseek-ai/DeepSeek-V4-Pro", context: 1048, maxOutput: 1048 }, { model: "GLM 4.7", slug: "zai-org/GLM-4.7", context: 200, maxOutput: 200 }, { model: "GLM 5", slug: "zai-org/GLM-5", context: 202, maxOutput: 202 }, { model: "GLM 5.1", slug: "zai-org/GLM-5.1", context: 202, maxOutput: 202 }, { model: "GLM 5.2", slug: "zai-org/GLM-5.2", context: 202, maxOutput: 202 }, { model: "Kimi K2.5", slug: "moonshotai/Kimi-K2.5", context: 262, maxOutput: 262 }, { model: "Kimi K2.6", slug: "moonshotai/Kimi-K2.6", context: 262, maxOutput: 262 }, { model: "Kimi K2.7 Code", slug: "moonshotai/Kimi-K2.7-Code", context: 262, maxOutput: 262 }, { model: "Nemotron Super", slug: "nvidia/Nemotron-120B-A12B", context: 202, maxOutput: 202 }, { model: "Nemotron Ultra", slug: "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B", context: 202, maxOutput: 202 }, { model: "OpenAI GPT 120B", slug: "openai/gpt-oss-120b", context: 128, maxOutput: 128 }]; /* Sort state: starts on the "model" column, ascending. */ const [sortKey, setSortKey] = useState("model"); const [sortDir, setSortDir] = useState("asc"); /* Selecting the already-active column flips the direction; selecting a different column switches to it and resets to ascending. */ const toggle = key => { if (key === sortKey) { setSortDir(d => d === "asc" ? "desc" : "asc"); } else { setSortKey(key); setSortDir("asc"); } }; /* Sort a copy so `rows` itself is never mutated. String columns compare with localeCompare; numeric columns compare by subtraction so they order by value rather than lexicographically. */ const sorted = [...rows].sort((a, b) => { const av = a[sortKey], bv = b[sortKey]; const cmp = typeof av === "string" ? av.localeCompare(bv) : av - bv; return sortDir === "asc" ? cmp : -cmp; }); /* Column descriptors: `key` names the row field, `label` is the header text, and `right` picks the "text-right" class over "text-left" in the header className. */ const cols = [{ key: "model", label: "Model", right: false }, { key: "slug", label: "Slug", right: false }, { key: "context", label: "Context", right: true }, { key: "maxOutput", label: "Max output", right: true }]; /* NOTE(review): the markup that follows `return` looks truncated in this copy (no element tags are visible) — confirm against the original source. */ return

{cols.map(({key, label, right}) => )} {sorted.map(row => )}

toggle(key)} className={`cursor-pointer select-none border-b border-zinc-200 dark:border-zinc-700 px-4 py-2 font-medium text-zinc-600 dark:text-zinc-400 hover:text-zinc-900 dark:hover:text-zinc-100 ${right ? "text-right" : "text-left"}`}> {label}
{row.model}	`{row.slug}`	{row.context}k	{row.maxOutput}k

; };

Model APIs provide instant access to high-performance LLMs through endpoints that are compatible with both the [OpenAI Chat Completions API](/reference/inference-api/chat-completions) and the [Anthropic Messages API](/reference/inference-api/messages) (beta). Point your existing OpenAI or Anthropic SDK at Baseten's inference endpoint and start making calls; no model deployment is required.

Unlike [dedicated deployments](/development/model/build-your-first-model), where you'd configure hardware, engines, and scaling yourself, Model APIs run on shared infrastructure that Baseten manages. You get a fixed set of popular models with optimized serving out of the box. When you need a model that isn't in the supported list, or want dedicated GPUs with custom scaling, deploy your own with [Truss](/development/model/overview).

## Supported models

<SupportedModelsTable />

[Run inference](#run-inference) against any Model API to get started.

## Pricing

Model APIs bill per million tokens. For current per-model rates, see the [Model APIs pricing page](https://www.baseten.co/pricing).

Cached input tokens are prompt tokens served from the KV cache, billed at a discounted rate. Every request participates in caching automatically, with no flags or opt-in steps.

## Feature support

All models support [tool calling](/inference/function-calling) (also known as function calling), [structured outputs](/inference/structured-outputs), and [JSON mode](/inference/json-mode). See the table below for per-model coverage of reasoning and vision.

<FeatureSupportTable />

For reasoning-specific configuration, see [Reasoning](/inference/model-apis/reasoning). For image and video inputs, see [Vision](/inference/model-apis/vision).

GLM models, Nemotron Super, and Nemotron Ultra also support the `top_p` and `top_k` sampling parameters.

## Run inference

Model APIs support both OpenAI's Chat Completions and Anthropic's Messages APIs. Set your base URL, API key, and [model name](#supported-models) to start making requests. 
### Use the OpenAI SDK Call supported models using the [OpenAI Chat Completions API](/reference/inference-api/chat-completions) at `https://inference.baseten.co/v1/chat/completions`. ```python chat_completions.py theme={"system"} from openai import OpenAI import os client = OpenAI( base_url="https://inference.baseten.co/v1", api_key=os.environ["BASETEN_API_KEY"], ) response = client.chat.completions.create( model="deepseek-ai/DeepSeek-V4-Pro", messages=[ {"role": "system", "content": "You are a concise technical writer."}, {"role": "user", "content": "What is gradient descent?"}, {"role": "assistant", "content": "An optimization algorithm that iteratively adjusts model parameters by moving in the direction of steepest decrease in the loss function."}, {"role": "user", "content": "How does the learning rate affect it?"} ], ) print(response.choices[0].message.content) ``` ```javascript chat_completions.js theme={"system"} import OpenAI from "openai"; const client = new OpenAI({ baseURL: "https://inference.baseten.co/v1", apiKey: process.env.BASETEN_API_KEY, }); const response = await client.chat.completions.create({ model: "deepseek-ai/DeepSeek-V4-Pro", messages: [ { role: "system", content: "You are a concise technical writer." }, { role: "user", content: "What is gradient descent?" }, { role: "assistant", content: "An optimization algorithm that iteratively adjusts model parameters by moving in the direction of steepest decrease in the loss function." }, { role: "user", content: "How does the learning rate affect it?" 
} ], }); console.log(response.choices[0].message.content); ``` ```bash Request theme={"system"} curl https://inference.baseten.co/v1/chat/completions \ -H "Content-Type: application/json" \ -H "Authorization: Bearer $BASETEN_API_KEY" \ -d '{ "model": "deepseek-ai/DeepSeek-V4-Pro", "messages": [ {"role": "system", "content": "You are a concise technical writer."}, {"role": "user", "content": "What is gradient descent?"}, {"role": "assistant", "content": "An optimization algorithm that iteratively adjusts model parameters by moving in the direction of steepest decrease in the loss function."}, {"role": "user", "content": "How does the learning rate affect it?"} ] }' ``` Replace the model slug with any model from the supported models table. ### Use the Anthropic SDK Call supported models using the [Anthropic Messages API](/reference/inference-api/messages) at `https://inference.baseten.co/v1/messages`. Anthropic Messages API support is in **beta**. Behavior may change before general availability. For production workloads, use the [OpenAI Chat Completions API](/reference/inference-api/chat-completions). 
```python messages_api.py theme={"system"} import anthropic import os API_KEY = os.environ["BASETEN_API_KEY"] client = anthropic.Anthropic( base_url="https://inference.baseten.co", api_key=API_KEY, default_headers={"Authorization": f"Bearer {API_KEY}"}, ) response = client.messages.create( model="deepseek-ai/DeepSeek-V4-Pro", max_tokens=4096, system="You are a concise technical writer.", messages=[ {"role": "user", "content": "What is gradient descent?"}, {"role": "assistant", "content": "An optimization algorithm that iteratively adjusts model parameters by moving in the direction of steepest decrease in the loss function."}, {"role": "user", "content": "How does the learning rate affect it?"} ], ) for block in response.content: if block.type == "text": print(block.text) ``` ```javascript messages_api.js theme={"system"} import Anthropic from "@anthropic-ai/sdk"; const apiKey = process.env.BASETEN_API_KEY; const client = new Anthropic({ baseURL: "https://inference.baseten.co", apiKey: apiKey, defaultHeaders: { Authorization: `Bearer ${apiKey}` }, }); const response = await client.messages.create({ model: "deepseek-ai/DeepSeek-V4-Pro", max_tokens: 4096, system: "You are a concise technical writer.", messages: [ { role: "user", content: "What is gradient descent?" }, { role: "assistant", content: "An optimization algorithm that iteratively adjusts model parameters by moving in the direction of steepest decrease in the loss function." }, { role: "user", content: "How does the learning rate affect it?" 
} ], }); for (const block of response.content) { if (block.type === "text") console.log(block.text); } ```

```bash Request theme={"system"}
curl https://inference.baseten.co/v1/messages \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer $BASETEN_API_KEY" \
  -d '{
    "model": "deepseek-ai/DeepSeek-V4-Pro",
    "max_tokens": 4096,
    "system": "You are a concise technical writer.",
    "messages": [
      {"role": "user", "content": "What is gradient descent?"},
      {"role": "assistant", "content": "An optimization algorithm that iteratively adjusts model parameters by moving in the direction of steepest decrease in the loss function."},
      {"role": "user", "content": "How does the learning rate affect it?"}
    ]
  }'
```

The Anthropic SDK sends the API key as `x-api-key` by default. Baseten reads the `Authorization` header, so override `default_headers` (`defaultHeaders` in the JavaScript SDK) as shown.

## List available models

Query the `/v1/models` endpoint for the current list of models with metadata including pricing, context lengths, and supported features:

```bash Request theme={"system"}
curl https://inference.baseten.co/v1/models \
  -H "Authorization: Bearer $BASETEN_API_KEY"
```

## Migrate

To migrate to Baseten, change the base URL, API key, and model name.

**Migrating from the OpenAI SDK:**

1. Replace your OpenAI API key with a [Baseten API key](https://app.baseten.co/settings/api_keys).
2. Change the base URL to `https://inference.baseten.co/v1`.
3. Update the model name to a Baseten model slug.

```python migrate.py theme={"system"}
from openai import OpenAI
import os

client = OpenAI(
    base_url="https://inference.baseten.co/v1",  # [!code ++]
    api_key=os.environ["BASETEN_API_KEY"]  # [!code ++]
)

response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-V4-Pro",  # [!code ++]
    messages=[{"role": "user", "content": "Hello"}]
)
```

**Migrating from the Anthropic SDK:**

1. Replace your Anthropic API key with a [Baseten API key](https://app.baseten.co/settings/api_keys).
2. Change the base URL to `https://inference.baseten.co`.
3. 
Override `default_headers` so the SDK sends `Authorization` instead of `x-api-key`.
4. Update the model name to a [supported Baseten model slug](#supported-models).

```python migrate.py theme={"system"}
import anthropic
import os

API_KEY = os.environ["BASETEN_API_KEY"]

client = anthropic.Anthropic(
    base_url="https://inference.baseten.co",  # [!code ++]
    api_key=API_KEY,  # [!code ++]
    default_headers={"Authorization": f"Bearer {API_KEY}"},  # [!code ++]
)

response = client.messages.create(
    model="deepseek-ai/DeepSeek-V4-Pro",  # [!code ++]
    max_tokens=1024,
    messages=[{"role": "user", "content": "Hello"}]
)
```

## Handle errors

Model APIs return standard HTTP error codes:

| Code | Meaning                                 |
| ---- | --------------------------------------- |
| 400  | Invalid request (check your parameters) |
| 401  | Invalid or missing API key              |
| 402  | Payment required                        |
| 404  | Model not found                         |
| 429  | Rate limit exceeded                     |
| 500  | Internal server error                   |

Each error response includes a JSON body with details about the issue and suggested resolutions.

## Next steps

- [Reasoning](/inference/model-apis/reasoning): Control extended thinking for complex tasks.
- [Vision](/inference/model-apis/vision): Send images and videos alongside text.
- **Rate limits**: Understand and configure rate limits.
- **API reference**: Complete parameter documentation.