Serve a wide range of models
Create an API key for your Baseten account
# Export your Baseten API key so the client code below can read it from the
# environment (replace the placeholder with your real key).
export BASETEN_API_KEY="abcd.123456"
Add an access token for Hugging Face
Store the token as a Baseten secret named `hf_access_token` (the config below mounts it at `/secrets/hf_access_token`)
Install Truss in your local development environment
# Install/upgrade Truss (Baseten's model packaging tool) and the OpenAI client.
pip install --upgrade truss openai

# Create the project directory and an empty config file.
# NOTE: these must be two separate commands — fused onto one line, `mkdir`
# would create directories named "touch" and "qwen-2-5-3b-engine/config.yaml".
mkdir qwen-2-5-3b-engine
touch qwen-2-5-3b-engine/config.yaml
config.yaml
# Truss config for serving Qwen2.5-3B-Instruct with the vLLM OpenAI-compatible server.
model_metadata:
  engine_args:
    model: Qwen/Qwen2.5-3B-Instruct
  example_model_input: # Loads sample request into Baseten playground
    messages:
      - role: system
        content: "You are a helpful assistant."
      - role: user
        content: "What does Tongyi Qianwen mean?"
    stream: true
    max_tokens: 512
    temperature: 0.6
base_image:
  image: vllm/vllm-openai:v0.7.3
docker_server:
  # Reads the Hugging Face token from the mounted secret before launching vLLM.
  start_command: sh -c "HF_TOKEN=$(cat /secrets/hf_access_token) vllm serve Qwen/Qwen2.5-3B-Instruct --enable-prefix-caching --enable-chunked-prefill"
  readiness_endpoint: /health
  liveness_endpoint: /health
  predict_endpoint: /v1/completions
  server_port: 8000
runtime:
  predict_concurrency: 256
resources:
  accelerator: A10G
  use_gpu: true
# NOTE(review): start_command reads /secrets/hf_access_token, which Truss mounts
# for keys declared under `secrets:` — confirm whether this block should be
# `secrets:` rather than `environment_variables:` against the Truss reference.
environment_variables:
  hf_access_token: null
# Deploy the packaged model to Baseten as a published (production-ready) deployment.
truss push qwen-2-5-3b-engine --publish
"""Stream a chat completion from the deployed Qwen model via the
OpenAI-compatible endpoint that Baseten exposes for the deployment."""
import os

from openai import OpenAI

# Paste your deployment's OpenAI-compatible base URL, e.g.:
# https://model-XXXXXXX.api.baseten.co/environments/production/sync/v1
model_url = ""

client = OpenAI(
    base_url=model_url,
    # Reads the key exported earlier (BASETEN_API_KEY).
    api_key=os.environ.get("BASETEN_API_KEY"),
)

stream = client.chat.completions.create(
    model="Qwen/Qwen2.5-3B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What does Tongyi Qianwen mean?"},
    ],
    stream=True,
)

# Print tokens as they arrive; delta.content is None for non-content chunks
# (e.g. the role-only first chunk), so guard before printing.
for chunk in stream:
    if chunk.choices[0].delta.content is not None:
        print(chunk.choices[0].delta.content, end="")
Was this page helpful?