Streaming output significantly reduces wait time for generative AI models by returning results as they are generated instead of waiting for the full response.
Why streaming?
- ✅ Faster perceived response time – the first tokens arrive in under a second instead of after 10+ seconds.
- ✅ Improved user experience – partial output is immediately readable while the rest generates.
This guide walks through deploying Falcon 7B with streaming enabled.
1. Initialize Truss
truss init falcon-7b && cd falcon-7b
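This scaffolds a working Truss in the falcon-7b directory. The exact layout varies by Truss version, but it typically includes a config file and a model implementation:

falcon-7b/
  config.yaml      # resources, requirements, and model metadata
  model/
    __init__.py
    model.py       # where the Model class below lives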
2. Implement the model (non-streaming)
This first version loads the Falcon 7B model without streaming:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from typing import Dict

CHECKPOINT = "tiiuae/falcon-7b-instruct"
DEFAULT_MAX_NEW_TOKENS = 150
DEFAULT_TOP_P = 0.95


class Model:
    def __init__(self, **kwargs) -> None:
        self.tokenizer = None
        self.model = None

    def load(self):
        self.tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
        self.model = AutoModelForCausalLM.from_pretrained(
            CHECKPOINT,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            device_map="auto",
        )

    def predict(self, request: Dict) -> Dict:
        prompt = request["prompt"]
        inputs = self.tokenizer(
            prompt, return_tensors="pt", max_length=512, truncation=True, padding=True
        )
        input_ids = inputs["input_ids"].to("cuda")
        generation_config = GenerationConfig(temperature=1, top_p=DEFAULT_TOP_P, top_k=40)

        # Generation blocks until the full sequence is complete; only then
        # is anything returned to the caller.
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=input_ids,
                generation_config=generation_config,
                return_dict_in_generate=True,
                output_scores=True,
                pad_token_id=self.tokenizer.eos_token_id,
                max_new_tokens=DEFAULT_MAX_NEW_TOKENS,
            )
        # Decode to a JSON-serializable response (includes the prompt tokens).
        return {"output": self.tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)}
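Before adding streaming, you can sanity-check this class locally. A minimal sketch, assuming a CUDA GPU with enough free memory and the scaffold's model/model.py layout:

# Local smoke test for the non-streaming Model class.
from model.model import Model

model = Model()
model.load()  # downloads the Falcon 7B weights on first run
result = model.predict({"prompt": "What do falcons eat?"})
print(result["output"])

Note the delay: nothing prints until the entire completion has been generated. That is the wait streaming eliminates.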
3. Add streaming support
To enable streaming, we:
- Use TextIteratorStreamer to stream tokens as they are generated.
- Run generate() in a separate thread to prevent blocking.
- Return a generator that yields text as it arrives.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, TextIteratorStreamer
from threading import Thread
from typing import Dict

CHECKPOINT = "tiiuae/falcon-7b-instruct"


class Model:
    def __init__(self, **kwargs) -> None:
        self.tokenizer = None
        self.model = None

    def load(self):
        self.tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
        self.model = AutoModelForCausalLM.from_pretrained(
            CHECKPOINT,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            device_map="auto",
        )

    def predict(self, request: Dict):
        prompt = request["prompt"]
        inputs = self.tokenizer(
            prompt, return_tensors="pt", max_length=512, truncation=True, padding=True
        )
        input_ids = inputs["input_ids"].to("cuda")

        # The streamer receives tokens from generate() and exposes them as an
        # iterator of decoded text chunks.
        streamer = TextIteratorStreamer(self.tokenizer)
        generation_config = GenerationConfig(temperature=1, top_p=0.95, top_k=40)

        def generate():
            self.model.generate(
                input_ids=input_ids,
                generation_config=generation_config,
                return_dict_in_generate=True,
                output_scores=True,
                pad_token_id=self.tokenizer.eos_token_id,
                max_new_tokens=150,
                streamer=streamer,
            )

        # Run generation in a background thread so predict() can return the
        # generator immediately instead of blocking until completion.
        thread = Thread(target=generate)
        thread.start()

        def stream_output():
            # Yield each decoded chunk as soon as the streamer produces it.
            for text in streamer:
                yield text
            thread.join()

        return stream_output()
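You can verify the change locally the same way as before. predict() now returns a generator, so chunks print as they arrive (same assumptions as the earlier sketch):

from model.model import Model

model = Model()
model.load()
# Each chunk prints as soon as it is generated, not after completion.
for chunk in model.predict({"prompt": "Tell me about falcons"}):
    print(chunk, end="", flush=True)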
4. Configure the Truss
In config.yaml, set the model name, Python requirements, and resources. Falcon 7B in bfloat16 needs roughly 14 GB of VRAM (7B parameters × 2 bytes), so a 24 GB L4 is sufficient:

model_name: falcon-streaming
requirements:
  - torch==2.0.1
  - peft==0.4.0
  - scipy==1.11.1
  - sentencepiece==0.1.99
  - accelerate==0.21.0
  - bitsandbytes==0.41.1
  - einops==0.6.1
  - transformers==4.31.0
resources:
  cpu: "4"
  memory: 16Gi
  use_gpu: true
  accelerator: L4
5. Deploy and invoke
Deploy the model:
truss push
Invoke with:
truss predict -d '{"prompt": "Tell me about falcons", "do_sample": true}'
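Once deployed, you can also consume the stream over HTTP. A sketch using requests; the model ID and API key are placeholders to replace with your own values:

import requests

# Placeholders: substitute your Baseten model ID and API key.
resp = requests.post(
    "https://model-MODEL_ID.api.baseten.co/production/predict",
    headers={"Authorization": "Api-Key YOUR_API_KEY"},
    json={"prompt": "Tell me about falcons"},
    stream=True,  # keep the connection open and read chunks as they arrive
)
for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
    print(chunk, end="", flush=True)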