How to Set Up Fireworks AI for Production Inference
Deploy low-latency AI inference with Fireworks AI optimized serving.
Jay Banlasan
The AI Systems Guy
Fireworks AI production inference setup gives you sub-second response times on open-source models at lower cost than most managed alternatives. Where Groq uses custom hardware, Fireworks uses optimized serving on standard GPUs with smart batching and caching. I use Fireworks when a client needs fast Llama or Mixtral inference with reliable uptime SLAs and the ability to deploy fine-tuned models without managing infrastructure.
The platform's differentiator is FireFunction, their function-calling optimized model, and the ability to deploy custom models as private endpoints. If you have fine-tuned a Llama variant on your own data, Fireworks is the fastest path to serving it in production without standing up your own GPU cluster.
What You Need Before Starting
- A Fireworks AI account at fireworks.ai
- Your API key from the Fireworks dashboard
- Python 3.10+ with the openai library (pip install openai) - Fireworks is OpenAI-compatible
- Set FIREWORKS_API_KEY in your environment
Step 1: Make Your First Fireworks API Call
import os
from openai import OpenAI
# Fireworks exposes an OpenAI-compatible API, so the standard OpenAI client
# works unchanged — only the base URL and API key differ.
client = OpenAI(
    api_key=os.environ["FIREWORKS_API_KEY"],  # raises KeyError if the env var is unset
    base_url="https://api.fireworks.ai/inference/v1"
)
def chat_fireworks(
    prompt: str,
    model: str = "accounts/fireworks/models/llama-v3p1-8b-instruct",
    system_prompt: str = "You are a helpful assistant.",
    max_tokens: int = 1024
) -> str:
    """Send a single-turn chat request to Fireworks and return the reply text.

    Args:
        prompt: The user message.
        model: Fireworks model ID (defaults to Llama 3.1 8B Instruct).
        system_prompt: System message steering the assistant's behavior.
        max_tokens: Cap on the number of generated tokens.

    Returns:
        The assistant's reply as plain text.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model=model,
        messages=conversation,
        max_tokens=max_tokens,
        temperature=0.7
    )
    return completion.choices[0].message.content
# Smoke test: one round-trip through the Fireworks endpoint (network call).
result = chat_fireworks("Explain the difference between latency and throughput in two sentences.")
print(result)
Fireworks model IDs use the accounts/fireworks/models/ prefix for their hosted models. Find the full list at fireworks.ai/models.
Step 2: Select the Right Model for Your Use Case
Fireworks hosts multiple model families. Match the model to the task.
# Model reference - update with current Fireworks catalog
FIREWORKS_MODELS = {
    # Fast, cheap for simple tasks
    "fast": "accounts/fireworks/models/llama-v3p1-8b-instruct",
    # Balanced quality/speed
    "balanced": "accounts/fireworks/models/llama-v3p1-70b-instruct",
    # Highest quality open-source
    "quality": "accounts/fireworks/models/llama-v3p1-405b-instruct",
    # Function calling optimized
    "function_calling": "accounts/fireworks/models/firefunction-v2",
    # Code generation
    "code": "accounts/fireworks/models/deepseek-coder-v2-instruct",
    # Low latency for chat
    "chat": "accounts/fireworks/models/mixtral-8x22b-instruct",
}

# Which catalog tier handles which kind of task.
_TIER_FOR_TASK = {
    "classify": "fast",
    "summarize": "balanced",
    "generate": "quality",
    "function_call": "function_calling",
    "code": "code",
    "chat": "chat",
}


def select_model(task_type: str) -> str:
    """Return the Fireworks model ID best suited to *task_type*.

    Unrecognized task types fall back to the "balanced" tier.
    """
    tier = _TIER_FOR_TASK.get(task_type, "balanced")
    return FIREWORKS_MODELS[tier]
Step 3: Use Function Calling with FireFunction
FireFunction is Fireworks' model optimized for structured output via function calling. Use it when you need reliable JSON output.
import json


def extract_with_function_calling(text: str) -> dict:
    """Extract structured contact fields from free text via FireFunction.

    tool_choice forces the model to call the extraction function, so the
    response is guaranteed to carry JSON-parseable arguments.
    """
    field_schema = {
        "name": {"type": "string", "description": "Full name"},
        "email": {"type": "string", "description": "Email address"},
        "company": {"type": "string", "description": "Company name"},
        "phone": {"type": "string", "description": "Phone number"},
        "intent": {
            "type": "string",
            "enum": ["buy", "learn", "compare", "support", "unknown"],
            "description": "Primary intent"
        },
    }
    extraction_tool = {
        "type": "function",
        "function": {
            "name": "extract_contact_info",
            "description": "Extract contact information from text",
            "parameters": {
                "type": "object",
                "properties": field_schema,
                # Only intent is mandatory; other fields appear when present.
                "required": ["intent"],
            },
        },
    }
    completion = client.chat.completions.create(
        model=FIREWORKS_MODELS["function_calling"],
        messages=[{"role": "user", "content": f"Extract info from this inquiry: {text}"}],
        tools=[extraction_tool],
        tool_choice={"type": "function", "function": {"name": "extract_contact_info"}}
    )
    forced_call = completion.choices[0].message.tool_calls[0]
    return json.loads(forced_call.function.arguments)
# Test: structured extraction from a free-form sales inquiry (network call).
result = extract_with_function_calling(
    "Hi, I'm James from BuildCorp ([email protected]). We're looking to compare enterprise AI solutions for our team of 50."
)
print(result)
# Output: {"name": "James", "email": "[email protected]", "company": "BuildCorp", "intent": "compare"}
# NOTE: "phone" may be absent — only "intent" is required by the schema.
Step 4: Enable Streaming for Real-Time Output
def stream_response(prompt: str, model: str | None = None) -> str:
    """Stream a completion, printing tokens as they arrive.

    Args:
        prompt: User message to send.
        model: Fireworks model ID; defaults to the "balanced" catalog tier.
            (Annotation fixed: the default is None, so the type is str | None.)

    Returns:
        The full response text, assembled from the streamed chunks.
    """
    if model is None:
        model = FIREWORKS_MODELS["balanced"]
    # Collect chunks in a list and join once — repeated str += is quadratic.
    pieces: list[str] = []
    stream = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=1024,
        stream=True
    )
    for chunk in stream:
        delta = chunk.choices[0].delta
        # Some chunks (role headers, finish events) carry no content.
        if delta.content:
            print(delta.content, end="", flush=True)
            pieces.append(delta.content)
    print()  # newline after stream
    return "".join(pieces)
Step 5: Deploy a Custom Fine-Tuned Model
If you have fine-tuned a model, upload it to Fireworks as a private deployment.
import requests

# The model-management endpoints live under a different base path than the
# OpenAI-compatible inference API, so they are called with requests directly.
FIREWORKS_API_KEY = os.environ["FIREWORKS_API_KEY"]  # raises KeyError if unset
def upload_model(
    model_path: str,  # Local path to model files or HuggingFace model ID
    display_name: str,
    account_id: str
) -> str:
    """Upload a model to Fireworks for private serving.

    Args:
        model_path: Base model reference, e.g. "meta-llama/Meta-Llama-3.1-8B".
        display_name: Human-readable name; also lowercased/hyphenated to
            derive the model ID.
        account_id: Your Fireworks account ID.

    Returns:
        The server-assigned resource name of the uploaded model.

    Raises:
        RuntimeError: If the API rejects the upload (non-200 response).
    """
    headers = {
        "Authorization": f"Bearer {FIREWORKS_API_KEY}",
        "Content-Type": "application/json"
    }
    # For HuggingFace models, use the HF model ID directly.
    payload = {
        "displayName": display_name,
        "modelId": f"{account_id}/{display_name.lower().replace(' ', '-')}",
        "baseModel": model_path,
    }
    # f-string for the URL, consistent with deploy_model below.
    response = requests.post(
        f"https://api.fireworks.ai/v1/accounts/{account_id}/models",
        headers=headers,
        json=payload
    )
    if response.status_code == 200:
        return response.json()["name"]
    raise RuntimeError(f"Upload failed: {response.text}")
def deploy_model(model_name: str, account_id: str) -> dict:
    """Deploy an uploaded model to a serving endpoint.

    Args:
        model_name: Resource name returned by upload_model.
        account_id: Your Fireworks account ID.

    Returns:
        The deployment record returned by the API.

    Raises:
        RuntimeError: If the deployment request fails. (Previously a failed
            request silently returned the error body as a normal-looking
            dict; now this raises, matching upload_model.)
    """
    headers = {
        "Authorization": f"Bearer {FIREWORKS_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": model_name,
        "gpuType": "NVIDIA_A100_80G",
        "minReplicas": 1,
        "maxReplicas": 3,
        "acceleratorCount": 1
    }
    response = requests.post(
        f"https://api.fireworks.ai/v1/accounts/{account_id}/deployedModels",
        headers=headers,
        json=payload
    )
    if response.status_code != 200:
        raise RuntimeError(f"Deployment failed: {response.text}")
    return response.json()
Step 6: Build a Latency Monitor
For production inference, track your actual latency percentiles, not just averages.
import math
import statistics
import time


def _percentile(sorted_values: list[float], q: float) -> float:
    """Nearest-rank percentile of an ascending-sorted, non-empty sample."""
    idx = max(0, math.ceil(q * len(sorted_values)) - 1)
    return sorted_values[min(idx, len(sorted_values) - 1)]


def measure_latency(
    prompt: str,
    model: str,
    n_samples: int = 10
) -> dict:
    """Measure end-to-end request latency for a Fireworks model.

    Issues n_samples identical requests and reports latency stats in
    milliseconds. Uses time.perf_counter — a monotonic clock — so results
    are immune to wall-clock adjustments (time.time can jump).

    Args:
        prompt: Prompt to send on every request.
        model: Fireworks model ID to benchmark.
        n_samples: Number of requests to time.

    Returns:
        Dict with model, sample count, and p50/p95/p99/min/max/avg in ms.
    """
    latencies = []
    for _ in range(n_samples):  # loop index was unused
        start = time.perf_counter()
        client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=100  # Fixed token count for fair comparison
        )
        latencies.append((time.perf_counter() - start) * 1000)
    sorted_latencies = sorted(latencies)
    return {
        "model": model,
        "samples": n_samples,
        "p50_ms": round(statistics.median(latencies), 1),
        # Nearest-rank percentiles; the original int(n * q) index selected
        # one rank too high whenever n * q landed exactly on an integer.
        "p95_ms": round(_percentile(sorted_latencies, 0.95), 1),
        "p99_ms": round(_percentile(sorted_latencies, 0.99), 1),
        "min_ms": round(sorted_latencies[0], 1),
        "max_ms": round(sorted_latencies[-1], 1),
        "avg_ms": round(statistics.mean(latencies), 1)
    }
# Compare latency across model tiers when run as a script.
if __name__ == "__main__":
    test_prompt = (
        "Summarize this in one sentence: AI inference optimization reduces"
        " latency by batching requests."
    )
    for tier in ("fast", "balanced"):
        model_id = FIREWORKS_MODELS[tier]
        print(f"\nMeasuring {tier}...")
        report = measure_latency(test_prompt, model_id, n_samples=5)
        print(
            f"  p50: {report['p50_ms']}ms"
            f" | p95: {report['p95_ms']}ms"
            f" | p99: {report['p99_ms']}ms"
        )
What to Build Next
- Set up Fireworks' embedding models for vector search to pair with their generative models
- Build an auto-scaling wrapper that switches to a smaller model during high traffic to control costs
- Implement request caching using exact-match SHA256 hashing to avoid re-running identical prompts
Related Reading
- AI for Podcast and Video Production - ai podcast video production
- Implementing AI for Podcast Production - ai podcast production implementation
- The Guard Rail Pattern for Production AI - guard rail pattern production ai
Want this system built for your business?
Get a free assessment. We will map every system your business needs and show you the ROI.
Get Your Free Assessment