Systems Library / AI Model Setup / How to Set Up Fireworks AI for Production Inference
AI Model Setup foundations

How to Set Up Fireworks AI for Production Inference

Deploy low-latency AI inference with Fireworks AI optimized serving.

Jay Banlasan

Jay Banlasan

The AI Systems Guy

Fireworks AI production inference setup gives you sub-second response times on open-source models at lower cost than most managed alternatives. Where Groq uses custom hardware, Fireworks uses optimized serving on standard GPUs with smart batching and caching. I use Fireworks when a client needs fast Llama or Mixtral inference with reliable uptime SLAs and the ability to deploy fine-tuned models without managing infrastructure.

The platform's differentiator is FireFunction, their function-calling optimized model, and the ability to deploy custom models as private endpoints. If you have fine-tuned a Llama variant on your own data, Fireworks is the fastest path to serving it in production without standing up your own GPU cluster.

What You Need Before Starting

You need a Fireworks AI account with an API key exported as the FIREWORKS_API_KEY environment variable, a recent Python 3 install, and the openai and requests packages installed (the Fireworks API is OpenAI-compatible, so the standard OpenAI SDK works unchanged).

Step 1: Make Your First Fireworks API Call

import os
from openai import OpenAI

# Fireworks exposes an OpenAI-compatible REST API, so the official OpenAI SDK
# works as-is -- only the api_key and base_url differ. Requires the
# FIREWORKS_API_KEY environment variable to be set (raises KeyError otherwise).
client = OpenAI(
    api_key=os.environ["FIREWORKS_API_KEY"],
    base_url="https://api.fireworks.ai/inference/v1"
)

def chat_fireworks(
    prompt: str,
    model: str = "accounts/fireworks/models/llama-v3p1-8b-instruct",
    system_prompt: str = "You are a helpful assistant.",
    max_tokens: int = 1024
) -> str:
    """Send a single-turn chat request to Fireworks and return the reply text.

    Args:
        prompt: The user message to send.
        model: Fireworks model ID (defaults to Llama 3.1 8B Instruct).
        system_prompt: System instruction prepended to the conversation.
        max_tokens: Upper bound on generated tokens.

    Returns:
        The assistant's reply as a plain string.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model=model,
        messages=conversation,
        max_tokens=max_tokens,
        temperature=0.7,
    )
    return completion.choices[0].message.content

# Test
# Smoke test: makes a live API call, so it needs FIREWORKS_API_KEY and network access.
result = chat_fireworks("Explain the difference between latency and throughput in two sentences.")
print(result)

Fireworks model IDs use the accounts/fireworks/models/ prefix for their hosted models. Find the full list at fireworks.ai/models.

Step 2: Select the Right Model for Your Use Case

Fireworks hosts multiple model families. Match the model to the task.

# Model reference - update with current Fireworks catalog
FIREWORKS_MODELS = {
    "fast": "accounts/fireworks/models/llama-v3p1-8b-instruct",          # cheap, low-latency simple tasks
    "balanced": "accounts/fireworks/models/llama-v3p1-70b-instruct",     # quality/speed trade-off
    "quality": "accounts/fireworks/models/llama-v3p1-405b-instruct",     # highest-quality open source
    "function_calling": "accounts/fireworks/models/firefunction-v2",     # structured output / tools
    "code": "accounts/fireworks/models/deepseek-coder-v2-instruct",      # code generation
    "chat": "accounts/fireworks/models/mixtral-8x22b-instruct",          # low-latency chat
}

# Task type -> FIREWORKS_MODELS key. Unrecognized task types fall back to "balanced".
_TASK_TO_MODEL_KEY = {
    "classify": "fast",
    "summarize": "balanced",
    "generate": "quality",
    "function_call": "function_calling",
    "code": "code",
    "chat": "chat",
}


def select_model(task_type: str) -> str:
    """Return the Fireworks model ID best suited to *task_type*.

    Args:
        task_type: One of "classify", "summarize", "generate",
            "function_call", "code", or "chat". Anything else maps to the
            balanced default.

    Returns:
        A fully-qualified Fireworks model ID string.
    """
    return FIREWORKS_MODELS[_TASK_TO_MODEL_KEY.get(task_type, "balanced")]

Step 3: Use Function Calling with FireFunction

FireFunction is Fireworks' model optimized for structured output via function calling. Use it when you need reliable JSON output.

import json

def extract_with_function_calling(text: str) -> dict:
    """Extract structured contact info from free text via FireFunction tool calling.

    Forces the model to call ``extract_contact_info`` and returns the parsed
    JSON arguments as a dict. Only ``intent`` is required by the schema; the
    other fields appear when the model finds them in *text*.

    Args:
        text: The raw inquiry text to extract from.

    Returns:
        A dict with some subset of name/email/company/phone plus intent.
    """
    parameter_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string", "description": "Full name"},
            "email": {"type": "string", "description": "Email address"},
            "company": {"type": "string", "description": "Company name"},
            "phone": {"type": "string", "description": "Phone number"},
            "intent": {
                "type": "string",
                "enum": ["buy", "learn", "compare", "support", "unknown"],
                "description": "Primary intent"
            },
        },
        "required": ["intent"],
    }
    extraction_tool = {
        "type": "function",
        "function": {
            "name": "extract_contact_info",
            "description": "Extract contact information from text",
            "parameters": parameter_schema,
        },
    }

    # tool_choice pins the model to this one function, so the response is
    # guaranteed to contain a tool call rather than free-form text.
    response = client.chat.completions.create(
        model=FIREWORKS_MODELS["function_calling"],
        messages=[{"role": "user", "content": f"Extract info from this inquiry: {text}"}],
        tools=[extraction_tool],
        tool_choice={"type": "function", "function": {"name": "extract_contact_info"}}
    )

    first_call = response.choices[0].message.tool_calls[0]
    return json.loads(first_call.function.arguments)

# Test
# Smoke test: makes a live API call, so it needs FIREWORKS_API_KEY and network access.
result = extract_with_function_calling(
    "Hi, I'm James from BuildCorp ([email protected]). We're looking to compare enterprise AI solutions for our team of 50."
)
print(result)
# Expected output (model-dependent, not guaranteed byte-for-byte):
# {"name": "James", "email": "[email protected]", "company": "BuildCorp", "intent": "compare"}

Step 4: Enable Streaming for Real-Time Output

def stream_response(prompt: str, model: str | None = None) -> str:
    """Stream a completion token-by-token, printing chunks as they arrive.

    Args:
        prompt: The user message to send.
        model: Fireworks model ID; defaults to the "balanced" catalog entry.

    Returns:
        The complete concatenated response text once the stream ends.
    """
    if model is None:
        model = FIREWORKS_MODELS["balanced"]

    # Collect chunks in a list and join once at the end -- repeated string
    # concatenation in a loop is quadratic.
    chunks: list[str] = []

    stream = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=1024,
        stream=True
    )

    for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.content:
            print(delta.content, end="", flush=True)
            chunks.append(delta.content)

    print()  # newline after stream
    return "".join(chunks)

Step 5: Deploy a Custom Fine-Tuned Model

If you have fine-tuned a model, upload it to Fireworks as a private deployment.

import requests

# Model upload/deploy endpoints are not part of the OpenAI-compatible surface,
# so the code below calls the Fireworks REST API directly with a bearer token.
FIREWORKS_API_KEY = os.environ["FIREWORKS_API_KEY"]

def upload_model(
    model_path: str,  # Local path to model files or HuggingFace model ID
    display_name: str,
    account_id: str
) -> str:
    """Upload a model to Fireworks for private serving.

    Args:
        model_path: HuggingFace model ID (e.g. "meta-llama/Meta-Llama-3.1-8B")
            or a local path to model files.
        display_name: Human-readable name; also slugified into the model ID.
        account_id: Your Fireworks account ID.

    Returns:
        The server-assigned resource name of the uploaded model.

    Raises:
        RuntimeError: If the API rejects the upload.
    """
    headers = {
        "Authorization": f"Bearer {FIREWORKS_API_KEY}",
        "Content-Type": "application/json"
    }

    # For HuggingFace models, use the HF model ID directly
    payload = {
        "displayName": display_name,
        "modelId": f"{account_id}/{display_name.lower().replace(' ', '-')}",
        "baseModel": model_path,  # e.g., "meta-llama/Meta-Llama-3.1-8B"
    }

    response = requests.post(
        f"https://api.fireworks.ai/v1/accounts/{account_id}/models",
        headers=headers,
        json=payload,
        timeout=60,  # don't hang forever on a stalled connection
    )

    # Accept any 2xx success status -- create endpoints may answer 200 or 201.
    if response.ok:
        return response.json()["name"]
    raise RuntimeError(f"Upload failed: {response.text}")

def deploy_model(model_name: str, account_id: str) -> dict:
    """Deploy an uploaded model to a serving endpoint.

    Args:
        model_name: Resource name returned by upload_model().
        account_id: Your Fireworks account ID.

    Returns:
        The deployment record as returned by the API.

    Raises:
        RuntimeError: If the API rejects the deployment request.
    """
    headers = {
        "Authorization": f"Bearer {FIREWORKS_API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": model_name,
        "gpuType": "NVIDIA_A100_80G",
        "minReplicas": 1,
        "maxReplicas": 3,
        "acceleratorCount": 1
    }

    response = requests.post(
        f"https://api.fireworks.ai/v1/accounts/{account_id}/deployedModels",
        headers=headers,
        json=payload,
        timeout=60,  # don't hang forever on a stalled connection
    )

    # Fail loudly on errors instead of silently returning an error body,
    # matching upload_model's error handling.
    if not response.ok:
        raise RuntimeError(f"Deployment failed: {response.text}")
    return response.json()

Step 6: Build a Latency Monitor

For production inference, track your actual latency percentiles, not just averages.

import time
import statistics

def measure_latency(
    prompt: str,
    model: str,
    n_samples: int = 10
) -> dict:
    """Measure end-to-end request latency for *model* over *n_samples* calls.

    Uses time.perf_counter() -- a monotonic clock -- so measurements are
    immune to system clock adjustments, unlike time.time(). Reports
    percentiles as well as the average because tail latency is what users
    actually feel in production.

    Args:
        prompt: Prompt to send on every request.
        model: Fireworks model ID to benchmark.
        n_samples: Number of sequential requests to time.

    Returns:
        Dict with p50/p95/p99/min/max/avg latency in milliseconds.
    """
    latencies = []

    for _ in range(n_samples):
        start = time.perf_counter()
        client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=100  # Fixed token count for fair comparison
        )
        latencies.append((time.perf_counter() - start) * 1000)

    sorted_latencies = sorted(latencies)
    # Clamp percentile indices so tiny sample counts can never run off the end.
    p95_i = min(int(n_samples * 0.95), n_samples - 1)
    p99_i = min(int(n_samples * 0.99), n_samples - 1)

    return {
        "model": model,
        "samples": n_samples,
        "p50_ms": round(statistics.median(latencies), 1),
        "p95_ms": round(sorted_latencies[p95_i], 1),
        "p99_ms": round(sorted_latencies[p99_i], 1),
        "min_ms": round(sorted_latencies[0], 1),
        "max_ms": round(sorted_latencies[-1], 1),
        "avg_ms": round(statistics.mean(latencies), 1)
    }

# Compare models
# Compare models
# Makes live API calls: requires FIREWORKS_API_KEY and network access.
if __name__ == "__main__":
    test_prompt = "Summarize this in one sentence: AI inference optimization reduces latency by batching requests."

    for model_key in ["fast", "balanced"]:
        model = FIREWORKS_MODELS[model_key]
        print(f"\nMeasuring {model_key}...")
        # 5 samples keeps the demo quick; use more for stable percentiles.
        stats = measure_latency(test_prompt, model, n_samples=5)
        print(f"  p50: {stats['p50_ms']}ms | p95: {stats['p95_ms']}ms | p99: {stats['p99_ms']}ms")

What to Build Next

Related Reading

Want this system built for your business?

Get a free assessment. We will map every system your business needs and show you the ROI.

Get Your Free Assessment

Related Systems