How to Set Up Together AI for Open-Source Models
Access Llama, Mixtral, and other open-source models through Together AI API.
Jay Banlasan
The AI Systems Guy
Together AI gives you cloud access to the full ecosystem of open-source models through a single API. This Together AI open-source model setup is what I use when a client needs open-source model access but does not want to manage local hardware or Ollama deployments. You get Llama, Mixtral, Qwen, DeepSeek, and dozens of others through one OpenAI-compatible endpoint, priced competitively for high volume.
The business case is straightforward: you get the pricing benefits of open-source without the infrastructure burden of running Ollama on a VPS. And Together AI's inference is fast enough for most production workflows, especially on the smaller 7B to 70B parameter models.
What You Need Before Starting
- A Together AI account at together.ai
- Python 3.9+ with the `together` or `openai` package
- Your Together AI API key
- A use case that benefits from open-source models (high volume, cost-sensitive, or specific model requirements)
Step 1: Get Your API Key
Sign up at together.ai. Go to Settings > API Keys. Click "Create API Key." Name it. Copy it.
Add to .env:
TOGETHER_API_KEY=your-together-key-here
Install the SDK:
pip install together python-dotenv
Or use the OpenAI SDK with a base URL swap (covered in step 4).
Step 2: List Available Models and Make Your First Call
import os
from typing import Optional

from dotenv import load_dotenv
from together import Together
# Pull TOGETHER_API_KEY (and any other secrets) from the local .env file.
load_dotenv()
# Single shared client for the whole script; the key is passed explicitly
# rather than relying on the SDK's own environment lookup.
client = Together(api_key=os.getenv("TOGETHER_API_KEY"))
def ask_together(
    prompt: str,
    model: str = "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
    system: Optional[str] = None,
    temperature: float = 0.3,
    max_tokens: int = 1000,
) -> str:
    """
    Send a single chat prompt to Together AI and return the reply text.

    Args:
        prompt: User message.
        model: Together AI model ID.
        system: Optional system prompt, prepended as a "system" message.
        temperature: 0.0 deterministic, 1.0 creative.
        max_tokens: Cap on the length of the generated reply
            (previously hard-coded at 1000).

    Returns:
        The assistant's response text.
    """
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content
# Smoke test: one round-trip through the helper above.
answer = ask_together("What are the top 3 use cases for Llama models in business?")
print(answer)
Step 3: Know the Key Models and When to Use Each
# Curated selection guide: Together AI model ID -> metadata used for routing
# and cost estimates. NOTE(review): prices are a point-in-time snapshot —
# confirm against together.ai/pricing before relying on them.
TOGETHER_MODELS = {
    # Llama family - general purpose
    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo": {
        "use_case": "Fast, cheap, general tasks",
        "params": "8B",
        "price_per_million": "$0.18"
    },
    "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo": {
        "use_case": "High quality general tasks",
        "params": "70B",
        "price_per_million": "$0.88"
    },
    "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo": {
        "use_case": "Near-frontier quality at open-source cost",
        "params": "405B",
        "price_per_million": "$3.50"
    },
    # Mixtral - strong on multilingual and reasoning
    "mistralai/Mixtral-8x7B-Instruct-v0.1": {
        "use_case": "European languages, efficient reasoning",
        "params": "47B MoE",
        "price_per_million": "$0.60"
    },
    # Qwen - strong on code and Chinese
    "Qwen/Qwen2.5-72B-Instruct-Turbo": {
        "use_case": "Code tasks, multilingual",
        "params": "72B",
        "price_per_million": "$1.20"
    },
    # DeepSeek - exceptional on code and math
    "deepseek-ai/DeepSeek-R1": {
        "use_case": "Reasoning, math, complex analysis",
        "params": "671B MoE",
        "price_per_million": "$3.00"
    }
}
def print_model_guide():
    """Print a quick-reference card for every entry in TOGETHER_MODELS."""
    print("Together AI Model Guide\n")
    for full_id, meta in TOGETHER_MODELS.items():
        # Drop the org prefix ("meta-llama/...") for a readable short name.
        short = full_id.rsplit("/", 1)[-1]
        print(f"{short}")
        print(f" Use case: {meta['use_case']}")
        print(f" Size: {meta['params']} | Cost: {meta['price_per_million']}/1M tokens")
        print()
Step 4: Use the OpenAI SDK with Together AI
If you have existing OpenAI code, swap in Together AI with a base URL change:
# Together AI exposes an OpenAI-compatible endpoint, so existing OpenAI SDK
# code only needs a base_url (and API key) swap to run against it.
from openai import OpenAI
together_via_openai = OpenAI(
    api_key=os.getenv("TOGETHER_API_KEY"),  # Together key, not an OpenAI key
    base_url="https://api.together.xyz/v1"
)
response = together_via_openai.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
    messages=[
        {"role": "system", "content": "You are a business analyst. Be concise."},
        {"role": "user", "content": "List 5 industries most likely to adopt AI automation in 2025."}
    ],
    temperature=0.3,
    max_tokens=400
)
print(response.choices[0].message.content)
Step 5: Build a Multi-Model Router That Uses Together AI for Cost Savings
import os
from openai import OpenAI as OAI
import anthropic
# Initialize all clients once at module load, one per provider.
openai_client = OAI(api_key=os.getenv("OPENAI_API_KEY"))
claude_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
# Together AI reuses the OpenAI SDK with a swapped base URL (OpenAI-compatible API).
together_client = OAI(
    api_key=os.getenv("TOGETHER_API_KEY"),
    base_url="https://api.together.xyz/v1"
)
def smart_route(
    prompt: str,
    task_type: str,
    system: Optional[str] = None,
    max_tokens: int = 500,
) -> dict:
    """
    Route a prompt to the most cost-effective model for its task type.

    Task routing (as implemented below):
    - classify, extract, summarize_short, format    -> Together AI Llama 8B ($0.18/M)
    - translate, summarize_long, simple_qa, rewrite -> Together AI Llama 70B ($0.88/M)
    - anything else (analyze, strategy, multi_step,
      creative, unknown types)                      -> Claude 3 Haiku ($0.25/M input)

    Args:
        prompt: User message.
        task_type: Routing keyword; unrecognized values fall through to Claude.
        system: Optional system prompt.
        max_tokens: Cap on the generated reply length.

    Returns:
        dict with "provider", "text", and "approx_cost_per_1k_tokens".
    """
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})

    CHEAP_TASKS = {"classify", "extract", "summarize_short", "format"}
    MEDIUM_TASKS = {"translate", "summarize_long", "simple_qa", "rewrite"}

    def _ask_together(model: str, temperature: float, provider: str, cost: str) -> dict:
        # Both Together tiers share the same call shape; only model/temperature
        # and the reporting labels differ.
        response = together_client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens
        )
        return {
            "provider": provider,
            "text": response.choices[0].message.content,
            "approx_cost_per_1k_tokens": cost
        }

    if task_type in CHEAP_TASKS:
        return _ask_together(
            "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
            0.1, "together-llama8b", "$0.00018"
        )
    if task_type in MEDIUM_TASKS:
        return _ask_together(
            "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
            0.3, "together-llama70b", "$0.00088"
        )

    # Fall back to Claude for complex tasks. Anthropic takes the system prompt
    # as a top-level argument, not as a message in the list.
    kwargs = {
        # BUG FIX: the original "claude-haiku-20240307" is not a valid
        # Anthropic model ID; the published ID is "claude-3-haiku-20240307".
        "model": "claude-3-haiku-20240307",
        "max_tokens": max_tokens,
        "messages": [{"role": "user", "content": prompt}]
    }
    if system:
        kwargs["system"] = system
    response = claude_client.messages.create(**kwargs)
    return {
        "provider": "claude-haiku",
        "text": response.content[0].text,
        "approx_cost_per_1k_tokens": "$0.00025"
    }
# Exercise the router with one task per cost tier.
tasks = [
    ("classify", "Invoice from Acme Corp $1,500", "Classify as INVOICE, CONTRACT, EMAIL. One word."),
    ("summarize_short", "Long article text...", "Summarize in one sentence."),
    ("analyze", "Our Q3 revenue dropped 15% while competitor grew 20%. What should we investigate?", None)
]
for task_type, prompt, system in tasks:
    routed = smart_route(prompt, task_type, system)
    provider = routed["provider"]
    cost = routed["approx_cost_per_1k_tokens"]
    print(f"[{task_type}] via {provider} ({cost}/1k):")
    print(f" {routed['text'][:100]}\n")
Step 6: Run Fine-Tuned Models on Together AI
Together AI supports custom fine-tuned model deployment:
def list_my_models(owner: str = "your-org") -> list:
    """
    List fine-tuned models on Together AI owned by *owner*.

    Args:
        owner: Value to match against each model's "owned_by" field
            (previously hard-coded to the "your-org" placeholder).

    Returns:
        List of model-metadata dicts owned by *owner*.

    Raises:
        requests.HTTPError: if the API responds with a non-2xx status.
    """
    import requests
    headers = {"Authorization": f"Bearer {os.getenv('TOGETHER_API_KEY')}"}
    response = requests.get(
        "https://api.together.xyz/v1/models",
        headers=headers,
        timeout=30,  # never hang forever on a network call
    )
    # Fail loudly on auth/quota errors instead of iterating an error payload.
    response.raise_for_status()
    data = response.json()
    # NOTE(review): Together returns a bare JSON array here, but tolerate an
    # OpenAI-style {"data": [...]} envelope too — confirm against the API docs.
    models = data.get("data", []) if isinstance(data, dict) else data
    return [m for m in models if m.get("owned_by") == owner]
def call_finetuned_model(model_id: str, prompt: str) -> str:
    """Send *prompt* to a fine-tuned Together AI model and return its reply text."""
    chat_messages = [{"role": "user", "content": prompt}]
    completion = together_client.chat.completions.create(
        model=model_id,
        messages=chat_messages,
        max_tokens=500,
    )
    return completion.choices[0].message.content
What to Build Next
- Run a cost comparison across your current AI spend and see where Together AI saves money on volume tasks
- Deploy a fine-tuned Llama model on Together AI for a domain-specific use case (legal, medical, industry-specific)
- Build a fallback chain: try Together AI first, fall back to OpenAI if the response quality is insufficient
Related Reading
- How to Set Up Groq for Ultra-Fast AI Inference - Groq also runs open-source models, but faster and with stricter rate limits
- How to Install and Run Local LLMs with Ollama - The self-hosted alternative to Together AI for open-source models
- How to Handle AI API Rate Limits Gracefully - Rate limit handling matters for all providers including Together AI
Want this system built for your business?
Get a free assessment. We will map every system your business needs and show you the ROI.
Get Your Free Assessment