How to Set Up AI Model Versioning
Manage model version transitions without breaking production systems.
Jay Banlasan
The AI Systems Guy
Anthropic deprecated claude-2.1 with 30 days notice. I had 14 workflows using it, spread across three different codebases. Finding them all and updating them took a full day, and two workflows broke in production during the transition because the output format changed between versions. Proper AI model version management in production means one config file controls every model reference in your stack, and version transitions happen in a controlled test-then-swap process, not a fire drill.
Model versioning is infrastructure hygiene. You probably don't hardcode database connection strings in application code. Model names should get the same treatment.
What You Need Before Starting
- Python 3.10+
- A central config file or environment variables for model references
- At least one AI workflow that's in production and will eventually need a version update
- A test suite or evaluation set for your key workflows (even 10 sample inputs is enough)
Step 1: Build a Central Model Registry
One file defines all model aliases used across your stack. Nothing hardcodes a model string directly.
# model_registry.py
from dataclasses import dataclass, field
from typing import Optional
import json, os
from pathlib import Path
@dataclass
class ModelVersion:
alias: str # What your code uses: "fast", "balanced", "powerful"
provider: str # "anthropic", "openai", "google"
model_id: str # The actual model string the API expects
max_tokens: int # Default max output for this model
context_window: int # Total context limit
deprecated: bool = False
deprecated_on: Optional[str] = None
notes: str = ""
# Single source of truth: every AI call in the stack resolves its model
# through this table. Keyed by alias, so the key always equals spec.alias.
REGISTRY: dict[str, ModelVersion] = {
    spec.alias: spec
    for spec in (
        ModelVersion(
            alias="fast",
            provider="anthropic",
            model_id="claude-haiku-3",
            max_tokens=4096,
            context_window=200_000,
            notes="Default for classification, extraction, simple generation",
        ),
        ModelVersion(
            alias="balanced",
            provider="anthropic",
            model_id="claude-sonnet-4-5",
            max_tokens=8192,
            context_window=200_000,
            notes="Default for complex reasoning, structured outputs",
        ),
        ModelVersion(
            alias="powerful",
            provider="anthropic",
            model_id="claude-opus-4-5",
            max_tokens=8192,
            context_window=200_000,
            notes="Reserved for highest complexity tasks",
        ),
        ModelVersion(
            alias="code",
            provider="openai",
            model_id="gpt-4o",
            max_tokens=4096,
            context_window=128_000,
            notes="Code review and generation",
        ),
    )
}
def get_model(alias: str) -> ModelVersion:
    """Look up a model by alias, warning when it is deprecated.

    Args:
        alias: registry key such as "fast" or "balanced".

    Returns:
        The ModelVersion for the alias.

    Raises:
        KeyError: if the alias is not in the registry (message lists the
            known aliases so a typo is easy to spot).
    """
    if alias not in REGISTRY:
        raise KeyError(f"Unknown model alias: '{alias}'. "
                       f"Known aliases: {list(REGISTRY.keys())}")
    model = REGISTRY[alias]
    if model.deprecated:
        import warnings  # deferred: only needed on the deprecated path
        # stacklevel=2 makes the warning point at the caller's line, not
        # at this helper — otherwise every warning looks identical.
        warnings.warn(f"Model alias '{alias}' is deprecated since {model.deprecated_on}. "
                      f"Update your code to use a current alias.",
                      stacklevel=2)
    return model
def get_model_id(alias: str) -> str:
    """Resolve an alias straight to the provider's model string."""
    model = get_model(alias)
    return model.model_id
Every AI call in your stack uses get_model_id("fast") instead of "claude-haiku-3". When the model ID needs to change, you update one line in the registry.
Step 2: Build an Environment Override Layer
Production, staging, and development can point at different model versions without code changes.
import os
# Placeholder documenting the override naming convention only — the actual
# overrides live in the process environment (read via os.getenv below),
# not in this dict.
ENV_OVERRIDES = {
    # Format: MODEL_OVERRIDE_{ALIAS_UPPER}=model-id
    # e.g. MODEL_OVERRIDE_FAST=claude-haiku-3-5
}
def get_model_id(alias: str) -> str:
    """Resolve an alias to a model ID, honoring environment overrides.

    An env var named MODEL_OVERRIDE_<ALIAS> (uppercased, dashes mapped to
    underscores) takes precedence over the registry default, which lets
    staging point at a candidate model with zero code changes.
    """
    env_key = f"MODEL_OVERRIDE_{alias.upper().replace('-', '_')}"
    # Walrus guard: a set (non-empty) override short-circuits the registry.
    if override := os.getenv(env_key):
        return override
    return get_model(alias).model_id
# Usage in .env file for staged rollout:
# MODEL_OVERRIDE_FAST=claude-haiku-3-5 <-- test new version in staging
# (production .env has no override — uses registry default)
This lets you test a new model version in staging by setting an environment variable, with zero code changes.
Step 3: Build a Transition Plan for Version Updates
When a model version needs to change (provider deprecation, cost change, quality improvement), follow a structured process.
# version_transition.py
@dataclass
class TransitionPlan:
    """Everything needed to move one alias from its current model to a candidate."""

    alias: str
    current_model_id: str
    candidate_model_id: str
    test_inputs: list[str]
    # Scoring function: (old_output, new_output) -> float in [0, 1]
    # measuring similarity/quality of the candidate's output.
    evaluator: callable
    # Fraction of traffic on the candidate: 0.0 = testing only, 1.0 = fully rolled out.
    rollout_pct: float = 0.0
    # Lifecycle: planned | testing | partial | complete | rolled_back
    status: str = "planned"
# Module-level client is created once and reused by run_transition_test
# (anthropic.Anthropic() picks up credentials from the environment).
import anthropic
import random  # NOTE(review): appears unused here — canary routing below uses hashlib
_client = anthropic.Anthropic()
def run_transition_test(plan: TransitionPlan, pass_threshold: float = 0.85) -> dict:
    """Send the plan's test inputs to both models and score the candidate.

    Each test input is sent to the current model and the candidate model;
    every (current, candidate) output pair is scored with plan.evaluator.

    Args:
        plan: the transition under evaluation.
        pass_threshold: minimum per-input evaluator score (0-1) the
            candidate must reach on every input. Defaults to 0.85, the
            previously hard-coded bar.

    Returns:
        Summary dict: average/min scores, a pass/fail flag, and the raw
        (current, candidate) output pairs for manual review.

    Raises:
        ValueError: if the plan has no test inputs — previously this hit a
            ZeroDivisionError on the average and `all()` on an empty list
            would have reported a vacuous "passed".
    """
    if not plan.test_inputs:
        raise ValueError("TransitionPlan has no test_inputs; nothing to evaluate.")

    current_results: list[str] = []
    candidate_results: list[str] = []
    for inp in plan.test_inputs:
        # Query both models with the identical prompt so outputs are comparable.
        for model_id, results_list in [
            (plan.current_model_id, current_results),
            (plan.candidate_model_id, candidate_results),
        ]:
            response = _client.messages.create(
                model=model_id, max_tokens=512,
                messages=[{"role": "user", "content": inp}]
            )
            results_list.append(response.content[0].text)

    scores = [
        plan.evaluator(curr, cand)
        for curr, cand in zip(current_results, candidate_results)
    ]
    return {
        "alias": plan.alias,
        "current": plan.current_model_id,
        "candidate": plan.candidate_model_id,
        "test_count": len(plan.test_inputs),
        "avg_quality_score": round(sum(scores) / len(scores), 3),
        "min_score": round(min(scores), 3),
        # Every single input must clear the bar — one regression fails the run.
        "passed": all(s >= pass_threshold for s in scores),
        "results": list(zip(current_results, candidate_results))
    }
Step 4: Implement Canary Rollout
Don't switch 100% of traffic at once. Route a small percentage to the candidate model first.
import hashlib
def select_model_for_request(alias: str, request_id: str,
                             rollout_pct: float = 0.0) -> str:
    """Pick the model that should serve one request during a canary rollout.

    Routing is deterministic: the request_id is hashed, so the same id
    always lands on the same model (stable user experience).

    rollout_pct: fraction (0.0-1.0) of traffic routed to the candidate.
    """
    # No canary traffic requested — serve the registry/override default.
    if rollout_pct <= 0.0:
        return get_model_id(alias)

    candidate_id = get_active_candidate(alias)  # from your transition plan
    if not candidate_id:
        # Nothing is in "partial" rollout for this alias.
        return get_model_id(alias)

    # Map the request id into [0, 1) with 1/10000 resolution; ids whose
    # bucket falls below rollout_pct go to the candidate.
    digest = hashlib.md5(request_id.encode()).hexdigest()
    bucket = (int(digest, 16) % 10000) / 10000.0
    return candidate_id if bucket < rollout_pct else get_model_id(alias)
# Canary plan stored in Redis or a simple JSON file
ACTIVE_TRANSITIONS: dict[str, dict] = {}
def get_active_candidate(alias: str) -> str | None:
transition = ACTIVE_TRANSITIONS.get(alias)
if transition and transition["status"] in ("partial",):
return transition["candidate_model_id"]
return None
def start_canary(alias: str, candidate_id: str, rollout_pct: float = 0.05):
    """Begin routing a slice of an alias's traffic to a candidate model."""
    plan = {
        "candidate_model_id": candidate_id,
        "rollout_pct": rollout_pct,
        # "partial" is the status get_active_candidate() routes traffic on.
        "status": "partial",
    }
    ACTIVE_TRANSITIONS[alias] = plan
    print(f"Canary started: {alias} -> {candidate_id} at {rollout_pct*100}% traffic")
def promote_to_full(alias: str):
    """Once canary looks good, update registry and retire transition."""
    candidate = ACTIVE_TRANSITIONS.get(alias, {}).get("candidate_model_id")
    if not candidate:
        return  # no active canary for this alias — nothing to promote
    # Flip the registry default, then retire the transition record.
    REGISTRY[alias].model_id = candidate
    ACTIVE_TRANSITIONS.pop(alias, None)
    print(f"Promoted {alias} to {candidate}")
Step 5: Track Version-Level Performance
Log which model version handled each request so you can compare performance before and after a transition.
import sqlite3
from datetime import datetime, timezone
def log_model_call(alias: str, model_id: str, latency_ms: int,
input_tokens: int, output_tokens: int,
cost_usd: float, quality_score: float = None):
conn = sqlite3.connect("model_versions.db")
conn.execute("""
CREATE TABLE IF NOT EXISTS model_version_calls (
ts TEXT, alias TEXT, model_id TEXT, latency_ms INTEGER,
input_tokens INTEGER, output_tokens INTEGER, cost_usd REAL,
quality_score REAL
)
""")
conn.execute("INSERT INTO model_version_calls VALUES (?,?,?,?,?,?,?,?)",
(datetime.utcnow().isoformat(), alias, model_id,
latency_ms, input_tokens, output_tokens, cost_usd, quality_score))
conn.commit()
conn.close()
def compare_versions(alias: str, days: int = 7) -> list:
    """Summarize per-model call stats for an alias over the last N days.

    Returns one dict per model_id that served the alias, with the call
    count and average latency / cost / quality. Averages fall back to 0
    when no values were logged (AVG over all-NULL is NULL).
    """
    conn = sqlite3.connect("model_versions.db")
    try:
        rows = conn.execute("""
            SELECT model_id,
                   COUNT(*) as calls,
                   AVG(latency_ms) as avg_latency,
                   AVG(cost_usd) as avg_cost,
                   AVG(quality_score) as avg_quality
            FROM model_version_calls
            WHERE alias = ? AND ts >= datetime('now', ?)
            GROUP BY model_id
        """, (alias, f'-{days} days')).fetchall()
    finally:
        # Close even if the query fails — the original leaked the connection.
        conn.close()
    return [
        {
            "model": model_id,
            "calls": calls,
            "avg_latency_ms": round(avg_latency or 0),
            "avg_cost": round(avg_cost or 0, 6),
            "avg_quality": round(avg_quality or 0, 3),
        }
        for model_id, calls, avg_latency, avg_cost, avg_quality in rows
    ]
Step 6: Add Deprecation Warnings Ahead of Cutoff
When you know a model is being deprecated, mark it in the registry early so developers get warnings during development.
def mark_deprecated(alias: str, deprecated_on: str, replacement_alias: str | None = None):
    """Flag a registry alias as deprecated so get_model() starts warning.

    Args:
        alias: registry alias to flag.
        deprecated_on: ISO date the provider retires the model.
        replacement_alias: if given, appended to the entry's notes so
            developers know what to migrate to.

    Raises:
        KeyError: if the alias is not in the registry.
    """
    if alias not in REGISTRY:
        raise KeyError(f"Unknown alias: {alias}")
    model = REGISTRY[alias]
    model.deprecated = True
    model.deprecated_on = deprecated_on
    if replacement_alias:
        model.notes += f" DEPRECATED: use '{replacement_alias}' instead."
    print(f"Marked {alias} as deprecated since {deprecated_on}")

# Run this when you get deprecation notice from a provider
# mark_deprecated("fast", "2025-01-01", "fast-v2")
What to Build Next
- Add a CI check that runs compare_versions() automatically in your pipeline and fails if candidate quality drops below 90% of current
- Build a model changelog doc that auto-updates when transitions are promoted, so your team always knows what version is live
- Set up provider deprecation email alerts that auto-create a transition plan in your registry when a model is announced for retirement
Related Reading
- How to Build a Multi-Model AI Router - versioning and routing work together; the router reads the registry
- How to Build Automatic Model Failover Systems - model version transitions and failover chains need to stay in sync
- How to Implement Semantic Caching for AI Queries - cached responses from an old model version need invalidation when you transition
Want this system built for your business?
Get a free assessment. We will map every system your business needs and show you the ROI.
Get Your Free Assessment