How to Build a Custom AI Knowledge Base
Feed your business documents into an AI system for accurate, sourced answers.
Jay Banlasan
The AI Systems Guy
Building a custom AI knowledge base from business data is how you get an AI that answers questions about your specific products, processes, and policies instead of making things up. The technique is called RAG: Retrieval-Augmented Generation. The model does not memorize your documents. Instead, your system retrieves the most relevant document chunks at query time and injects them into the prompt. The model answers using the retrieved content, not its training data. I have built this for a law firm's policy library, a SaaS company's help documentation, and an internal HR knowledge base. All three replaced "ask a person" workflows that were bottlenecks.
The quality of a RAG system depends more on how you process documents than on which model you use. Chunking strategy and retrieval quality are where most implementations fail.
What You Need Before Starting
- Python 3.10+ with
anthropic, chromadb, and sentence-transformers (pip install anthropic chromadb sentence-transformers) - Your source documents (PDF, txt, markdown, or docx files)
- An Anthropic API key in your environment
Step 1: Install Dependencies and Set Up the Vector Store
pip install anthropic chromadb sentence-transformers pypdf2 python-docx
ChromaDB is a local vector store that persists to disk. No cloud account needed to get started.
import chromadb
from chromadb.utils import embedding_functions
# Use a local embedding model - free, runs on CPU, good quality
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
# One shared embedding function, passed to every ChromaDB collection so that
# indexing and querying embed text with the same model.
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_MODEL
)
def get_collection(persist_dir: str = "./kb_store", collection_name: str = "knowledge_base"):
    """Open (or create) the persistent ChromaDB collection for the knowledge base.

    The collection uses cosine distance for its HNSW index and the
    module-level sentence-transformer embedding function.
    """
    store = chromadb.PersistentClient(path=persist_dir)
    return store.get_or_create_collection(
        name=collection_name,
        embedding_function=embedding_fn,
        metadata={"hnsw:space": "cosine"},
    )
Step 2: Build the Document Processor
How you chunk documents determines retrieval quality. Chunk too large: you retrieve irrelevant content with relevant content. Chunk too small: you lose context.
import re
from pathlib import Path
def chunk_text(
    text: str,
    chunk_size: int = 400,
    overlap: int = 50
) -> list[str]:
    """Split text into overlapping chunks of at most ``chunk_size`` words.

    Args:
        text: Raw text; tokenized on whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Clamped
            to ``chunk_size - 1`` so the window always advances — the
            original step of ``chunk_size - overlap`` never made progress
            when ``overlap >= chunk_size``, looping forever.

    Returns:
        List of chunk strings; empty list for empty/whitespace-only text.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    # Clamp rather than raise so existing callers with odd settings still
    # get output instead of an infinite loop.
    overlap = min(overlap, chunk_size - 1)
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = min(start + chunk_size, len(words))
        chunks.append(" ".join(words[start:end]))
        if end == len(words):
            break
        start += chunk_size - overlap
    return chunks
def chunk_by_section(text: str, max_chunk_size: int = 500) -> list[dict]:
    """Chunk by markdown headers, preserving section context.

    ``re.split(r'\\n#{1,3} ', text)`` yields the pre-header content as its
    first element and each later element starting with its header title line
    (the ``\\n# `` marker itself is consumed). The original code treated the
    first line of *every* element as a title, which silently discarded the
    first line of header-less documents and of any intro text before the
    first header. Here the first element is kept whole as body unless the
    document literally starts with a ``#`` header.

    Args:
        text: Document text, ideally markdown.
        max_chunk_size: Word limit above which a section is sub-chunked.

    Returns:
        List of ``{"text": ..., "section": ...}`` dicts.
    """
    sections = re.split(r'\n#{1,3} ', text)
    chunks: list[dict] = []
    current_section_title = "Introduction"
    for i, section in enumerate(sections):
        stripped = section.strip()
        if not stripped:
            continue
        lines = stripped.split('\n')
        title_line = lines[0].strip()
        if i == 0:
            # Pre-header content: only carries a title if the document
            # itself begins with a header line.
            header = re.match(r'#{1,3} (.*)', title_line)
            if header:
                current_section_title = header.group(1).strip() or current_section_title
                body = '\n'.join(lines[1:]).strip()
            else:
                body = stripped
        else:
            # First line of later elements is the header title itself.
            if title_line:
                current_section_title = title_line
            body = '\n'.join(lines[1:]).strip()
        if len(body.split()) > max_chunk_size:
            # Section is too long: sub-chunk it, labeling each part.
            for part_num, sub in enumerate(chunk_text(body, max_chunk_size, 50), start=1):
                chunks.append({
                    "text": sub,
                    "section": f"{current_section_title} (part {part_num})"
                })
        elif body:
            chunks.append({
                "text": body,
                "section": current_section_title
            })
    return chunks
def load_document(filepath: str) -> str:
    """Read a document's text content based on its file extension.

    Supports ``.txt``/``.md`` (UTF-8 text), ``.pdf`` (via PyPDF2), and
    ``.docx`` (via python-docx). PDF/docx libraries are imported lazily so
    text-only users don't need them installed.

    Raises:
        ValueError: For unsupported file extensions.
    """
    path = Path(filepath)
    suffix = path.suffix.lower()
    if suffix in (".txt", ".md"):
        return path.read_text(encoding="utf-8")
    if suffix == ".pdf":
        import PyPDF2
        with open(filepath, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            # extract_text() is expensive — call it once per page, not twice
            # (the original called it again inside the filter condition).
            page_texts = [page.extract_text() for page in reader.pages]
            return "\n\n".join(t for t in page_texts if t)
    if suffix == ".docx":
        import docx
        doc = docx.Document(filepath)
        return "\n\n".join(para.text for para in doc.paragraphs if para.text.strip())
    raise ValueError(f"Unsupported file type: {suffix}")
Step 3: Index Documents Into the Vector Store
import uuid
from datetime import datetime, timezone
def index_document(
    filepath: str,
    collection,
    source_label: str | None = None
) -> dict:
    """Load, chunk, and index one document into the vector store.

    Args:
        filepath: Path to the source document.
        collection: ChromaDB collection to add chunks to.
        source_label: Human-readable source name; defaults to the file name.

    Returns:
        ``{"source": ..., "chunks_indexed": ...}`` summary dict.
    """
    print(f"Loading: {filepath}")
    raw_text = load_document(filepath)
    source = source_label or Path(filepath).name
    # Chunk the document, falling back to fixed-size chunks when no
    # section structure was found.
    chunks = chunk_by_section(raw_text)
    if not chunks:
        chunks = [{"text": t, "section": "main"} for t in chunk_text(raw_text)]
    print(f" {len(chunks)} chunks")
    # Prepare for ChromaDB.
    ids = [str(uuid.uuid4()) for _ in chunks]
    documents = [c["text"] for c in chunks]
    # One timezone-aware timestamp for the whole batch (datetime.utcnow()
    # is deprecated since Python 3.12, and the call was loop-invariant).
    indexed_at = datetime.now(timezone.utc).isoformat()
    metadatas = [
        {
            "source": source,
            "section": c.get("section", ""),
            "indexed_at": indexed_at
        }
        for c in chunks
    ]
    # Add to collection in batches (ChromaDB has a batch limit).
    batch_size = 100
    for i in range(0, len(documents), batch_size):
        collection.add(
            ids=ids[i:i + batch_size],
            documents=documents[i:i + batch_size],
            metadatas=metadatas[i:i + batch_size]
        )
    return {"source": source, "chunks_indexed": len(chunks)}
def index_directory(directory: str, collection, extensions: list = None):
    """Walk a directory tree and index every file with a supported extension.

    Files that fail to index are reported and skipped rather than aborting
    the whole run. Returns a list of per-document result dicts.
    """
    allowed = set(extensions or [".txt", ".md", ".pdf", ".docx"])
    results = []
    for candidate in Path(directory).rglob("*"):
        if candidate.suffix.lower() not in allowed:
            continue
        try:
            results.append(index_document(str(candidate), collection))
        except Exception as exc:
            print(f" Error indexing {candidate}: {exc}")
    total = sum(entry["chunks_indexed"] for entry in results)
    print(f"\nIndexed {len(results)} documents, {total} total chunks")
    return results
Step 4: Build the Retrieval Function
def retrieve(
    query: str,
    collection,
    n_results: int = 5,
    min_relevance: float = 0.0
) -> list[dict]:
    """Query the vector store and return scored chunks, best first.

    Cosine distance from ChromaDB is converted to a similarity score
    (1 - distance); hits below ``min_relevance`` are dropped.
    """
    raw = collection.query(
        query_texts=[query],
        n_results=n_results,
        include=["documents", "metadatas", "distances"]
    )
    hits = []
    texts, metas, dists = raw["documents"][0], raw["metadatas"][0], raw["distances"][0]
    for text, meta, distance in zip(texts, metas, dists):
        similarity = 1 - distance  # cosine distance -> similarity
        if similarity < min_relevance:
            continue
        hits.append({
            "text": text,
            "source": meta.get("source", "unknown"),
            "section": meta.get("section", ""),
            "relevance": round(similarity, 3)
        })
    hits.sort(key=lambda h: h["relevance"], reverse=True)
    return hits
Step 5: Build the RAG Query Function
import anthropic
# Module-level client; reads the ANTHROPIC_API_KEY environment variable.
ai_client = anthropic.Anthropic()
# Constrains the model to answer only from retrieved context — this is what
# keeps RAG answers grounded in the knowledge base instead of hallucinated.
SYSTEM_PROMPT = """You are a knowledge base assistant. Answer questions based ONLY on the provided context.
Rules:
- If the answer is in the context, give it directly with the source
- If the answer is NOT in the context, say "I don't have information about that in the knowledge base"
- Never invent information
- Quote the source document when relevant
- Be concise"""
def rag_query(
    question: str,
    collection,
    n_context_chunks: int = 4,
    model: str = "claude-haiku-4-5"
) -> dict:
    """Answer a question using retrieved knowledge-base context.

    Retrieves the top chunks, formats them with source citations into the
    prompt, and asks the model. Returns a dict with the answer, the source
    documents used, and retrieval stats.
    """
    chunks = retrieve(question, collection, n_results=n_context_chunks)
    if not chunks:
        return {
            "answer": "I don't have any relevant information in the knowledge base for that question.",
            "sources": [],
            "context_used": 0
        }

    def _label(chunk: dict) -> str:
        # Cite source file, plus section when the chunk has one.
        if chunk['section']:
            return f"[{chunk['source']} - {chunk['section']}]"
        return f"[{chunk['source']}]"

    context = "\n\n---\n\n".join(f"{_label(c)}\n{c['text']}" for c in chunks)
    # Deduplicate sources while preserving retrieval order.
    sources_used = list(dict.fromkeys(c['source'] for c in chunks))
    user_message = f"""Context from knowledge base:
{context}
Question: {question}"""
    response = ai_client.messages.create(
        model=model,
        max_tokens=600,
        system=SYSTEM_PROMPT,
        messages=[{"role": "user", "content": user_message}]
    )
    return {
        "answer": response.content[0].text,
        "sources": sources_used,
        "context_used": len(chunks),
        "top_relevance": chunks[0]["relevance"] if chunks else 0
    }
Step 6: Wire It Into a Complete System
def build_knowledge_base(docs_directory: str) -> object:
    """Open the persistent collection, indexing ``docs_directory`` on first run.

    If the store already holds chunks, it is reused as-is; nothing is
    re-indexed.
    """
    collection = get_collection()
    existing_count = collection.count()
    if existing_count:
        print(f"Knowledge base loaded: {existing_count} chunks indexed")
    else:
        print("Building knowledge base from scratch...")
        index_directory(docs_directory, collection)
    return collection
def interactive_kb_session(docs_directory: str):
    """Run a simple REPL over the knowledge base until the user quits."""
    print("Building/loading knowledge base...")
    collection = build_knowledge_base(docs_directory)
    print(f"Ready. {collection.count()} chunks indexed.\n")
    quit_words = {"quit", "exit", "q"}
    while True:
        question = input("Ask a question (or 'quit'): ").strip()
        if question.lower() in quit_words:
            return
        if question:
            result = rag_query(question, collection)
            print(f"\nAnswer: {result['answer']}")
            print(f"Sources: {', '.join(result['sources'])}")
            print(f"Relevance: {result['top_relevance']:.2f}")
            print()
def add_document_to_kb(filepath: str, docs_directory: str):
    """Index a single document into the persistent knowledge base.

    NOTE(review): ``docs_directory`` is accepted but never used in this
    function — presumably a copy-paste leftover; confirm no caller relies
    on it before removing.
    """
    collection = get_collection()
    result = index_document(filepath, collection)
    print(f"Added: {result['source']} ({result['chunks_indexed']} chunks)")
if __name__ == "__main__":
    import sys
    # Default to ./documents when no directory is given on the command line.
    docs_dir = sys.argv[1] if len(sys.argv) > 1 else "./documents"
    interactive_kb_session(docs_dir)
Run it: python knowledge_base.py ./your-docs-folder
What to Build Next
- Add a reranking step using a cross-encoder model to improve retrieval precision for ambiguous queries
- Build a web interface with a chat history so users can ask follow-up questions that reference previous answers
- Add document update detection so re-indexing only processes changed files, not the entire library
Related Reading
- Building an AI-Powered Knowledge Base for Your Team - ai knowledge base team
- Building a Knowledge Base for Your AI - knowledge base ai operations
- Prompt: Build a Knowledge Base Article - prompt knowledge base article
Want this system built for your business?
Get a free assessment. We will map every system your business needs and show you the ROI.
Get Your Free Assessment