How to Build a Document Q&A System with RAG
Build a system that answers questions accurately from your document library.
Jay Banlasan
The AI Systems Guy
A document Q&A system with RAG and AI gives your team instant answers from contracts, policies, and handbooks without reading 50-page documents. I build these for organizations drowning in documentation, where finding a specific clause or policy takes 20 minutes of searching. The system retrieves the exact passage and generates a clear answer.
This goes beyond system 409 by adding conversation memory, answer confidence scoring, and multi-document reasoning.
What You Need Before Starting
- An indexed vector store (see systems 409-410)
- Python 3.8+ with anthropic and chromadb
- Documents chunked and embedded
- A chat interface for asking questions
Step 1: Build the Q&A Engine with Confidence
import anthropic
import json
client = anthropic.Anthropic()
def answer_question(question, collection, top_k=5):
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
query_embedding = model.encode(question).tolist()
results = collection.query(query_embeddings=[query_embedding], n_results=top_k)
context = "\n\n---\n\n".join(results["documents"][0])
sources = results["metadatas"][0]
distances = results["distances"][0]
best_relevance = 1 - min(distances) if distances else 0
response = client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=500,
system="""Answer from the provided context ONLY. Rate your confidence:
- HIGH: answer is directly stated in context
- MEDIUM: answer can be inferred from context
- LOW: context is tangentially related
- NONE: context does not contain the answer
Format: Start with [CONFIDENCE: level] then the answer.""",
messages=[{"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"}]
)
answer = response.content[0].text
return {
"answer": answer,
"retrieval_relevance": round(best_relevance, 3),
"sources": [{"file": s["source"], "page": s.get("page", "N/A")} for s in sources]
}
Step 2: Add Conversation Memory
Let users ask follow-up questions:
from collections import defaultdict
conversations = defaultdict(list)
def ask_with_context(session_id, question, collection):
history = conversations[session_id]
history.append({"role": "user", "content": question})
# Use last question + conversation context for retrieval
retrieval_query = question
if len(history) > 2:
retrieval_query = f"{history[-3]['content'] if len(history) > 2 else ''} {question}"
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
query_embedding = model.encode(retrieval_query).tolist()
results = collection.query(query_embeddings=[query_embedding], n_results=5)
context = "\n\n".join(results["documents"][0])
response = client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=500,
system=f"Answer from context only. Be precise.\n\nContext:\n{context}",
messages=history[-6:] # Last 3 exchanges
)
reply = response.content[0].text
history.append({"role": "assistant", "content": reply})
return reply
Step 3: Handle Multi-Document Questions
Some questions need information from multiple documents:
def multi_doc_answer(question, collection, top_k=10):
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
query_embedding = model.encode(question).tolist()
results = collection.query(query_embeddings=[query_embedding], n_results=top_k)
# Group chunks by source document
doc_chunks = {}
for i in range(len(results["ids"][0])):
source = results["metadatas"][0][i]["source"]
if source not in doc_chunks:
doc_chunks[source] = []
doc_chunks[source].append(results["documents"][0][i])
context_by_doc = "\n\n".join([
f"FROM {source}:\n{' '.join(chunks[:2])}"
for source, chunks in doc_chunks.items()
])
response = client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=600,
system="Synthesize information from multiple documents to answer the question. Cite which document each piece of information comes from.",
messages=[{"role": "user", "content": f"Documents:\n{context_by_doc}\n\nQuestion: {question}"}]
)
return {"answer": response.content[0].text, "documents_used": list(doc_chunks.keys())}
Step 4: Log Questions for Improvement
import sqlite3
def log_qa(question, answer, sources, feedback=None):
conn = sqlite3.connect("qa_log.db")
conn.execute("""
INSERT INTO qa_log (question, answer, sources, feedback, asked_at)
VALUES (?, ?, ?, ?, datetime('now'))
""", (question, answer, json.dumps(sources), feedback))
conn.commit()
def get_unanswered_questions(days=30):
conn = sqlite3.connect("qa_log.db")
return conn.execute("""
SELECT question, COUNT(*) as freq FROM qa_log
WHERE answer LIKE '%could not find%' OR answer LIKE '%NONE%'
AND asked_at > datetime('now', ?)
GROUP BY question ORDER BY freq DESC
""", (f"-{days} days",)).fetchall()
Step 5: Serve the Q&A Interface
from flask import Flask, request, jsonify
app = Flask(__name__)
@app.route("/api/qa", methods=["POST"])
def qa_endpoint():
data = request.json
session_id = data.get("session_id", "default")
question = data["question"]
result = ask_with_context(session_id, question, collection)
log_qa(question, result, [])
return jsonify({"answer": result})
What to Build Next
Add answer verification. After the AI generates an answer, run a second check that compares the answer claims against the source chunks. Flag any claims not directly supported by the retrieved text.
Related Reading
- The Centralized Brain Concept - document Q&A as the interface to your business brain
- AI-Powered Reporting That Actually Gets Read - generating answers from data, not assumptions
- The Trust Framework for AI Decisions - building trust in AI-generated answers from documents
Want this system built for your business?
Get a free assessment. We will map every system your business needs and show you the ROI.
Get Your Free Assessment