How to Build AI-Powered Search for Your Data
Replace keyword search with AI-powered semantic search across your documents.
Jay Banlasan
The AI Systems Guy
Keyword search fails the moment a user phrases their query differently from the exact words in your documents. Semantic search finds results based on meaning, not exact words. A user searching for "how to lower my ad costs" will find your article titled "reducing cost per acquisition" even though neither phrase appears in the other. That is the difference between a useful search and a frustrating one.
I use semantic search for internal knowledge bases, client document libraries, and customer-facing help centers. The setup is 3-4 hours. The impact is immediate. Users stop asking support questions that the docs already answer because they can finally find those docs.
What You Need Before Starting
- Python 3.9+
- OpenAI API key (for embeddings)
- A collection of documents to search (at least 10-20 to make it worthwhile)
- chromadb for vector storage (simplest local option) or Pinecone for production scale
Step 1: Install Dependencies
pip install openai chromadb
ChromaDB runs locally with zero infrastructure setup. For production with millions of documents, switch to Pinecone or Weaviate. The code pattern is nearly identical.
Step 2: Prepare and Chunk Your Documents
Long documents need to be split into chunks before embedding. The chunk size is a tuning parameter: smaller chunks are more precise, larger chunks give more context.
from pathlib import Path
from typing import Generator
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[dict]:
    """Split text into overlapping word-based chunks for better retrieval.

    Args:
        text: Raw document text; split on whitespace.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks, so a
            sentence straddling a boundary still appears whole in one chunk.

    Returns:
        A list of dicts with keys "text", "chunk_num", "word_start",
        "word_end". Empty input yields an empty list.

    Raises:
        ValueError: If overlap >= chunk_size (the original while-loop would
            never advance and spin forever).
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    step = chunk_size - overlap  # guaranteed positive by the check above
    chunks = []
    # NOTE: the original bound a local named `chunk_text`, shadowing this
    # function's own name; renamed to avoid confusion.
    for chunk_num, start in enumerate(range(0, len(words), step)):
        end = min(start + chunk_size, len(words))
        chunks.append({
            "text": " ".join(words[start:end]),
            "chunk_num": chunk_num,
            "word_start": start,
            "word_end": end,
        })
    return chunks
def load_documents(folder_path: str) -> list[dict]:
    """Load and chunk every .txt and .md file under *folder_path* (recursive).

    Returns one dict per chunk with keys: "id" ("<stem>_<chunk_num>"),
    "text", "source_file", "filename", "chunk_num".
    """
    documents = []
    folder = Path(folder_path)
    # BUG FIX: pathlib.Path.glob does NOT support brace expansion, so the
    # original pattern "**/*.{txt,md}" matched nothing. Glob each suffix
    # separately; sorted() makes the indexing order deterministic.
    for pattern in ("**/*.txt", "**/*.md"):
        for file_path in sorted(folder.glob(pattern)):
            text = file_path.read_text(encoding="utf-8")
            for chunk in chunk_text(text):
                documents.append({
                    "id": f"{file_path.stem}_{chunk['chunk_num']}",
                    "text": chunk["text"],
                    "source_file": str(file_path),
                    "filename": file_path.name,
                    "chunk_num": chunk["chunk_num"],
                })
    return documents
Step 3: Generate Embeddings and Store in ChromaDB
Embeddings are numerical representations of meaning. Documents with similar meaning have similar embeddings, which is how semantic matching works.
import os

import chromadb
import openai

# NOTE(review): never hard-code a real API key in source. Read it from the
# environment; the "YOUR_API_KEY" fallback keeps the tutorial copy-pasteable.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY"))

# Persistent client writes the index to ./search_db so it survives restarts.
chroma_client = chromadb.PersistentClient(path="./search_db")
def get_collection(name: str = "documents"):
    """Fetch the named Chroma collection, creating it on first use.

    The collection is configured for cosine distance, which matches the
    `1 - distance` relevance conversion used at query time.
    """
    metadata = {"hnsw:space": "cosine"}
    collection = chroma_client.get_or_create_collection(name=name, metadata=metadata)
    return collection
def embed_text(texts: list[str]) -> list[list[float]]:
    """Return one embedding vector per input string, in input order.

    Uses OpenAI's text-embedding-3-small model via the module-level client.
    """
    if not texts:
        # The embeddings endpoint rejects an empty input list; short-circuit
        # instead of burning an API call that would error.
        return []
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=texts,
    )
    return [item.embedding for item in response.data]
def index_documents(folder_path: str, collection_name: str = "documents"):
    """Chunk, embed, and store every document under *folder_path*.

    Embeddings are requested in batches of 100 chunks to stay under the
    API's per-request input limits; progress is printed after each batch.
    """
    collection = get_collection(collection_name)
    docs = load_documents(folder_path)
    if not docs:
        print("No documents found.")
        return
    batch_size = 100
    total = len(docs)
    done = 0
    for start in range(0, total, batch_size):
        batch = docs[start:start + batch_size]
        texts = [d["text"] for d in batch]
        collection.add(
            ids=[d["id"] for d in batch],
            documents=texts,
            embeddings=embed_text(texts),
            metadatas=[
                {
                    "source_file": d["source_file"],
                    "filename": d["filename"],
                    "chunk_num": d["chunk_num"],
                }
                for d in batch
            ],
        )
        done += len(batch)
        print(f"Indexed {done}/{total} chunks...")
    unique_files = {d["filename"] for d in docs}
    print(f"Done. {done} chunks indexed from {len(unique_files)} files.")
Step 4: Build the Search Function
Query the vector store with a natural language question.
def semantic_search(
    query: str,
    collection_name: str = "documents",
    top_k: int = 5,
    min_relevance: float = 0.5
) -> list[dict]:
    """Return up to *top_k* chunks semantically similar to *query*.

    Relevance is 1 - cosine distance; hits scoring below *min_relevance*
    are dropped. Ranks reflect the raw result order, so filtered-out hits
    still consume rank numbers (matching the original behavior).
    """
    collection = get_collection(collection_name)
    query_vector = embed_text([query])[0]
    raw = collection.query(
        query_embeddings=[query_vector],
        n_results=top_k,
        include=["documents", "metadatas", "distances"],
    )
    docs = raw["documents"][0]
    metas = raw["metadatas"][0]
    distances = raw["distances"][0]
    hits = []
    rank = 0
    for doc, meta, dist in zip(docs, metas, distances):
        rank += 1
        score = 1 - dist  # cosine distance -> similarity
        if score < min_relevance:
            continue
        hits.append({
            "rank": rank,
            "text": doc,
            "source": meta.get("filename", "unknown"),
            "source_file": meta.get("source_file", ""),
            "relevance_score": round(score, 3),
        })
    return hits
# Test it
# Smoke-test the retrieval path: run one natural-language query and print
# each hit's rank, source filename, relevance score, and a 200-char preview.
results = semantic_search("how do I reduce my cost per lead?")
for r in results:
    print(f"[{r['rank']}] {r['source']} (score: {r['relevance_score']})")
    print(f" {r['text'][:200]}...")
    print()
Step 5: Add AI-Powered Answer Synthesis
Combine semantic search with a generative model to produce a direct answer with source citations.
def search_and_answer(
    query: str,
    collection_name: str = "documents",
    top_k: int = 5,
    min_relevance: float = 0.4,
) -> dict:
    """Answer *query* from the knowledge base, citing source documents.

    Retrieves the top chunks via semantic_search, then asks the chat model
    to answer using only those chunks as context.

    Args:
        query: Natural-language question.
        collection_name: Chroma collection to search.
        top_k: Maximum number of chunks to retrieve.
        min_relevance: Relevance floor passed to semantic_search. Exposed as
            a parameter (same 0.4 default as before) instead of hard-coded,
            so callers can tune recall vs. precision.

    Returns:
        Dict with keys "answer", "sources", "top_hits", "query".
    """
    # Get relevant chunks
    hits = semantic_search(query, collection_name, top_k=top_k, min_relevance=min_relevance)
    if not hits:
        return {
            "answer": "I couldn't find relevant information for that question in the knowledge base.",
            "sources": [],
            # Consistent shape with the success path below.
            "top_hits": [],
            "query": query,
        }
    # Build context from top results, labeling each chunk with its source
    # so the model can cite filenames in its answer.
    context_parts = [f"[Source: {hit['source']}]\n{hit['text']}" for hit in hits]
    context = "\n\n---\n\n".join(context_parts)
    # Generate answer; low temperature keeps it grounded in the context.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": """You are a helpful assistant answering questions based only on the provided context.
Be direct and specific. If the context doesn't fully answer the question, say so.
Always cite which source(s) your answer comes from."""
            },
            {
                "role": "user",
                "content": f"Question: {query}\n\nContext:\n{context}"
            }
        ],
        temperature=0.2
    )
    answer = response.choices[0].message.content
    # Deduplicate source filenames across hits.
    sources = list(set(hit["source"] for hit in hits))
    return {
        "answer": answer,
        "sources": sources,
        "top_hits": hits,
        "query": query,
    }
# Usage
# End-to-end example: ask a question, print the synthesized answer and the
# deduplicated list of source filenames it drew from.
result = search_and_answer("What is the best budget to start with for Facebook ads?")
print(result["answer"])
print(f"\nSources: {', '.join(result['sources'])}")
Step 6: Keep the Index Fresh
Re-index when documents change. Track which files have been indexed to avoid duplicates.
import hashlib
import json
from pathlib import Path
# JSON manifest mapping each indexed file path -> MD5 content hash, stored
# alongside the Chroma database so change detection survives restarts.
INDEX_MANIFEST = Path("./search_db/manifest.json")
def get_file_hash(file_path: str) -> str:
    """Return the hex MD5 digest of the file's bytes.

    Used purely as a change-detection fingerprint, not for security.
    """
    raw = Path(file_path).read_bytes()
    return hashlib.md5(raw).hexdigest()
def update_index(folder_path: str, collection_name: str = "documents"):
    """Re-index *folder_path* only when .txt/.md files were added or changed.

    Compares each file's current MD5 against the stored manifest. When any
    file is new or modified, the whole folder is re-indexed and the manifest
    rewritten; otherwise nothing happens.
    """
    manifest = {}
    if INDEX_MANIFEST.exists():
        manifest = json.loads(INDEX_MANIFEST.read_text())
    folder = Path(folder_path)
    # BUG FIX: pathlib.Path.glob has no brace expansion — "**/*.{txt,md}"
    # matched nothing, so change detection never saw any files. Glob each
    # suffix separately instead.
    files = [p for pattern in ("**/*.txt", "**/*.md") for p in folder.glob(pattern)]
    changed = []
    for f in files:
        current_hash = get_file_hash(str(f))
        if manifest.get(str(f)) != current_hash:
            changed.append(f)
            manifest[str(f)] = current_hash
    if changed:
        print(f"Re-indexing {len(changed)} changed files...")
        # For simplicity, re-index all. In production, delete old chunks by source first.
        index_documents(folder_path, collection_name)
        INDEX_MANIFEST.write_text(json.dumps(manifest, indent=2))
    else:
        print("Index is up to date.")
What to Build Next
- Build a chat interface on top of the search-and-answer function so users can have a multi-turn conversation with your knowledge base
- Add metadata filtering to let users scope their search to a specific document category, date range, or author
- Track search queries and no-result searches so you know exactly which questions your knowledge base does not answer yet
Related Reading
- How to Write System Prompts That Control AI Behavior - the answer synthesis prompt determines how well retrieved context is used
- How to Build AI Guardrails for Safe Outputs - prevent the model from hallucinating answers when context is thin
- How to Use AI for Automated Data Extraction - combine extraction and search so newly extracted data is immediately searchable
Want this system built for your business?
Get a free assessment. We will map every system your business needs and show you the ROI.
Get Your Free Assessment