Systems Library / AI Model Setup / How to Build AI-Powered Search for Your Data
AI Model Setup advanced

How to Build AI-Powered Search for Your Data

Replace keyword search with AI-powered semantic search across your documents.

Jay Banlasan

Jay Banlasan

The AI Systems Guy

Keyword search fails the moment a user phrases their query differently from the exact words in your documents. Semantic search finds results based on meaning, not exact words. A user searching for "how to lower my ad costs" will find your article titled "reducing cost per acquisition" even though neither phrase appears in the other. That is the difference between a useful search and a frustrating one.

I use semantic search for internal knowledge bases, client document libraries, and customer-facing help centers. The setup is 3-4 hours. The impact is immediate. Users stop asking support questions that the docs already answer because they can finally find those docs.

What You Need Before Starting: an OpenAI API key, Python 3.10 or newer, and a folder of text or markdown documents you want to make searchable.

Step 1: Install Dependencies

pip install openai chromadb

ChromaDB runs locally with zero infrastructure setup. For production with millions of documents, switch to Pinecone or Weaviate. The code pattern is nearly identical.

Step 2: Prepare and Chunk Your Documents

Long documents need to be split into chunks before embedding. The chunk size is a tuning parameter: smaller chunks are more precise, larger chunks give more context.

from pathlib import Path
from typing import Generator

def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[dict]:
    """Split text into overlapping word-based chunks for better retrieval.

    Args:
        text: Raw document text.
        chunk_size: Maximum number of words per chunk.
        overlap: Words shared between consecutive chunks so sentences that
            straddle a chunk boundary remain retrievable.

    Returns:
        A list of dicts with keys "text", "chunk_num", "word_start", and
        "word_end" (word indices into the whitespace-split text).

    Raises:
        ValueError: If overlap >= chunk_size — the advance step would be
            zero or negative, which previously caused an infinite loop.
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    chunks = []
    step = chunk_size - overlap  # how far the window advances each iteration

    # NOTE: the original code rebound a local named `chunk_text`, shadowing
    # this function's own name; renamed to avoid that foot-gun.
    for chunk_num, start in enumerate(range(0, len(words), step)):
        piece = words[start:start + chunk_size]
        chunks.append({
            "text": " ".join(piece),
            "chunk_num": chunk_num,
            "word_start": start,
            "word_end": min(start + chunk_size, len(words)),
        })

    return chunks

def load_documents(folder_path: str) -> list[dict]:
    """Load and chunk all .txt and .md files under folder_path, recursively.

    Returns one dict per chunk with a stable id of the form
    "<file stem>_<chunk number>", the chunk text, and source metadata.
    """
    documents = []
    folder = Path(folder_path)

    # BUG FIX: pathlib's glob does not support brace expansion, so the
    # original pattern "**/*.{txt,md}" matched no files at all. Glob each
    # extension separately; sorted for a deterministic indexing order.
    matched = sorted(
        path
        for pattern in ("**/*.txt", "**/*.md")
        for path in folder.glob(pattern)
    )

    for file_path in matched:
        text = file_path.read_text(encoding="utf-8")
        for chunk in chunk_text(text):
            documents.append({
                "id": f"{file_path.stem}_{chunk['chunk_num']}",
                "text": chunk["text"],
                "source_file": str(file_path),
                "filename": file_path.name,
                "chunk_num": chunk["chunk_num"]
            })

    return documents

Step 3: Generate Embeddings and Store in ChromaDB

Embeddings are numerical representations of meaning. Documents with similar meaning have similar embeddings, which is how semantic matching works.

import chromadb
import openai

# NOTE(review): replace the placeholder with a real key — better, load it
# from an environment variable (e.g. os.environ["OPENAI_API_KEY"]) instead
# of hard-coding a secret in source.
client = openai.OpenAI(api_key="YOUR_API_KEY")
# Persistent local vector store; data survives restarts under ./search_db.
chroma_client = chromadb.PersistentClient(path="./search_db")

def get_collection(name: str = "documents"):
    """Fetch (creating if necessary) the named ChromaDB collection.

    Cosine distance is configured so query distances convert cleanly to
    similarity scores via 1 - distance.
    """
    options = {"hnsw:space": "cosine"}
    collection = chroma_client.get_or_create_collection(name=name, metadata=options)
    return collection

def embed_text(texts: list[str]) -> list[list[float]]:
    """Embed each text with OpenAI's text-embedding-3-small model."""
    result = client.embeddings.create(
        model="text-embedding-3-small",
        input=texts,
    )
    vectors = []
    for item in result.data:
        vectors.append(item.embedding)
    return vectors

def index_documents(folder_path: str, collection_name: str = "documents"):
    """Embed every chunk under folder_path and store it in ChromaDB.

    Chunks are processed 100 at a time to stay within embedding-API
    request limits; progress is printed after each batch.
    """
    collection = get_collection(collection_name)
    docs = load_documents(folder_path)

    if not docs:
        print("No documents found.")
        return

    batch_size = 100
    total_done = 0

    for start in range(0, len(docs), batch_size):
        batch = docs[start:start + batch_size]
        batch_texts = [d["text"] for d in batch]
        vectors = embed_text(batch_texts)

        collection.add(
            ids=[d["id"] for d in batch],
            documents=batch_texts,
            embeddings=vectors,
            metadatas=[
                {
                    "source_file": d["source_file"],
                    "filename": d["filename"],
                    "chunk_num": d["chunk_num"],
                }
                for d in batch
            ],
        )

        total_done += len(batch)
        print(f"Indexed {total_done}/{len(docs)} chunks...")

    file_count = len({d["filename"] for d in docs})
    print(f"Done. {total_done} chunks indexed from {file_count} files.")

Step 4: Build the Search Function

Query the vector store with a natural language question.

def semantic_search(
    query: str,
    collection_name: str = "documents",
    top_k: int = 5,
    min_relevance: float = 0.5
) -> list[dict]:
    """Return the top_k chunks most semantically similar to query.

    Hits whose cosine-similarity score falls below min_relevance are
    dropped; each remaining hit keeps its original rank in the results.
    """
    collection = get_collection(collection_name)

    # Embed the query the same way the documents were embedded.
    query_vector = embed_text([query])[0]

    raw = collection.query(
        query_embeddings=[query_vector],
        n_results=top_k,
        include=["documents", "metadatas", "distances"],
    )

    matches = []
    rank = 0
    for doc, meta, dist in zip(raw["documents"][0], raw["metadatas"][0], raw["distances"][0]):
        rank += 1
        score = 1 - dist  # cosine distance -> similarity
        if score < min_relevance:
            continue
        matches.append({
            "rank": rank,
            "text": doc,
            "source": meta.get("filename", "unknown"),
            "source_file": meta.get("source_file", ""),
            "relevance_score": round(score, 3),
        })

    return matches

# Test it
# Smoke test: print rank, source filename, similarity score, and a
# 200-character preview for each hit.
results = semantic_search("how do I reduce my cost per lead?")
for r in results:
    print(f"[{r['rank']}] {r['source']} (score: {r['relevance_score']})")
    print(f"   {r['text'][:200]}...")
    print()

Step 5: Add AI-Powered Answer Synthesis

Combine semantic search with a generative model to produce a direct answer with source citations.

def search_and_answer(
    query: str,
    collection_name: str = "documents",
    top_k: int = 5
) -> dict:
    """Answer query from retrieved chunks, citing the source files.

    Runs semantic search, then asks a chat model to answer strictly from
    the retrieved context. Returns a dict with the answer, the distinct
    source filenames, the raw hits, and the original query.
    """
    hits = semantic_search(query, collection_name, top_k=top_k, min_relevance=0.4)

    # Nothing relevant enough retrieved — say so instead of guessing.
    if not hits:
        return {
            "answer": "I couldn't find relevant information for that question in the knowledge base.",
            "sources": [],
            "query": query
        }

    # Label each chunk with its source so the model can cite it.
    context = "\n\n---\n\n".join(
        f"[Source: {hit['source']}]\n{hit['text']}" for hit in hits
    )

    system_prompt = """You are a helpful assistant answering questions based only on the provided context.
Be direct and specific. If the context doesn't fully answer the question, say so.
Always cite which source(s) your answer comes from."""

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Question: {query}\n\nContext:\n{context}"}
        ],
        temperature=0.2  # low temperature keeps the answer grounded in context
    )

    return {
        "answer": response.choices[0].message.content,
        "sources": list({hit["source"] for hit in hits}),
        "top_hits": hits,
        "query": query
    }

# Usage
# Example: ask a question, print the synthesized answer and the distinct
# source files it was drawn from.
result = search_and_answer("What is the best budget to start with for Facebook ads?")
print(result["answer"])
print(f"\nSources: {', '.join(result['sources'])}")

Step 6: Keep the Index Fresh

Re-index when documents change. Track which files have been indexed to avoid duplicates.

import hashlib
import json
from pathlib import Path

INDEX_MANIFEST = Path("./search_db/manifest.json")

def get_file_hash(file_path: str) -> str:
    """Return the MD5 hex digest of a file's bytes.

    MD5 is used purely for change detection here, not for security.
    """
    digest = hashlib.md5()
    digest.update(Path(file_path).read_bytes())
    return digest.hexdigest()

def update_index(folder_path: str, collection_name: str = "documents"):
    """Only re-index changed or new files.

    Compares each file's MD5 hash against the stored manifest; when any
    file is new or modified, re-indexes the folder and persists the
    updated manifest.
    """
    manifest = {}
    if INDEX_MANIFEST.exists():
        manifest = json.loads(INDEX_MANIFEST.read_text())

    folder = Path(folder_path)
    # BUG FIX: pathlib globbing has no brace expansion — the original
    # "**/*.{txt,md}" pattern matched nothing, so changes were never
    # detected. Glob each extension separately instead.
    files = [
        path
        for pattern in ("**/*.txt", "**/*.md")
        for path in folder.glob(pattern)
    ]
    changed = []

    for f in files:
        current_hash = get_file_hash(str(f))
        if manifest.get(str(f)) != current_hash:
            changed.append(f)
            manifest[str(f)] = current_hash

    if changed:
        print(f"Re-indexing {len(changed)} changed files...")
        # For simplicity, re-index all. In production, delete old chunks by source first.
        index_documents(folder_path, collection_name)
        # Ensure the manifest directory exists before writing (first run).
        INDEX_MANIFEST.parent.mkdir(parents=True, exist_ok=True)
        INDEX_MANIFEST.write_text(json.dumps(manifest, indent=2))
    else:
        print("Index is up to date.")

What to Build Next

Related Reading

Want this system built for your business?

Get a free assessment. We will map every system your business needs and show you the ROI.

Get Your Free Assessment

Related Systems