When building AI applications that need to answer questions about your own data — internal documents, product manuals, customer support databases — two approaches dominate: fine-tuning a language model and Retrieval-Augmented Generation (RAG). For most real-world use cases, RAG wins decisively. It's cheaper, faster to set up, keeps information current without retraining, and is far easier to debug.
In this tutorial, you'll build a fully working RAG chatbot from scratch using LangChain and OpenAI. By the end, you'll have a system that can answer questions over any document corpus — PDFs, Word files, websites — and you'll deploy it as a FastAPI service.
What Is RAG and Why Does It Work?
Large Language Models like GPT-4 are trained on data with a knowledge cutoff. They don't know about your internal documents, recent events, or proprietary data. RAG solves this by dynamically retrieving relevant context from a knowledge base and injecting it into the LLM's prompt at inference time.
RAG vs Fine-tuning: When to Use Each
| Criterion | RAG | Fine-tuning |
|---|---|---|
| Cost | Low — no GPU training needed, just API calls | High — requires GPU hours ($50–$500+) |
| Data freshness | Excellent — update vector store in seconds | Poor — requires full retraining to update |
| Accuracy on domain | Good — depends on retrieval quality | Excellent — model learns domain language deeply |
| Transparency | High — can show retrieved sources | Low — knowledge is baked into weights |
| Best use case | Q&A over docs, customer support, search | Custom tone/style, classification, structured output |
| Setup complexity | Medium — pipeline + vector DB to manage | High — data prep, training, evaluation loop |
| Cold start time | Hours — just index your documents | Days to weeks — data prep + training + eval |
System Architecture
Building the RAG Chatbot — Step by Step
Step 1: Install Dependencies
# Install all required packages
pip install langchain langchain-openai langchain-community \
chromadb openai tiktoken pypdf python-dotenv
# Create .env file (NEVER commit this to git!)
echo "OPENAI_API_KEY=sk-your-key-here" > .env
Step 2: Load and Split Documents
# document_loader.py
# ──────────────────────────────────────────────────────────
# Load documents from multiple sources and split into chunks
# ──────────────────────────────────────────────────────────
from langchain_community.document_loaders import (
PyPDFLoader,
WebBaseLoader,
TextLoader,
DirectoryLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
def load_documents(source_path: str) -> list:
    """Load documents from a URL, a PDF file, a directory, or a plain-text file.

    Args:
        source_path: An ``http://`` / ``https://`` URL, a path ending in
            ``.pdf`` (any letter case), a directory, or any other file path,
            which is read as plain text.

    Returns:
        The list of documents produced by the matching LangChain loader.
    """
    # Match the full scheme so a local file such as "http_notes.txt" is not
    # mistaken for a URL.
    if source_path.startswith(("http://", "https://")):
        loader = WebBaseLoader(source_path)
        docs = loader.load()
        print(f"Loaded {len(docs)} pages from web")
    elif source_path.lower().endswith(".pdf"):
        # Case-insensitive so "REPORT.PDF" is parsed as a PDF, not as text.
        loader = PyPDFLoader(source_path)
        docs = loader.load()
        print(f"Loaded PDF with {len(docs)} pages")
    elif Path(source_path).is_dir():
        # Recursively load every .txt file under the directory.
        loader = DirectoryLoader(
            source_path,
            glob="**/*.txt",
            loader_cls=TextLoader,
        )
        docs = loader.load()
        print(f"Loaded {len(docs)} files from directory")
    else:
        # Anything else is treated as a single plain-text file.
        loader = TextLoader(source_path)
        docs = loader.load()
        print("Loaded text file")
    return docs
def split_documents(docs: list, chunk_size: int = 1000, chunk_overlap: int = 200) -> list:
    """Split documents into overlapping chunks.

    Args:
        docs: Documents to split.
        chunk_size: Target chunk length in characters (measured with ``len``).
        chunk_overlap: Number of characters shared by consecutive chunks, so
            context is not lost at chunk boundaries.

    Returns:
        The list of chunks produced by the splitter.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Try larger splits first: paragraph, line, sentence, word, character.
        separators=["\n\n", "\n", ". ", " ", ""],
        length_function=len,
        add_start_index=True,  # track position in original doc
    )
    chunks = splitter.split_documents(docs)
    print(f"Split into {len(chunks)} chunks")
    # Show a sample chunk so the split can be checked by eye.
    if chunks:
        print(f"Sample chunk (first 200 chars): {chunks[0].page_content[:200]}...")
    return chunks
# Example usage: running this module directly loads and chunks a sample page.
if __name__ == "__main__":
    docs = load_documents("https://hexcodenepal.com/about")
    chunks = split_documents(docs)
Step 3: Create Embeddings and Vector Store
# vector_store.py
# ──────────────────────────────────────────────────────────
# Create embeddings and store in ChromaDB
# ──────────────────────────────────────────────────────────
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
import os
from dotenv import load_dotenv
load_dotenv() # Load OPENAI_API_KEY from .env
def create_vector_store(chunks: list[Document], persist_dir: str = "./chroma_db") -> Chroma:
    """Embed document chunks and persist them in a ChromaDB collection.

    Uses OpenAI's text-embedding-3-small model:
    - 1536 dimensions
    - Cost: $0.02 per 1M tokens (very cheap)
    - Excellent quality for retrieval tasks

    Args:
        chunks: Document chunks to embed; must not be empty.
        persist_dir: Directory in which Chroma persists the collection.

    Returns:
        The Chroma vector store holding the embedded chunks.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # Fail early with a clear message instead of sending an empty batch to
    # the embeddings API and the vector database.
    if not chunks:
        raise ValueError("Cannot build a vector store from an empty list of chunks")

    embedding_model = OpenAIEmbeddings(
        model="text-embedding-3-small",
        openai_api_key=os.getenv("OPENAI_API_KEY"),
    )
    print(f"Creating embeddings for {len(chunks)} chunks...")
    print("This calls the OpenAI embeddings API — will use tokens")
    # Create vector store (this embeds all chunks and persists to disk)
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_model,
        persist_directory=persist_dir,
        collection_name="rag_collection",
    )
    print(f"Vector store created at: {persist_dir}")
    # Count is read from the underlying collection rather than len(chunks).
    print(f"Total vectors: {vectorstore._collection.count()}")
    return vectorstore
def load_vector_store(persist_dir: str = "./chroma_db") -> Chroma:
    """Open the Chroma collection persisted under ``persist_dir``.

    The embedding model is attached to the store; this function itself only
    opens the "rag_collection" collection and prints how many vectors it holds.
    """
    store = Chroma(
        collection_name="rag_collection",
        embedding_function=OpenAIEmbeddings(model="text-embedding-3-small"),
        persist_directory=persist_dir,
    )
    vector_count = store._collection.count()
    print(f"Loaded vector store: {vector_count} vectors")
    return store
def test_retrieval(vectorstore: Chroma, query: str, k: int = 4):
    """Print the top-``k`` chunks retrieved for ``query`` as a sanity check.

    Args:
        vectorstore: The Chroma store to search.
        query: Natural-language query to embed and search for.
        k: Number of chunks to retrieve.
    """
    # similarity_search_with_score returns (document, score) pairs, so the
    # score line below has a value to show; the score is a distance.
    results = vectorstore.similarity_search_with_score(query, k=k)
    print(f"\nTop {k} chunks for query: '{query}'")
    for rank, (doc, score) in enumerate(results, start=1):
        print(f"\n[{rank}] Distance (lower = more similar): {score:.4f}")
        print(f" Source: {doc.metadata.get('source', 'unknown')}")
        print(f" Content: {doc.page_content[:150]}...")
The text-embedding-3-small model costs $0.02 per 1 million tokens. For a typical document corpus:
- 100-page PDF ≈ 50,000 tokens ≈ $0.001 (essentially free)
- 1,000 documents ≈ 2M tokens ≈ $0.04
- Full company wiki (10k pages) ≈ 5M tokens ≈ $0.10
Embeddings are a one-time cost — once indexed, retrieval only costs the query embedding (~$0.000001 per query).
Step 4: Build the Retrieval Chain
# rag_chain.py
# ──────────────────────────────────────────────────────────
# Build the RAG chain using LCEL (LangChain Expression Language)
# ──────────────────────────────────────────────────────────
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain_community.vectorstores import Chroma
import os
# System message for the chat prompt. ``{context}`` is a template placeholder
# that is filled with the formatted retrieved chunks when the prompt is rendered.
SYSTEM_PROMPT = """You are a helpful AI assistant. Answer the user's question
using ONLY the provided context. If the answer is not in the context, say
"I don't have information about that in my knowledge base."
Always cite which document/section your answer comes from.
Context:
{context}
"""
def format_docs(docs) -> str:
    """Format retrieved documents into a single context string.

    Each document becomes a ``[Source N: <source>]`` header line followed by
    its text. ``<source>`` comes from the document's ``source`` metadata and
    falls back to ``"Unknown"``. Documents are joined by a ``---`` rule with
    blank lines around it.

    Args:
        docs: Iterable of objects exposing ``metadata`` (a mapping) and
            ``page_content`` (a string).

    Returns:
        The combined context string; empty when ``docs`` is empty.
    """
    return "\n\n---\n\n".join(
        f"[Source {index}: {doc.metadata.get('source', 'Unknown')}]\n{doc.page_content}"
        for index, doc in enumerate(docs, start=1)
    )
def _build_retriever(vectorstore: Chroma):
    """Return the MMR retriever used for both answering and source reporting."""
    return vectorstore.as_retriever(
        search_type="mmr",  # Maximal Marginal Relevance (reduces redundancy)
        search_kwargs={
            "k": 4,  # Return top 4 chunks
            "fetch_k": 20,  # Fetch 20, then pick 4 diverse ones
        },
    )


def _build_answer_chain():
    """Return the prompt → LLM → string pipeline; its input is a dict with
    ``context`` and ``question`` keys."""
    # LLM: gpt-4o-mini is fast and cost-effective for RAG
    llm = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0.1,  # Low temperature for factual answers
        openai_api_key=os.getenv("OPENAI_API_KEY"),
    )
    prompt = ChatPromptTemplate.from_messages([
        ("system", SYSTEM_PROMPT),
        ("human", "{question}"),
    ])
    return prompt | llm | StrOutputParser()


def build_rag_chain(vectorstore: Chroma):
    """Build a RAG chain using LangChain Expression Language (LCEL).

    Chain flow:
    question → retriever → format_docs → prompt → LLM → string output

    Args:
        vectorstore: The Chroma store to retrieve context from.

    Returns:
        A runnable that takes the question string and returns the answer string.
    """
    retriever = _build_retriever(vectorstore)
    # LCEL chain — data flows left to right (top to bottom) through the pipes:
    # 1. Retrieve docs for the question and format them into context
    # 2. Pass the question through unchanged
    # 3. Fill the prompt template, send it to the LLM, parse output to string
    return (
        {
            "context": retriever | format_docs,
            "question": RunnablePassthrough(),
        }
        | _build_answer_chain()
    )


def chat_with_sources(vectorstore: Chroma, question: str):
    """Query the RAG system and return the answer with the chunks it used.

    Retrieval runs once, with the same MMR retriever settings as
    ``build_rag_chain``, and the retrieved chunks are used both as the LLM
    context and as the reported sources, so the two always agree.

    Args:
        vectorstore: The Chroma store to retrieve context from.
        question: The user's question.

    Returns:
        A dict with ``question``, ``answer`` and ``sources``; each source has
        ``content`` (first 200 characters of the chunk) and ``source``.
    """
    docs = _build_retriever(vectorstore).invoke(question)
    answer = _build_answer_chain().invoke(
        {"context": format_docs(docs), "question": question}
    )
    return {
        "question": question,
        "answer": answer,
        "sources": [
            {
                "content": doc.page_content[:200],
                "source": doc.metadata.get("source", "unknown"),
            }
            for doc in docs
        ],
    }
Step 5: End-to-End Query Example
# main.py — Putting it all together
from document_loader import load_documents, split_documents
from vector_store import create_vector_store, load_vector_store
from rag_chain import chat_with_sources
from pathlib import Path
import json
# Directory passed to create_vector_store / load_vector_store as the on-disk
# location of the vector store; main() checks it to decide whether to rebuild.
PERSIST_DIR = "./chroma_db"
def build_knowledge_base(source: str):
    """Index ``source`` from scratch and return the resulting vector store.

    Runs the whole ingestion pipeline: load the documents, split them into
    chunks, then embed the chunks and persist them under ``PERSIST_DIR``.
    """
    print("Building knowledge base...")
    chunked = split_documents(load_documents(source))
    store = create_vector_store(chunked, PERSIST_DIR)
    print("Knowledge base ready!")
    return store
def main():
    """Run an interactive question/answer loop over the knowledge base.

    Loads the persisted vector store when ``PERSIST_DIR`` exists, otherwise
    builds it from ``./documents/``. The loop ends on "quit", "exit" or "q",
    or when input is closed or interrupted (Ctrl-D / Ctrl-C).
    """
    # Reuse an existing vector store when there is one; indexing costs API calls.
    if Path(PERSIST_DIR).exists():
        print("Loading existing knowledge base...")
        vectorstore = load_vector_store(PERSIST_DIR)
    else:
        # Build from your documents (replace with your source)
        vectorstore = build_knowledge_base("./documents/")

    print("\n" + "=" * 50)
    print("RAG Chatbot Ready! Type 'quit' to exit.")
    print("=" * 50 + "\n")

    while True:
        try:
            question = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Leave the loop cleanly instead of ending with a traceback.
            print()
            break
        if question.lower() in ("quit", "exit", "q"):
            break
        if not question:
            continue
        result = chat_with_sources(vectorstore, question)
        print(f"\nAssistant: {result['answer']}")
        print("\nSources used:")
        for number, src in enumerate(result["sources"], start=1):
            print(f" [{number}] {src['source']}")
        print()
# Script entry point: start the interactive chat only when run directly.
if __name__ == "__main__":
    main()
Your OpenAI API key gives full billing access to your account. Common mistakes that lead to costly leaks:
- Hardcoding keys in Python files and pushing to GitHub (bots scan public repos constantly)
- Storing keys in Jupyter notebooks — these get shared easily
- Printing keys in logs
Safe practices: Always use .env files + python-dotenv. Add .env to .gitignore. Use OpenAI's usage limits to cap spending. Rotate keys immediately if exposed.
Real-World Use Case: Nepali Bank Customer Support Bot
Let's apply this to a concrete Nepal use case: a customer support chatbot for a Nepali bank (e.g., Nepal SBI, NMB Bank, or Global IME). The bank has hundreds of pages of FAQs, product brochures, terms and conditions, and loan documents. Instead of building a rigid rule-based FAQ system, a RAG chatbot can answer any question naturally.
The knowledge base would include: loan product PDFs, interest rate schedules, branch information, account opening requirements, remittance procedures, and digital banking guides. All documents are loaded, chunked, and indexed once. When a customer asks "What documents do I need for a home loan?", the system retrieves the relevant sections from the loan PDF and generates a precise, sourced answer.
Deploying as a FastAPI Service
# api.py — Production-ready FastAPI wrapper for the RAG chatbot
import asyncio
import logging
import time
from contextlib import asynccontextmanager

import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field

from rag_chain import chat_with_sources
from vector_store import load_vector_store
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Global vectorstore (loaded once at startup by lifespan()). It stays None
# until then; /chat answers 503 and /health reports 0 documents while it is None.
vectorstore = None
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: fill the module-level vector store.

    Code before ``yield`` runs at startup and loads the store from
    "./chroma_db"; code after ``yield`` runs at shutdown and only logs.
    """
    global vectorstore

    logger.info("Loading vector store...")
    vectorstore = load_vector_store("./chroma_db")
    logger.info("RAG system ready!")

    yield  # control returns here when the application shuts down

    logger.info("Shutting down...")
# Application instance; `lifespan` loads the vector store at startup.
app = FastAPI(
    title="RAG Chatbot API",
    version="1.0.0",
    description="Document Q&A powered by LangChain + OpenAI",
    lifespan=lifespan,
)

# NOTE(review): wildcard origins, methods and headers let any website call this
# API from a browser — restrict allow_origins before exposing it publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# Request body for POST /chat.
class QueryRequest(BaseModel):
    # The user's question; required, between 1 and 500 characters.
    question: str = Field(..., min_length=1, max_length=500)
    # When False, the response carries an empty sources list.
    include_sources: bool = Field(default=True)
# Response body for POST /chat.
class QueryResponse(BaseModel):
    # The question, echoed back from the RAG result.
    question: str
    # The generated answer text.
    answer: str
    # Retrieved source entries, or an empty list when sources are not requested.
    sources: list[dict]
    # Time spent answering the request, in milliseconds.
    latency_ms: float
@app.post("/chat", response_model=QueryResponse)
async def chat(request: QueryRequest):
    """Query the RAG chatbot."""
    # Responds 503 while the vector store is not loaded and 500 when answer
    # generation fails; otherwise returns the answer, optional sources and the
    # measured latency in milliseconds.
    if vectorstore is None:
        raise HTTPException(status_code=503, detail="Knowledge base not loaded")

    # perf_counter is monotonic, so the latency cannot be skewed by clock changes.
    start = time.perf_counter()
    try:
        # chat_with_sources is synchronous; run it in a worker thread so its
        # blocking retrieval and LLM calls do not stall the event loop.
        result = await asyncio.to_thread(
            chat_with_sources, vectorstore, request.question
        )
    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception("RAG error: %s", e)
        raise HTTPException(status_code=500, detail="Failed to generate answer") from e
    latency = (time.perf_counter() - start) * 1000

    return QueryResponse(
        question=result["question"],
        answer=result["answer"],
        sources=result["sources"] if request.include_sources else [],
        latency_ms=round(latency, 1),
    )
# Health probe: reports whether the vector store is loaded and how many
# entries its collection holds (0 while the store is not loaded).
@app.get("/health")
def health():
    store_loaded = vectorstore is not None
    entry_count = vectorstore._collection.count() if vectorstore else 0
    return {
        "status": "healthy",
        "vectorstore_loaded": store_loaded,
        "doc_count": entry_count,
    }
# Direct-run entry point: serve on all interfaces, port 8000, with auto-reload.
if __name__ == "__main__":
    uvicorn.run("api:app", port=8000, host="0.0.0.0", reload=True)
What You've Built
You now have a working end-to-end RAG pipeline that can answer questions over any document corpus. The system loads and chunks documents, creates semantic embeddings, stores them in ChromaDB, and retrieves relevant context to augment LLM responses. The FastAPI wrapper makes it easy to integrate with any frontend.
Next steps to make this production-grade: add authentication to the API, implement conversation history (using LangChain's memory modules), add a reranker to improve retrieval precision, monitor latency and quality with LangSmith, and consider switching to Pinecone for a managed cloud vector store that scales automatically.
RAG chatbots are one of the highest-ROI AI projects you can build today. Start with a simple document collection you know well — your company wiki, a research paper collection, or even this blog — and iterate from there.