I built a chatbot using FastAPI with Langchain and deployed it on Render’s free tier. Right now I’m using threading to handle multiple users but I’m not sure if this is the right approach. The bot streams responses and works fine but I’m concerned about some potential problems.
Memory Issues: Each user gets their own thread which seems like it will eat up too much memory and CPU when more people start using it.
State Storage: All conversation states live in memory right now. When the app restarts, everything gets wiped out. I know I could use a database, but then I need to figure out how to clean up old conversations and manage the data properly.
Is threading the best way to handle multiple users? What’s the standard way to manage conversation states without keeping them forever? This is my first time building something like this so any advice would be helpful.
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse, JSONResponse
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, MessagesState, StateGraph
from vector_db import DocumentRetriever
from fastapi.middleware.cors import CORSMiddleware
import os
# ASGI application instance; routes are registered on it below.
api_app = FastAPI()
# Permissive CORS so browser clients on any origin can call the API.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# rejected by browsers for credentialed requests — pin concrete origins
# before production use.
api_app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Conversation graph over the standard MessagesState (message-list state
# with an append-style reducer).
graph = StateGraph(state_schema=MessagesState)
# NVIDIA-hosted Llama 3.1 70B chat client — presumably reads its API key
# from the environment; TODO confirm credential setup.
llm_client = ChatNVIDIA(model="meta/llama-3.1-70b-instruct")
# Project-local vector-store wrapper used for RAG retrieval (see vector_db).
doc_retriever = DocumentRetriever()
async def process_request(state: MessagesState, config) -> dict:
    """LangGraph node: answer the newest user message with RAG context.

    Takes a bounded window of the conversation, retrieves supporting
    documents for the latest message, and asks the LLM for a reply.

    Args:
        state: Graph state; ``state["messages"]`` holds the conversation.
        config: Runnable config; ``configurable.thread_id`` scopes the session.

    Returns:
        A dict with a ``messages`` key holding the model's reply, which the
        MessagesState reducer appends to the history.

    Raises:
        RuntimeError: if the state is empty or the LLM call fails
            (chained to the original error).
    """
    # Bound the prompt: only the 15 most recent messages are sent to the LLM.
    recent_messages = state["messages"][-15:]
    if not recent_messages:
        # Defensive guard — the graph should never run without input.
        raise RuntimeError("Processing failed: no messages in state")
    query = recent_messages[-1].content

    # Ground the answer in retrieved documents for the newest message.
    docs = doc_retriever.search_documents(query, top_k=2)
    context = "\n".join(doc.page_content for doc in docs)

    # NOTE(review): the original also read config["configurable"].get("bot_type")
    # but never used it — dropped; restore when per-bot prompts are implemented.
    system_msg = f"You are a helpful assistant. Context: {context}"
    full_messages = [SystemMessage(content=system_msg)] + recent_messages
    try:
        result = await llm_client.ainvoke(full_messages)
    except Exception as err:
        # Chain the cause instead of discarding it; RuntimeError is a subclass
        # of Exception, so existing broad handlers still catch it.
        raise RuntimeError(f"Processing failed: {err}") from err
    return {"messages": result}
# Wire the single node into the graph: START -> processor.
graph.add_node("processor", process_request)
graph.add_edge(START, "processor")
# In-process checkpointer: all conversation state lives in RAM and is lost
# on every restart/redeploy — swap for a persistent saver in production.
storage = MemorySaver()
bot_app = graph.compile(checkpointer=storage)
@api_app.post("/message")
async def handle_message(request: Request):
    """Chat endpoint: stream the assistant's reply for one user message.

    Expects a JSON object body with ``text`` (the user's message) and
    ``session_id`` (used as the LangGraph thread id so each session keeps
    its own conversation history). Streams the reply as ``text/plain``.
    """
    # Reject malformed or non-object bodies with a 400 instead of letting
    # the JSON decode error surface as an unhandled 500.
    try:
        payload = await request.json()
    except ValueError:
        return JSONResponse(content={"error": "Invalid JSON body"}, status_code=400)
    if not isinstance(payload, dict):
        return JSONResponse(content={"error": "Invalid JSON body"}, status_code=400)

    text = payload.get("text", "")
    session_id = payload.get("session_id", "")
    if not text:
        return JSONResponse(content={"error": "Text required"}, status_code=400)
    if not session_id:
        return JSONResponse(content={"error": "Session ID required"}, status_code=400)

    async def generate_response():
        # thread_id keys the checkpointer, isolating each session's history.
        settings = {"configurable": {"thread_id": session_id}}
        input_data = {"messages": [HumanMessage(content=text)]}
        try:
            # stream_mode="messages" yields (message_chunk, metadata) pairs.
            async for chunk, meta in bot_app.astream(
                input_data, config=settings, stream_mode="messages"
            ):
                yield chunk.content
        except Exception as err:
            # Headers are already sent once streaming begins, so an HTTP
            # error status is no longer possible — report the error in-band.
            yield f"Error: {str(err)}"

    return StreamingResponse(generate_response(), media_type="text/plain")