I’m building a document search system using LangChain and OpenAI that lets people ask questions about PDF files. The problem I’m facing is with tables inside these PDFs. When someone asks about table information, the AI only shows the first few rows instead of the complete data. For example, if there’s a table with 100+ rows and I ask for all the data, I only get around 20 rows back.
I need help figuring out how to make the system return complete table information from PDF documents. Here’s my current implementation:
class DocumentSearcher():
    """Answer questions over a folder of text files with a LangChain retrieval chain.

    Every regular file in a folder is loaded with ``TextLoader``, split into
    overlapping chunks, embedded with ``OpenAIEmbeddings`` into a Chroma
    vector store and queried through a ``ConversationalRetrievalChain``.

    The retriever is asked for only ``retriever_k`` chunks per question, so
    text that falls outside those chunks (for example the later rows of a
    long table) is not part of the context the model answers from. Raise
    ``retriever_k`` and/or ``chunk_size`` when complete tables are needed.

    Args:
        chunk_size: Maximum chunk length in characters (measured with ``len``).
        chunk_overlap: Number of characters shared by neighbouring chunks.
        retriever_k: Number of chunks requested from the vector store per
            question.
    """

    def __init__(self, chunk_size: int = 6200, chunk_overlap: int = 400,
                 retriever_k: int = 3):
        self.api_key = os.getenv('OPENAI_API_KEY')
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.retriever_k = retriever_k
        # Maps a folder path to ``(signature, chain)`` so repeated questions
        # reuse the index instead of re-embedding every file.
        self._chain_cache = {}

    def fetch_directory_files(self, directory_path: str) -> list:
        """Return the paths of the regular files directly inside *directory_path*.

        Sub-directories are skipped. Entries are sorted by file name because
        ``os.listdir`` returns them in arbitrary order.
        """
        candidates = (
            os.path.join(directory_path, filename)
            for filename in sorted(os.listdir(directory_path))
        )
        return [path for path in candidates if os.path.isfile(path)]

    def extract_document_content(self, document_path: str) -> list:
        """Load *document_path* as UTF-8 text and return the loader's documents."""
        file_loader = TextLoader(document_path, encoding='utf8')
        return file_loader.load()

    def build_search_chain(self, processed_folder: str):
        """Index every file in *processed_folder* and return a fresh retrieval chain.

        Raises:
            ValueError: If the folder contains no regular files.
        """
        document_files = self.fetch_directory_files(processed_folder)
        if not document_files:
            # Fail early with a clear message rather than inside the vector store.
            raise ValueError(
                f"No files found in {processed_folder!r}; nothing to index."
            )

        loaded_documents = []
        for doc_file in document_files:
            # Keep every document the loader returns, not only the first one.
            loaded_documents.extend(self.extract_document_content(doc_file))

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
        )
        # split_documents (unlike create_documents on bare strings) carries
        # each document's metadata over to its chunks, so the source
        # documents returned by the chain can be traced back to a file.
        text_chunks = splitter.split_documents(loaded_documents)

        vector_store = Chroma.from_documents(text_chunks, OpenAIEmbeddings())
        return ConversationalRetrievalChain.from_llm(
            ChatOpenAI(temperature=0.3),
            retriever=vector_store.as_retriever(
                search_kwargs={"k": self.retriever_k}
            ),
            return_source_documents=True,
        )

    def process_user_query(self, user_question, conversation_history, processed_folder):
        """Answer *user_question* using the documents in *processed_folder*.

        Args:
            user_question: The question text passed to the chain.
            conversation_history: Prior turns, forwarded as ``chat_history``.
            processed_folder: Folder whose files are searched.

        Returns:
            The chain's output mapping with ``'question'`` and
            ``'chat_history'`` added, set to the values passed in.

        Raises:
            ValueError: If *processed_folder* contains no regular files.
        """
        chain = self._get_search_chain(processed_folder)
        response = chain(
            {"question": user_question, "chat_history": conversation_history},
            return_only_outputs=True,
        )
        response['question'] = user_question
        response['chat_history'] = conversation_history
        return response

    def _folder_signature(self, processed_folder: str) -> tuple:
        """Return a hashable snapshot (path, mtime, size) of the folder's files."""
        snapshot = []
        for path in self.fetch_directory_files(processed_folder):
            info = os.stat(path)
            snapshot.append((path, info.st_mtime_ns, info.st_size))
        return tuple(snapshot)

    def _get_search_chain(self, processed_folder: str):
        """Return the chain for *processed_folder*, rebuilding only if its files changed."""
        signature = self._folder_signature(processed_folder)
        cached = self._chain_cache.get(processed_folder)
        if cached is not None and cached[0] == signature:
            return cached[1]
        chain = self.build_search_chain(processed_folder)
        self._chain_cache[processed_folder] = (signature, chain)
        return chain
I’ve tried adjusting different settings, but the responses haven’t improved. What approach would work better for getting complete table data from PDFs?