I’m trying to extract structured data from a document question-answering chain but running into parsing issues. My setup works fine for getting basic answers, but I need the language model to return structured responses that can be processed by a PydanticOutputParser. The chain successfully returns formatted text, but when I try to parse it, I get a JSON parsing error. The response looks like this:
{'result_text': '\n1. Document section: Privacy Policy. \n2. Privacy Policy: No.'}
When I attempt to parse with output_parser.parse(response['result_text']), I get JSONDecodeError: Expecting value: line 1 column 1 (char 0). Here’s my current implementation:
from langchain.chains.question_answering import load_qa_chain
from langchain import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import AzureOpenAI
from langchain.document_loaders.csv_loader import CSVLoader
import os
def setup_vector_store(document_index):
embeddings = OpenAIEmbeddings(
openai_api_key=os.getenv("OPENAI_KEY"),
deployment=os.getenv('EMB_DEPLOYMENT'),
model=os.getenv('EMB_MODEL'),
chunk_size=1
)
data_loader = CSVLoader(
file_path='documents/agreements.csv',
source_column="Content"
)
documents = data_loader.load()
selected_doc = [documents[document_index].page_content]
vector_db = Chroma.from_documents(selected_doc, embeddings)
return vector_db
class DocumentAnalysis(BaseModel):
searched_clause: str = Field(description="The clause being searched in the document")
clause_exists: str = Field(description="Whether the clause exists in the document - Yes or No")
vectorstore = setup_vector_store(0)
search_term = "Confidentiality Agreement"
model = AzureOpenAI(
deployment_name=os.getenv('LLM_DEPLOYMENT'),
model_name=os.getenv('LLM_MODEL'),
temperature=0
)
search_query = f"Does the document contain {search_term}?"
relevant_docs = vectorstore.similarity_search(search_query)
prompt_template = """
You are analyzing legal documents. Based on the content below, answer this question:
Question: Does {query} appear in this document?
{content}
Provide your response in this format:
1. Searched clause: {query}
2. {query}: Yes or No
{format_instructions}
"""
output_parser = PydanticOutputParser(pydantic_object=DocumentAnalysis)
qa_prompt = PromptTemplate(
template=prompt_template,
input_variables=["content", "query"],
partial_variables={
"format_instructions": output_parser.get_format_instructions()
}
)
qa_chain = load_qa_chain(
model,
chain_type="stuff",
prompt=qa_prompt,
document_variable_name="content"
)
response = qa_chain(
{"input_documents": relevant_docs, "query": search_term},
return_only_outputs=True
)
How can I modify this setup to make the output parser work correctly with the QA chain results?