I’m getting a frustrating error when trying to process multiple PDF files using Google Document AI batch processing. The error message is:
google.api_core.exceptions.InvalidArgument: 400 Failed to process all documents. 3: Failed to process all documents
My setup includes:
import os
import re
from typing import Optional

from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import InternalServerError, RetryError
from google.cloud import documentai
from google.cloud import storage
# Configuration
# NOTE(review): placeholder path -- point this at a real service-account key,
# or drop it and rely on Application Default Credentials.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "path/to/credentials.json"
PROJECT_ID = "my-project"  # GCP project that owns the processor
LOCATION = "us"  # processor region; also selects the regional API endpoint
PROCESSOR_ID = "my-processor-id"  # Document AI processor ID
# NOTE(review): despite the name, this URI points at a single PDF object, not
# a bucket or prefix; process_documents_batch treats it as one document.
INPUT_BUCKET = "gs://my-bucket/input/document.pdf"
OUTPUT_BUCKET = "gs://my-bucket/output/"  # GCS prefix for result JSON
MIME_TYPE = "application/pdf"  # MIME type of the single input document
FIELD_MASK = "text,entities,pages.pageNumber"  # fields kept in output Documents
# Pinned OCR processor version (usable via the version_id parameter).
PROCESSOR_VERSION = "pretrained-ocr-v2.1-2024-08-07"
def process_documents_batch(
    project_id: str,
    location: str,
    processor_id: str,
    input_uri: str,
    output_uri: str,
    version_id: Optional[str] = None,
    mime_type: Optional[str] = None,
    field_mask: Optional[str] = None,
    timeout: int = 400
):
    """Batch-process documents with Document AI and print the extracted text.

    Args:
        project_id: GCP project that owns the processor.
        location: Processor region, e.g. "us" or "eu".
        processor_id: Document AI processor ID.
        input_uri: Either a full "gs://bucket/path/file.pdf" for a single
            document, or a "gs://bucket/prefix/" to process every object
            under that prefix.
        output_uri: "gs://bucket/prefix/" where result JSON is written.
        version_id: Optional processor version; the processor's default
            version is used when omitted.
        mime_type: MIME type of the single input document (required by the
            API for the single-document form).
        field_mask: Comma-separated fields to keep in the output Document.
        timeout: Seconds to wait for the long-running operation.

    Raises:
        ValueError: If the operation finishes in a non-SUCCEEDED state.
        Note that errors other than RetryError/InternalServerError raised by
        the operation (e.g. InvalidArgument) propagate to the caller.
    """
    # A regional endpoint is mandatory for non-"us" processors.
    client_opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=client_opts)

    input_cfg = _build_input_config(input_uri, mime_type)
    output_cfg = _build_output_config(output_uri, field_mask)

    # A pinned version gives reproducible results; otherwise the processor's
    # default version runs.
    if version_id:
        processor_name = client.processor_version_path(
            project_id, location, processor_id, version_id
        )
    else:
        processor_name = client.processor_path(project_id, location, processor_id)

    batch_request = documentai.BatchProcessRequest(
        name=processor_name,
        input_documents=input_cfg,
        document_output_config=output_cfg,
    )
    operation = client.batch_process_documents(batch_request)

    try:
        print(f"Processing operation: {operation.operation.name}")
        operation.result(timeout=timeout)
    except (RetryError, InternalServerError) as error:
        print(f"Operation failed: {error.message}")
        return

    # The operation metadata carries both the overall state and the
    # per-document statuses that explain "Failed to process all documents".
    metadata = documentai.BatchProcessMetadata(operation.metadata)
    if metadata.state != documentai.BatchProcessMetadata.State.SUCCEEDED:
        raise ValueError(f"Processing failed: {metadata.state_message}")

    _download_results(metadata)


def _build_input_config(
    input_uri: str, mime_type: Optional[str]
) -> "documentai.BatchDocumentsInputConfig":
    """Build the input config: a single GCS object vs. everything under a prefix."""
    # Heuristic: a URI that does not end in "/" and contains a "." is treated
    # as one file. NOTE(review): a folder name containing "." would be
    # misclassified -- always pass a trailing "/" for prefixes.
    if not input_uri.endswith("/") and "." in input_uri:
        doc = documentai.GcsDocument(gcs_uri=input_uri, mime_type=mime_type)
        docs = documentai.GcsDocuments(documents=[doc])
        return documentai.BatchDocumentsInputConfig(gcs_documents=docs)
    prefix = documentai.GcsPrefix(gcs_uri_prefix=input_uri)
    return documentai.BatchDocumentsInputConfig(gcs_prefix=prefix)


def _build_output_config(
    output_uri: str, field_mask: Optional[str]
) -> "documentai.DocumentOutputConfig":
    """Build the GCS output config, optionally restricting the output fields."""
    gcs_out_cfg = documentai.DocumentOutputConfig.GcsOutputConfig(
        gcs_uri=output_uri, field_mask=field_mask
    )
    return documentai.DocumentOutputConfig(gcs_output_config=gcs_out_cfg)


def _download_results(metadata: "documentai.BatchProcessMetadata") -> None:
    """Download each result JSON from GCS and print the extracted text."""
    storage_client = storage.Client()
    for process in metadata.individual_process_statuses:
        # Surface per-document failures instead of silently skipping them --
        # process.status (a google.rpc.Status; code 0 means OK) pinpoints
        # which input failed and why.
        if process.status.code != 0:
            print(
                f"Document failed ({process.input_gcs_source}): "
                f"{process.status.message}"
            )
        match = re.match(r"gs://(.*?)/(.*)", process.output_gcs_destination)
        if not match:
            # Failed documents have no output destination to download.
            continue
        bucket_name, prefix = match.groups()
        for blob in storage_client.list_blobs(bucket_name, prefix=prefix):
            if blob.content_type == "application/json":
                print(f"Processing result: {blob.name}")
                doc = documentai.Document.from_json(
                    blob.download_as_bytes(), ignore_unknown_fields=True
                )
                print("Extracted text:")
                print(doc.text)
if __name__ == "__main__":
    # Pass the pinned processor version explicitly; without version_id the
    # PROCESSOR_VERSION constant is dead configuration and the processor's
    # default version runs instead.
    process_documents_batch(
        project_id=PROJECT_ID,
        location=LOCATION,
        processor_id=PROCESSOR_ID,
        input_uri=INPUT_BUCKET,
        output_uri=OUTPUT_BUCKET,
        version_id=PROCESSOR_VERSION,
        mime_type=MIME_TYPE,
        field_mask=FIELD_MASK,
    )
Has anyone encountered this issue before? I’ve checked my credentials and bucket permissions. What could be causing this batch processing failure?