
Updated Code

This document presents a FastAPI application that extracts text from image and PDF files with OCR and structures the result using Google's Gemini API. It includes logging, in-memory caching, and error handling for unreadable or unsupported files. The /ocr endpoint accepts multiple files per request and returns structured JSON output while logging memory usage and processing times.
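
For context, a client might call the /ocr endpoint defined below like this (a minimal sketch, assuming the app is served locally, e.g. with uvicorn main:app --port 8000, that the requests package is installed, and where the module name and invoice file names are placeholders):

import requests

# Each ("files", ...) tuple becomes one entry in the endpoint's List[UploadFile].
files = [
    ("files", ("invoice1.pdf", open("invoice1.pdf", "rb"), "application/pdf")),
    ("files", ("invoice2.png", open("invoice2.png", "rb"), "image/png")),
]
response = requests.post("http://localhost:8000/ocr", files=files)
print(response.json()["message"])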


from fastapi import FastAPI, File, UploadFile, HTTPException

import pytesseract
import cv2
import os
from PIL import Image
import json
import unicodedata
from pdf2image import convert_from_bytes
from pypdf import PdfReader
import numpy as np
from typing import List
import io
import logging
import time
import asyncio
import psutil
import cachetools
import hashlib
import google.generativeai as genai
from dotenv import load_dotenv

app = FastAPI()

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Configure Gemini API
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    logger.error("GOOGLE_API_KEY not set")
    raise HTTPException(status_code=500, detail="GOOGLE_API_KEY not set")
genai.configure(api_key=api_key)
model = genai.GenerativeModel("gemini-2.0-flash")

# Set Tesseract path
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# In-memory caches (1-hour TTL)
raw_text_cache = cachetools.TTLCache(maxsize=100, ttl=3600)
structured_data_cache = cachetools.TTLCache(maxsize=100, ttl=3600)

def log_memory_usage():
    """Return a string describing current memory usage."""
    process = psutil.Process()
    mem_info = process.memory_info()
    return f"Memory usage: {mem_info.rss / 1024 / 1024:.2f} MB"

def get_file_hash(file_bytes):
    """Generate MD5 hash of file content."""
    return hashlib.md5(file_bytes).hexdigest()

def get_text_hash(raw_text):
    """Generate MD5 hash of raw text."""
    return hashlib.md5(raw_text.encode('utf-8')).hexdigest()
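
# Note: MD5 serves only as a fast, stable cache key here, not as a security
# measure; any digest (e.g., hashlib.sha256) would work equally well.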

def get_poppler_path():
    """Determine the correct poppler path based on the system."""
    import platform
    import shutil

    # Check if poppler utilities are in PATH
    if shutil.which('pdftoppm'):
        return None  # Use system PATH

    # Common poppler paths for different systems
    common_paths = [
        "/usr/bin",  # Linux
        "/usr/local/bin",  # macOS with Homebrew
        "/opt/homebrew/bin",  # macOS with Apple Silicon Homebrew
        "/usr/share/poppler/bin",  # Some Linux distributions
        "C:\\poppler\\Library\\bin",  # Windows
        "C:\\Program Files\\poppler\\bin",  # Windows alternative
    ]

    for path in common_paths:
        if (os.path.exists(os.path.join(path, "pdftoppm"))
                or os.path.exists(os.path.join(path, "pdftoppm.exe"))):
            return path

    return None

async def process_image(img_bytes, filename, idx):
    """Process a single image (JPG/JPEG/PNG) with OCR."""
    start_time = time.time()
    logger.info(f"Starting OCR for {filename} image {idx}, {log_memory_usage()}")
    try:
        # Normalize the mode (e.g., RGBA or grayscale PNGs) so the RGB2BGR
        # conversion below always receives a 3-channel array.
        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
        img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
        custom_config = r'--oem 1 --psm 6 -l eng+ara'  # Reduced for performance
        page_text = pytesseract.image_to_string(img_pil, config=custom_config)
        logger.info(f"Completed OCR for {filename} image {idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
        return page_text + "\n"
    except Exception as e:
        logger.error(f"OCR failed for {filename} image {idx}: {str(e)}, {log_memory_usage()}")
        return ""

async def process_pdf_page(img, page_idx):
    """Process a single PDF page with OCR."""
    start_time = time.time()
    logger.info(f"Starting OCR for PDF page {page_idx}, {log_memory_usage()}")
    try:
        img_cv = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
        img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
        custom_config = r'--oem 1 --psm 6 -l eng+ara'  # Reduced for performance
        page_text = pytesseract.image_to_string(img_pil, config=custom_config)
        logger.info(f"Completed OCR for PDF page {page_idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
        return page_text + "\n"
    except Exception as e:
        logger.error(f"OCR failed for PDF page {page_idx}: {str(e)}, {log_memory_usage()}")
        return ""

async def process_with_gemini(filename: str, raw_text: str):
    """Process raw text with Gemini to extract structured data."""
    start_time = time.time()
    logger.info(f"Starting Gemini processing for {filename}, {log_memory_usage()}")

    # Check structured data cache
    text_hash = get_text_hash(raw_text)
    if text_hash in structured_data_cache:
        logger.info(f"Structured data cache hit for {filename}, {log_memory_usage()}")
        return structured_data_cache[text_hash]

    # Truncate text for Gemini
    if len(raw_text) > 10000:
        raw_text = raw_text[:10000]
        logger.info(f"Truncated raw text for {filename} to 10000 characters, {log_memory_usage()}")

    try:
        prompt = f"""
You are an intelligent invoice data extractor. Given raw text from an
invoice in any language, extract the key business fields in the specified JSON
format. Support English. Handle synonyms (e.g., 'total' = 'net', 'tax' =
'GST'/'TDS'). The 'Products' field is dynamic and may contain multiple items, each
with 'qty', 'description', 'unit_price', and 'amount'. Detect the currency (e.g.,
USD, INR, EUR) from symbols ($, ₹, €) or text; default to USD if unclear. If a
field is missing, include it with an empty string ("") or an appropriate default
(e.g., 0 for numbers).

Raw text:
{raw_text}

Output JSON:
{{
    "invoice": {{
        "invoice_number": "",
        "invoice_date": "YYYY-MM-DD",
        "due_date": "YYYY-MM-DD",
        "purchase_order_number": "",
        "vendor": {{
            "vendor_id": "",
            "name": "",
            "address": {{
                "line1": "",
                "line2": "",
                "city": "",
                "state": "",
                "postal_code": "",
                "country": ""
            }},
            "contact": {{
                "email": "",
                "phone": ""
            }},
            "tax_id": ""
        }},
        "buyer": {{
            "buyer_id": "",
            "name": "",
            "address": {{
                "line1": "",
                "line2": "",
                "city": "",
                "state": "",
                "postal_code": "",
                "country": ""
            }},
            "contact": {{
                "email": "",
                "phone": ""
            }},
            "tax_id": ""
        }},
        "items": [
            {{
                "item_id": "",
                "description": "",
                "quantity": 0,
                "unit_of_measure": "",
                "unit_price": 0,
                "total_price": 0,
                "tax_rate": 0,
                "tax_amount": 0,
                "discount": 0,
                "net_amount": 0
            }}
        ],
        "sub_total": 0,
        "tax_total": 0,
        "discount_total": 0,
        "total_amount": 0,
        "currency": ""
    }}
}}
"""
        response = model.generate_content(prompt)
        llm_output = response.text
        json_start = llm_output.find("{")
        json_end = llm_output.rfind("}") + 1
        json_str = llm_output[json_start:json_end]
        structured_data = json.loads(json_str)
        structured_data_cache[text_hash] = structured_data
        logger.info(f"Gemini processing for {filename}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
        return structured_data
    except Exception as e:
        logger.error(f"Gemini processing failed for {filename}: {str(e)}, {log_memory_usage()}")
        return {"error": f"Gemini processing failed: {str(e)}"}

@app.post("/ocr")
async def extract_and_structure(files: List[UploadFile] = File(...)):
output_json = {
"success": True,
"message": "",
"data": []
}
success_count = 0
fail_count = 0

logger.info(f"Starting processing for {len(files)} files,


{log_memory_usage()}")

for file in files:


total_start_time = time.time()
logger.info(f"Processing file: {file.filename}, {log_memory_usage()}")

# Validate file format


valid_extensions = {'.pdf', '.jpg', '.jpeg', '.png'}
file_ext = os.path.splitext(file.filename.lower())[1]
if file_ext not in valid_extensions:
fail_count += 1
output_json["data"].append({
"filename": file.filename,
"structured_data": {"error": f"Unsupported file format:
{file_ext}"},
"error": f"Unsupported file format: {file_ext}"
})
logger.error(f"Unsupported file format for {file.filename}:
{file_ext}")
continue

# Read file into memory


try:
file_start_time = time.time()
file_bytes = await file.read()
file_stream = io.BytesIO(file_bytes)
file_hash = get_file_hash(file_bytes)
logger.info(f"Read file {file.filename}, took {time.time() -
file_start_time:.2f} seconds, size: {len(file_bytes)/1024:.2f} KB,
{log_memory_usage()}")
except Exception as e:
fail_count += 1
output_json["data"].append({
"filename": file.filename,
"structured_data": {"error": f"Failed to read file: {str(e)}"},
"error": f"Failed to read file: {str(e)}"
})
logger.error(f"Failed to read file {file.filename}: {str(e)},
{log_memory_usage()}")
continue

        # Check raw text cache
        raw_text = ""
        if file_hash in raw_text_cache:
            raw_text = raw_text_cache[file_hash]
            logger.info(f"Raw text cache hit for {file.filename}, {log_memory_usage()}")
        else:
            if file_ext == '.pdf':
                # Try extracting embedded text
                try:
                    extract_start_time = time.time()
                    reader = PdfReader(file_stream)
                    for page in reader.pages:
                        text = page.extract_text()
                        if text:
                            raw_text += text + "\n"
                    logger.info(f"Embedded text extraction for {file.filename}, took {time.time() - extract_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
                except Exception as e:
                    logger.warning(f"Embedded text extraction failed for {file.filename}: {str(e)}, {log_memory_usage()}")

                # If no embedded text, perform OCR
                if not raw_text.strip():
                    try:
                        convert_start_time = time.time()

                        # Get the correct poppler path
                        poppler_path = get_poppler_path()

                        # Convert PDF to images with proper poppler path handling
                        if poppler_path:
                            images = convert_from_bytes(file_bytes, poppler_path=poppler_path, dpi=100)
                            logger.info(f"Using poppler path: {poppler_path}")
                        else:
                            # Try without specifying poppler_path (use system PATH)
                            images = convert_from_bytes(file_bytes, dpi=100)
                            logger.info("Using poppler from system PATH")

                        logger.info(f"PDF to images conversion for {file.filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")

                        ocr_start_time = time.time()
                        page_texts = []
                        for i, img in enumerate(images):
                            page_text = await process_pdf_page(img, i)
                            page_texts.append(page_text)
                        raw_text = "".join(page_texts)
                        logger.info(f"Total OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
                    except Exception as e:
                        fail_count += 1
                        error_msg = f"OCR failed: {str(e)}"
                        if "poppler" in str(e).lower():
                            error_msg += ". Please ensure Poppler is installed and accessible in PATH."
                        output_json["data"].append({
                            "filename": file.filename,
                            "structured_data": {"error": error_msg},
                            "error": error_msg
                        })
                        logger.error(f"OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
                        continue
            else:  # JPG/JPEG/PNG
                try:
                    ocr_start_time = time.time()
                    raw_text = await process_image(file_bytes, file.filename, 0)
                    logger.info(f"Image OCR for {file.filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
                except Exception as e:
                    fail_count += 1
                    output_json["data"].append({
                        "filename": file.filename,
                        "structured_data": {"error": f"Image OCR failed: {str(e)}"},
                        "error": f"Image OCR failed: {str(e)}"
                    })
                    logger.error(f"Image OCR failed for {file.filename}: {str(e)}, {log_memory_usage()}")
                    continue

            # Normalize text
            try:
                normalize_start_time = time.time()
                raw_text = unicodedata.normalize('NFKC', raw_text)
                raw_text = raw_text.encode().decode('utf-8')
                raw_text_cache[file_hash] = raw_text
                logger.info(f"Text normalization for {file.filename}, took {time.time() - normalize_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
            except Exception as e:
                logger.warning(f"Text normalization failed for {file.filename}: {str(e)}, {log_memory_usage()}")

        # Process with Gemini
        structured_data = await process_with_gemini(file.filename, raw_text)
        success_count += 1
        output_json["data"].append({
            "filename": file.filename,
            "structured_data": structured_data,
            "error": ""
        })

        logger.info(f"Total processing for {file.filename}, took {time.time() - total_start_time:.2f} seconds, {log_memory_usage()}")

    output_json["message"] = f"Processed {len(files)} files. {success_count} succeeded, {fail_count} failed."
    if fail_count > 0 and success_count == 0:
        output_json["success"] = False

    logger.info(f"Completed processing for {len(files)} files, {success_count} succeeded, {fail_count} failed, {log_memory_usage()}")
    return output_json
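
For reference, a response follows the envelope assembled in extract_and_structure(); a hypothetical single-file request might return something shaped like this (the filename is a placeholder, and the elided structured_data object follows the JSON template in the Gemini prompt above):

{
    "success": true,
    "message": "Processed 1 files. 1 succeeded, 0 failed.",
    "data": [
        {
            "filename": "invoice1.pdf",
            "structured_data": { "invoice": { ... } },
            "error": ""
        }
    ]
}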
