Generate searchable PDFs with Azure Form Recognizer
Generate searchable PDFs with Azure Form Recognizer
PDF documents are widely used in business processes. Digitally created PDFs are very convenient to use.
Text can be searched, highlighted, and annotated. Unfortunately, a lot of PDFs are created by scanning or
converting images to PDFs. There is no digital text in these PDFs, so they cannot be searched. In this blog
post, we demonstrate how to convert such PDFs into searchable PDFs with simple, easy-to-use code
and Azure Form Recognizer. The code generates a searchable PDF file that you can store
anywhere, search within, and copy and paste text from. Blog content:
If a PDF is image-based (example), its text cannot be searched or selected. Image compression artifacts are
typically visible around the text when zooming in:
Prerequisite installation
Please install the following packages before running the searchable PDF script:
1. Python packages:
fr_generate_searchable_pdf.py
# Script to create searchable PDF from scan PDF or images using Azure Form Recognizer
# Required packages
# pip install --upgrade "azure-ai-formrecognizer>=3.3" "pypdf>=3.0" reportlab pillow pdf2image
6 import sys
7 import io
8 import math
9 import argparse
10 from pdf2image import convert_from_path
11 from reportlab.pdfgen import canvas
12 from reportlab.lib import pagesizes
13 from reportlab import rl_config
14 from PIL import Image, ImageSequence
15 from pypdf import PdfWriter, PdfReader
16 from azure.core.credentials import AzureKeyCredential
17 from azure.ai.formrecognizer import DocumentAnalysisClient
18
# Please provide your Azure Form Recognizer endpoint and key.
# The placeholders are string literals so the module loads without a
# NameError; replace both with your real values before running the script.
endpoint = "YOUR_FORM_RECOGNIZER_ENDPOINT"
key = "YOUR_FORM_RECOGNIZER_KEY"
22
def dist(p1, p2):
    """Return the Euclidean distance between two points.

    Args:
        p1: Object exposing numeric ``x`` and ``y`` attributes.
        p2: Object exposing numeric ``x`` and ``y`` attributes.

    Returns:
        The straight-line distance between ``p1`` and ``p2`` as a float.
    """
    # math.hypot avoids the intermediate overflow/underflow that squaring
    # the coordinate differences by hand can produce.
    return math.hypot(p1.x - p2.x, p1.y - p2.y)
26
if __name__ == '__main__':
    # Command-line entry point: OCR the input with Azure Form Recognizer and
    # build a PDF whose pages show the original image with an invisible,
    # selectable text layer positioned over each recognized word.
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file', type=str, help="Input PDF or image (jpg, jpeg, tif, tiff, bmp, png) file name")
    parser.add_argument('-o', '--output', type=str, required=False, default="", help="Output PDF file name. Default: input_file + .ocr.pdf")
    args = parser.parse_args()

    input_file = args.input_file
    if args.output:
        output_file = args.output
    else:
        output_file = input_file + ".ocr.pdf"

    # Loading input file
    print(f"Loading input file {input_file}")
    if input_file.lower().endswith('.pdf'):
        # read existing PDF as images
        image_pages = convert_from_path(input_file)
    elif input_file.lower().endswith(('.tif', '.tiff', '.jpg', '.jpeg', '.png', '.bmp')):
        # read input image (potential multi page Tiff)
        image_pages = ImageSequence.Iterator(Image.open(input_file))
    else:
        sys.exit(f"Error: Unsupported input file extension {input_file}. Supported extensions: PDF, TIF, TIFF, JPG, JPEG, PNG, BMP.")

    # Running OCR using Azure Form Recognizer Read API
    print("Starting Azure Form Recognizer OCR process...")
    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(key),
        headers={"x-ms-useragent": "searchable-pdf-blog/1.0.0"},
    )

    with open(input_file, "rb") as f:
        poller = document_analysis_client.begin_analyze_document("prebuilt-read", document=f)

    ocr_results = poller.result()
    print(f"Azure Form Recognizer finished OCR text for {len(ocr_results.pages)} pages.")

    # Generate OCR overlay layer
    print("Generating searchable PDF...")
    output = PdfWriter()
    default_font = "Times-Roman"
    for page_id, page in enumerate(ocr_results.pages):
        ocr_overlay = io.BytesIO()
        page_image = image_pages[page_id]

        # Calculate overlay PDF page size: the longer image side is mapped
        # onto the long side of a US Letter page, keeping the aspect ratio.
        page_scale = float(max(page_image.height, page_image.width)) / pagesizes.letter[1]
        page_width = float(page_image.width) / page_scale
        page_height = float(page_image.height) / page_scale

        # Average of the horizontal and vertical ratios between the PDF page
        # size and the page size reported by the OCR result.
        scale = (page_width / page.width + page_height / page.height) / 2.0
        pdf_canvas = canvas.Canvas(ocr_overlay, pagesize=(page_width, page_height))

        # Add image into PDF page
        pdf_canvas.drawInlineImage(page_image, 0, 0, width=page_width, height=page_height, preserveAspectRatio=True)

        text = pdf_canvas.beginText()
        # Set text rendering mode to invisible
        text.setTextRenderMode(3)
        for word in page.words:
            top_left, top_right, bottom_right, bottom_left = word.polygon[:4]

            # Calculate optimal font size
            desired_text_width = max(dist(top_left, top_right), dist(bottom_left, bottom_right)) * scale
            desired_text_height = max(dist(top_right, bottom_right), dist(top_left, bottom_left)) * scale
            font_size = desired_text_height
            actual_text_width = pdf_canvas.stringWidth(word.content, default_font, font_size)

            # Calculate text rotation angle
            text_angle = math.atan2(
                (top_right.y - top_left.y + bottom_right.y - bottom_left.y) / 2.0,
                (top_right.x - top_left.x + bottom_right.x - bottom_left.x) / 2.0,
            )
            cos_angle = math.cos(text_angle)
            sin_angle = math.sin(text_angle)

            text.setFont(default_font, font_size)
            text.setTextTransform(
                cos_angle, -sin_angle, sin_angle, cos_angle,
                bottom_left.x * scale, page_height - bottom_left.y * scale,
            )
            # Stretch the glyphs to the OCR word width. An empty word or a
            # degenerate polygon yields zero width, so fall back to 100%
            # instead of dividing by zero.
            if actual_text_width > 0:
                text.setHorizScale(desired_text_width / actual_text_width * 100)
            else:
                text.setHorizScale(100)
            text.textOut(word.content + " ")

        pdf_canvas.drawText(text)
        pdf_canvas.save()

        # NOTE(review): the published listing is cut off after the save()
        # call above. The steps below are reconstructed from the otherwise
        # unused `output`, `output_file` and `PdfReader` names - confirm
        # against the original script.
        # Rewind the in-memory overlay and append its page to the output.
        ocr_overlay.seek(0)
        overlay_pdf = PdfReader(ocr_overlay)
        output.add_page(overlay_pdf.pages[0])

    # Save output searchable PDF file
    with open(output_file, "wb") as output_stream:
        output.write(output_stream)
    print(f"Searchable PDF is created: {output_file}")
Comment
anatolip MICROSOFT
View Profile
Share
What's new
Surface Pro 9
Surface Laptop 5
Surface Studio 2+
Surface Laptop Go 2
Surface Duo 2
Microsoft 365
Windows 11 apps
Microsoft Store
Account profile
Download Center
Returns
Order tracking
Education
Microsoft in education
Business
Microsoft Cloud
Microsoft Security
Dynamics 365
Microsoft 365
Microsoft Teams
Microsoft Industry
Small Business
Developer & IT
Azure
Developer Center
Documentation
Microsoft Learn
Azure Marketplace
AppSource
Visual Studio
Company
Careers
About Microsoft
Company news
Privacy at Microsoft
Investors
Accessibility
Sustainability
Sitemap Contact Microsoft Privacy Manage cookies Terms of use Trademarks Safety & eco About our ads © Microsoft 2024