pdf2txt Py
pdf2txt Py
exe
"""A command line tool for extracting text and images from PDF and
output it to plain text, html, xml or tags."""
import argparse
import logging
import sys
from typing import Any, Container, Iterable, List, Optional
import pdfminer.high_level
from pdfminer.layout import LAParams
from pdfminer.utils import AnyIO
from pdfminer.pdfexceptions import PDFValueError
logging.basicConfig()
def extract_text(
files: Iterable[str] = [],
outfile: str = "-",
laparams: Optional[LAParams] = None,
output_type: str = "text",
codec: str = "utf-8",
strip_control: bool = False,
maxpages: int = 0,
page_numbers: Optional[Container[int]] = None,
password: str = "",
scale: float = 1.0,
rotation: int = 0,
layoutmode: str = "normal",
output_dir: Optional[str] = None,
debug: bool = False,
disable_caching: bool = False,
**kwargs: Any,
) -> AnyIO:
if not files:
raise PDFValueError("Must provide files to work upon!")
if outfile == "-":
outfp: AnyIO = sys.stdout
if sys.stdout.encoding is not None:
codec = "utf-8"
else:
outfp = open(outfile, "wb")
parser.add_argument(
"--version",
"-v",
action="version",
version=f"pdfminer.six v{pdfminer.__version__}",
)
parser.add_argument(
"--debug",
"-d",
default=False,
action="store_true",
help="Use debug logging level.",
)
parser.add_argument(
"--disable-caching",
"-C",
default=False,
action="store_true",
help="If caching or resources, such as fonts, should be disabled.",
)
parse_params = parser.add_argument_group(
"Parser", description="Used during PDF parsing"
)
parse_params.add_argument(
"--page-numbers",
type=int,
default=None,
nargs="+",
help="A space-seperated list of page numbers to parse.",
)
parse_params.add_argument(
"--pagenos",
"-p",
type=str,
help="A comma-separated list of page numbers to parse. "
"Included for legacy applications, use --page-numbers "
"for more idiomatic argument entry.",
)
parse_params.add_argument(
"--maxpages",
"-m",
type=int,
default=0,
help="The maximum number of pages to parse.",
)
parse_params.add_argument(
"--password",
"-P",
type=str,
default="",
help="The password to use for decrypting PDF file.",
)
parse_params.add_argument(
"--rotation",
"-R",
default=0,
type=int,
help="The number of degrees to rotate the PDF "
"before other types of processing.",
)
output_params = parser.add_argument_group(
"Output", description="Used during output generation."
)
output_params.add_argument(
"--outfile",
"-o",
type=str,
default="-",
help="Path to file where output is written. "
'Or "-" (default) to write to stdout.',
)
output_params.add_argument(
"--output_type",
"-t",
type=str,
default="text",
help="Type of output to generate {text,html,xml,tag}.",
)
output_params.add_argument(
"--codec",
"-c",
type=str,
default="utf-8",
help="Text encoding to use in output file.",
)
output_params.add_argument(
"--output-dir",
"-O",
default=None,
help="The output directory to put extracted images in. If not given, "
"images are not extracted.",
)
output_params.add_argument(
"--layoutmode",
"-Y",
default="normal",
type=str,
help="Type of layout to use when generating html "
"{normal,exact,loose}. If normal,each line is"
" positioned separately in the html. If exact"
", each character is positioned separately in"
" the html. If loose, same result as normal "
"but with an additional newline after each "
"text line. Only used when output_type is html.",
)
output_params.add_argument(
"--scale",
"-s",
type=float,
default=1.0,
help="The amount of zoom to use when generating html file. "
"Only used when output_type is html.",
)
output_params.add_argument(
"--strip-control",
"-S",
default=False,
action="store_true",
help="Remove control statement from text. "
"Only used when output_type is xml.",
)
return parser
if parsed_args.pagenos:
parsed_args.page_numbers = {int(x) - 1 for x in
parsed_args.pagenos.split(",")}
return parsed_args
if __name__ == "__main__":
sys.exit(main())