Skip to content

Commit b9d1498

Browse files
committed
query example using pdf as input
1 parent d157f34 commit b9d1498

File tree

4 files changed

+85
-1
lines changed

4 files changed

+85
-1
lines changed
924 KB
Binary file not shown.
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
"""Example: answer a question about a PDF using a PostgresML collection.

Each page of ``lincoln.pdf`` is upserted as one document, then chunked and
embedded.  The best-matching chunk for the query is passed as the context to
a ``pgml.transform('question-answering', ...)`` call, and the answer is
printed together with the elapsed query time.
"""
import json
import os
from time import time

from datasets import load_dataset
from dotenv import load_dotenv
from pypdf import PdfReader
from rich.console import Console

from pgml import Database
from pgml.dbutils import run_select_statement

load_dotenv()
console = Console()

local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development"

# PGML_CONNECTION (from the environment or the .env file loaded above)
# overrides the local development default.
conninfo = os.environ.get("PGML_CONNECTION", local_pgml)
db = Database(conninfo)

collection_name = "pdf_collection"
collection = db.create_or_get_collection(collection_name)

# One document per page; the page index and source file travel with the text.
filename = "lincoln.pdf"
reader = PdfReader(filename)
documents = [
    {"text": page.extract_text(), "page": page_number, "source": filename}
    for page_number, page in enumerate(reader.pages)
]

collection.upsert_documents(documents)
collection.generate_chunks()
collection.generate_embeddings()

start = time()
query = "When was Lincoln born?"
results = collection.vector_search(query, top_k=1)
if not results:
    # Fail with a readable message instead of an IndexError on results[0].
    raise SystemExit("No chunks matched query '%s'" % query)

# Collapse every run of whitespace so the context is a single line.
context = " ".join(results[0]["chunk"].split())

# Let json.dumps do the JSON escaping (quotes, backslashes, control
# characters) for BOTH the question and the context, then escape the result
# for a PostgreSQL E'...' literal: backslashes and single quotes are doubled.
# NOTE(review): run_select_statement is only called here with a finished SQL
# string; if it can take bind parameters, prefer those — confirm against
# pgml.dbutils.
payload = json.dumps({"question": query, "context": context})
payload_literal = payload.replace("\\", "\\\\").replace("'", "''")

select_statement = """SELECT pgml.transform(
    'question-answering',
    inputs => ARRAY[
        E'%s'
    ]
) AS answer;""" % payload_literal

conn = db.pool.getconn()
try:
    results = run_select_statement(conn, select_statement)
finally:
    # Always hand the connection back to the pool, even if the query fails.
    db.pool.putconn(conn)
# Stop the clock before archiving so "Query time" covers only search + answer.
_end = time()

console.print("\nResults for query '%s'" % query)
console.print(results)
db.archive_collection(collection_name)
console.print("Query time = %0.3f" % (_end - start))

pgml-sdks/python/pgml/poetry.lock

Lines changed: 23 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pgml-sdks/python/pgml/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ ipywidgets = "^8.0.6"
2121
datasets = "^2.12.0"
2222
python-dotenv = "^1.0.0"
2323
pypika = "^0.48.9"
24+
pypdf = "^3.9.1"
2425

2526

2627
[tool.poetry.group.dev.dependencies]

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy