diff --git a/pgml-sdks/python/pgml/README.md b/pgml-sdks/python/pgml/README.md index 33487fdb1..6368675a1 100644 --- a/pgml-sdks/python/pgml/README.md +++ b/pgml-sdks/python/pgml/README.md @@ -1,10 +1,14 @@ -# Table of Contents +# Open Source Alternative for Building End-to-End Vector Search Applications without OpenAI & Pinecone + +## Table of Contents - [Overview](#overview) - [Quickstart](#quickstart) - [Usage](#usage) +- [Examples](./examples/README.md) - [Developer setup](#developer-setup) - [API Reference](#api-reference) +- [Roadmap](#roadmap) ## Overview Python SDK is designed to facilitate the development of scalable vector search applications on PostgreSQL databases. With this SDK, you can seamlessly manage various database tables related to documents, text chunks, text splitters, LLM (Language Model) models, and embeddings. By leveraging the SDK's capabilities, you can efficiently index LLM embeddings using PgVector for fast and accurate queries. @@ -274,4 +278,13 @@ LOGLEVEL=INFO python -m unittest tests/test_collection.py ### API Reference - [Database](./docs/pgml/database.md) -- [Collection](./docs/pgml/collection.md) \ No newline at end of file +- [Collection](./docs/pgml/collection.md) + +### Roadmap + +- Enable filters on document metadata in `vector_search`. [Issue](https://github.com/postgresml/postgresml/issues/663) +- `text_search` functionality on documents using Postgres text search. [Issue](https://github.com/postgresml/postgresml/issues/664) +- `hybrid_search` functionality that does a combination of `vector_search` and `text_search` in an order specified by the user. [Issue](https://github.com/postgresml/postgresml/issues/665) +- Ability to call and manage OpenAI embeddings for comparison purposes. [Issue](https://github.com/postgresml/postgresml/issues/666) +- Save `vector_search` history for downstream monitoring of model performance. 
[Issue](https://github.com/postgresml/postgresml/issues/667) +- Perform chunking on the DB with multiple langchain splitters. [Issue](https://github.com/postgresml/postgresml/issues/668) \ No newline at end of file diff --git a/pgml-sdks/python/pgml/examples/README.md b/pgml-sdks/python/pgml/examples/README.md new file mode 100644 index 000000000..a77848eff --- /dev/null +++ b/pgml-sdks/python/pgml/examples/README.md @@ -0,0 +1,19 @@ +## Examples + +### [Semantic Search](./semantic_search.py) +This is a basic example to perform semantic search on a collection of documents. It loads the Quora dataset, creates a collection in a PostgreSQL database, upserts documents, generates chunks and embeddings, and then performs a vector search on a query. Embeddings are created using `intfloat/e5-small` model. The results are are semantically similar documemts to the query. Finally, the collection is archived. + +### [Question Answering](./question_answering.py) +This is an example to find documents relevant to a question from the collection of documents. It loads the Stanford Question Answering Dataset (SQuAD) into the database, generates chunks and embeddings. Query is passed to vector search to retrieve documents that match closely in the embeddings space. A score is returned with each of the search result. + +### [Question Answering using Instructore Model](./question_answering_instructor.py) +In this example, we will use `hknlp/instructor-base` model to build text embeddings instead of the default `intfloat/e5-small` model. We will show how to use `register_model` method and use the `model_id` to build and query embeddings. + +### [Extractive Question Answering](./extractive_question_answering.py) +In this example, we will show how to use `vector_search` result as a `context` to a HuggingFace question answering model. We will use `pgml.transform` to run the model on the database. 
+ +### [Table Question Answering](./table_question_answering.py) +In this example, we will use [Open Table-and-Text Question Answering (OTT-QA) +](https://github.com/wenhuchen/OTT-QA) dataset to run queries on tables. We will use `deepset/all-mpnet-base-v2-table` model that is trained for embedding tabular data for retrieval tasks. + + diff --git a/pgml-sdks/python/pgml/examples/extractive_question_answering.py b/pgml-sdks/python/pgml/examples/extractive_question_answering.py new file mode 100644 index 000000000..02cac7071 --- /dev/null +++ b/pgml-sdks/python/pgml/examples/extractive_question_answering.py @@ -0,0 +1,69 @@ +from pgml import Database +import os +import json +from datasets import load_dataset +from time import time +from dotenv import load_dotenv +from rich.console import Console +from psycopg import sql +from pgml.dbutils import run_select_statement + +load_dotenv() +console = Console() + +local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development" + +conninfo = os.environ.get("PGML_CONNECTION", local_pgml) +db = Database(conninfo) + +collection_name = "squad_collection" +collection = db.create_or_get_collection(collection_name) + + +data = load_dataset("squad", split="train") +data = data.to_pandas() +data = data.drop_duplicates(subset=["context"]) + +documents = [ + {"id": r["id"], "text": r["context"], "title": r["title"]} + for r in data.to_dict(orient="records") +] + +collection.upsert_documents(documents[:200]) +collection.generate_chunks() +collection.generate_embeddings() + +start = time() +query = "Who won more than 20 grammy awards?" 
+results = collection.vector_search(query, top_k=5) +_end = time() +console.print("\nResults for '%s'" % (query), style="bold") +console.print(results) +console.print("Query time = %0.3f" % (_end - start)) + +# Get the context passage and use pgml.transform to get short answer to the question + + +conn = db.pool.getconn() +context = " ".join(results[0]["chunk"].strip().split()) +context = context.replace('"', '\\"').replace("'", "''") + +select_statement = """SELECT pgml.transform( + 'question-answering', + inputs => ARRAY[ + '{ + \"question\": \"%s\", + \"context\": \"%s\" + }' + ] +) AS answer;""" % ( + query, + context, +) + +results = run_select_statement(conn, select_statement) +db.pool.putconn(conn) + +console.print("\nResults for query '%s'" % query) +console.print(results) +db.archive_collection(collection_name) diff --git a/pgml-sdks/python/pgml/examples/vector_search.py b/pgml-sdks/python/pgml/examples/question_answering.py similarity index 59% rename from pgml-sdks/python/pgml/examples/vector_search.py rename to pgml-sdks/python/pgml/examples/question_answering.py index 4a55cdc10..a70930fb7 100644 --- a/pgml-sdks/python/pgml/examples/vector_search.py +++ b/pgml-sdks/python/pgml/examples/question_answering.py @@ -3,14 +3,18 @@ import json from datasets import load_dataset from time import time -from rich import print as rprint +from dotenv import load_dotenv +from rich.console import Console + +load_dotenv() +console = Console() local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development" conninfo = os.environ.get("PGML_CONNECTION", local_pgml) db = Database(conninfo) -collection_name = "test_pgml_sdk_1" +collection_name = "squad_collection" collection = db.create_or_get_collection(collection_name) @@ -19,7 +23,7 @@ data = data.drop_duplicates(subset=["context"]) documents = [ - {'id': r['id'], "text": r["context"], "title": r["title"]} + {"id": r["id"], "text": r["context"], "title": r["title"]} for r in data.to_dict(orient="records") ] @@ -28,7 
+32,11 @@ collection.generate_embeddings() start = time() -results = collection.vector_search("Who won 20 grammy awards?", top_k=2) -rprint("Query time %0.3f"%(time()-start)) -rprint(json.dumps(results, indent=2)) +query = "Who won 20 grammy awards?" +results = collection.vector_search(query, top_k=5) +_end = time() +console.print("\nResults for '%s'" % (query), style="bold") +console.print(results) +console.print("Query time = %0.3f" % (_end - start)) + db.archive_collection(collection_name) diff --git a/pgml-sdks/python/pgml/examples/question_answering_instructor.py b/pgml-sdks/python/pgml/examples/question_answering_instructor.py new file mode 100644 index 000000000..6ed49f27e --- /dev/null +++ b/pgml-sdks/python/pgml/examples/question_answering_instructor.py @@ -0,0 +1,55 @@ +from pgml import Database +import os +import json +from datasets import load_dataset +from time import time +from dotenv import load_dotenv +from rich.console import Console + +load_dotenv() +console = Console() + +local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development" + +conninfo = os.environ.get("PGML_CONNECTION", local_pgml) +db = Database(conninfo) + +collection_name = "squad_collection" +collection = db.create_or_get_collection(collection_name) + + +data = load_dataset("squad", split="train") +data = data.to_pandas() +data = data.drop_duplicates(subset=["context"]) + +documents = [ + {"id": r["id"], "text": r["context"], "title": r["title"]} + for r in data.to_dict(orient="records") +] + +collection.upsert_documents(documents[:200]) +collection.generate_chunks() + +# register instructor model +model_id = collection.register_model( + model_name="hkunlp/instructor-base", + model_params={"instruction": "Represent the Wikipedia document for retrieval: "}, +) +collection.generate_embeddings(model_id=model_id) + +start = time() +query = "Who won 20 grammy awards?" 
+results = collection.vector_search( + query, + top_k=5, + model_id=model_id, + query_parameters={ + "instruction": "Represent the Wikipedia question for retrieving supporting documents: " + }, +) +_end = time() +console.print("\nResults for '%s'" % (query), style="bold") +console.print(results) +console.print("Query time = %0.3f" % (_end - start)) + +db.archive_collection(collection_name) diff --git a/pgml-sdks/python/pgml/examples/semantic_search.py b/pgml-sdks/python/pgml/examples/semantic_search.py new file mode 100644 index 000000000..2eafbf55b --- /dev/null +++ b/pgml-sdks/python/pgml/examples/semantic_search.py @@ -0,0 +1,50 @@ +from datasets import load_dataset +from pgml import Database +import os +from rich import print as rprint +from dotenv import load_dotenv +from time import time +from rich.console import Console + +load_dotenv() +console = Console() + +# Prepare Data +dataset = load_dataset("quora", split="train") +questions = [] + +for record in dataset["questions"]: + questions.extend(record["text"]) + +# remove duplicates +documents = [] +for question in list(set(questions)): + if question: + documents.append({"text": question}) + + +# Get Database connection +local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development" +conninfo = os.environ.get("PGML_CONNECTION", local_pgml) +db = Database(conninfo, min_connections=4) + +# Create or get collection +collection_name = "quora_collection" +collection = db.create_or_get_collection(collection_name) + +# Upsert documents, chunk text, and generate embeddings +collection.upsert_documents(documents[:200]) +collection.generate_chunks() +collection.generate_embeddings() + +# Query vector embeddings +start = time() +query = "What is a good mobile os?" 
+result = collection.vector_search(query) +_end = time() + +console.print("\nResults for '%s'" % (query), style="bold") +console.print(result) +console.print("Query time = %0.3f" % (_end - start)) + +db.archive_collection(collection_name) diff --git a/pgml-sdks/python/pgml/examples/table_question_answering.py b/pgml-sdks/python/pgml/examples/table_question_answering.py new file mode 100644 index 000000000..f208e3392 --- /dev/null +++ b/pgml-sdks/python/pgml/examples/table_question_answering.py @@ -0,0 +1,56 @@ +from pgml import Database +import os +import json +from datasets import load_dataset +from time import time +from dotenv import load_dotenv +from rich.console import Console +from rich.progress import track +from psycopg import sql +from pgml.dbutils import run_select_statement +import pandas as pd + +load_dotenv() +console = Console() + +local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development" + +conninfo = os.environ.get("PGML_CONNECTION", local_pgml) +db = Database(conninfo) + +collection_name = "ott_qa_20k_collection" +collection = db.create_or_get_collection(collection_name) + + +data = load_dataset("ashraq/ott-qa-20k", split="train") +documents = [] + +# loop through the dataset and convert tabular data to pandas dataframes +for doc in track(data): + table = pd.DataFrame(doc["data"], columns=doc["header"]) + processed_table = "\n".join([table.to_csv(index=False)]) + documents.append( + { + "text": processed_table, + "title": doc["title"], + "url": doc["url"], + "uid": doc["uid"], + } + ) + +collection.upsert_documents(documents) +collection.generate_chunks() + +# SentenceTransformer model trained specifically for embedding tabular data for retrieval tasks +model_id = collection.register_model(model_name="deepset/all-mpnet-base-v2-table") +collection.generate_embeddings(model_id=model_id) + +start = time() +query = "which country has the highest GDP in 2020?" 
+results = collection.vector_search(query, top_k=5, model_id=model_id) +_end = time() +console.print("\nResults for '%s'" % (query), style="bold") +console.print(results) +console.print("Query time = %0.3f" % (_end - start)) + +db.archive_collection(collection_name) diff --git a/pgml-sdks/python/pgml/examples/vector_search.ipynb b/pgml-sdks/python/pgml/examples/vector_search.ipynb deleted file mode 100644 index e14e8b4f2..000000000 --- a/pgml-sdks/python/pgml/examples/vector_search.ipynb +++ /dev/null @@ -1,236 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pgml import Database\n", - "import os\n", - "import json" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "local_pgml = \"postgres://postgres@127.0.0.1:5433/pgml_development\"\n", - "\n", - "conninfo = os.environ.get(\"PGML_CONNECTION\",local_pgml)\n", - "db = Database(conninfo,min_connections=4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection_name = \"test_pgml_sdk_1\"\n", - "collection = db.create_or_get_collection(collection_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from datasets import load_dataset\n", - "\n", - "data = load_dataset(\"squad\", split=\"train\")\n", - "data = data.to_pandas()\n", - "data.head()\n", - "\n", - "data = data.drop_duplicates(subset=[\"context\"])\n", - "print(len(data))\n", - "data.head()\n", - "\n", - "documents = [\n", - " {\n", - " 'text': r['context'],\n", - " 'metadata': {\n", - " 'title': r['title']\n", - " }\n", - " } for r in data.to_dict(orient='records')\n", - "]\n", - "documents[:3]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.upsert_documents(documents[0:200])\n", - 
"collection.generate_chunks()\n", - "collection.generate_embeddings()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2)\n", - "print(json.dumps(results,indent=2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.register_model(model_name=\"paraphrase-MiniLM-L6-v2\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.get_models()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(json.dumps(collection.get_models(),indent=2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.generate_embeddings(model_id=2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2, model_id=2)\n", - "print(json.dumps(results,indent=2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.register_model(model_name=\"hkunlp/instructor-xl\", model_params={\"instruction\": \"Represent the Wikipedia document for retrieval: \"})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.get_models()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.generate_embeddings(model_id=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2, model_id=3, 
query_parameters={\"instruction\": \"Represent the Wikipedia question for retrieving supporting documents: \"})\n", - "print(json.dumps(results,indent=2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.register_text_splitter(splitter_name=\"RecursiveCharacterTextSplitter\",splitter_params={\"chunk_size\": 100,\"chunk_overlap\": 20})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.generate_chunks(splitter_id=2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.generate_embeddings(splitter_id=2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2, splitter_id=2)\n", - "print(json.dumps(results,indent=2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "db.delete_collection(collection_name)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pgml-zoggicR5-py3.11", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.3" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/pgml-sdks/python/pgml/pgml/collection.py b/pgml-sdks/python/pgml/pgml/collection.py index 38a8498b9..7501d0196 100644 --- a/pgml-sdks/python/pgml/pgml/collection.py +++ b/pgml-sdks/python/pgml/pgml/collection.py @@ -438,7 +438,7 @@ def register_model( task: Optional[str] = "embedding", model_name: Optional[str] = "intfloat/e5-small", model_params: Optional[Dict[str, Any]] = {}, - ) 
-> None: + ) -> int: """ This function registers a model in a database if it does not already exist. @@ -775,7 +775,9 @@ def vector_search( documents_table=self.documents_table, ) - search_results = run_select_statement(conn, cte_select_statement) + search_results = run_select_statement( + conn, cte_select_statement, order_by="score", ascending=False + ) self.pool.putconn(conn) return search_results diff --git a/pgml-sdks/python/pgml/pgml/dbutils.py b/pgml-sdks/python/pgml/pgml/dbutils.py index 5d58b56a5..95ce5b003 100644 --- a/pgml-sdks/python/pgml/pgml/dbutils.py +++ b/pgml-sdks/python/pgml/pgml/dbutils.py @@ -52,7 +52,9 @@ def run_create_or_insert_statement( cur.close() -def run_select_statement(conn: Connection, statement: str) -> List[Any]: +def run_select_statement( + conn: Connection, statement: str, order_by: str = "", ascending: bool = True +) -> List[Any]: """ The function runs a select statement on a database connection and returns the results as a list of dictionaries. @@ -70,12 +72,29 @@ def run_select_statement(conn: Connection, statement: str) -> List[Any]: statement = statement.strip().rstrip(";") cur = conn.cursor() - json_conversion_statement = """ - SELECT array_to_json(array_agg(row_to_json(t))) + order_statement = "" + if order_by: + order_statement = "ORDER BY t.%s" % order_by + if ascending: + order_statement += " ASC" + else: + order_statement += " DESC" + + if order_statement: + json_conversion_statement = """ + SELECT array_to_json(array_agg(row_to_json(t) {order_statement})) FROM ({select_statement}) t; """.format( - select_statement=statement - ) + select_statement=statement, + order_statement=order_statement, + ) + else: + json_conversion_statement = """ + SELECT array_to_json(array_agg(row_to_json(t))) + FROM ({select_statement}) t; + """.format( + select_statement=statement + ) log.info("Running %s .. 
" % json_conversion_statement) cur.execute(json_conversion_statement) results = cur.fetchall() diff --git a/pgml-sdks/python/pgml/poetry.lock b/pgml-sdks/python/pgml/poetry.lock index 87633c653..497468c5e 100644 --- a/pgml-sdks/python/pgml/poetry.lock +++ b/pgml-sdks/python/pgml/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. [[package]] name = "aiohttp" @@ -1832,6 +1832,21 @@ files = [ [package.dependencies] six = ">=1.5" +[[package]] +name = "python-dotenv" +version = "1.0.0" +description = "Read key-value pairs from a .env file and set them as environment variables" +category = "main" +optional = false +python-versions = ">=3.8" +files = [ + {file = "python-dotenv-1.0.0.tar.gz", hash = "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba"}, + {file = "python_dotenv-1.0.0-py3-none-any.whl", hash = "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"}, +] + +[package.extras] +cli = ["click (>=5.0)"] + [[package]] name = "pytz" version = "2023.3" @@ -2561,4 +2576,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "51765e8059c760223993d3145da644935c0f77b091f1f2ea355f81c6249a3e1b" +content-hash = "7d20ef57aee0494f39dfbd3250c2654112393c9c8e0ef9257bce5d72506f0378" diff --git a/pgml-sdks/python/pgml/pyproject.toml b/pgml-sdks/python/pgml/pyproject.toml index 72322a567..c9020806d 100644 --- a/pgml-sdks/python/pgml/pyproject.toml +++ b/pgml-sdks/python/pgml/pyproject.toml @@ -19,6 +19,7 @@ psycopg-pool = "^3.1.7" langchain = "^0.0.167" ipywidgets = "^8.0.6" datasets = "^2.12.0" +python-dotenv = "^1.0.0" [tool.poetry.group.dev.dependencies]