From 74efc3958e4cea30994b3fef1a5db6b58fe1f464 Mon Sep 17 00:00:00 2001 From: Santi Adavani Date: Tue, 30 May 2023 09:57:17 -0700 Subject: [PATCH 1/7] semantic search example --- .../pgml/examples/semantic_search.ipynb | 235 ++++++++++++++++++ .../python/pgml/examples/semantic_search.py | 33 +++ .../python/pgml/examples/vector_search.ipynb | 48 +++- 3 files changed, 312 insertions(+), 4 deletions(-) create mode 100644 pgml-sdks/python/pgml/examples/semantic_search.ipynb create mode 100644 pgml-sdks/python/pgml/examples/semantic_search.py diff --git a/pgml-sdks/python/pgml/examples/semantic_search.ipynb b/pgml-sdks/python/pgml/examples/semantic_search.ipynb new file mode 100644 index 000000000..d9160f6d3 --- /dev/null +++ b/pgml-sdks/python/pgml/examples/semantic_search.ipynb @@ -0,0 +1,235 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "dataset = load_dataset('quora', split='train')\n", + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "questions = []\n", + "\n", + "for record in dataset['questions']:\n", + " questions.extend(record['text'])\n", + " \n", + "# remove duplicates\n", + "documents = []\n", + "for question in list(set(questions)):\n", + " if question:\n", + " documents.append({\"text\": question})\n", + "\n", + "len(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "from pgml import Database\n", + "import os\n", + "from rich import print as rprint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%env PGML_CONNECTION=postgres://u_iez3sqwpzd7n1yd:aiHpsKfyXwr9fa6us1zlbyJuY@sql.cloud.postgresml.org:6432/pgml_iez3sqwpzd7n1yd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [ + "local_pgml = \"postgres://postgres@127.0.0.1:5433/pgml_development\"\n", + "\n", + "conninfo = os.environ.get(\"PGML_CONNECTION\",local_pgml)\n", + "db = Database(conninfo,min_connections=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "collection_name = \"quora_collection\"\n", + "collection = db.create_or_get_collection(collection_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[43], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m collection\u001b[39m.\u001b[39;49mupsert_documents(documents[:\u001b[39m10000\u001b[39;49m])\n", + "File \u001b[0;32m~/Hyperparam/postgresml/pgml-sdks/python/pgml/pgml/collection.py:291\u001b[0m, in \u001b[0;36mCollection.upsert_documents\u001b[0;34m(self, documents, text_key, id_key)\u001b[0m\n\u001b[1;32m 265\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mupsert_documents\u001b[39m(\n\u001b[1;32m 266\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[1;32m 267\u001b[0m documents: List[Dict[\u001b[39mstr\u001b[39m, Any]],\n\u001b[1;32m 268\u001b[0m text_key: Optional[\u001b[39mstr\u001b[39m] \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 269\u001b[0m id_key: Optional[\u001b[39mstr\u001b[39m] \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mid\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 270\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 271\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 272\u001b[0m \u001b[39m The function `upsert_documents` inserts or updates documents in a 
database table based on their ID,\u001b[39;00m\n\u001b[1;32m 273\u001b[0m \u001b[39m text, and metadata.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 289\u001b[0m \u001b[39m upsert process. If set to False, only essential information will be printed, defaults to False\u001b[39;00m\n\u001b[1;32m 290\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 291\u001b[0m conn \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mpool\u001b[39m.\u001b[39;49mgetconn()\n\u001b[1;32m 292\u001b[0m \u001b[39mfor\u001b[39;00m document \u001b[39min\u001b[39;00m track(documents, description\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mUpserting documents\u001b[39m\u001b[39m\"\u001b[39m):\n\u001b[1;32m 293\u001b[0m \u001b[39mif\u001b[39;00m text_key \u001b[39min\u001b[39;00m \u001b[39mlist\u001b[39m(document\u001b[39m.\u001b[39mkeys()):\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/pgml-zoggicR5-py3.11/lib/python3.11/site-packages/psycopg_pool/pool.py:182\u001b[0m, in \u001b[0;36mConnectionPool.getconn\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 180\u001b[0m timeout \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtimeout\n\u001b[1;32m 181\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 182\u001b[0m conn \u001b[39m=\u001b[39m pos\u001b[39m.\u001b[39;49mwait(timeout\u001b[39m=\u001b[39;49mtimeout)\n\u001b[1;32m 183\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[1;32m 184\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_stats[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_REQUESTS_ERRORS] \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/pgml-zoggicR5-py3.11/lib/python3.11/site-packages/psycopg_pool/pool.py:745\u001b[0m, in \u001b[0;36mWaitingClient.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 743\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 744\u001b[0m 
\u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror\n\u001b[0;32m--> 745\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/pgml-zoggicR5-py3.11/lib/python3.11/site-packages/psycopg_pool/pool.py:734\u001b[0m, in \u001b[0;36mWaitingClient.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 732\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconn \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror):\n\u001b[1;32m 733\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 734\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_cond\u001b[39m.\u001b[39;49mwait(timeout):\n\u001b[1;32m 735\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror \u001b[39m=\u001b[39m PoolTimeout(\n\u001b[1;32m 736\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mcouldn\u001b[39m\u001b[39m'\u001b[39m\u001b[39mt get a connection after \u001b[39m\u001b[39m{\u001b[39;00mtimeout\u001b[39m}\u001b[39;00m\u001b[39m sec\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 737\u001b[0m )\n\u001b[1;32m 738\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m ex:\n", + "File \u001b[0;32m/opt/homebrew/Cellar/python@3.11/3.11.3/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py:324\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 322\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 323\u001b[0m \u001b[39mif\u001b[39;00m timeout \u001b[39m>\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[0;32m--> 324\u001b[0m gotit \u001b[39m=\u001b[39m waiter\u001b[39m.\u001b[39;49macquire(\u001b[39mTrue\u001b[39;49;00m, timeout)\n\u001b[1;32m 325\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 326\u001b[0m gotit 
\u001b[39m=\u001b[39m waiter\u001b[39m.\u001b[39macquire(\u001b[39mFalse\u001b[39;00m)\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "collection.upsert_documents(documents[:20000])" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2KGenerating chunks \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:01:25\u001b[0m00:01\u001b[0m00:03\u001b[0m\n", + "\u001b[?25h" + ] + } + ], + "source": [ + "collection.generate_chunks()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating embeddings using intfloat/e5-small \u001b[33m...\u001b[0m \n" + ] + } + ], + "source": [ + "collection.generate_embeddings()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "ename": "PoolTimeout", + "evalue": "couldn't get a connection after 30.0 sec", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mPoolTimeout\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[42], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m query \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mwhat are haiku poems?\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m----> 2\u001b[0m result \u001b[39m=\u001b[39m collection\u001b[39m.\u001b[39;49mvector_search(query)\n", + "File \u001b[0;32m~/Hyperparam/postgresml/pgml-sdks/python/pgml/pgml/collection.py:711\u001b[0m, in \u001b[0;36mCollection.vector_search\u001b[0;34m(self, query, query_parameters, top_k, model_id, splitter_id)\u001b[0m\n\u001b[1;32m 679\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mvector_search\u001b[39m(\n\u001b[1;32m 680\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[1;32m 
681\u001b[0m query: \u001b[39mstr\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 685\u001b[0m splitter_id: \u001b[39mint\u001b[39m \u001b[39m=\u001b[39m \u001b[39m1\u001b[39m,\n\u001b[1;32m 686\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m List[Dict[\u001b[39mstr\u001b[39m, Any]]:\n\u001b[1;32m 687\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 688\u001b[0m \u001b[39m This function performs a vector search on a database using a query and returns the top matching\u001b[39;00m\n\u001b[1;32m 689\u001b[0m \u001b[39m results.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 709\u001b[0m \u001b[39m with the search result\u001b[39;00m\n\u001b[1;32m 710\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 711\u001b[0m conn \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mpool\u001b[39m.\u001b[39;49mgetconn()\n\u001b[1;32m 713\u001b[0m \u001b[39mif\u001b[39;00m model_id \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_cache_model_names\u001b[39m.\u001b[39mkeys():\n\u001b[1;32m 714\u001b[0m model \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_cache_model_names[model_id]\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/pgml-zoggicR5-py3.11/lib/python3.11/site-packages/psycopg_pool/pool.py:182\u001b[0m, in \u001b[0;36mConnectionPool.getconn\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 180\u001b[0m timeout \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtimeout\n\u001b[1;32m 181\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 182\u001b[0m conn \u001b[39m=\u001b[39m pos\u001b[39m.\u001b[39;49mwait(timeout\u001b[39m=\u001b[39;49mtimeout)\n\u001b[1;32m 183\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[1;32m 184\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_stats[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_REQUESTS_ERRORS] \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m 
\u001b[39m1\u001b[39m\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/pgml-zoggicR5-py3.11/lib/python3.11/site-packages/psycopg_pool/pool.py:745\u001b[0m, in \u001b[0;36mWaitingClient.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 743\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 744\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror\n\u001b[0;32m--> 745\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror\n", + "\u001b[0;31mPoolTimeout\u001b[0m: couldn't get a connection after 30.0 sec" + ] + } + ], + "source": [ + "query = \"which city has the highest population in the world?\"\n", + "result = collection.vector_search(query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m[\u001b[0m\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[32m'score'\u001b[0m: \u001b[1;36m0.9226062759447833\u001b[0m,\n", + " \u001b[32m'chunk'\u001b[0m: \u001b[32m'What are some examples of haiku poems about animals?'\u001b[0m,\n", + " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[32m'score'\u001b[0m: \u001b[1;36m0.8803199702518053\u001b[0m,\n", + " \u001b[32m'chunk'\u001b[0m: \u001b[32m'How do poems have rhythm?'\u001b[0m,\n", + " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[32m'score'\u001b[0m: \u001b[1;36m0.8562827400386631\u001b[0m,\n", + " \u001b[32m'chunk'\u001b[0m: \u001b[32m'How do you get inspiration to write a poem?'\u001b[0m,\n", + " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[32m'score'\u001b[0m: \u001b[1;36m0.8304190744787252\u001b[0m,\n", + " \u001b[32m'chunk'\u001b[0m: 
\u001b[32m'What are some great examples of witty exchanges?'\u001b[0m,\n", + " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[32m'score'\u001b[0m: \u001b[1;36m0.8221808330080992\u001b[0m,\n", + " \u001b[32m'chunk'\u001b[0m: \u001b[32m'Which are some inspirational books I can read?'\u001b[0m,\n", + " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + "\u001b[1m]\u001b[0m\n" + ] + } + ], + "source": [ + "rprint(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pgml-zoggicR5-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pgml-sdks/python/pgml/examples/semantic_search.py b/pgml-sdks/python/pgml/examples/semantic_search.py new file mode 100644 index 000000000..853a863ec --- /dev/null +++ b/pgml-sdks/python/pgml/examples/semantic_search.py @@ -0,0 +1,33 @@ +from datasets import load_dataset +from pgml import Database +import os +from rich import print as rprint + +dataset = load_dataset('quora', split='train') +questions = [] + +for record in dataset['questions']: + questions.extend(record['text']) + +# remove duplicates +documents = [] +for question in list(set(questions)): + if question: + documents.append({"text": question}) + + +local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development" + +conninfo = os.environ.get("PGML_CONNECTION",local_pgml) +db = Database(conninfo,min_connections=4) + +collection_name = "quora_collection" +collection = 
db.create_or_get_collection(collection_name) + +collection.upsert_documents(documents[:20]) +collection.generate_chunks() +collection.generate_embeddings() + +query = "which city has the highest population in the world?" +result = collection.vector_search(query) +rprint(result) \ No newline at end of file diff --git a/pgml-sdks/python/pgml/examples/vector_search.ipynb b/pgml-sdks/python/pgml/examples/vector_search.ipynb index e14e8b4f2..1c30f8647 100644 --- a/pgml-sdks/python/pgml/examples/vector_search.ipynb +++ b/pgml-sdks/python/pgml/examples/vector_search.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -13,7 +13,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: PGML_CONNECTION=postgres://u_iez3sqwpzd7n1yd:aiHpsKfyXwr9fa6us1zlbyJuY@sql.cloud.postgresml.org:6432/pgml_iez3sqwpzd7n1yd\n", + "env: LOGLEVEL=INFO\n" + ] + } + ], + "source": [ + "%env PGML_CONNECTION=postgres://u_iez3sqwpzd7n1yd:aiHpsKfyXwr9fa6us1zlbyJuY@sql.cloud.postgresml.org:6432/pgml_iez3sqwpzd7n1yd\n", + "%env LOGLEVEL=INFO" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -25,14 +44,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "collection_name = \"test_pgml_sdk_1\"\n", + "collection_name = \"test_collection_1\"\n", "collection = db.create_or_get_collection(collection_name)" ] }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "missing FROM-clause entry for table \"test_collection_1\"\n", + "LINE 1: ...forms (table_name, task, model, splitter) VALUES (test_colle...\n", + " ^\n" + ] + } + ], + "source": [ + "conn = 
db.pool.getconn()\n", + "emb_table = collection._create_or_get_embeddings_table(conn, model_id=1,splitter_id=1)\n", + "db.pool.putconn(conn)" + ] + }, { "cell_type": "code", "execution_count": null, From 1aa2c33d5bfaec3f4f9d93808e4c8d76ab691281 Mon Sep 17 00:00:00 2001 From: Santi Adavani Date: Tue, 30 May 2023 10:03:53 -0700 Subject: [PATCH 2/7] Removed env --- pgml-sdks/python/pgml/examples/semantic_search.ipynb | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pgml-sdks/python/pgml/examples/semantic_search.ipynb b/pgml-sdks/python/pgml/examples/semantic_search.ipynb index d9160f6d3..479e4a001 100644 --- a/pgml-sdks/python/pgml/examples/semantic_search.ipynb +++ b/pgml-sdks/python/pgml/examples/semantic_search.ipynb @@ -42,15 +42,6 @@ "from rich import print as rprint" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%env PGML_CONNECTION=postgres://u_iez3sqwpzd7n1yd:aiHpsKfyXwr9fa6us1zlbyJuY@sql.cloud.postgresml.org:6432/pgml_iez3sqwpzd7n1yd" - ] - }, { "cell_type": "code", "execution_count": null, From 4ab0a60ffeb84e89b3ba23d6cebb0e9b9ef3b4bc Mon Sep 17 00:00:00 2001 From: Santi Adavani Date: Tue, 30 May 2023 11:51:55 -0700 Subject: [PATCH 3/7] Added semantic search --- .../pgml/examples/semantic_search.ipynb | 226 ------------------ .../python/pgml/examples/semantic_search.py | 13 +- pgml-sdks/python/pgml/poetry.lock | 19 +- pgml-sdks/python/pgml/pyproject.toml | 1 + 4 files changed, 30 insertions(+), 229 deletions(-) delete mode 100644 pgml-sdks/python/pgml/examples/semantic_search.ipynb diff --git a/pgml-sdks/python/pgml/examples/semantic_search.ipynb b/pgml-sdks/python/pgml/examples/semantic_search.ipynb deleted file mode 100644 index 479e4a001..000000000 --- a/pgml-sdks/python/pgml/examples/semantic_search.ipynb +++ /dev/null @@ -1,226 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from datasets import 
load_dataset\n", - "dataset = load_dataset('quora', split='train')\n", - "dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "questions = []\n", - "\n", - "for record in dataset['questions']:\n", - " questions.extend(record['text'])\n", - " \n", - "# remove duplicates\n", - "documents = []\n", - "for question in list(set(questions)):\n", - " if question:\n", - " documents.append({\"text\": question})\n", - "\n", - "len(documents)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "from pgml import Database\n", - "import os\n", - "from rich import print as rprint" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "local_pgml = \"postgres://postgres@127.0.0.1:5433/pgml_development\"\n", - "\n", - "conninfo = os.environ.get(\"PGML_CONNECTION\",local_pgml)\n", - "db = Database(conninfo,min_connections=4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection_name = \"quora_collection\"\n", - "collection = db.create_or_get_collection(collection_name)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[43], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m collection\u001b[39m.\u001b[39;49mupsert_documents(documents[:\u001b[39m10000\u001b[39;49m])\n", - "File \u001b[0;32m~/Hyperparam/postgresml/pgml-sdks/python/pgml/pgml/collection.py:291\u001b[0m, in \u001b[0;36mCollection.upsert_documents\u001b[0;34m(self, documents, text_key, id_key)\u001b[0m\n\u001b[1;32m 265\u001b[0m 
\u001b[39mdef\u001b[39;00m \u001b[39mupsert_documents\u001b[39m(\n\u001b[1;32m 266\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[1;32m 267\u001b[0m documents: List[Dict[\u001b[39mstr\u001b[39m, Any]],\n\u001b[1;32m 268\u001b[0m text_key: Optional[\u001b[39mstr\u001b[39m] \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 269\u001b[0m id_key: Optional[\u001b[39mstr\u001b[39m] \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mid\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 270\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 271\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 272\u001b[0m \u001b[39m The function `upsert_documents` inserts or updates documents in a database table based on their ID,\u001b[39;00m\n\u001b[1;32m 273\u001b[0m \u001b[39m text, and metadata.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 289\u001b[0m \u001b[39m upsert process. 
If set to False, only essential information will be printed, defaults to False\u001b[39;00m\n\u001b[1;32m 290\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 291\u001b[0m conn \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mpool\u001b[39m.\u001b[39;49mgetconn()\n\u001b[1;32m 292\u001b[0m \u001b[39mfor\u001b[39;00m document \u001b[39min\u001b[39;00m track(documents, description\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mUpserting documents\u001b[39m\u001b[39m\"\u001b[39m):\n\u001b[1;32m 293\u001b[0m \u001b[39mif\u001b[39;00m text_key \u001b[39min\u001b[39;00m \u001b[39mlist\u001b[39m(document\u001b[39m.\u001b[39mkeys()):\n", - "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/pgml-zoggicR5-py3.11/lib/python3.11/site-packages/psycopg_pool/pool.py:182\u001b[0m, in \u001b[0;36mConnectionPool.getconn\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 180\u001b[0m timeout \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtimeout\n\u001b[1;32m 181\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 182\u001b[0m conn \u001b[39m=\u001b[39m pos\u001b[39m.\u001b[39;49mwait(timeout\u001b[39m=\u001b[39;49mtimeout)\n\u001b[1;32m 183\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[1;32m 184\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_stats[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_REQUESTS_ERRORS] \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n", - "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/pgml-zoggicR5-py3.11/lib/python3.11/site-packages/psycopg_pool/pool.py:745\u001b[0m, in \u001b[0;36mWaitingClient.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 743\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 744\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror\n\u001b[0;32m--> 745\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror\n", - "File 
\u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/pgml-zoggicR5-py3.11/lib/python3.11/site-packages/psycopg_pool/pool.py:734\u001b[0m, in \u001b[0;36mWaitingClient.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 732\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconn \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror):\n\u001b[1;32m 733\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 734\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_cond\u001b[39m.\u001b[39;49mwait(timeout):\n\u001b[1;32m 735\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror \u001b[39m=\u001b[39m PoolTimeout(\n\u001b[1;32m 736\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mcouldn\u001b[39m\u001b[39m'\u001b[39m\u001b[39mt get a connection after \u001b[39m\u001b[39m{\u001b[39;00mtimeout\u001b[39m}\u001b[39;00m\u001b[39m sec\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 737\u001b[0m )\n\u001b[1;32m 738\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m ex:\n", - "File \u001b[0;32m/opt/homebrew/Cellar/python@3.11/3.11.3/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py:324\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 322\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 323\u001b[0m \u001b[39mif\u001b[39;00m timeout \u001b[39m>\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[0;32m--> 324\u001b[0m gotit \u001b[39m=\u001b[39m waiter\u001b[39m.\u001b[39;49macquire(\u001b[39mTrue\u001b[39;49;00m, timeout)\n\u001b[1;32m 325\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 326\u001b[0m gotit \u001b[39m=\u001b[39m waiter\u001b[39m.\u001b[39macquire(\u001b[39mFalse\u001b[39;00m)\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], - "source": [ - "collection.upsert_documents(documents[:20000])" - ] - 
}, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2KGenerating chunks \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:01:25\u001b[0m00:01\u001b[0m00:03\u001b[0m\n", - "\u001b[?25h" - ] - } - ], - "source": [ - "collection.generate_chunks()" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generating embeddings using intfloat/e5-small \u001b[33m...\u001b[0m \n" - ] - } - ], - "source": [ - "collection.generate_embeddings()" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "ename": "PoolTimeout", - "evalue": "couldn't get a connection after 30.0 sec", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mPoolTimeout\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[42], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m query \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mwhat are haiku poems?\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m----> 2\u001b[0m result \u001b[39m=\u001b[39m collection\u001b[39m.\u001b[39;49mvector_search(query)\n", - "File \u001b[0;32m~/Hyperparam/postgresml/pgml-sdks/python/pgml/pgml/collection.py:711\u001b[0m, in \u001b[0;36mCollection.vector_search\u001b[0;34m(self, query, query_parameters, top_k, model_id, splitter_id)\u001b[0m\n\u001b[1;32m 679\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mvector_search\u001b[39m(\n\u001b[1;32m 680\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[1;32m 681\u001b[0m query: \u001b[39mstr\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 685\u001b[0m splitter_id: \u001b[39mint\u001b[39m \u001b[39m=\u001b[39m \u001b[39m1\u001b[39m,\n\u001b[1;32m 686\u001b[0m ) 
\u001b[39m-\u001b[39m\u001b[39m>\u001b[39m List[Dict[\u001b[39mstr\u001b[39m, Any]]:\n\u001b[1;32m 687\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 688\u001b[0m \u001b[39m This function performs a vector search on a database using a query and returns the top matching\u001b[39;00m\n\u001b[1;32m 689\u001b[0m \u001b[39m results.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 709\u001b[0m \u001b[39m with the search result\u001b[39;00m\n\u001b[1;32m 710\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 711\u001b[0m conn \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mpool\u001b[39m.\u001b[39;49mgetconn()\n\u001b[1;32m 713\u001b[0m \u001b[39mif\u001b[39;00m model_id \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_cache_model_names\u001b[39m.\u001b[39mkeys():\n\u001b[1;32m 714\u001b[0m model \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_cache_model_names[model_id]\n", - "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/pgml-zoggicR5-py3.11/lib/python3.11/site-packages/psycopg_pool/pool.py:182\u001b[0m, in \u001b[0;36mConnectionPool.getconn\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 180\u001b[0m timeout \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtimeout\n\u001b[1;32m 181\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 182\u001b[0m conn \u001b[39m=\u001b[39m pos\u001b[39m.\u001b[39;49mwait(timeout\u001b[39m=\u001b[39;49mtimeout)\n\u001b[1;32m 183\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[1;32m 184\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_stats[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_REQUESTS_ERRORS] \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n", - "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/pgml-zoggicR5-py3.11/lib/python3.11/site-packages/psycopg_pool/pool.py:745\u001b[0m, in \u001b[0;36mWaitingClient.wait\u001b[0;34m(self, 
timeout)\u001b[0m\n\u001b[1;32m 743\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 744\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror\n\u001b[0;32m--> 745\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror\n", - "\u001b[0;31mPoolTimeout\u001b[0m: couldn't get a connection after 30.0 sec" - ] - } - ], - "source": [ - "query = \"which city has the highest population in the world?\"\n", - "result = collection.vector_search(query)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1m[\u001b[0m\n", - " \u001b[1m{\u001b[0m\n", - " \u001b[32m'score'\u001b[0m: \u001b[1;36m0.9226062759447833\u001b[0m,\n", - " \u001b[32m'chunk'\u001b[0m: \u001b[32m'What are some examples of haiku poems about animals?'\u001b[0m,\n", - " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", - " \u001b[1m}\u001b[0m,\n", - " \u001b[1m{\u001b[0m\n", - " \u001b[32m'score'\u001b[0m: \u001b[1;36m0.8803199702518053\u001b[0m,\n", - " \u001b[32m'chunk'\u001b[0m: \u001b[32m'How do poems have rhythm?'\u001b[0m,\n", - " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", - " \u001b[1m}\u001b[0m,\n", - " \u001b[1m{\u001b[0m\n", - " \u001b[32m'score'\u001b[0m: \u001b[1;36m0.8562827400386631\u001b[0m,\n", - " \u001b[32m'chunk'\u001b[0m: \u001b[32m'How do you get inspiration to write a poem?'\u001b[0m,\n", - " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", - " \u001b[1m}\u001b[0m,\n", - " \u001b[1m{\u001b[0m\n", - " \u001b[32m'score'\u001b[0m: \u001b[1;36m0.8304190744787252\u001b[0m,\n", - " \u001b[32m'chunk'\u001b[0m: \u001b[32m'What are some great examples of witty exchanges?'\u001b[0m,\n", - " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", - " \u001b[1m}\u001b[0m,\n", - " \u001b[1m{\u001b[0m\n", - " 
\u001b[32m'score'\u001b[0m: \u001b[1;36m0.8221808330080992\u001b[0m,\n", - " \u001b[32m'chunk'\u001b[0m: \u001b[32m'Which are some inspirational books I can read?'\u001b[0m,\n", - " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", - " \u001b[1m}\u001b[0m\n", - "\u001b[1m]\u001b[0m\n" - ] - } - ], - "source": [ - "rprint(result)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pgml-zoggicR5-py3.11", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.3" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/pgml-sdks/python/pgml/examples/semantic_search.py b/pgml-sdks/python/pgml/examples/semantic_search.py index 853a863ec..65a1c3834 100644 --- a/pgml-sdks/python/pgml/examples/semantic_search.py +++ b/pgml-sdks/python/pgml/examples/semantic_search.py @@ -2,6 +2,12 @@ from pgml import Database import os from rich import print as rprint +from dotenv import load_dotenv +from time import time +from rich.console import Console + +load_dotenv() +console = Console() dataset = load_dataset('quora', split='train') questions = [] @@ -28,6 +34,11 @@ collection.generate_chunks() collection.generate_embeddings() +start = time() query = "which city has the highest population in the world?" 
result = collection.vector_search(query) -rprint(result) \ No newline at end of file +_end = time() + +console.print("\nResults for '%s'"%(query),style="bold") +console.print(result) +console.print("Query time = %0.3f"%(_end-start)) \ No newline at end of file diff --git a/pgml-sdks/python/pgml/poetry.lock b/pgml-sdks/python/pgml/poetry.lock index 87633c653..497468c5e 100644 --- a/pgml-sdks/python/pgml/poetry.lock +++ b/pgml-sdks/python/pgml/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. [[package]] name = "aiohttp" @@ -1832,6 +1832,21 @@ files = [ [package.dependencies] six = ">=1.5" +[[package]] +name = "python-dotenv" +version = "1.0.0" +description = "Read key-value pairs from a .env file and set them as environment variables" +category = "main" +optional = false +python-versions = ">=3.8" +files = [ + {file = "python-dotenv-1.0.0.tar.gz", hash = "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba"}, + {file = "python_dotenv-1.0.0-py3-none-any.whl", hash = "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"}, +] + +[package.extras] +cli = ["click (>=5.0)"] + [[package]] name = "pytz" version = "2023.3" @@ -2561,4 +2576,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "51765e8059c760223993d3145da644935c0f77b091f1f2ea355f81c6249a3e1b" +content-hash = "7d20ef57aee0494f39dfbd3250c2654112393c9c8e0ef9257bce5d72506f0378" diff --git a/pgml-sdks/python/pgml/pyproject.toml b/pgml-sdks/python/pgml/pyproject.toml index 72322a567..c9020806d 100644 --- a/pgml-sdks/python/pgml/pyproject.toml +++ b/pgml-sdks/python/pgml/pyproject.toml @@ -19,6 +19,7 @@ psycopg-pool = "^3.1.7" langchain = "^0.0.167" ipywidgets = "^8.0.6" datasets = "^2.12.0" +python-dotenv = 
"^1.0.0" [tool.poetry.group.dev.dependencies] From 037988c728f24f13b506d8aac8bb59ad50c6c6f6 Mon Sep 17 00:00:00 2001 From: Santi Adavani Date: Tue, 30 May 2023 13:56:40 -0700 Subject: [PATCH 4/7] Added extractive qa example --- .../examples/extractive_question_answering.py | 69 +++++++++++++++++++ ...vector_search.py => question_answering.py} | 20 ++++-- .../python/pgml/examples/semantic_search.py | 30 ++++---- .../python/pgml/examples/vector_search.ipynb | 19 ----- 4 files changed, 101 insertions(+), 37 deletions(-) create mode 100644 pgml-sdks/python/pgml/examples/extractive_question_answering.py rename pgml-sdks/python/pgml/examples/{vector_search.py => question_answering.py} (59%) diff --git a/pgml-sdks/python/pgml/examples/extractive_question_answering.py b/pgml-sdks/python/pgml/examples/extractive_question_answering.py new file mode 100644 index 000000000..02cac7071 --- /dev/null +++ b/pgml-sdks/python/pgml/examples/extractive_question_answering.py @@ -0,0 +1,69 @@ +from pgml import Database +import os +import json +from datasets import load_dataset +from time import time +from dotenv import load_dotenv +from rich.console import Console +from psycopg import sql +from pgml.dbutils import run_select_statement + +load_dotenv() +console = Console() + +local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development" + +conninfo = os.environ.get("PGML_CONNECTION", local_pgml) +db = Database(conninfo) + +collection_name = "squad_collection" +collection = db.create_or_get_collection(collection_name) + + +data = load_dataset("squad", split="train") +data = data.to_pandas() +data = data.drop_duplicates(subset=["context"]) + +documents = [ + {"id": r["id"], "text": r["context"], "title": r["title"]} + for r in data.to_dict(orient="records") +] + +collection.upsert_documents(documents[:200]) +collection.generate_chunks() +collection.generate_embeddings() + +start = time() +query = "Who won more than 20 grammy awards?" 
+results = collection.vector_search(query, top_k=5) +_end = time() +console.print("\nResults for '%s'" % (query), style="bold") +console.print(results) +console.print("Query time = %0.3f" % (_end - start)) + +# Get the context passage and use pgml.transform to get short answer to the question + + +conn = db.pool.getconn() +context = " ".join(results[0]["chunk"].strip().split()) +context = context.replace('"', '\\"').replace("'", "''") + +select_statement = """SELECT pgml.transform( + 'question-answering', + inputs => ARRAY[ + '{ + \"question\": \"%s\", + \"context\": \"%s\" + }' + ] +) AS answer;""" % ( + query, + context, +) + +results = run_select_statement(conn, select_statement) +db.pool.putconn(conn) + +console.print("\nResults for query '%s'" % query) +console.print(results) +db.archive_collection(collection_name) diff --git a/pgml-sdks/python/pgml/examples/vector_search.py b/pgml-sdks/python/pgml/examples/question_answering.py similarity index 59% rename from pgml-sdks/python/pgml/examples/vector_search.py rename to pgml-sdks/python/pgml/examples/question_answering.py index 4a55cdc10..a70930fb7 100644 --- a/pgml-sdks/python/pgml/examples/vector_search.py +++ b/pgml-sdks/python/pgml/examples/question_answering.py @@ -3,14 +3,18 @@ import json from datasets import load_dataset from time import time -from rich import print as rprint +from dotenv import load_dotenv +from rich.console import Console + +load_dotenv() +console = Console() local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development" conninfo = os.environ.get("PGML_CONNECTION", local_pgml) db = Database(conninfo) -collection_name = "test_pgml_sdk_1" +collection_name = "squad_collection" collection = db.create_or_get_collection(collection_name) @@ -19,7 +23,7 @@ data = data.drop_duplicates(subset=["context"]) documents = [ - {'id': r['id'], "text": r["context"], "title": r["title"]} + {"id": r["id"], "text": r["context"], "title": r["title"]} for r in data.to_dict(orient="records") ] @@ -28,7 
+32,11 @@ collection.generate_embeddings() start = time() -results = collection.vector_search("Who won 20 grammy awards?", top_k=2) -rprint("Query time %0.3f"%(time()-start)) -rprint(json.dumps(results, indent=2)) +query = "Who won 20 grammy awards?" +results = collection.vector_search(query, top_k=5) +_end = time() +console.print("\nResults for '%s'" % (query), style="bold") +console.print(results) +console.print("Query time = %0.3f" % (_end - start)) + db.archive_collection(collection_name) diff --git a/pgml-sdks/python/pgml/examples/semantic_search.py b/pgml-sdks/python/pgml/examples/semantic_search.py index 65a1c3834..2eafbf55b 100644 --- a/pgml-sdks/python/pgml/examples/semantic_search.py +++ b/pgml-sdks/python/pgml/examples/semantic_search.py @@ -9,12 +9,13 @@ load_dotenv() console = Console() -dataset = load_dataset('quora', split='train') +# Prepare Data +dataset = load_dataset("quora", split="train") questions = [] -for record in dataset['questions']: - questions.extend(record['text']) - +for record in dataset["questions"]: + questions.extend(record["text"]) + # remove duplicates documents = [] for question in list(set(questions)): @@ -22,23 +23,28 @@ documents.append({"text": question}) +# Get Database connection local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development" +conninfo = os.environ.get("PGML_CONNECTION", local_pgml) +db = Database(conninfo, min_connections=4) -conninfo = os.environ.get("PGML_CONNECTION",local_pgml) -db = Database(conninfo,min_connections=4) - +# Create or get collection collection_name = "quora_collection" collection = db.create_or_get_collection(collection_name) -collection.upsert_documents(documents[:20]) +# Upsert documents, chunk text, and generate embeddings +collection.upsert_documents(documents[:200]) collection.generate_chunks() collection.generate_embeddings() +# Query vector embeddings start = time() -query = "which city has the highest population in the world?" +query = "What is a good mobile os?" 
result = collection.vector_search(query) -_end = time() +_end = time() -console.print("\nResults for '%s'"%(query),style="bold") +console.print("\nResults for '%s'" % (query), style="bold") console.print(result) -console.print("Query time = %0.3f"%(_end-start)) \ No newline at end of file +console.print("Query time = %0.3f" % (_end - start)) + +db.archive_collection(collection_name) diff --git a/pgml-sdks/python/pgml/examples/vector_search.ipynb b/pgml-sdks/python/pgml/examples/vector_search.ipynb index 1c30f8647..ce29791fa 100644 --- a/pgml-sdks/python/pgml/examples/vector_search.ipynb +++ b/pgml-sdks/python/pgml/examples/vector_search.ipynb @@ -11,25 +11,6 @@ "import json" ] }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "env: PGML_CONNECTION=postgres://u_iez3sqwpzd7n1yd:aiHpsKfyXwr9fa6us1zlbyJuY@sql.cloud.postgresml.org:6432/pgml_iez3sqwpzd7n1yd\n", - "env: LOGLEVEL=INFO\n" - ] - } - ], - "source": [ - "%env PGML_CONNECTION=postgres://u_iez3sqwpzd7n1yd:aiHpsKfyXwr9fa6us1zlbyJuY@sql.cloud.postgresml.org:6432/pgml_iez3sqwpzd7n1yd\n", - "%env LOGLEVEL=INFO" - ] - }, { "cell_type": "code", "execution_count": 3, From 904dab9385fa26ea0b7259e7c151dd844d01b317 Mon Sep 17 00:00:00 2001 From: Santi Adavani Date: Tue, 30 May 2023 16:53:15 -0700 Subject: [PATCH 5/7] Table qa and instructor --- .../examples/question_answering_instructor.py | 45 +++ .../pgml/examples/table_question_answering.py | 49 ++++ .../python/pgml/examples/vector_search.ipynb | 257 ------------------ pgml-sdks/python/pgml/pgml/collection.py | 5 +- 4 files changed, 98 insertions(+), 258 deletions(-) create mode 100644 pgml-sdks/python/pgml/examples/question_answering_instructor.py create mode 100644 pgml-sdks/python/pgml/examples/table_question_answering.py delete mode 100644 pgml-sdks/python/pgml/examples/vector_search.ipynb diff --git a/pgml-sdks/python/pgml/examples/question_answering_instructor.py 
b/pgml-sdks/python/pgml/examples/question_answering_instructor.py new file mode 100644 index 000000000..7149ce9c6 --- /dev/null +++ b/pgml-sdks/python/pgml/examples/question_answering_instructor.py @@ -0,0 +1,45 @@ +from pgml import Database +import os +import json +from datasets import load_dataset +from time import time +from dotenv import load_dotenv +from rich.console import Console + +load_dotenv() +console = Console() + +local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development" + +conninfo = os.environ.get("PGML_CONNECTION", local_pgml) +db = Database(conninfo) + +collection_name = "squad_collection" +collection = db.create_or_get_collection(collection_name) + + +data = load_dataset("squad", split="train") +data = data.to_pandas() +data = data.drop_duplicates(subset=["context"]) + +documents = [ + {"id": r["id"], "text": r["context"], "title": r["title"]} + for r in data.to_dict(orient="records") +] + +collection.upsert_documents(documents[:200]) +collection.generate_chunks() + +#register instructor model +model_id = collection.register_model(model_name="hkunlp/instructor-base", model_params={"instruction": "Represent the Wikipedia document for retrieval: "}) +collection.generate_embeddings(model_id=model_id) + +start = time() +query = "Who won 20 grammy awards?" 
+results = collection.vector_search(query, top_k=5, model_id = model_id, query_parameters={"instruction": "Represent the Wikipedia question for retrieving supporting documents: "}) +_end = time() +console.print("\nResults for '%s'" % (query), style="bold") +console.print(results) +console.print("Query time = %0.3f" % (_end - start)) + +db.archive_collection(collection_name) diff --git a/pgml-sdks/python/pgml/examples/table_question_answering.py b/pgml-sdks/python/pgml/examples/table_question_answering.py new file mode 100644 index 000000000..f0e5935c5 --- /dev/null +++ b/pgml-sdks/python/pgml/examples/table_question_answering.py @@ -0,0 +1,49 @@ +from pgml import Database +import os +import json +from datasets import load_dataset +from time import time +from dotenv import load_dotenv +from rich.console import Console +from rich.progress import track +from psycopg import sql +from pgml.dbutils import run_select_statement +import pandas as pd + +load_dotenv() +console = Console() + +local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development" + +conninfo = os.environ.get("PGML_CONNECTION", local_pgml) +db = Database(conninfo) + +collection_name = "ott_qa_20k_collection" +collection = db.create_or_get_collection(collection_name) + + +data = load_dataset("ashraq/ott-qa-20k", split="train") +documents = [] + +# loop through the dataset and convert tabular data to pandas dataframes +for doc in track(data): + table = pd.DataFrame(doc["data"], columns=doc["header"]) + processed_table = "\n".join([table.to_csv(index=False)]) + documents.append({"text": processed_table, "title": doc["title"], "url": doc["url"], "uid": doc["uid"]}) + +collection.upsert_documents(documents) +collection.generate_chunks() + +# SentenceTransformer model trained specifically for embedding tabular data for retrieval tasks +model_id = collection.register_model(model_name="deepset/all-mpnet-base-v2-table") +collection.generate_embeddings(model_id=model_id) + +start = time() +query = "which 
country has the highest GDP in 2020?" +results = collection.vector_search(query, top_k=5, model_id=model_id) +_end = time() +console.print("\nResults for '%s'" % (query), style="bold") +console.print(results) +console.print("Query time = %0.3f" % (_end - start)) + +db.archive_collection(collection_name) diff --git a/pgml-sdks/python/pgml/examples/vector_search.ipynb b/pgml-sdks/python/pgml/examples/vector_search.ipynb deleted file mode 100644 index ce29791fa..000000000 --- a/pgml-sdks/python/pgml/examples/vector_search.ipynb +++ /dev/null @@ -1,257 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from pgml import Database\n", - "import os\n", - "import json" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "local_pgml = \"postgres://postgres@127.0.0.1:5433/pgml_development\"\n", - "\n", - "conninfo = os.environ.get(\"PGML_CONNECTION\",local_pgml)\n", - "db = Database(conninfo,min_connections=4)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "collection_name = \"test_collection_1\"\n", - "collection = db.create_or_get_collection(collection_name)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "missing FROM-clause entry for table \"test_collection_1\"\n", - "LINE 1: ...forms (table_name, task, model, splitter) VALUES (test_colle...\n", - " ^\n" - ] - } - ], - "source": [ - "conn = db.pool.getconn()\n", - "emb_table = collection._create_or_get_embeddings_table(conn, model_id=1,splitter_id=1)\n", - "db.pool.putconn(conn)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from datasets import load_dataset\n", - "\n", - "data = load_dataset(\"squad\", split=\"train\")\n", - "data = 
data.to_pandas()\n", - "data.head()\n", - "\n", - "data = data.drop_duplicates(subset=[\"context\"])\n", - "print(len(data))\n", - "data.head()\n", - "\n", - "documents = [\n", - " {\n", - " 'text': r['context'],\n", - " 'metadata': {\n", - " 'title': r['title']\n", - " }\n", - " } for r in data.to_dict(orient='records')\n", - "]\n", - "documents[:3]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.upsert_documents(documents[0:200])\n", - "collection.generate_chunks()\n", - "collection.generate_embeddings()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2)\n", - "print(json.dumps(results,indent=2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.register_model(model_name=\"paraphrase-MiniLM-L6-v2\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.get_models()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(json.dumps(collection.get_models(),indent=2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.generate_embeddings(model_id=2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2, model_id=2)\n", - "print(json.dumps(results,indent=2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.register_model(model_name=\"hkunlp/instructor-xl\", model_params={\"instruction\": \"Represent the Wikipedia document for retrieval: \"})" - ] - }, - { 
- "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.get_models()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.generate_embeddings(model_id=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2, model_id=3, query_parameters={\"instruction\": \"Represent the Wikipedia question for retrieving supporting documents: \"})\n", - "print(json.dumps(results,indent=2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.register_text_splitter(splitter_name=\"RecursiveCharacterTextSplitter\",splitter_params={\"chunk_size\": 100,\"chunk_overlap\": 20})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.generate_chunks(splitter_id=2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.generate_embeddings(splitter_id=2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2, splitter_id=2)\n", - "print(json.dumps(results,indent=2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "db.delete_collection(collection_name)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pgml-zoggicR5-py3.11", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - 
"version": "3.11.3" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/pgml-sdks/python/pgml/pgml/collection.py b/pgml-sdks/python/pgml/pgml/collection.py index 38a8498b9..e49df3d17 100644 --- a/pgml-sdks/python/pgml/pgml/collection.py +++ b/pgml-sdks/python/pgml/pgml/collection.py @@ -438,7 +438,7 @@ def register_model( task: Optional[str] = "embedding", model_name: Optional[str] = "intfloat/e5-small", model_params: Optional[Dict[str, Any]] = {}, - ) -> None: + ) -> int: """ This function registers a model in a database if it does not already exist. @@ -777,5 +777,8 @@ def vector_search( search_results = run_select_statement(conn, cte_select_statement) self.pool.putconn(conn) + + # Sort the list of dictionaries based on the 'score' key in descending order + search_results = sorted(search_results, key=lambda x: x['score'], reverse=True) return search_results From 26216f95ed5dbf6a86d5c69775dab55bd3573e0a Mon Sep 17 00:00:00 2001 From: Santi Adavani Date: Wed, 31 May 2023 10:24:46 -0700 Subject: [PATCH 6/7] Examples and updates to README --- pgml-sdks/python/pgml/README.md | 17 +++++++++++++++-- pgml-sdks/python/pgml/examples/README.md | 19 +++++++++++++++++++ .../examples/question_answering_instructor.py | 16 +++++++++++++--- .../pgml/examples/table_question_answering.py | 9 ++++++++- 4 files changed, 55 insertions(+), 6 deletions(-) create mode 100644 pgml-sdks/python/pgml/examples/README.md diff --git a/pgml-sdks/python/pgml/README.md b/pgml-sdks/python/pgml/README.md index 33487fdb1..6368675a1 100644 --- a/pgml-sdks/python/pgml/README.md +++ b/pgml-sdks/python/pgml/README.md @@ -1,10 +1,14 @@ -# Table of Contents +# Open Source Alternative for Building End-to-End Vector Search Applications without OpenAI & Pinecone + +## Table of Contents - [Overview](#overview) - [Quickstart](#quickstart) - [Usage](#usage) +- [Examples](./examples/README.md) - [Developer setup](#developer-setup) - [API Reference](#api-reference) +- 
[Roadmap](#roadmap) ## Overview Python SDK is designed to facilitate the development of scalable vector search applications on PostgreSQL databases. With this SDK, you can seamlessly manage various database tables related to documents, text chunks, text splitters, LLM (Language Model) models, and embeddings. By leveraging the SDK's capabilities, you can efficiently index LLM embeddings using PgVector for fast and accurate queries. @@ -274,4 +278,13 @@ LOGLEVEL=INFO python -m unittest tests/test_collection.py ### API Reference - [Database](./docs/pgml/database.md) -- [Collection](./docs/pgml/collection.md) \ No newline at end of file +- [Collection](./docs/pgml/collection.md) + +### Roadmap + +- Enable filters on document metadata in `vector_search`. [Issue](https://github.com/postgresml/postgresml/issues/663) +- `text_search` functionality on documents using Postgres text search. [Issue](https://github.com/postgresml/postgresml/issues/664) +- `hybrid_search` functionality that does a combination of `vector_search` and `text_search` in an order specified by the user. [Issue](https://github.com/postgresml/postgresml/issues/665) +- Ability to call and manage OpenAI embeddings for comparison purposes. [Issue](https://github.com/postgresml/postgresml/issues/666) +- Save `vector_search` history for downstream monitoring of model performance. [Issue](https://github.com/postgresml/postgresml/issues/667) +- Perform chunking on the DB with multiple langchain splitters. [Issue](https://github.com/postgresml/postgresml/issues/668) \ No newline at end of file diff --git a/pgml-sdks/python/pgml/examples/README.md b/pgml-sdks/python/pgml/examples/README.md new file mode 100644 index 000000000..a77848eff --- /dev/null +++ b/pgml-sdks/python/pgml/examples/README.md @@ -0,0 +1,19 @@ +## Examples + +### [Semantic Search](./semantic_search.py) +This is a basic example to perform semantic search on a collection of documents. 
It loads the Quora dataset, creates a collection in a PostgreSQL database, upserts documents, generates chunks and embeddings, and then performs a vector search on a query. Embeddings are created using `intfloat/e5-small` model. The results are semantically similar documents to the query. Finally, the collection is archived. 
+
+### [Question Answering](./question_answering.py)
+This is an example to find documents relevant to a question from the collection of documents. It loads the Stanford Question Answering Dataset (SQuAD) into the database, generates chunks and embeddings. Query is passed to vector search to retrieve documents that match closely in the embeddings space. A score is returned with each of the search results.
+
+### [Question Answering using Instructor Model](./question_answering_instructor.py)
+In this example, we will use `hkunlp/instructor-base` model to build text embeddings instead of the default `intfloat/e5-small` model. We will show how to use `register_model` method and use the `model_id` to build and query embeddings.
+
+### [Extractive Question Answering](./extractive_question_answering.py)
+In this example, we will show how to use `vector_search` result as a `context` to a HuggingFace question answering model. We will use `pgml.transform` to run the model on the database.
+
+### [Table Question Answering](./table_question_answering.py)
+In this example, we will use [Open Table-and-Text Question Answering (OTT-QA)
+](https://github.com/wenhuchen/OTT-QA) dataset to run queries on tables. We will use `deepset/all-mpnet-base-v2-table` model that is trained for embedding tabular data for retrieval tasks.
+ + diff --git a/pgml-sdks/python/pgml/examples/question_answering_instructor.py b/pgml-sdks/python/pgml/examples/question_answering_instructor.py index 7149ce9c6..6ed49f27e 100644 --- a/pgml-sdks/python/pgml/examples/question_answering_instructor.py +++ b/pgml-sdks/python/pgml/examples/question_answering_instructor.py @@ -30,13 +30,23 @@ collection.upsert_documents(documents[:200]) collection.generate_chunks() -#register instructor model -model_id = collection.register_model(model_name="hkunlp/instructor-base", model_params={"instruction": "Represent the Wikipedia document for retrieval: "}) +# register instructor model +model_id = collection.register_model( + model_name="hkunlp/instructor-base", + model_params={"instruction": "Represent the Wikipedia document for retrieval: "}, +) collection.generate_embeddings(model_id=model_id) start = time() query = "Who won 20 grammy awards?" -results = collection.vector_search(query, top_k=5, model_id = model_id, query_parameters={"instruction": "Represent the Wikipedia question for retrieving supporting documents: "}) +results = collection.vector_search( + query, + top_k=5, + model_id=model_id, + query_parameters={ + "instruction": "Represent the Wikipedia question for retrieving supporting documents: " + }, +) _end = time() console.print("\nResults for '%s'" % (query), style="bold") console.print(results) diff --git a/pgml-sdks/python/pgml/examples/table_question_answering.py b/pgml-sdks/python/pgml/examples/table_question_answering.py index f0e5935c5..f208e3392 100644 --- a/pgml-sdks/python/pgml/examples/table_question_answering.py +++ b/pgml-sdks/python/pgml/examples/table_question_answering.py @@ -29,7 +29,14 @@ for doc in track(data): table = pd.DataFrame(doc["data"], columns=doc["header"]) processed_table = "\n".join([table.to_csv(index=False)]) - documents.append({"text": processed_table, "title": doc["title"], "url": doc["url"], "uid": doc["uid"]}) + documents.append( + { + "text": processed_table, + "title": 
doc["title"], + "url": doc["url"], + "uid": doc["uid"], + } + ) collection.upsert_documents(documents) collection.generate_chunks() From 9fa410497b2dcc0f7ab6d8e08c3b173db3674749 Mon Sep 17 00:00:00 2001 From: Santi Adavani Date: Thu, 1 Jun 2023 11:18:40 -0700 Subject: [PATCH 7/7] vector search sorting by score done in sql query --- pgml-sdks/python/pgml/pgml/collection.py | 7 +++--- pgml-sdks/python/pgml/pgml/dbutils.py | 29 ++++++++++++++++++++---- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/pgml-sdks/python/pgml/pgml/collection.py b/pgml-sdks/python/pgml/pgml/collection.py index e49df3d17..7501d0196 100644 --- a/pgml-sdks/python/pgml/pgml/collection.py +++ b/pgml-sdks/python/pgml/pgml/collection.py @@ -775,10 +775,9 @@ def vector_search( documents_table=self.documents_table, ) - search_results = run_select_statement(conn, cte_select_statement) + search_results = run_select_statement( + conn, cte_select_statement, order_by="score", ascending=False + ) self.pool.putconn(conn) - - # Sort the list of dictionaries based on the 'score' key in descending order - search_results = sorted(search_results, key=lambda x: x['score'], reverse=True) return search_results diff --git a/pgml-sdks/python/pgml/pgml/dbutils.py b/pgml-sdks/python/pgml/pgml/dbutils.py index 5d58b56a5..95ce5b003 100644 --- a/pgml-sdks/python/pgml/pgml/dbutils.py +++ b/pgml-sdks/python/pgml/pgml/dbutils.py @@ -52,7 +52,9 @@ def run_create_or_insert_statement( cur.close() -def run_select_statement(conn: Connection, statement: str) -> List[Any]: +def run_select_statement( + conn: Connection, statement: str, order_by: str = "", ascending: bool = True +) -> List[Any]: """ The function runs a select statement on a database connection and returns the results as a list of dictionaries. 
@@ -70,12 +72,29 @@ def run_select_statement(conn: Connection, statement: str) -> List[Any]: statement = statement.strip().rstrip(";") cur = conn.cursor() - json_conversion_statement = """ - SELECT array_to_json(array_agg(row_to_json(t))) + order_statement = "" + if order_by: + order_statement = "ORDER BY t.%s" % order_by + if ascending: + order_statement += " ASC" + else: + order_statement += " DESC" + + if order_statement: + json_conversion_statement = """ + SELECT array_to_json(array_agg(row_to_json(t) {order_statement})) FROM ({select_statement}) t; """.format( - select_statement=statement - ) + select_statement=statement, + order_statement=order_statement, + ) + else: + json_conversion_statement = """ + SELECT array_to_json(array_agg(row_to_json(t))) + FROM ({select_statement}) t; + """.format( + select_statement=statement + ) log.info("Running %s .. " % json_conversion_statement) cur.execute(json_conversion_statement) results = cur.fetchall() pFad - Phonifier reborn
