From 74efc3958e4cea30994b3fef1a5db6b58fe1f464 Mon Sep 17 00:00:00 2001 From: Santi Adavani Date: Tue, 30 May 2023 09:57:17 -0700 Subject: [PATCH 1/7] semantic search example --- .../pgml/examples/semantic_search.ipynb | 235 ++++++++++++++++++ .../python/pgml/examples/semantic_search.py | 33 +++ .../python/pgml/examples/vector_search.ipynb | 48 +++- 3 files changed, 312 insertions(+), 4 deletions(-) create mode 100644 pgml-sdks/python/pgml/examples/semantic_search.ipynb create mode 100644 pgml-sdks/python/pgml/examples/semantic_search.py diff --git a/pgml-sdks/python/pgml/examples/semantic_search.ipynb b/pgml-sdks/python/pgml/examples/semantic_search.ipynb new file mode 100644 index 000000000..d9160f6d3 --- /dev/null +++ b/pgml-sdks/python/pgml/examples/semantic_search.ipynb @@ -0,0 +1,235 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "dataset = load_dataset('quora', split='train')\n", + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "questions = []\n", + "\n", + "for record in dataset['questions']:\n", + " questions.extend(record['text'])\n", + " \n", + "# remove duplicates\n", + "documents = []\n", + "for question in list(set(questions)):\n", + " if question:\n", + " documents.append({\"text\": question})\n", + "\n", + "len(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "from pgml import Database\n", + "import os\n", + "from rich import print as rprint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%env PGML_CONNECTION=postgres://u_iez3sqwpzd7n1yd:aiHpsKfyXwr9fa6us1zlbyJuY@sql.cloud.postgresml.org:6432/pgml_iez3sqwpzd7n1yd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [ + "local_pgml = \"postgres://postgres@127.0.0.1:5433/pgml_development\"\n", + "\n", + "conninfo = os.environ.get(\"PGML_CONNECTION\",local_pgml)\n", + "db = Database(conninfo,min_connections=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "collection_name = \"quora_collection\"\n", + "collection = db.create_or_get_collection(collection_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[43], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m collection\u001b[39m.\u001b[39;49mupsert_documents(documents[:\u001b[39m10000\u001b[39;49m])\n", + "File \u001b[0;32m~/Hyperparam/postgresml/pgml-sdks/python/pgml/pgml/collection.py:291\u001b[0m, in \u001b[0;36mCollection.upsert_documents\u001b[0;34m(self, documents, text_key, id_key)\u001b[0m\n\u001b[1;32m 265\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mupsert_documents\u001b[39m(\n\u001b[1;32m 266\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[1;32m 267\u001b[0m documents: List[Dict[\u001b[39mstr\u001b[39m, Any]],\n\u001b[1;32m 268\u001b[0m text_key: Optional[\u001b[39mstr\u001b[39m] \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 269\u001b[0m id_key: Optional[\u001b[39mstr\u001b[39m] \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mid\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 270\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 271\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 272\u001b[0m \u001b[39m The function `upsert_documents` inserts or updates documents in a 
database table based on their ID,\u001b[39;00m\n\u001b[1;32m 273\u001b[0m \u001b[39m text, and metadata.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 289\u001b[0m \u001b[39m upsert process. If set to False, only essential information will be printed, defaults to False\u001b[39;00m\n\u001b[1;32m 290\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 291\u001b[0m conn \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mpool\u001b[39m.\u001b[39;49mgetconn()\n\u001b[1;32m 292\u001b[0m \u001b[39mfor\u001b[39;00m document \u001b[39min\u001b[39;00m track(documents, description\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mUpserting documents\u001b[39m\u001b[39m\"\u001b[39m):\n\u001b[1;32m 293\u001b[0m \u001b[39mif\u001b[39;00m text_key \u001b[39min\u001b[39;00m \u001b[39mlist\u001b[39m(document\u001b[39m.\u001b[39mkeys()):\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/pgml-zoggicR5-py3.11/lib/python3.11/site-packages/psycopg_pool/pool.py:182\u001b[0m, in \u001b[0;36mConnectionPool.getconn\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 180\u001b[0m timeout \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtimeout\n\u001b[1;32m 181\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 182\u001b[0m conn \u001b[39m=\u001b[39m pos\u001b[39m.\u001b[39;49mwait(timeout\u001b[39m=\u001b[39;49mtimeout)\n\u001b[1;32m 183\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[1;32m 184\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_stats[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_REQUESTS_ERRORS] \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/pgml-zoggicR5-py3.11/lib/python3.11/site-packages/psycopg_pool/pool.py:745\u001b[0m, in \u001b[0;36mWaitingClient.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 743\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 744\u001b[0m 
\u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror\n\u001b[0;32m--> 745\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/pgml-zoggicR5-py3.11/lib/python3.11/site-packages/psycopg_pool/pool.py:734\u001b[0m, in \u001b[0;36mWaitingClient.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 732\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconn \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror):\n\u001b[1;32m 733\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 734\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_cond\u001b[39m.\u001b[39;49mwait(timeout):\n\u001b[1;32m 735\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror \u001b[39m=\u001b[39m PoolTimeout(\n\u001b[1;32m 736\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mcouldn\u001b[39m\u001b[39m'\u001b[39m\u001b[39mt get a connection after \u001b[39m\u001b[39m{\u001b[39;00mtimeout\u001b[39m}\u001b[39;00m\u001b[39m sec\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 737\u001b[0m )\n\u001b[1;32m 738\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m ex:\n", + "File \u001b[0;32m/opt/homebrew/Cellar/python@3.11/3.11.3/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py:324\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 322\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 323\u001b[0m \u001b[39mif\u001b[39;00m timeout \u001b[39m>\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[0;32m--> 324\u001b[0m gotit \u001b[39m=\u001b[39m waiter\u001b[39m.\u001b[39;49macquire(\u001b[39mTrue\u001b[39;49;00m, timeout)\n\u001b[1;32m 325\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 326\u001b[0m gotit 
\u001b[39m=\u001b[39m waiter\u001b[39m.\u001b[39macquire(\u001b[39mFalse\u001b[39;00m)\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "collection.upsert_documents(documents[:20000])" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2KGenerating chunks \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:01:25\u001b[0m00:01\u001b[0m00:03\u001b[0m\n", + "\u001b[?25h" + ] + } + ], + "source": [ + "collection.generate_chunks()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating embeddings using intfloat/e5-small \u001b[33m...\u001b[0m \n" + ] + } + ], + "source": [ + "collection.generate_embeddings()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "ename": "PoolTimeout", + "evalue": "couldn't get a connection after 30.0 sec", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mPoolTimeout\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[42], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m query \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mwhat are haiku poems?\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m----> 2\u001b[0m result \u001b[39m=\u001b[39m collection\u001b[39m.\u001b[39;49mvector_search(query)\n", + "File \u001b[0;32m~/Hyperparam/postgresml/pgml-sdks/python/pgml/pgml/collection.py:711\u001b[0m, in \u001b[0;36mCollection.vector_search\u001b[0;34m(self, query, query_parameters, top_k, model_id, splitter_id)\u001b[0m\n\u001b[1;32m 679\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mvector_search\u001b[39m(\n\u001b[1;32m 680\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[1;32m 
681\u001b[0m query: \u001b[39mstr\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 685\u001b[0m splitter_id: \u001b[39mint\u001b[39m \u001b[39m=\u001b[39m \u001b[39m1\u001b[39m,\n\u001b[1;32m 686\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m List[Dict[\u001b[39mstr\u001b[39m, Any]]:\n\u001b[1;32m 687\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 688\u001b[0m \u001b[39m This function performs a vector search on a database using a query and returns the top matching\u001b[39;00m\n\u001b[1;32m 689\u001b[0m \u001b[39m results.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 709\u001b[0m \u001b[39m with the search result\u001b[39;00m\n\u001b[1;32m 710\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 711\u001b[0m conn \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mpool\u001b[39m.\u001b[39;49mgetconn()\n\u001b[1;32m 713\u001b[0m \u001b[39mif\u001b[39;00m model_id \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_cache_model_names\u001b[39m.\u001b[39mkeys():\n\u001b[1;32m 714\u001b[0m model \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_cache_model_names[model_id]\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/pgml-zoggicR5-py3.11/lib/python3.11/site-packages/psycopg_pool/pool.py:182\u001b[0m, in \u001b[0;36mConnectionPool.getconn\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 180\u001b[0m timeout \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtimeout\n\u001b[1;32m 181\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 182\u001b[0m conn \u001b[39m=\u001b[39m pos\u001b[39m.\u001b[39;49mwait(timeout\u001b[39m=\u001b[39;49mtimeout)\n\u001b[1;32m 183\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[1;32m 184\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_stats[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_REQUESTS_ERRORS] \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m 
\u001b[39m1\u001b[39m\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/pgml-zoggicR5-py3.11/lib/python3.11/site-packages/psycopg_pool/pool.py:745\u001b[0m, in \u001b[0;36mWaitingClient.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 743\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 744\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror\n\u001b[0;32m--> 745\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror\n", + "\u001b[0;31mPoolTimeout\u001b[0m: couldn't get a connection after 30.0 sec" + ] + } + ], + "source": [ + "query = \"which city has the highest population in the world?\"\n", + "result = collection.vector_search(query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m[\u001b[0m\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[32m'score'\u001b[0m: \u001b[1;36m0.9226062759447833\u001b[0m,\n", + " \u001b[32m'chunk'\u001b[0m: \u001b[32m'What are some examples of haiku poems about animals?'\u001b[0m,\n", + " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[32m'score'\u001b[0m: \u001b[1;36m0.8803199702518053\u001b[0m,\n", + " \u001b[32m'chunk'\u001b[0m: \u001b[32m'How do poems have rhythm?'\u001b[0m,\n", + " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[32m'score'\u001b[0m: \u001b[1;36m0.8562827400386631\u001b[0m,\n", + " \u001b[32m'chunk'\u001b[0m: \u001b[32m'How do you get inspiration to write a poem?'\u001b[0m,\n", + " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[32m'score'\u001b[0m: \u001b[1;36m0.8304190744787252\u001b[0m,\n", + " \u001b[32m'chunk'\u001b[0m: 
\u001b[32m'What are some great examples of witty exchanges?'\u001b[0m,\n", + " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[32m'score'\u001b[0m: \u001b[1;36m0.8221808330080992\u001b[0m,\n", + " \u001b[32m'chunk'\u001b[0m: \u001b[32m'Which are some inspirational books I can read?'\u001b[0m,\n", + " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + "\u001b[1m]\u001b[0m\n" + ] + } + ], + "source": [ + "rprint(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pgml-zoggicR5-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pgml-sdks/python/pgml/examples/semantic_search.py b/pgml-sdks/python/pgml/examples/semantic_search.py new file mode 100644 index 000000000..853a863ec --- /dev/null +++ b/pgml-sdks/python/pgml/examples/semantic_search.py @@ -0,0 +1,33 @@ +from datasets import load_dataset +from pgml import Database +import os +from rich import print as rprint + +dataset = load_dataset('quora', split='train') +questions = [] + +for record in dataset['questions']: + questions.extend(record['text']) + +# remove duplicates +documents = [] +for question in list(set(questions)): + if question: + documents.append({"text": question}) + + +local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development" + +conninfo = os.environ.get("PGML_CONNECTION",local_pgml) +db = Database(conninfo,min_connections=4) + +collection_name = "quora_collection" +collection = 
db.create_or_get_collection(collection_name) + +collection.upsert_documents(documents[:20]) +collection.generate_chunks() +collection.generate_embeddings() + +query = "which city has the highest population in the world?" +result = collection.vector_search(query) +rprint(result) \ No newline at end of file diff --git a/pgml-sdks/python/pgml/examples/vector_search.ipynb b/pgml-sdks/python/pgml/examples/vector_search.ipynb index e14e8b4f2..1c30f8647 100644 --- a/pgml-sdks/python/pgml/examples/vector_search.ipynb +++ b/pgml-sdks/python/pgml/examples/vector_search.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -13,7 +13,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: PGML_CONNECTION=postgres://u_iez3sqwpzd7n1yd:aiHpsKfyXwr9fa6us1zlbyJuY@sql.cloud.postgresml.org:6432/pgml_iez3sqwpzd7n1yd\n", + "env: LOGLEVEL=INFO\n" + ] + } + ], + "source": [ + "%env PGML_CONNECTION=postgres://u_iez3sqwpzd7n1yd:aiHpsKfyXwr9fa6us1zlbyJuY@sql.cloud.postgresml.org:6432/pgml_iez3sqwpzd7n1yd\n", + "%env LOGLEVEL=INFO" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -25,14 +44,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "collection_name = \"test_pgml_sdk_1\"\n", + "collection_name = \"test_collection_1\"\n", "collection = db.create_or_get_collection(collection_name)" ] }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "missing FROM-clause entry for table \"test_collection_1\"\n", + "LINE 1: ...forms (table_name, task, model, splitter) VALUES (test_colle...\n", + " ^\n" + ] + } + ], + "source": [ + "conn = 
db.pool.getconn()\n", + "emb_table = collection._create_or_get_embeddings_table(conn, model_id=1,splitter_id=1)\n", + "db.pool.putconn(conn)" + ] + }, { "cell_type": "code", "execution_count": null, From 1aa2c33d5bfaec3f4f9d93808e4c8d76ab691281 Mon Sep 17 00:00:00 2001 From: Santi Adavani Date: Tue, 30 May 2023 10:03:53 -0700 Subject: [PATCH 2/7] Removed env --- pgml-sdks/python/pgml/examples/semantic_search.ipynb | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pgml-sdks/python/pgml/examples/semantic_search.ipynb b/pgml-sdks/python/pgml/examples/semantic_search.ipynb index d9160f6d3..479e4a001 100644 --- a/pgml-sdks/python/pgml/examples/semantic_search.ipynb +++ b/pgml-sdks/python/pgml/examples/semantic_search.ipynb @@ -42,15 +42,6 @@ "from rich import print as rprint" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%env PGML_CONNECTION=postgres://u_iez3sqwpzd7n1yd:aiHpsKfyXwr9fa6us1zlbyJuY@sql.cloud.postgresml.org:6432/pgml_iez3sqwpzd7n1yd" - ] - }, { "cell_type": "code", "execution_count": null, From 4ab0a60ffeb84e89b3ba23d6cebb0e9b9ef3b4bc Mon Sep 17 00:00:00 2001 From: Santi Adavani Date: Tue, 30 May 2023 11:51:55 -0700 Subject: [PATCH 3/7] Added semantic search --- .../pgml/examples/semantic_search.ipynb | 226 ------------------ .../python/pgml/examples/semantic_search.py | 13 +- pgml-sdks/python/pgml/poetry.lock | 19 +- pgml-sdks/python/pgml/pyproject.toml | 1 + 4 files changed, 30 insertions(+), 229 deletions(-) delete mode 100644 pgml-sdks/python/pgml/examples/semantic_search.ipynb diff --git a/pgml-sdks/python/pgml/examples/semantic_search.ipynb b/pgml-sdks/python/pgml/examples/semantic_search.ipynb deleted file mode 100644 index 479e4a001..000000000 --- a/pgml-sdks/python/pgml/examples/semantic_search.ipynb +++ /dev/null @@ -1,226 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from datasets import 
load_dataset\n", - "dataset = load_dataset('quora', split='train')\n", - "dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "questions = []\n", - "\n", - "for record in dataset['questions']:\n", - " questions.extend(record['text'])\n", - " \n", - "# remove duplicates\n", - "documents = []\n", - "for question in list(set(questions)):\n", - " if question:\n", - " documents.append({\"text\": question})\n", - "\n", - "len(documents)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "from pgml import Database\n", - "import os\n", - "from rich import print as rprint" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "local_pgml = \"postgres://postgres@127.0.0.1:5433/pgml_development\"\n", - "\n", - "conninfo = os.environ.get(\"PGML_CONNECTION\",local_pgml)\n", - "db = Database(conninfo,min_connections=4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection_name = \"quora_collection\"\n", - "collection = db.create_or_get_collection(collection_name)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[43], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m collection\u001b[39m.\u001b[39;49mupsert_documents(documents[:\u001b[39m10000\u001b[39;49m])\n", - "File \u001b[0;32m~/Hyperparam/postgresml/pgml-sdks/python/pgml/pgml/collection.py:291\u001b[0m, in \u001b[0;36mCollection.upsert_documents\u001b[0;34m(self, documents, text_key, id_key)\u001b[0m\n\u001b[1;32m 265\u001b[0m 
\u001b[39mdef\u001b[39;00m \u001b[39mupsert_documents\u001b[39m(\n\u001b[1;32m 266\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[1;32m 267\u001b[0m documents: List[Dict[\u001b[39mstr\u001b[39m, Any]],\n\u001b[1;32m 268\u001b[0m text_key: Optional[\u001b[39mstr\u001b[39m] \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 269\u001b[0m id_key: Optional[\u001b[39mstr\u001b[39m] \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mid\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 270\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 271\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 272\u001b[0m \u001b[39m The function `upsert_documents` inserts or updates documents in a database table based on their ID,\u001b[39;00m\n\u001b[1;32m 273\u001b[0m \u001b[39m text, and metadata.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 289\u001b[0m \u001b[39m upsert process. 
If set to False, only essential information will be printed, defaults to False\u001b[39;00m\n\u001b[1;32m 290\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 291\u001b[0m conn \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mpool\u001b[39m.\u001b[39;49mgetconn()\n\u001b[1;32m 292\u001b[0m \u001b[39mfor\u001b[39;00m document \u001b[39min\u001b[39;00m track(documents, description\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mUpserting documents\u001b[39m\u001b[39m\"\u001b[39m):\n\u001b[1;32m 293\u001b[0m \u001b[39mif\u001b[39;00m text_key \u001b[39min\u001b[39;00m \u001b[39mlist\u001b[39m(document\u001b[39m.\u001b[39mkeys()):\n", - "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/pgml-zoggicR5-py3.11/lib/python3.11/site-packages/psycopg_pool/pool.py:182\u001b[0m, in \u001b[0;36mConnectionPool.getconn\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 180\u001b[0m timeout \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtimeout\n\u001b[1;32m 181\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 182\u001b[0m conn \u001b[39m=\u001b[39m pos\u001b[39m.\u001b[39;49mwait(timeout\u001b[39m=\u001b[39;49mtimeout)\n\u001b[1;32m 183\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[1;32m 184\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_stats[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_REQUESTS_ERRORS] \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n", - "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/pgml-zoggicR5-py3.11/lib/python3.11/site-packages/psycopg_pool/pool.py:745\u001b[0m, in \u001b[0;36mWaitingClient.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 743\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 744\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror\n\u001b[0;32m--> 745\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror\n", - "File 
\u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/pgml-zoggicR5-py3.11/lib/python3.11/site-packages/psycopg_pool/pool.py:734\u001b[0m, in \u001b[0;36mWaitingClient.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 732\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mconn \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror):\n\u001b[1;32m 733\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 734\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_cond\u001b[39m.\u001b[39;49mwait(timeout):\n\u001b[1;32m 735\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror \u001b[39m=\u001b[39m PoolTimeout(\n\u001b[1;32m 736\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mcouldn\u001b[39m\u001b[39m'\u001b[39m\u001b[39mt get a connection after \u001b[39m\u001b[39m{\u001b[39;00mtimeout\u001b[39m}\u001b[39;00m\u001b[39m sec\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 737\u001b[0m )\n\u001b[1;32m 738\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m ex:\n", - "File \u001b[0;32m/opt/homebrew/Cellar/python@3.11/3.11.3/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py:324\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 322\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 323\u001b[0m \u001b[39mif\u001b[39;00m timeout \u001b[39m>\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[0;32m--> 324\u001b[0m gotit \u001b[39m=\u001b[39m waiter\u001b[39m.\u001b[39;49macquire(\u001b[39mTrue\u001b[39;49;00m, timeout)\n\u001b[1;32m 325\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 326\u001b[0m gotit \u001b[39m=\u001b[39m waiter\u001b[39m.\u001b[39macquire(\u001b[39mFalse\u001b[39;00m)\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], - "source": [ - "collection.upsert_documents(documents[:20000])" - ] - 
}, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2KGenerating chunks \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[33m0:01:25\u001b[0m00:01\u001b[0m00:03\u001b[0m\n", - "\u001b[?25h" - ] - } - ], - "source": [ - "collection.generate_chunks()" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Generating embeddings using intfloat/e5-small \u001b[33m...\u001b[0m \n" - ] - } - ], - "source": [ - "collection.generate_embeddings()" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "ename": "PoolTimeout", - "evalue": "couldn't get a connection after 30.0 sec", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mPoolTimeout\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[42], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m query \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mwhat are haiku poems?\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m----> 2\u001b[0m result \u001b[39m=\u001b[39m collection\u001b[39m.\u001b[39;49mvector_search(query)\n", - "File \u001b[0;32m~/Hyperparam/postgresml/pgml-sdks/python/pgml/pgml/collection.py:711\u001b[0m, in \u001b[0;36mCollection.vector_search\u001b[0;34m(self, query, query_parameters, top_k, model_id, splitter_id)\u001b[0m\n\u001b[1;32m 679\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mvector_search\u001b[39m(\n\u001b[1;32m 680\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[1;32m 681\u001b[0m query: \u001b[39mstr\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 685\u001b[0m splitter_id: \u001b[39mint\u001b[39m \u001b[39m=\u001b[39m \u001b[39m1\u001b[39m,\n\u001b[1;32m 686\u001b[0m ) 
\u001b[39m-\u001b[39m\u001b[39m>\u001b[39m List[Dict[\u001b[39mstr\u001b[39m, Any]]:\n\u001b[1;32m 687\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 688\u001b[0m \u001b[39m This function performs a vector search on a database using a query and returns the top matching\u001b[39;00m\n\u001b[1;32m 689\u001b[0m \u001b[39m results.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 709\u001b[0m \u001b[39m with the search result\u001b[39;00m\n\u001b[1;32m 710\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 711\u001b[0m conn \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mpool\u001b[39m.\u001b[39;49mgetconn()\n\u001b[1;32m 713\u001b[0m \u001b[39mif\u001b[39;00m model_id \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_cache_model_names\u001b[39m.\u001b[39mkeys():\n\u001b[1;32m 714\u001b[0m model \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_cache_model_names[model_id]\n", - "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/pgml-zoggicR5-py3.11/lib/python3.11/site-packages/psycopg_pool/pool.py:182\u001b[0m, in \u001b[0;36mConnectionPool.getconn\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 180\u001b[0m timeout \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtimeout\n\u001b[1;32m 181\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 182\u001b[0m conn \u001b[39m=\u001b[39m pos\u001b[39m.\u001b[39;49mwait(timeout\u001b[39m=\u001b[39;49mtimeout)\n\u001b[1;32m 183\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[1;32m 184\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_stats[\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_REQUESTS_ERRORS] \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n", - "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/pgml-zoggicR5-py3.11/lib/python3.11/site-packages/psycopg_pool/pool.py:745\u001b[0m, in \u001b[0;36mWaitingClient.wait\u001b[0;34m(self, 
timeout)\u001b[0m\n\u001b[1;32m 743\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 744\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror\n\u001b[0;32m--> 745\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merror\n", - "\u001b[0;31mPoolTimeout\u001b[0m: couldn't get a connection after 30.0 sec" - ] - } - ], - "source": [ - "query = \"which city has the highest population in the world?\"\n", - "result = collection.vector_search(query)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1m[\u001b[0m\n", - " \u001b[1m{\u001b[0m\n", - " \u001b[32m'score'\u001b[0m: \u001b[1;36m0.9226062759447833\u001b[0m,\n", - " \u001b[32m'chunk'\u001b[0m: \u001b[32m'What are some examples of haiku poems about animals?'\u001b[0m,\n", - " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", - " \u001b[1m}\u001b[0m,\n", - " \u001b[1m{\u001b[0m\n", - " \u001b[32m'score'\u001b[0m: \u001b[1;36m0.8803199702518053\u001b[0m,\n", - " \u001b[32m'chunk'\u001b[0m: \u001b[32m'How do poems have rhythm?'\u001b[0m,\n", - " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", - " \u001b[1m}\u001b[0m,\n", - " \u001b[1m{\u001b[0m\n", - " \u001b[32m'score'\u001b[0m: \u001b[1;36m0.8562827400386631\u001b[0m,\n", - " \u001b[32m'chunk'\u001b[0m: \u001b[32m'How do you get inspiration to write a poem?'\u001b[0m,\n", - " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", - " \u001b[1m}\u001b[0m,\n", - " \u001b[1m{\u001b[0m\n", - " \u001b[32m'score'\u001b[0m: \u001b[1;36m0.8304190744787252\u001b[0m,\n", - " \u001b[32m'chunk'\u001b[0m: \u001b[32m'What are some great examples of witty exchanges?'\u001b[0m,\n", - " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", - " \u001b[1m}\u001b[0m,\n", - " \u001b[1m{\u001b[0m\n", - " 
\u001b[32m'score'\u001b[0m: \u001b[1;36m0.8221808330080992\u001b[0m,\n", - " \u001b[32m'chunk'\u001b[0m: \u001b[32m'Which are some inspirational books I can read?'\u001b[0m,\n", - " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", - " \u001b[1m}\u001b[0m\n", - "\u001b[1m]\u001b[0m\n" - ] - } - ], - "source": [ - "rprint(result)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pgml-zoggicR5-py3.11", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.3" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/pgml-sdks/python/pgml/examples/semantic_search.py b/pgml-sdks/python/pgml/examples/semantic_search.py index 853a863ec..65a1c3834 100644 --- a/pgml-sdks/python/pgml/examples/semantic_search.py +++ b/pgml-sdks/python/pgml/examples/semantic_search.py @@ -2,6 +2,12 @@ from pgml import Database import os from rich import print as rprint +from dotenv import load_dotenv +from time import time +from rich.console import Console + +load_dotenv() +console = Console() dataset = load_dataset('quora', split='train') questions = [] @@ -28,6 +34,11 @@ collection.generate_chunks() collection.generate_embeddings() +start = time() query = "which city has the highest population in the world?" 
result = collection.vector_search(query) -rprint(result) \ No newline at end of file +_end = time() + +console.print("\nResults for '%s'"%(query),style="bold") +console.print(result) +console.print("Query time = %0.3f"%(_end-start)) \ No newline at end of file diff --git a/pgml-sdks/python/pgml/poetry.lock b/pgml-sdks/python/pgml/poetry.lock index 87633c653..497468c5e 100644 --- a/pgml-sdks/python/pgml/poetry.lock +++ b/pgml-sdks/python/pgml/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. [[package]] name = "aiohttp" @@ -1832,6 +1832,21 @@ files = [ [package.dependencies] six = ">=1.5" +[[package]] +name = "python-dotenv" +version = "1.0.0" +description = "Read key-value pairs from a .env file and set them as environment variables" +category = "main" +optional = false +python-versions = ">=3.8" +files = [ + {file = "python-dotenv-1.0.0.tar.gz", hash = "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba"}, + {file = "python_dotenv-1.0.0-py3-none-any.whl", hash = "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"}, +] + +[package.extras] +cli = ["click (>=5.0)"] + [[package]] name = "pytz" version = "2023.3" @@ -2561,4 +2576,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "51765e8059c760223993d3145da644935c0f77b091f1f2ea355f81c6249a3e1b" +content-hash = "7d20ef57aee0494f39dfbd3250c2654112393c9c8e0ef9257bce5d72506f0378" diff --git a/pgml-sdks/python/pgml/pyproject.toml b/pgml-sdks/python/pgml/pyproject.toml index 72322a567..c9020806d 100644 --- a/pgml-sdks/python/pgml/pyproject.toml +++ b/pgml-sdks/python/pgml/pyproject.toml @@ -19,6 +19,7 @@ psycopg-pool = "^3.1.7" langchain = "^0.0.167" ipywidgets = "^8.0.6" datasets = "^2.12.0" +python-dotenv = 
"^1.0.0" [tool.poetry.group.dev.dependencies] From 037988c728f24f13b506d8aac8bb59ad50c6c6f6 Mon Sep 17 00:00:00 2001 From: Santi Adavani Date: Tue, 30 May 2023 13:56:40 -0700 Subject: [PATCH 4/7] Added extractive qa example --- .../examples/extractive_question_answering.py | 69 +++++++++++++++++++ ...vector_search.py => question_answering.py} | 20 ++++-- .../python/pgml/examples/semantic_search.py | 30 ++++---- .../python/pgml/examples/vector_search.ipynb | 19 ----- 4 files changed, 101 insertions(+), 37 deletions(-) create mode 100644 pgml-sdks/python/pgml/examples/extractive_question_answering.py rename pgml-sdks/python/pgml/examples/{vector_search.py => question_answering.py} (59%) diff --git a/pgml-sdks/python/pgml/examples/extractive_question_answering.py b/pgml-sdks/python/pgml/examples/extractive_question_answering.py new file mode 100644 index 000000000..02cac7071 --- /dev/null +++ b/pgml-sdks/python/pgml/examples/extractive_question_answering.py @@ -0,0 +1,69 @@ +from pgml import Database +import os +import json +from datasets import load_dataset +from time import time +from dotenv import load_dotenv +from rich.console import Console +from psycopg import sql +from pgml.dbutils import run_select_statement + +load_dotenv() +console = Console() + +local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development" + +conninfo = os.environ.get("PGML_CONNECTION", local_pgml) +db = Database(conninfo) + +collection_name = "squad_collection" +collection = db.create_or_get_collection(collection_name) + + +data = load_dataset("squad", split="train") +data = data.to_pandas() +data = data.drop_duplicates(subset=["context"]) + +documents = [ + {"id": r["id"], "text": r["context"], "title": r["title"]} + for r in data.to_dict(orient="records") +] + +collection.upsert_documents(documents[:200]) +collection.generate_chunks() +collection.generate_embeddings() + +start = time() +query = "Who won more than 20 grammy awards?" 
+results = collection.vector_search(query, top_k=5) +_end = time() +console.print("\nResults for '%s'" % (query), style="bold") +console.print(results) +console.print("Query time = %0.3f" % (_end - start)) + +# Get the context passage and use pgml.transform to get short answer to the question + + +conn = db.pool.getconn() +context = " ".join(results[0]["chunk"].strip().split()) +context = context.replace('"', '\\"').replace("'", "''") + +select_statement = """SELECT pgml.transform( + 'question-answering', + inputs => ARRAY[ + '{ + \"question\": \"%s\", + \"context\": \"%s\" + }' + ] +) AS answer;""" % ( + query, + context, +) + +results = run_select_statement(conn, select_statement) +db.pool.putconn(conn) + +console.print("\nResults for query '%s'" % query) +console.print(results) +db.archive_collection(collection_name) diff --git a/pgml-sdks/python/pgml/examples/vector_search.py b/pgml-sdks/python/pgml/examples/question_answering.py similarity index 59% rename from pgml-sdks/python/pgml/examples/vector_search.py rename to pgml-sdks/python/pgml/examples/question_answering.py index 4a55cdc10..a70930fb7 100644 --- a/pgml-sdks/python/pgml/examples/vector_search.py +++ b/pgml-sdks/python/pgml/examples/question_answering.py @@ -3,14 +3,18 @@ import json from datasets import load_dataset from time import time -from rich import print as rprint +from dotenv import load_dotenv +from rich.console import Console + +load_dotenv() +console = Console() local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development" conninfo = os.environ.get("PGML_CONNECTION", local_pgml) db = Database(conninfo) -collection_name = "test_pgml_sdk_1" +collection_name = "squad_collection" collection = db.create_or_get_collection(collection_name) @@ -19,7 +23,7 @@ data = data.drop_duplicates(subset=["context"]) documents = [ - {'id': r['id'], "text": r["context"], "title": r["title"]} + {"id": r["id"], "text": r["context"], "title": r["title"]} for r in data.to_dict(orient="records") ] @@ -28,7 
+32,11 @@ collection.generate_embeddings() start = time() -results = collection.vector_search("Who won 20 grammy awards?", top_k=2) -rprint("Query time %0.3f"%(time()-start)) -rprint(json.dumps(results, indent=2)) +query = "Who won 20 grammy awards?" +results = collection.vector_search(query, top_k=5) +_end = time() +console.print("\nResults for '%s'" % (query), style="bold") +console.print(results) +console.print("Query time = %0.3f" % (_end - start)) + db.archive_collection(collection_name) diff --git a/pgml-sdks/python/pgml/examples/semantic_search.py b/pgml-sdks/python/pgml/examples/semantic_search.py index 65a1c3834..2eafbf55b 100644 --- a/pgml-sdks/python/pgml/examples/semantic_search.py +++ b/pgml-sdks/python/pgml/examples/semantic_search.py @@ -9,12 +9,13 @@ load_dotenv() console = Console() -dataset = load_dataset('quora', split='train') +# Prepare Data +dataset = load_dataset("quora", split="train") questions = [] -for record in dataset['questions']: - questions.extend(record['text']) - +for record in dataset["questions"]: + questions.extend(record["text"]) + # remove duplicates documents = [] for question in list(set(questions)): @@ -22,23 +23,28 @@ documents.append({"text": question}) +# Get Database connection local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development" +conninfo = os.environ.get("PGML_CONNECTION", local_pgml) +db = Database(conninfo, min_connections=4) -conninfo = os.environ.get("PGML_CONNECTION",local_pgml) -db = Database(conninfo,min_connections=4) - +# Create or get collection collection_name = "quora_collection" collection = db.create_or_get_collection(collection_name) -collection.upsert_documents(documents[:20]) +# Upsert documents, chunk text, and generate embeddings +collection.upsert_documents(documents[:200]) collection.generate_chunks() collection.generate_embeddings() +# Query vector embeddings start = time() -query = "which city has the highest population in the world?" +query = "What is a good mobile os?" 
result = collection.vector_search(query) -_end = time() +_end = time() -console.print("\nResults for '%s'"%(query),style="bold") +console.print("\nResults for '%s'" % (query), style="bold") console.print(result) -console.print("Query time = %0.3f"%(_end-start)) \ No newline at end of file +console.print("Query time = %0.3f" % (_end - start)) + +db.archive_collection(collection_name) diff --git a/pgml-sdks/python/pgml/examples/vector_search.ipynb b/pgml-sdks/python/pgml/examples/vector_search.ipynb index 1c30f8647..ce29791fa 100644 --- a/pgml-sdks/python/pgml/examples/vector_search.ipynb +++ b/pgml-sdks/python/pgml/examples/vector_search.ipynb @@ -11,25 +11,6 @@ "import json" ] }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "env: PGML_CONNECTION=postgres://u_iez3sqwpzd7n1yd:aiHpsKfyXwr9fa6us1zlbyJuY@sql.cloud.postgresml.org:6432/pgml_iez3sqwpzd7n1yd\n", - "env: LOGLEVEL=INFO\n" - ] - } - ], - "source": [ - "%env PGML_CONNECTION=postgres://u_iez3sqwpzd7n1yd:aiHpsKfyXwr9fa6us1zlbyJuY@sql.cloud.postgresml.org:6432/pgml_iez3sqwpzd7n1yd\n", - "%env LOGLEVEL=INFO" - ] - }, { "cell_type": "code", "execution_count": 3, From 904dab9385fa26ea0b7259e7c151dd844d01b317 Mon Sep 17 00:00:00 2001 From: Santi Adavani Date: Tue, 30 May 2023 16:53:15 -0700 Subject: [PATCH 5/7] Table qa and instructor --- .../examples/question_answering_instructor.py | 45 +++ .../pgml/examples/table_question_answering.py | 49 ++++ .../python/pgml/examples/vector_search.ipynb | 257 ------------------ pgml-sdks/python/pgml/pgml/collection.py | 5 +- 4 files changed, 98 insertions(+), 258 deletions(-) create mode 100644 pgml-sdks/python/pgml/examples/question_answering_instructor.py create mode 100644 pgml-sdks/python/pgml/examples/table_question_answering.py delete mode 100644 pgml-sdks/python/pgml/examples/vector_search.ipynb diff --git a/pgml-sdks/python/pgml/examples/question_answering_instructor.py 
b/pgml-sdks/python/pgml/examples/question_answering_instructor.py new file mode 100644 index 000000000..7149ce9c6 --- /dev/null +++ b/pgml-sdks/python/pgml/examples/question_answering_instructor.py @@ -0,0 +1,45 @@ +from pgml import Database +import os +import json +from datasets import load_dataset +from time import time +from dotenv import load_dotenv +from rich.console import Console + +load_dotenv() +console = Console() + +local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development" + +conninfo = os.environ.get("PGML_CONNECTION", local_pgml) +db = Database(conninfo) + +collection_name = "squad_collection" +collection = db.create_or_get_collection(collection_name) + + +data = load_dataset("squad", split="train") +data = data.to_pandas() +data = data.drop_duplicates(subset=["context"]) + +documents = [ + {"id": r["id"], "text": r["context"], "title": r["title"]} + for r in data.to_dict(orient="records") +] + +collection.upsert_documents(documents[:200]) +collection.generate_chunks() + +#register instructor model +model_id = collection.register_model(model_name="hkunlp/instructor-base", model_params={"instruction": "Represent the Wikipedia document for retrieval: "}) +collection.generate_embeddings(model_id=model_id) + +start = time() +query = "Who won 20 grammy awards?" 
+results = collection.vector_search(query, top_k=5, model_id = model_id, query_parameters={"instruction": "Represent the Wikipedia question for retrieving supporting documents: "}) +_end = time() +console.print("\nResults for '%s'" % (query), style="bold") +console.print(results) +console.print("Query time = %0.3f" % (_end - start)) + +db.archive_collection(collection_name) diff --git a/pgml-sdks/python/pgml/examples/table_question_answering.py b/pgml-sdks/python/pgml/examples/table_question_answering.py new file mode 100644 index 000000000..f0e5935c5 --- /dev/null +++ b/pgml-sdks/python/pgml/examples/table_question_answering.py @@ -0,0 +1,49 @@ +from pgml import Database +import os +import json +from datasets import load_dataset +from time import time +from dotenv import load_dotenv +from rich.console import Console +from rich.progress import track +from psycopg import sql +from pgml.dbutils import run_select_statement +import pandas as pd + +load_dotenv() +console = Console() + +local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development" + +conninfo = os.environ.get("PGML_CONNECTION", local_pgml) +db = Database(conninfo) + +collection_name = "ott_qa_20k_collection" +collection = db.create_or_get_collection(collection_name) + + +data = load_dataset("ashraq/ott-qa-20k", split="train") +documents = [] + +# loop through the dataset and convert tabular data to pandas dataframes +for doc in track(data): + table = pd.DataFrame(doc["data"], columns=doc["header"]) + processed_table = "\n".join([table.to_csv(index=False)]) + documents.append({"text": processed_table, "title": doc["title"], "url": doc["url"], "uid": doc["uid"]}) + +collection.upsert_documents(documents) +collection.generate_chunks() + +# SentenceTransformer model trained specifically for embedding tabular data for retrieval tasks +model_id = collection.register_model(model_name="deepset/all-mpnet-base-v2-table") +collection.generate_embeddings(model_id=model_id) + +start = time() +query = "which 
country has the highest GDP in 2020?" +results = collection.vector_search(query, top_k=5, model_id=model_id) +_end = time() +console.print("\nResults for '%s'" % (query), style="bold") +console.print(results) +console.print("Query time = %0.3f" % (_end - start)) + +db.archive_collection(collection_name) diff --git a/pgml-sdks/python/pgml/examples/vector_search.ipynb b/pgml-sdks/python/pgml/examples/vector_search.ipynb deleted file mode 100644 index ce29791fa..000000000 --- a/pgml-sdks/python/pgml/examples/vector_search.ipynb +++ /dev/null @@ -1,257 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from pgml import Database\n", - "import os\n", - "import json" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "local_pgml = \"postgres://postgres@127.0.0.1:5433/pgml_development\"\n", - "\n", - "conninfo = os.environ.get(\"PGML_CONNECTION\",local_pgml)\n", - "db = Database(conninfo,min_connections=4)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "collection_name = \"test_collection_1\"\n", - "collection = db.create_or_get_collection(collection_name)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "missing FROM-clause entry for table \"test_collection_1\"\n", - "LINE 1: ...forms (table_name, task, model, splitter) VALUES (test_colle...\n", - " ^\n" - ] - } - ], - "source": [ - "conn = db.pool.getconn()\n", - "emb_table = collection._create_or_get_embeddings_table(conn, model_id=1,splitter_id=1)\n", - "db.pool.putconn(conn)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from datasets import load_dataset\n", - "\n", - "data = load_dataset(\"squad\", split=\"train\")\n", - "data = 
data.to_pandas()\n", - "data.head()\n", - "\n", - "data = data.drop_duplicates(subset=[\"context\"])\n", - "print(len(data))\n", - "data.head()\n", - "\n", - "documents = [\n", - " {\n", - " 'text': r['context'],\n", - " 'metadata': {\n", - " 'title': r['title']\n", - " }\n", - " } for r in data.to_dict(orient='records')\n", - "]\n", - "documents[:3]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.upsert_documents(documents[0:200])\n", - "collection.generate_chunks()\n", - "collection.generate_embeddings()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2)\n", - "print(json.dumps(results,indent=2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.register_model(model_name=\"paraphrase-MiniLM-L6-v2\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.get_models()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(json.dumps(collection.get_models(),indent=2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.generate_embeddings(model_id=2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2, model_id=2)\n", - "print(json.dumps(results,indent=2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.register_model(model_name=\"hkunlp/instructor-xl\", model_params={\"instruction\": \"Represent the Wikipedia document for retrieval: \"})" - ] - }, - { 
- "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.get_models()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.generate_embeddings(model_id=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2, model_id=3, query_parameters={\"instruction\": \"Represent the Wikipedia question for retrieving supporting documents: \"})\n", - "print(json.dumps(results,indent=2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.register_text_splitter(splitter_name=\"RecursiveCharacterTextSplitter\",splitter_params={\"chunk_size\": 100,\"chunk_overlap\": 20})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.generate_chunks(splitter_id=2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "collection.generate_embeddings(splitter_id=2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2, splitter_id=2)\n", - "print(json.dumps(results,indent=2))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "db.delete_collection(collection_name)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pgml-zoggicR5-py3.11", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - 
"version": "3.11.3" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/pgml-sdks/python/pgml/pgml/collection.py b/pgml-sdks/python/pgml/pgml/collection.py index 38a8498b9..e49df3d17 100644 --- a/pgml-sdks/python/pgml/pgml/collection.py +++ b/pgml-sdks/python/pgml/pgml/collection.py @@ -438,7 +438,7 @@ def register_model( task: Optional[str] = "embedding", model_name: Optional[str] = "intfloat/e5-small", model_params: Optional[Dict[str, Any]] = {}, - ) -> None: + ) -> int: """ This function registers a model in a database if it does not already exist. @@ -777,5 +777,8 @@ def vector_search( search_results = run_select_statement(conn, cte_select_statement) self.pool.putconn(conn) + + # Sort the list of dictionaries based on the 'score' key in descending order + search_results = sorted(search_results, key=lambda x: x['score'], reverse=True) return search_results From 26216f95ed5dbf6a86d5c69775dab55bd3573e0a Mon Sep 17 00:00:00 2001 From: Santi Adavani Date: Wed, 31 May 2023 10:24:46 -0700 Subject: [PATCH 6/7] Examples and updates to README --- pgml-sdks/python/pgml/README.md | 17 +++++++++++++++-- pgml-sdks/python/pgml/examples/README.md | 19 +++++++++++++++++++ .../examples/question_answering_instructor.py | 16 +++++++++++++--- .../pgml/examples/table_question_answering.py | 9 ++++++++- 4 files changed, 55 insertions(+), 6 deletions(-) create mode 100644 pgml-sdks/python/pgml/examples/README.md diff --git a/pgml-sdks/python/pgml/README.md b/pgml-sdks/python/pgml/README.md index 33487fdb1..6368675a1 100644 --- a/pgml-sdks/python/pgml/README.md +++ b/pgml-sdks/python/pgml/README.md @@ -1,10 +1,14 @@ -# Table of Contents +# Open Source Alternative for Building End-to-End Vector Search Applications without OpenAI & Pinecone + +## Table of Contents - [Overview](#overview) - [Quickstart](#quickstart) - [Usage](#usage) +- [Examples](./examples/README.md) - [Developer setup](#developer-setup) - [API Reference](#api-reference) +- 
[Roadmap](#roadmap) ## Overview Python SDK is designed to facilitate the development of scalable vector search applications on PostgreSQL databases. With this SDK, you can seamlessly manage various database tables related to documents, text chunks, text splitters, LLM (Language Model) models, and embeddings. By leveraging the SDK's capabilities, you can efficiently index LLM embeddings using PgVector for fast and accurate queries. @@ -274,4 +278,13 @@ LOGLEVEL=INFO python -m unittest tests/test_collection.py ### API Reference - [Database](./docs/pgml/database.md) -- [Collection](./docs/pgml/collection.md) \ No newline at end of file +- [Collection](./docs/pgml/collection.md) + +### Roadmap + +- Enable filters on document metadata in `vector_search`. [Issue](https://github.com/postgresml/postgresml/issues/663) +- `text_search` functionality on documents using Postgres text search. [Issue](https://github.com/postgresml/postgresml/issues/664) +- `hybrid_search` functionality that does a combination of `vector_search` and `text_search` in an order specified by the user. [Issue](https://github.com/postgresml/postgresml/issues/665) +- Ability to call and manage OpenAI embeddings for comparison purposes. [Issue](https://github.com/postgresml/postgresml/issues/666) +- Save `vector_search` history for downstream monitoring of model performance. [Issue](https://github.com/postgresml/postgresml/issues/667) +- Perform chunking on the DB with multiple langchain splitters. [Issue](https://github.com/postgresml/postgresml/issues/668) \ No newline at end of file diff --git a/pgml-sdks/python/pgml/examples/README.md b/pgml-sdks/python/pgml/examples/README.md new file mode 100644 index 000000000..a77848eff --- /dev/null +++ b/pgml-sdks/python/pgml/examples/README.md @@ -0,0 +1,19 @@ +## Examples + +### [Semantic Search](./semantic_search.py) +This is a basic example to perform semantic search on a collection of documents. 
It loads the Quora dataset, creates a collection in a PostgreSQL database, upserts documents, generates chunks and embeddings, and then performs a vector search on a query. Embeddings are created using `intfloat/e5-small` model. The results are semantically similar documents to the query. Finally, the collection is archived. 
+
+### [Question Answering](./question_answering.py)
+This is an example to find documents relevant to a question from the collection of documents. It loads the Stanford Question Answering Dataset (SQuAD) into the database, generates chunks and embeddings. Query is passed to vector search to retrieve documents that match closely in the embeddings space. A score is returned with each of the search results.
+
+### [Question Answering using Instructor Model](./question_answering_instructor.py)
+In this example, we will use `hkunlp/instructor-base` model to build text embeddings instead of the default `intfloat/e5-small` model. We will show how to use `register_model` method and use the `model_id` to build and query embeddings.
+
+### [Extractive Question Answering](./extractive_question_answering.py)
+In this example, we will show how to use `vector_search` result as a `context` to a HuggingFace question answering model. We will use `pgml.transform` to run the model on the database.
+
+### [Table Question Answering](./table_question_answering.py)
+In this example, we will use [Open Table-and-Text Question Answering (OTT-QA)
+](https://github.com/wenhuchen/OTT-QA) dataset to run queries on tables. We will use `deepset/all-mpnet-base-v2-table` model that is trained for embedding tabular data for retrieval tasks.
+ + diff --git a/pgml-sdks/python/pgml/examples/question_answering_instructor.py b/pgml-sdks/python/pgml/examples/question_answering_instructor.py index 7149ce9c6..6ed49f27e 100644 --- a/pgml-sdks/python/pgml/examples/question_answering_instructor.py +++ b/pgml-sdks/python/pgml/examples/question_answering_instructor.py @@ -30,13 +30,23 @@ collection.upsert_documents(documents[:200]) collection.generate_chunks() -#register instructor model -model_id = collection.register_model(model_name="hkunlp/instructor-base", model_params={"instruction": "Represent the Wikipedia document for retrieval: "}) +# register instructor model +model_id = collection.register_model( + model_name="hkunlp/instructor-base", + model_params={"instruction": "Represent the Wikipedia document for retrieval: "}, +) collection.generate_embeddings(model_id=model_id) start = time() query = "Who won 20 grammy awards?" -results = collection.vector_search(query, top_k=5, model_id = model_id, query_parameters={"instruction": "Represent the Wikipedia question for retrieving supporting documents: "}) +results = collection.vector_search( + query, + top_k=5, + model_id=model_id, + query_parameters={ + "instruction": "Represent the Wikipedia question for retrieving supporting documents: " + }, +) _end = time() console.print("\nResults for '%s'" % (query), style="bold") console.print(results) diff --git a/pgml-sdks/python/pgml/examples/table_question_answering.py b/pgml-sdks/python/pgml/examples/table_question_answering.py index f0e5935c5..f208e3392 100644 --- a/pgml-sdks/python/pgml/examples/table_question_answering.py +++ b/pgml-sdks/python/pgml/examples/table_question_answering.py @@ -29,7 +29,14 @@ for doc in track(data): table = pd.DataFrame(doc["data"], columns=doc["header"]) processed_table = "\n".join([table.to_csv(index=False)]) - documents.append({"text": processed_table, "title": doc["title"], "url": doc["url"], "uid": doc["uid"]}) + documents.append( + { + "text": processed_table, + "title": 
doc["title"], + "url": doc["url"], + "uid": doc["uid"], + } + ) collection.upsert_documents(documents) collection.generate_chunks() From 9fa410497b2dcc0f7ab6d8e08c3b173db3674749 Mon Sep 17 00:00:00 2001 From: Santi Adavani Date: Thu, 1 Jun 2023 11:18:40 -0700 Subject: [PATCH 7/7] vector search sorting by score done in sql query --- pgml-sdks/python/pgml/pgml/collection.py | 7 +++--- pgml-sdks/python/pgml/pgml/dbutils.py | 29 ++++++++++++++++++++---- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/pgml-sdks/python/pgml/pgml/collection.py b/pgml-sdks/python/pgml/pgml/collection.py index e49df3d17..7501d0196 100644 --- a/pgml-sdks/python/pgml/pgml/collection.py +++ b/pgml-sdks/python/pgml/pgml/collection.py @@ -775,10 +775,9 @@ def vector_search( documents_table=self.documents_table, ) - search_results = run_select_statement(conn, cte_select_statement) + search_results = run_select_statement( + conn, cte_select_statement, order_by="score", ascending=False + ) self.pool.putconn(conn) - - # Sort the list of dictionaries based on the 'score' key in descending order - search_results = sorted(search_results, key=lambda x: x['score'], reverse=True) return search_results diff --git a/pgml-sdks/python/pgml/pgml/dbutils.py b/pgml-sdks/python/pgml/pgml/dbutils.py index 5d58b56a5..95ce5b003 100644 --- a/pgml-sdks/python/pgml/pgml/dbutils.py +++ b/pgml-sdks/python/pgml/pgml/dbutils.py @@ -52,7 +52,9 @@ def run_create_or_insert_statement( cur.close() -def run_select_statement(conn: Connection, statement: str) -> List[Any]: +def run_select_statement( + conn: Connection, statement: str, order_by: str = "", ascending: bool = True +) -> List[Any]: """ The function runs a select statement on a database connection and returns the results as a list of dictionaries. 
@@ -70,12 +72,29 @@ def run_select_statement(conn: Connection, statement: str) -> List[Any]: statement = statement.strip().rstrip(";") cur = conn.cursor() - json_conversion_statement = """ - SELECT array_to_json(array_agg(row_to_json(t))) + order_statement = "" + if order_by: + order_statement = "ORDER BY t.%s" % order_by + if ascending: + order_statement += " ASC" + else: + order_statement += " DESC" + + if order_statement: + json_conversion_statement = """ + SELECT array_to_json(array_agg(row_to_json(t) {order_statement})) FROM ({select_statement}) t; """.format( - select_statement=statement - ) + select_statement=statement, + order_statement=order_statement, + ) + else: + json_conversion_statement = """ + SELECT array_to_json(array_agg(row_to_json(t))) + FROM ({select_statement}) t; + """.format( + select_statement=statement + ) log.info("Running %s .. " % json_conversion_statement) cur.execute(json_conversion_statement) results = cur.fetchall() pFad - Phonifier reborn
