HNSW and Migrations Done #988

Merged · 11 commits · Sep 6, 2023
1 change: 1 addition & 0 deletions pgml-sdks/pgml/Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pgml-sdks/pgml/Cargo.toml
@@ -20,7 +20,7 @@ serde_json = "1.0.9"
anyhow = "1.0.9"
tokio = { version = "1.28.2", features = [ "macros" ] }
chrono = "0.4.9"
pyo3 = { version = "0.18.3", optional = true, features = ["extension-module"] }
pyo3 = { version = "0.18.3", optional = true, features = ["extension-module", "anyhow"] }
pyo3-asyncio = { version = "0.18", features = ["attributes", "tokio-runtime"], optional = true }
neon = { version = "0.10", optional = true, default-features = false, features = ["napi-6", "promise-api", "channel-api"] }
itertools = "0.10.5"
6 changes: 4 additions & 2 deletions pgml-sdks/pgml/build.rs
@@ -3,14 +3,16 @@ use std::fs::OpenOptions;
use std::io::Write;

const ADDITIONAL_DEFAULTS_FOR_PYTHON: &[u8] = br#"
def py_init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
def init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
async def migrate() -> None

Json = Any
DateTime = int
"#;

const ADDITIONAL_DEFAULTS_FOR_JAVASCRIPT: &[u8] = br#"
export function js_init_logger(level?: string, format?: string): void;
export function init_logger(level?: string, format?: string): void;
export function migrate(): Promise<void>;

export type Json = { [key: string]: any };
export type DateTime = Date;
18 changes: 18 additions & 0 deletions pgml-sdks/pgml/javascript/README.md
@@ -519,6 +519,24 @@ const pipeline = pgml.newPipeline("test_pipeline", model, splitter, {
await collection.add_pipeline(pipeline)
```

### Configuring HNSW Indexing Parameters

Our SDK utilizes [pgvector](https://github.com/pgvector/pgvector) for storing vectors and performing recall. We use HNSW indexing as it offers the best balance of speed and recall.

Our SDK allows configuring `m` (the maximum number of connections per layer; 16 by default) and `ef_construction` (the size of the dynamic candidate list used when constructing the graph; 64 by default) per pipeline.

```javascript
const model = pgml.newModel()
const splitter = pgml.newSplitter()
const pipeline = pgml.newPipeline("test_pipeline", model, splitter, {
hnsw: {
m: 100,
ef_construction: 200
}
})
await collection.add_pipeline(pipeline)
```

### Searching with Pipelines

Pipelines are a required argument when performing vector search. After a Pipeline has been added to a Collection, the Model and Splitter can be omitted when instantiating it.
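For example, a minimal sketch of searching with a Pipeline referenced by name only — it assumes the `test_pipeline` name from the example above and the query-builder methods exercised in this PR's tests:

```javascript
// Sketch: the Pipeline is referenced by name alone, since its Model and
// Splitter were already registered when it was added to the Collection.
const pipeline = pgml.newPipeline("test_pipeline")
const results = await collection
  .query()
  .vector_recall("Some user query", pipeline)
  .limit(10)
  .fetch_all()
```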
@@ -1,7 +1,6 @@
const pgml = require("pgml");
require("dotenv").config();

pgml.js_init_logger();

const main = async () => {
// Initialize the collection
@@ -1,8 +1,6 @@
const pgml = require("pgml");
require("dotenv").config();

pgml.js_init_logger();

const main = async () => {
// Initialize the collection
const collection = pgml.newCollection("my_javascript_sqa_collection");
56 changes: 55 additions & 1 deletion pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts
@@ -10,7 +10,7 @@ import pgml from "../../index.js";
////////////////////////////////////////////////////////////////////////////////////

const LOG_LEVEL = process.env.LOG_LEVEL ? process.env.LOG_LEVEL : "ERROR";
pgml.js_init_logger(LOG_LEVEL);
pgml.init_logger(LOG_LEVEL);

const generate_dummy_documents = (count: number) => {
let docs = [];
@@ -143,6 +143,52 @@ it("can vector search with query builder and metadata filtering", async () => {
await collection.archive();
});

it("can vector search with query builder and custom hnsfw ef_search value", async () => {
let model = pgml.newModel();
let splitter = pgml.newSplitter();
let pipeline = pgml.newPipeline("test_j_p_cvswqbachesv_0", model, splitter);
let collection = pgml.newCollection("test_j_c_cvswqbachesv_0");
await collection.upsert_documents(generate_dummy_documents(3));
await collection.add_pipeline(pipeline);
let results = await collection
.query()
.vector_recall("Here is some query", pipeline)
.filter({
hnsw: {
ef_search: 2,
},
})
.limit(10)
.fetch_all();
expect(results).toHaveLength(3);
await collection.archive();
});

it("can vector search with query builder and custom hnsfw ef_search value and remote embeddings", async () => {
let model = pgml.newModel("text-embedding-ada-002", "openai");
let splitter = pgml.newSplitter();
let pipeline = pgml.newPipeline(
"test_j_p_cvswqbachesvare_0",
model,
splitter,
);
let collection = pgml.newCollection("test_j_c_cvswqbachesvare_0");
await collection.upsert_documents(generate_dummy_documents(3));
await collection.add_pipeline(pipeline);
let results = await collection
.query()
.vector_recall("Here is some query", pipeline)
.filter({
hnsw: {
ef_search: 2,
},
})
.limit(10)
.fetch_all();
expect(results).toHaveLength(3);
await collection.archive();
});

///////////////////////////////////////////////////
// Test user output facing functions //////////////
///////////////////////////////////////////////////
@@ -220,3 +266,11 @@ it("can delete documents", async () => {

await collection.archive();
});

///////////////////////////////////////////////////
// Test migrations ////////////////////////////////
///////////////////////////////////////////////////

it("can migrate", async () => {
await pgml.migrate();
});
18 changes: 18 additions & 0 deletions pgml-sdks/pgml/python/README.md
@@ -530,6 +530,24 @@ pipeline = Pipeline("test_pipeline", model, splitter, {
await collection.add_pipeline(pipeline)
```

### Configuring HNSW Indexing Parameters

Our SDK utilizes [pgvector](https://github.com/pgvector/pgvector) for storing vectors and performing recall. We use HNSW indexing as it offers the best balance of speed and recall.

Our SDK allows configuring `m` (the maximum number of connections per layer; 16 by default) and `ef_construction` (the size of the dynamic candidate list used when constructing the graph; 64 by default) per pipeline.

```python
model = Model()
splitter = Splitter()
pipeline = Pipeline("test_pipeline", model, splitter, {
"hnsw": {
"m": 100,
"ef_construction": 200
}
})
await collection.add_pipeline(pipeline)
```

### Searching with Pipelines

Pipelines are a required argument when performing vector search. After a Pipeline has been added to a Collection, the Model and Splitter can be omitted when instantiating it.
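For example, a minimal sketch of searching with a Pipeline referenced by name only — it assumes the `test_pipeline` name from the example above and the `vector_search` signature declared in `pgml.pyi` in this PR:

```python
# Sketch: the Pipeline is referenced by name alone, since its Model and
# Splitter were already registered when it was added to the Collection.
pipeline = Pipeline("test_pipeline")
results = await collection.vector_search("Some user query", pipeline, top_k=10)
```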
@@ -1,4 +1,4 @@
from pgml import Collection, Model, Splitter, Pipeline, Builtins, py_init_logger
from pgml import Collection, Model, Splitter, Pipeline, Builtins
import json
from datasets import load_dataset
from time import time
@@ -7,9 +7,6 @@
import asyncio


py_init_logger()


async def main():
load_dotenv()
console = Console()
89 changes: 2 additions & 87 deletions pgml-sdks/pgml/python/pgml/pgml.pyi
@@ -1,91 +1,6 @@

def py_init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
def init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None
async def migrate() -> None

Json = Any
DateTime = int

# Top of file key: A12BECOD!
from typing import List, Dict, Optional, Self, Any


class Builtins:
def __init__(self, database_url: Optional[str] = "Default set in Rust. Please check the documentation.") -> Self
...
def query(self, query: str) -> QueryRunner
...
async def transform(self, task: Json, inputs: List[str], args: Optional[Json] = Any) -> Json
...

class Collection:
def __init__(self, name: str, database_url: Optional[str] = "Default set in Rust. Please check the documentation.") -> Self
...
async def add_pipeline(self, pipeline: Pipeline) -> None
...
async def remove_pipeline(self, pipeline: Pipeline) -> None
...
async def enable_pipeline(self, pipeline: Pipeline) -> None
...
async def disable_pipeline(self, pipeline: Pipeline) -> None
...
async def upsert_documents(self, documents: List[Json]) -> None
...
async def get_documents(self, args: Optional[Json] = Any) -> List[Json]
...
async def delete_documents(self, filter: Json) -> None
...
async def vector_search(self, query: str, pipeline: Pipeline, query_parameters: Optional[Json] = Any, top_k: Optional[int] = 1) -> List[tuple[float, str, Json]]
...
async def archive(self) -> None
...
def query(self) -> QueryBuilder
...
async def get_pipelines(self) -> List[Pipeline]
...
async def get_pipeline(self, name: str) -> Pipeline
...
async def exists(self) -> bool
...

class Model:
def __init__(self, name: Optional[str] = "Default set in Rust. Please check the documentation.", source: Optional[str] = "Default set in Rust. Please check the documentation.", parameters: Optional[Json] = Any) -> Self
...

class Pipeline:
def __init__(self, name: str, model: Optional[Model] = Any, splitter: Optional[Splitter] = Any, parameters: Optional[Json] = Any) -> Self
...
async def get_status(self) -> PipelineSyncData
...
async def to_dict(self) -> Json
...

class QueryBuilder:
def limit(self, limit: int) -> Self
...
def filter(self, filter: Json) -> Self
...
def vector_recall(self, query: str, pipeline: Pipeline, query_parameters: Optional[Json] = Any) -> Self
...
async def fetch_all(self) -> List[tuple[float, str, Json]]
...
def to_full_string(self) -> str
...

class QueryRunner:
async def fetch_all(self) -> Json
...
async def execute(self) -> None
...
def bind_string(self, bind_value: str) -> Self
...
def bind_int(self, bind_value: int) -> Self
...
def bind_float(self, bind_value: float) -> Self
...
def bind_bool(self, bind_value: bool) -> Self
...
def bind_json(self, bind_value: Json) -> Self
...

class Splitter:
def __init__(self, name: Optional[str] = "Default set in Rust. Please check the documentation.", parameters: Optional[Json] = Any) -> Self
...
50 changes: 49 additions & 1 deletion pgml-sdks/pgml/python/tests/test.py
@@ -19,7 +19,7 @@
print("No DATABASE_URL environment variable found. Please set one")
exit(1)

pgml.py_init_logger()
pgml.init_logger()


def generate_dummy_documents(count: int) -> List[Dict[str, Any]]:
@@ -164,6 +164,44 @@ async def test_can_vector_search_with_query_builder_and_metadata_filtering():
await collection.archive()


@pytest.mark.asyncio
async def test_can_vector_search_with_query_builder_and_custom_hnsw_ef_search_value():
model = pgml.Model()
splitter = pgml.Splitter()
pipeline = pgml.Pipeline("test_p_p_tcvswqbachesv_0", model, splitter)
collection = pgml.Collection(name="test_p_c_tcvswqbachesv_0")
await collection.upsert_documents(generate_dummy_documents(3))
await collection.add_pipeline(pipeline)
results = (
await collection.query()
.vector_recall("Here is some query", pipeline)
.filter({"hnsw": {"ef_search": 2}})
.limit(10)
.fetch_all()
)
assert len(results) == 3
await collection.archive()


@pytest.mark.asyncio
async def test_can_vector_search_with_query_builder_and_custom_hnsw_ef_search_value_and_remote_embeddings():
model = pgml.Model(name="text-embedding-ada-002", source="openai")
splitter = pgml.Splitter()
pipeline = pgml.Pipeline("test_p_p_tcvswqbachesvare_0", model, splitter)
collection = pgml.Collection(name="test_p_c_tcvswqbachesvare_0")
await collection.upsert_documents(generate_dummy_documents(3))
await collection.add_pipeline(pipeline)
results = (
await collection.query()
.vector_recall("Here is some query", pipeline)
.filter({"hnsw": {"ef_search": 2}})
.limit(10)
.fetch_all()
)
assert len(results) == 3
await collection.archive()


###################################################
## Test user output facing functions ##############
###################################################
@@ -250,6 +288,16 @@ async def test_delete_documents():
await collection.archive()


###################################################
## Migration tests ################################
###################################################


@pytest.mark.asyncio
async def test_migrate():
await pgml.migrate()


###################################################
## Test with multiprocessing ######################
###################################################
8 changes: 4 additions & 4 deletions pgml-sdks/pgml/src/builtins.rs
@@ -92,21 +92,21 @@ impl Builtins {
#[cfg(test)]
mod tests {
use super::*;
use crate::init_logger;
use crate::internal_init_logger;

#[sqlx::test]
async fn can_query() -> anyhow::Result<()> {
init_logger(None, None).ok();
internal_init_logger(None, None).ok();
let builtins = Builtins::new(None);
let query = "SELECT 10";
let query = "SELECT * from pgml.collections";
let results = builtins.query(query).fetch_all().await?;
assert!(results.as_array().is_some());
Ok(())
}

#[sqlx::test]
async fn can_transform() -> anyhow::Result<()> {
init_logger(None, None).ok();
internal_init_logger(None, None).ok();
let builtins = Builtins::new(None);
let task = Json::from(serde_json::json!("translation_en_to_fr"));
let inputs = vec!["test1".to_string(), "test2".to_string()];