From 97398a2e1dc2c0e5d4ef6606d619645a1323d848 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Tue, 4 Jun 2024 16:46:55 -0700 Subject: [PATCH 1/4] Periodic commit --- pgml-cms/docs/api/client-sdk/README.md | 159 ++++++++++- pgml-cms/docs/api/client-sdk/collections.md | 233 +++++++++++++++++ pgml-cms/docs/api/client-sdk/pipelines.md | 196 +++++++++++++- pgml-cms/docs/api/client-sdk/search.md | 275 ++++++++++++++++++-- 4 files changed, 836 insertions(+), 27 deletions(-) diff --git a/pgml-cms/docs/api/client-sdk/README.md b/pgml-cms/docs/api/client-sdk/README.md index 866610b92..0ccddb9f0 100644 --- a/pgml-cms/docs/api/client-sdk/README.md +++ b/pgml-cms/docs/api/client-sdk/README.md @@ -12,17 +12,39 @@ The client SDK can be installed using standard package managers for JavaScript, Installing the SDK into your project is as simple as: {% tabs %} -{% tab title="JavaScript " %} +{% tab title="JavaScript" %} ```bash npm i pgml ``` {% endtab %} -{% tab title="Python " %} +{% tab title="Python" %} ```bash pip install pgml ``` {% endtab %} + +{% tab title="Rust" %} +```bash +cargo add pgml +``` +{% endtab %} + +{% tab title="C" %} + +First clone the `postgresml` repository and navigate to the `pgml-sdks/pgml/c` directory: +```bash +git clone https://github.com/postgresml/postgresml +cd postgresml/pgml-sdks/pgml/c +``` + +Then build the bindings +```bash +make bindings +``` + +This will generate the `pgml.h` file and a `.so` on Linux and `.dylib` on macOS. +{% endtab %} {% endtabs %} ## Getting started @@ -41,10 +63,10 @@ export PGML_DATABASE_URL=postgres://user:password@sql.cloud.postgresml.org:6432/pgml ### Create a collection -The SDK is written in asynchronous code, so you need to run it inside an async runtime. Both Python and JavaScript support async functions natively. +The SDK is written in asynchronous code, so you need to run it inside an async runtime. Python, JavaScript, and Rust all support async functions natively. 
{% tabs %} -{% tab title="JavaScript " %} +{% tab title="JavaScript" %} ```javascript const pgml = require("pgml"); @@ -63,6 +85,28 @@ async def main(): collection = Collection("sample_collection") ``` {% endtab %} + +{% tab title="Rust" %} +```rust +use pgml::{Collection, Pipeline}; + +#[tokio::main] +async fn main() -> Result<(), Box<dyn std::error::Error>> { + let mut collection = Collection::new("sample_collection", None)?; +} +``` +{% endtab %} + +{% tab title="C" %} +```c +#include <stdio.h> +#include "pgml.h" + +int main() { + CollectionC * collection = pgml_collectionc_new("sample_collection", NULL); +} +``` +{% endtab %} {% endtabs %} The above example imports the `pgml` module and creates a collection object. By itself, the collection only tracks document contents and identifiers, but once we add a pipeline, we can instruct the SDK to perform additional tasks when documents are inserted and retrieved. ### Add a pipeline @@ -93,7 +137,7 @@ await collection.add_pipeline(pipeline); ```python # Add this code to the end of the main function from the above example. pipeline = Pipeline( - "test_pipeline", + "sample_pipeline", { "text": { "splitter": { "model": "recursive_character" }, @@ -107,6 +151,37 @@ pipeline = Pipeline( await collection.add_pipeline(pipeline) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +// Add this code to the end of the main function from the above example. +let mut pipeline = Pipeline::new( + "sample_pipeline", + Some( + serde_json::json!({ + "text": { + "splitter": { "model": "recursive_character" }, + "semantic_search": { + "model": "Alibaba-NLP/gte-base-en-v1.5", + }, + }, + }) + .into(), + ), +)?; + +collection.add_pipeline(&mut pipeline).await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +// Add this code to the end of the main function from the above example. 
+PipelineC * pipeline = pgml_pipelinec_new("sample_pipeline", "{\"text\": {\"splitter\": {\"model\": \"recursive_character\"},\"semantic_search\": {\"model\": \"Alibaba-NLP/gte-base-en-v1.5\"}}}"); + +pgml_collectionc_add_pipeline(collection, pipeline); +``` +{% endtab %} {% endtabs %} The pipeline configuration is a key/value object, where the key is the name of a column in a document, and the value is the action the SDK should perform on that column. @@ -153,9 +228,36 @@ documents = [ await collection.upsert_documents(documents) ``` {% endtab %} -{% endtabs %} -If the same document `id` is used, the SDK computes the difference between existing and new documents and only updates the chunks that have changed. +{% tab title="Rust" %} +```rust +// Add this code to the end of the main function in the above example. +let documents = vec![ + serde_json::json!({ + "id": "Document One", + "text": "document one contents...", + }) + .into(), + serde_json::json!({ + "id": "Document Two", + "text": "document two contents...", + }) + .into(), +]; + +collection.upsert_documents(documents, None).await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +// Add this code to the end of the main function in the above example. +char * documents_to_upsert[2] = {"{\"id\": \"Document One\", \"text\": \"document one contents...\"}", "{\"id\": \"Document Two\", \"text\": \"document two contents...\"}"}; + +pgml_collectionc_upsert_documents(collection, documents_to_upsert, 2, NULL); +``` +{% endtab %} +{% endtabs %} ### Search documents @@ -203,6 +305,47 @@ results = await collection.vector_search( print(results) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +// Add this code to the end of the main function in the above example. 
+let results = collection + .vector_search( + serde_json::json!({ + "query": { + "fields": { + "text": { + "query": "Something about a document...", + }, + }, + }, + "limit": 2, + }) + .into(), + &mut pipeline, + ) + .await?; + +println!("{:?}", results); + +Ok(()) +``` +{% endtab %} + +{% tab title="C" %} +```c +// Add this code to the end of the main function in the above example. +unsigned long r_size = 0; +char** results = pgml_collectionc_vector_search(collection, "{\"query\": {\"fields\": {\"text\": {\"query\": \"Something about a document...\"}}}, \"limit\": 2}", pipeline, &r_size); +printf("\n\nPrinting results:\n"); +for (unsigned long i = 0; i < r_size; ++i) { + printf("Result %lu -> %s\n", i, results[i]); +} + +pgml_pipelinec_delete(pipeline); +pgml_collectionc_delete(collection); +``` +{% endtab %} {% endtabs %} We are using built-in vector search, powered by embeddings and the PostgresML [pgml.embed()](../sql-extension/pgml.embed) function, which embeds the `query` argument, compares it to the embeddings stored in the database, and returns the top two results, ranked by cosine similarity. @@ -228,6 +371,8 @@ if __name__ == "__main__": {% endtab %} {% endtabs %} +Note that the `Rust` and `C` examples do not require any additional code to run correctly. 
+ Once you run the example, you should see something like this in the terminal: ```bash diff --git a/pgml-cms/docs/api/client-sdk/collections.md b/pgml-cms/docs/api/client-sdk/collections.md index 14c64ad5c..ebd63afca 100644 --- a/pgml-cms/docs/api/client-sdk/collections.md +++ b/pgml-cms/docs/api/client-sdk/collections.md @@ -26,6 +26,18 @@ const collection = pgml.newCollection("test_collection") collection = Collection("test_collection") ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let mut collection = Collection::new("test_collection", None)?; +``` +{% endtab %} + +{% tab title="C" %} +```c +CollectionC * collection = pgml_collectionc_new("test_collection", NULL); +``` +{% endtab %} {% endtabs %} ### Custom `PGML_DATABASE_URL` @@ -44,6 +56,18 @@ const collection = pgml.newCollection("test_collection", CUSTOM_DATABASE_URL) collection = Collection("test_collection", CUSTOM_DATABASE_URL) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let mut collection = Collection::new("test_collection", Some(CUSTOM_DATABASE_URL))?; +``` +{% endtab %} + +{% tab title="C" %} +```c +CollectionC * collection = pgml_collectionc_new("test_collection", CUSTOM_DATABASE_URL); +``` +{% endtab %} {% endtabs %} ## Upserting Documents @@ -90,6 +114,38 @@ documents = [ await collection.upsert_documents(documents) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let documents: Vec = vec![ + serde_json::json!({ + "id": "document_one", + "title": "Document One", + "text": "Here are the contents of Document 1", + "random_key": "here is some random data", + }) + .into(), + serde_json::json!({ + "id": "document_two", + "title": "Document Two", + "text": "Here are the contents of Document 2", + "random_key": "here is some random data", + }) + .into(), +]; +collection.upsert_documents(documents, None).await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +char * documents[2] = { + "{\"id\": \"document_one\", \"title\": \"Document One\", \"text\": \"Here are the contents of Document 
1\", \"random_key\": \"here is some random data\"}", + "{\"id\": \"document_two\", \"title\": \"Document Two\", \"text\": \"Here are the contents of Document 2\", \"random_key\": \"here is some random data\"}" +}; +pgml_collectionc_upsert_documents(collection, documents, 2, NULL); +``` +{% endtab %} {% endtabs %} Documents can be replaced by upserting documents with the same `id`. @@ -134,6 +190,38 @@ documents = [ await collection.upsert_documents(documents) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let documents: Vec = vec![ + serde_json::json!({ + "id": "document_one", + "title": "Document One", + "text": "Here is some new text for document one", + "random_key": "here is some random data", + }) + .into(), + serde_json::json!({ + "id": "document_two", + "title": "Document Two", + "text": "Here is some new text for document two", + "random_key": "here is some random data", + }) + .into(), +]; +collection.upsert_documents(documents, None).await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +char * documents[2] = { + "{\"id\": \"document_one\", \"title\": \"Document One\", \"text\": \"Here is some new text for document one\", \"random_key\": \"here is some random data\"}", + "{\"id\": \"document_two\", \"title\": \"Document Two\", \"text\": \"Here is some new text for document two\", \"random_key\": \"here is some random data\"}" +}; +pgml_collectionc_upsert_documents(collection, documents, 2, NULL); +``` +{% endtab %} {% endtabs %} Documents can be merged by setting the `merge` option. On conflict, new document keys will override old document keys. 
@@ -176,6 +264,38 @@ documents = [ await collection.upsert_documents(documents, {"merge": True}) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let documents: Vec = vec![ + serde_json::json!({ + "id": "document_one", + "new_key": "this will be a new key in document one", + "random_key": "this will replace old random_key" + }) + .into(), + serde_json::json!({ + "id": "document_two", + "new_key": "this will be a new key in document two", + "random_key": "this will replace old random_key" + }) + .into(), +]; +collection + .upsert_documents(documents, Some(serde_json::json!({"merge": true}).into())) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +char * documents[2] = { + "{\"id\": \"document_one\", \"new_key\": \"this will be a new key in document one\", \"random_key\": \"this will replace old random_key\"}", + "{\"id\": \"document_two\", \"new_key\": \"this will be a new key in document two\", \"random_key\": \"this will replace old random_key\"}" +}; +pgml_collectionc_upsert_documents(collection, documents, 2, "{\"merge\": true}"); +``` +{% endtab %} {% endtabs %} ## Getting Documents @@ -194,6 +314,21 @@ const documents = await collection.get_documents({limit: 100 }) documents = await collection.get_documents({ "limit": 100 }) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let documents = collection + .get_documents(Some(serde_json::json!({"limit": 100}).into())) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +unsigned long r_size = 0; +char** documents = pgml_collectionc_get_documents(collection, "{\"limit\": 100}", &r_size); +``` +{% endtab %} {% endtabs %} ### Paginating Documents @@ -214,6 +349,21 @@ const documents = await collection.get_documents({ limit: 100, offset: 10 }) documents = await collection.get_documents({ "limit": 100, "offset": 10 }) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let documents = collection + .get_documents(Some(serde_json::json!({"limit": 100, "offset": 10}).into())) + .await?; +``` +{% 
endtab %} + +{% tab title="C" %} +```c +unsigned long r_size = 0; +char** documents = pgml_collectionc_get_documents(collection, "{\"limit\": 100, \"offset\": 10}", &r_size); +``` +{% endtab %} {% endtabs %} #### Keyset Pagination @@ -230,6 +380,21 @@ const documents = await collection.get_documents({ limit: 100, last_row_id: 10 } documents = await collection.get_documents({ "limit": 100, "last_row_id": 10 }) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let documents = collection + .get_documents(Some(serde_json::json!({"limit": 100, "last_row_id": 10}).into())) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +unsigned long r_size = 0; +char** documents = pgml_collectionc_get_documents(collection, "{\"limit\": 100, \"last_row_id\": 10}", &r_size); +``` +{% endtab %} {% endtabs %} The `last_row_id` can be taken from the `row_id` field in the returned document's dictionary. Keyset pagination does not currently work when specifying the `order_by` key. @@ -264,6 +429,29 @@ documents = await collection.get_documents( ) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let documents = collection + .get_documents(Some( + serde_json::json!({ + "limit": 100, + "filter": { + "id": {"$eq": "document_one"}, + } + }) + .into(), + )) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +unsigned long r_size = 0; +char** documents = pgml_collectionc_get_documents(collection, "{\"limit\": 100, \"filter\": {\"id\": {\"$eq\": \"document_one\"}}}", &r_size); +``` +{% endtab %} {% endtabs %} ### Sorting Documents @@ -294,6 +482,30 @@ documents = await collection.get_documents({ }) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let documents = collection + .get_documents(Some( + serde_json::json!({ + "limit": 100, + "offset": 10, + "order_by": { + "id": "desc" + } + }) + .into(), + )) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +unsigned long r_size = 0; +char** documents = pgml_collectionc_get_documents(collection, "{\"limit\": 100, 
\"offset\": 10, \"order_by\": {\"id\": \"desc\"}}", &r_size); +``` +{% endtab %} {% endtabs %} ### Deleting Documents @@ -320,4 +532,25 @@ documents = await collection.delete_documents( ) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let documents = collection + .delete_documents( + serde_json::json!({ + "id": { + "$eq": 1 + } + }) + .into(), + ) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +pgml_collectionc_delete_documents(collection, "{\"id\": { \"$eq\": 1}}"); +``` +{% endtab %} {% endtabs %} diff --git a/pgml-cms/docs/api/client-sdk/pipelines.md b/pgml-cms/docs/api/client-sdk/pipelines.md index c51987cad..6c3ed57cd 100644 --- a/pgml-cms/docs/api/client-sdk/pipelines.md +++ b/pgml-cms/docs/api/client-sdk/pipelines.md @@ -57,6 +57,48 @@ pipeline = Pipeline( ) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let mut pipeline = Pipeline::new( + "test_pipeline", + Some( + serde_json::json!({ + "title": { + "full_text_search": {"configuration": "english"}, + }, + "body": { + "splitter": {"model": "recursive_character"}, + "semantic_search": { + "model": "Alibaba-NLP/gte-base-en-v1.5", + }, + }, + }) + .into(), + ), +)?; + +``` +{% endtab %} + +{% tab title="C" %} +```c +PipelineC * pipeline = pgml_pipelinec_new( + "test_pipeline", + "{\ + \"title\": {\ + \"full_text_search\": {\"configuration\": \"english\"},\ + },\ + \"body\": {\ + \"splitter\": {\"model\": \"recursive_character\"},\ + \"semantic_search\": {\ + \"model\": \"Alibaba-NLP/gte-base-en-v1.5\"\ + }\ + }\ + }" +); +``` +{% endtab %} {% endtabs %} This `Pipeline` does two things. For each document in the `Collection`, it converts all `title`s into tsvectors enabling full text search, and splits and embeds the `body` text enabling semantic search using vectors. This kind of `Pipeline` would be great for site search utilizing hybrid keyword and semantic search. 
@@ -92,6 +134,42 @@ pipeline = Pipeline( ) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let mut pipeline = Pipeline::new( + "test_pipeline", + Some( + serde_json::json!({ + "body": { + "splitter": {"model": "recursive_character"}, + "semantic_search": { + "model": "Alibaba-NLP/gte-base-en-v1.5", + }, + }, + }) + .into(), + ), +)?; + +``` +{% endtab %} + +{% tab title="C" %} +```c +PipelineC * pipeline = pgml_pipelinec_new( + "test_pipeline", + "{\ + \"body\": {\ + \"splitter\": {\"model\": \"recursive_character\"},\ + \"semantic_search\": {\ + \"model\": \"Alibaba-NLP/gte-base-en-v1.5\"\ + }\ + }\ + }" +); +``` +{% endtab %} {% endtabs %} This `Pipeline` splits and embeds the `body` text enabling semantic search using vectors. This is a very popular `Pipeline` for RAG. @@ -166,6 +244,44 @@ pipeline = Pipeline( ) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let mut pipeline = Pipeline::new( + "test_pipeline", + Some( + serde_json::json!({ + "body": { + "splitter": {"model": "recursive_character"}, + "semantic_search": { + "model": "Alibaba-NLP/gte-base-en-v1.5", + "hnsw": {"m": 100, "ef_construction": 200} + }, + }, + }) + .into(), + ), +)?; + +``` +{% endtab %} + +{% tab title="C" %} +```c +PipelineC * pipeline = pgml_pipelinec_new( + "test_pipeline", + "{\ + \"body\": {\ + \"splitter\": {\"model\": \"recursive_character\"},\ + \"semantic_search\": {\ + \"model\": \"Alibaba-NLP/gte-base-en-v1.5\",\ + \"hnsw\": {\"m\": 100, \"ef_construction\": 200}\ + }\ + }\ + }" +); +``` +{% endtab %} {% endtabs %} ## Adding Pipelines to a Collection @@ -184,6 +300,18 @@ await collection.add_pipeline(pipeline) await collection.add_pipeline(pipeline) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +collection.add_pipeline(&mut pipeline).await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +pgml_collectionc_add_pipeline(collection, pipeline); +``` +{% endtab %} {% endtabs %} > Note: After a `Pipeline` has been added to a `Collection` instances of the 
`Pipeline` object can be created without specifying a schema: @@ -200,6 +328,18 @@ const pipeline = pgml.newPipeline("test_pipeline") pipeline = Pipeline("test_pipeline") ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let mut pipeline = Pipeline::new("test_pipeline", None)?; +``` +{% endtab %} + +{% tab title="C" %} +```c +PipelineC * pipeline = pgml_pipelinec_new("test_pipeline", NULL); +``` +{% endtab %} {% endtabs %} ## Searching with Pipelines @@ -231,6 +371,22 @@ collection = Collection("test_collection") await collection.disable_pipeline(pipeline) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let mut collection = Collection::new("test_collection", None)?; +let mut pipeline = Pipeline::new("test_pipeline", None)?; +collection.disable_pipeline(&mut pipeline).await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +CollectionC * collection = pgml_collectionc_new("test_collection", NULL); +PipelineC * pipeline = pgml_pipelinec_new("test_pipeline", NULL); +pgml_collectionc_disable_pipeline(collection, pipeline); +``` +{% endtab %} {% endtabs %} Disabling a `Pipeline` prevents it from running automatically, but leaves all tsvectors, chunks, and embeddings already created by that `Pipeline` in the database. @@ -255,6 +411,22 @@ collection = Collection("test_collection") await collection.enable_pipeline(pipeline) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let mut collection = Collection::new("test_collection", None)?; +let mut pipeline = Pipeline::new("test_pipeline", None)?; +collection.enable_pipeline(&mut pipeline).await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +CollectionC * collection = pgml_collectionc_new("test_collection", NULL); +PipelineC * pipeline = pgml_pipelinec_new("test_pipeline", NULL); +pgml_collectionc_enable_pipeline(collection, pipeline); +``` +{% endtab %} {% endtabs %} Enabling a `Pipeline` will cause it to automatically run on all documents it may have missed while disabled. 
@@ -263,10 +435,10 @@ Enabling a `Pipeline` will cause it to automatically run on all documents it may {% tabs %} {% tab title="JavaScript" %} -
const pipeline = pgml.newPipeline("test_pipeline")
-const collection = pgml.newCollection("test_collection")
-await collection.remove_pipeline(pipeline)
-
+```javascript +const pipeline = pgml.newPipeline("test_pipeline") +const collection = pgml.newCollection("test_collection") +await collection.remove_pipeline(pipeline) {% endtab %} {% tab title="Python" %} @@ -276,6 +448,22 @@ collection = Collection("test_collection") await collection.remove_pipeline(pipeline) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let mut collection = Collection::new("test_collection", None)?; +let mut pipeline = Pipeline::new("test_pipeline", None)?; +collection.remove_pipeline(&mut pipeline).await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +CollectionC * collection = pgml_collectionc_new("test_collection", NULL); +PipelineC * pipeline = pgml_pipelinec_new("test_pipeline", NULL); +pgml_collectionc_remove_pipeline(collection, pipeline); +``` +{% endtab %} {% endtabs %} Removing a `Pipeline` deletes it and all associated data from the database. Removed `Pipelines` cannot be re-enabled but can be recreated. diff --git a/pgml-cms/docs/api/client-sdk/search.md b/pgml-cms/docs/api/client-sdk/search.md index 8318a8bee..3fc564c55 100644 --- a/pgml-cms/docs/api/client-sdk/search.md +++ b/pgml-cms/docs/api/client-sdk/search.md @@ -10,14 +10,14 @@ This section will assume we have previously ran the following code: const pipeline = pgml.newPipeline("test_pipeline", { abstract: { semantic_search: { - model: "Alibaba-NLP/gte-base-en-v1.5", + model: "mixedbread-ai/mxbai-embed-large-v1", }, full_text_search: { configuration: "english" }, }, body: { splitter: { model: "recursive_character" }, semantic_search: { - model: "Alibaba-NLP/gte-base-en-v1.5", + model: "mixedbread-ai/mxbai-embed-large-v1", }, }, }); @@ -33,19 +33,70 @@ pipeline = Pipeline( { "abstract": { "semantic_search": { - "model": "Alibaba-NLP/gte-base-en-v1.5", + "model": "mixedbread-ai/mxbai-embed-large-v1", }, "full_text_search": {"configuration": "english"}, }, "body": { "splitter": {"model": "recursive_character"}, "semantic_search": { - "model": 
"Alibaba-NLP/gte-base-en-v1.5", + "model": "mixedbread-ai/mxbai-embed-large-v1", }, }, }, ) collection = Collection("test_collection") +await collection.add_pipeline(pipeline); +``` +{% endtab %} + +{% tab title="Rust" %} +```rust +let mut pipeline = Pipeline::new( + "test_pipeline", + Some( + serde_json::json!( + { + "abstract": { + "semantic_search": { + "model": "mixedbread-ai/mxbai-embed-large-v1", + }, + "full_text_search": {"configuration": "english"}, + }, + "body": { + "splitter": {"model": "recursive_character"}, + "semantic_search": { + "model": "mixedbread-ai/mxbai-embed-large-v1", + }, + }, + } + ) + .into(), + ), +)?; +let mut collection = Collection::new("test_collection", None)?; +collection.add_pipeline(&mut pipeline).await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +PipelineC *pipeline = pgml_pipelinec_new("test_pipeline", "{\ + \"abstract\": {\ + \"semantic_search\": {\ + \"model\": \"Alibaba-NLP/gte-base-en-v1.5\"\ + },\ + \"full_text_search\": {\"configuration\": \"english\"}\ + },\ + \"body\": {\ + \"splitter\": {\"model\": \"recursive_character\"},\ + \"semantic_search\": {\ + \"model\": \"Alibaba-NLP/gte-base-en-v1.5\"\ + }\ + }\ +}"); +CollectionC * collection = pgml_collectionc_new("test_collection", NULL); +pgml_collectionc_add_pipeline(collection, pipeline); ``` {% endtab %} {% endtabs %} @@ -63,8 +114,8 @@ const results = await collection.vector_search( fields: { body: { query: "What is the best database?", parameters: { - instruction: - "Represent the Wikipedia question for retrieving supporting documents: ", + prompt: + "Represent this sentence for searching relevant passages: ", } }, }, @@ -85,7 +136,7 @@ results = await collection.vector_search( "body": { "query": "What is the best database?", "parameters": { - "instruction": "Represent the Wikipedia question for retrieving supporting documents: ", + "prompt": "Represent this sentence for searching relevant passages: ", }, }, }, @@ -96,9 +147,56 @@ results = await 
collection.vector_search( ) ``` {% endtab %} +{% tab title="Rust" %} +```rust +let results = collection + .vector_search( + serde_json::json!({ + "query": { + "fields": { + "body": { + "query": "What is the best database?", + "parameters": { + "prompt": "Represent this sentence for searching relevant passages: ", + }, + }, + }, + }, + "limit": 5, + }) + .into(), + &mut pipeline, + ) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +r_size = 0; +char **results = pgml_collectionc_vector_search(collection, "{\ + \"query\": {\ + \"fields\": {\ + \"body\": {\ + \"query\": \"What is the best database?\",\ + \"parameters\": {\ + \"prompt\": \"Represent this sentence for searching relevant passages: \"\ + }\ + }\ + }\ + },\ + \"limit\": 5\ +}", +pipeline, &r_size); +``` +{% endtab %} {% endtabs %} -Let's break this down. `vector_search` takes in a `JSON` object and a `Pipeline`. The `JSON` object currently supports two keys: `query` and `limit` . The `limit` limits how many chunks should be returned, the `query` specifies the actual query to perform. Let's see another more complicated example: +Let's break this down. `vector_search` takes in a `JSON` object and a `Pipeline`. The `JSON` object currently supports two keys: `query` and `limit` . The `limit` limits how many chunks should be returned, the `query` specifies the actual query to perform. + +Note that `mixedbread-ai/mxbai-embed-large-v1` takes in a prompt when creating embeddings for searching against a corpus which we provide in the `parameters`. 
+ +Let's see another more complicated example: {% tabs %} {% tab title="JavaScript" %} @@ -115,7 +213,7 @@ const results = await collection.vector_search( body: { query: query, parameters: { instruction: - "Represent the Wikipedia question for retrieving supporting documents: ", + "Represent this sentence for searching relevant passages: ", } }, }, @@ -141,7 +239,7 @@ results = await collection.vector_search( "body": { "query": query, "parameters": { - "instruction": "Represent the Wikipedia question for retrieving supporting documents: ", + "instruction": "Represent this sentence for searching relevant passages: ", }, }, }, @@ -151,6 +249,59 @@ results = await collection.vector_search( pipeline, ) +``` +{% endtab %} + +{% tab title="Rust" %} +```rust +let query = "What is the best database?"; +let results = collection + .vector_search( + serde_json::json!({ + "query": { + "fields": { + "abstract": { + "query": query, + "full_text_filter": "database", + }, + "body": { + "query": query, + "parameters": { + "instruction": "Represent this sentence for searching relevant passages: ", + }, + }, + }, + }, + "limit": 5, + }) + .into(), + &mut pipeline, + ) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +r_size = 0; +char **results = pgml_collectionc_vector_search(collection, "{\ + \"query\": {\ + \"fields\": {\ + \"abstract\": {\ + \"query\": \"What is the best database?\",\ + \"full_text_filter\": \"database\"\ + },\ + \"body\": {\ + \"query\": \"What is the best database?\",\ + \"parameters\": {\ + \"instruction\": \"Represent this sentence for searching relevant passages: \"\ + }\ + }\ + }\ + },\ + \"limit\": 5\ +}", pipeline, &r_size); ``` {% endtab %} {% endtabs %} @@ -173,7 +324,7 @@ const results = await collection.vector_search( body: { query: "What is the best database?", parameters: { instruction: - "Represent the Wikipedia question for retrieving supporting documents: ", + "Represent this sentence for searching relevant passages: ", 
} }, }, @@ -199,7 +350,7 @@ results = await collection.vector_search( "body": { "query": "What is the best database?", "parameters": { - "instruction": "Represent the Wikipedia question for retrieving supporting documents: ", + "instruction": "Represent this sentence for searching relevant passages: ", }, }, }, @@ -211,6 +362,52 @@ results = await collection.vector_search( ) ``` {% endtab %} + +{% endtab %} +{% tab title="Rust" %} +```rust +let results = collection + .vector_search( + serde_json::json!({ + "query": { + "fields": { + "body": { + "query": "What is the best database?", + "parameters": { + "instruction": "Represent this sentence for searching relevant passages: ", + }, + }, + }, + "filter": {"user_id": {"$eq": 1}}, + }, + "limit": 5, + }) + .into(), + &mut pipeline, + ) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +r_size = 0; +char **results = pgml_collectionc_vector_search(collection, "{\ + \"query\": {\ + \"fields\": {\ + \"body\": {\ + \"query\": \"What is the best database?\",\ + \"parameters\": {\ + \"instruction\": \"Represent this sentence for searching relevant passages: \"\ + }\ + }\ + },\ + \"filter\": {\"user_id\": {\"$eq\": 1}}\ + },\ + \"limit\": 5\ +}", pipeline, &r_size); +``` +{% endtab %} {% endtabs %} The above query would filter out all chunks from documents that do not contain a key `user_id` equal to `1`. 
@@ -227,7 +424,7 @@ const results = await collection.vector_search( body: { query: "What is the best database?", parameters: { instruction: - "Represent the Wikipedia question for retrieving supporting documents: ", + "Represent this sentence for searching relevant passages: ", } }, }, @@ -253,7 +450,7 @@ results = await collection.vector_search( "body": { "query": "What is the best database?", "parameters": { - "instruction": "Represent the Wikipedia question for retrieving supporting documents: ", + "instruction": "Represent this sentence for searching relevant passages: ", }, }, }, @@ -265,6 +462,52 @@ results = await collection.vector_search( ) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let results = collection + .vector_search( + serde_json::json!({ + "query": { + "fields": { + "body": { + "query": "What is the best database?", + "parameters": { + "instruction": "Represent this sentence for searching relevant passages: ", + }, + }, + }, + "filter": {"user_id": {"$gte": 1}}, + }, + "limit": 5, + }) + .into(), + &mut pipeline, + ) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +r_size = 0; +char **results = pgml_collectionc_vector_search(collection, "{\ + \"query\": {\ + \"fields\": {\ + \"body\": {\ + \"query\": \"What is the best database?\",\ + \"parameters\": {\ + \"instruction\": \"Represent this sentence for searching relevant passages: \"\ + }\ + }\ + },\ + \"filter\": {\"user_id\": {\"$gte\": 1}}\ + },\ + \"limit\": 5\ +}", pipeline, &r_size); +``` +{% endtab %} {% endtabs %} The above query would filter out all documents that do not contain a key `user_id` with a value greater than or equal to `1`. 
@@ -281,7 +524,7 @@ const results = await collection.vector_search( body: { query: "What is the best database?", parameters: { instruction: - "Represent the Wikipedia question for retrieving supporting documents: ", + "Represent this sentence for searching relevant passages: ", } }, }, @@ -325,7 +568,7 @@ results = await collection.vector_search( "body": { "query": "What is the best database?", "parameters": { - "instruction": "Represent the Wikipedia question for retrieving supporting documents: ", + "instruction": "Represent this sentence for searching relevant passages: ", }, }, }, From 7efe6d9fc8d973452381140e9c5852119033a360 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 5 Jun 2024 09:48:35 -0700 Subject: [PATCH 2/4] Updated everything to have rust and c --- .../docs/api/client-sdk/document-search.md | 117 +++++++++++++++++- pgml-cms/docs/api/client-sdk/search.md | 59 ++++++++- 2 files changed, 169 insertions(+), 7 deletions(-) diff --git a/pgml-cms/docs/api/client-sdk/document-search.md b/pgml-cms/docs/api/client-sdk/document-search.md index cf91f95ee..4ada75d7f 100644 --- a/pgml-cms/docs/api/client-sdk/document-search.md +++ b/pgml-cms/docs/api/client-sdk/document-search.md @@ -10,14 +10,14 @@ This section will assume we have previously ran the following code: const pipeline = pgml.newPipeline("test_pipeline", { abstract: { semantic_search: { - model: "Alibaba-NLP/gte-base-en-v1.5", + model: "mixedbread-ai/mxbai-embed-large-v1", }, full_text_search: { configuration: "english" }, }, body: { splitter: { model: "recursive_character" }, semantic_search: { - model: "Alibaba-NLP/gte-base-en-v1.5", + model: "mixedbread-ai/mxbai-embed-large-v1", }, }, }); @@ -33,14 +33,14 @@ pipeline = Pipeline( { "abstract": { "semantic_search": { - "model": "Alibaba-NLP/gte-base-en-v1.5", + "model": "mixedbread-ai/mxbai-embed-large-v1", }, "full_text_search": {"configuration": "english"}, }, "body": { "splitter": {"model": 
"recursive_character"}, "semantic_search": { - "model": "Alibaba-NLP/gte-base-en-v1.5", + "model": "mixedbread-ai/mxbai-embed-large-v1", }, }, }, @@ -48,8 +48,60 @@ pipeline = Pipeline( collection = Collection("test_collection") ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let mut pipeline = Pipeline::new( + "test_pipeline", + Some( + serde_json::json!( + { + "abstract": { + "semantic_search": { + "model": "mixedbread-ai/mxbai-embed-large-v1", + }, + "full_text_search": {"configuration": "english"}, + }, + "body": { + "splitter": {"model": "recursive_character"}, + "semantic_search": { + "model": "mixedbread-ai/mxbai-embed-large-v1", + }, + }, + } + ) + .into(), + ), +)?; +let mut collection = Collection::new("test_collection", None)?; +collection.add_pipeline(&mut pipeline).await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +PipelineC *pipeline = pgml_pipelinec_new("test_pipeline", "{\ + \"abstract\": {\ + \"semantic_search\": {\ + \"model\": \"mixedbread-ai/mxbai-embed-large-v1\"\ + },\ + \"full_text_search\": {\"configuration\": \"english\"}\ + },\ + \"body\": {\ + \"splitter\": {\"model\": \"recursive_character\"},\ + \"semantic_search\": {\ + \"model\": \"mixedbread-ai/mxbai-embed-large-v1\"\ + }\ + }\ +}"); +CollectionC * collection = pgml_collectionc_new("test_collection", NULL); +pgml_collectionc_add_pipeline(collection, pipeline); +``` +{% endtab %} {% endtabs %} +This creates a `Pipeline` that is capable of full text search and semantic search on the `abstract` and semantic search on the `body` of documents. 
+ ## Doing Document Search {% tabs %} @@ -108,6 +160,63 @@ results = await collection.search( ) ``` {% endtab %} + + +{% tab title="Rust" %} +```rust +let results = collection + .search(serde_json::json!({ + "query": { + "full_text_search": { + "abstract": {"query": "What is the best database?", "boost": 1.2} + }, + "semantic_search": { + "abstract": { + "query": "What is the best database?", + "boost": 2.0, + }, + "body": { + "query": "What is the best database?", + "boost": 1.25, + "parameters": { + "instruction": "Represent the Wikipedia question for retrieving supporting documents: ", + }, + }, + }, + "filter": {"user_id": {"$eq": 1}}, + }, + "limit": 10, + }).into(), &mut pipeline) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +char * results = pgml_collectionc_search(collection, "{\ + \"query\": {\ + \"full_text_search\": {\ + \"abstract\": {\"query\": \"What is the best database?\", \"boost\": 1.2}\ + },\ + \"semantic_search\": {\ + \"abstract\": {\ + \"query\": \"What is the best database?\",\ + \"boost\": 2.0\ + },\ + \"body\": {\ + \"query\": \"What is the best database?\",\ + \"boost\": 1.25,\ + \"parameters\": {\ + \"instruction\": \"Represent the Wikipedia question for retrieving supporting documents: \"\ + }\ + }\ + },\ + \"filter\": {\"user_id\": {\"$eq\": 1}}\ + },\ + \"limit\": 10\ +}", pipeline); +``` +{% endtab %} {% endtabs %} Just like `vector_search`, `search` takes in two arguments. The first is a `JSON` object specifying the `query` and `limit` and the second is the `Pipeline`. The `query` object can have three fields: `full_text_search`, `semantic_search` and `filter`. Both `full_text_search` and `semantic_search` function similarly. They take in the text to compare against, titled `query`, an optional `boost` parameter used to boost the effectiveness of the ranking, and `semantic_search` also takes in an optional `parameters` key which specifies parameters to pass to the embedding model when embedding the passed in text. 
diff --git a/pgml-cms/docs/api/client-sdk/search.md b/pgml-cms/docs/api/client-sdk/search.md index 3fc564c55..2d5b5ce41 100644 --- a/pgml-cms/docs/api/client-sdk/search.md +++ b/pgml-cms/docs/api/client-sdk/search.md @@ -147,6 +147,7 @@ results = await collection.vector_search( ) ``` {% endtab %} + {% tab title="Rust" %} ```rust let results = collection @@ -252,7 +253,6 @@ results = await collection.vector_search( ``` {% endtab %} -{% endtab %} {% tab title="Rust" %} ```rust let query = "What is the best database?"; @@ -363,7 +363,6 @@ results = await collection.vector_search( ``` {% endtab %} -{% endtab %} {% tab title="Rust" %} ```rust let results = collection @@ -463,7 +462,6 @@ results = await collection.vector_search( ``` {% endtab %} -{% endtab %} {% tab title="Rust" %} ```rust let results = collection @@ -585,6 +583,61 @@ results = await collection.vector_search( ) ``` {% endtab %} + +{% tab title="Rust" %} +```rust +let results = collection + .vector_search( + serde_json::json!({ + "query": { + "fields": { + "body": { + "query": "What is the best database?", + "parameters": { + "instruction": "Represent this sentence for searching relevant passages: ", + }, + }, + }, + "filter": { + "$or": [ + {"$and": [{"$eq": {"user_id": 1}}, {"$lt": {"user_score": 100}}]}, + {"special": {"$ne": true}}, + ], + }, + }, + "limit": 5, + }) + .into(), + &mut pipeline, + ) + .await?; +``` +{% endtab %} + +{% tab title="C" %} +```c +r_size = 0; +char **results = pgml_collectionc_vector_search(collection, "{\ + \"query\": {\ + \"fields\": {\ + \"body\": {\ + \"query\": \"What is the best database?\",\ + \"parameters\": {\ + \"instruction\": \"Represent this sentence for searching relevant passages: \"\ + }\ + }\ + },\ + \"filter\": {\ + \"$or\": [\ + {\"$and\": [{\"$eq\": {\"user_id\": 1}}, {\"$lt\": {\"user_score\": 100}}]},\ + {\"special\": {\"$ne\": true}}\ + ]\ + }\ + },\ + \"limit\": 5\ +}", pipeline, &r_size); +``` +{% endtab %} {% endtabs %} The above query would filter 
out all documents that do not have a key `special` with a value `True` or (have a key `user_id` equal to 1 and a key `user_score` less than 100). From b494857dc00f6935740f24e214fbbcd4b226ec95 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 5 Jun 2024 12:05:43 -0700 Subject: [PATCH 3/4] Rust and c docs ready to go --- pgml-cms/docs/api/client-sdk/README.md | 3 ++- pgml-cms/docs/api/client-sdk/pipelines.md | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pgml-cms/docs/api/client-sdk/README.md b/pgml-cms/docs/api/client-sdk/README.md index 0ccddb9f0..5e6fc56a0 100644 --- a/pgml-cms/docs/api/client-sdk/README.md +++ b/pgml-cms/docs/api/client-sdk/README.md @@ -89,9 +89,10 @@ async def main(): {% tab title="Rust" %} ```rust use pgml::{Collection, Pipeline}; +use anyhow::Error; #[tokio::main] -async fn main() -> Result<(), Box> { +async fn main() -> Result<(), Error> { let mut collection = Collection::new("sample_collection", None)?; } ``` diff --git a/pgml-cms/docs/api/client-sdk/pipelines.md b/pgml-cms/docs/api/client-sdk/pipelines.md index 6c3ed57cd..dccf3f2b7 100644 --- a/pgml-cms/docs/api/client-sdk/pipelines.md +++ b/pgml-cms/docs/api/client-sdk/pipelines.md @@ -439,6 +439,7 @@ Enabling a `Pipeline` will cause it to automatically run on all documents it may const pipeline = pgml.newPipeline("test_pipeline") const collection = pgml.newCollection("test_collection") await collection.remove_pipeline(pipeline) +``` {% endtab %} {% tab title="Python" %} From f9803076957752a69c3d49de3792df94ad661ac7 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 5 Jun 2024 12:24:38 -0700 Subject: [PATCH 4/4] Updated to make highlighting work --- pgml-dashboard/package-lock.json | 20 +++++++++++++++++++ pgml-dashboard/package.json | 1 + .../code_block/code_block_controller.js | 3 +++ pgml-dashboard/src/utils/markdown.rs | 2 ++ 4 files changed, 26 insertions(+) diff 
--git a/pgml-dashboard/package-lock.json b/pgml-dashboard/package-lock.json index 4fe4783c7..1da57fd91 100644 --- a/pgml-dashboard/package-lock.json +++ b/pgml-dashboard/package-lock.json @@ -5,6 +5,7 @@ "packages": { "": { "dependencies": { + "@codemirror/lang-cpp": "^6.0.2", "@codemirror/lang-javascript": "^6.2.1", "@codemirror/lang-json": "^6.0.1", "@codemirror/lang-python": "^6.1.3", @@ -46,6 +47,15 @@ "@lezer/common": "^1.1.0" } }, + "node_modules/@codemirror/lang-cpp": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/@codemirror/lang-cpp/-/lang-cpp-6.0.2.tgz", + "integrity": "sha512-6oYEYUKHvrnacXxWxYa6t4puTlbN3dgV662BDfSH8+MfjQjVmP697/KYTDOqpxgerkvoNm7q5wlFMBeX8ZMocg==", + "dependencies": { + "@codemirror/language": "^6.0.0", + "@lezer/cpp": "^1.0.0" + } + }, "node_modules/@codemirror/lang-javascript": { "version": "6.2.2", "resolved": "https://registry.npmjs.org/@codemirror/lang-javascript/-/lang-javascript-6.2.2.tgz", @@ -143,6 +153,16 @@ "resolved": "https://registry.npmjs.org/@lezer/common/-/common-1.2.1.tgz", "integrity": "sha512-yemX0ZD2xS/73llMZIK6KplkjIjf2EvAHcinDi/TfJ9hS25G0388+ClHt6/3but0oOxinTcQHJLDXh6w1crzFQ==" }, + "node_modules/@lezer/cpp": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@lezer/cpp/-/cpp-1.1.2.tgz", + "integrity": "sha512-macwKtyeUO0EW86r3xWQCzOV9/CF8imJLpJlPv3sDY57cPGeUZ8gXWOWNlJr52TVByMV3PayFQCA5SHEERDmVQ==", + "dependencies": { + "@lezer/common": "^1.2.0", + "@lezer/highlight": "^1.0.0", + "@lezer/lr": "^1.0.0" + } + }, "node_modules/@lezer/highlight": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/@lezer/highlight/-/highlight-1.2.0.tgz", diff --git a/pgml-dashboard/package.json b/pgml-dashboard/package.json index bc2860eaa..be19da478 100644 --- a/pgml-dashboard/package.json +++ b/pgml-dashboard/package.json @@ -3,6 +3,7 @@ "@codemirror/lang-javascript": "^6.2.1", "@codemirror/lang-python": "^6.1.3", "@codemirror/lang-rust": "^6.0.1", + "@codemirror/lang-cpp": "^6.0.2", 
"postgresml-lang-sql": "^6.6.3-5", "@codemirror/lang-json": "^6.0.1", "@codemirror/state": "^6.2.1", diff --git a/pgml-dashboard/src/components/code_block/code_block_controller.js b/pgml-dashboard/src/components/code_block/code_block_controller.js index 8817ea08c..25b06a97e 100644 --- a/pgml-dashboard/src/components/code_block/code_block_controller.js +++ b/pgml-dashboard/src/components/code_block/code_block_controller.js @@ -4,6 +4,7 @@ import { sql } from "postgresml-lang-sql"; import { python } from "@codemirror/lang-python"; import { javascript } from "@codemirror/lang-javascript"; import { rust } from "@codemirror/lang-rust"; +import { cpp } from "@codemirror/lang-cpp"; import { json } from "@codemirror/lang-json"; import { EditorView, ViewPlugin, Decoration } from "@codemirror/view"; import { RangeSetBuilder, Facet } from "@codemirror/state"; @@ -84,6 +85,8 @@ const language = (element) => { return rust; case "json": return json; + case "cpp": + return cpp; default: return null; } diff --git a/pgml-dashboard/src/utils/markdown.rs b/pgml-dashboard/src/utils/markdown.rs index 3863dae2e..f55e0ee7a 100644 --- a/pgml-dashboard/src/utils/markdown.rs +++ b/pgml-dashboard/src/utils/markdown.rs @@ -208,6 +208,8 @@ impl<'a> From<&str> for CodeFence<'a> { "postgresql-line-nums" } else if options.starts_with("rust") { "rust" + } else if options.starts_with("cpp") { + "cpp" } else if options.starts_with("json") { "json" } else { pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy