From ec470b790d90529e0edbc3132aa0cbe80e4d7728 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Mon, 21 Aug 2023 16:06:20 -0700 Subject: [PATCH 1/2] Cleaned up some documentation --- pgml-sdks/rust/pgml/Cargo.lock | 2 +- pgml-sdks/rust/pgml/build.rs | 2 +- pgml-sdks/rust/pgml/javascript/README.md | 533 ++++-------------- .../examples/getting-started/index.js | 4 +- .../getting-started/package-lock.json | 18 - pgml-sdks/rust/pgml/python/README.md | 2 +- pgml-sdks/rust/pgml/python/pgml/pgml.pyi | 2 +- 7 files changed, 112 insertions(+), 451 deletions(-) delete mode 100644 pgml-sdks/rust/pgml/javascript/examples/getting-started/package-lock.json diff --git a/pgml-sdks/rust/pgml/Cargo.lock b/pgml-sdks/rust/pgml/Cargo.lock index b0db6692f..e61b81510 100644 --- a/pgml-sdks/rust/pgml/Cargo.lock +++ b/pgml-sdks/rust/pgml/Cargo.lock @@ -1120,7 +1120,7 @@ checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" [[package]] name = "pgml" -version = "0.8.1" +version = "0.9.0" dependencies = [ "anyhow", "async-trait", diff --git a/pgml-sdks/rust/pgml/build.rs b/pgml-sdks/rust/pgml/build.rs index ea5bb25bd..656db9886 100644 --- a/pgml-sdks/rust/pgml/build.rs +++ b/pgml-sdks/rust/pgml/build.rs @@ -3,7 +3,7 @@ use std::fs::OpenOptions; use std::io::Write; const ADDITIONAL_DEFAULTS_FOR_PYTHON: &[u8] = br#" -def py_init_logger(level: Optional[str] = "Default set in Rust. Please see documentation.", format: Optional[str] = "Default set in Rust. 
Please see documentation.") -> None +def py_init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None Json = Any DateTime = int diff --git a/pgml-sdks/rust/pgml/javascript/README.md b/pgml-sdks/rust/pgml/javascript/README.md index 6c4ee83ae..6192d85f4 100644 --- a/pgml-sdks/rust/pgml/javascript/README.md +++ b/pgml-sdks/rust/pgml/javascript/README.md @@ -1,36 +1,28 @@ -# Open Source Alternative for Building End-to-End Vector Search Applications without OpenAI & Pinecone - ## Table of Contents - [Overview](#overview) - [Quickstart](#quickstart) - [Usage](#usage) - - [Create or Get a Collection](#create-or-get-a-collection) - - [Upsert Documents](#upsert-documents) - - [Generate Chunks](#generate-chunks) - - [Generate Embeddings](#generate-embeddings) - - [Vector Search](#vector-search) - - [Register Model](#register-model) - - [Register Text Splitter](#register-text-splitter) +- [Examples](./examples/README.md) - [Developer setup](#developer-setup) - [API Reference](#api-reference) - [Roadmap](#roadmap) ## Overview -The pgml SDK is designed to facilitate the development of scalable vector search applications on PostgreSQL databases. With this SDK, you can seamlessly manage various database tables related to documents, text chunks, text splitters, LLM (Language Model) models, and embeddings. By leveraging the SDK's capabilities, you can efficiently index LLM embeddings using PgVector for fast and accurate queries. +JavaScript SDK is designed to facilitate the development of scalable vector search applications on PostgreSQL databases. With this SDK, you can seamlessly manage various database tables related to documents, text chunks, text splitters, LLM (Language Model) models, and embeddings. By leveraging the SDK's capabilities, you can efficiently index LLM embeddings using PgVector for fast and accurate queries. 
### Key Features - **Automated Database Management**: With the SDK, you can easily handle the management of database tables related to documents, text chunks, text splitters, LLM models, and embeddings. This automated management system simplifies the process of setting up and maintaining your vector search application's data structure. -- **Embedding Generation from Open Source Models**: The Javascript SDK provides the ability to generate embeddings using hundreds of open source models. These models, trained on vast amounts of data, capture the semantic meaning of text and enable powerful analysis and search capabilities. +- **Embedding Generation from Open Source Models**: The JavaScript SDK provides the ability to generate embeddings using hundreds of open source models. These models, trained on vast amounts of data, capture the semantic meaning of text and enable powerful analysis and search capabilities. -- **Flexible and Scalable Vector Search**: The Javascript SDK empowers you to build flexible and scalable vector search applications. The Javascript SDK seamlessly integrates with PgVector, a PostgreSQL extension specifically designed for handling vector-based indexing and querying. By leveraging these indices, you can perform advanced searches, rank results by relevance, and retrieve accurate and meaningful information from your database. +- **Flexible and Scalable Vector Search**: The JavaScript SDK empowers you to build flexible and scalable vector search applications. The JavaScript SDK seamlessly integrates with PgVector, a PostgreSQL extension specifically designed for handling vector-based indexing and querying. By leveraging these indices, you can perform advanced searches, rank results by relevance, and retrieve accurate and meaningful information from your database. 
### Use Cases -Embeddings, the core concept of the pgml SDK, find applications in various scenarios, including: +Embeddings, the core concept of the JavaScript SDK, find applications in various scenarios, including: - Search: Embeddings are commonly used for search functionalities, where results are ranked by relevance to a query string. By comparing the embeddings of query strings and documents, you can retrieve search results in order of their similarity or relevance. @@ -42,229 +34,150 @@ Embeddings, the core concept of the pgml SDK, find applications in various scena - Classification: Embeddings are utilized in classification tasks, where text strings are classified based on their most similar label. By comparing the embeddings of text strings and labels, you can classify new text strings into predefined categories. -### How the SDK Works +### How the JavaScript SDK Works -The SDK streamlines the development of vector search applications by abstracting away the complexities of database management and indexing. Here's an overview of how the SDK works: +The JavaScript SDK streamlines the development of vector search applications by abstracting away the complexities of database management and indexing. Here's an overview of how the SDK works: -- **Document and Text Chunk Management**: The SDK provides a convenient interface to create, update, and delete documents and their corresponding text chunks. You can easily organize and structure your text data within the PostgreSQL database. +- **Automatic Document and Text Chunk Management**: The SDK provides a convenient interface to manage documents and pipelines, automatically handling chunking and embedding for you. You can easily organize and structure your text data within the PostgreSQL database. - **Open Source Model Integration**: With the SDK, you can seamlessly incorporate a wide range of open source models to generate high-quality embeddings. 
These models capture the semantic meaning of text and enable powerful analysis and search capabilities. -- **Embedding Indexing**: The Javascript SDK utilizes the PgVector extension to efficiently index the embeddings generated by the open source models. This indexing process optimizes search performance and allows for fast and accurate retrieval of relevant results. +- **Embedding Indexing**: The JavaScript SDK utilizes the PgVector extension to efficiently index the embeddings generated by the open source models. This indexing process optimizes search performance and allows for fast and accurate retrieval of relevant results. - **Querying and Search**: Once the embeddings are indexed, you can perform vector-based searches on the documents and text chunks stored in the PostgreSQL database. The SDK provides intuitive methods for executing queries and retrieving search results. ## Quickstart -Follow the steps below to quickly get started with the Javascript SDK for building scalable vector search applications on PostgresML databases. +Follow the steps below to quickly get started with the JavaScript SDK for building scalable vector search applications on PostgresML databases. ### Prerequisites Before you begin, make sure you have the following: -- PostgresML Database: You can [sign up for a free GPU-powered database](https://postgresml.org/signup) or [spin up a database using Docker](https://github.com/postgresml/postgresml#installation). Ensure you have a PostgresML database version >`2.3.1`. Set the `PGML_CONNECTION` environment variable to the connection string of your PostgresML database. If not set, the SDK will use the default connection string for your local installation `postgres://postgres@127.0.0.1:5433/pgml_development`. - -To install the Javascript SDK: - -```bash -npm install pgml -``` - -or +- PostgresML Database: Ensure you have a PostgresML database version >=`2.7.7`. 
You can spin up a database using [Docker](https://github.com/postgresml/postgresml#installation) or [sign up for a free GPU-powered database](https://postgresml.org/signup). -```bash -yarn add pgml -``` - -### Example Usage +- Set the `DATABASE_URL` environment variable to the connection string of your PostgresML database. -In the example below we will step through the code required to create a collection, upsert documents, generate chunks, generate embeddings, and perform vector search. +- Python version >=3.8.1 -#### Initialize project +### Installation -Run the following command to create a new npm project: +To install the JavaScript SDK, use pip: -```bash -mkdir pgml_example && cd pgml_example && npm init ``` - -Install required npm packages: - -```bash -npm install pgml dotenv +npm i pgml ``` -Create index.js file: +### Sample Code -```bash -touch index.js .env -``` - -Add your postgres connection string to the .env file: - -```bash -PGML_CONNECTION=postgres://postgres@localhost:5433/pgml_development -``` - -#### Create a collection - -Add the following code to index.js: +Once you have the JavaScript SDK installed, you can use the following sample code as a starting point for your vector search application: ```javascript const pgml = require("pgml"); -require("dotenv").config(); - -const CONNECTION_STRING = - process.env.PGML_CONNECTION || - "postgres://postgres@127.0.0.1:5433/pgml_development"; - -const db = await pgml.newDatabase(CONNECTION_STRING); const main = async () => { - const collection_name = "hello_world"; - const collection = await db.create_or_get_collection(collection_name); -}; - -main().then((results) => { - console.log("Vector search Results: ", results); -}); + collection = pgml.newCollection("my_javascript_collection"); ``` **Explanation:** -- The code imports the pgml sdk and dotenv. 
-- It defines the CONNECTION_STRING variable with the default local connection string, and retrieves the connection information from the PGML_CONNECTION environment variable or uses the default if not set. -- An instance of the Database class is created by passing the connection information. -- The method [`create_or_get_collection`](#create-or-get-a-collection) collection with the name `hello_world` is retrieved if it exists or a new collection is created. +- This code imports `pgml` and creates an instance of the Collection class which we will add pipelines and documents onto -Continuing within `main()` +Continuing within `const main` ```javascript -const documents = [ - { - name: "Document One", - text: "document one contents...", - }, - { - name: "Document Two", - text: "document two contents...", - }, -]; -await collection.upsert_documents(documents); -await collection.generate_chunks(); -await collection.generate_embeddings(); + model = pgml.newModel(); + splitter = pgml.newSplitter(); + pipeline = pgml.Pipeline("my_javascript_pipeline", model, splitter); + await collection.add_pipeline(pipeline); ``` -**Explanation:** +**Explanation** -- We define a list of documents with the `name` and `text` fields. The "text" field contains the string that will be embedded. The other fields will be stored as a 'metadata' object. -- The [`upsert_documents`](#upsert-documents) method is called to insert or update the documents in the collection. -- The [`generate_chunks`](#generate-chunks) method splits the documents into smaller text chunks for efficient indexing and search. -- The [`generate_embeddings`](#generate-embeddings) method generates embeddings for the documents in the collection. +- The code creates an instance of `Model` and `Splitter` using their default arguments. +- Finally, the code constructs a pipeline called `"my_javascript_pipeline"` and add it to the collection we Initialized above. 
This pipeline automatically generates chunks and embeddings for every upserted document. -Continuing within `main()` +Continuing with `const main` ```javascript -const results = await collection.vector_search( - "What are the contents of document one?", - {}, - 1 -); -// convert the results to array of objects -const results = queryResults.map((result) => { - const [similarity, text, metadata] = result; - return { - similarity, - text, - metadata, - }; -}); -await db.archive_collection(collection_name); -return results; + const documents = [ + { + id: "Document One", + text: "document one contents...", + }, + { + id: "Document Two", + text: "document two contents...", + }, + ]; + await collection.upsert_documents(documents); ``` -**Explanation:** +**Explanation** -- The [`vector_search`](#vector-search) method is used to perform a vector-based search on the collection. The first argument, `What are the contents of document one?`, represents the text for which you want to find the most similar results. The second argument is an object that holds the embedding model parameters, and in this case, it is empty. The third argument specifies the number of results to return. -- Next we convert the results to an array of objects. -- Lastly, the `archive_collection` method is called to archive the collection and free up resources in the PostgresML database. +- This code crates and upserts some filler documents. +- As mentioned above, the pipeline added earlier automatically runs and generates chunks and embeddings for each document. -Call `main` function. 
+Continuing within `const main` ```javascript -main().then((results) => { - console.log("success!", results); -}); -``` + const queryResults = await collection + .query() + .vector_recall("Some user query that will match document one first", pipeline) + .limit(2) + .fetch_all(); -**Putting it all together** + // Convert the results to an array of objects + const results = queryResults.map((result) => { + const [similarity, text, metadata] = result; + return { + similarity, + text, + metadata, + }; + }); + console.log(results); -Assuming you followed along, you should have a file `index.js` that looks like this: + await collection.archive(); +``` -```javascript -const pgml = require("pgml"); -require("dotenv").config(); +**Explanation:** -const CONNECTION_STRING = - process.env.PGML_CONNECTION || - "postgres://postgres@127.0.0.1:5433/pgml_development"; +- The `query` method is called to perform a vector-based search on the collection. The query string is `Some user query that will match document one first`, and the top 2 results are requested. +- The search results are converted to objects and printed. +- Finally, the `archive` method is called to archive the collection and free up resources in the PostgresML database. 
-const main = async () => { - const db = await pgml.newDatabase(CONNECTION_STRING); - const collection_name = "hello_world_2"; - const collection = await db.create_or_get_collection(collection_name); - const documents = [ - { - name: "Document One", - text: "document one contents...", - }, - { - name: "Document Two", - text: "document two contents...", - }, - ]; - await collection.upsert_documents(documents); - await collection.generate_chunks(); - await collection.generate_embeddings(); - const queryResults = await collection.vector_search( - "What are the contents of document one?", // query text - {}, // embedding model parameters - 1 // top_k - ); - - // convert the results to array of objects - const results = queryResults.map((result) => { - const [similarity, text, metadata] = result; - return { - similarity, - text, - metadata, - }; - }); - - await db.archive_collection(collection_name); - return results; -}; - -main().then((results) => { - console.log("Vector search Results: ", results); +Call `main` function. + +```javascript +main().then(() => { + console.log("Done with PostgresML demo"); }); ``` +**Running the Code** + +Open a terminal or command prompt and navigate to the directory where the file is saved. + Execute the following command: ``` -node index.js +node vector_search.js ``` -You should see the search results printed in the terminal. As you can see, our vector search engine found the right text chunk with the answer we are looking for. +You should see the search results printed in the terminal. As you can see, our vector search engine did match document one first. 
-```json +``` [ { - "similarity": 0.917946581193032, - "text": "document one contents...", - "metadata": { "name": "Document One" } + similarity: 0.8506832955692104, + text: 'document one contents...', + metadata: { id: 'Document One' } + }, + { + similarity: 0.8066114609244565, + text: 'document two contents...', + metadata: { id: 'Document Two' } } ] ``` @@ -273,286 +186,52 @@ You should see the search results printed in the terminal. As you can see, our v ### High-level Description -The Javascript SDK provides a set of functionalities to build scalable vector search applications on PostgresQL databases. It enables users to create a collection, which represents a schema in the database, to store tables for documents, chunks, models, splitters, and embeddings. The Collection class in the SDK handles all operations related to these tables, allowing users to interact with the collection and perform various tasks. - -### Connect to Database - -`.newDatabase(CONNECTION_STRING)` - -This method establishes a connection to a new database. - -#### Parameters: - -- `CONNECTION_STRING` (required): The connection string for the database. The connection string should be in the format of `postgres://username@hostname:port/database`. This parameter is required. - -The method initializes a connection pool to the DB and creates a table named `pgml.collections` if it does not already exist. - -#### Usage: - -```javascript -const pgml = require("pgml"); - -const CONNECTION_STRING = - process.env.PGML_CONNECTION || - "postgres://postgres@127.0.0.1:5433/pgml_development"; - -const db = await pgml.newDatabase(CONNECTION_STRING); -``` - -### Create or Get a Collection - -`.create_or_get_collection(collection_name)` - -This method either creates a new collection or retrieves an existing one from a PostgreSQL database. - -#### Parameters: - -- `collection_name` (required): The name of the collection to be created or retrieved. This parameter is required and must be a string. 
- -If the collection already exists in the database, this method will return that collection. If the collection does not exist, this method will create a new collection with the specified name, along with the associated tables and indices for documents, chunks, models, splitters, and embeddings. - -#### Usage: - -```javascript -const collection_name = "test_collection"; -const collection = await db.create_or_get_collection(collection_name); -``` - -In the above example, a collection named test_collection is either created or retrieved from the database. - -Ensure that the name provided is unique if you intend to create a new collection, as this function will return the existing collection if a collection with the same name already exists in the database. - -### Upsert Documents - -`.upsert_documents(documents)` - -This method is used to insert or update documents in a database table based on their ID, text, and any additional fields. All the fields except `id` and `text` will be aggregated and stored in a `metadata` object. - -#### Parameters: - -- `documents` (required): An array of document objects to be inserted or updated in the database. Each document object should be a dictionary containing at least an `id` and `text` fields. Any other fields will be considered as metadata. - - `id`: A unique identifier for the document. If a document with the same ID already exists in the database, the document will be updated with the new text and metadata. - - `text`: The text content of the document. - - Other fields (optional): Any other fields in the document will be considered as metadata. The structure of the metadata can vary based on the specific needs of your application. 
- -#### Usage: - -```javascript -let documents = [ - { - id: "1", - text: "This is a sample document", - author: "John Doe", - date: "2023-07-05", - }, - { - id: "2", - text: "This is another sample document", - }, -]; - -await collection.upsert_documents(documents); -``` - -In the above example, two documents are upserted into the database. The first document includes additional fields `author` and `date`, which will be stored as metadata. - -To update a document, simply upsert a document with the same `id` but different `text` or additional fields. For example: - -```javascript -let updated_document = { - id: "1", - text: "This is an updated sample document", - author: "John Doe", - date: "2023-07-05", - version: "2.0", -}; - -await collection.upsert_documents([updated_document]); -``` - -In this case, the document with ID `1` is updated with new text and additional fields `version`. The fields `author`, `date`, and `version` will be stored as metadata. - -### Generate Chunks - -`.generate_chunks(splitter_id)` - -This method is used to generate chunks of text from unchunked documents using a specified text splitter. - -#### Parameters: - -- `splitter_id` (optional): The ID of the splitter used for segmenting the text into chunks. This parameter is optional. If not specified, it defaults to `1`, which corresponds to the `RecursiveCharacterTextSplitter` with default parameters. - -The `splitter_id` should correspond to a splitter that is already registered. If you want to use a different splitter, you need to register it first. - -#### Usage: - -```javascript -await collection.generate_chunks(1); -``` - -In the above example, `splitter_id` is specified as `1`, which means that the default `RecursiveCharacterTextSplitter` is used to generate chunks. - -To use a different splitter, you need to pass its corresponding ID. For example: - -```javascript -await collection.generate_chunks(2); -``` - -In this case, the splitter with ID `2` is used. 
Ensure that this ID corresponds to a registered splitter. If the splitter isn't registered yet, you'll need to do so before using this method. For information on how to register a new splitter, refer to the [`register_text_splitter`](#register-text-splitter) documentation. - -### Generate Embeddings - -`.generate_embeddings(model_id, splitter_id)` - -This method generates embeddings from the chunks of text. - -#### Parameters: - -- `model_id` (optional): The ID of the model used to generate embeddings. This parameter is optional. If not specified, it defaults to `1`, corresponding to the `intfloat/e5-small` embeddings model. -- `splitter_id` (optional): The ID of the splitter used for segmenting the text into chunks. This parameter is optional. If not specified, it defaults to `1`. - -Both `splitter_id` and `model_id` should correspond to a splitter and model that are already registered. - -#### Usage: - -```javascript -await collection.generate_embeddings(1, 1); -``` - -In the above example, both `splitter_id` and `model_id` are specified as `1`, which means that the default splitter and `intfloat/e5-small` model are used to generate embeddings. +The JavaScript SDK provides a set of functionalities to build scalable vector search applications on PostgresQL databases. It enables users to create a collection, which represents a schema in the database, to store tables for documents, chunks, models, splitters, and embeddings. The Collection class in the SDK handles all operations related to these tables, allowing users to interact with the collection and perform various tasks. -To use a different splitter or model, you need to pass their corresponding IDs. For example: +#### Create or a Collection ```javascript -await collection.generate_embeddings(2, 3); +collection_name = pgml.newCollection("test_collection") ``` -In this case, the splitter with ID `3` and the model with ID `2` are used. Ensure that these IDs correspond to a registered splitter and model. 
- -### Vector Search - -`.vector_search(query, top_k, model_id, splitter_id)` +This initializes a new Collection used to do everything from upserting documents to performing vector search. -This method converts the input query into embeddings and searches the embeddings table for the nearest matches. - -#### Parameters: - -- `query` (required): The query text that needs to be converted into embeddings for vector search. -- `top_k` (optional): The number of top matches that should be returned. This parameter is optional. If not specified, it defaults to `10`. -- `model_id` (optional): The ID of the model used to convert query into embeddings. This parameter is optional. If not specified, it defaults to `1`, corresponding to the `intfloat/e5-small` embeddings model. -- `splitter_id` (optional): The ID of the splitter used for segmenting the query text into chunks. This parameter is optional. If not specified, it defaults to `1`. - -Both `splitter_id` and `model_id` should correspond to a splitter and model that are already registered. - -#### Usage: +### Add a Pipeline ```javascript -const results = await collection.vector_search( - "Who won 20 grammy awards?", - 2, // top_k - 1 // model_id - 1, // splitter_id -); +model = pgml.newModel() +splitter = pgml.newSplitter() +pipeline = pgml.newPipeline("test_pipeline", model, splitter) +await collection.add_pipeline(pipeline) ``` -The `vector_search` method returns an array of results, where each result is a an array where idx 0 is the similarity score, idx 1 is the text, and idx 2 is the metadata object. +This creates a new pipeline with the specified `Model` and `Splitter`. The pipelines do the heavy lifting automatically handling the chunking and embedding of documents. 
-You can format the results into an array of objects like so: +#### Upsert Documents ```javascript -// convert the results to array of objects -const results = queryResults.map((result) => { - const [similarity, text, metadata] = result; - return { - similarity, - text, - metadata, - }; -}); +await collection.upsert_documents(documents) ``` -### Register Model +The method is used to insert or update documents in a database table based on their ID, and text. All enabled pipelines automatically chunk and embed upserted documents. -`.register_model(model_name, model_params)` - -This method allows for the registration of a model that can be used in the collection. It creates a record if the model does not already exist. - -#### Parameters: - -- `model_name` (required): The name of the open source HuggingFace model being registered. This should be a string that represents the model name. -- `model_params` (optional): A dictionary containing parameters for configuring the model. This parameter is optional and can be left empty if no special configuration is needed for the model. - -The `model_name` should correspond to a valid HuggingFace model name. The `model_params`, if provided, should be a dictionary where keys are parameter names and values are the corresponding settings. - -#### Usage: +#### Vector Search ```javascript -const modelId = await collection.register_model("hkunlp/instructor-xl", { - instruction: "Represent the Wikipedia document for retrieval: ", -}); +results = await collection.query().vector_recall("Who won 20 grammy awards?", pipeline=pipeline).limit(2).fetch_all() ``` -In the above example, the `model_name` is "hkunlp/instructor-xl", and `model_params` is an object that sets the instruction to "Represent the Wikipedia document for retrieval: ". - -To register a model without any special parameters, you can simply pass the model name. 
For example: - -```javascript -const modelId = await collection.register_model("distilbert-base-uncased"); -``` - -In this case, the model "distilbert-base-uncased" is registered with the default parameters. Make sure that the model name corresponds to a valid HuggingFace model. - -### Register Text Splitter - -`.register_text_splitter(splitter_name, parameters)` - -This method registers a new text splitter in the system. - -#### Parameters: - -- `splitter_name` (required): The name of the splitter. The system currently supports the following splitter names: - - - `"character"` - - `"latex"` - - `"markdown"` - - `"nltk"` - - `"python"` - - `"recursive_character"` - - `"spacy"` - -- `parameters` (required): This is an object that contains the parameters specific to the splitter. For example, if you're using the `"recursive_character"` splitter, the parameters could be: - - `chunk_size`: Specifies the size of each chunk of text. - - `chunk_overlap`: Specifies how much each chunk should overlap with the next. - -The parameters required will depend on the splitter being used. Please refer to the [LangChain documentation](https://python.langchain.com/en/latest/reference/modules/text_splitter.html) for more details. - -#### Usage: - -```javascript -const textSplitterId = await collection.register_text_splitter( - "recursive_character", - { chunk_size: "100", chunk_overlap: "20" } -); -``` - -In the above example, the `recursive_character` splitter is registered with a `chunk_size` of `100` and `chunk_overlap` of `20`. The method returns the ID of the registered splitter, which can be used in other methods like `.generate_embeddings()`. - -Ensure that the `splitter_name` corresponds to one of the registered splitters, and the parameters match the requirements of that specific splitter. +The `query` method returns a flexible query builder for high performance filterable vector search. ### Developer Setup -This Javascript library is generated from our core rust-sdk. 
Please check [rust-sdk documentation](../../rust/pgml/README.md) for developer setup. - -### API Reference - -- [Database](./docs/pgml/database.md) -- [Collection](./docs/pgml/collection.md) +This JavaScript library is generated from our core rust-sdk. Please check [rust-sdk documentation](../../README.md) for developer setup. ### Roadmap -- Enable filters on document metadata in `vector_search`. [Issue](https://github.com/postgresml/postgresml/issues/663) -- `text_search` functionality on documents using Postgres text search. [Issue](https://github.com/postgresml/postgresml/issues/664) -- `hybrid_search` functionality that does a combination of `vector_search` and `text_search` in an order specified by the user. [Issue](https://github.com/postgresml/postgresml/issues/665) -- Ability to call and manage OpenAI embeddings for comparison purposes. [Issue](https://github.com/postgresml/postgresml/issues/666) +- [x] Enable filters on document metadata in `vector_search`. [Issue](https://github.com/postgresml/postgresml/issues/663) +- [x] `text_search` functionality on documents using Postgres text search. [Issue](https://github.com/postgresml/postgresml/issues/664) +- [x] `hybrid_search` functionality that does a combination of `vector_search` and `text_search` in an order specified by the user. [Issue](https://github.com/postgresml/postgresml/issues/665) +- [x] Ability to call and manage OpenAI embeddings for comparison purposes. [Issue](https://github.com/postgresml/postgresml/issues/666) - Save `vector_search` history for downstream monitoring of model performance. [Issue](https://github.com/postgresml/postgresml/issues/667) - Perform chunking on the DB with multiple langchain splitters. 
[Issue](https://github.com/postgresml/postgresml/issues/668) diff --git a/pgml-sdks/rust/pgml/javascript/examples/getting-started/index.js b/pgml-sdks/rust/pgml/javascript/examples/getting-started/index.js index 75bd75802..0c1c5f7eb 100644 --- a/pgml-sdks/rust/pgml/javascript/examples/getting-started/index.js +++ b/pgml-sdks/rust/pgml/javascript/examples/getting-started/index.js @@ -27,7 +27,7 @@ const main = async () => { // Perform vector search const queryResults = await collection .query() - .vector_recall("What are the contents of document one?", pipeline) + .vector_recall("Some user query that will match document one first", pipeline) .limit(2) .fetch_all(); @@ -46,5 +46,5 @@ const main = async () => { }; main().then((results) => { - console.log("Vector search Results: ", results); + console.log("Vector search Results: \n", results); }); diff --git a/pgml-sdks/rust/pgml/javascript/examples/getting-started/package-lock.json b/pgml-sdks/rust/pgml/javascript/examples/getting-started/package-lock.json deleted file mode 100644 index 78645d335..000000000 --- a/pgml-sdks/rust/pgml/javascript/examples/getting-started/package-lock.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "name": "getting-started", - "version": "1.0.0", - "lockfileVersion": 1, - "requires": true, - "dependencies": { - "dotenv": { - "version": "16.3.1", - "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.3.1.tgz", - "integrity": "sha512-IPzF4w4/Rd94bA9imS68tZBaYyBWSCE47V1RGuMrB94iyTOIEwRmVL2x/4An+6mETpLrKJ5hQkB8W4kFAadeIQ==" - }, - "pgml": { - "version": "0.1.6", - "resolved": "https://registry.npmjs.org/pgml/-/pgml-0.1.6.tgz", - "integrity": "sha512-gjuEYDPl7TrnsxtL2htXb0NCKQ6FhM9kyRvzwYNhYQSRdnEHVps4Yk3vIL9QoZj1y9niuGW6aXebx4T734TJhQ==" - } - } -} diff --git a/pgml-sdks/rust/pgml/python/README.md b/pgml-sdks/rust/pgml/python/README.md index b37174feb..01c154112 100644 --- a/pgml-sdks/rust/pgml/python/README.md +++ b/pgml-sdks/rust/pgml/python/README.md @@ -56,7 +56,7 @@ Follow the steps below 
to quickly get started with the Python SDK for building s Before you begin, make sure you have the following: -- PostgresML Database: Ensure you have a PostgresML database version >`2.3.1`. You can spin up a database using [Docker](https://github.com/postgresml/postgresml#installation) or [sign up for a free GPU-powered database](https://postgresml.org/signup). +- PostgresML Database: Ensure you have a PostgresML database version >= `2.7.7` You can spin up a database using [Docker](https://github.com/postgresml/postgresml#installation) or [sign up for a free GPU-powered database](https://postgresml.org/signup). - Set the `DATABASE_URL` environment variable to the connection string of your PostgresML database. diff --git a/pgml-sdks/rust/pgml/python/pgml/pgml.pyi b/pgml-sdks/rust/pgml/python/pgml/pgml.pyi index 6c6dbb04f..a1bb82320 100644 --- a/pgml-sdks/rust/pgml/python/pgml/pgml.pyi +++ b/pgml-sdks/rust/pgml/python/pgml/pgml.pyi @@ -1,5 +1,5 @@ -def py_init_logger(level: Optional[str] = "Default set in Rust. Please see documentation.", format: Optional[str] = "Default set in Rust. Please see documentation.") -> None +def py_init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None Json = Any DateTime = int From 737d40df018941e6525b7c23005ddb05355e2f13 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Mon, 21 Aug 2023 16:11:59 -0700 Subject: [PATCH 2/2] Fixed spelling --- pgml-sdks/rust/pgml/javascript/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgml-sdks/rust/pgml/javascript/README.md b/pgml-sdks/rust/pgml/javascript/README.md index 6192d85f4..68f657676 100644 --- a/pgml-sdks/rust/pgml/javascript/README.md +++ b/pgml-sdks/rust/pgml/javascript/README.md @@ -62,7 +62,7 @@ Before you begin, make sure you have the following: ### Installation -To install the JavaScript SDK, use pip: +To install the JavaScript SDK, use npm: ``` npm i pgml pFad - Phonifier reborn

```