From be038978c57b758de5758acb52380608174e31bc Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 10 Jan 2024 09:18:18 -0800 Subject: [PATCH 01/72] New site search --- pgml-dashboard/src/api/cms.rs | 13 +- pgml-dashboard/src/main.rs | 8 +- pgml-dashboard/src/utils/markdown.rs | 360 +++- pgml-sdks/pgml/src/collection.rs | 1083 +++++----- pgml-sdks/pgml/src/lib.rs | 2258 ++++++++++++-------- pgml-sdks/pgml/src/models.rs | 30 +- pgml-sdks/pgml/src/multi_field_pipeline.rs | 755 +++++++ pgml-sdks/pgml/src/pipeline.rs | 1248 +++++------ pgml-sdks/pgml/src/queries.rs | 57 +- pgml-sdks/pgml/src/query_builder.rs | 174 +- pgml-sdks/pgml/src/remote_embeddings.rs | 49 +- pgml-sdks/pgml/src/search_query_builder.rs | 258 +++ pgml-sdks/pgml/src/types.rs | 4 +- 13 files changed, 3958 insertions(+), 2339 deletions(-) create mode 100644 pgml-sdks/pgml/src/multi_field_pipeline.rs create mode 100644 pgml-sdks/pgml/src/search_query_builder.rs diff --git a/pgml-dashboard/src/api/cms.rs b/pgml-dashboard/src/api/cms.rs index 2048b24c8..d2a7c767f 100644 --- a/pgml-dashboard/src/api/cms.rs +++ b/pgml-dashboard/src/api/cms.rs @@ -559,8 +559,15 @@ impl Collection { } #[get("/search?", rank = 20)] -async fn search(query: &str, index: &State) -> ResponseOk { - let results = index.search(query).unwrap(); +async fn search( + query: &str, + site_search: &State, +) -> ResponseOk { + eprintln!("\n\nWE IN HERE\n\n"); + let results = site_search + .search(query) + .await + .expect("Error performing search"); ResponseOk( Template(Search { @@ -779,7 +786,7 @@ This is the end of the markdown async fn rocket() -> Rocket { dotenv::dotenv().ok(); rocket::build() - .manage(crate::utils::markdown::SearchIndex::open().unwrap()) + // .manage(crate::utils::markdown::SearchIndex::open().unwrap()) .mount("/", crate::api::cms::routes()) } diff --git a/pgml-dashboard/src/main.rs b/pgml-dashboard/src/main.rs index f09b21d8b..275e9c5df 100644 --- a/pgml-dashboard/src/main.rs +++ b/pgml-dashboard/src/main.rs @@ -92,14 +92,18 @@ async fn main() { // it's important to hang on to sentry so it isn't dropped and stops reporting let _sentry = configure_reporting().await; - markdown::SearchIndex::build().await.unwrap(); + // markdown::SearchIndex::build().await.unwrap(); + + let site_search = markdown::SiteSearch::new() + .await + .expect("Error initializing site search"); pgml_dashboard::migrate(guards::Cluster::default(None).pool()) .await .unwrap(); let _ = rocket::build() - .manage(markdown::SearchIndex::open().unwrap()) + .manage(site_search) .mount("/", rocket::routes![index, error]) .mount("/dashboard/static", FileServer::from(config::static_dir())) .mount("/dashboard", pgml_dashboard::routes()) diff --git a/pgml-dashboard/src/utils/markdown.rs b/pgml-dashboard/src/utils/markdown.rs index dcd878e3a..ee19c606c 100644 --- a/pgml-dashboard/src/utils/markdown.rs +++ b/pgml-dashboard/src/utils/markdown.rs @@ -15,6 +15,7 @@ use comrak::{ use convert_case; use itertools::Itertools; use regex::Regex; +use serde::{Deserialize, Serialize}; use tantivy::collector::TopDocs; use tantivy::query::{QueryParser, RegexQuery}; use tantivy::schema::*; @@ -1222,6 +1223,7 @@ pub async fn get_document(path: &PathBuf) -> anyhow::Result { Ok(tokio::fs::read_to_string(path).await?) } +#[derive(Deserialize)] pub struct SearchResult { pub title: String, pub body: String, @@ -1229,20 +1231,33 @@ pub struct SearchResult { pub snippet: String, } -pub struct SearchIndex { - // The index. 
- pub index: Arc, +#[derive(Serialize)] +struct Document { + id: String, + title: String, + body: String, + path: String, +} - // Index schema (fields). - pub schema: Arc, +impl Document { + fn new(id: String, title: String, body: String, path: String) -> Self { + Self { id, title, body, path } + } +} - // The index reader, supports concurrent access. - pub reader: Arc, +pub struct SiteSearch { + collection: pgml::Collection, + pipeline: pgml::MultiFieldPipeline, } -impl SearchIndex { - pub fn path() -> PathBuf { - Path::new(&config::search_index_dir()).to_owned() +impl SiteSearch { + pub async fn new() -> anyhow::Result { + let collection = pgml::Collection::new( + "hypercloud-site-search-c-1", + Some(std::env::var("SITE_SEARCH_DATABASE_URL")?), + ); + let pipeline = pgml::MultiFieldPipeline::new("hypercloud-site-search-p-1", serde_json::json!({}).into()); + Ok(Self { collection, pipeline }) } pub fn documents() -> Vec { @@ -1255,23 +1270,59 @@ impl SearchIndex { .collect() } - pub fn schema() -> Schema { - // TODO: Make trigram title index - // and full text body index, and use trigram only if body gets nothing. - let mut schema_builder = Schema::builder(); - let title_field_indexing = TextFieldIndexing::default() - .set_tokenizer("ngram3") - .set_index_option(IndexRecordOption::WithFreqsAndPositions); - let title_options = TextOptions::default() - .set_indexing_options(title_field_indexing) - .set_stored(); - - schema_builder.add_text_field("title", title_options.clone()); - schema_builder.add_text_field("title_regex", TEXT | STORED); - schema_builder.add_text_field("body", TEXT | STORED); - schema_builder.add_text_field("path", STORED); - - schema_builder.build() + pub async fn search(&self, query: &str) -> anyhow::Result> { + self.collection + .search( + serde_json::json!({ + "query": { + "semantic_search": { + "title": { + "query": query, + "boost": 2.0, + }, + "body": { + "query": query, + } + } + } + }) + .into(), + &self.pipeline, + ) + .await? + .into_iter() + .map(|r| serde_json::from_value(r.0).map_err(anyhow::Error::msg)) + } + + pub async fn build(&mut self) -> anyhow::Result<()> { + let documents: Vec = + futures::future::try_join_all(Self::get_document_paths()?.into_iter().map(|path| async move { + let text = get_document(&path).await?; + + let arena = Arena::new(); + let root = parse_document(&arena, &text, &options()); + let title_text = get_title(root)?; + let body_text = get_text(root)?.into_iter().join(" "); + + let path = path + .to_str() + .unwrap() + .to_string() + .split("content") + .last() + .unwrap() + .to_string() + .replace("README", "") + .replace(&config::cms_dir().display().to_string(), ""); + + anyhow::Ok(Document::new(path.clone(), title_text, body_text, path)) + })) + .await?; + let documents: Vec = documents + .into_iter() + .map(|d| serde_json::to_value(d).unwrap().into()) + .collect(); + self.collection.upsert_documents(documents, None).await } pub async fn build() -> tantivy::Result<()> { @@ -1468,8 +1519,263 @@ impl SearchIndex { Ok(results) } + + fn get_document_paths() -> anyhow::Result> { + // TODO imrpove this .display().to_string() + let guides = glob::glob(&config::cms_dir().join("docs/**/*.md").display().to_string())?; + let blogs = glob::glob(&config::cms_dir().join("blog/**/*.md").display().to_string())?; + Ok(guides + .chain(blogs) + .map(|path| path.expect("glob path failed")) + .collect()) + } } +// pub struct SearchIndex { +// // The index. +// pub index: Arc, + +// // Index schema (fields). 
+// pub schema: Arc, + +// // The index reader, supports concurrent access. +// pub reader: Arc, +// } + +// impl SearchIndex { +// pub fn path() -> PathBuf { +// Path::new(&config::search_index_dir()).to_owned() +// } + +// pub fn documents() -> Vec { +// // TODO imrpove this .display().to_string() +// let guides = glob::glob(&config::cms_dir().join("docs/**/*.md").display().to_string()) +// .expect("glob failed"); +// let blogs = glob::glob(&config::cms_dir().join("blog/**/*.md").display().to_string()) +// .expect("glob failed"); +// guides +// .chain(blogs) +// .map(|path| path.expect("glob path failed")) +// .collect() +// } + +// pub fn schema() -> Schema { +// // TODO: Make trigram title index +// // and full text body index, and use trigram only if body gets nothing. +// let mut schema_builder = Schema::builder(); +// let title_field_indexing = TextFieldIndexing::default() +// .set_tokenizer("ngram3") +// .set_index_option(IndexRecordOption::WithFreqsAndPositions); +// let title_options = TextOptions::default() +// .set_indexing_options(title_field_indexing) +// .set_stored(); + +// schema_builder.add_text_field("title", title_options.clone()); +// schema_builder.add_text_field("title_regex", TEXT | STORED); +// schema_builder.add_text_field("body", TEXT | STORED); +// schema_builder.add_text_field("path", STORED); + +// schema_builder.build() +// } + +// pub async fn build() -> tantivy::Result<()> { +// // Remove existing index. +// let _ = std::fs::remove_dir_all(Self::path()); +// std::fs::create_dir(Self::path()).unwrap(); + +// let index = tokio::task::spawn_blocking(move || -> tantivy::Result { +// Index::create_in_dir(Self::path(), Self::schema()) +// }) +// .await +// .unwrap()?; + +// let ngram = TextAnalyzer::from(NgramTokenizer::new(3, 3, false)).filter(LowerCaser); + +// index.tokenizers().register("ngram3", ngram); + +// let schema = Self::schema(); +// let mut index_writer = index.writer(50_000_000)?; + +// for path in Self::documents().into_iter() { +// let text = get_document(&path).await.unwrap(); + +// let arena = Arena::new(); +// let root = parse_document(&arena, &text, &options()); +// let title_text = get_title(root).unwrap(); +// let body_text = get_text(root).unwrap().into_iter().join(" "); + +// let title_field = schema.get_field("title").unwrap(); +// let body_field = schema.get_field("body").unwrap(); +// let path_field = schema.get_field("path").unwrap(); +// let title_regex_field = schema.get_field("title_regex").unwrap(); + +// info!("found path: {path}", path = path.display()); +// let path = path +// .to_str() +// .unwrap() +// .to_string() +// .split("content") +// .last() +// .unwrap() +// .to_string() +// .replace("README", "") +// .replace(&config::cms_dir().display().to_string(), ""); +// let mut doc = Document::default(); +// doc.add_text(title_field, &title_text); +// doc.add_text(body_field, &body_text); +// doc.add_text(path_field, &path); +// doc.add_text(title_regex_field, &title_text); + +// index_writer.add_document(doc)?; +// } + +// tokio::task::spawn_blocking(move || -> tantivy::Result { index_writer.commit() }) +// .await +// .unwrap()?; + +// Ok(()) +// } + +// pub fn open() -> tantivy::Result { +// let path = Self::path(); + +// if !path.exists() { +// std::fs::create_dir(&path) +// .expect("failed to create search_index directory, is the filesystem writable?"); +// } + +// let index = match tantivy::Index::open_in_dir(&path) { +// Ok(index) => index, +// Err(err) => { +// warn!( +// "Failed to open Tantivy index in '{}', creating 
an empty one, error: {}", +// path.display(), +// err +// ); +// Index::create_in_dir(&path, Self::schema())? +// } +// }; + +// let reader = index.reader_builder().try_into()?; + +// let ngram = TextAnalyzer::from(NgramTokenizer::new(3, 3, false)).filter(LowerCaser); + +// index.tokenizers().register("ngram3", ngram); + +// Ok(SearchIndex { +// index: Arc::new(index), +// schema: Arc::new(Self::schema()), +// reader: Arc::new(reader), +// }) +// } + +// pub fn search(&self, query_string: &str) -> tantivy::Result> { +// let mut results = Vec::new(); +// let searcher = self.reader.searcher(); +// let title_field = self.schema.get_field("title").unwrap(); +// let body_field = self.schema.get_field("body").unwrap(); +// let path_field = self.schema.get_field("path").unwrap(); +// let title_regex_field = self.schema.get_field("title_regex").unwrap(); + +// // Search using: +// // +// // 1. Full text search on the body +// // 2. Trigrams on the title +// let query_parser = QueryParser::for_index(&self.index, vec![title_field, body_field]); +// let query = match query_parser.parse_query(query_string) { +// Ok(query) => query, +// Err(err) => { +// warn!("Query parse error: {}", err); +// return Ok(Vec::new()); +// } +// }; + +// let mut top_docs = searcher.search(&query, &TopDocs::with_limit(10)).unwrap(); + +// // If that's not enough, search using prefix search on the title. +// if top_docs.len() < 10 { +// let query = +// match RegexQuery::from_pattern(&format!("{}.*", query_string), title_regex_field) { +// Ok(query) => query, +// Err(err) => { +// warn!("Query regex error: {}", err); +// return Ok(Vec::new()); +// } +// }; + +// let more_results = searcher.search(&query, &TopDocs::with_limit(10)).unwrap(); +// top_docs.extend(more_results); +// } + +// // Oh jeez ok +// if top_docs.len() < 10 { +// let query = match RegexQuery::from_pattern(&format!("{}.*", query_string), body_field) { +// Ok(query) => query, +// Err(err) => { +// warn!("Query regex error: {}", err); +// return Ok(Vec::new()); +// } +// }; + +// let more_results = searcher.search(&query, &TopDocs::with_limit(10)).unwrap(); +// top_docs.extend(more_results); +// } + +// // Generate snippets for the FTS query. +// let snippet_generator = SnippetGenerator::create(&searcher, &*query, body_field)?; + +// let mut dedup = HashSet::new(); + +// for (_score, doc_address) in top_docs { +// let retrieved_doc = searcher.doc(doc_address)?; +// let snippet = snippet_generator.snippet_from_doc(&retrieved_doc); +// let path = retrieved_doc +// .get_first(path_field) +// .unwrap() +// .as_text() +// .unwrap() +// .to_string() +// .replace(".md", "") +// .replace(&config::static_dir().display().to_string(), ""); + +// // Dedup results from prefix search and full text search. +// let new = dedup.insert(path.clone()); + +// if !new { +// continue; +// } + +// let title = retrieved_doc +// .get_first(title_field) +// .unwrap() +// .as_text() +// .unwrap() +// .to_string(); +// let body = retrieved_doc +// .get_first(body_field) +// .unwrap() +// .as_text() +// .unwrap() +// .to_string(); + +// let snippet = if snippet.is_empty() { +// body.split(' ').take(20).collect::>().join(" ") + " ..." +// } else { +// "... ".to_string() + &snippet.to_html() + " ..." 
+// }; + +// results.push(SearchResult { +// title, +// body, +// path, +// snippet, +// }); +// } + +// Ok(results) +// } +// } + #[cfg(test)] mod test { use super::*; diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index e893e64c5..ac1f1a486 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -18,7 +18,9 @@ use walkdir::WalkDir; use crate::{ filter_builder, get_or_initialize_pool, model::ModelRuntime, - models, order_by_builder, + models, + multi_field_pipeline::MultiFieldPipeline, + order_by_builder, pipeline::Pipeline, queries, query_builder, query_builder::QueryBuilder, @@ -104,7 +106,6 @@ pub struct Collection { pub database_url: Option, pub pipelines_table_name: String, pub documents_table_name: String, - pub transforms_table_name: String, pub chunks_table_name: String, pub documents_tsvectors_table_name: String, pub(crate) database_data: Option, @@ -147,7 +148,6 @@ impl Collection { let ( pipelines_table_name, documents_table_name, - transforms_table_name, chunks_table_name, documents_tsvectors_table_name, ) = Self::generate_table_names(name); @@ -156,7 +156,6 @@ impl Collection { database_url, pipelines_table_name, documents_table_name, - transforms_table_name, chunks_table_name, documents_tsvectors_table_name, database_data: None, @@ -233,16 +232,14 @@ impl Collection { }, }; + // Splitters table is not unique to a collection or pipeline. It exists in the pgml schema Splitter::create_splitters_table(&mut transaction).await?; - Pipeline::create_pipelines_table( + self.create_documents_table(&mut transaction).await?; + MultiFieldPipeline::create_multi_field_pipelines_table( &collection_database_data.project_info, &mut transaction, ) .await?; - self.create_documents_table(&mut transaction).await?; - self.create_chunks_table(&mut transaction).await?; - self.create_documents_tsvectors_table(&mut transaction) - .await?; transaction.commit().await?; Some(collection_database_data) @@ -272,9 +269,15 @@ impl Collection { /// } /// ``` #[instrument(skip(self))] - pub async fn add_pipeline(&mut self, pipeline: &mut Pipeline) -> anyhow::Result<()> { + pub async fn add_pipeline(&mut self, pipeline: &mut MultiFieldPipeline) -> anyhow::Result<()> { self.verify_in_database(false).await?; - pipeline.set_project_info(self.database_data.as_ref().unwrap().project_info.clone()); + let project_info = &self + .database_data + .as_ref() + .context("Database data must be set to add a pipeline to a collection")? 
+ .project_info; + pipeline.set_project_info(project_info.clone()); + pipeline.verify_in_database(true).await?; let mp = MultiProgress::new(); mp.println(format!("Added Pipeline {}, Now Syncing...", pipeline.name))?; pipeline.execute(&None, mp).await?; @@ -301,65 +304,35 @@ impl Collection { /// } /// ``` #[instrument(skip(self))] - pub async fn remove_pipeline(&mut self, pipeline: &mut Pipeline) -> anyhow::Result<()> { + pub async fn remove_pipeline( + &mut self, + pipeline: &mut MultiFieldPipeline, + ) -> anyhow::Result<()> { let pool = get_or_initialize_pool(&self.database_url).await?; self.verify_in_database(false).await?; - pipeline.set_project_info(self.database_data.as_ref().unwrap().project_info.clone()); - pipeline.verify_in_database(false).await?; - - let database_data = pipeline + let project_info = &self .database_data .as_ref() - .context("Pipeline must be verified to remove it")?; - - let embeddings_table_name = format!("{}.{}_embeddings", self.name, pipeline.name); + .context("Database data must be set to remove pipeline from collection")? + .project_info; + pipeline.set_project_info(project_info.clone()); + pipeline.verify_in_database(false).await?; - let parameters = pipeline - .parameters - .as_ref() - .context("Pipeline must be verified to remove it")?; + let pipeline_schema = format!("{}_{}", project_info.name, pipeline.name); let mut transaction = pool.begin().await?; - - // Need to delete from chunks table only if no other pipelines use the same splitter - sqlx::query(&query_builder!( - "DELETE FROM %s WHERE splitter_id = $1 AND NOT EXISTS (SELECT 1 FROM %s WHERE splitter_id = $1 AND id != $2)", - self.chunks_table_name, - self.pipelines_table_name - )) - .bind(database_data.splitter_id) - .bind(database_data.id) - .execute(&mut *transaction) + transaction + .execute(query_builder!("DROP SCHEMA IF EXISTS %s CASCADE", pipeline_schema).as_str()) .await?; - - // Drop the embeddings table - sqlx::query(&query_builder!( - "DROP TABLE IF EXISTS %s", - embeddings_table_name - )) - .execute(&mut *transaction) - .await?; - - // Need to delete from the tsvectors table only if no other pipelines use the - // same tsvector configuration - sqlx::query(&query_builder!( - "DELETE FROM %s WHERE configuration = $1 AND NOT EXISTS (SELECT 1 FROM %s WHERE parameters->'full_text_search'->>'configuration' = $1 AND id != $2)", - self.documents_tsvectors_table_name, - self.pipelines_table_name)) - .bind(parameters["full_text_search"]["configuration"].as_str()) - .bind(database_data.id) - .execute(&mut *transaction) - .await?; - sqlx::query(&query_builder!( - "DELETE FROM %s WHERE id = $1", + "UPDATE %s SET active = FALSE WHERE name = $1", self.pipelines_table_name )) - .bind(database_data.id) + .bind(&pipeline.name) .execute(&mut *transaction) .await?; - transaction.commit().await?; + Ok(()) } @@ -429,110 +402,13 @@ impl Collection { query_builder!(queries::CREATE_DOCUMENTS_TABLE, self.documents_table_name).as_str(), ) .await?; - conn.execute( - query_builder!( - queries::CREATE_INDEX, - "", - "created_at_index", - self.documents_table_name, - "created_at" - ) - .as_str(), - ) - .await?; conn.execute( query_builder!( queries::CREATE_INDEX_USING_GIN, "", - "metadata_index", + "documents_document_index", self.documents_table_name, - "metadata jsonb_path_ops" - ) - .as_str(), - ) - .await?; - Ok(()) - } - - #[instrument(skip(self, conn))] - async fn create_chunks_table(&mut self, conn: &mut PgConnection) -> anyhow::Result<()> { - conn.execute( - query_builder!( - queries::CREATE_CHUNKS_TABLE, - 
self.chunks_table_name, - self.documents_table_name - ) - .as_str(), - ) - .await?; - conn.execute( - query_builder!( - queries::CREATE_INDEX, - "", - "created_at_index", - self.chunks_table_name, - "created_at" - ) - .as_str(), - ) - .await?; - conn.execute( - query_builder!( - queries::CREATE_INDEX, - "", - "document_id_index", - self.chunks_table_name, - "document_id" - ) - .as_str(), - ) - .await?; - conn.execute( - query_builder!( - queries::CREATE_INDEX, - "", - "splitter_id_index", - self.chunks_table_name, - "splitter_id" - ) - .as_str(), - ) - .await?; - Ok(()) - } - - #[instrument(skip(self, conn))] - async fn create_documents_tsvectors_table( - &mut self, - conn: &mut PgConnection, - ) -> anyhow::Result<()> { - conn.execute( - query_builder!( - queries::CREATE_DOCUMENTS_TSVECTORS_TABLE, - self.documents_tsvectors_table_name, - self.documents_table_name - ) - .as_str(), - ) - .await?; - conn.execute( - query_builder!( - queries::CREATE_INDEX, - "", - "configuration_index", - self.documents_tsvectors_table_name, - "configuration" - ) - .as_str(), - ) - .await?; - conn.execute( - query_builder!( - queries::CREATE_INDEX_USING_GIN, - "", - "tsvector_index", - self.documents_tsvectors_table_name, - "ts" + "document jsonb_path_ops" ) .as_str(), ) @@ -562,6 +438,7 @@ impl Collection { /// Ok(()) /// } /// ``` + // TODO: Make it so if we upload the same documen twice it doesn't do anything #[instrument(skip(self, documents))] pub async fn upsert_documents( &mut self, @@ -571,111 +448,31 @@ impl Collection { let pool = get_or_initialize_pool(&self.database_url).await?; self.verify_in_database(false).await?; + // TODO: Work on this let args = args.unwrap_or_default(); + let mut document_ids = vec![]; + let progress_bar = utils::default_progress_bar(documents.len() as u64); progress_bar.println("Upserting Documents..."); - let documents: anyhow::Result> = documents - .into_iter() - .map(|mut document| { - let document = document - .as_object_mut() - .context("Documents must be a vector of objects")?; - - // We don't want the text included in the document metadata, but everything else - // should be in there - let text = document.remove("text").map(|t| { - t.as_str() - .expect("`text` must be a string in document") - .to_string() - }); - let metadata = serde_json::to_value(&document)?.into(); - - let id = document - .get("id") - .context("`id` must be a key in document")? - .to_string(); - let md5_digest = md5::compute(id.as_bytes()); - let source_uuid = uuid::Uuid::from_slice(&md5_digest.0)?; - - Ok((source_uuid, text, metadata)) - }) - .collect(); - - // We could continue chaining the above iterators but types become super annoying to - // deal with, especially because we are dealing with async functions. This is much easier to read - // Also, we may want to use a variant of chunks that is owned, I'm not 100% sure of what - // cloning happens when passing values into sqlx bind. 
itertools variants will not work as - // it is not thread safe and pyo3 will get upset - let mut document_ids = Vec::new(); - for chunk in documents?.chunks(10) { - // Need to make it a vec to partition it and must include explicit typing here - let mut chunk: Vec<&(uuid::Uuid, Option, Json)> = chunk.iter().collect(); - - // Split the chunk into two groups, one with text, and one with just metadata - let split_index = itertools::partition(&mut chunk, |(_, text, _)| text.is_some()); - let (text_chunk, metadata_chunk) = chunk.split_at(split_index); - - // Start the transaction - let mut transaction = pool.begin().await?; - - if !metadata_chunk.is_empty() { - // Update the metadata - // Merge the metadata if the user has specified to do so otherwise replace it - if args["metadata"]["merge"].as_bool().unwrap_or(false) { - sqlx::query(query_builder!( - "UPDATE %s d SET metadata = d.metadata || v.metadata FROM (SELECT UNNEST($1) source_uuid, UNNEST($2) metadata) v WHERE d.source_uuid = v.source_uuid", - self.documents_table_name - ).as_str()).bind(metadata_chunk.iter().map(|(source_uuid, _, _)| *source_uuid).collect::>()) - .bind(metadata_chunk.iter().map(|(_, _, metadata)| metadata.0.clone()).collect::>()) - .execute(&mut *transaction).await?; - } else { - sqlx::query(query_builder!( - "UPDATE %s d SET metadata = v.metadata FROM (SELECT UNNEST($1) source_uuid, UNNEST($2) metadata) v WHERE d.source_uuid = v.source_uuid", - self.documents_table_name - ).as_str()).bind(metadata_chunk.iter().map(|(source_uuid, _, _)| *source_uuid).collect::>()) - .bind(metadata_chunk.iter().map(|(_, _, metadata)| metadata.0.clone()).collect::>()) - .execute(&mut *transaction).await?; - } - } - - if !text_chunk.is_empty() { - // First delete any documents that already have the same UUID as documents in - // text_chunk, then insert the new ones. - // We are essentially upserting in two steps - sqlx::query(&query_builder!( - "DELETE FROM %s WHERE source_uuid IN (SELECT source_uuid FROM %s WHERE source_uuid = ANY($1::uuid[]))", - self.documents_table_name, - self.documents_table_name - )). - bind(&text_chunk.iter().map(|(source_uuid, _, _)| *source_uuid).collect::>()). - execute(&mut *transaction).await?; - let query_string_values = (0..text_chunk.len()) - .map(|i| format!("(${}, ${}, ${})", i * 3 + 1, i * 3 + 2, i * 3 + 3)) - .collect::>() - .join(","); - let query_string = format!( - "INSERT INTO %s (source_uuid, text, metadata) VALUES {} ON CONFLICT (source_uuid) DO UPDATE SET text = $2, metadata = $3 RETURNING id", - query_string_values - ); - let query = query_builder!(query_string, self.documents_table_name); - let mut query = sqlx::query_scalar(&query); - for (source_uuid, text, metadata) in text_chunk.iter() { - query = query.bind(source_uuid).bind(text).bind(metadata); - } - let ids: Vec = query.fetch_all(&mut *transaction).await?; - document_ids.extend(ids); - progress_bar.inc(chunk.len() as u64); - } - - transaction.commit().await?; + let mut transaction = pool.begin().await?; + for document in documents { + let id = document + .get("id") + .context("`id` must be a key in document")? 
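+                // The required `id` is hashed (md5) into a stable source_uuid below;
+                // the INSERT ... ON CONFLICT (source_uuid) DO UPDATE that follows turns
+                // each per-document insert into an upsert, all within the single
+                // transaction opened above.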
+ .to_string(); + let md5_digest = md5::compute(id.as_bytes()); + let source_uuid = uuid::Uuid::from_slice(&md5_digest.0)?; + + let id: i64 = sqlx::query_scalar(&query_builder!("INSERT INTO %s (source_uuid, document) VALUES ($1, $2) ON CONFLICT (source_uuid) DO UPDATE SET document = $2 RETURNING id", self.documents_table_name)).bind(source_uuid).bind(document).fetch_one(&mut *transaction).await?; + document_ids.push(id); } - progress_bar.finish(); - eprintln!("Done Upserting Documents\n"); + transaction.commit().await?; - self.sync_pipelines(Some(document_ids)).await?; - Ok(()) + progress_bar.println("Done Upserting Documents\n"); + progress_bar.finish(); + self.sync_pipelines(Some(document_ids)).await } /// Gets the documents on a [Collection] @@ -696,104 +493,107 @@ impl Collection { /// } #[instrument(skip(self))] pub async fn get_documents(&self, args: Option) -> anyhow::Result> { - let pool = get_or_initialize_pool(&self.database_url).await?; - - let mut args = args.unwrap_or_default().0; - let args = args.as_object_mut().context("args must be an object")?; - - // Get limit or set it to 1000 - let limit = args - .remove("limit") - .map(|l| l.try_to_u64()) - .unwrap_or(Ok(1000))?; - - let mut query = Query::select(); - query - .from_as( - self.documents_table_name.to_table_tuple(), - SIden::Str("documents"), - ) - .expr(Expr::cust("*")) // Adds the * in SELECT * FROM - .limit(limit); - - if let Some(order_by) = args.remove("order_by") { - let order_by_builder = - order_by_builder::OrderByBuilder::new(order_by, "documents", "metadata").build()?; - for (order_by, order) in order_by_builder { - query.order_by_expr_with_nulls(order_by, order, NullOrdering::Last); - } - } - query.order_by((SIden::Str("documents"), SIden::Str("id")), Order::Asc); - - // TODO: Make keyset based pagination work with custom order by - if let Some(last_row_id) = args.remove("last_row_id") { - let last_row_id = last_row_id - .try_to_u64() - .context("last_row_id must be an integer")?; - query.and_where(Expr::col((SIden::Str("documents"), SIden::Str("id"))).gt(last_row_id)); - } - - if let Some(offset) = args.remove("offset") { - let offset = offset.try_to_u64().context("offset must be an integer")?; - query.offset(offset); - } - - if let Some(mut filter) = args.remove("filter") { - let filter = filter - .as_object_mut() - .context("filter must be a Json object")?; - - if let Some(f) = filter.remove("metadata") { - query.cond_where( - filter_builder::FilterBuilder::new(f, "documents", "metadata").build(), - ); - } - if let Some(f) = filter.remove("full_text_search") { - let f = f - .as_object() - .context("Full text filter must be a Json object")?; - let configuration = f - .get("configuration") - .context("In full_text_search `configuration` is required")? - .as_str() - .context("In full_text_search `configuration` must be a string")?; - let filter_text = f - .get("text") - .context("In full_text_search `text` is required")? 
- .as_str() - .context("In full_text_search `text` must be a string")?; - query - .join_as( - JoinType::InnerJoin, - self.documents_tsvectors_table_name.to_table_tuple(), - Alias::new("documents_tsvectors"), - Expr::col((SIden::Str("documents"), SIden::Str("id"))) - .equals((SIden::Str("documents_tsvectors"), SIden::Str("document_id"))), - ) - .and_where( - Expr::col(( - SIden::Str("documents_tsvectors"), - SIden::Str("configuration"), - )) - .eq(configuration), - ) - .and_where(Expr::cust_with_values( - format!( - "documents_tsvectors.ts @@ plainto_tsquery('{}', $1)", - configuration - ), - [filter_text], - )); - } - } - - let (sql, values) = query.build_sqlx(PostgresQueryBuilder); - let documents: Vec = - sqlx::query_as_with(&sql, values).fetch_all(&pool).await?; - Ok(documents - .into_iter() - .map(|d| d.into_user_friendly_json()) - .collect()) + // TODO: If we want to filter on full text this needs to be part of a pipeline + unimplemented!() + + // let pool = get_or_initialize_pool(&self.database_url).await?; + + // let mut args = args.unwrap_or_default().0; + // let args = args.as_object_mut().context("args must be an object")?; + + // // Get limit or set it to 1000 + // let limit = args + // .remove("limit") + // .map(|l| l.try_to_u64()) + // .unwrap_or(Ok(1000))?; + + // let mut query = Query::select(); + // query + // .from_as( + // self.documents_table_name.to_table_tuple(), + // SIden::Str("documents"), + // ) + // .expr(Expr::cust("*")) // Adds the * in SELECT * FROM + // .limit(limit); + + // if let Some(order_by) = args.remove("order_by") { + // let order_by_builder = + // order_by_builder::OrderByBuilder::new(order_by, "documents", "metadata").build()?; + // for (order_by, order) in order_by_builder { + // query.order_by_expr_with_nulls(order_by, order, NullOrdering::Last); + // } + // } + // query.order_by((SIden::Str("documents"), SIden::Str("id")), Order::Asc); + + // // TODO: Make keyset based pagination work with custom order by + // if let Some(last_row_id) = args.remove("last_row_id") { + // let last_row_id = last_row_id + // .try_to_u64() + // .context("last_row_id must be an integer")?; + // query.and_where(Expr::col((SIden::Str("documents"), SIden::Str("id"))).gt(last_row_id)); + // } + + // if let Some(offset) = args.remove("offset") { + // let offset = offset.try_to_u64().context("offset must be an integer")?; + // query.offset(offset); + // } + + // if let Some(mut filter) = args.remove("filter") { + // let filter = filter + // .as_object_mut() + // .context("filter must be a Json object")?; + + // if let Some(f) = filter.remove("metadata") { + // query.cond_where( + // filter_builder::FilterBuilder::new(f, "documents", "metadata").build(), + // ); + // } + // if let Some(f) = filter.remove("full_text_search") { + // let f = f + // .as_object() + // .context("Full text filter must be a Json object")?; + // let configuration = f + // .get("configuration") + // .context("In full_text_search `configuration` is required")? + // .as_str() + // .context("In full_text_search `configuration` must be a string")?; + // let filter_text = f + // .get("text") + // .context("In full_text_search `text` is required")? 
+ // .as_str() + // .context("In full_text_search `text` must be a string")?; + // query + // .join_as( + // JoinType::InnerJoin, + // self.documents_tsvectors_table_name.to_table_tuple(), + // Alias::new("documents_tsvectors"), + // Expr::col((SIden::Str("documents"), SIden::Str("id"))) + // .equals((SIden::Str("documents_tsvectors"), SIden::Str("document_id"))), + // ) + // .and_where( + // Expr::col(( + // SIden::Str("documents_tsvectors"), + // SIden::Str("configuration"), + // )) + // .eq(configuration), + // ) + // .and_where(Expr::cust_with_values( + // format!( + // "documents_tsvectors.ts @@ plainto_tsquery('{}', $1)", + // configuration + // ), + // [filter_text], + // )); + // } + // } + + // let (sql, values) = query.build_sqlx(PostgresQueryBuilder); + // let documents: Vec = + // sqlx::query_as_with(&sql, values).fetch_all(&pool).await?; + // Ok(documents + // .into_iter() + // .map(|d| d.into_user_friendly_json()) + // .collect()) } /// Deletes documents in a [Collection] @@ -820,64 +620,67 @@ impl Collection { /// } #[instrument(skip(self))] pub async fn delete_documents(&self, mut filter: Json) -> anyhow::Result<()> { - let pool = get_or_initialize_pool(&self.database_url).await?; - - let mut query = Query::delete(); - query.from_table(self.documents_table_name.to_table_tuple()); - - let filter = filter - .as_object_mut() - .context("filter must be a Json object")?; - - if let Some(f) = filter.remove("metadata") { - query - .cond_where(filter_builder::FilterBuilder::new(f, "documents", "metadata").build()); - } - - if let Some(mut f) = filter.remove("full_text_search") { - let f = f - .as_object_mut() - .context("Full text filter must be a Json object")?; - let configuration = f - .get("configuration") - .context("In full_text_search `configuration` is required")? - .as_str() - .context("In full_text_search `configuration` must be a string")?; - let filter_text = f - .get("text") - .context("In full_text_search `text` is required")? 
- .as_str() - .context("In full_text_search `text` must be a string")?; - let mut inner_select_query = Query::select(); - inner_select_query - .from_as( - self.documents_tsvectors_table_name.to_table_tuple(), - SIden::Str("documents_tsvectors"), - ) - .column(SIden::Str("document_id")) - .and_where(Expr::cust_with_values( - format!( - "documents_tsvectors.ts @@ plainto_tsquery('{}', $1)", - configuration - ), - [filter_text], - )) - .and_where( - Expr::col(( - SIden::Str("documents_tsvectors"), - SIden::Str("configuration"), - )) - .eq(configuration), - ); - query.and_where( - Expr::col((SIden::Str("documents"), SIden::Str("id"))) - .in_subquery(inner_select_query), - ); - } - - let (sql, values) = query.build_sqlx(PostgresQueryBuilder); - sqlx::query_with(&sql, values).fetch_all(&pool).await?; - Ok(()) + // TODO: If we want to filter on full text this needs to be part of a pipeline + unimplemented!() + + // let pool = get_or_initialize_pool(&self.database_url).await?; + + // let mut query = Query::delete(); + // query.from_table(self.documents_table_name.to_table_tuple()); + + // let filter = filter + // .as_object_mut() + // .context("filter must be a Json object")?; + + // if let Some(f) = filter.remove("metadata") { + // query + // .cond_where(filter_builder::FilterBuilder::new(f, "documents", "metadata").build()); + // } + + // if let Some(mut f) = filter.remove("full_text_search") { + // let f = f + // .as_object_mut() + // .context("Full text filter must be a Json object")?; + // let configuration = f + // .get("configuration") + // .context("In full_text_search `configuration` is required")? + // .as_str() + // .context("In full_text_search `configuration` must be a string")?; + // let filter_text = f + // .get("text") + // .context("In full_text_search `text` is required")? 
+ // .as_str() + // .context("In full_text_search `text` must be a string")?; + // let mut inner_select_query = Query::select(); + // inner_select_query + // .from_as( + // self.documents_tsvectors_table_name.to_table_tuple(), + // SIden::Str("documents_tsvectors"), + // ) + // .column(SIden::Str("document_id")) + // .and_where(Expr::cust_with_values( + // format!( + // "documents_tsvectors.ts @@ plainto_tsquery('{}', $1)", + // configuration + // ), + // [filter_text], + // )) + // .and_where( + // Expr::col(( + // SIden::Str("documents_tsvectors"), + // SIden::Str("configuration"), + // )) + // .eq(configuration), + // ); + // query.and_where( + // Expr::col((SIden::Str("documents"), SIden::Str("id"))) + // .in_subquery(inner_select_query), + // ); + // } + + // let (sql, values) = query.build_sqlx(PostgresQueryBuilder); + // sqlx::query_with(&sql, values).fetch_all(&pool).await?; + // Ok(()) } #[instrument(skip(self))] @@ -901,11 +704,25 @@ impl Collection { .expect("Failed to execute pipeline"); }) .await; - eprintln!("Done Syncing Pipelines\n"); + mp.println("Done Syncing Pipelines\n")?; } Ok(()) } + #[instrument(skip(self))] + pub async fn search( + &self, + query: Json, + pipeline: &MultiFieldPipeline, + ) -> anyhow::Result> { + let pool = get_or_initialize_pool(&self.database_url).await?; + let (query, values) = + crate::search_query_builder::build_search_query(self, query, pipeline).await?; + println!("\n\n{query}\n\n"); + let results: Vec<(Json,)> = sqlx::query_as_with(&query, values).fetch_all(&pool).await?; + Ok(results.into_iter().map(|r| r.0).collect()) + } + /// Performs vector search on the [Collection] /// /// # Arguments @@ -932,7 +749,7 @@ impl Collection { pub async fn vector_search( &mut self, query: &str, - pipeline: &mut Pipeline, + pipeline: &mut MultiFieldPipeline, query_parameters: Option, top_k: Option, ) -> anyhow::Result> { @@ -942,66 +759,80 @@ impl Collection { let top_k = top_k.unwrap_or(5); // With this system, we only do the wrong type of vector search once - let runtime = if pipeline.model.is_some() { - pipeline.model.as_ref().unwrap().runtime - } else { - ModelRuntime::Python - }; - match runtime { - ModelRuntime::Python => { - let embeddings_table_name = format!("{}.{}_embeddings", self.name, pipeline.name); - - let result = sqlx::query_as(&query_builder!( - queries::EMBED_AND_VECTOR_SEARCH, - self.pipelines_table_name, - embeddings_table_name, - self.chunks_table_name, - self.documents_table_name - )) - .bind(&pipeline.name) - .bind(query) - .bind(&query_parameters) - .bind(top_k) - .fetch_all(&pool) - .await; - - match result { - Ok(r) => Ok(r), - Err(e) => match e.as_database_error() { - Some(d) => { - if d.code() == Some(Cow::from("XX000")) { - self.vector_search_with_remote_embeddings( - query, - pipeline, - query_parameters, - top_k, - &pool, - ) - .await - } else { - Err(anyhow::anyhow!(e)) - } - } - None => Err(anyhow::anyhow!(e)), - }, - } - } - _ => { - self.vector_search_with_remote_embeddings( - query, - pipeline, - query_parameters, - top_k, - &pool, - ) - .await - } - } - .map(|r| { - r.into_iter() - .map(|(score, id, metadata)| (1. 
- score, id, metadata)) - .collect() - }) + // let runtime = if pipeline.model.is_some() { + // pipeline.model.as_ref().unwrap().runtime + // } else { + // ModelRuntime::Python + // }; + + unimplemented!() + + // let pool = get_or_initialize_pool(&self.database_url).await?; + + // let query_parameters = query_parameters.unwrap_or_default(); + // let top_k = top_k.unwrap_or(5); + + // // With this system, we only do the wrong type of vector search once + // let runtime = if pipeline.model.is_some() { + // pipeline.model.as_ref().unwrap().runtime + // } else { + // ModelRuntime::Python + // }; + // match runtime { + // ModelRuntime::Python => { + // let embeddings_table_name = format!("{}.{}_embeddings", self.name, pipeline.name); + + // let result = sqlx::query_as(&query_builder!( + // queries::EMBED_AND_VECTOR_SEARCH, + // self.pipelines_table_name, + // embeddings_table_name, + // self.chunks_table_name, + // self.documents_table_name + // )) + // .bind(&pipeline.name) + // .bind(query) + // .bind(&query_parameters) + // .bind(top_k) + // .fetch_all(&pool) + // .await; + + // match result { + // Ok(r) => Ok(r), + // Err(e) => match e.as_database_error() { + // Some(d) => { + // if d.code() == Some(Cow::from("XX000")) { + // self.vector_search_with_remote_embeddings( + // query, + // pipeline, + // query_parameters, + // top_k, + // &pool, + // ) + // .await + // } else { + // Err(anyhow::anyhow!(e)) + // } + // } + // None => Err(anyhow::anyhow!(e)), + // }, + // } + // } + // _ => { + // self.vector_search_with_remote_embeddings( + // query, + // pipeline, + // query_parameters, + // top_k, + // &pool, + // ) + // .await + // } + // } + // .map(|r| { + // r.into_iter() + // .map(|(score, id, metadata)| (1. - score, id, metadata)) + // .collect() + // }) } #[instrument(skip(self, pool))] @@ -1014,45 +845,48 @@ impl Collection { top_k: i64, pool: &PgPool, ) -> anyhow::Result> { - self.verify_in_database(false).await?; - - // Have to set the project info before we can get and set the model - pipeline.set_project_info( - self.database_data - .as_ref() - .context( - "Collection must be verified to perform vector search with remote embeddings", - )? - .project_info - .clone(), - ); - // Verify to get and set the model if we don't have it set on the pipeline yet - pipeline.verify_in_database(false).await?; - let model = pipeline - .model - .as_ref() - .context("Pipeline must be verified to perform vector search with remote embeddings")?; - - // We need to make sure we are not mutably and immutably borrowing the same things - let embedding = { - let remote_embeddings = - build_remote_embeddings(model.runtime, &model.name, &query_parameters)?; - let mut embeddings = remote_embeddings.embed(vec![query.to_string()]).await?; - std::mem::take(&mut embeddings[0]) - }; - - let embeddings_table_name = format!("{}.{}_embeddings", self.name, pipeline.name); - sqlx::query_as(&query_builder!( - queries::VECTOR_SEARCH, - embeddings_table_name, - self.chunks_table_name, - self.documents_table_name - )) - .bind(embedding) - .bind(top_k) - .fetch_all(pool) - .await - .map_err(|e| anyhow::anyhow!(e)) + // TODO: Make this actually work maybe an alias for the new search or something idk + unimplemented!() + + // self.verify_in_database(false).await?; + + // // Have to set the project info before we can get and set the model + // pipeline.set_project_info( + // self.database_data + // .as_ref() + // .context( + // "Collection must be verified to perform vector search with remote embeddings", + // )? 
+ // .project_info + // .clone(), + // ); + // // Verify to get and set the model if we don't have it set on the pipeline yet + // pipeline.verify_in_database(false).await?; + // let model = pipeline + // .model + // .as_ref() + // .context("Pipeline must be verified to perform vector search with remote embeddings")?; + + // // We need to make sure we are not mutably and immutably borrowing the same things + // let embedding = { + // let remote_embeddings = + // build_remote_embeddings(model.runtime, &model.name, &query_parameters)?; + // let mut embeddings = remote_embeddings.embed(vec![query.to_string()]).await?; + // std::mem::take(&mut embeddings[0]) + // }; + + // let embeddings_table_name = format!("{}.{}_embeddings", self.name, pipeline.name); + // sqlx::query_as(&query_builder!( + // queries::VECTOR_SEARCH, + // embeddings_table_name, + // self.chunks_table_name, + // self.documents_table_name + // )) + // .bind(embedding) + // .bind(top_k) + // .fetch_all(pool) + // .await + // .map_err(|e| anyhow::anyhow!(e)) } #[instrument(skip(self))] @@ -1099,53 +933,29 @@ impl Collection { /// } /// ``` #[instrument(skip(self))] - pub async fn get_pipelines(&mut self) -> anyhow::Result> { + pub async fn get_pipelines(&mut self) -> anyhow::Result> { self.verify_in_database(false).await?; + let project_info = &self + .database_data + .as_ref() + .context("Database data must be set to get collection pipelines")? + .project_info; let pool = get_or_initialize_pool(&self.database_url).await?; + let pipelines: Vec = sqlx::query_as(&query_builder!( + "SELECT * FROM %s WHERE active = TRUE", + self.pipelines_table_name + )) + .fetch_all(&pool) + .await?; - let pipelines_with_models_and_splitters: Vec = - sqlx::query_as(&query_builder!( - r#"SELECT - p.id as pipeline_id, - p.name as pipeline_name, - p.created_at as pipeline_created_at, - p.active as pipeline_active, - p.parameters as pipeline_parameters, - m.id as model_id, - m.created_at as model_created_at, - m.runtime::TEXT as model_runtime, - m.hyperparams as model_hyperparams, - s.id as splitter_id, - s.created_at as splitter_created_at, - s.name as splitter_name, - s.parameters as splitter_parameters - FROM - %s p - INNER JOIN pgml.models m ON p.model_id = m.id - INNER JOIN pgml.splitters s ON p.splitter_id = s.id - WHERE - p.active = TRUE - "#, - self.pipelines_table_name - )) - .fetch_all(&pool) - .await?; - - let pipelines: Vec = pipelines_with_models_and_splitters + pipelines .into_iter() .map(|p| { - let mut pipeline: Pipeline = p.into(); - pipeline.set_project_info( - self.database_data - .as_ref() - .expect("Collection must be verified to get all pipelines") - .project_info - .clone(), - ); - pipeline + let mut p: MultiFieldPipeline = p.try_into()?; + p.set_project_info(project_info.clone()); + Ok(p) }) - .collect(); - Ok(pipelines) + .collect() } /// Gets a [Pipeline] by name @@ -1162,42 +972,23 @@ impl Collection { /// } /// ``` #[instrument(skip(self))] - pub async fn get_pipeline(&mut self, name: &str) -> anyhow::Result { + pub async fn get_pipeline(&mut self, name: &str) -> anyhow::Result { self.verify_in_database(false).await?; + let project_info = &self + .database_data + .as_ref() + .context("Database data must be set to get collection pipelines")? 
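+            // The single active pipeline row matching `name` is fetched below and
+            // rebuilt into a MultiFieldPipeline from its stored JSON schema via
+            // try_into(), then re-attached to this collection's project info.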
+ .project_info; let pool = get_or_initialize_pool(&self.database_url).await?; - - let pipeline_with_model_and_splitter: models::PipelineWithModelAndSplitter = - sqlx::query_as(&query_builder!( - r#"SELECT - p.id as pipeline_id, - p.name as pipeline_name, - p.created_at as pipeline_created_at, - p.active as pipeline_active, - p.parameters as pipeline_parameters, - m.id as model_id, - m.created_at as model_created_at, - m.runtime::TEXT as model_runtime, - m.hyperparams as model_hyperparams, - s.id as splitter_id, - s.created_at as splitter_created_at, - s.name as splitter_name, - s.parameters as splitter_parameters - FROM - %s p - INNER JOIN pgml.models m ON p.model_id = m.id - INNER JOIN pgml.splitters s ON p.splitter_id = s.id - WHERE - p.active = TRUE - AND p.name = $1 - "#, - self.pipelines_table_name - )) - .bind(name) - .fetch_one(&pool) - .await?; - - let mut pipeline: Pipeline = pipeline_with_model_and_splitter.into(); - pipeline.set_project_info(self.database_data.as_ref().unwrap().project_info.clone()); + let pipeline: models::Pipeline = sqlx::query_as(&query_builder!( + "SELECT * FROM %s WHERE name = $1 AND active = TRUE LIMIT 1", + self.pipelines_table_name + )) + .bind(name) + .fetch_one(&pool) + .await?; + let mut pipeline: MultiFieldPipeline = pipeline.try_into()?; + pipeline.set_project_info(project_info.clone()); Ok(pipeline) } @@ -1312,6 +1103,125 @@ impl Collection { Ok(()) } + pub async fn generate_er_diagram( + &mut self, + pipeline: &mut MultiFieldPipeline, + ) -> anyhow::Result { + self.verify_in_database(false).await?; + pipeline.verify_in_database(false).await?; + + let parsed_schema = pipeline + .parsed_schema + .as_ref() + .context("Pipeline must have schema to generate er diagram")?; + + let mut uml_entites = format!( + r#" +@startuml +' hide the spot +' hide circle + +' avoid problems with angled crows feet +skinparam linetype ortho + +entity "pgml.collections" as pgmlc {{ + id : bigint + -- + created_at : timestamp without time zone + name : text + active : boolean + project_id : bigint + sdk_version : text +}} + +entity "{}.documents" as documents {{ + id : bigint + -- + created_at : timestamp without time zone + source_uuid : uuid + document : jsonb +}} + +entity "{}.pipelines" as pipelines {{ + id : bigint + -- + created_at : timestamp without time zone + name : text + active : boolean + schema : jsonb +}} + "#, + self.name, self.name + ); + + let schema = format!("{}_{}", self.name, pipeline.name); + + let mut uml_relations = r#" +pgmlc ||..|| pipelines + "# + .to_string(); + + for (key, field_action) in parsed_schema.iter() { + let nice_name_key = key.replace(' ', "_"); + if let Some(_embed_action) = &field_action.embed { + let entites = format!( + r#" +entity "{schema}.{key}_chunks" as {nice_name_key}_chunks {{ + id : bigint + -- + created_at : timestamp without time zone + documnt_id : bigint + chunk_index : bigint + chunk : text +}} + +entity "{schema}.{key}_embeddings" as {nice_name_key}_embeddings {{ + id : bigint + -- + created_at : timestamp without time zone + chunk_id : bigint + embedding : vector +}} + "# + ); + uml_entites.push_str(&entites); + + let relations = format!( + r#" +documents ||..|{{ {nice_name_key}_chunks +{nice_name_key}_chunks ||.|| {nice_name_key}_embeddings + "# + ); + uml_relations.push_str(&relations); + } + + if let Some(_full_text_search_action) = &field_action.full_text_search { + let entites = format!( + r#" +entity "{schema}.{key}_tsvectors" as {nice_name_key}_tsvectors {{ + id : bigint + -- + created_at : timestamp 
without time zone + documnt_id : bigint + tsvectors : tsvector +}} + "# + ); + uml_entites.push_str(&entites); + + let relations = format!( + r#" +documents ||..|| {nice_name_key}_tsvectors + "# + ); + uml_relations.push_str(&relations); + } + } + + uml_entites.push_str(¨_relations); + Ok(uml_entites) + } + pub async fn upsert_file(&mut self, path: &str) -> anyhow::Result<()> { self.verify_in_database(false).await?; let path = Path::new(path); @@ -1323,11 +1233,10 @@ impl Collection { self.upsert_documents(vec![document.into()], None).await } - fn generate_table_names(name: &str) -> (String, String, String, String, String) { + fn generate_table_names(name: &str) -> (String, String, String, String) { [ ".pipelines", ".documents", - ".transforms", ".chunks", ".documents_tsvectors", ] diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index cef33c024..c0d4cb8e4 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -21,6 +21,7 @@ mod languages; pub mod migrations; mod model; pub mod models; +mod multi_field_pipeline; mod open_source_ai; mod order_by_builder; mod pipeline; @@ -28,6 +29,7 @@ mod queries; mod query_builder; mod query_runner; mod remote_embeddings; +mod search_query_builder; mod splitter; pub mod transformer_pipeline; pub mod types; @@ -37,6 +39,7 @@ mod utils; pub use builtins::Builtins; pub use collection::Collection; pub use model::Model; +pub use multi_field_pipeline::MultiFieldPipeline; pub use open_source_ai::OpenSourceAI; pub use pipeline::Pipeline; pub use splitter::Splitter; @@ -224,7 +227,8 @@ fn main(mut cx: neon::context::ModuleContext) -> neon::result::NeonResult<()> { #[cfg(test)] mod tests { use super::*; - use crate::{model::Model, pipeline::Pipeline, splitter::Splitter, types::Json}; + use crate::types::Json; + use itertools::assert_equal; use serde_json::json; fn generate_dummy_documents(count: usize) -> Vec { @@ -233,7 +237,9 @@ mod tests { let document = serde_json::json!( { "id": i, - "text": format!("This is a test document: {}", i), + "title": format!("Test document: {}", i), + "body": format!("Here is the body for test document {}", i), + "notes": format!("Here are some notes or something for test document {}", i), "metadata": { "uuid": i * 10, "name": format!("Test Document {}", i) @@ -262,23 +268,8 @@ mod tests { #[sqlx::test] async fn can_add_remove_pipeline() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let model = Model::default(); - let splitter = Splitter::default(); - let mut pipeline = Pipeline::new( - "test_p_cap_57", - Some(model), - Some(splitter), - Some( - serde_json::json!({ - "full_text_search": { - "active": true, - "configuration": "english" - } - }) - .into(), - ), - ); - let mut collection = Collection::new("test_r_c_carp_3", None); + let mut pipeline = MultiFieldPipeline::new("test_p_cap_57", Some(json!({}).into()))?; + let mut collection = Collection::new("test_r_c_carp_1", None); assert!(collection.database_data.is_none()); collection.add_pipeline(&mut pipeline).await?; assert!(collection.database_data.is_some()); @@ -289,1043 +280,1420 @@ mod tests { Ok(()) } - // #[sqlx::test] - // async fn can_add_remove_pipelines() -> anyhow::Result<()> { - // internal_init_logger(None, None).ok(); - // let model = Model::default(); - // let splitter = Splitter::default(); - // let mut pipeline1 = Pipeline::new( - // "test_r_p_carps_0", - // Some(model.clone()), - // Some(splitter.clone()), - // None, - // ); - // let mut pipeline2 = Pipeline::new("test_r_p_carps_1", Some(model), 
Some(splitter), None); - // let mut collection = Collection::new("test_r_c_carps_1", None); - // collection.add_pipeline(&mut pipeline1).await?; - // collection.add_pipeline(&mut pipeline2).await?; - // let pipelines = collection.get_pipelines().await?; - // assert!(pipelines.len() == 2); - // collection.remove_pipeline(&mut pipeline1).await?; - // let pipelines = collection.get_pipelines().await?; - // assert!(pipelines.len() == 1); - // assert!(collection.get_pipeline("test_r_p_carps_0").await.is_err()); - // collection.archive().await?; - // Ok(()) - // } - - #[sqlx::test] - async fn can_specify_custom_hnsw_parameters_for_pipelines() -> anyhow::Result<()> { - internal_init_logger(None, None).ok(); - let model = Model::default(); - let splitter = Splitter::default(); - let mut pipeline = Pipeline::new( - "test_r_p_cschpfp_0", - Some(model), - Some(splitter), - Some( - serde_json::json!({ - "hnsw": { - "m": 100, - "ef_construction": 200 - } - }) - .into(), - ), - ); - let collection_name = "test_r_c_cschpfp_1"; - let mut collection = Collection::new(collection_name, None); - collection.add_pipeline(&mut pipeline).await?; - let full_embeddings_table_name = pipeline.create_or_get_embeddings_table().await?; - let embeddings_table_name = full_embeddings_table_name.split('.').collect::>()[1]; - let pool = get_or_initialize_pool(&None).await?; - let results: Vec<(String, String)> = sqlx::query_as(&query_builder!( - "select indexname, indexdef from pg_indexes where tablename = '%d' and schemaname = '%d'", - embeddings_table_name, - collection_name - )).fetch_all(&pool).await?; - let names = results.iter().map(|(name, _)| name).collect::>(); - let definitions = results - .iter() - .map(|(_, definition)| definition) - .collect::>(); - assert!(names.contains(&&format!("{}_pipeline_hnsw_vector_index", pipeline.name))); - assert!(definitions.contains(&&format!("CREATE INDEX {}_pipeline_hnsw_vector_index ON {} USING hnsw (embedding vector_cosine_ops) WITH (m='100', ef_construction='200')", pipeline.name, full_embeddings_table_name))); - Ok(()) - } - - #[sqlx::test] - async fn disable_enable_pipeline() -> anyhow::Result<()> { - let model = Model::default(); - let splitter = Splitter::default(); - let mut pipeline = Pipeline::new("test_p_dep_0", Some(model), Some(splitter), None); - let mut collection = Collection::new("test_r_c_dep_1", None); - collection.add_pipeline(&mut pipeline).await?; - let queried_pipeline = &collection.get_pipelines().await?[0]; - assert_eq!(pipeline.name, queried_pipeline.name); - collection.disable_pipeline(&pipeline).await?; - let queried_pipelines = &collection.get_pipelines().await?; - assert!(queried_pipelines.is_empty()); - collection.enable_pipeline(&pipeline).await?; - let queried_pipeline = &collection.get_pipelines().await?[0]; - assert_eq!(pipeline.name, queried_pipeline.name); - collection.archive().await?; - Ok(()) - } - #[sqlx::test] - async fn sync_multiple_pipelines() -> anyhow::Result<()> { + async fn can_add_remove_pipelines() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let model = Model::default(); - let splitter = Splitter::default(); - let mut pipeline1 = Pipeline::new( - "test_r_p_smp_0", - Some(model.clone()), - Some(splitter.clone()), - Some( - serde_json::json!({ - "full_text_search": { - "active": true, - "configuration": "english" - } - }) - .into(), - ), - ); - let mut pipeline2 = Pipeline::new( - "test_r_p_smp_1", - Some(model), - Some(splitter), - Some( - serde_json::json!({ - "full_text_search": { - "active": true, - 
"configuration": "english" - } - }) - .into(), - ), - ); - let mut collection = Collection::new("test_r_c_smp_3", None); + let mut pipeline1 = MultiFieldPipeline::new("test_r_p_carps_1", Some(json!({}).into()))?; + let mut pipeline2 = MultiFieldPipeline::new("test_r_p_carps_2", Some(json!({}).into()))?; + let mut collection = Collection::new("test_r_c_carps_7", None); collection.add_pipeline(&mut pipeline1).await?; collection.add_pipeline(&mut pipeline2).await?; - collection - .upsert_documents(generate_dummy_documents(3), None) - .await?; - let status_1 = pipeline1.get_status().await?; - let status_2 = pipeline2.get_status().await?; - assert!( - status_1.chunks_status.synced == status_1.chunks_status.total - && status_1.chunks_status.not_synced == 0 - ); - assert!( - status_2.chunks_status.synced == status_2.chunks_status.total - && status_2.chunks_status.not_synced == 0 - ); + let pipelines = collection.get_pipelines().await?; + assert!(pipelines.len() == 2); + collection.remove_pipeline(&mut pipeline1).await?; + let pipelines = collection.get_pipelines().await?; + assert!(pipelines.len() == 1); + assert!(collection.get_pipeline("test_r_p_carps_1").await.is_err()); collection.archive().await?; Ok(()) } - /////////////////////////////// - // Various Searches /////////// - /////////////////////////////// - #[sqlx::test] - async fn can_vector_search_with_local_embeddings() -> anyhow::Result<()> { + async fn can_add_pipeline_and_upsert_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let model = Model::default(); - let splitter = Splitter::default(); - let mut pipeline = Pipeline::new( - "test_r_p_cvswle_1", - Some(model), - Some(splitter), + let collection_name = "test_r_c_capaud_33"; + let pipeline_name = "test_r_p_capaud_6"; + let mut pipeline = MultiFieldPipeline::new( + pipeline_name, Some( - serde_json::json!({ - "full_text_search": { - "active": true, - "configuration": "english" + json!({ + "title": { + "embed": { + "model": "intfloat/e5-small" + } + }, + "body": { + "embed": { + "model": "intfloat/e5-small", + "splitter": "recursive_character" + }, + "full_text_search": { + "configuration": "english" + } } }) .into(), ), - ); - let mut collection = Collection::new("test_r_c_cvswle_28", None); + )?; + let mut collection = Collection::new(collection_name, None); collection.add_pipeline(&mut pipeline).await?; - - // Recreate the pipeline to replicate a more accurate example - let mut pipeline = Pipeline::new("test_r_p_cvswle_1", None, None, None); - collection - .upsert_documents(generate_dummy_documents(3), None) - .await?; - let results = collection - .vector_search("Here is some query", &mut pipeline, None, None) - .await?; - assert!(results.len() == 3); + let documents = generate_dummy_documents(2); + collection.upsert_documents(documents.clone(), None).await?; + let pool = get_or_initialize_pool(&None).await?; + let documents_table = format!("{}.documents", collection_name); + let queried_documents: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", documents_table)) + .fetch_all(&pool) + .await?; + assert!(queried_documents.len() == 2); + for (d, qd) in std::iter::zip(documents, queried_documents) { + assert_eq!(d, qd.document); + } + let chunks_table = format!("{}_{}.title_chunks", collection_name, pipeline_name); + let title_chunks: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", chunks_table)) + .fetch_all(&pool) + .await?; + assert!(title_chunks.len() == 2); + let chunks_table = format!("{}_{}.body_chunks", collection_name, 
pipeline_name); + let body_chunks: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", chunks_table)) + .fetch_all(&pool) + .await?; + assert!(body_chunks.len() == 2); collection.archive().await?; + let tsvectors_table = format!("{}_{}.body_tsvectors", collection_name, pipeline_name); + let tsvectors: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", tsvectors_table)) + .fetch_all(&pool) + .await?; + assert!(tsvectors.len() == 2); Ok(()) } #[sqlx::test] - async fn can_vector_search_with_remote_embeddings() -> anyhow::Result<()> { + async fn can_upsert_documents_and_add_pipeline() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let model = Model::new( - Some("text-embedding-ada-002".to_string()), - Some("openai".to_string()), - None, - ); - let splitter = Splitter::default(); - let mut pipeline = Pipeline::new( - "test_r_p_cvswre_1", - Some(model), - Some(splitter), + let collection_name = "test_r_c_cudaap_34"; + let mut collection = Collection::new(collection_name, None); + let documents = generate_dummy_documents(2); + collection.upsert_documents(documents.clone(), None).await?; + let pipeline_name = "test_r_p_cudaap_6"; + let mut pipeline = MultiFieldPipeline::new( + pipeline_name, Some( - serde_json::json!({ - "full_text_search": { - "active": true, - "configuration": "english" + json!({ + "title": { + "embed": { + "model": "intfloat/e5-small" + } + }, + "body": { + "embed": { + "model": "intfloat/e5-small", + "splitter": "recursive_character" + }, + "full_text_search": { + "configuration": "english" + } } }) .into(), ), - ); - let mut collection = Collection::new("test_r_c_cvswre_21", None); + )?; collection.add_pipeline(&mut pipeline).await?; - - // Recreate the pipeline to replicate a more accurate example - let mut pipeline = Pipeline::new("test_r_p_cvswre_1", None, None, None); - collection - .upsert_documents(generate_dummy_documents(3), None) - .await?; - let results = collection - .vector_search("Here is some query", &mut pipeline, None, Some(10)) - .await?; - assert!(results.len() == 3); + let pool = get_or_initialize_pool(&None).await?; + let documents_table = format!("{}.documents", collection_name); + let queried_documents: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", documents_table)) + .fetch_all(&pool) + .await?; + assert!(queried_documents.len() == 2); + for (d, qd) in std::iter::zip(documents, queried_documents) { + assert_eq!(d, qd.document); + } + let chunks_table = format!("{}_{}.title_chunks", collection_name, pipeline_name); + let title_chunks: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", chunks_table)) + .fetch_all(&pool) + .await?; + assert!(title_chunks.len() == 2); + let chunks_table = format!("{}_{}.body_chunks", collection_name, pipeline_name); + let body_chunks: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", chunks_table)) + .fetch_all(&pool) + .await?; + assert!(body_chunks.len() == 2); collection.archive().await?; + let tsvectors_table = format!("{}_{}.body_tsvectors", collection_name, pipeline_name); + let tsvectors: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", tsvectors_table)) + .fetch_all(&pool) + .await?; + assert!(tsvectors.len() == 2); Ok(()) } #[sqlx::test] - async fn can_vector_search_with_query_builder() -> anyhow::Result<()> { + async fn can_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let model = Model::default(); - let splitter = Splitter::default(); - let mut pipeline = Pipeline::new( - "test_r_p_cvswqb_1", 
- Some(model), - Some(splitter), + let collection_name = "test_r_c_cs_44"; + let mut collection = Collection::new(collection_name, None); + let documents = generate_dummy_documents(10000); + collection.upsert_documents(documents.clone(), None).await?; + let pipeline_name = "test_r_p_cs_7"; + let mut pipeline = MultiFieldPipeline::new( + pipeline_name, Some( - serde_json::json!({ - "full_text_search": { - "active": true, - "configuration": "english" + json!({ + "title": { + "embed": { + "model": "intfloat/e5-small" + }, + "full_text_search": { + "configuration": "english" + } + }, + "body": { + "embed": { + "model": "intfloat/e5-small", + "splitter": "recursive_character" + }, + "full_text_search": { + "configuration": "english" + } + }, + "notes": { + "embed": { + "model": "intfloat/e5-small" + } } }) .into(), ), - ); - let mut collection = Collection::new("test_r_c_cvswqb_4", None); + )?; collection.add_pipeline(&mut pipeline).await?; - - // Recreate the pipeline to replicate a more accurate example - let pipeline = Pipeline::new("test_r_p_cvswqb_1", None, None, None); - collection - .upsert_documents(generate_dummy_documents(4), None) - .await?; let results = collection - .query() - .vector_recall("Here is some query", &pipeline, None) - .limit(3) - .fetch_all() - .await?; - assert!(results.len() == 3); - collection.archive().await?; - Ok(()) - } - - #[sqlx::test] - async fn can_vector_search_with_query_builder_and_pass_model_parameters_in_search( - ) -> anyhow::Result<()> { - internal_init_logger(None, None).ok(); - let model = Model::new( - Some("hkunlp/instructor-base".to_string()), - Some("python".to_string()), - Some(json!({"instruction": "Represent the Wikipedia document for retrieval: "}).into()), - ); - let splitter = Splitter::default(); - let mut pipeline = Pipeline::new( - "test_r_p_cvswqbapmpis_1", - Some(model), - Some(splitter), - Some( - serde_json::json!({ - "full_text_search": { - "active": true, - "configuration": "english" - } + .search( + json!({ + "query": { + // "full_text_search": { + // "title": { + // "query": "test", + // "boost": 4.0 + // }, + // "body": { + // "query": "Test", + // "boost": 1.2 + // } + // }, + "semantic_search": { + "title": { + "query": "This is a test", + "boost": 2.0 + }, + // "body": { + // "query": "This is the body test", + // "boost": 1.01 + // }, + // "notes": { + // "query": "This is the notes test", + // "boost": 1.01 + // } + } + }, + "limit": 5 }) .into(), - ), - ); - let mut collection = Collection::new("test_r_c_cvswqbapmpis_4", None); - collection.add_pipeline(&mut pipeline).await?; - - // Recreate the pipeline to replicate a more accurate example - let pipeline = Pipeline::new("test_r_p_cvswqbapmpis_1", None, None, None); - collection - .upsert_documents(generate_dummy_documents(3), None) - .await?; - let results = collection - .query() - .vector_recall( - "Here is some query", &pipeline, - Some( - json!({ - "instruction": "Represent the Wikipedia document for retrieval: " - }) - .into(), - ), ) - .limit(10) - .fetch_all() .await?; - assert!(results.len() == 3); + assert!(results.len() == 5); + let ids: Vec = results + .into_iter() + .map(|r| r["document"]["id"].as_u64().unwrap()) + .collect(); + assert_eq!(ids, vec![1, 2, 0, 3, 7]); collection.archive().await?; + // results.into_iter().for_each(|r| { + // println!("{}", serde_json::to_string_pretty(&r.0).unwrap()); + // }); Ok(()) } #[sqlx::test] - async fn can_vector_search_with_query_builder_with_remote_embeddings() -> anyhow::Result<()> { + async fn 
can_search_with_remote_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let model = Model::new( - Some("text-embedding-ada-002".to_string()), - Some("openai".to_string()), - None, - ); - let splitter = Splitter::default(); - let mut pipeline = Pipeline::new( - "test_r_p_cvswqbwre_1", - Some(model), - Some(splitter), + let collection_name = "test_r_c_cswre_47"; + let mut collection = Collection::new(collection_name, None); + let documents = generate_dummy_documents(10); + collection.upsert_documents(documents.clone(), None).await?; + let pipeline_name = "test_r_p_cswre_7"; + let mut pipeline = MultiFieldPipeline::new( + pipeline_name, Some( - serde_json::json!({ - "full_text_search": { - "active": true, - "configuration": "english" - } + json!({ + "title": { + "embed": { + "model": "intfloat/e5-small" + } + }, + "body": { + "embed": { + "model": "text-embedding-ada-002", + "source": "openai", + "splitter": "recursive_character" + }, + "full_text_search": { + "configuration": "english" + } + }, }) .into(), ), - ); - let mut collection = Collection::new("test_r_c_cvswqbwre_5", None); - collection.add_pipeline(&mut pipeline).await?; - - // Recreate the pipeline to replicate a more accurate example - let pipeline = Pipeline::new("test_r_p_cvswqbwre_1", None, None, None); - collection - .upsert_documents(generate_dummy_documents(4), None) - .await?; - let results = collection - .query() - .vector_recall("Here is some query", &pipeline, None) - .limit(3) - .fetch_all() - .await?; - assert!(results.len() == 3); - collection.archive().await?; - Ok(()) - } - - #[sqlx::test] - async fn can_vector_search_with_query_builder_and_custom_hnsw_ef_search_value( - ) -> anyhow::Result<()> { - internal_init_logger(None, None).ok(); - let model = Model::default(); - let splitter = Splitter::default(); - let mut pipeline = - Pipeline::new("test_r_p_cvswqbachesv_1", Some(model), Some(splitter), None); - let mut collection = Collection::new("test_r_c_cvswqbachesv_3", None); + )?; collection.add_pipeline(&mut pipeline).await?; - - // Recreate the pipeline to replicate a more accurate example - let pipeline = Pipeline::new("test_r_p_cvswqbachesv_1", None, None, None); - collection - .upsert_documents(generate_dummy_documents(3), None) - .await?; let results = collection - .query() - .vector_recall( - "Here is some query", - &pipeline, - Some( - json!({ - "hnsw": { - "ef_search": 2 + .search( + json!({ + "query": { + "full_text_search": { + "body": { + "query": "Test", + "boost": 1.2 + } + }, + "semantic_search": { + "title": { + "query": "This is a test", + "boost": 2.0 + }, + "body": { + "query": "This is the body test", + "boost": 1.01 + }, } - }) - .into(), - ), + }, + "limit": 5 + }) + .into(), + &pipeline, ) - .fetch_all() .await?; - assert!(results.len() == 3); + assert!(results.len() == 5); + let ids: Vec = results + .into_iter() + .map(|r| r["document"]["id"].as_u64().unwrap()) + .collect(); + assert_eq!(ids, vec![1, 2, 3, 4, 0]); collection.archive().await?; + // results.into_iter().for_each(|r| { + // println!("{}", serde_json::to_string_pretty(&r.0).unwrap()); + // }); Ok(()) } #[sqlx::test] - async fn can_vector_search_with_query_builder_and_custom_hnsw_ef_search_value_and_remote_embeddings( - ) -> anyhow::Result<()> { + async fn can_vector_search() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let model = Model::new( - Some("text-embedding-ada-002".to_string()), - Some("openai".to_string()), - None, - ); - let splitter = Splitter::default(); - let mut 
pipeline = Pipeline::new( - "test_r_p_cvswqbachesvare_2", - Some(model), - Some(splitter), - None, - ); - let mut collection = Collection::new("test_r_c_cvswqbachesvare_7", None); + let collection_name = "test_r_c_cvs_0"; + let mut collection = Collection::new(collection_name, None); + let documents = generate_dummy_documents(10); + collection.upsert_documents(documents.clone(), None).await?; + let pipeline_name = "test_r_p_cvs_0"; + let mut pipeline = MultiFieldPipeline::new( + pipeline_name, + Some( + json!({ + "title": { + "embed": { + "model": "intfloat/e5-small" + }, + }, + "body": { + "embed": { + "model": "intfloat/e5-small", + "splitter": "recursive_character" + }, + }, + }) + .into(), + ), + )?; collection.add_pipeline(&mut pipeline).await?; - - // Recreate the pipeline to replicate a more accurate example - let pipeline = Pipeline::new("test_r_p_cvswqbachesvare_2", None, None, None); - collection - .upsert_documents(generate_dummy_documents(3), None) - .await?; let results = collection - .query() - .vector_recall( - "Here is some query", - &pipeline, + .vector_search( + "Test query string", + &mut pipeline, Some( json!({ - "hnsw": { - "ef_search": 2 - } + "fields": [ + "title", "body" + ] }) .into(), ), + None, ) - .fetch_all() .await?; - assert!(results.len() == 3); - collection.archive().await?; + // results.into_iter().for_each(|r| { + // println!("{}", serde_json::to_string_pretty(&r.0).unwrap()); + // }); Ok(()) } #[sqlx::test] - async fn can_filter_vector_search() -> anyhow::Result<()> { + async fn generate_er_diagram() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let model = Model::default(); - let splitter = Splitter::default(); - let mut pipeline = Pipeline::new( - "test_r_p_cfd_1", - Some(model), - Some(splitter), + let mut pipeline = MultiFieldPipeline::new( + "test_p_ged_57", Some( - serde_json::json!({ - "full_text_search": { - "active": true, - "configuration": "english" - } - }) - .into(), - ), - ); - let mut collection = Collection::new("test_r_c_cfd_2", None); - collection.add_pipeline(&mut pipeline).await?; - collection - .upsert_documents(generate_dummy_documents(5), None) - .await?; - - let filters = vec![ - (5, json!({}).into()), - ( - 3, json!({ - "metadata": { - "id": { - "$lt": 3 + "title": { + "embed": { + "model": "intfloat/e5-small" + }, + "full_text_search": { + "configuration": "english" + } + }, + "body": { + "embed": { + "model": "intfloat/e5-small", + "splitter": "recursive_character" + }, + "full_text_search": { + "configuration": "english" + } + }, + "notes": { + "embed": { + "model": "intfloat/e5-small" + } } - } - }) - .into(), - ), - ( - 1, - json!({ - "full_text_search": { - "configuration": "english", - "text": "1", - } }) .into(), ), - ]; - - for (expected_result_count, filter) in filters { - let results = collection - .query() - .vector_recall("Here is some query", &pipeline, None) - .filter(filter) - .fetch_all() - .await?; - assert_eq!(results.len(), expected_result_count); - } - + )?; + let mut collection = Collection::new("test_r_c_ged_1", None); + collection.add_pipeline(&mut pipeline).await?; + let diagram = collection.generate_er_diagram(&mut pipeline).await?; + assert!(!diagram.is_empty()); collection.archive().await?; Ok(()) } - /////////////////////////////// - // Working With Documents ///// - /////////////////////////////// + // TODO: Test + // - remote embeddings + // - some kind of simlutaneous upload with async threads and join + // - test the splitting is working correctly + // - test that different 
splitters and models are working correctly - #[sqlx::test] - async fn can_upsert_and_filter_get_documents() -> anyhow::Result<()> { - internal_init_logger(None, None).ok(); - let model = Model::default(); - let splitter = Splitter::default(); - let mut pipeline = Pipeline::new( - "test_r_p_cuafgd_1", - Some(model), - Some(splitter), - Some( - serde_json::json!({ - "full_text_search": { - "active": true, - "configuration": "english" - } - }) - .into(), - ), - ); + // TODO: DO + // - update upsert_documents to not re run pipeline if it is not part of the schema - let mut collection = Collection::new("test_r_c_cuagd_2", None); - collection.add_pipeline(&mut pipeline).await?; + // #[sqlx::test] + // async fn can_specify_custom_hnsw_parameters_for_pipelines() -> anyhow::Result<()> { + // internal_init_logger(None, None).ok(); + // let model = Model::default(); + // let splitter = Splitter::default(); + // let mut pipeline = Pipeline::new( + // "test_r_p_cschpfp_0", + // Some(model), + // Some(splitter), + // Some( + // serde_json::json!({ + // "hnsw": { + // "m": 100, + // "ef_construction": 200 + // } + // }) + // .into(), + // ), + // ); + // let collection_name = "test_r_c_cschpfp_1"; + // let mut collection = Collection::new(collection_name, None); + // collection.add_pipeline(&mut pipeline).await?; + // let full_embeddings_table_name = pipeline.create_or_get_embeddings_table().await?; + // let embeddings_table_name = full_embeddings_table_name.split('.').collect::>()[1]; + // let pool = get_or_initialize_pool(&None).await?; + // let results: Vec<(String, String)> = sqlx::query_as(&query_builder!( + // "select indexname, indexdef from pg_indexes where tablename = '%d' and schemaname = '%d'", + // embeddings_table_name, + // collection_name + // )).fetch_all(&pool).await?; + // let names = results.iter().map(|(name, _)| name).collect::>(); + // let definitions = results + // .iter() + // .map(|(_, definition)| definition) + // .collect::>(); + // assert!(names.contains(&&format!("{}_pipeline_hnsw_vector_index", pipeline.name))); + // assert!(definitions.contains(&&format!("CREATE INDEX {}_pipeline_hnsw_vector_index ON {} USING hnsw (embedding vector_cosine_ops) WITH (m='100', ef_construction='200')", pipeline.name, full_embeddings_table_name))); + // Ok(()) + // } - // Test basic upsert - let documents = vec![ - serde_json::json!({"id": 1, "random_key": 10, "text": "hello world 1"}).into(), - serde_json::json!({"id": 2, "random_key": 11, "text": "hello world 2"}).into(), - serde_json::json!({"id": 3, "random_key": 12, "text": "hello world 3"}).into(), - ]; - collection.upsert_documents(documents.clone(), None).await?; - let document = &collection.get_documents(None).await?[0]; - assert_eq!(document["document"]["text"], "hello world 1"); - - // Test upsert of text and metadata - let documents = vec![ - serde_json::json!({"id": 1, "text": "hello world new"}).into(), - serde_json::json!({"id": 2, "random_key": 12}).into(), - serde_json::json!({"id": 3, "random_key": 13}).into(), - ]; - collection.upsert_documents(documents.clone(), None).await?; + // #[sqlx::test] + // async fn disable_enable_pipeline() -> anyhow::Result<()> { + // let model = Model::default(); + // let splitter = Splitter::default(); + // let mut pipeline = Pipeline::new("test_p_dep_0", Some(model), Some(splitter), None); + // let mut collection = Collection::new("test_r_c_dep_1", None); + // collection.add_pipeline(&mut pipeline).await?; + // let queried_pipeline = &collection.get_pipelines().await?[0]; + // 
assert_eq!(pipeline.name, queried_pipeline.name); + // collection.disable_pipeline(&pipeline).await?; + // let queried_pipelines = &collection.get_pipelines().await?; + // assert!(queried_pipelines.is_empty()); + // collection.enable_pipeline(&pipeline).await?; + // let queried_pipeline = &collection.get_pipelines().await?[0]; + // assert_eq!(pipeline.name, queried_pipeline.name); + // collection.archive().await?; + // Ok(()) + // } - let documents = collection - .get_documents(Some( - serde_json::json!({ - "filter": { - "metadata": { - "random_key": { - "$eq": 12 - } - } - } - }) - .into(), - )) - .await?; - assert_eq!(documents[0]["document"]["text"], "hello world 2"); - - let documents = collection - .get_documents(Some( - serde_json::json!({ - "filter": { - "metadata": { - "random_key": { - "$gte": 13 - } - } - } - }) - .into(), - )) - .await?; - assert_eq!(documents[0]["document"]["text"], "hello world 3"); + // #[sqlx::test] + // async fn sync_multiple_pipelines() -> anyhow::Result<()> { + // internal_init_logger(None, None).ok(); + // let model = Model::default(); + // let splitter = Splitter::default(); + // let mut pipeline1 = Pipeline::new( + // "test_r_p_smp_0", + // Some(model.clone()), + // Some(splitter.clone()), + // Some( + // serde_json::json!({ + // "full_text_search": { + // "active": true, + // "configuration": "english" + // } + // }) + // .into(), + // ), + // ); + // let mut pipeline2 = Pipeline::new( + // "test_r_p_smp_1", + // Some(model), + // Some(splitter), + // Some( + // serde_json::json!({ + // "full_text_search": { + // "active": true, + // "configuration": "english" + // } + // }) + // .into(), + // ), + // ); + // let mut collection = Collection::new("test_r_c_smp_3", None); + // collection.add_pipeline(&mut pipeline1).await?; + // collection.add_pipeline(&mut pipeline2).await?; + // collection + // .upsert_documents(generate_dummy_documents(3), None) + // .await?; + // let status_1 = pipeline1.get_status().await?; + // let status_2 = pipeline2.get_status().await?; + // assert!( + // status_1.chunks_status.synced == status_1.chunks_status.total + // && status_1.chunks_status.not_synced == 0 + // ); + // assert!( + // status_2.chunks_status.synced == status_2.chunks_status.total + // && status_2.chunks_status.not_synced == 0 + // ); + // collection.archive().await?; + // Ok(()) + // } - let documents = collection - .get_documents(Some( - serde_json::json!({ - "filter": { - "full_text_search": { - "configuration": "english", - "text": "new" - } - } - }) - .into(), - )) - .await?; - assert_eq!(documents[0]["document"]["text"], "hello world new"); - assert_eq!(documents[0]["document"]["id"].as_i64().unwrap(), 1); + // /////////////////////////////// + // // Various Searches /////////// + // /////////////////////////////// - collection.archive().await?; - Ok(()) - } + // #[sqlx::test] + // async fn can_vector_search_with_local_embeddings() -> anyhow::Result<()> { + // internal_init_logger(None, None).ok(); + // let model = Model::default(); + // let splitter = Splitter::default(); + // let mut pipeline = Pipeline::new( + // "test_r_p_cvswle_1", + // Some(model), + // Some(splitter), + // Some( + // serde_json::json!({ + // "full_text_search": { + // "active": true, + // "configuration": "english" + // } + // }) + // .into(), + // ), + // ); + // let mut collection = Collection::new("test_r_c_cvswle_28", None); + // collection.add_pipeline(&mut pipeline).await?; + + // // Recreate the pipeline to replicate a more accurate example + // let mut pipeline = 
Pipeline::new("test_r_p_cvswle_1", None, None, None); + // collection + // .upsert_documents(generate_dummy_documents(3), None) + // .await?; + // let results = collection + // .vector_search("Here is some query", &mut pipeline, None, None) + // .await?; + // assert!(results.len() == 3); + // collection.archive().await?; + // Ok(()) + // } - #[sqlx::test] - async fn can_paginate_get_documents() -> anyhow::Result<()> { - internal_init_logger(None, None).ok(); - let mut collection = Collection::new("test_r_c_cpgd_2", None); - collection - .upsert_documents(generate_dummy_documents(10), None) - .await?; + // #[sqlx::test] + // async fn can_vector_search_with_remote_embeddings() -> anyhow::Result<()> { + // internal_init_logger(None, None).ok(); + // let model = Model::new( + // Some("text-embedding-ada-002".to_string()), + // Some("openai".to_string()), + // None, + // ); + // let splitter = Splitter::default(); + // let mut pipeline = Pipeline::new( + // "test_r_p_cvswre_1", + // Some(model), + // Some(splitter), + // Some( + // serde_json::json!({ + // "full_text_search": { + // "active": true, + // "configuration": "english" + // } + // }) + // .into(), + // ), + // ); + // let mut collection = Collection::new("test_r_c_cvswre_21", None); + // collection.add_pipeline(&mut pipeline).await?; + + // // Recreate the pipeline to replicate a more accurate example + // let mut pipeline = Pipeline::new("test_r_p_cvswre_1", None, None, None); + // collection + // .upsert_documents(generate_dummy_documents(3), None) + // .await?; + // let results = collection + // .vector_search("Here is some query", &mut pipeline, None, Some(10)) + // .await?; + // assert!(results.len() == 3); + // collection.archive().await?; + // Ok(()) + // } - let documents = collection - .get_documents(Some( - serde_json::json!({ - "limit": 5, - "offset": 0 - }) - .into(), - )) - .await?; - assert_eq!( - documents - .into_iter() - .map(|d| d["row_id"].as_i64().unwrap()) - .collect::>(), - vec![1, 2, 3, 4, 5] - ); - - let documents = collection - .get_documents(Some( - serde_json::json!({ - "limit": 2, - "offset": 5 - }) - .into(), - )) - .await?; - let last_row_id = documents.last().unwrap()["row_id"].as_i64().unwrap(); - assert_eq!( - documents - .into_iter() - .map(|d| d["row_id"].as_i64().unwrap()) - .collect::>(), - vec![6, 7] - ); - - let documents = collection - .get_documents(Some( - serde_json::json!({ - "limit": 2, - "last_row_id": last_row_id - }) - .into(), - )) - .await?; - let last_row_id = documents.last().unwrap()["row_id"].as_i64().unwrap(); - assert_eq!( - documents - .into_iter() - .map(|d| d["row_id"].as_i64().unwrap()) - .collect::>(), - vec![8, 9] - ); - - let documents = collection - .get_documents(Some( - serde_json::json!({ - "limit": 1, - "last_row_id": last_row_id - }) - .into(), - )) - .await?; - assert_eq!( - documents - .into_iter() - .map(|d| d["row_id"].as_i64().unwrap()) - .collect::>(), - vec![10] - ); + // #[sqlx::test] + // async fn can_vector_search_with_query_builder() -> anyhow::Result<()> { + // internal_init_logger(None, None).ok(); + // let model = Model::default(); + // let splitter = Splitter::default(); + // let mut pipeline = Pipeline::new( + // "test_r_p_cvswqb_1", + // Some(model), + // Some(splitter), + // Some( + // serde_json::json!({ + // "full_text_search": { + // "active": true, + // "configuration": "english" + // } + // }) + // .into(), + // ), + // ); + // let mut collection = Collection::new("test_r_c_cvswqb_4", None); + // collection.add_pipeline(&mut pipeline).await?; 
+ + // // Recreate the pipeline to replicate a more accurate example + // let pipeline = Pipeline::new("test_r_p_cvswqb_1", None, None, None); + // collection + // .upsert_documents(generate_dummy_documents(4), None) + // .await?; + // let results = collection + // .query() + // .vector_recall("Here is some query", &pipeline, None) + // .limit(3) + // .fetch_all() + // .await?; + // assert!(results.len() == 3); + // collection.archive().await?; + // Ok(()) + // } - collection.archive().await?; - Ok(()) - } + // #[sqlx::test] + // async fn can_vector_search_with_query_builder_and_pass_model_parameters_in_search( + // ) -> anyhow::Result<()> { + // internal_init_logger(None, None).ok(); + // let model = Model::new( + // Some("hkunlp/instructor-base".to_string()), + // Some("python".to_string()), + // Some(json!({"instruction": "Represent the Wikipedia document for retrieval: "}).into()), + // ); + // let splitter = Splitter::default(); + // let mut pipeline = Pipeline::new( + // "test_r_p_cvswqbapmpis_1", + // Some(model), + // Some(splitter), + // Some( + // serde_json::json!({ + // "full_text_search": { + // "active": true, + // "configuration": "english" + // } + // }) + // .into(), + // ), + // ); + // let mut collection = Collection::new("test_r_c_cvswqbapmpis_4", None); + // collection.add_pipeline(&mut pipeline).await?; + + // // Recreate the pipeline to replicate a more accurate example + // let pipeline = Pipeline::new("test_r_p_cvswqbapmpis_1", None, None, None); + // collection + // .upsert_documents(generate_dummy_documents(3), None) + // .await?; + // let results = collection + // .query() + // .vector_recall( + // "Here is some query", + // &pipeline, + // Some( + // json!({ + // "instruction": "Represent the Wikipedia document for retrieval: " + // }) + // .into(), + // ), + // ) + // .limit(10) + // .fetch_all() + // .await?; + // assert!(results.len() == 3); + // collection.archive().await?; + // Ok(()) + // } - #[sqlx::test] - async fn can_filter_and_paginate_get_documents() -> anyhow::Result<()> { - internal_init_logger(None, None).ok(); - let model = Model::default(); - let splitter = Splitter::default(); - let mut pipeline = Pipeline::new( - "test_r_p_cfapgd_1", - Some(model), - Some(splitter), - Some( - serde_json::json!({ - "full_text_search": { - "active": true, - "configuration": "english" - } - }) - .into(), - ), - ); + // #[sqlx::test] + // async fn can_vector_search_with_query_builder_with_remote_embeddings() -> anyhow::Result<()> { + // internal_init_logger(None, None).ok(); + // let model = Model::new( + // Some("text-embedding-ada-002".to_string()), + // Some("openai".to_string()), + // None, + // ); + // let splitter = Splitter::default(); + // let mut pipeline = Pipeline::new( + // "test_r_p_cvswqbwre_1", + // Some(model), + // Some(splitter), + // Some( + // serde_json::json!({ + // "full_text_search": { + // "active": true, + // "configuration": "english" + // } + // }) + // .into(), + // ), + // ); + // let mut collection = Collection::new("test_r_c_cvswqbwre_5", None); + // collection.add_pipeline(&mut pipeline).await?; + + // // Recreate the pipeline to replicate a more accurate example + // let pipeline = Pipeline::new("test_r_p_cvswqbwre_1", None, None, None); + // collection + // .upsert_documents(generate_dummy_documents(4), None) + // .await?; + // let results = collection + // .query() + // .vector_recall("Here is some query", &pipeline, None) + // .limit(3) + // .fetch_all() + // .await?; + // assert!(results.len() == 3); + // 
collection.archive().await?; + // Ok(()) + // } - let mut collection = Collection::new("test_r_c_cfapgd_1", None); - collection.add_pipeline(&mut pipeline).await?; + // #[sqlx::test] + // async fn can_vector_search_with_query_builder_and_custom_hnsw_ef_search_value( + // ) -> anyhow::Result<()> { + // internal_init_logger(None, None).ok(); + // let model = Model::default(); + // let splitter = Splitter::default(); + // let mut pipeline = + // Pipeline::new("test_r_p_cvswqbachesv_1", Some(model), Some(splitter), None); + // let mut collection = Collection::new("test_r_c_cvswqbachesv_3", None); + // collection.add_pipeline(&mut pipeline).await?; + + // // Recreate the pipeline to replicate a more accurate example + // let pipeline = Pipeline::new("test_r_p_cvswqbachesv_1", None, None, None); + // collection + // .upsert_documents(generate_dummy_documents(3), None) + // .await?; + // let results = collection + // .query() + // .vector_recall( + // "Here is some query", + // &pipeline, + // Some( + // json!({ + // "hnsw": { + // "ef_search": 2 + // } + // }) + // .into(), + // ), + // ) + // .fetch_all() + // .await?; + // assert!(results.len() == 3); + // collection.archive().await?; + // Ok(()) + // } - collection - .upsert_documents(generate_dummy_documents(10), None) - .await?; + // #[sqlx::test] + // async fn can_vector_search_with_query_builder_and_custom_hnsw_ef_search_value_and_remote_embeddings( + // ) -> anyhow::Result<()> { + // internal_init_logger(None, None).ok(); + // let model = Model::new( + // Some("text-embedding-ada-002".to_string()), + // Some("openai".to_string()), + // None, + // ); + // let splitter = Splitter::default(); + // let mut pipeline = Pipeline::new( + // "test_r_p_cvswqbachesvare_2", + // Some(model), + // Some(splitter), + // None, + // ); + // let mut collection = Collection::new("test_r_c_cvswqbachesvare_7", None); + // collection.add_pipeline(&mut pipeline).await?; + + // // Recreate the pipeline to replicate a more accurate example + // let pipeline = Pipeline::new("test_r_p_cvswqbachesvare_2", None, None, None); + // collection + // .upsert_documents(generate_dummy_documents(3), None) + // .await?; + // let results = collection + // .query() + // .vector_recall( + // "Here is some query", + // &pipeline, + // Some( + // json!({ + // "hnsw": { + // "ef_search": 2 + // } + // }) + // .into(), + // ), + // ) + // .fetch_all() + // .await?; + // assert!(results.len() == 3); + // collection.archive().await?; + // Ok(()) + // } - let documents = collection - .get_documents(Some( - serde_json::json!({ - "filter": { - "metadata": { - "id": { - "$gte": 2 - } - } - }, - "limit": 2, - "offset": 0 - }) - .into(), - )) - .await?; - assert_eq!( - documents - .into_iter() - .map(|d| d["document"]["id"].as_i64().unwrap()) - .collect::>(), - vec![2, 3] - ); - - let documents = collection - .get_documents(Some( - serde_json::json!({ - "filter": { - "metadata": { - "id": { - "$lte": 5 - } - } - }, - "limit": 100, - "offset": 4 - }) - .into(), - )) - .await?; - let last_row_id = documents.last().unwrap()["row_id"].as_i64().unwrap(); - assert_eq!( - documents - .into_iter() - .map(|d| d["document"]["id"].as_i64().unwrap()) - .collect::>(), - vec![4, 5] - ); - - let documents = collection - .get_documents(Some( - serde_json::json!({ - "filter": { - "full_text_search": { - "configuration": "english", - "text": "document" - } - }, - "limit": 100, - "last_row_id": last_row_id - }) - .into(), - )) - .await?; - assert_eq!( - documents - .into_iter() - .map(|d| 
d["document"]["id"].as_i64().unwrap()) - .collect::>(), - vec![6, 7, 8, 9] - ); + // #[sqlx::test] + // async fn can_filter_vector_search() -> anyhow::Result<()> { + // internal_init_logger(None, None).ok(); + // let model = Model::default(); + // let splitter = Splitter::default(); + // let mut pipeline = Pipeline::new( + // "test_r_p_cfd_1", + // Some(model), + // Some(splitter), + // Some( + // serde_json::json!({ + // "full_text_search": { + // "active": true, + // "configuration": "english" + // } + // }) + // .into(), + // ), + // ); + // let mut collection = Collection::new("test_r_c_cfd_2", None); + // collection.add_pipeline(&mut pipeline).await?; + // collection + // .upsert_documents(generate_dummy_documents(5), None) + // .await?; + + // let filters = vec![ + // (5, json!({}).into()), + // ( + // 3, + // json!({ + // "metadata": { + // "id": { + // "$lt": 3 + // } + // } + // }) + // .into(), + // ), + // ( + // 1, + // json!({ + // "full_text_search": { + // "configuration": "english", + // "text": "1", + // } + // }) + // .into(), + // ), + // ]; + + // for (expected_result_count, filter) in filters { + // let results = collection + // .query() + // .vector_recall("Here is some query", &pipeline, None) + // .filter(filter) + // .fetch_all() + // .await?; + // assert_eq!(results.len(), expected_result_count); + // } - collection.archive().await?; - Ok(()) - } + // collection.archive().await?; + // Ok(()) + // } - #[sqlx::test] - async fn can_filter_and_delete_documents() -> anyhow::Result<()> { - internal_init_logger(None, None).ok(); - let model = Model::default(); - let splitter = Splitter::default(); - let mut pipeline = Pipeline::new( - "test_r_p_cfadd_1", - Some(model), - Some(splitter), - Some( - serde_json::json!({ - "full_text_search": { - "active": true, - "configuration": "english" - } - }) - .into(), - ), - ); + // /////////////////////////////// + // // Working With Documents ///// + // /////////////////////////////// - let mut collection = Collection::new("test_r_c_cfadd_1", None); - collection.add_pipeline(&mut pipeline).await?; - collection - .upsert_documents(generate_dummy_documents(10), None) - .await?; + // #[sqlx::test] + // async fn can_upsert_and_filter_get_documents() -> anyhow::Result<()> { + // internal_init_logger(None, None).ok(); + // let model = Model::default(); + // let splitter = Splitter::default(); + // let mut pipeline = Pipeline::new( + // "test_r_p_cuafgd_1", + // Some(model), + // Some(splitter), + // Some( + // serde_json::json!({ + // "full_text_search": { + // "active": true, + // "configuration": "english" + // } + // }) + // .into(), + // ), + // ); - collection - .delete_documents( - serde_json::json!({ - "metadata": { - "id": { - "$lt": 2 - } - } - }) - .into(), - ) - .await?; - let documents = collection.get_documents(None).await?; - assert_eq!(documents.len(), 8); - assert!(documents - .iter() - .all(|d| d["document"]["id"].as_i64().unwrap() >= 2)); - - collection - .delete_documents( - serde_json::json!({ - "full_text_search": { - "configuration": "english", - "text": "2" - } - }) - .into(), - ) - .await?; - let documents = collection.get_documents(None).await?; - assert_eq!(documents.len(), 7); - assert!(documents - .iter() - .all(|d| d["document"]["id"].as_i64().unwrap() > 2)); - - collection - .delete_documents( - serde_json::json!({ - "metadata": { - "id": { - "$gte": 6 - } - }, - "full_text_search": { - "configuration": "english", - "text": "6" - } - }) - .into(), - ) - .await?; - let documents = 
collection.get_documents(None).await?; - assert_eq!(documents.len(), 6); - assert!(documents - .iter() - .all(|d| d["document"]["id"].as_i64().unwrap() != 6)); + // let mut collection = Collection::new("test_r_c_cuagd_2", None); + // collection.add_pipeline(&mut pipeline).await?; + + // // Test basic upsert + // let documents = vec![ + // serde_json::json!({"id": 1, "random_key": 10, "text": "hello world 1"}).into(), + // serde_json::json!({"id": 2, "random_key": 11, "text": "hello world 2"}).into(), + // serde_json::json!({"id": 3, "random_key": 12, "text": "hello world 3"}).into(), + // ]; + // collection.upsert_documents(documents.clone(), None).await?; + // let document = &collection.get_documents(None).await?[0]; + // assert_eq!(document["document"]["text"], "hello world 1"); + + // // Test upsert of text and metadata + // let documents = vec![ + // serde_json::json!({"id": 1, "text": "hello world new"}).into(), + // serde_json::json!({"id": 2, "random_key": 12}).into(), + // serde_json::json!({"id": 3, "random_key": 13}).into(), + // ]; + // collection.upsert_documents(documents.clone(), None).await?; + + // let documents = collection + // .get_documents(Some( + // serde_json::json!({ + // "filter": { + // "metadata": { + // "random_key": { + // "$eq": 12 + // } + // } + // } + // }) + // .into(), + // )) + // .await?; + // assert_eq!(documents[0]["document"]["text"], "hello world 2"); + + // let documents = collection + // .get_documents(Some( + // serde_json::json!({ + // "filter": { + // "metadata": { + // "random_key": { + // "$gte": 13 + // } + // } + // } + // }) + // .into(), + // )) + // .await?; + // assert_eq!(documents[0]["document"]["text"], "hello world 3"); + + // let documents = collection + // .get_documents(Some( + // serde_json::json!({ + // "filter": { + // "full_text_search": { + // "configuration": "english", + // "text": "new" + // } + // } + // }) + // .into(), + // )) + // .await?; + // assert_eq!(documents[0]["document"]["text"], "hello world new"); + // assert_eq!(documents[0]["document"]["id"].as_i64().unwrap(), 1); - collection.archive().await?; - Ok(()) - } + // collection.archive().await?; + // Ok(()) + // } - #[sqlx::test] - fn can_order_documents() -> anyhow::Result<()> { - internal_init_logger(None, None).ok(); - let mut collection = Collection::new("test_r_c_cod_1", None); - collection - .upsert_documents( - vec![ - json!({ - "id": 1, - "text": "Test Document 1", - "number": 99, - "nested_number": { - "number": 3 - }, + // #[sqlx::test] + // async fn can_paginate_get_documents() -> anyhow::Result<()> { + // internal_init_logger(None, None).ok(); + // let mut collection = Collection::new("test_r_c_cpgd_2", None); + // collection + // .upsert_documents(generate_dummy_documents(10), None) + // .await?; + + // let documents = collection + // .get_documents(Some( + // serde_json::json!({ + // "limit": 5, + // "offset": 0 + // }) + // .into(), + // )) + // .await?; + // assert_eq!( + // documents + // .into_iter() + // .map(|d| d["row_id"].as_i64().unwrap()) + // .collect::>(), + // vec![1, 2, 3, 4, 5] + // ); - "tie": 2, - }) - .into(), - json!({ - "id": 2, - "text": "Test Document 1", - "number": 98, - "nested_number": { - "number": 2 - }, - "tie": 2, - }) - .into(), - json!({ - "id": 3, - "text": "Test Document 1", - "number": 97, - "nested_number": { - "number": 1 - }, - "tie": 2 - }) - .into(), - ], - None, - ) - .await?; - let documents = collection - .get_documents(Some(json!({"order_by": {"number": "asc"}}).into())) - .await?; - assert_eq!( - 
documents - .iter() - .map(|d| d["document"]["number"].as_i64().unwrap()) - .collect::>(), - vec![97, 98, 99] - ); - let documents = collection - .get_documents(Some( - json!({"order_by": {"nested_number": {"number": "asc"}}}).into(), - )) - .await?; - assert_eq!( - documents - .iter() - .map(|d| d["document"]["nested_number"]["number"].as_i64().unwrap()) - .collect::>(), - vec![1, 2, 3] - ); - let documents = collection - .get_documents(Some( - json!({"order_by": {"nested_number": {"number": "asc"}, "tie": "desc"}}).into(), - )) - .await?; - assert_eq!( - documents - .iter() - .map(|d| d["document"]["nested_number"]["number"].as_i64().unwrap()) - .collect::>(), - vec![1, 2, 3] - ); - collection.archive().await?; - Ok(()) - } + // let documents = collection + // .get_documents(Some( + // serde_json::json!({ + // "limit": 2, + // "offset": 5 + // }) + // .into(), + // )) + // .await?; + // let last_row_id = documents.last().unwrap()["row_id"].as_i64().unwrap(); + // assert_eq!( + // documents + // .into_iter() + // .map(|d| d["row_id"].as_i64().unwrap()) + // .collect::>(), + // vec![6, 7] + // ); - #[sqlx::test] - fn can_merge_metadata() -> anyhow::Result<()> { - internal_init_logger(None, None).ok(); - let mut collection = Collection::new("test_r_c_cmm_4", None); - collection - .upsert_documents( - vec![ - json!({ - "id": 1, - "text": "Test Document 1", - "number": 99, - "second_number": 10, - }) - .into(), - json!({ - "id": 2, - "text": "Test Document 1", - "number": 98, - "second_number": 11, - }) - .into(), - json!({ - "id": 3, - "text": "Test Document 1", - "number": 97, - "second_number": 12, - }) - .into(), - ], - None, - ) - .await?; - let documents = collection - .get_documents(Some(json!({"order_by": {"number": "asc"}}).into())) - .await?; - assert_eq!( - documents - .iter() - .map(|d| ( - d["document"]["number"].as_i64().unwrap(), - d["document"]["second_number"].as_i64().unwrap() - )) - .collect::>(), - vec![(97, 12), (98, 11), (99, 10)] - ); - collection - .upsert_documents( - vec![ - json!({ - "id": 1, - "number": 0, - "another_number": 1 - }) - .into(), - json!({ - "id": 2, - "number": 1, - "another_number": 2 - }) - .into(), - json!({ - "id": 3, - "number": 2, - "another_number": 3 - }) - .into(), - ], - Some( - json!({ - "metadata": { - "merge": true - } - }) - .into(), - ), - ) - .await?; - let documents = collection - .get_documents(Some( - json!({"order_by": {"number": {"number": "asc"}}}).into(), - )) - .await?; + // let documents = collection + // .get_documents(Some( + // serde_json::json!({ + // "limit": 2, + // "last_row_id": last_row_id + // }) + // .into(), + // )) + // .await?; + // let last_row_id = documents.last().unwrap()["row_id"].as_i64().unwrap(); + // assert_eq!( + // documents + // .into_iter() + // .map(|d| d["row_id"].as_i64().unwrap()) + // .collect::>(), + // vec![8, 9] + // ); - assert_eq!( - documents - .iter() - .map(|d| ( - d["document"]["number"].as_i64().unwrap(), - d["document"]["another_number"].as_i64().unwrap(), - d["document"]["second_number"].as_i64().unwrap() - )) - .collect::>(), - vec![(0, 1, 10), (1, 2, 11), (2, 3, 12)] - ); - collection.archive().await?; - Ok(()) - } + // let documents = collection + // .get_documents(Some( + // serde_json::json!({ + // "limit": 1, + // "last_row_id": last_row_id + // }) + // .into(), + // )) + // .await?; + // assert_eq!( + // documents + // .into_iter() + // .map(|d| d["row_id"].as_i64().unwrap()) + // .collect::>(), + // vec![10] + // ); + + // collection.archive().await?; + // Ok(()) + // } + + 
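// A minimal sketch of how the new multi-field API fits together, distilled from
// the tests above rather than taken from the patch itself. The collection,
// pipeline and field names are illustrative, and the final `None` argument to
// `vector_search` is assumed to be the optional result limit, mirroring the old
// signature.
//
// let mut pipeline = MultiFieldPipeline::new(
//     "example_pipeline",
//     Some(
//         serde_json::json!({
//             "title": {
//                 "embed": { "model": "intfloat/e5-small" },
//                 "full_text_search": { "configuration": "english" }
//             },
//             "body": {
//                 "embed": {
//                     "model": "intfloat/e5-small",
//                     "splitter": "recursive_character"
//                 },
//                 "full_text_search": { "configuration": "english" }
//             }
//         })
//         .into(),
//     ),
// )?;
// let mut collection = Collection::new("example_collection", None);
// collection.add_pipeline(&mut pipeline).await?;
// collection.upsert_documents(documents, None).await?; // documents: Vec<Json>
//
// // Hybrid search: per-field semantic and full text queries with boosts.
// let results = collection
//     .search(
//         serde_json::json!({
//             "query": {
//                 "semantic_search": {
//                     "title": { "query": "some query", "boost": 2.0 },
//                     "body": { "query": "some query" }
//                 },
//                 "full_text_search": {
//                     "body": { "query": "some query", "boost": 1.2 }
//                 }
//             },
//             "limit": 5
//         })
//         .into(),
//         &pipeline,
//     )
//     .await?;
// // Each result carries the original document, e.g. results[0]["document"]["id"].
//
// // Plain vector search restricted to specific fields.
// let results = collection
//     .vector_search(
//         "some query",
//         &mut pipeline,
//         Some(serde_json::json!({ "fields": ["title", "body"] }).into()),
//         None,
//     )
//     .await?;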
// #[sqlx::test] + // async fn can_filter_and_paginate_get_documents() -> anyhow::Result<()> { + // internal_init_logger(None, None).ok(); + // let model = Model::default(); + // let splitter = Splitter::default(); + // let mut pipeline = Pipeline::new( + // "test_r_p_cfapgd_1", + // Some(model), + // Some(splitter), + // Some( + // serde_json::json!({ + // "full_text_search": { + // "active": true, + // "configuration": "english" + // } + // }) + // .into(), + // ), + // ); + + // let mut collection = Collection::new("test_r_c_cfapgd_1", None); + // collection.add_pipeline(&mut pipeline).await?; + + // collection + // .upsert_documents(generate_dummy_documents(10), None) + // .await?; + + // let documents = collection + // .get_documents(Some( + // serde_json::json!({ + // "filter": { + // "metadata": { + // "id": { + // "$gte": 2 + // } + // } + // }, + // "limit": 2, + // "offset": 0 + // }) + // .into(), + // )) + // .await?; + // assert_eq!( + // documents + // .into_iter() + // .map(|d| d["document"]["id"].as_i64().unwrap()) + // .collect::>(), + // vec![2, 3] + // ); + + // let documents = collection + // .get_documents(Some( + // serde_json::json!({ + // "filter": { + // "metadata": { + // "id": { + // "$lte": 5 + // } + // } + // }, + // "limit": 100, + // "offset": 4 + // }) + // .into(), + // )) + // .await?; + // let last_row_id = documents.last().unwrap()["row_id"].as_i64().unwrap(); + // assert_eq!( + // documents + // .into_iter() + // .map(|d| d["document"]["id"].as_i64().unwrap()) + // .collect::>(), + // vec![4, 5] + // ); + + // let documents = collection + // .get_documents(Some( + // serde_json::json!({ + // "filter": { + // "full_text_search": { + // "configuration": "english", + // "text": "document" + // } + // }, + // "limit": 100, + // "last_row_id": last_row_id + // }) + // .into(), + // )) + // .await?; + // assert_eq!( + // documents + // .into_iter() + // .map(|d| d["document"]["id"].as_i64().unwrap()) + // .collect::>(), + // vec![6, 7, 8, 9] + // ); + + // collection.archive().await?; + // Ok(()) + // } + + // #[sqlx::test] + // async fn can_filter_and_delete_documents() -> anyhow::Result<()> { + // internal_init_logger(None, None).ok(); + // let model = Model::default(); + // let splitter = Splitter::default(); + // let mut pipeline = Pipeline::new( + // "test_r_p_cfadd_1", + // Some(model), + // Some(splitter), + // Some( + // serde_json::json!({ + // "full_text_search": { + // "active": true, + // "configuration": "english" + // } + // }) + // .into(), + // ), + // ); + + // let mut collection = Collection::new("test_r_c_cfadd_1", None); + // collection.add_pipeline(&mut pipeline).await?; + // collection + // .upsert_documents(generate_dummy_documents(10), None) + // .await?; + + // collection + // .delete_documents( + // serde_json::json!({ + // "metadata": { + // "id": { + // "$lt": 2 + // } + // } + // }) + // .into(), + // ) + // .await?; + // let documents = collection.get_documents(None).await?; + // assert_eq!(documents.len(), 8); + // assert!(documents + // .iter() + // .all(|d| d["document"]["id"].as_i64().unwrap() >= 2)); + + // collection + // .delete_documents( + // serde_json::json!({ + // "full_text_search": { + // "configuration": "english", + // "text": "2" + // } + // }) + // .into(), + // ) + // .await?; + // let documents = collection.get_documents(None).await?; + // assert_eq!(documents.len(), 7); + // assert!(documents + // .iter() + // .all(|d| d["document"]["id"].as_i64().unwrap() > 2)); + + // collection + // .delete_documents( + 
// serde_json::json!({ + // "metadata": { + // "id": { + // "$gte": 6 + // } + // }, + // "full_text_search": { + // "configuration": "english", + // "text": "6" + // } + // }) + // .into(), + // ) + // .await?; + // let documents = collection.get_documents(None).await?; + // assert_eq!(documents.len(), 6); + // assert!(documents + // .iter() + // .all(|d| d["document"]["id"].as_i64().unwrap() != 6)); + + // collection.archive().await?; + // Ok(()) + // } + + // #[sqlx::test] + // fn can_order_documents() -> anyhow::Result<()> { + // internal_init_logger(None, None).ok(); + // let mut collection = Collection::new("test_r_c_cod_1", None); + // collection + // .upsert_documents( + // vec![ + // json!({ + // "id": 1, + // "text": "Test Document 1", + // "number": 99, + // "nested_number": { + // "number": 3 + // }, + + // "tie": 2, + // }) + // .into(), + // json!({ + // "id": 2, + // "text": "Test Document 1", + // "number": 98, + // "nested_number": { + // "number": 2 + // }, + // "tie": 2, + // }) + // .into(), + // json!({ + // "id": 3, + // "text": "Test Document 1", + // "number": 97, + // "nested_number": { + // "number": 1 + // }, + // "tie": 2 + // }) + // .into(), + // ], + // None, + // ) + // .await?; + // let documents = collection + // .get_documents(Some(json!({"order_by": {"number": "asc"}}).into())) + // .await?; + // assert_eq!( + // documents + // .iter() + // .map(|d| d["document"]["number"].as_i64().unwrap()) + // .collect::>(), + // vec![97, 98, 99] + // ); + // let documents = collection + // .get_documents(Some( + // json!({"order_by": {"nested_number": {"number": "asc"}}}).into(), + // )) + // .await?; + // assert_eq!( + // documents + // .iter() + // .map(|d| d["document"]["nested_number"]["number"].as_i64().unwrap()) + // .collect::>(), + // vec![1, 2, 3] + // ); + // let documents = collection + // .get_documents(Some( + // json!({"order_by": {"nested_number": {"number": "asc"}, "tie": "desc"}}).into(), + // )) + // .await?; + // assert_eq!( + // documents + // .iter() + // .map(|d| d["document"]["nested_number"]["number"].as_i64().unwrap()) + // .collect::>(), + // vec![1, 2, 3] + // ); + // collection.archive().await?; + // Ok(()) + // } + + // #[sqlx::test] + // fn can_merge_metadata() -> anyhow::Result<()> { + // internal_init_logger(None, None).ok(); + // let mut collection = Collection::new("test_r_c_cmm_4", None); + // collection + // .upsert_documents( + // vec![ + // json!({ + // "id": 1, + // "text": "Test Document 1", + // "number": 99, + // "second_number": 10, + // }) + // .into(), + // json!({ + // "id": 2, + // "text": "Test Document 1", + // "number": 98, + // "second_number": 11, + // }) + // .into(), + // json!({ + // "id": 3, + // "text": "Test Document 1", + // "number": 97, + // "second_number": 12, + // }) + // .into(), + // ], + // None, + // ) + // .await?; + // let documents = collection + // .get_documents(Some(json!({"order_by": {"number": "asc"}}).into())) + // .await?; + // assert_eq!( + // documents + // .iter() + // .map(|d| ( + // d["document"]["number"].as_i64().unwrap(), + // d["document"]["second_number"].as_i64().unwrap() + // )) + // .collect::>(), + // vec![(97, 12), (98, 11), (99, 10)] + // ); + // collection + // .upsert_documents( + // vec![ + // json!({ + // "id": 1, + // "number": 0, + // "another_number": 1 + // }) + // .into(), + // json!({ + // "id": 2, + // "number": 1, + // "another_number": 2 + // }) + // .into(), + // json!({ + // "id": 3, + // "number": 2, + // "another_number": 3 + // }) + // .into(), + // ], + // 
Some( + // json!({ + // "metadata": { + // "merge": true + // } + // }) + // .into(), + // ), + // ) + // .await?; + // let documents = collection + // .get_documents(Some( + // json!({"order_by": {"number": {"number": "asc"}}}).into(), + // )) + // .await?; + + // assert_eq!( + // documents + // .iter() + // .map(|d| ( + // d["document"]["number"].as_i64().unwrap(), + // d["document"]["another_number"].as_i64().unwrap(), + // d["document"]["second_number"].as_i64().unwrap() + // )) + // .collect::>(), + // vec![(0, 1, 10), (1, 2, 11), (2, 3, 12)] + // ); + // collection.archive().await?; + // Ok(()) + // } } diff --git a/pgml-sdks/pgml/src/models.rs b/pgml-sdks/pgml/src/models.rs index 07440d4e3..634fff369 100644 --- a/pgml-sdks/pgml/src/models.rs +++ b/pgml-sdks/pgml/src/models.rs @@ -12,10 +12,19 @@ pub struct Pipeline { pub id: i64, pub name: String, pub created_at: DateTime, - pub model_id: i64, - pub splitter_id: i64, + pub schema: Json, pub active: bool, - pub parameters: Json, +} + +// A multi field pipeline +#[enum_def] +#[derive(FromRow)] +pub struct MultiFieldPipeline { + pub id: i64, + pub name: String, + pub created_at: DateTime, + pub active: bool, + pub schema: Json, } // A model used to perform some task @@ -65,18 +74,16 @@ pub struct Document { #[serde(with = "uuid::serde::compact")] // See: https://docs.rs/uuid/latest/uuid/serde/index.html pub source_uuid: Uuid, - pub metadata: Json, - pub text: String, + pub document: Json, } impl Document { pub fn into_user_friendly_json(mut self) -> Json { - self.metadata["text"] = self.text.into(); serde_json::json!({ "row_id": self.id, "created_at": self.created_at, "source_uuid": self.source_uuid, - "document": self.metadata, + "document": self.document, }) .into() } @@ -109,7 +116,14 @@ pub struct Chunk { pub id: i64, pub created_at: DateTime, pub document_id: i64, - pub splitter_id: i64, pub chunk_index: i64, pub chunk: String, } + +// A tsvector of a document +#[derive(FromRow)] +pub struct TSVector { + pub id: i64, + pub created_at: DateTime, + pub document_id: i64, +} diff --git a/pgml-sdks/pgml/src/multi_field_pipeline.rs b/pgml-sdks/pgml/src/multi_field_pipeline.rs new file mode 100644 index 000000000..8b32f4acb --- /dev/null +++ b/pgml-sdks/pgml/src/multi_field_pipeline.rs @@ -0,0 +1,755 @@ +use anyhow::Context; +use indicatif::MultiProgress; +use rust_bridge::{alias, alias_manual, alias_methods}; +use serde::Deserialize; +use sqlx::{Executor, PgConnection, PgPool}; +use std::sync::atomic::Ordering::Relaxed; +use std::{collections::HashMap, sync::atomic::AtomicBool}; +use tokio::join; +use tracing::instrument; + +use crate::{ + collection::ProjectInfo, + get_or_initialize_pool, + model::{Model, ModelRuntime}, + models, queries, query_builder, + remote_embeddings::build_remote_embeddings, + splitter::Splitter, + types::{DateTime, Json, TryToNumeric}, + utils, +}; + +#[cfg(feature = "python")] +use crate::{model::ModelPython, splitter::SplitterPython, types::JsonPython}; + +type ParsedSchema = HashMap; + +#[derive(Deserialize)] +struct ValidEmbedAction { + model: String, + source: Option, + model_parameters: Option, + splitter: Option, + splitter_parameters: Option, + hnsw: Option, +} + +#[derive(Deserialize, Debug, Clone)] +pub struct FullTextSearchAction { + configuration: String, +} + +#[derive(Deserialize)] +struct ValidFieldAction { + embed: Option, + full_text_search: Option, +} + +#[derive(Debug, Clone)] +pub struct HNSW { + m: u64, + ef_construction: u64, +} + +impl Default for HNSW { + fn default() -> Self { + Self { 
+ m: 16, + ef_construction: 64, + } + } +} + +impl TryFrom for HNSW { + type Error = anyhow::Error; + fn try_from(value: Json) -> anyhow::Result { + let m = if !value["hnsw"]["m"].is_null() { + value["hnsw"]["m"] + .try_to_u64() + .context("hnsw.m must be an integer")? + } else { + 16 + }; + let ef_construction = if !value["hnsw"]["ef_construction"].is_null() { + value["hnsw"]["ef_construction"] + .try_to_u64() + .context("hnsw.ef_construction must be an integer")? + } else { + 64 + }; + Ok(Self { m, ef_construction }) + } +} + +#[derive(Debug, Clone)] +pub struct EmbedAction { + pub splitter: Option, + pub model: Model, + pub hnsw: HNSW, +} + +#[derive(Debug, Clone)] +pub struct FieldAction { + pub embed: Option, + pub full_text_search: Option, +} + +impl TryFrom for FieldAction { + type Error = anyhow::Error; + fn try_from(value: ValidFieldAction) -> Result { + let embed = value + .embed + .map(|v| { + let model = Model::new(Some(v.model), v.source, v.model_parameters); + let splitter = v + .splitter + .map(|v2| Splitter::new(Some(v2), v.splitter_parameters)); + let hnsw = v + .hnsw + .map(|v2| HNSW::try_from(v2)) + .unwrap_or_else(|| Ok(HNSW::default()))?; + anyhow::Ok(EmbedAction { + model, + splitter, + hnsw, + }) + }) + .transpose()?; + Ok(Self { + embed, + full_text_search: value.full_text_search, + }) + } +} + +#[derive(Debug, Clone)] +pub struct MultiFieldPipelineDatabaseData { + pub id: i64, + pub created_at: DateTime, +} + +#[derive(Debug)] +pub struct MultiFieldPipeline { + // TODO: Make the schema and parsed_schema optional fields only required if they try to save a new pipeline that does not exist + pub name: String, + pub schema: Option, + pub parsed_schema: Option, + project_info: Option, + database_data: Option, +} + +pub enum PipelineTableTypes { + Embedding, + TSVector, +} + +fn validate_schema(schema: &Json) -> anyhow::Result<()> { + Ok(()) +} + +fn json_to_schema(schema: &Json) -> anyhow::Result { + schema + .as_object() + .context("Schema object must be a JSON object")? 
+ .iter() + .try_fold(ParsedSchema::new(), |mut acc, (key, value)| { + if acc.contains_key(key) { + Err(anyhow::anyhow!("Schema contains duplicate keys")) + } else { + // First lets deserialize it normally + let action: ValidFieldAction = serde_json::from_value(value.to_owned())?; + // Now lets actually build the models and splitters + acc.insert(key.to_owned(), action.try_into()?); + Ok(acc) + } + }) +} + +impl MultiFieldPipeline { + pub fn new(name: &str, schema: Option) -> anyhow::Result { + let parsed_schema = schema.as_ref().map(|s| json_to_schema(&s)).transpose()?; + Ok(Self { + name: name.to_string(), + schema, + parsed_schema, + project_info: None, + database_data: None, + }) + } + + #[instrument(skip(self))] + pub(crate) async fn verify_in_database(&mut self, throw_if_exists: bool) -> anyhow::Result<()> { + if self.database_data.is_none() { + let pool = self.get_pool().await?; + + let project_info = self + .project_info + .as_ref() + .context("Cannot verify pipeline wihtout project info")?; + + let pipeline: Option = sqlx::query_as(&query_builder!( + "SELECT * FROM %s WHERE name = $1", + format!("{}.pipelines", project_info.name) + )) + .bind(&self.name) + .fetch_optional(&pool) + .await?; + + let pipeline = if let Some(pipeline) = pipeline { + if throw_if_exists { + anyhow::bail!("Pipeline {} already exists", pipeline.name); + } + + let mut parsed_schema = json_to_schema(&pipeline.schema)?; + + for (_key, value) in parsed_schema.iter_mut() { + if let Some(embed) = &mut value.embed { + embed.model.set_project_info(project_info.clone()); + embed.model.verify_in_database(false).await?; + if let Some(splitter) = &mut embed.splitter { + splitter.set_project_info(project_info.clone()); + splitter.verify_in_database(false).await?; + } + } + } + self.schema = Some(pipeline.schema.clone()); + self.parsed_schema = Some(parsed_schema.clone()); + + pipeline + } else { + let schema = self + .schema + .as_ref() + .context("Pipeline must have schema to store in database")?; + let mut parsed_schema = json_to_schema(schema)?; + + for (_key, value) in parsed_schema.iter_mut() { + if let Some(embed) = &mut value.embed { + embed.model.set_project_info(project_info.clone()); + embed.model.verify_in_database(false).await?; + if let Some(splitter) = &mut embed.splitter { + splitter.set_project_info(project_info.clone()); + splitter.verify_in_database(false).await?; + } + } + } + self.parsed_schema = Some(parsed_schema); + + sqlx::query_as(&query_builder!( + "INSERT INTO %s (name, schema) VALUES ($1, $2) RETURNING *", + format!("{}.pipelines", project_info.name) + )) + .bind(&self.name) + .bind(&self.schema) + .fetch_one(&pool) + .await? 
+ }; + self.database_data = Some(MultiFieldPipelineDatabaseData { + id: pipeline.id, + created_at: pipeline.created_at, + }) + } + Ok(()) + } + + #[instrument(skip(self))] + pub(crate) async fn create_tables(&mut self) -> anyhow::Result<()> { + self.verify_in_database(false).await?; + let pool = self.get_pool().await?; + + let project_info = self + .project_info + .as_ref() + .context("Pipeline must have project info to create_or_get_tables")?; + let collection_name = &project_info.name; + let documents_table_name = format!("{}.documents", collection_name); + + let schema = format!("{}_{}", collection_name, self.name); + + let mut transaction = pool.begin().await?; + transaction + .execute(query_builder!("CREATE SCHEMA IF NOT EXISTS %s", schema).as_str()) + .await?; + + let parsed_schema = self + .parsed_schema + .as_ref() + .context("Pipeline must have schema to create_tables")?; + + for (key, value) in parsed_schema.iter() { + if let Some(embed) = &value.embed { + let embeddings_table_name = format!("{}.{}_embeddings", schema, key); + let exists: bool = sqlx::query_scalar( + "SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_schema = $1 AND table_name = $2)" + ) + .bind(&schema) + .bind(&embeddings_table_name).fetch_one(&pool).await?; + + if !exists { + let embedding_length = match &embed.model.runtime { + ModelRuntime::Python => { + let embedding: (Vec,) = sqlx::query_as( + "SELECT embedding from pgml.embed(transformer => $1, text => 'Hello, World!', kwargs => $2) as embedding") + .bind(&embed.model.name) + .bind(&embed.model.parameters) + .fetch_one(&pool).await?; + embedding.0.len() as i64 + } + t => { + let remote_embeddings = build_remote_embeddings( + t.to_owned(), + &embed.model.name, + Some(&embed.model.parameters), + )?; + remote_embeddings.get_embedding_size().await? 
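Isolating the dimension probe above as a sketch: for a Python-runtime model, the length of the vector returned by pgml.embed sizes the vector column of the new embeddings table. The transformer name and empty kwargs below are placeholders.

let embedding: (Vec<f32>,) = sqlx::query_as(
    "SELECT embedding from pgml.embed(transformer => $1, text => 'Hello, World!', kwargs => $2) as embedding",
)
.bind("intfloat/e5-small")      // placeholder transformer name
.bind(serde_json::json!({}))    // no extra kwargs for this probe
.fetch_one(&pool)
.await?;
let embedding_length = embedding.0.len() as i64;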
+ } + }; + + let chunks_table_name = format!("{}.{}_chunks", schema, key); + + // Create the chunks table + transaction + .execute( + query_builder!( + queries::CREATE_CHUNKS_TABLE, + chunks_table_name, + documents_table_name + ) + .as_str(), + ) + .await?; + let index_name = format!("{}_pipeline_chunk_document_id_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX, + "", + index_name, + chunks_table_name, + "document_id" + ) + .as_str(), + ) + .await?; + + // Create the embeddings table + sqlx::query(&query_builder!( + queries::CREATE_EMBEDDINGS_TABLE, + &embeddings_table_name, + chunks_table_name, + embedding_length + )) + .execute(&mut *transaction) + .await?; + let index_name = format!("{}_pipeline_chunk_id_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX, + "", + index_name, + &embeddings_table_name, + "chunk_id" + ) + .as_str(), + ) + .await?; + let index_with_parameters = format!( + "WITH (m = {}, ef_construction = {})", + embed.hnsw.m, embed.hnsw.ef_construction + ); + let index_name = format!("{}_pipeline_hnsw_vector_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX_USING_HNSW, + "", + index_name, + &embeddings_table_name, + "embedding vector_cosine_ops", + index_with_parameters + ) + .as_str(), + ) + .await?; + } + } + + // Create the tsvectors table + if value.full_text_search.is_some() { + let tsvectors_table_name = format!("{}.{}_tsvectors", schema, key); + transaction + .execute( + query_builder!( + queries::CREATE_DOCUMENTS_TSVECTORS_TABLE, + tsvectors_table_name, + documents_table_name + ) + .as_str(), + ) + .await?; + let index_name = format!("{}_tsvector_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX_USING_GIN, + "", + index_name, + tsvectors_table_name, + "ts" + ) + .as_str(), + ) + .await?; + } + } + transaction.commit().await?; + + Ok(()) + } + + #[instrument(skip(self))] + pub(crate) async fn execute( + &mut self, + document_ids: &Option>, + mp: MultiProgress, + ) -> anyhow::Result<()> { + self.verify_in_database(false).await?; + self.create_tables().await?; + + let parsed_schema = self + .parsed_schema + .as_ref() + .context("Pipeline must have schema to execute")?; + + for (key, value) in parsed_schema.iter() { + if let Some(embed) = &value.embed { + let chunk_ids = self + .sync_chunks(key, &embed.splitter, document_ids, &mp) + .await?; + self.sync_embeddings(key, &embed.model, &chunk_ids, &mp) + .await?; + } + if let Some(full_text_search) = &value.full_text_search { + self.sync_tsvectors(key, &full_text_search.configuration, document_ids, &mp) + .await?; + } + } + Ok(()) + } + + #[instrument(skip(self))] + async fn sync_chunks( + &self, + key: &str, + splitter: &Option, + document_ids: &Option>, + mp: &MultiProgress, + ) -> anyhow::Result> { + let pool = self.get_pool().await?; + + let project_info = self + .project_info + .as_ref() + .context("Pipeline must have project info to sync chunks")?; + + let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); + let documents_table_name = format!("{}.documents", project_info.name); + let json_key_query = format!("document->>'{}'", key); + + if let Some(splitter) = splitter { + let splitter_database_data = splitter + .database_data + .as_ref() + .context("Splitter must be verified to sync chunks")?; + + let progress_bar = mp + .add(utils::default_progress_spinner(1)) + .with_prefix(format!("{} - {}", self.name.clone(), key)) + .with_message("Generating chunks"); + 
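Spelled out as a sketch, the per-field naming scheme used throughout create_tables and the sync_* methods above (collection, pipeline, and field names are placeholders):

let (collection_name, pipeline_name, key) = ("docs", "v1", "body");
let schema = format!("{}_{}", collection_name, pipeline_name);      // docs_v1
let chunks_table = format!("{}.{}_chunks", schema, key);            // docs_v1.body_chunks
let embeddings_table = format!("{}.{}_embeddings", schema, key);    // docs_v1.body_embeddings
let tsvectors_table = format!("{}.{}_tsvectors", schema, key);      // docs_v1.body_tsvectors
let json_key_query = format!("document->>'{}'", key);               // pulls the field out of the document jsonb column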
+ let is_done = AtomicBool::new(false); + let work = async { + let chunk_ids: Result, _> = if document_ids.is_some() { + sqlx::query(&query_builder!( + queries::GENERATE_CHUNKS_FOR_DOCUMENT_IDS, + &chunks_table_name, + &json_key_query, + documents_table_name, + &chunks_table_name + )) + .bind(splitter_database_data.id) + .bind(document_ids) + .execute(&pool) + .await + .map_err(|e| { + is_done.store(true, Relaxed); + e + })?; + sqlx::query_scalar(&query_builder!( + "SELECT id FROM %s WHERE document_id = ANY($1)", + &chunks_table_name + )) + .bind(document_ids) + .fetch_all(&pool) + .await + } else { + sqlx::query_scalar(&query_builder!( + queries::GENERATE_CHUNKS, + &chunks_table_name, + &json_key_query, + documents_table_name, + &chunks_table_name + )) + .bind(splitter_database_data.id) + .fetch_all(&pool) + .await + }; + is_done.store(true, Relaxed); + chunk_ids + }; + let progress_work = async { + while !is_done.load(Relaxed) { + progress_bar.inc(1); + tokio::time::sleep(std::time::Duration::from_millis(100)).await; + } + }; + let (chunk_ids, _) = join!(work, progress_work); + progress_bar.set_message("Done generating chunks"); + progress_bar.finish(); + chunk_ids.map_err(anyhow::Error::msg) + } else { + sqlx::query_scalar(&query_builder!( + r#" + INSERT INTO %s( + document_id, chunk_index, chunk + ) + SELECT + id, + 1, + %d + FROM %s + ON CONFLICT (document_id, chunk_index) DO NOTHING + RETURNING id + "#, + &chunks_table_name, + &json_key_query, + &documents_table_name + )) + .fetch_all(&pool) + .await + .map_err(anyhow::Error::msg) + } + } + + #[instrument(skip(self))] + async fn sync_embeddings( + &self, + key: &str, + model: &Model, + chunk_ids: &Vec, + mp: &MultiProgress, + ) -> anyhow::Result<()> { + let pool = self.get_pool().await?; + + // Remove the stored name from the parameters + let mut parameters = model.parameters.clone(); + parameters + .as_object_mut() + .context("Model parameters must be an object")? + .remove("name"); + + let project_info = self + .project_info + .as_ref() + .context("Pipeline must have project info to sync chunks")?; + + let progress_bar = mp + .add(utils::default_progress_spinner(1)) + .with_prefix(self.name.clone()) + .with_message("Generating emmbeddings"); + + let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); + let embeddings_table_name = + format!("{}_{}.{}_embeddings", project_info.name, self.name, key); + + let is_done = AtomicBool::new(false); + // We need to be careful about how we handle errors here. We do not want to return an error + // from the async block before setting is_done to true. If we do, the progress bar will + // will load forever. 
We also want to make sure to propogate any errors we have + let work = async { + let res = match model.runtime { + ModelRuntime::Python => sqlx::query(&query_builder!( + queries::GENERATE_EMBEDDINGS_FOR_CHUNK_IDS, + embeddings_table_name, + chunks_table_name, + embeddings_table_name + )) + .bind(&model.name) + .bind(¶meters) + .bind(chunk_ids) + .execute(&pool) + .await + .map_err(|e| anyhow::anyhow!(e)) + .map(|_t| ()), + r => { + let remote_embeddings = + build_remote_embeddings(r, &model.name, Some(¶meters))?; + remote_embeddings + .generate_embeddings( + &embeddings_table_name, + &chunks_table_name, + chunk_ids, + &pool, + ) + .await + .map(|_t| ()) + } + }; + is_done.store(true, Relaxed); + res + }; + let progress_work = async { + while !is_done.load(Relaxed) { + progress_bar.inc(1); + tokio::time::sleep(std::time::Duration::from_millis(100)).await; + } + }; + let (res, _) = join!(work, progress_work); + res?; + progress_bar.set_message("done generating embeddings"); + progress_bar.finish(); + Ok(()) + } + + #[instrument(skip(self))] + async fn sync_tsvectors( + &self, + key: &str, + configuration: &str, + document_ids: &Option>, + mp: &MultiProgress, + ) -> anyhow::Result<()> { + let pool = self.get_pool().await?; + + let project_info = self + .project_info + .as_ref() + .context("Pipeline must have project info to sync TSVectors")?; + + let progress_bar = mp + .add(utils::default_progress_spinner(1)) + .with_prefix(self.name.clone()) + .with_message("Syncing TSVectors for full text search"); + + let documents_table_name = format!("{}.documents", project_info.name); + let tsvectors_table_name = format!("{}_{}.{}_tsvectors", project_info.name, self.name, key); + let json_key_query = format!("document->>'{}'", key); + + let is_done = AtomicBool::new(false); + let work = async { + let res = if document_ids.is_some() { + sqlx::query(&query_builder!( + queries::GENERATE_TSVECTORS_FOR_DOCUMENT_IDS, + tsvectors_table_name, + configuration, + json_key_query, + documents_table_name + )) + .bind(document_ids) + .execute(&pool) + .await + } else { + sqlx::query(&query_builder!( + queries::GENERATE_TSVECTORS, + tsvectors_table_name, + configuration, + json_key_query, + documents_table_name + )) + .execute(&pool) + .await + }; + is_done.store(true, Relaxed); + res.map(|_t| ()).map_err(|e| anyhow::anyhow!(e)) + }; + let progress_work = async { + while !is_done.load(Relaxed) { + progress_bar.inc(1); + tokio::time::sleep(std::time::Duration::from_millis(100)).await; + } + }; + let (res, _) = join!(work, progress_work); + res?; + progress_bar.set_message("Done syncing TSVectors for full text search"); + progress_bar.finish(); + + Ok(()) + } + + async fn get_pool(&self) -> anyhow::Result { + let database_url = &self + .project_info + .as_ref() + .context("Project info required to call method pipeline.get_pool()")? 
+ .database_url; + get_or_initialize_pool(database_url).await + } + + #[instrument(skip(self))] + pub(crate) fn set_project_info(&mut self, project_info: ProjectInfo) { + if let Some(parsed_schema) = &mut self.parsed_schema { + for (_key, value) in parsed_schema.iter_mut() { + if let Some(embed) = &mut value.embed { + embed.model.set_project_info(project_info.clone()); + if let Some(splitter) = &mut embed.splitter { + splitter.set_project_info(project_info.clone()); + } + } + } + } + self.project_info = Some(project_info); + } + + #[instrument] + pub(crate) async fn create_multi_field_pipelines_table( + project_info: &ProjectInfo, + conn: &mut PgConnection, + ) -> anyhow::Result<()> { + let pipelines_table_name = format!("{}.pipelines", project_info.name); + sqlx::query(&query_builder!( + queries::CREATE_MULTI_FIELD_PIPELINES_TABLE, + pipelines_table_name + )) + .execute(&mut *conn) + .await?; + conn.execute( + query_builder!( + queries::CREATE_INDEX, + "", + "pipeline_name_index", + pipelines_table_name, + "name" + ) + .as_str(), + ) + .await?; + Ok(()) + } +} + +impl TryFrom for MultiFieldPipeline { + type Error = anyhow::Error; + fn try_from(value: models::Pipeline) -> anyhow::Result { + let parsed_schema = json_to_schema(&value.schema).unwrap(); + // NOTE: We do not set the database data here even though we have it + // self.verify_in_database() also verifies all models in the schema so we don't want to set it here + Ok(Self { + name: value.name, + schema: Some(value.schema), + parsed_schema: Some(parsed_schema), + project_info: None, + database_data: None, + }) + } +} diff --git a/pgml-sdks/pgml/src/pipeline.rs b/pgml-sdks/pgml/src/pipeline.rs index dceff4270..395729ac9 100644 --- a/pgml-sdks/pgml/src/pipeline.rs +++ b/pgml-sdks/pgml/src/pipeline.rs @@ -155,167 +155,169 @@ impl Pipeline { /// ``` #[instrument(skip(self))] pub async fn get_status(&mut self) -> anyhow::Result { - let pool = self.get_pool().await?; - - self.verify_in_database(false).await?; - let embeddings_table_name = self.create_or_get_embeddings_table().await?; - - let database_data = self - .database_data - .as_ref() - .context("Pipeline must be verified to get status")?; - - let parameters = self - .parameters - .as_ref() - .context("Pipeline must be verified to get status")?; - - let project_name = &self.project_info.as_ref().unwrap().name; - - // TODO: Maybe combine all of these into one query so it is faster - let chunks_status: (Option, Option) = sqlx::query_as(&query_builder!( - "SELECT (SELECT COUNT(DISTINCT document_id) FROM %s WHERE splitter_id = $1), COUNT(id) FROM %s", - format!("{}.chunks", project_name), - format!("{}.documents", project_name) - )) - .bind(database_data.splitter_id) - .fetch_one(&pool).await?; - let chunks_status = InvividualSyncStatus { - synced: chunks_status.0.unwrap_or(0), - not_synced: chunks_status.1.unwrap_or(0) - chunks_status.0.unwrap_or(0), - total: chunks_status.1.unwrap_or(0), - }; - - let embeddings_status: (Option, Option) = sqlx::query_as(&query_builder!( - "SELECT (SELECT count(*) FROM %s), (SELECT count(*) FROM %s WHERE splitter_id = $1)", - embeddings_table_name, - format!("{}.chunks", project_name) - )) - .bind(database_data.splitter_id) - .fetch_one(&pool) - .await?; - let embeddings_status = InvividualSyncStatus { - synced: embeddings_status.0.unwrap_or(0), - not_synced: embeddings_status.1.unwrap_or(0) - embeddings_status.0.unwrap_or(0), - total: embeddings_status.1.unwrap_or(0), - }; - - let tsvectors_status = if parameters["full_text_search"]["active"] - == 
serde_json::Value::Bool(true) - { - sqlx::query_as(&query_builder!( - "SELECT (SELECT COUNT(*) FROM %s WHERE configuration = $1), (SELECT COUNT(*) FROM %s)", - format!("{}.documents_tsvectors", project_name), - format!("{}.documents", project_name) - )) - .bind(parameters["full_text_search"]["configuration"].as_str()) - .fetch_one(&pool).await? - } else { - (Some(0), Some(0)) - }; - let tsvectors_status = InvividualSyncStatus { - synced: tsvectors_status.0.unwrap_or(0), - not_synced: tsvectors_status.1.unwrap_or(0) - tsvectors_status.0.unwrap_or(0), - total: tsvectors_status.1.unwrap_or(0), - }; - - Ok(PipelineSyncData { - chunks_status, - embeddings_status, - tsvectors_status, - }) + unimplemented!() + // let pool = self.get_pool().await?; + + // self.verify_in_database(false).await?; + // let embeddings_table_name = self.create_or_get_embeddings_table().await?; + + // let database_data = self + // .database_data + // .as_ref() + // .context("Pipeline must be verified to get status")?; + + // let parameters = self + // .parameters + // .as_ref() + // .context("Pipeline must be verified to get status")?; + + // let project_name = &self.project_info.as_ref().unwrap().name; + + // // TODO: Maybe combine all of these into one query so it is faster + // let chunks_status: (Option, Option) = sqlx::query_as(&query_builder!( + // "SELECT (SELECT COUNT(DISTINCT document_id) FROM %s WHERE splitter_id = $1), COUNT(id) FROM %s", + // format!("{}.chunks", project_name), + // format!("{}.documents", project_name) + // )) + // .bind(database_data.splitter_id) + // .fetch_one(&pool).await?; + // let chunks_status = InvividualSyncStatus { + // synced: chunks_status.0.unwrap_or(0), + // not_synced: chunks_status.1.unwrap_or(0) - chunks_status.0.unwrap_or(0), + // total: chunks_status.1.unwrap_or(0), + // }; + + // let embeddings_status: (Option, Option) = sqlx::query_as(&query_builder!( + // "SELECT (SELECT count(*) FROM %s), (SELECT count(*) FROM %s WHERE splitter_id = $1)", + // embeddings_table_name, + // format!("{}.chunks", project_name) + // )) + // .bind(database_data.splitter_id) + // .fetch_one(&pool) + // .await?; + // let embeddings_status = InvividualSyncStatus { + // synced: embeddings_status.0.unwrap_or(0), + // not_synced: embeddings_status.1.unwrap_or(0) - embeddings_status.0.unwrap_or(0), + // total: embeddings_status.1.unwrap_or(0), + // }; + + // let tsvectors_status = if parameters["full_text_search"]["active"] + // == serde_json::Value::Bool(true) + // { + // sqlx::query_as(&query_builder!( + // "SELECT (SELECT COUNT(*) FROM %s WHERE configuration = $1), (SELECT COUNT(*) FROM %s)", + // format!("{}.documents_tsvectors", project_name), + // format!("{}.documents", project_name) + // )) + // .bind(parameters["full_text_search"]["configuration"].as_str()) + // .fetch_one(&pool).await? 
+ // } else { + // (Some(0), Some(0)) + // }; + // let tsvectors_status = InvividualSyncStatus { + // synced: tsvectors_status.0.unwrap_or(0), + // not_synced: tsvectors_status.1.unwrap_or(0) - tsvectors_status.0.unwrap_or(0), + // total: tsvectors_status.1.unwrap_or(0), + // }; + + // Ok(PipelineSyncData { + // chunks_status, + // embeddings_status, + // tsvectors_status, + // }) } #[instrument(skip(self))] pub(crate) async fn verify_in_database(&mut self, throw_if_exists: bool) -> anyhow::Result<()> { - if self.database_data.is_none() { - let pool = self.get_pool().await?; - - let project_info = self - .project_info - .as_ref() - .expect("Cannot verify pipeline without project info"); - - let pipeline: Option = sqlx::query_as(&query_builder!( - "SELECT * FROM %s WHERE name = $1", - format!("{}.pipelines", project_info.name) - )) - .bind(&self.name) - .fetch_optional(&pool) - .await?; - - let pipeline = if let Some(p) = pipeline { - if throw_if_exists { - anyhow::bail!("Pipeline {} already exists", p.name); - } - let model: models::Model = sqlx::query_as( - "SELECT id, created_at, runtime::TEXT, hyperparams FROM pgml.models WHERE id = $1", - ) - .bind(p.model_id) - .fetch_one(&pool) - .await?; - let mut model: Model = model.into(); - model.set_project_info(project_info.clone()); - self.model = Some(model); - - let splitter: models::Splitter = - sqlx::query_as("SELECT * FROM pgml.splitters WHERE id = $1") - .bind(p.splitter_id) - .fetch_one(&pool) - .await?; - let mut splitter: Splitter = splitter.into(); - splitter.set_project_info(project_info.clone()); - self.splitter = Some(splitter); - - p - } else { - let model = self - .model - .as_mut() - .expect("Cannot save pipeline without model"); - model.set_project_info(project_info.clone()); - model.verify_in_database(false).await?; - - let splitter = self - .splitter - .as_mut() - .expect("Cannot save pipeline without splitter"); - splitter.set_project_info(project_info.clone()); - splitter.verify_in_database(false).await?; - - sqlx::query_as(&query_builder!( - "INSERT INTO %s (name, model_id, splitter_id, parameters) VALUES ($1, $2, $3, $4) RETURNING *", - format!("{}.pipelines", project_info.name) - )) - .bind(&self.name) - .bind( - model - .database_data - .as_ref() - .context("Cannot save pipeline without model")? - .id, - ) - .bind( - splitter - .database_data - .as_ref() - .context("Cannot save pipeline without splitter")? - .id, - ) - .bind(&self.parameters) - .fetch_one(&pool) - .await? 
- }; - - self.database_data = Some(PipelineDatabaseData { - id: pipeline.id, - created_at: pipeline.created_at, - model_id: pipeline.model_id, - splitter_id: pipeline.splitter_id, - }); - self.parameters = Some(pipeline.parameters); - } - Ok(()) + unimplemented!() + // if self.database_data.is_none() { + // let pool = self.get_pool().await?; + + // let project_info = self + // .project_info + // .as_ref() + // .expect("Cannot verify pipeline without project info"); + + // let pipeline: Option = sqlx::query_as(&query_builder!( + // "SELECT * FROM %s WHERE name = $1", + // format!("{}.pipelines", project_info.name) + // )) + // .bind(&self.name) + // .fetch_optional(&pool) + // .await?; + + // let pipeline = if let Some(p) = pipeline { + // if throw_if_exists { + // anyhow::bail!("Pipeline {} already exists", p.name); + // } + // let model: models::Model = sqlx::query_as( + // "SELECT id, created_at, runtime::TEXT, hyperparams FROM pgml.models WHERE id = $1", + // ) + // .bind(p.model_id) + // .fetch_one(&pool) + // .await?; + // let mut model: Model = model.into(); + // model.set_project_info(project_info.clone()); + // self.model = Some(model); + + // let splitter: models::Splitter = + // sqlx::query_as("SELECT * FROM pgml.splitters WHERE id = $1") + // .bind(p.splitter_id) + // .fetch_one(&pool) + // .await?; + // let mut splitter: Splitter = splitter.into(); + // splitter.set_project_info(project_info.clone()); + // self.splitter = Some(splitter); + + // p + // } else { + // let model = self + // .model + // .as_mut() + // .expect("Cannot save pipeline without model"); + // model.set_project_info(project_info.clone()); + // model.verify_in_database(false).await?; + + // let splitter = self + // .splitter + // .as_mut() + // .expect("Cannot save pipeline without splitter"); + // splitter.set_project_info(project_info.clone()); + // splitter.verify_in_database(false).await?; + + // sqlx::query_as(&query_builder!( + // "INSERT INTO %s (name, model_id, splitter_id, parameters) VALUES ($1, $2, $3, $4) RETURNING *", + // format!("{}.pipelines", project_info.name) + // )) + // .bind(&self.name) + // .bind( + // model + // .database_data + // .as_ref() + // .context("Cannot save pipeline without model")? + // .id, + // ) + // .bind( + // splitter + // .database_data + // .as_ref() + // .context("Cannot save pipeline without splitter")? + // .id, + // ) + // .bind(&self.parameters) + // .fetch_one(&pool) + // .await? + // }; + + // self.database_data = Some(PipelineDatabaseData { + // id: pipeline.id, + // created_at: pipeline.created_at, + // model_id: pipeline.model_id, + // splitter_id: pipeline.splitter_id, + // }); + // self.parameters = Some(pipeline.parameters); + // } + // Ok(()) } #[instrument(skip(self, mp))] @@ -324,17 +326,18 @@ impl Pipeline { document_ids: &Option>, mp: MultiProgress, ) -> anyhow::Result<()> { - // TODO: Chunk document_ids if there are too many - - // A couple notes on the following methods - // - Atomic bools are required to work nicely with pyo3 otherwise we would use cells - // - We use green threads because they are cheap, but we want to be super careful to not - // return an error before stopping the green thread. 
To meet that end, we map errors and - // return types often - let chunk_ids = self.sync_chunks(document_ids, &mp).await?; - self.sync_embeddings(chunk_ids, &mp).await?; - self.sync_tsvectors(document_ids, &mp).await?; - Ok(()) + unimplemented!() + // // TODO: Chunk document_ids if there are too many + + // // A couple notes on the following methods + // // - Atomic bools are required to work nicely with pyo3 otherwise we would use cells + // // - We use green threads because they are cheap, but we want to be super careful to not + // // return an error before stopping the green thread. To meet that end, we map errors and + // // return types often + // let chunk_ids = self.sync_chunks(document_ids, &mp).await?; + // self.sync_embeddings(chunk_ids, &mp).await?; + // self.sync_tsvectors(document_ids, &mp).await?; + // Ok(()) } #[instrument(skip(self, mp))] @@ -343,79 +346,80 @@ impl Pipeline { document_ids: &Option>, mp: &MultiProgress, ) -> anyhow::Result>> { - self.verify_in_database(false).await?; - let pool = self.get_pool().await?; - - let database_data = self - .database_data - .as_mut() - .context("Pipeline must be verified to generate chunks")?; - - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to generate chunks")?; - - let progress_bar = mp - .add(utils::default_progress_spinner(1)) - .with_prefix(self.name.clone()) - .with_message("generating chunks"); - - // This part is a bit tricky - // We want to return the ids for all chunks we inserted OR would have inserted if they didn't already exist - // The query is structured in such a way to not insert any chunks that already exist so we - // can't rely on the data returned from the inset queries, we need to query the chunks table - // It is important we return the ids for chunks we would have inserted if they didn't already exist so we are robust to random crashes - let is_done = AtomicBool::new(false); - let work = async { - let chunk_ids: Result>, _> = if document_ids.is_some() { - sqlx::query(&query_builder!( - queries::GENERATE_CHUNKS_FOR_DOCUMENT_IDS, - &format!("{}.chunks", project_info.name), - &format!("{}.documents", project_info.name), - &format!("{}.chunks", project_info.name) - )) - .bind(database_data.splitter_id) - .bind(document_ids) - .execute(&pool) - .await - .map_err(|e| { - is_done.store(true, Relaxed); - e - })?; - sqlx::query_scalar(&query_builder!( - "SELECT id FROM %s WHERE document_id = ANY($1)", - &format!("{}.chunks", project_info.name) - )) - .bind(document_ids) - .fetch_all(&pool) - .await - .map(Some) - } else { - sqlx::query(&query_builder!( - queries::GENERATE_CHUNKS, - &format!("{}.chunks", project_info.name), - &format!("{}.documents", project_info.name), - &format!("{}.chunks", project_info.name) - )) - .bind(database_data.splitter_id) - .execute(&pool) - .await - .map(|_t| None) - }; - is_done.store(true, Relaxed); - chunk_ids - }; - let progress_work = async { - while !is_done.load(Relaxed) { - progress_bar.inc(1); - tokio::time::sleep(std::time::Duration::from_millis(100)).await; - } - }; - let (chunk_ids, _) = join!(work, progress_work); - progress_bar.set_message("done generating chunks"); - progress_bar.finish(); - Ok(chunk_ids?) 
+ unimplemented!() + // self.verify_in_database(false).await?; + // let pool = self.get_pool().await?; + + // let database_data = self + // .database_data + // .as_mut() + // .context("Pipeline must be verified to generate chunks")?; + + // let project_info = self + // .project_info + // .as_ref() + // .context("Pipeline must have project info to generate chunks")?; + + // let progress_bar = mp + // .add(utils::default_progress_spinner(1)) + // .with_prefix(self.name.clone()) + // .with_message("generating chunks"); + + // // This part is a bit tricky + // // We want to return the ids for all chunks we inserted OR would have inserted if they didn't already exist + // // The query is structured in such a way to not insert any chunks that already exist so we + // // can't rely on the data returned from the inset queries, we need to query the chunks table + // // It is important we return the ids for chunks we would have inserted if they didn't already exist so we are robust to random crashes + // let is_done = AtomicBool::new(false); + // let work = async { + // let chunk_ids: Result>, _> = if document_ids.is_some() { + // sqlx::query(&query_builder!( + // queries::GENERATE_CHUNKS_FOR_DOCUMENT_IDS, + // &format!("{}.chunks", project_info.name), + // &format!("{}.documents", project_info.name), + // &format!("{}.chunks", project_info.name) + // )) + // .bind(database_data.splitter_id) + // .bind(document_ids) + // .execute(&pool) + // .await + // .map_err(|e| { + // is_done.store(true, Relaxed); + // e + // })?; + // sqlx::query_scalar(&query_builder!( + // "SELECT id FROM %s WHERE document_id = ANY($1)", + // &format!("{}.chunks", project_info.name) + // )) + // .bind(document_ids) + // .fetch_all(&pool) + // .await + // .map(Some) + // } else { + // sqlx::query(&query_builder!( + // queries::GENERATE_CHUNKS, + // &format!("{}.chunks", project_info.name), + // &format!("{}.documents", project_info.name), + // &format!("{}.chunks", project_info.name) + // )) + // .bind(database_data.splitter_id) + // .execute(&pool) + // .await + // .map(|_t| None) + // }; + // is_done.store(true, Relaxed); + // chunk_ids + // }; + // let progress_work = async { + // while !is_done.load(Relaxed) { + // progress_bar.inc(1); + // tokio::time::sleep(std::time::Duration::from_millis(100)).await; + // } + // }; + // let (chunk_ids, _) = join!(work, progress_work); + // progress_bar.set_message("done generating chunks"); + // progress_bar.finish(); + // Ok(chunk_ids?) } #[instrument(skip(self, mp))] @@ -424,99 +428,100 @@ impl Pipeline { chunk_ids: Option>, mp: &MultiProgress, ) -> anyhow::Result<()> { - self.verify_in_database(false).await?; - let pool = self.get_pool().await?; - - let embeddings_table_name = self.create_or_get_embeddings_table().await?; - - let model = self - .model - .as_ref() - .context("Pipeline must be verified to generate embeddings")?; - - let database_data = self - .database_data - .as_mut() - .context("Pipeline must be verified to generate embeddings")?; - - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to generate embeddings")?; - - // Remove the stored name from the parameters - let mut parameters = model.parameters.clone(); - parameters - .as_object_mut() - .context("Model parameters must be an object")? 
- .remove("name"); - - let progress_bar = mp - .add(utils::default_progress_spinner(1)) - .with_prefix(self.name.clone()) - .with_message("generating emmbeddings"); - - let is_done = AtomicBool::new(false); - // We need to be careful about how we handle errors here. We do not want to return an error - // from the async block before setting is_done to true. If we do, the progress bar will - // will load forever. We also want to make sure to propogate any errors we have - let work = async { - let res = match model.runtime { - ModelRuntime::Python => if chunk_ids.is_some() { - sqlx::query(&query_builder!( - queries::GENERATE_EMBEDDINGS_FOR_CHUNK_IDS, - embeddings_table_name, - &format!("{}.chunks", project_info.name), - embeddings_table_name - )) - .bind(&model.name) - .bind(¶meters) - .bind(database_data.splitter_id) - .bind(chunk_ids) - .execute(&pool) - .await - } else { - sqlx::query(&query_builder!( - queries::GENERATE_EMBEDDINGS, - embeddings_table_name, - &format!("{}.chunks", project_info.name), - embeddings_table_name - )) - .bind(&model.name) - .bind(¶meters) - .bind(database_data.splitter_id) - .execute(&pool) - .await - } - .map_err(|e| anyhow::anyhow!(e)) - .map(|_t| ()), - r => { - let remote_embeddings = build_remote_embeddings(r, &model.name, ¶meters)?; - remote_embeddings - .generate_embeddings( - &embeddings_table_name, - &format!("{}.chunks", project_info.name), - database_data.splitter_id, - chunk_ids, - &pool, - ) - .await - .map(|_t| ()) - } - }; - is_done.store(true, Relaxed); - res - }; - let progress_work = async { - while !is_done.load(Relaxed) { - progress_bar.inc(1); - tokio::time::sleep(std::time::Duration::from_millis(100)).await; - } - }; - let (res, _) = join!(work, progress_work); - progress_bar.set_message("done generating embeddings"); - progress_bar.finish(); - res + unimplemented!() + // self.verify_in_database(false).await?; + // let pool = self.get_pool().await?; + + // let embeddings_table_name = self.create_or_get_embeddings_table().await?; + + // let model = self + // .model + // .as_ref() + // .context("Pipeline must be verified to generate embeddings")?; + + // let database_data = self + // .database_data + // .as_mut() + // .context("Pipeline must be verified to generate embeddings")?; + + // let project_info = self + // .project_info + // .as_ref() + // .context("Pipeline must have project info to generate embeddings")?; + + // // Remove the stored name from the parameters + // let mut parameters = model.parameters.clone(); + // parameters + // .as_object_mut() + // .context("Model parameters must be an object")? + // .remove("name"); + + // let progress_bar = mp + // .add(utils::default_progress_spinner(1)) + // .with_prefix(self.name.clone()) + // .with_message("generating emmbeddings"); + + // let is_done = AtomicBool::new(false); + // // We need to be careful about how we handle errors here. We do not want to return an error + // // from the async block before setting is_done to true. If we do, the progress bar will + // // will load forever. 
We also want to make sure to propogate any errors we have + // let work = async { + // let res = match model.runtime { + // ModelRuntime::Python => if chunk_ids.is_some() { + // sqlx::query(&query_builder!( + // queries::GENERATE_EMBEDDINGS_FOR_CHUNK_IDS, + // embeddings_table_name, + // &format!("{}.chunks", project_info.name), + // embeddings_table_name + // )) + // .bind(&model.name) + // .bind(¶meters) + // .bind(database_data.splitter_id) + // .bind(chunk_ids) + // .execute(&pool) + // .await + // } else { + // sqlx::query(&query_builder!( + // queries::GENERATE_EMBEDDINGS, + // embeddings_table_name, + // &format!("{}.chunks", project_info.name), + // embeddings_table_name + // )) + // .bind(&model.name) + // .bind(¶meters) + // .bind(database_data.splitter_id) + // .execute(&pool) + // .await + // } + // .map_err(|e| anyhow::anyhow!(e)) + // .map(|_t| ()), + // r => { + // let remote_embeddings = build_remote_embeddings(r, &model.name, ¶meters)?; + // remote_embeddings + // .generate_embeddings( + // &embeddings_table_name, + // &format!("{}.chunks", project_info.name), + // database_data.splitter_id, + // chunk_ids, + // &pool, + // ) + // .await + // .map(|_t| ()) + // } + // }; + // is_done.store(true, Relaxed); + // res + // }; + // let progress_work = async { + // while !is_done.load(Relaxed) { + // progress_bar.inc(1); + // tokio::time::sleep(std::time::Duration::from_millis(100)).await; + // } + // }; + // let (res, _) = join!(work, progress_work); + // progress_bar.set_message("done generating embeddings"); + // progress_bar.finish(); + // res } #[instrument(skip(self))] @@ -525,223 +530,226 @@ impl Pipeline { document_ids: &Option>, mp: &MultiProgress, ) -> anyhow::Result<()> { - self.verify_in_database(false).await?; - let pool = self.get_pool().await?; - - let parameters = self - .parameters - .as_ref() - .context("Pipeline must be verified to generate tsvectors")?; - - if parameters["full_text_search"]["active"] != serde_json::Value::Bool(true) { - return Ok(()); - } - - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to generate tsvectors")?; - - let progress_bar = mp - .add(utils::default_progress_spinner(1)) - .with_prefix(self.name.clone()) - .with_message("generating tsvectors for full text search"); - - let configuration = parameters["full_text_search"]["configuration"] - .as_str() - .context("Full text search configuration must be a string")?; - - let is_done = AtomicBool::new(false); - let work = async { - let res = if document_ids.is_some() { - sqlx::query(&query_builder!( - queries::GENERATE_TSVECTORS_FOR_DOCUMENT_IDS, - format!("{}.documents_tsvectors", project_info.name), - configuration, - configuration, - format!("{}.documents", project_info.name) - )) - .bind(document_ids) - .execute(&pool) - .await - } else { - sqlx::query(&query_builder!( - queries::GENERATE_TSVECTORS, - format!("{}.documents_tsvectors", project_info.name), - configuration, - configuration, - format!("{}.documents", project_info.name) - )) - .execute(&pool) - .await - }; - is_done.store(true, Relaxed); - res.map(|_t| ()).map_err(|e| anyhow::anyhow!(e)) - }; - let progress_work = async { - while !is_done.load(Relaxed) { - progress_bar.inc(1); - tokio::time::sleep(std::time::Duration::from_millis(100)).await; - } - }; - let (res, _) = join!(work, progress_work); - progress_bar.set_message("done generating tsvectors for full text search"); - progress_bar.finish(); - res + unimplemented!() + // self.verify_in_database(false).await?; + // let 
pool = self.get_pool().await?; + + // let parameters = self + // .parameters + // .as_ref() + // .context("Pipeline must be verified to generate tsvectors")?; + + // if parameters["full_text_search"]["active"] != serde_json::Value::Bool(true) { + // return Ok(()); + // } + + // let project_info = self + // .project_info + // .as_ref() + // .context("Pipeline must have project info to generate tsvectors")?; + + // let progress_bar = mp + // .add(utils::default_progress_spinner(1)) + // .with_prefix(self.name.clone()) + // .with_message("generating tsvectors for full text search"); + + // let configuration = parameters["full_text_search"]["configuration"] + // .as_str() + // .context("Full text search configuration must be a string")?; + + // let is_done = AtomicBool::new(false); + // let work = async { + // let res = if document_ids.is_some() { + // sqlx::query(&query_builder!( + // queries::GENERATE_TSVECTORS_FOR_DOCUMENT_IDS, + // format!("{}.documents_tsvectors", project_info.name), + // configuration, + // configuration, + // format!("{}.documents", project_info.name) + // )) + // .bind(document_ids) + // .execute(&pool) + // .await + // } else { + // sqlx::query(&query_builder!( + // queries::GENERATE_TSVECTORS, + // format!("{}.documents_tsvectors", project_info.name), + // configuration, + // configuration, + // format!("{}.documents", project_info.name) + // )) + // .execute(&pool) + // .await + // }; + // is_done.store(true, Relaxed); + // res.map(|_t| ()).map_err(|e| anyhow::anyhow!(e)) + // }; + // let progress_work = async { + // while !is_done.load(Relaxed) { + // progress_bar.inc(1); + // tokio::time::sleep(std::time::Duration::from_millis(100)).await; + // } + // }; + // let (res, _) = join!(work, progress_work); + // progress_bar.set_message("done generating tsvectors for full text search"); + // progress_bar.finish(); + // res } #[instrument(skip(self))] pub(crate) async fn create_or_get_embeddings_table(&mut self) -> anyhow::Result { - self.verify_in_database(false).await?; - let pool = self.get_pool().await?; - - let collection_name = &self - .project_info - .as_ref() - .context("Pipeline must have project info to get the embeddings table name")? - .name; - let embeddings_table_name = format!("{}.{}_embeddings", collection_name, self.name); - - // Notice that we actually check for existence of the table in the database instead of - // blindly creating it with `CREATE TABLE IF NOT EXISTS`. This is because we want to avoid - // generating embeddings just to get the length if we don't need to - let exists: bool = sqlx::query_scalar( - "SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_schema = $1 AND table_name = $2)" - ) - .bind(&self - .project_info - .as_ref() - .context("Pipeline must have project info to get the embeddings table name")?.name) - .bind(format!("{}_embeddings", self.name)).fetch_one(&pool).await?; - - if !exists { - let model = self - .model - .as_ref() - .context("Pipeline must be verified to create embeddings table")?; - - // Remove the stored name from the model parameters - let mut model_parameters = model.parameters.clone(); - model_parameters - .as_object_mut() - .context("Model parameters must be an object")? 
- .remove("name"); - - let embedding_length = match &model.runtime { - ModelRuntime::Python => { - let embedding: (Vec,) = sqlx::query_as( - "SELECT embedding from pgml.embed(transformer => $1, text => 'Hello, World!', kwargs => $2) as embedding") - .bind(&model.name) - .bind(model_parameters) - .fetch_one(&pool).await?; - embedding.0.len() as i64 - } - t => { - let remote_embeddings = - build_remote_embeddings(t.to_owned(), &model.name, &model_parameters)?; - remote_embeddings.get_embedding_size().await? - } - }; - - let mut transaction = pool.begin().await?; - sqlx::query(&query_builder!( - queries::CREATE_EMBEDDINGS_TABLE, - &embeddings_table_name, - &format!( - "{}.chunks", - self.project_info - .as_ref() - .context("Pipeline must have project info to create the embeddings table")? - .name - ), - embedding_length - )) - .execute(&mut *transaction) - .await?; - let index_name = format!("{}_pipeline_created_at_index", self.name); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX, - "", - index_name, - &embeddings_table_name, - "created_at" - ) - .as_str(), - ) - .await?; - let index_name = format!("{}_pipeline_chunk_id_index", self.name); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX, - "", - index_name, - &embeddings_table_name, - "chunk_id" - ) - .as_str(), - ) - .await?; - // See: https://github.com/pgvector/pgvector - let (m, ef_construction) = match &self.parameters { - Some(p) => { - let m = if !p["hnsw"]["m"].is_null() { - p["hnsw"]["m"] - .try_to_u64() - .context("hnsw.m must be an integer")? - } else { - 16 - }; - let ef_construction = if !p["hnsw"]["ef_construction"].is_null() { - p["hnsw"]["ef_construction"] - .try_to_u64() - .context("hnsw.ef_construction must be an integer")? - } else { - 64 - }; - (m, ef_construction) - } - None => (16, 64), - }; - let index_with_parameters = - format!("WITH (m = {}, ef_construction = {})", m, ef_construction); - let index_name = format!("{}_pipeline_hnsw_vector_index", self.name); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX_USING_HNSW, - "", - index_name, - &embeddings_table_name, - "embedding vector_cosine_ops", - index_with_parameters - ) - .as_str(), - ) - .await?; - transaction.commit().await?; - } - - Ok(embeddings_table_name) + unimplemented!() + // self.verify_in_database(false).await?; + // let pool = self.get_pool().await?; + + // let collection_name = &self + // .project_info + // .as_ref() + // .context("Pipeline must have project info to get the embeddings table name")? + // .name; + // let embeddings_table_name = format!("{}.{}_embeddings", collection_name, self.name); + + // // Notice that we actually check for existence of the table in the database instead of + // // blindly creating it with `CREATE TABLE IF NOT EXISTS`. 
This is because we want to avoid + // // generating embeddings just to get the length if we don't need to + // let exists: bool = sqlx::query_scalar( + // "SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_schema = $1 AND table_name = $2)" + // ) + // .bind(&self + // .project_info + // .as_ref() + // .context("Pipeline must have project info to get the embeddings table name")?.name) + // .bind(format!("{}_embeddings", self.name)).fetch_one(&pool).await?; + + // if !exists { + // let model = self + // .model + // .as_ref() + // .context("Pipeline must be verified to create embeddings table")?; + + // // Remove the stored name from the model parameters + // let mut model_parameters = model.parameters.clone(); + // model_parameters + // .as_object_mut() + // .context("Model parameters must be an object")? + // .remove("name"); + + // let embedding_length = match &model.runtime { + // ModelRuntime::Python => { + // let embedding: (Vec,) = sqlx::query_as( + // "SELECT embedding from pgml.embed(transformer => $1, text => 'Hello, World!', kwargs => $2) as embedding") + // .bind(&model.name) + // .bind(model_parameters) + // .fetch_one(&pool).await?; + // embedding.0.len() as i64 + // } + // t => { + // let remote_embeddings = + // build_remote_embeddings(t.to_owned(), &model.name, &model_parameters)?; + // remote_embeddings.get_embedding_size().await? + // } + // }; + + // let mut transaction = pool.begin().await?; + // sqlx::query(&query_builder!( + // queries::CREATE_EMBEDDINGS_TABLE, + // &embeddings_table_name, + // &format!( + // "{}.chunks", + // self.project_info + // .as_ref() + // .context("Pipeline must have project info to create the embeddings table")? + // .name + // ), + // embedding_length + // )) + // .execute(&mut *transaction) + // .await?; + // let index_name = format!("{}_pipeline_created_at_index", self.name); + // transaction + // .execute( + // query_builder!( + // queries::CREATE_INDEX, + // "", + // index_name, + // &embeddings_table_name, + // "created_at" + // ) + // .as_str(), + // ) + // .await?; + // let index_name = format!("{}_pipeline_chunk_id_index", self.name); + // transaction + // .execute( + // query_builder!( + // queries::CREATE_INDEX, + // "", + // index_name, + // &embeddings_table_name, + // "chunk_id" + // ) + // .as_str(), + // ) + // .await?; + // // See: https://github.com/pgvector/pgvector + // let (m, ef_construction) = match &self.parameters { + // Some(p) => { + // let m = if !p["hnsw"]["m"].is_null() { + // p["hnsw"]["m"] + // .try_to_u64() + // .context("hnsw.m must be an integer")? + // } else { + // 16 + // }; + // let ef_construction = if !p["hnsw"]["ef_construction"].is_null() { + // p["hnsw"]["ef_construction"] + // .try_to_u64() + // .context("hnsw.ef_construction must be an integer")? 
+ // } else { + // 64 + // }; + // (m, ef_construction) + // } + // None => (16, 64), + // }; + // let index_with_parameters = + // format!("WITH (m = {}, ef_construction = {})", m, ef_construction); + // let index_name = format!("{}_pipeline_hnsw_vector_index", self.name); + // transaction + // .execute( + // query_builder!( + // queries::CREATE_INDEX_USING_HNSW, + // "", + // index_name, + // &embeddings_table_name, + // "embedding vector_cosine_ops", + // index_with_parameters + // ) + // .as_str(), + // ) + // .await?; + // transaction.commit().await?; + // } + + // Ok(embeddings_table_name) } #[instrument(skip(self))] pub(crate) fn set_project_info(&mut self, project_info: ProjectInfo) { - if self.model.is_some() { - self.model - .as_mut() - .unwrap() - .set_project_info(project_info.clone()); - } - if self.splitter.is_some() { - self.splitter - .as_mut() - .unwrap() - .set_project_info(project_info.clone()); - } - self.project_info = Some(project_info); + unimplemented!() + // if self.model.is_some() { + // self.model + // .as_mut() + // .unwrap() + // .set_project_info(project_info.clone()); + // } + // if self.splitter.is_some() { + // self.splitter + // .as_mut() + // .unwrap() + // .set_project_info(project_info.clone()); + // } + // self.project_info = Some(project_info); } /// Convert the [Pipeline] to [Json] @@ -760,94 +768,98 @@ impl Pipeline { /// ``` #[instrument(skip(self))] pub async fn to_dict(&mut self) -> anyhow::Result { - self.verify_in_database(false).await?; - - let status = self.get_status().await?; - - let model_dict = self - .model - .as_mut() - .context("Pipeline must be verified to call to_dict")? - .to_dict() - .await?; - - let splitter_dict = self - .splitter - .as_mut() - .context("Pipeline must be verified to call to_dict")? - .to_dict() - .await?; - - let database_data = self - .database_data - .as_ref() - .context("Pipeline must be verified to call to_dict")?; - - let parameters = self - .parameters - .as_ref() - .context("Pipeline must be verified to call to_dict")?; - - Ok(serde_json::json!({ - "id": database_data.id, - "name": self.name, - "model": *model_dict, - "splitter": *splitter_dict, - "parameters": *parameters, - "status": *Json::from(status), - }) - .into()) + unimplemented!() + // self.verify_in_database(false).await?; + + // let status = self.get_status().await?; + + // let model_dict = self + // .model + // .as_mut() + // .context("Pipeline must be verified to call to_dict")? + // .to_dict() + // .await?; + + // let splitter_dict = self + // .splitter + // .as_mut() + // .context("Pipeline must be verified to call to_dict")? + // .to_dict() + // .await?; + + // let database_data = self + // .database_data + // .as_ref() + // .context("Pipeline must be verified to call to_dict")?; + + // let parameters = self + // .parameters + // .as_ref() + // .context("Pipeline must be verified to call to_dict")?; + + // Ok(serde_json::json!({ + // "id": database_data.id, + // "name": self.name, + // "model": *model_dict, + // "splitter": *splitter_dict, + // "parameters": *parameters, + // "status": *Json::from(status), + // }) + // .into()) } async fn get_pool(&self) -> anyhow::Result { - let database_url = &self - .project_info - .as_ref() - .context("Project info required to call method pipeline.get_pool()")? - .database_url; - get_or_initialize_pool(database_url).await + unimplemented!() + // let database_url = &self + // .project_info + // .as_ref() + // .context("Project info required to call method pipeline.get_pool()")? 
+ // .database_url; + // get_or_initialize_pool(database_url).await } pub(crate) async fn create_pipelines_table( project_info: &ProjectInfo, conn: &mut PgConnection, ) -> anyhow::Result<()> { - let pipelines_table_name = format!("{}.pipelines", project_info.name); - sqlx::query(&query_builder!( - queries::CREATE_PIPELINES_TABLE, - pipelines_table_name - )) - .execute(&mut *conn) - .await?; - conn.execute( - query_builder!( - queries::CREATE_INDEX, - "", - "pipeline_name_index", - pipelines_table_name, - "name" - ) - .as_str(), - ) - .await?; - Ok(()) + unimplemented!() + // let pipelines_table_name = format!("{}.pipelines", project_info.name); + // sqlx::query(&query_builder!( + // queries::CREATE_PIPELINES_TABLE, + // pipelines_table_name + // )) + // .execute(&mut *conn) + // .await?; + // conn.execute( + // query_builder!( + // queries::CREATE_INDEX, + // "", + // "pipeline_name_index", + // pipelines_table_name, + // "name" + // ) + // .as_str(), + // ) + // .await?; + // Ok(()) } } impl From for Pipeline { fn from(x: models::PipelineWithModelAndSplitter) -> Self { - Self { - model: Some(x.clone().into()), - splitter: Some(x.clone().into()), - name: x.pipeline_name, - project_info: None, - database_data: Some(PipelineDatabaseData { - id: x.pipeline_id, - created_at: x.pipeline_created_at, - model_id: x.model_id, - splitter_id: x.splitter_id, - }), - parameters: Some(x.pipeline_parameters), - } + unimplemented!() + // Self { + // model: Some(x.clone().into()), + // splitter: Some(x.clone().into()), + // name: x.pipeline_name, + // project_info: None, + // database_data: Some(PipelineDatabaseData { + // id: x.pipeline_id, + // created_at: x.pipeline_created_at, + // model_id: x.model_id, + // splitter_id: x.splitter_id, + // }), + // parameters: Some(x.pipeline_parameters), + // } } } diff --git a/pgml-sdks/pgml/src/queries.rs b/pgml-sdks/pgml/src/queries.rs index 8e793691e..08e7a8d4e 100644 --- a/pgml-sdks/pgml/src/queries.rs +++ b/pgml-sdks/pgml/src/queries.rs @@ -26,13 +26,23 @@ CREATE TABLE IF NOT EXISTS %s ( ); "#; +pub const CREATE_MULTI_FIELD_PIPELINES_TABLE: &str = r#" +CREATE TABLE IF NOT EXISTS %s ( + id serial8 PRIMARY KEY, + name text NOT NULL, + created_at timestamp NOT NULL DEFAULT now(), + active BOOLEAN NOT NULL DEFAULT TRUE, + schema jsonb NOT NULL, + UNIQUE (name) +); +"#; + pub const CREATE_DOCUMENTS_TABLE: &str = r#" CREATE TABLE IF NOT EXISTS %s ( id serial8 PRIMARY KEY, created_at timestamp NOT NULL DEFAULT now(), source_uuid uuid NOT NULL, - metadata jsonb NOT NULL DEFAULT '{}', - text text NOT NULL, + document jsonb NOT NULL, UNIQUE (source_uuid) ); "#; @@ -50,10 +60,9 @@ CREATE TABLE IF NOT EXISTS pgml.splitters ( pub const CREATE_CHUNKS_TABLE: &str = r#"CREATE TABLE IF NOT EXISTS %s ( id serial8 PRIMARY KEY, created_at timestamp NOT NULL DEFAULT now(), document_id int8 NOT NULL REFERENCES %s ON DELETE CASCADE ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED, - splitter_id int8 NOT NULL REFERENCES pgml.splitters ON DELETE CASCADE ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED, chunk_index int8 NOT NULL, chunk text NOT NULL, - UNIQUE (document_id, splitter_id, chunk_index) + UNIQUE (document_id, chunk_index) ); "#; @@ -72,9 +81,8 @@ CREATE TABLE IF NOT EXISTS %s ( id serial8 PRIMARY KEY, created_at timestamp NOT NULL DEFAULT now(), document_id int8 NOT NULL REFERENCES %s ON DELETE CASCADE ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED, - configuration text NOT NULL, ts tsvector, - UNIQUE (configuration, document_id) + UNIQUE (document_id) ); "#; @@ -97,26 
+105,24 @@ CREATE INDEX %d IF NOT EXISTS %s on %s using hnsw (%d) %d; // Other Big Queries //////// ///////////////////////////// pub const GENERATE_TSVECTORS: &str = r#" -INSERT INTO %s (document_id, configuration, ts) +INSERT INTO %s (document_id, ts) SELECT id, - '%d' configuration, - to_tsvector('%d', text) ts + to_tsvector('%d', %d) ts FROM %s -ON CONFLICT (document_id, configuration) DO UPDATE SET ts = EXCLUDED.ts; +ON CONFLICT (document_id) DO NOTHING; "#; pub const GENERATE_TSVECTORS_FOR_DOCUMENT_IDS: &str = r#" -INSERT INTO %s (document_id, configuration, ts) +INSERT INTO %s (document_id, ts) SELECT id, - '%d' configuration, - to_tsvector('%d', text) ts + to_tsvector('%d', %d) ts FROM %s WHERE id = ANY ($1) -ON CONFLICT (document_id, configuration) DO NOTHING; +ON CONFLICT (document_id) DO NOTHING; "#; pub const GENERATE_EMBEDDINGS: &str = r#" @@ -153,8 +159,7 @@ SELECT FROM %s WHERE - splitter_id = $3 - AND id = ANY ($4) + id = ANY ($3) AND id NOT IN ( SELECT chunk_id @@ -229,12 +234,10 @@ WITH splitter as ( id = $1 ) INSERT INTO %s( - document_id, splitter_id, chunk_index, - chunk + document_id, chunk_index, chunk ) SELECT document_id, - $1, (chunk).chunk_index, (chunk).chunk FROM @@ -250,7 +253,7 @@ FROM ( SELECT id, - text + %d as text FROM %s WHERE @@ -259,12 +262,10 @@ FROM document_id FROM %s - WHERE - splitter_id = $1 ) ) AS documents ) chunks -ON CONFLICT (document_id, splitter_id, chunk_index) DO NOTHING +ON CONFLICT (document_id, chunk_index) DO NOTHING RETURNING id "#; @@ -279,12 +280,10 @@ WITH splitter as ( id = $1 ) INSERT INTO %s( - document_id, splitter_id, chunk_index, - chunk + document_id, chunk_index, chunk ) SELECT document_id, - $1, (chunk).chunk_index, (chunk).chunk FROM @@ -300,7 +299,7 @@ FROM ( SELECT id, - text + %d AS text FROM %s WHERE @@ -310,11 +309,9 @@ FROM document_id FROM %s - WHERE - splitter_id = $1 ) ) AS documents ) chunks -ON CONFLICT (document_id, splitter_id, chunk_index) DO NOTHING +ON CONFLICT (document_id, chunk_index) DO NOTHING RETURNING id "#; diff --git a/pgml-sdks/pgml/src/query_builder.rs b/pgml-sdks/pgml/src/query_builder.rs index 98fbe104a..11b2405e8 100644 --- a/pgml-sdks/pgml/src/query_builder.rs +++ b/pgml-sdks/pgml/src/query_builder.rs @@ -124,98 +124,98 @@ impl QueryBuilder { pipeline: &Pipeline, query_parameters: Option, ) -> Self { - // Save these in case of failure - self.pipeline = Some(pipeline.clone()); - self.query_string = Some(query.to_owned()); - self.query_parameters = query_parameters.clone(); + unimplemented!() + // // Save these in case of failure + // self.pipeline = Some(pipeline.clone()); + // self.query_string = Some(query.to_owned()); + // self.query_parameters = query_parameters.clone(); - let mut query_parameters = query_parameters.unwrap_or_default().0; - // If they did set hnsw, remove it before we pass it to the model - query_parameters - .as_object_mut() - .expect("Query parameters must be a Json object") - .remove("hnsw"); - let embeddings_table_name = - format!("{}.{}_embeddings", self.collection.name, pipeline.name); - - // Build the pipeline CTE - let mut pipeline_cte = Query::select(); - pipeline_cte - .from_as( - self.collection.pipelines_table_name.to_table_tuple(), - SIden::Str("pipeline"), - ) - .columns([models::PipelineIden::ModelId]) - .and_where(Expr::col(models::PipelineIden::Name).eq(&pipeline.name)); - let mut pipeline_cte = CommonTableExpression::from_select(pipeline_cte); - pipeline_cte.table_name(Alias::new("pipeline")); + // let mut query_parameters = 
query_parameters.unwrap_or_default().0; + // // If they did set hnsw, remove it before we pass it to the model + // query_parameters + // .as_object_mut() + // .expect("Query parameters must be a Json object") + // .remove("hnsw"); + // let embeddings_table_name = + // format!("{}.{}_embeddings", self.collection.name, pipeline.name); - // Build the model CTE - let mut model_cte = Query::select(); - model_cte - .from_as( - (SIden::Str("pgml"), SIden::Str("models")), - SIden::Str("model"), - ) - .columns([models::ModelIden::Hyperparams]) - .and_where(Expr::cust("id = (SELECT model_id FROM pipeline)")); - let mut model_cte = CommonTableExpression::from_select(model_cte); - model_cte.table_name(Alias::new("model")); + // // Build the pipeline CTE + // let mut pipeline_cte = Query::select(); + // pipeline_cte + // .from_as( + // self.collection.pipelines_table_name.to_table_tuple(), + // SIden::Str("pipeline"), + // ) + // .columns([models::PipelineIden::ModelId]) + // .and_where(Expr::col(models::PipelineIden::Name).eq(&pipeline.name)); + // let mut pipeline_cte = CommonTableExpression::from_select(pipeline_cte); + // pipeline_cte.table_name(Alias::new("pipeline")); - // Build the embedding CTE - let mut embedding_cte = Query::select(); - embedding_cte.expr_as( - Func::cast_as( - Func::cust(SIden::Str("pgml.embed")).args([ - Expr::cust("transformer => (SELECT hyperparams->>'name' FROM model)"), - Expr::cust_with_values("text => $1", [query]), - Expr::cust_with_values("kwargs => $1", [query_parameters]), - ]), - Alias::new("vector"), - ), - Alias::new("embedding"), - ); - let mut embedding_cte = CommonTableExpression::from_select(embedding_cte); - embedding_cte.table_name(Alias::new("embedding")); + // // Build the model CTE + // let mut model_cte = Query::select(); + // model_cte + // .from_as( + // (SIden::Str("pgml"), SIden::Str("models")), + // SIden::Str("model"), + // ) + // .columns([models::ModelIden::Hyperparams]) + // .and_where(Expr::cust("id = (SELECT model_id FROM pipeline)")); + // let mut model_cte = CommonTableExpression::from_select(model_cte); + // model_cte.table_name(Alias::new("model")); - // Build the where clause - let mut with_clause = WithClause::new(); - self.with = with_clause - .cte(pipeline_cte) - .cte(model_cte) - .cte(embedding_cte) - .to_owned(); + // // Build the embedding CTE + // let mut embedding_cte = Query::select(); + // embedding_cte.expr_as( + // Func::cast_as( + // Func::cust(SIden::Str("pgml.embed")).args([ + // Expr::cust("transformer => (SELECT hyperparams->>'name' FROM model)"), + // Expr::cust_with_values("text => $1", [query]), + // Expr::cust_with_values("kwargs => $1", [query_parameters]), + // ]), + // Alias::new("vector"), + // ), + // Alias::new("embedding"), + // ); + // let mut embedding_cte = CommonTableExpression::from_select(embedding_cte); + // embedding_cte.table_name(Alias::new("embedding")); - // Build the query - self.query - .expr(Expr::cust( - "(embeddings.embedding <=> (SELECT embedding from embedding)) score", - )) - .columns([ - (SIden::Str("chunks"), SIden::Str("chunk")), - (SIden::Str("documents"), SIden::Str("metadata")), - ]) - .from_as( - embeddings_table_name.to_table_tuple(), - SIden::Str("embeddings"), - ) - .join_as( - JoinType::InnerJoin, - self.collection.chunks_table_name.to_table_tuple(), - Alias::new("chunks"), - Expr::col((SIden::Str("chunks"), SIden::Str("id"))) - .equals((SIden::Str("embeddings"), SIden::Str("chunk_id"))), - ) - .join_as( - JoinType::InnerJoin, - 
self.collection.documents_table_name.to_table_tuple(), - Alias::new("documents"), - Expr::col((SIden::Str("documents"), SIden::Str("id"))) - .equals((SIden::Str("chunks"), SIden::Str("document_id"))), - ) - .order_by(SIden::Str("score"), Order::Asc); + // // Build the where clause + // let mut with_clause = WithClause::new(); + // self.with = with_clause + // .cte(pipeline_cte) + // .cte(model_cte) + // .cte(embedding_cte) + // .to_owned(); - self + // // Build the query + // self.query + // .expr(Expr::cust( + // "(embeddings.embedding <=> (SELECT embedding from embedding)) score", + // )) + // .columns([ + // (SIden::Str("chunks"), SIden::Str("chunk")), + // (SIden::Str("documents"), SIden::Str("metadata")), + // ]) + // .from_as( + // embeddings_table_name.to_table_tuple(), + // SIden::Str("embeddings"), + // ) + // .join_as( + // JoinType::InnerJoin, + // self.collection.chunks_table_name.to_table_tuple(), + // Alias::new("chunks"), + // Expr::col((SIden::Str("chunks"), SIden::Str("id"))) + // .equals((SIden::Str("embeddings"), SIden::Str("chunk_id"))), + // ) + // .join_as( + // JoinType::InnerJoin, + // self.collection.documents_table_name.to_table_tuple(), + // Alias::new("documents"), + // Expr::col((SIden::Str("documents"), SIden::Str("id"))) + // .equals((SIden::Str("chunks"), SIden::Str("document_id"))), + // ) + // .order_by(SIden::Str("score"), Order::Asc); + // self } #[instrument(skip(self))] @@ -277,7 +277,7 @@ impl QueryBuilder { .remove("hnsw"); let remote_embeddings = - build_remote_embeddings(model.runtime, &model.name, &query_parameters)?; + build_remote_embeddings(model.runtime, &model.name, Some(&query_parameters))?; let mut embeddings = remote_embeddings .embed(vec![self .query_string diff --git a/pgml-sdks/pgml/src/remote_embeddings.rs b/pgml-sdks/pgml/src/remote_embeddings.rs index bcb84146c..e963b3c0f 100644 --- a/pgml-sdks/pgml/src/remote_embeddings.rs +++ b/pgml-sdks/pgml/src/remote_embeddings.rs @@ -8,7 +8,7 @@ use crate::{model::ModelRuntime, models, query_builder, types::Json}; pub fn build_remote_embeddings<'a>( source: ModelRuntime, model_name: &'a str, - _model_parameters: &'a Json, + _model_parameters: Option<&'a Json>, ) -> anyhow::Result + Sync + Send + 'a>> { match source { // OpenAI endpoint for embedddings does not take any model parameters @@ -46,34 +46,22 @@ pub trait RemoteEmbeddings<'a> { &self, embeddings_table_name: &str, chunks_table_name: &str, - splitter_id: i64, - chunk_ids: &Option>, + chunk_ids: &Vec, pool: &PgPool, limit: Option, ) -> anyhow::Result> { let limit = limit.unwrap_or(1000); - match chunk_ids { - Some(cids) => sqlx::query_as(&query_builder!( - "SELECT * FROM %s WHERE splitter_id = $1 AND id NOT IN (SELECT chunk_id FROM %s) AND id = ANY ($2) LIMIT $3", - chunks_table_name, - embeddings_table_name - )) - .bind(splitter_id) - .bind(cids) - .bind(limit) - .fetch_all(pool) - .await, - None => sqlx::query_as(&query_builder!( - "SELECT * FROM %s WHERE splitter_id = $1 AND id NOT IN (SELECT chunk_id FROM %s) LIMIT $2", - chunks_table_name, - embeddings_table_name - )) - .bind(splitter_id) - .bind(limit) - .fetch_all(pool) - .await - }.map_err(|e| anyhow::anyhow!(e)) + sqlx::query_as(&query_builder!( + "SELECT * FROM %s WHERE id NOT IN (SELECT chunk_id FROM %s) AND id = ANY ($1) LIMIT $2", + chunks_table_name, + embeddings_table_name + )) + .bind(chunk_ids) + .bind(limit) + .fetch_all(pool) + .await + .map_err(|e| anyhow::anyhow!(e)) } #[instrument(skip(self, response))] @@ -104,8 +92,7 @@ pub trait RemoteEmbeddings<'a> { &self, 
embeddings_table_name: &str, chunks_table_name: &str, - splitter_id: i64, - chunk_ids: Option>, + chunk_ids: &Vec, pool: &PgPool, ) -> anyhow::Result<()> { loop { @@ -113,8 +100,7 @@ pub trait RemoteEmbeddings<'a> { .get_chunks( embeddings_table_name, chunks_table_name, - splitter_id, - &chunk_ids, + chunk_ids, pool, None, ) @@ -183,8 +169,11 @@ mod tests { #[tokio::test] async fn openai_remote_embeddings() -> anyhow::Result<()> { let params = serde_json::json!({}).into(); - let openai_remote_embeddings = - build_remote_embeddings(ModelRuntime::OpenAI, "text-embedding-ada-002", ¶ms)?; + let openai_remote_embeddings = build_remote_embeddings( + ModelRuntime::OpenAI, + "text-embedding-ada-002", + Some(¶ms), + )?; let embedding_size = openai_remote_embeddings.get_embedding_size().await?; assert!(embedding_size > 0); Ok(()) diff --git a/pgml-sdks/pgml/src/search_query_builder.rs b/pgml-sdks/pgml/src/search_query_builder.rs new file mode 100644 index 000000000..7c03e590b --- /dev/null +++ b/pgml-sdks/pgml/src/search_query_builder.rs @@ -0,0 +1,258 @@ +use anyhow::Context; +use serde::Deserialize; +use std::collections::HashMap; + +use sea_query::{ + Alias, CommonTableExpression, Expr, Func, JoinType, Order, PostgresQueryBuilder, Query, + QueryStatementWriter, SimpleExpr, WithClause, +}; +use sea_query_binder::{SqlxBinder, SqlxValues}; + +use crate::{ + collection::Collection, + model::ModelRuntime, + models, + multi_field_pipeline::MultiFieldPipeline, + remote_embeddings::build_remote_embeddings, + types::{IntoTableNameAndSchema, Json, SIden}, +}; + +#[derive(Debug, Deserialize)] +struct ValidSemanticSearchAction { + query: String, + model_parameters: Option, + boost: Option, +} + +#[derive(Debug, Deserialize)] +struct ValidMatchAction { + query: String, + boost: Option, +} + +#[derive(Debug, Deserialize)] +struct ValidQueryAction { + full_text_search: Option>, + semantic_search: Option>, +} + +#[derive(Debug, Deserialize)] +struct ValidQuery { + query: ValidQueryAction, + limit: Option, +} + +pub async fn build_search_query( + collection: &Collection, + query: Json, + pipeline: &MultiFieldPipeline, +) -> anyhow::Result<(String, SqlxValues)> { + let valid_query: ValidQuery = serde_json::from_value(query.0)?; + let limit = valid_query.limit.unwrap_or(10); + + let pipeline_table = format!("{}.pipelines", collection.name); + let documents_table = format!("{}.documents", collection.name); + + let mut with_clause = WithClause::new(); + let mut sub_query = Query::select(); + let mut sum_expression: Option = None; + + let mut pipeline_cte = Query::select(); + pipeline_cte + .from(pipeline_table.to_table_tuple()) + .columns([models::PipelineIden::Schema]) + .and_where(Expr::col(models::PipelineIden::Name).eq(&pipeline.name)); + let mut pipeline_cte = CommonTableExpression::from_select(pipeline_cte); + pipeline_cte.table_name(Alias::new("pipeline")); + with_clause.cte(pipeline_cte); + + for (key, vsa) in valid_query.query.semantic_search.unwrap_or_default() { + let model_runtime = pipeline + .parsed_schema + .as_ref() + .map(|s| { + // Any of these errors means they have a malformed query + anyhow::Ok( + s.get(&key) + .as_ref() + .context(format!("Bad query - {key} does not exist in schema"))? + .embed + .as_ref() + .context(format!( + "Bad query - {key} does not have any directive to embed" + ))? + .model + .runtime, + ) + }) + .transpose()? 
+ .unwrap_or(ModelRuntime::Python); + + match model_runtime { + ModelRuntime::Python => { + // Build the embedding CTE + let mut embedding_cte = Query::select(); + embedding_cte.expr_as( + Func::cust(SIden::Str("pgml.embed")).args([ + Expr::cust(format!( + "transformer => (SELECT schema #>> '{{{key},embed,model}}' FROM pipeline)", + )), + Expr::cust_with_values("text => $1", [&vsa.query]), + Expr::cust(format!("kwargs => COALESCE((SELECT schema #> '{{{key},embed,model_parameters}}' FROM pipeline), '{{}}'::jsonb)")), + ]), + Alias::new("embedding"), + ); + let mut embedding_cte = CommonTableExpression::from_select(embedding_cte); + embedding_cte.table_name(Alias::new(format!("{key}_embedding"))); + with_clause.cte(embedding_cte); + + // Add to the sum expression + let boost = vsa.boost.unwrap_or(1.); + sum_expression = if let Some(expr) = sum_expression { + Some(expr.add(Expr::cust(format!( + // r#"((1 - MIN("{key}_embeddings".embedding <=> (SELECT embedding FROM "{key}_embedding")::vector)) * {boost})"# + r#"(MIN("{key}_embeddings".embedding <=> (SELECT embedding FROM "{key}_embedding")::vector))"# + )))) + } else { + Some(Expr::cust(format!( + // r#"((1 - MIN("{key}_embeddings".embedding <=> (SELECT embedding FROM "{key}_embedding")::vector)) * {boost})"# + r#"(MIN("{key}_embeddings".embedding <=> (SELECT embedding FROM "{key}_embedding")::vector))"# + ))) + }; + } + ModelRuntime::OpenAI => { + // We can unwrap here as we know this is all set from above + let model = &pipeline + .parsed_schema + .as_ref() + .unwrap() + .get(&key) + .unwrap() + .embed + .as_ref() + .unwrap() + .model; + + // Get the remote embedding + let embedding = { + let remote_embeddings = build_remote_embeddings( + model.runtime, + &model.name, + vsa.model_parameters.as_ref(), + )?; + let mut embeddings = remote_embeddings.embed(vec![vsa.query]).await?; + std::mem::take(&mut embeddings[0]) + }; + + // Add to the sum expression + let boost = vsa.boost.unwrap_or(1.); + sum_expression = if let Some(expr) = sum_expression { + Some(expr.add(Expr::cust_with_values( + format!( + // r#"((1 - MIN("{key}_embeddings".embedding <=> $1::vector)) * {boost})"#, + r#"(MIN("{key}_embeddings".embedding <=> $1::vector))"#, + ), + [embedding], + ))) + } else { + Some(Expr::cust_with_values( + format!( + r#"(MIN("{key}_embeddings".embedding <=> $1::vector))"# // r#"((1 - MIN("{key}_embeddings".embedding <=> $1::vector)) * {boost})"# + ), + [embedding], + )) + }; + } + } + + // Do the proper inner joins + let chunks_table = format!("{}_{}.{}_chunks", collection.name, pipeline.name, key); + let embeddings_table = format!("{}_{}.{}_embeddings", collection.name, pipeline.name, key); + sub_query.join_as( + JoinType::InnerJoin, + chunks_table.to_table_tuple(), + Alias::new(format!("{key}_chunks")), + Expr::col(( + SIden::String(format!("{key}_chunks")), + SIden::Str("document_id"), + )) + .equals((SIden::Str("documents"), SIden::Str("id"))), + ); + sub_query.join_as( + JoinType::InnerJoin, + embeddings_table.to_table_tuple(), + Alias::new(format!("{key}_embeddings")), + Expr::col(( + SIden::String(format!("{key}_embeddings")), + SIden::Str("chunk_id"), + )) + .equals((SIden::String(format!("{key}_chunks")), SIden::Str("id"))), + ); + } + + for (key, vma) in valid_query.query.full_text_search.unwrap_or_default() { + let full_text_table = format!("{}_{}.{}_tsvectors", collection.name, pipeline.name, key); + + // Inner join the tsvectors table + sub_query.join_as( + JoinType::InnerJoin, + full_text_table.to_table_tuple(), + 
Alias::new(format!("{key}_tsvectors")), + Expr::col(( + SIden::String(format!("{key}_tsvectors")), + SIden::Str("document_id"), + )) + .equals((SIden::Str("documents"), SIden::Str("id"))), + ); + + // TODO: Maybe add this?? + // Do the proper where statement + // sub_query.and_where(Expr::cust_with_values( + // format!( + // r#""{key}_tsvectors".ts @@ plainto_tsquery((SELECT oid FROM pg_ts_config WHERE cfgname = (SELECT schema #>> '{{{key},full_text_search,configuration}}' FROM pipeline)), $1)"#, + // ), + // [&vma.query], + // )); + + // Add to the sum expression + let boost = vma.boost.unwrap_or(1.); + sum_expression = if let Some(expr) = sum_expression { + Some(expr.add(Expr::cust_with_values(format!( + r#"(MAX(ts_rank("{key}_tsvectors".ts, plainto_tsquery((SELECT oid FROM pg_ts_config WHERE cfgname = (SELECT schema #>> '{{{key},full_text_search,configuration}}' FROM pipeline)), $1), 32)) * {boost})"#, + ), + [vma.query] + ))) + } else { + Some(Expr::cust_with_values( + format!( + r#"(MAX(ts_rank("{key}_tsvectors".ts, plainto_tsquery((SELECT oid FROM pg_ts_config WHERE cfgname = (SELECT schema #>> '{{{key},full_text_search,configuration}}' FROM pipeline)), $1), 32)) * {boost})"#, + ), + [vma.query], + )) + }; + } + + // Finalize the sub query + sub_query + .column((SIden::Str("documents"), SIden::Str("document"))) + .expr_as(sum_expression.unwrap(), Alias::new("score")) + .from_as(documents_table.to_table_tuple(), Alias::new("documents")) + .group_by_col((SIden::Str("documents"), SIden::Str("id"))) + .order_by(SIden::Str("score"), Order::Desc) + .limit(limit); + + // Combine to make the real query + let mut sql_query = Query::select(); + sql_query + .expr(Expr::cust("json_array_elements(json_agg(q))")) + .from_subquery(sub_query, Alias::new("q")); + + let query_string = sql_query + .clone() + .with(with_clause.clone()) + .to_string(PostgresQueryBuilder); + println!("{}", query_string); + + let (sql, values) = sql_query.with(with_clause).build_sqlx(PostgresQueryBuilder); + Ok((sql, values)) +} diff --git a/pgml-sdks/pgml/src/types.rs b/pgml-sdks/pgml/src/types.rs index bdf7308a3..1a51e4f20 100644 --- a/pgml-sdks/pgml/src/types.rs +++ b/pgml-sdks/pgml/src/types.rs @@ -3,12 +3,12 @@ use futures::{Stream, StreamExt}; use itertools::Itertools; use rust_bridge::alias_manual; use sea_query::Iden; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use std::ops::{Deref, DerefMut}; /// A wrapper around serde_json::Value // #[derive(sqlx::Type, sqlx::FromRow, Debug)] -#[derive(alias_manual, sqlx::Type, Debug, Clone)] +#[derive(alias_manual, sqlx::Type, Debug, Clone, Deserialize, PartialEq, Eq)] #[sqlx(transparent)] pub struct Json(pub serde_json::Value); From 9df35284246d7c8fc3e38cd289d51eca857b4e8e Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Fri, 12 Jan 2024 16:01:09 -0800 Subject: [PATCH 02/72] Working fast site search and vector search --- pgml-sdks/pgml/src/collection.rs | 32 ++- pgml-sdks/pgml/src/filter_builder.rs | 144 +++------- pgml-sdks/pgml/src/lib.rs | 250 ++++++++++++++---- pgml-sdks/pgml/src/multi_field_pipeline.rs | 223 +++++++++------- pgml-sdks/pgml/src/queries.rs | 52 +--- pgml-sdks/pgml/src/query_builder.rs | 4 +- pgml-sdks/pgml/src/remote_embeddings.rs | 12 +- pgml-sdks/pgml/src/search_query_builder.rs | 236 ++++++++++------- .../pgml/src/vector_search_query_builder.rs | 245 +++++++++++++++++ 9 files changed, 798 insertions(+), 400 deletions(-) create mode 100644 pgml-sdks/pgml/src/vector_search_query_builder.rs 
diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index ac1f1a486..e414ed62a 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -15,6 +15,7 @@ use std::time::SystemTime; use tracing::{instrument, warn}; use walkdir::WalkDir; +use crate::vector_search_query_builder::build_vector_search_query; use crate::{ filter_builder, get_or_initialize_pool, model::ModelRuntime, @@ -718,7 +719,6 @@ impl Collection { let pool = get_or_initialize_pool(&self.database_url).await?; let (query, values) = crate::search_query_builder::build_search_query(self, query, pipeline).await?; - println!("\n\n{query}\n\n"); let results: Vec<(Json,)> = sqlx::query_as_with(&query, values).fetch_all(&pool).await?; Ok(results.into_iter().map(|r| r.0).collect()) } @@ -755,8 +755,9 @@ impl Collection { ) -> anyhow::Result> { let pool = get_or_initialize_pool(&self.database_url).await?; - let query_parameters = query_parameters.unwrap_or_default(); - let top_k = top_k.unwrap_or(5); + let (query, sqlx_values) = + build_vector_search_query(query, self, query_parameters.unwrap_or_default(), pipeline) + .await?; // With this system, we only do the wrong type of vector search once // let runtime = if pipeline.model.is_some() { @@ -1163,6 +1164,15 @@ pgmlc ||..|| pipelines for (key, field_action) in parsed_schema.iter() { let nice_name_key = key.replace(' ', "_"); + + let relations = format!( + r#" +documents ||..|{{ {nice_name_key}_chunks +{nice_name_key}_chunks ||.|| {nice_name_key}_embeddings + "# + ); + uml_relations.push_str(&relations); + if let Some(_embed_action) = &field_action.embed { let entites = format!( r#" @@ -1170,7 +1180,7 @@ entity "{schema}.{key}_chunks" as {nice_name_key}_chunks {{ id : bigint -- created_at : timestamp without time zone - documnt_id : bigint + document_id : bigint chunk_index : bigint chunk : text }} @@ -1180,19 +1190,12 @@ entity "{schema}.{key}_embeddings" as {nice_name_key}_embeddings {{ -- created_at : timestamp without time zone chunk_id : bigint + document_id : bigint embedding : vector }} "# ); uml_entites.push_str(&entites); - - let relations = format!( - r#" -documents ||..|{{ {nice_name_key}_chunks -{nice_name_key}_chunks ||.|| {nice_name_key}_embeddings - "# - ); - uml_relations.push_str(&relations); } if let Some(_full_text_search_action) = &field_action.full_text_search { @@ -1202,7 +1205,8 @@ entity "{schema}.{key}_tsvectors" as {nice_name_key}_tsvectors {{ id : bigint -- created_at : timestamp without time zone - documnt_id : bigint + chunk_id : bigint + document_id : bigint tsvectors : tsvector }} "# @@ -1211,7 +1215,7 @@ entity "{schema}.{key}_tsvectors" as {nice_name_key}_tsvectors {{ let relations = format!( r#" -documents ||..|| {nice_name_key}_tsvectors +{nice_name_key}_chunks ||..|| {nice_name_key}_tsvectors "# ); uml_relations.push_str(&relations); diff --git a/pgml-sdks/pgml/src/filter_builder.rs b/pgml-sdks/pgml/src/filter_builder.rs index 32b9f4126..f820441a8 100644 --- a/pgml-sdks/pgml/src/filter_builder.rs +++ b/pgml-sdks/pgml/src/filter_builder.rs @@ -1,49 +1,8 @@ -use sea_query::{ - extension::postgres::PgExpr, value::ArrayType, Condition, Expr, IntoCondition, SimpleExpr, -}; - -fn get_sea_query_array_type(value: &serde_json::Value) -> ArrayType { - if value.is_null() { - panic!("Invalid metadata filter configuration") - } else if value.is_string() { - ArrayType::String - } else if value.is_i64() || value.is_u64() { - ArrayType::BigInt - } else if value.is_f64() { - ArrayType::Double - } else if 
value.is_boolean() { - ArrayType::Bool - } else if value.is_array() { - let value = value - .as_array() - .expect("Invalid metadata filter configuration"); - get_sea_query_array_type(&value[0]) - } else { - panic!("Invalid metadata filter configuration") - } -} +use anyhow::Context; +use sea_query::{extension::postgres::PgExpr, Condition, Expr, IntoCondition, SimpleExpr}; fn serde_value_to_sea_query_value(value: &serde_json::Value) -> sea_query::Value { - if value.is_string() { - sea_query::Value::String(Some(Box::new(value.as_str().unwrap().to_string()))) - } else if value.is_i64() { - sea_query::Value::BigInt(Some(value.as_i64().unwrap())) - } else if value.is_f64() { - sea_query::Value::Double(Some(value.as_f64().unwrap())) - } else if value.is_boolean() { - sea_query::Value::Bool(Some(value.as_bool().unwrap())) - } else if value.is_array() { - let value = value.as_array().unwrap(); - let ty = get_sea_query_array_type(&value[0]); - let value = Some(Box::new( - value.iter().map(serde_value_to_sea_query_value).collect(), - )); - sea_query::Value::Array(ty, value) - } else if value.is_object() { - sea_query::Value::Json(Some(Box::new(value.clone()))) - } else { - panic!("Invalid metadata filter configuration") - } + sea_query::Value::Json(Some(Box::new(value.clone()))) } fn reconstruct_json(path: Vec, value: serde_json::Value) -> serde_json::Value { @@ -102,36 +61,13 @@ fn value_is_object_and_is_comparison_operator(value: &serde_json::Value) -> bool }) } -fn get_value_type(value: &serde_json::Value) -> String { - if value.is_object() { - let (_, value) = value - .as_object() - .expect("Invalid metadata filter configuration") - .iter() - .next() - .unwrap(); - get_value_type(value) - } else if value.is_array() { - let value = &value.as_array().unwrap()[0]; - get_value_type(value) - } else if value.is_string() { - "text".to_string() - } else if value.is_i64() || value.is_f64() { - "float8".to_string() - } else if value.is_boolean() { - "bool".to_string() - } else { - panic!("Invalid metadata filter configuration") - } -} - fn build_recursive<'a>( table_name: &'a str, column_name: &'a str, path: Vec, filter: serde_json::Value, condition: Option, -) -> Condition { +) -> anyhow::Result { if filter.is_object() { let mut condition = condition.unwrap_or(Condition::all()); for (key, value) in filter.as_object().unwrap() { @@ -180,41 +116,38 @@ fn build_recursive<'a>( .contains(Expr::val(serde_value_to_sea_query_value(&json))) } } else { - // If we are not checking whether two values are equal or not equal, we need to cast it to the correct type before doing the comparison - let ty = get_value_type(value); let expression = Expr::cust( format!( - "(\"{}\".\"{}\"#>>'{{{}}}')::{}", + "\"{}\".\"{}\"#>'{{{}}}'", table_name, column_name, - local_path.join(","), - ty + local_path.join(",") ) .as_str(), ); let expression = Expr::expr(expression); build_expression(expression, value.clone()) }; - expression.into_condition() + Ok(expression.into_condition()) } else { build_recursive(table_name, column_name, local_path, value.clone(), None) } } - }; + }?; condition = condition.add(sub_condition); } - condition + Ok(condition) } else if filter.is_array() { - let mut condition = condition.expect("Invalid metadata filter configuration"); + let mut condition = condition.context("Invalid metadata filter configuration")?; for value in filter.as_array().unwrap() { let local_path = path.clone(); let new_condition = - build_recursive(table_name, column_name, local_path, value.clone(), None); + 
build_recursive(table_name, column_name, local_path, value.clone(), None)?; condition = condition.add(new_condition); } - condition + Ok(condition) } else { - panic!("Invalid metadata filter configuration") + anyhow::bail!("Invalid metadata filter configuration") } } @@ -233,7 +166,7 @@ impl<'a> FilterBuilder<'a> { } } - pub fn build(self) -> Condition { + pub fn build(self) -> anyhow::Result { build_recursive( self.table_name, self.column_name, @@ -276,39 +209,41 @@ mod tests { } #[test] - fn eq_operator() { + fn eq_operator() -> anyhow::Result<()> { let sql = construct_filter_builder_with_json(json!({ "id": {"$eq": 1}, "id2": {"id3": {"$eq": "test"}}, "id4": {"id5": {"id6": {"$eq": true}}}, "id7": {"id8": {"id9": {"id10": {"$eq": [1, 2, 3]}}}} })) - .build() + .build()? .to_valid_sql_query(); assert_eq!( sql, r#"SELECT "id" FROM "test_table" WHERE "test_table"."metadata" @> E'{\"id\":1}' AND "test_table"."metadata" @> E'{\"id2\":{\"id3\":\"test\"}}' AND "test_table"."metadata" @> E'{\"id4\":{\"id5\":{\"id6\":true}}}' AND "test_table"."metadata" @> E'{\"id7\":{\"id8\":{\"id9\":{\"id10\":[1,2,3]}}}}'"# ); + Ok(()) } #[test] - fn ne_operator() { + fn ne_operator() -> anyhow::Result<()> { let sql = construct_filter_builder_with_json(json!({ "id": {"$ne": 1}, "id2": {"id3": {"$ne": "test"}}, "id4": {"id5": {"id6": {"$ne": true}}}, "id7": {"id8": {"id9": {"id10": {"$ne": [1, 2, 3]}}}} })) - .build() + .build()? .to_valid_sql_query(); assert_eq!( sql, r#"SELECT "id" FROM "test_table" WHERE NOT "test_table"."metadata" @> E'{\"id\":1}' AND NOT "test_table"."metadata" @> E'{\"id2\":{\"id3\":\"test\"}}' AND NOT "test_table"."metadata" @> E'{\"id4\":{\"id5\":{\"id6\":true}}}' AND NOT "test_table"."metadata" @> E'{\"id7\":{\"id8\":{\"id9\":{\"id10\":[1,2,3]}}}}'"# ); + Ok(()) } #[test] - fn numeric_comparison_operators() { + fn numeric_comparison_operators() -> anyhow::Result<()> { let basic_comparison_operators = vec![">", ">=", "<", "<="]; let basic_comparison_operators_names = vec!["$gt", "$gte", "$lt", "$lte"]; for (operator, name) in basic_comparison_operators @@ -319,20 +254,22 @@ mod tests { "id": {name: 1}, "id2": {"id3": {name: 1}} })) - .build() + .build()? .to_valid_sql_query(); + println!("{sql}"); assert_eq!( sql, format!( - r##"SELECT "id" FROM "test_table" WHERE ("test_table"."metadata"#>>'{{id}}')::float8 {} 1 AND ("test_table"."metadata"#>>'{{id2,id3}}')::float8 {} 1"##, + r##"SELECT "id" FROM "test_table" WHERE "test_table"."metadata"#>'{{id}}' {} '1' AND "test_table"."metadata"#>'{{id2,id3}}' {} '1'"##, operator, operator ) ); } + Ok(()) } #[test] - fn array_comparison_operators() { + fn array_comparison_operators() -> anyhow::Result<()> { let array_comparison_operators = vec!["IN", "NOT IN"]; let array_comparison_operators_names = vec!["$in", "$nin"]; for (operator, name) in array_comparison_operators @@ -343,68 +280,72 @@ mod tests { "id": {name: [1]}, "id2": {"id3": {name: [1]}} })) - .build() + .build()? 
.to_valid_sql_query(); assert_eq!( sql, format!( - r##"SELECT "id" FROM "test_table" WHERE ("test_table"."metadata"#>>'{{id}}')::float8 {} (1) AND ("test_table"."metadata"#>>'{{id2,id3}}')::float8 {} (1)"##, + r##"SELECT "id" FROM "test_table" WHERE "test_table"."metadata"#>'{{id}}' {} ('1') AND "test_table"."metadata"#>'{{id2,id3}}' {} ('1')"##, operator, operator ) ); } + Ok(()) } #[test] - fn and_operator() { + fn and_operator() -> anyhow::Result<()> { let sql = construct_filter_builder_with_json(json!({ "$and": [ {"id": {"$eq": 1}}, {"id2": {"id3": {"$eq": 1}}} ] })) - .build() + .build()? .to_valid_sql_query(); assert_eq!( sql, r#"SELECT "id" FROM "test_table" WHERE "test_table"."metadata" @> E'{\"id\":1}' AND "test_table"."metadata" @> E'{\"id2\":{\"id3\":1}}'"# ); + Ok(()) } #[test] - fn or_operator() { + fn or_operator() -> anyhow::Result<()> { let sql = construct_filter_builder_with_json(json!({ "$or": [ {"id": {"$eq": 1}}, {"id2": {"id3": {"$eq": 1}}} ] })) - .build() + .build()? .to_valid_sql_query(); assert_eq!( sql, r#"SELECT "id" FROM "test_table" WHERE "test_table"."metadata" @> E'{\"id\":1}' OR "test_table"."metadata" @> E'{\"id2\":{\"id3\":1}}'"# ); + Ok(()) } #[test] - fn not_operator() { + fn not_operator() -> anyhow::Result<()> { let sql = construct_filter_builder_with_json(json!({ "$not": [ {"id": {"$eq": 1}}, {"id2": {"id3": {"$eq": 1}}} ] })) - .build() + .build()? .to_valid_sql_query(); assert_eq!( sql, r#"SELECT "id" FROM "test_table" WHERE NOT ("test_table"."metadata" @> E'{\"id\":1}' AND "test_table"."metadata" @> E'{\"id2\":{\"id3\":1}}')"# ); + Ok(()) } #[test] - fn random_difficult_tests() { + fn random_difficult_tests() -> anyhow::Result<()> { let sql = construct_filter_builder_with_json(json!({ "$and": [ {"$or": [ @@ -415,7 +356,7 @@ mod tests { {"id4": {"$eq": 1}} ] })) - .build() + .build()? .to_valid_sql_query(); assert_eq!( sql, @@ -431,7 +372,7 @@ mod tests { {"id4": {"$eq": 1}} ] })) - .build() + .build()? .to_valid_sql_query(); assert_eq!( sql, @@ -443,11 +384,12 @@ mod tests { {"uuid2": {"$eq": "2"}} ]} })) - .build() + .build()? 
.to_valid_sql_query(); assert_eq!( sql, r#"SELECT "id" FROM "test_table" WHERE "test_table"."metadata" @> E'{\"metadata\":{\"uuid\":\"1\"}}' OR "test_table"."metadata" @> E'{\"metadata\":{\"uuid2\":\"2\"}}'"# ); + Ok(()) } } diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index c0d4cb8e4..148daebe6 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -34,6 +34,7 @@ mod splitter; pub mod transformer_pipeline; pub mod types; mod utils; +mod vector_search_query_builder; // Re-export pub use builtins::Builtins; @@ -238,7 +239,138 @@ mod tests { { "id": i, "title": format!("Test document: {}", i), - "body": format!("Here is the body for test document {}", i), + "body": format!(r#" +Here is the body for test document {} + +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah 
blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather 
interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah 
blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather 
interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler +Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler + + {} + + "#, i, i), "notes": format!("Here are some notes or something for test document {}", i), "metadata": { "uuid": i * 10, @@ -285,7 +417,7 @@ mod tests { internal_init_logger(None, None).ok(); let mut pipeline1 = 
MultiFieldPipeline::new("test_r_p_carps_1", Some(json!({}).into()))?; let mut pipeline2 = MultiFieldPipeline::new("test_r_p_carps_2", Some(json!({}).into()))?; - let mut collection = Collection::new("test_r_c_carps_7", None); + let mut collection = Collection::new("test_r_c_carps_8", None); collection.add_pipeline(&mut pipeline1).await?; collection.add_pipeline(&mut pipeline2).await?; let pipelines = collection.get_pipelines().await?; @@ -301,7 +433,7 @@ mod tests { #[sqlx::test] async fn can_add_pipeline_and_upsert_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_capaud_33"; + let collection_name = "test_r_c_capaud_36"; let pipeline_name = "test_r_p_capaud_6"; let mut pipeline = MultiFieldPipeline::new( pipeline_name, @@ -313,9 +445,11 @@ mod tests { } }, "body": { + "splitter": { + "model": "recursive_character" + }, "embed": { "model": "intfloat/e5-small", - "splitter": "recursive_character" }, "full_text_search": { "configuration": "english" @@ -364,7 +498,7 @@ mod tests { #[sqlx::test] async fn can_upsert_documents_and_add_pipeline() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cudaap_34"; + let collection_name = "test_r_c_cudaap_35"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(2); collection.upsert_documents(documents.clone(), None).await?; @@ -379,9 +513,11 @@ mod tests { } }, "body": { + "splitter": { + "model": "recursive_character" + }, "embed": { "model": "intfloat/e5-small", - "splitter": "recursive_character" }, "full_text_search": { "configuration": "english" @@ -414,23 +550,23 @@ mod tests { .fetch_all(&pool) .await?; assert!(body_chunks.len() == 2); - collection.archive().await?; let tsvectors_table = format!("{}_{}.body_tsvectors", collection_name, pipeline_name); let tsvectors: Vec = sqlx::query_as(&query_builder!("SELECT * FROM %s", tsvectors_table)) .fetch_all(&pool) .await?; assert!(tsvectors.len() == 2); + collection.archive().await?; Ok(()) } #[sqlx::test] async fn can_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cs_44"; + let collection_name = "test_r_c_cs_61"; let mut collection = Collection::new(collection_name, None); - let documents = generate_dummy_documents(10000); - collection.upsert_documents(documents.clone(), None).await?; + // let documents = generate_dummy_documents(10000); + // collection.upsert_documents(documents.clone(), None).await?; let pipeline_name = "test_r_p_cs_7"; let mut pipeline = MultiFieldPipeline::new( pipeline_name, @@ -462,35 +598,41 @@ mod tests { .into(), ), )?; - collection.add_pipeline(&mut pipeline).await?; + // collection.add_pipeline(&mut pipeline).await?; let results = collection .search( json!({ "query": { - // "full_text_search": { - // "title": { - // "query": "test", - // "boost": 4.0 - // }, - // "body": { - // "query": "Test", - // "boost": 1.2 - // } - // }, + "full_text_search": { + "title": { + "query": "test", + "boost": 4.0 + }, + "body": { + "query": "Test", + "boost": 1.2 + } + }, "semantic_search": { "title": { "query": "This is a test", "boost": 2.0 }, - // "body": { - // "query": "This is the body test", - // "boost": 1.01 - // }, - // "notes": { - // "query": "This is the notes test", - // "boost": 1.01 - // } + "body": { + "query": "This is the body test", + "boost": 1.01 + }, + "notes": { + "query": "This is the notes test", + "boost": 1.01 + } + }, + "filter": 
{ + "id": { + "$gt": 1 + } } + }, "limit": 5 }) @@ -505,20 +647,17 @@ mod tests { .collect(); assert_eq!(ids, vec![1, 2, 0, 3, 7]); collection.archive().await?; - // results.into_iter().for_each(|r| { - // println!("{}", serde_json::to_string_pretty(&r.0).unwrap()); - // }); Ok(()) } #[sqlx::test] async fn can_search_with_remote_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cswre_47"; + let collection_name = "test_r_c_cswre_50"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; - let pipeline_name = "test_r_p_cswre_7"; + let pipeline_name = "test_r_p_cswre_8"; let mut pipeline = MultiFieldPipeline::new( pipeline_name, Some( @@ -562,6 +701,11 @@ mod tests { "query": "This is the body test", "boost": 1.01 }, + }, + "filter": { + "id": { + "$gt": 1 + } } }, "limit": 5 @@ -575,18 +719,15 @@ mod tests { .into_iter() .map(|r| r["document"]["id"].as_u64().unwrap()) .collect(); - assert_eq!(ids, vec![1, 2, 3, 4, 0]); + assert_eq!(ids, vec![2, 3, 0, 1, 4]); collection.archive().await?; - // results.into_iter().for_each(|r| { - // println!("{}", serde_json::to_string_pretty(&r.0).unwrap()); - // }); Ok(()) } #[sqlx::test] async fn can_vector_search() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cvs_0"; + let collection_name = "test_r_c_cvs_2"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; @@ -599,6 +740,9 @@ mod tests { "embed": { "model": "intfloat/e5-small" }, + "full_text_search": { + "configuration": "english" + } }, "body": { "embed": { @@ -613,13 +757,24 @@ mod tests { collection.add_pipeline(&mut pipeline).await?; let results = collection .vector_search( - "Test query string", + "Test document: 2", &mut pipeline, Some( json!({ - "fields": [ - "title", "body" - ] + "query": { + "fields": { + "title": { + "full_text_search": "test", + }, + "body": {}, + }, + "filter": { + "id": { + "$lt": 100 + } + } + }, + "limit": 5 }) .into(), ), @@ -648,9 +803,11 @@ mod tests { } }, "body": { + "splitter": { + "model": "recursive_character" + }, "embed": { - "model": "intfloat/e5-small", - "splitter": "recursive_character" + "model": "intfloat/e5-small" }, "full_text_search": { "configuration": "english" @@ -665,10 +822,11 @@ mod tests { .into(), ), )?; - let mut collection = Collection::new("test_r_c_ged_1", None); + let mut collection = Collection::new("test_r_c_ged_2", None); collection.add_pipeline(&mut pipeline).await?; let diagram = collection.generate_er_diagram(&mut pipeline).await?; assert!(!diagram.is_empty()); + println!("{diagram}"); collection.archive().await?; Ok(()) } diff --git a/pgml-sdks/pgml/src/multi_field_pipeline.rs b/pgml-sdks/pgml/src/multi_field_pipeline.rs index 8b32f4acb..451746b12 100644 --- a/pgml-sdks/pgml/src/multi_field_pipeline.rs +++ b/pgml-sdks/pgml/src/multi_field_pipeline.rs @@ -24,13 +24,17 @@ use crate::{model::ModelPython, splitter::SplitterPython, types::JsonPython}; type ParsedSchema = HashMap; +#[derive(Deserialize)] +struct ValidSplitterAction { + model: Option, + parameters: Option, +} + #[derive(Deserialize)] struct ValidEmbedAction { model: String, source: Option, - model_parameters: Option, - splitter: Option, - splitter_parameters: Option, + parameters: Option, hnsw: Option, } @@ -41,6 +45,7 @@ pub struct 
FullTextSearchAction { #[derive(Deserialize)] struct ValidFieldAction { + splitter: Option, embed: Option, full_text_search: Option, } @@ -81,15 +86,20 @@ impl TryFrom for HNSW { } } +#[derive(Debug, Clone)] +pub struct SplitterAction { + pub model: Splitter, +} + #[derive(Debug, Clone)] pub struct EmbedAction { - pub splitter: Option, pub model: Model, pub hnsw: HNSW, } #[derive(Debug, Clone)] pub struct FieldAction { + pub splitter: Option, pub embed: Option, pub full_text_search: Option, } @@ -100,22 +110,23 @@ impl TryFrom for FieldAction { let embed = value .embed .map(|v| { - let model = Model::new(Some(v.model), v.source, v.model_parameters); - let splitter = v - .splitter - .map(|v2| Splitter::new(Some(v2), v.splitter_parameters)); + let model = Model::new(Some(v.model), v.source, v.parameters); let hnsw = v .hnsw .map(|v2| HNSW::try_from(v2)) .unwrap_or_else(|| Ok(HNSW::default()))?; - anyhow::Ok(EmbedAction { - model, - splitter, - hnsw, - }) + anyhow::Ok(EmbedAction { model, hnsw }) + }) + .transpose()?; + let splitter = value + .splitter + .map(|v| { + let splitter = Splitter::new(v.model, v.parameters); + anyhow::Ok(SplitterAction { model: splitter }) }) .transpose()?; Ok(Self { + splitter, embed, full_text_search: value.full_text_search, }) @@ -138,15 +149,6 @@ pub struct MultiFieldPipeline { database_data: Option, } -pub enum PipelineTableTypes { - Embedding, - TSVector, -} - -fn validate_schema(schema: &Json) -> anyhow::Result<()> { - Ok(()) -} - fn json_to_schema(schema: &Json) -> anyhow::Result { schema .as_object() @@ -167,7 +169,7 @@ fn json_to_schema(schema: &Json) -> anyhow::Result { impl MultiFieldPipeline { pub fn new(name: &str, schema: Option) -> anyhow::Result { - let parsed_schema = schema.as_ref().map(|s| json_to_schema(&s)).transpose()?; + let parsed_schema = schema.as_ref().map(|s| json_to_schema(s)).transpose()?; Ok(Self { name: name.to_string(), schema, @@ -203,13 +205,13 @@ impl MultiFieldPipeline { let mut parsed_schema = json_to_schema(&pipeline.schema)?; for (_key, value) in parsed_schema.iter_mut() { + if let Some(splitter) = &mut value.splitter { + splitter.model.set_project_info(project_info.clone()); + splitter.model.verify_in_database(false).await?; + } if let Some(embed) = &mut value.embed { embed.model.set_project_info(project_info.clone()); embed.model.verify_in_database(false).await?; - if let Some(splitter) = &mut embed.splitter { - splitter.set_project_info(project_info.clone()); - splitter.verify_in_database(false).await?; - } } } self.schema = Some(pipeline.schema.clone()); @@ -224,13 +226,13 @@ impl MultiFieldPipeline { let mut parsed_schema = json_to_schema(schema)?; for (_key, value) in parsed_schema.iter_mut() { + if let Some(splitter) = &mut value.splitter { + splitter.model.set_project_info(project_info.clone()); + splitter.model.verify_in_database(false).await?; + } if let Some(embed) = &mut value.embed { embed.model.set_project_info(project_info.clone()); embed.model.verify_in_database(false).await?; - if let Some(splitter) = &mut embed.splitter { - splitter.set_project_info(project_info.clone()); - splitter.verify_in_database(false).await?; - } } } self.parsed_schema = Some(parsed_schema); @@ -277,6 +279,32 @@ impl MultiFieldPipeline { .context("Pipeline must have schema to create_tables")?; for (key, value) in parsed_schema.iter() { + // Create the chunks table + let chunks_table_name = format!("{}.{}_chunks", schema, key); + transaction + .execute( + query_builder!( + queries::CREATE_CHUNKS_TABLE, + chunks_table_name, + 
documents_table_name + ) + .as_str(), + ) + .await?; + let index_name = format!("{}_pipeline_chunk_document_id_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX, + "", + index_name, + chunks_table_name, + "document_id" + ) + .as_str(), + ) + .await?; + if let Some(embed) = &value.embed { let embeddings_table_name = format!("{}.{}_embeddings", schema, key); let exists: bool = sqlx::query_scalar( @@ -305,43 +333,17 @@ impl MultiFieldPipeline { } }; - let chunks_table_name = format!("{}.{}_chunks", schema, key); - - // Create the chunks table - transaction - .execute( - query_builder!( - queries::CREATE_CHUNKS_TABLE, - chunks_table_name, - documents_table_name - ) - .as_str(), - ) - .await?; - let index_name = format!("{}_pipeline_chunk_document_id_index", key); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX, - "", - index_name, - chunks_table_name, - "document_id" - ) - .as_str(), - ) - .await?; - // Create the embeddings table sqlx::query(&query_builder!( queries::CREATE_EMBEDDINGS_TABLE, &embeddings_table_name, chunks_table_name, + documents_table_name, embedding_length )) .execute(&mut *transaction) .await?; - let index_name = format!("{}_pipeline_chunk_id_index", key); + let index_name = format!("{}_pipeline_embedding_chunk_id_index", key); transaction .execute( query_builder!( @@ -354,11 +356,24 @@ impl MultiFieldPipeline { .as_str(), ) .await?; + let index_name = format!("{}_pipeline_embedding_document_id_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX, + "", + index_name, + &embeddings_table_name, + "document_id" + ) + .as_str(), + ) + .await?; let index_with_parameters = format!( "WITH (m = {}, ef_construction = {})", embed.hnsw.m, embed.hnsw.ef_construction ); - let index_name = format!("{}_pipeline_hnsw_vector_index", key); + let index_name = format!("{}_pipeline_embedding_hnsw_vector_index", key); transaction .execute( query_builder!( @@ -381,14 +396,41 @@ impl MultiFieldPipeline { transaction .execute( query_builder!( - queries::CREATE_DOCUMENTS_TSVECTORS_TABLE, + queries::CREATE_CHUNKS_TSVECTORS_TABLE, tsvectors_table_name, + chunks_table_name, documents_table_name ) .as_str(), ) .await?; - let index_name = format!("{}_tsvector_index", key); + let index_name = format!("{}_pipeline_tsvector_chunk_id_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX, + "", + index_name, + tsvectors_table_name, + "chunk_id" + ) + .as_str(), + ) + .await?; + let index_name = format!("{}_pipeline_tsvector_document_id_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX, + "", + index_name, + tsvectors_table_name, + "document_id" + ) + .as_str(), + ) + .await?; + let index_name = format!("{}_pipeline_tsvector_index", key); transaction .execute( query_builder!( @@ -423,15 +465,20 @@ impl MultiFieldPipeline { .context("Pipeline must have schema to execute")?; for (key, value) in parsed_schema.iter() { + let chunk_ids = self + .sync_chunks( + key, + value.splitter.as_ref().map(|v| &v.model), + document_ids, + &mp, + ) + .await?; if let Some(embed) = &value.embed { - let chunk_ids = self - .sync_chunks(key, &embed.splitter, document_ids, &mp) - .await?; self.sync_embeddings(key, &embed.model, &chunk_ids, &mp) .await?; } if let Some(full_text_search) = &value.full_text_search { - self.sync_tsvectors(key, &full_text_search.configuration, document_ids, &mp) + self.sync_tsvectors(key, &full_text_search.configuration, &chunk_ids, &mp) .await?; } } @@ -442,7 +489,7 
@@ impl MultiFieldPipeline { async fn sync_chunks( &self, key: &str, - splitter: &Option, + splitter: Option<&Splitter>, document_ids: &Option>, mp: &MultiProgress, ) -> anyhow::Result> { @@ -627,7 +674,7 @@ impl MultiFieldPipeline { &self, key: &str, configuration: &str, - document_ids: &Option>, + chunk_ids: &Vec, mp: &MultiProgress, ) -> anyhow::Result<()> { let pool = self.get_pool().await?; @@ -642,34 +689,20 @@ impl MultiFieldPipeline { .with_prefix(self.name.clone()) .with_message("Syncing TSVectors for full text search"); - let documents_table_name = format!("{}.documents", project_info.name); + let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); let tsvectors_table_name = format!("{}_{}.{}_tsvectors", project_info.name, self.name, key); - let json_key_query = format!("document->>'{}'", key); let is_done = AtomicBool::new(false); let work = async { - let res = if document_ids.is_some() { - sqlx::query(&query_builder!( - queries::GENERATE_TSVECTORS_FOR_DOCUMENT_IDS, - tsvectors_table_name, - configuration, - json_key_query, - documents_table_name - )) - .bind(document_ids) - .execute(&pool) - .await - } else { - sqlx::query(&query_builder!( - queries::GENERATE_TSVECTORS, - tsvectors_table_name, - configuration, - json_key_query, - documents_table_name - )) - .execute(&pool) - .await - }; + let res = sqlx::query(&query_builder!( + queries::GENERATE_TSVECTORS_FOR_CHUNK_IDS, + tsvectors_table_name, + configuration, + chunks_table_name + )) + .bind(chunk_ids) + .execute(&pool) + .await; is_done.store(true, Relaxed); res.map(|_t| ()).map_err(|e| anyhow::anyhow!(e)) }; @@ -700,11 +733,11 @@ impl MultiFieldPipeline { pub(crate) fn set_project_info(&mut self, project_info: ProjectInfo) { if let Some(parsed_schema) = &mut self.parsed_schema { for (_key, value) in parsed_schema.iter_mut() { + if let Some(splitter) = &mut value.splitter { + splitter.model.set_project_info(project_info.clone()); + } if let Some(embed) = &mut value.embed { embed.model.set_project_info(project_info.clone()); - if let Some(splitter) = &mut embed.splitter { - splitter.set_project_info(project_info.clone()); - } } } } diff --git a/pgml-sdks/pgml/src/queries.rs b/pgml-sdks/pgml/src/queries.rs index 08e7a8d4e..e15094987 100644 --- a/pgml-sdks/pgml/src/queries.rs +++ b/pgml-sdks/pgml/src/queries.rs @@ -71,18 +71,20 @@ CREATE TABLE IF NOT EXISTS %s ( id serial8 PRIMARY KEY, created_at timestamp NOT NULL DEFAULT now(), chunk_id int8 NOT NULL REFERENCES %s ON DELETE CASCADE ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED, + document_id int8 NOT NULL REFERENCES %s, embedding vector(%d) NOT NULL, UNIQUE (chunk_id) ); "#; -pub const CREATE_DOCUMENTS_TSVECTORS_TABLE: &str = r#" +pub const CREATE_CHUNKS_TSVECTORS_TABLE: &str = r#" CREATE TABLE IF NOT EXISTS %s ( id serial8 PRIMARY KEY, created_at timestamp NOT NULL DEFAULT now(), - document_id int8 NOT NULL REFERENCES %s ON DELETE CASCADE ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED, + chunk_id int8 NOT NULL REFERENCES %s ON DELETE CASCADE ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED, + document_id int8 NOT NULL REFERENCES %s, ts tsvector, - UNIQUE (document_id) + UNIQUE (chunk_id) ); "#; @@ -104,53 +106,23 @@ CREATE INDEX %d IF NOT EXISTS %s on %s using hnsw (%d) %d; ///////////////////////////// // Other Big Queries //////// ///////////////////////////// -pub const GENERATE_TSVECTORS: &str = r#" -INSERT INTO %s (document_id, ts) -SELECT - id, - to_tsvector('%d', %d) ts -FROM - %s -ON CONFLICT (document_id) DO NOTHING; -"#; - -pub 
const GENERATE_TSVECTORS_FOR_DOCUMENT_IDS: &str = r#" -INSERT INTO %s (document_id, ts) +pub const GENERATE_TSVECTORS_FOR_CHUNK_IDS: &str = r#" +INSERT INTO %s (chunk_id, document_id, ts) SELECT id, - to_tsvector('%d', %d) ts + document_id, + to_tsvector('%d', chunk) ts FROM %s WHERE id = ANY ($1) -ON CONFLICT (document_id) DO NOTHING; -"#; - -pub const GENERATE_EMBEDDINGS: &str = r#" -INSERT INTO %s (chunk_id, embedding) -SELECT - id, - pgml.embed( - text => chunk, - transformer => $1, - kwargs => $2 - ) -FROM - %s -WHERE - splitter_id = $3 - AND id NOT IN ( - SELECT - chunk_id - from - %s - ) ON CONFLICT (chunk_id) DO NOTHING; "#; pub const GENERATE_EMBEDDINGS_FOR_CHUNK_IDS: &str = r#" -INSERT INTO %s (chunk_id, embedding) +INSERT INTO %s (chunk_id, document_id, embedding) SELECT id, + document_id, pgml.embed( text => chunk, transformer => $1, @@ -266,7 +238,7 @@ FROM ) AS documents ) chunks ON CONFLICT (document_id, chunk_index) DO NOTHING -RETURNING id +RETURNING id, document_id "#; pub const GENERATE_CHUNKS_FOR_DOCUMENT_IDS: &str = r#" diff --git a/pgml-sdks/pgml/src/query_builder.rs b/pgml-sdks/pgml/src/query_builder.rs index 11b2405e8..5ebc7ef8a 100644 --- a/pgml-sdks/pgml/src/query_builder.rs +++ b/pgml-sdks/pgml/src/query_builder.rs @@ -71,7 +71,9 @@ impl QueryBuilder { #[instrument(skip(self))] fn filter_metadata(mut self, filter: serde_json::Value) -> Self { - let filter = filter_builder::FilterBuilder::new(filter, "documents", "metadata").build(); + let filter = filter_builder::FilterBuilder::new(filter, "documents", "metadata") + .build() + .expect("Error building filter"); self.query.cond_where(filter); self } diff --git a/pgml-sdks/pgml/src/remote_embeddings.rs b/pgml-sdks/pgml/src/remote_embeddings.rs index e963b3c0f..54c7d2828 100644 --- a/pgml-sdks/pgml/src/remote_embeddings.rs +++ b/pgml-sdks/pgml/src/remote_embeddings.rs @@ -115,11 +115,19 @@ pub trait RemoteEmbeddings<'a> { let embeddings = self.embed(chunk_texts).await?; let query_string_values = (0..embeddings.len()) - .map(|i| format!("(${}, ${})", i * 2 + 1, i * 2 + 2)) + .map(|i| { + query_builder!( + "($%d, $%d, (SELECT document_id FROM %s WHERE id = $%d))", + i * 2 + 1, + i * 2 + 2, + chunks_table_name, + i * 2 + 1 + ) + }) .collect::>() .join(","); let query_string = format!( - "INSERT INTO %s (chunk_id, embedding) VALUES {}", + "INSERT INTO %s (chunk_id, embedding, document_id) VALUES {}", query_string_values ); diff --git a/pgml-sdks/pgml/src/search_query_builder.rs b/pgml-sdks/pgml/src/search_query_builder.rs index 7c03e590b..1e6f093b6 100644 --- a/pgml-sdks/pgml/src/search_query_builder.rs +++ b/pgml-sdks/pgml/src/search_query_builder.rs @@ -10,6 +10,7 @@ use sea_query_binder::{SqlxBinder, SqlxValues}; use crate::{ collection::Collection, + filter_builder::FilterBuilder, model::ModelRuntime, models, multi_field_pipeline::MultiFieldPipeline, @@ -31,14 +32,15 @@ struct ValidMatchAction { } #[derive(Debug, Deserialize)] -struct ValidQueryAction { +struct ValidQueryActions { full_text_search: Option>, semantic_search: Option>, + filter: Option, } #[derive(Debug, Deserialize)] struct ValidQuery { - query: ValidQueryAction, + query: ValidQueryActions, limit: Option, } @@ -53,8 +55,9 @@ pub async fn build_search_query( let pipeline_table = format!("{}.pipelines", collection.name); let documents_table = format!("{}.documents", collection.name); + let mut query = Query::select(); + let mut score_table_names = Vec::new(); let mut with_clause = WithClause::new(); - let mut sub_query = Query::select(); let mut 
sum_expression: Option = None; let mut pipeline_cte = Query::select(); @@ -88,6 +91,10 @@ pub async fn build_search_query( .transpose()? .unwrap_or(ModelRuntime::Python); + // Build the CTE we actually use later + let embeddings_table = format!("{}_{}.{}_embeddings", collection.name, pipeline.name, key); + let cte_name = format!("{key}_embedding_score"); + let mut score_cte = Query::select(); match model_runtime { ModelRuntime::Python => { // Build the embedding CTE @@ -106,19 +113,12 @@ pub async fn build_search_query( embedding_cte.table_name(Alias::new(format!("{key}_embedding"))); with_clause.cte(embedding_cte); - // Add to the sum expression - let boost = vsa.boost.unwrap_or(1.); - sum_expression = if let Some(expr) = sum_expression { - Some(expr.add(Expr::cust(format!( - // r#"((1 - MIN("{key}_embeddings".embedding <=> (SELECT embedding FROM "{key}_embedding")::vector)) * {boost})"# - r#"(MIN("{key}_embeddings".embedding <=> (SELECT embedding FROM "{key}_embedding")::vector))"# - )))) - } else { - Some(Expr::cust(format!( - // r#"((1 - MIN("{key}_embeddings".embedding <=> (SELECT embedding FROM "{key}_embedding")::vector)) * {boost})"# - r#"(MIN("{key}_embeddings".embedding <=> (SELECT embedding FROM "{key}_embedding")::vector))"# + // Build the score CTE + score_cte + .column((SIden::Str("embeddings"), SIden::Str("document_id"))) + .expr(Expr::cust(format!( + r#"MIN(embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector) AS score"# ))) - }; } ModelRuntime::OpenAI => { // We can unwrap here as we know this is all set from above @@ -144,115 +144,149 @@ pub async fn build_search_query( std::mem::take(&mut embeddings[0]) }; - // Add to the sum expression - let boost = vsa.boost.unwrap_or(1.); - sum_expression = if let Some(expr) = sum_expression { - Some(expr.add(Expr::cust_with_values( - format!( - // r#"((1 - MIN("{key}_embeddings".embedding <=> $1::vector)) * {boost})"#, - r#"(MIN("{key}_embeddings".embedding <=> $1::vector))"#, - ), - [embedding], - ))) - } else { - Some(Expr::cust_with_values( - format!( - r#"(MIN("{key}_embeddings".embedding <=> $1::vector))"# // r#"((1 - MIN("{key}_embeddings".embedding <=> $1::vector)) * {boost})"# - ), + // Build the score CTE + score_cte + .column((SIden::Str("embeddings"), SIden::Str("document_id"))) + .expr(Expr::cust_with_values( + r#"MIN(embeddings.embedding <=> $1::vector) AS score"#, [embedding], )) - }; } + }; + + score_cte + .from_as(embeddings_table.to_table_tuple(), Alias::new("embeddings")) + .group_by_col((SIden::Str("embeddings"), SIden::Str("id"))) + .limit(limit); + + if let Some(filter) = &valid_query.query.filter { + let filter = FilterBuilder::new(filter.clone().0, "documents", "document").build()?; + score_cte.cond_where(filter); + score_cte.join_as( + JoinType::InnerJoin, + documents_table.to_table_tuple(), + Alias::new("documents"), + Expr::col((SIden::Str("documents"), SIden::Str("id"))) + .equals((SIden::Str("embeddings"), SIden::Str("document_id"))), + ); } - // Do the proper inner joins - let chunks_table = format!("{}_{}.{}_chunks", collection.name, pipeline.name, key); - let embeddings_table = format!("{}_{}.{}_embeddings", collection.name, pipeline.name, key); - sub_query.join_as( - JoinType::InnerJoin, - chunks_table.to_table_tuple(), - Alias::new(format!("{key}_chunks")), - Expr::col(( - SIden::String(format!("{key}_chunks")), - SIden::Str("document_id"), - )) - .equals((SIden::Str("documents"), SIden::Str("id"))), - ); - sub_query.join_as( - JoinType::InnerJoin, - 
embeddings_table.to_table_tuple(), - Alias::new(format!("{key}_embeddings")), - Expr::col(( - SIden::String(format!("{key}_embeddings")), - SIden::Str("chunk_id"), - )) - .equals((SIden::String(format!("{key}_chunks")), SIden::Str("id"))), - ); + let mut score_cte = CommonTableExpression::from_select(score_cte); + score_cte.table_name(Alias::new(&cte_name)); + with_clause.cte(score_cte); + + // Add to the sum expression + let boost = vsa.boost.unwrap_or(1.); + sum_expression = if let Some(expr) = sum_expression { + Some(expr.add(Expr::cust(format!( + r#"COALESCE((1 - "{cte_name}".score) * {boost}, 0.0)"# + )))) + } else { + Some(Expr::cust(format!( + r#"COALESCE((1 - "{cte_name}".score) * {boost}, 0.0)"# + ))) + }; + score_table_names.push(cte_name); } for (key, vma) in valid_query.query.full_text_search.unwrap_or_default() { let full_text_table = format!("{}_{}.{}_tsvectors", collection.name, pipeline.name, key); - // Inner join the tsvectors table - sub_query.join_as( - JoinType::InnerJoin, - full_text_table.to_table_tuple(), - Alias::new(format!("{key}_tsvectors")), - Expr::col(( - SIden::String(format!("{key}_tsvectors")), - SIden::Str("document_id"), + // Build the score CTE + let cte_name = format!("{key}_tsvectors_score"); + let mut score_cte = Query::select(); + score_cte + .column(SIden::Str("document_id")) + .expr_as( + Expr::cust_with_values( + format!( + r#"MAX(ts_rank(tsvectors.ts, plainto_tsquery((SELECT oid FROM pg_ts_config WHERE cfgname = (SELECT schema #>> '{{{key},full_text_search,configuration}}' FROM pipeline)), $1), 32))"#, + ), + [&vma.query], + ), + Alias::new("score") + ) + .from_as( + full_text_table.to_table_tuple(), + Alias::new("tsvectors"), + ) + .and_where(Expr::cust_with_values( + format!( + r#"tsvectors.ts @@ plainto_tsquery((SELECT oid FROM pg_ts_config WHERE cfgname = (SELECT schema #>> '{{{key},full_text_search,configuration}}' FROM pipeline)), $1)"#, + ), + [&vma.query], )) - .equals((SIden::Str("documents"), SIden::Str("id"))), - ); - - // TODO: Maybe add this?? 
- // Do the proper where statement - // sub_query.and_where(Expr::cust_with_values( - // format!( - // r#""{key}_tsvectors".ts @@ plainto_tsquery((SELECT oid FROM pg_ts_config WHERE cfgname = (SELECT schema #>> '{{{key},full_text_search,configuration}}' FROM pipeline)), $1)"#, - // ), - // [&vma.query], - // )); + .group_by_col(SIden::Str("document_id")) + .limit(limit); + let mut score_cte = CommonTableExpression::from_select(score_cte); + score_cte.table_name(Alias::new(&cte_name)); + with_clause.cte(score_cte); // Add to the sum expression - let boost = vma.boost.unwrap_or(1.); + let boost = vma.boost.unwrap_or(1.0); sum_expression = if let Some(expr) = sum_expression { - Some(expr.add(Expr::cust_with_values(format!( - r#"(MAX(ts_rank("{key}_tsvectors".ts, plainto_tsquery((SELECT oid FROM pg_ts_config WHERE cfgname = (SELECT schema #>> '{{{key},full_text_search,configuration}}' FROM pipeline)), $1), 32)) * {boost})"#, - ), - [vma.query] - ))) + Some(expr.add(Expr::cust(format!( + r#"COALESCE("{cte_name}".score * {boost}, 0.0)"# + )))) } else { - Some(Expr::cust_with_values( - format!( - r#"(MAX(ts_rank("{key}_tsvectors".ts, plainto_tsquery((SELECT oid FROM pg_ts_config WHERE cfgname = (SELECT schema #>> '{{{key},full_text_search,configuration}}' FROM pipeline)), $1), 32)) * {boost})"#, - ), - [vma.query], - )) + Some(Expr::cust(format!( + r#"COALESCE("{cte_name}".score * {boost}, 0.0)"# + ))) }; + score_table_names.push(cte_name); } - // Finalize the sub query - sub_query - .column((SIden::Str("documents"), SIden::Str("document"))) - .expr_as(sum_expression.unwrap(), Alias::new("score")) - .from_as(documents_table.to_table_tuple(), Alias::new("documents")) - .group_by_col((SIden::Str("documents"), SIden::Str("id"))) - .order_by(SIden::Str("score"), Order::Desc) - .limit(limit); + let query = if let Some(select_from) = score_table_names.first() { + let score_table_names_e: Vec = score_table_names + .clone() + .into_iter() + .map(|t| Expr::col((SIden::String(t), SIden::Str("document_id"))).into()) + .collect(); + for i in 1..score_table_names_e.len() { + query.full_outer_join( + SIden::String(score_table_names[i].to_string()), + Expr::col(( + SIden::String(score_table_names[i].to_string()), + SIden::Str("document_id"), + )) + .eq(Func::coalesce(score_table_names_e[0..i].to_vec())), + ); + } + let id_select_expression = Func::coalesce(score_table_names_e); + + let sum_expression = sum_expression + .context("query requires some scoring through full_text_search or semantic_search")?; + query + .expr_as(id_select_expression, Alias::new("id")) + .expr_as(sum_expression, Alias::new("score")) + .column(SIden::Str("document")) + .from(SIden::String(select_from.to_string())) + .join_as( + JoinType::InnerJoin, + documents_table.to_table_tuple(), + Alias::new("documents"), + Expr::col((SIden::Str("documents"), SIden::Str("id"))).equals(SIden::Str("id")), + ) + .limit(limit) + .order_by(SIden::Str("score"), Order::Desc); - // Combine to make the real query - let mut sql_query = Query::select(); - sql_query - .expr(Expr::cust("json_array_elements(json_agg(q))")) - .from_subquery(sub_query, Alias::new("q")); + let mut combined_query = Query::select(); + combined_query + .expr(Expr::cust("json_array_elements(json_agg(q))")) + .from_subquery(query, Alias::new("q")); + combined_query + } else { + // TODO: Maybe let users filter documents only here? 
+ anyhow::bail!("If you are only looking to filter documents checkout the `get_documents` method on the Collection") + }; - let query_string = sql_query + // TODO: Remove this + let query_string = query .clone() .with(with_clause.clone()) .to_string(PostgresQueryBuilder); - println!("{}", query_string); + println!("\nTHE QUERY: \n{query_string}\n"); - let (sql, values) = sql_query.with(with_clause).build_sqlx(PostgresQueryBuilder); + let (sql, values) = query.with(with_clause).build_sqlx(PostgresQueryBuilder); Ok((sql, values)) } diff --git a/pgml-sdks/pgml/src/vector_search_query_builder.rs b/pgml-sdks/pgml/src/vector_search_query_builder.rs new file mode 100644 index 000000000..3dbb7c468 --- /dev/null +++ b/pgml-sdks/pgml/src/vector_search_query_builder.rs @@ -0,0 +1,245 @@ +use anyhow::Context; +use serde::Deserialize; +use std::collections::HashMap; + +use sea_query::{ + Alias, CommonTableExpression, Expr, Func, JoinType, Order, PostgresQueryBuilder, Query, + QueryStatementWriter, SimpleExpr, WithClause, +}; +use sea_query_binder::{SqlxBinder, SqlxValues}; + +use crate::{ + collection::Collection, + filter_builder::FilterBuilder, + model::ModelRuntime, + models, + multi_field_pipeline::MultiFieldPipeline, + remote_embeddings::build_remote_embeddings, + types::{IntoTableNameAndSchema, Json, SIden}, +}; + +#[derive(Debug, Deserialize)] +struct ValidFullTextSearchAction { + configuration: String, + text: String, +} + +#[derive(Debug, Deserialize)] +struct ValidField { + model_parameters: Option, + full_text_search: Option, +} + +#[derive(Debug, Deserialize)] +struct ValidQueryActions { + fields: Option>, + filter: Option, +} + +#[derive(Debug, Deserialize)] +struct ValidQuery { + query: ValidQueryActions, + limit: Option, +} + +pub async fn build_vector_search_query( + query_text: &str, + collection: &Collection, + query: Json, + pipeline: &MultiFieldPipeline, +) -> anyhow::Result<(String, SqlxValues)> { + let valid_query: ValidQuery = serde_json::from_value(query.0)?; + let limit = valid_query.limit.unwrap_or(10); + let fields = valid_query.query.fields.unwrap_or_default(); + + if fields.is_empty() { + anyhow::bail!("at least one field is required to search over") + } + + let pipeline_table = format!("{}.pipelines", collection.name); + let documents_table = format!("{}.documents", collection.name); + + let mut queries = Vec::new(); + let mut with_clause = WithClause::new(); + + let mut pipeline_cte = Query::select(); + pipeline_cte + .from(pipeline_table.to_table_tuple()) + .columns([models::PipelineIden::Schema]) + .and_where(Expr::col(models::PipelineIden::Name).eq(&pipeline.name)); + let mut pipeline_cte = CommonTableExpression::from_select(pipeline_cte); + pipeline_cte.table_name(Alias::new("pipeline")); + with_clause.cte(pipeline_cte); + + for (key, vf) in fields { + let model_runtime = pipeline + .parsed_schema + .as_ref() + .map(|s| { + // Any of these errors means they have a malformed query + anyhow::Ok( + s.get(&key) + .as_ref() + .context(format!("Bad query - {key} does not exist in schema"))? + .embed + .as_ref() + .context(format!( + "Bad query - {key} does not have any directive to embed" + ))? + .model + .runtime, + ) + }) + .transpose()? 
+ .unwrap_or(ModelRuntime::Python); + + let chunks_table = format!("{}_{}.{}_chunks", collection.name, pipeline.name, key); + let embeddings_table = format!("{}_{}.{}_embeddings", collection.name, pipeline.name, key); + + let mut query = Query::select(); + + match model_runtime { + ModelRuntime::Python => { + // Build the embedding CTE + let mut embedding_cte = Query::select(); + embedding_cte.expr_as( + Func::cust(SIden::Str("pgml.embed")).args([ + Expr::cust(format!( + "transformer => (SELECT schema #>> '{{{key},embed,model}}' FROM pipeline)", + )), + Expr::cust_with_values("text => $1", [query_text]), + Expr::cust(format!("kwargs => COALESCE((SELECT schema #> '{{{key},embed,model_parameters}}' FROM pipeline), '{{}}'::jsonb)")), + ]), + Alias::new("embedding"), + ); + let mut embedding_cte = CommonTableExpression::from_select(embedding_cte); + embedding_cte.table_name(Alias::new(format!("{key}_embedding"))); + with_clause.cte(embedding_cte); + + query + .expr(Expr::cust(format!( + r#"1 - (embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector) AS score"# + ))) + .order_by_expr(Expr::cust(format!( + r#"embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector"# + )), Order::Asc); + } + ModelRuntime::OpenAI => { + // We can unwrap here as we know this is all set from above + let model = &pipeline + .parsed_schema + .as_ref() + .unwrap() + .get(&key) + .unwrap() + .embed + .as_ref() + .unwrap() + .model; + + // Get the remote embedding + let embedding = { + let remote_embeddings = build_remote_embeddings( + model.runtime, + &model.name, + vf.model_parameters.as_ref(), + )?; + let mut embeddings = remote_embeddings + .embed(vec![query_text.to_string()]) + .await?; + std::mem::take(&mut embeddings[0]) + }; + + // Build the score CTE + query + .expr(Expr::cust_with_values( + r#"1 - (embeddings.embedding <=> $1::vector) AS score"#, + [embedding.clone()], + )) + .order_by_expr( + Expr::cust_with_values( + r#"embeddings.embedding <=> $1::vector"#, + [embedding], + ), + Order::Asc, + ); + } + } + + query + .column((SIden::Str("embeddings"), SIden::Str("document_id"))) + .column((SIden::Str("chunks"), SIden::Str("chunk"))) + .from_as(embeddings_table.to_table_tuple(), Alias::new("embeddings")) + .join_as( + JoinType::InnerJoin, + chunks_table.to_table_tuple(), + Alias::new("chunks"), + Expr::col((SIden::Str("chunks"), SIden::Str("id"))) + .equals((SIden::Str("embeddings"), SIden::Str("chunk_id"))), + ) + .limit(limit); + + if let Some(filter) = &valid_query.query.filter { + let filter = FilterBuilder::new(filter.clone().0, "documents", "document").build()?; + query.cond_where(filter); + query.join_as( + JoinType::InnerJoin, + documents_table.to_table_tuple(), + Alias::new("documents"), + Expr::col((SIden::Str("documents"), SIden::Str("id"))) + .equals((SIden::Str("embeddings"), SIden::Str("document_id"))), + ); + } + + if let Some(full_text_search) = &vf.full_text_search { + let full_text_table = + format!("{}_{}.{}_tsvectors", collection.name, pipeline.name, key); + query + .and_where(Expr::cust_with_values( + format!( + r#"tsvectors.ts @@ plainto_tsquery((SELECT oid FROM pg_ts_config WHERE cfgname = (SELECT schema #>> '{{{key},full_text_search,configuration}}' FROM pipeline)), $1)"#, + ), + [full_text_search], + )) + .join_as( + JoinType::InnerJoin, + full_text_table.to_table_tuple(), + Alias::new("tsvectors"), + Expr::col((SIden::Str("tsvectors"), SIden::Str("chunk_id"))) + .equals((SIden::Str("embeddings"), SIden::Str("chunk_id"))) + ); + } + + let mut 
wrapper_query = Query::select(); + wrapper_query + .columns([ + SIden::Str("document_id"), + SIden::Str("chunk"), + SIden::Str("score"), + ]) + .from_subquery(query, Alias::new("s")); + + queries.push(wrapper_query); + } + + // Union all of the queries together + let mut query = queries.pop().context("no query")?; + for q in queries.into_iter() { + query.union(sea_query::UnionType::All, q); + } + + // Resort and limit + query + .order_by(SIden::Str("score"), Order::Desc) + .limit(limit); + + // TODO: Remove this + let query_string = query + .clone() + .with(with_clause.clone()) + .to_string(PostgresQueryBuilder); + println!("\nTHE QUERY: \n{query_string}\n"); + + let (sql, values) = query.with(with_clause).build_sqlx(PostgresQueryBuilder); + Ok((sql, values)) +} From f9cb8a1bf9c0c216dab9df7812096e17a01f873f Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Tue, 16 Jan 2024 16:09:04 -0800 Subject: [PATCH 03/72] Cleaned tests and remote fallback working for search and vector_search --- pgml-sdks/pgml/Cargo.lock | 2 +- pgml-sdks/pgml/src/collection.rs | 238 +++++++-------- pgml-sdks/pgml/src/lib.rs | 276 ++++++++---------- pgml-sdks/pgml/src/search_query_builder.rs | 53 +++- .../pgml/src/vector_search_query_builder.rs | 19 +- 5 files changed, 270 insertions(+), 318 deletions(-) diff --git a/pgml-sdks/pgml/Cargo.lock b/pgml-sdks/pgml/Cargo.lock index 131380b9d..a78a3f0a3 100644 --- a/pgml-sdks/pgml/Cargo.lock +++ b/pgml-sdks/pgml/Cargo.lock @@ -1439,7 +1439,7 @@ checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" [[package]] name = "pgml" -version = "0.10.0" +version = "0.10.1" dependencies = [ "anyhow", "async-trait", diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index e414ed62a..1f1202a9e 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -15,6 +15,7 @@ use std::time::SystemTime; use tracing::{instrument, warn}; use walkdir::WalkDir; +use crate::search_query_builder::build_search_query; use crate::vector_search_query_builder::build_vector_search_query; use crate::{ filter_builder, get_or_initialize_pool, @@ -712,15 +713,42 @@ impl Collection { #[instrument(skip(self))] pub async fn search( - &self, + &mut self, query: Json, - pipeline: &MultiFieldPipeline, + pipeline: &mut MultiFieldPipeline, ) -> anyhow::Result> { let pool = get_or_initialize_pool(&self.database_url).await?; - let (query, values) = - crate::search_query_builder::build_search_query(self, query, pipeline).await?; - let results: Vec<(Json,)> = sqlx::query_as_with(&query, values).fetch_all(&pool).await?; - Ok(results.into_iter().map(|r| r.0).collect()) + let (built_query, values) = build_search_query(self, query.clone(), pipeline).await?; + let results: Result, _> = sqlx::query_as_with(&built_query, values) + .fetch_all(&pool) + .await; + + match results { + Ok(r) => Ok(r.into_iter().map(|r| r.0).collect()), + Err(e) => match e.as_database_error() { + Some(d) => { + if d.code() == Some(Cow::from("XX000")) { + self.verify_in_database(false).await?; + let project_info = &self + .database_data + .as_ref() + .context("Database data must be set to do remote embeddings search")? 
+ .project_info; + pipeline.set_project_info(project_info.to_owned()); + pipeline.verify_in_database(false).await?; + let (built_query, values) = + build_search_query(self, query, pipeline).await?; + let results: Vec<(Json,)> = sqlx::query_as_with(&built_query, values) + .fetch_all(&pool) + .await?; + Ok(results.into_iter().map(|r| r.0).collect()) + } else { + Err(anyhow::anyhow!(e)) + } + } + None => Err(anyhow::anyhow!(e)), + }, + } } /// Performs vector search on the [Collection] @@ -752,142 +780,72 @@ impl Collection { pipeline: &mut MultiFieldPipeline, query_parameters: Option, top_k: Option, - ) -> anyhow::Result> { + ) -> anyhow::Result> { let pool = get_or_initialize_pool(&self.database_url).await?; - let (query, sqlx_values) = - build_vector_search_query(query, self, query_parameters.unwrap_or_default(), pipeline) - .await?; - - // With this system, we only do the wrong type of vector search once - // let runtime = if pipeline.model.is_some() { - // pipeline.model.as_ref().unwrap().runtime - // } else { - // ModelRuntime::Python - // }; - - unimplemented!() - - // let pool = get_or_initialize_pool(&self.database_url).await?; - - // let query_parameters = query_parameters.unwrap_or_default(); - // let top_k = top_k.unwrap_or(5); - - // // With this system, we only do the wrong type of vector search once - // let runtime = if pipeline.model.is_some() { - // pipeline.model.as_ref().unwrap().runtime - // } else { - // ModelRuntime::Python - // }; - // match runtime { - // ModelRuntime::Python => { - // let embeddings_table_name = format!("{}.{}_embeddings", self.name, pipeline.name); - - // let result = sqlx::query_as(&query_builder!( - // queries::EMBED_AND_VECTOR_SEARCH, - // self.pipelines_table_name, - // embeddings_table_name, - // self.chunks_table_name, - // self.documents_table_name - // )) - // .bind(&pipeline.name) - // .bind(query) - // .bind(&query_parameters) - // .bind(top_k) - // .fetch_all(&pool) - // .await; - - // match result { - // Ok(r) => Ok(r), - // Err(e) => match e.as_database_error() { - // Some(d) => { - // if d.code() == Some(Cow::from("XX000")) { - // self.vector_search_with_remote_embeddings( - // query, - // pipeline, - // query_parameters, - // top_k, - // &pool, - // ) - // .await - // } else { - // Err(anyhow::anyhow!(e)) - // } - // } - // None => Err(anyhow::anyhow!(e)), - // }, - // } - // } - // _ => { - // self.vector_search_with_remote_embeddings( - // query, - // pipeline, - // query_parameters, - // top_k, - // &pool, - // ) - // .await - // } - // } - // .map(|r| { - // r.into_iter() - // .map(|(score, id, metadata)| (1. - score, id, metadata)) - // .collect() - // }) - } - - #[instrument(skip(self, pool))] - #[allow(clippy::type_complexity)] - async fn vector_search_with_remote_embeddings( - &mut self, - query: &str, - pipeline: &mut Pipeline, - query_parameters: Json, - top_k: i64, - pool: &PgPool, - ) -> anyhow::Result> { - // TODO: Make this actually work maybe an alias for the new search or something idk - unimplemented!() - - // self.verify_in_database(false).await?; - - // // Have to set the project info before we can get and set the model - // pipeline.set_project_info( - // self.database_data - // .as_ref() - // .context( - // "Collection must be verified to perform vector search with remote embeddings", - // )? 
- // .project_info - // .clone(), - // ); - // // Verify to get and set the model if we don't have it set on the pipeline yet - // pipeline.verify_in_database(false).await?; - // let model = pipeline - // .model - // .as_ref() - // .context("Pipeline must be verified to perform vector search with remote embeddings")?; - - // // We need to make sure we are not mutably and immutably borrowing the same things - // let embedding = { - // let remote_embeddings = - // build_remote_embeddings(model.runtime, &model.name, &query_parameters)?; - // let mut embeddings = remote_embeddings.embed(vec![query.to_string()]).await?; - // std::mem::take(&mut embeddings[0]) - // }; - - // let embeddings_table_name = format!("{}.{}_embeddings", self.name, pipeline.name); - // sqlx::query_as(&query_builder!( - // queries::VECTOR_SEARCH, - // embeddings_table_name, - // self.chunks_table_name, - // self.documents_table_name - // )) - // .bind(embedding) - // .bind(top_k) - // .fetch_all(pool) - // .await - // .map_err(|e| anyhow::anyhow!(e)) + let (built_query, values) = build_vector_search_query( + query, + self, + query_parameters.clone().unwrap_or_default(), + pipeline, + ) + .await?; + let results: Result, _> = + sqlx::query_as_with(&built_query, values) + .fetch_all(&pool) + .await; + match results { + Ok(r) => Ok(r + .into_iter() + .map(|v| { + serde_json::json!({ + "document": v.0, + "chunk": v.1, + "score": v.2 + }) + .into() + }) + .collect()), + Err(e) => match e.as_database_error() { + Some(d) => { + if d.code() == Some(Cow::from("XX000")) { + self.verify_in_database(false).await?; + let project_info = &self + .database_data + .as_ref() + .context("Database data must be set to do remote embeddings search")? + .project_info; + pipeline.set_project_info(project_info.to_owned()); + pipeline.verify_in_database(false).await?; + let (built_query, values) = build_vector_search_query( + query, + self, + query_parameters.clone().unwrap_or_default(), + pipeline, + ) + .await?; + let results: Vec<(Json, String, f64)> = + sqlx::query_as_with(&built_query, values) + .fetch_all(&pool) + .await?; + Ok(results + .into_iter() + .map(|v| { + serde_json::json!({ + "document": v.0, + "chunk": v.1, + "score": v.2 + }) + .into() + }) + .collect()) + } else { + Err(anyhow::anyhow!(e)) + } + } + None => Err(anyhow::anyhow!(e)), + }, + } } #[instrument(skip(self))] diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 148daebe6..96e1318cc 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -229,148 +229,24 @@ fn main(mut cx: neon::context::ModuleContext) -> neon::result::NeonResult<()> { mod tests { use super::*; use crate::types::Json; - use itertools::assert_equal; use serde_json::json; fn generate_dummy_documents(count: usize) -> Vec { let mut documents = Vec::new(); for i in 0..count { + let body_text = vec![format!( + "Here is some text that we will end up splitting on! 
{i}" + )] + .into_iter() + .cycle() + .take(100) + .collect::>() + .join("\n"); let document = serde_json::json!( { "id": i, "title": format!("Test document: {}", i), - "body": format!(r#" -Here is the body for test document {} - -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah 
blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather 
interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah 
blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather 
interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler -Here is some more text this is rather interesting honestly but I am unsure what to say bout this blah blah blah filler filler filler - - {} - - "#, i, i), + "body": body_text, "notes": format!("Here are some notes or something for test document {}", i), "metadata": { "uuid": i * 10, @@ -417,7 +293,7 @@ Here is some more text this is rather interesting honestly but I am unsure what internal_init_logger(None, None).ok(); let mut pipeline1 = MultiFieldPipeline::new("test_r_p_carps_1", Some(json!({}).into()))?; let mut pipeline2 = MultiFieldPipeline::new("test_r_p_carps_2", Some(json!({}).into()))?; - let mut collection = Collection::new("test_r_c_carps_8", None); + let mut collection = Collection::new("test_r_c_carps_9", None); collection.add_pipeline(&mut pipeline1).await?; collection.add_pipeline(&mut pipeline2).await?; let pipelines = collection.get_pipelines().await?; @@ -498,7 +374,7 @@ Here is 
some more text this is rather interesting honestly but I am unsure what #[sqlx::test] async fn can_upsert_documents_and_add_pipeline() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cudaap_35"; + let collection_name = "test_r_c_cudaap_38"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(2); collection.upsert_documents(documents.clone(), None).await?; @@ -549,13 +425,13 @@ Here is some more text this is rather interesting honestly but I am unsure what sqlx::query_as(&query_builder!("SELECT * FROM %s", chunks_table)) .fetch_all(&pool) .await?; - assert!(body_chunks.len() == 2); + assert!(body_chunks.len() == 4); let tsvectors_table = format!("{}_{}.body_tsvectors", collection_name, pipeline_name); let tsvectors: Vec = sqlx::query_as(&query_builder!("SELECT * FROM %s", tsvectors_table)) .fetch_all(&pool) .await?; - assert!(tsvectors.len() == 2); + assert!(tsvectors.len() == 4); collection.archive().await?; Ok(()) } @@ -563,11 +439,11 @@ Here is some more text this is rather interesting honestly but I am unsure what #[sqlx::test] async fn can_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cs_61"; + let collection_name = "test_r_c_cs_67"; let mut collection = Collection::new(collection_name, None); - // let documents = generate_dummy_documents(10000); - // collection.upsert_documents(documents.clone(), None).await?; - let pipeline_name = "test_r_p_cs_7"; + let documents = generate_dummy_documents(10); + collection.upsert_documents(documents.clone(), None).await?; + let pipeline_name = "test_r_p_cs_9"; let mut pipeline = MultiFieldPipeline::new( pipeline_name, Some( @@ -581,9 +457,11 @@ Here is some more text this is rather interesting honestly but I am unsure what } }, "body": { + "splitter": { + "model": "recursive_character" + }, "embed": { - "model": "intfloat/e5-small", - "splitter": "recursive_character" + "model": "intfloat/e5-small" }, "full_text_search": { "configuration": "english" @@ -598,14 +476,14 @@ Here is some more text this is rather interesting honestly but I am unsure what .into(), ), )?; - // collection.add_pipeline(&mut pipeline).await?; + collection.add_pipeline(&mut pipeline).await?; let results = collection .search( json!({ "query": { "full_text_search": { "title": { - "query": "test", + "query": "test 9", "boost": 4.0 }, "body": { @@ -637,15 +515,14 @@ Here is some more text this is rather interesting honestly but I am unsure what "limit": 5 }) .into(), - &pipeline, + &mut pipeline, ) .await?; - assert!(results.len() == 5); let ids: Vec = results .into_iter() .map(|r| r["document"]["id"].as_u64().unwrap()) .collect(); - assert_eq!(ids, vec![1, 2, 0, 3, 7]); + assert_eq!(ids, vec![3, 8, 2, 7, 4]); collection.archive().await?; Ok(()) } @@ -653,7 +530,7 @@ Here is some more text this is rather interesting honestly but I am unsure what #[sqlx::test] async fn can_search_with_remote_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cswre_50"; + let collection_name = "test_r_c_cswre_51"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; @@ -668,10 +545,12 @@ Here is some more text this is rather interesting honestly but I am unsure what } }, "body": { + "splitter": { + "model": "recursive_character" + }, "embed": { 
"model": "text-embedding-ada-002", "source": "openai", - "splitter": "recursive_character" }, "full_text_search": { "configuration": "english" @@ -682,6 +561,7 @@ Here is some more text this is rather interesting honestly but I am unsure what ), )?; collection.add_pipeline(&mut pipeline).await?; + let mut pipeline = MultiFieldPipeline::new(pipeline_name, None)?; let results = collection .search( json!({ @@ -711,23 +591,22 @@ Here is some more text this is rather interesting honestly but I am unsure what "limit": 5 }) .into(), - &pipeline, + &mut pipeline, ) .await?; - assert!(results.len() == 5); let ids: Vec = results .into_iter() .map(|r| r["document"]["id"].as_u64().unwrap()) .collect(); - assert_eq!(ids, vec![2, 3, 0, 1, 4]); + assert_eq!(ids, vec![2, 3, 7, 4, 8]); collection.archive().await?; Ok(()) } #[sqlx::test] - async fn can_vector_search() -> anyhow::Result<()> { + async fn can_vector_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cvs_2"; + let collection_name = "test_r_c_cvs_3"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; @@ -745,9 +624,80 @@ Here is some more text this is rather interesting honestly but I am unsure what } }, "body": { + "splitter": { + "model": "recursive_character" + }, "embed": { - "model": "intfloat/e5-small", - "splitter": "recursive_character" + "model": "intfloat/e5-small" + }, + }, + }) + .into(), + ), + )?; + collection.add_pipeline(&mut pipeline).await?; + let results = collection + .vector_search( + "Test document: 2", + &mut pipeline, + Some( + json!({ + "query": { + "fields": { + "title": { + "full_text_search": "test", + }, + "body": {}, + }, + "filter": { + "id": { + "$gt": 3 + } + } + }, + "limit": 5 + }) + .into(), + ), + None, + ) + .await?; + let ids: Vec = results + .into_iter() + .map(|r| r["document"]["id"].as_u64().unwrap()) + .collect(); + assert_eq!(ids, vec![4, 5, 6, 7, 9]); + collection.archive().await?; + Ok(()) + } + + #[sqlx::test] + async fn can_vector_search_with_remote_embeddings() -> anyhow::Result<()> { + internal_init_logger(None, None).ok(); + let collection_name = "test_r_c_cvs_4"; + let mut collection = Collection::new(collection_name, None); + let documents = generate_dummy_documents(10); + collection.upsert_documents(documents.clone(), None).await?; + let pipeline_name = "test_r_p_cvs_0"; + let mut pipeline = MultiFieldPipeline::new( + pipeline_name, + Some( + json!({ + "title": { + "embed": { + "model": "intfloat/e5-small" + }, + "full_text_search": { + "configuration": "english" + } + }, + "body": { + "splitter": { + "model": "recursive_character" + }, + "embed": { + "source": "openai", + "model": "text-embedding-ada-002" }, }, }) @@ -755,6 +705,7 @@ Here is some more text this is rather interesting honestly but I am unsure what ), )?; collection.add_pipeline(&mut pipeline).await?; + let mut pipeline = MultiFieldPipeline::new(pipeline_name, None)?; let results = collection .vector_search( "Test document: 2", @@ -770,7 +721,7 @@ Here is some more text this is rather interesting honestly but I am unsure what }, "filter": { "id": { - "$lt": 100 + "$gt": 3 } } }, @@ -781,9 +732,12 @@ Here is some more text this is rather interesting honestly but I am unsure what None, ) .await?; - // results.into_iter().for_each(|r| { - // println!("{}", serde_json::to_string_pretty(&r.0).unwrap()); - // }); + let ids: Vec = results + 
.into_iter() + .map(|r| r["document"]["id"].as_u64().unwrap()) + .collect(); + assert_eq!(ids, vec![4, 5, 6, 7, 9]); + collection.archive().await?; Ok(()) } diff --git a/pgml-sdks/pgml/src/search_query_builder.rs b/pgml-sdks/pgml/src/search_query_builder.rs index 1e6f093b6..0dd2b94d9 100644 --- a/pgml-sdks/pgml/src/search_query_builder.rs +++ b/pgml-sdks/pgml/src/search_query_builder.rs @@ -119,6 +119,9 @@ pub async fn build_search_query( .expr(Expr::cust(format!( r#"MIN(embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector) AS score"# ))) + .order_by_expr(Expr::cust(format!( + r#"embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector"# + )), Order::Asc ) } ModelRuntime::OpenAI => { // We can unwrap here as we know this is all set from above @@ -149,8 +152,15 @@ pub async fn build_search_query( .column((SIden::Str("embeddings"), SIden::Str("document_id"))) .expr(Expr::cust_with_values( r#"MIN(embeddings.embedding <=> $1::vector) AS score"#, - [embedding], + [embedding.clone()], )) + .order_by_expr( + Expr::cust_with_values( + r#"embeddings.embedding <=> $1::vector"#, + [embedding], + ), + Order::Asc, + ) } }; @@ -217,7 +227,21 @@ pub async fn build_search_query( [&vma.query], )) .group_by_col(SIden::Str("document_id")) + .order_by(SIden::Str("score"), Order::Desc) .limit(limit); + + if let Some(filter) = &valid_query.query.filter { + let filter = FilterBuilder::new(filter.clone().0, "documents", "document").build()?; + score_cte.cond_where(filter); + score_cte.join_as( + JoinType::InnerJoin, + documents_table.to_table_tuple(), + Alias::new("documents"), + Expr::col((SIden::Str("documents"), SIden::Str("id"))) + .equals((SIden::Str("tsvectors"), SIden::Str("document_id"))), + ); + } + let mut score_cte = CommonTableExpression::from_select(score_cte); score_cte.table_name(Alias::new(&cte_name)); with_clause.cte(score_cte); @@ -257,7 +281,11 @@ pub async fn build_search_query( let sum_expression = sum_expression .context("query requires some scoring through full_text_search or semantic_search")?; query - .expr_as(id_select_expression, Alias::new("id")) + // .expr_as(id_select_expression.clone(), Alias::new("id")) + .expr(Expr::cust_with_expr( + "DISTINCT ON ($1) $1 as id", + id_select_expression.clone(), + )) .expr_as(sum_expression, Alias::new("score")) .column(SIden::Str("document")) .from(SIden::String(select_from.to_string())) @@ -265,15 +293,26 @@ pub async fn build_search_query( JoinType::InnerJoin, documents_table.to_table_tuple(), Alias::new("documents"), - Expr::col((SIden::Str("documents"), SIden::Str("id"))).equals(SIden::Str("id")), + Expr::col((SIden::Str("documents"), SIden::Str("id"))) + .eq(id_select_expression.clone()), ) - .limit(limit) - .order_by(SIden::Str("score"), Order::Desc); + .order_by_expr( + Expr::cust_with_expr("$1, score", id_select_expression), + Order::Desc, + ); + // .order_by(SIden::Str("score"), Order::Desc); + + let mut re_ordered_query = Query::select(); + re_ordered_query + .expr(Expr::cust("*")) + .from_subquery(query, Alias::new("q1")) + .order_by(SIden::Str("score"), Order::Desc) + .limit(5); let mut combined_query = Query::select(); combined_query - .expr(Expr::cust("json_array_elements(json_agg(q))")) - .from_subquery(query, Alias::new("q")); + .expr(Expr::cust("json_array_elements(json_agg(q2))")) + .from_subquery(re_ordered_query, Alias::new("q2")); combined_query } else { // TODO: Maybe let users filter documents only here? 
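The query JSON that drives this builder is easiest to see from the SDK side. Below is a minimal usage sketch, not taken from the patch itself: the collection, pipeline, and field names are invented, and it assumes the pipeline has already been added to the collection with `title`/`body` configured for full-text and semantic search.

use pgml::{Collection, MultiFieldPipeline};
use serde_json::json;

async fn search_example() -> anyhow::Result<()> {
    let mut collection = Collection::new("example_collection", None);
    let mut pipeline = MultiFieldPipeline::new("example_pipeline", None)?;
    let results = collection
        .search(
            json!({
                "query": {
                    "full_text_search": {
                        "title": {"query": "example", "boost": 4.0}
                    },
                    "semantic_search": {
                        "body": {"query": "example"}
                    },
                    "filter": {"id": {"$gt": 3}}
                },
                "limit": 5
            })
            .into(),
            &mut pipeline,
        )
        .await?;
    for result in results {
        // Each result carries the matched document and its combined score.
        println!("{}: {}", result["document"]["id"], result["score"]);
    }
    Ok(())
}

The builder changes above then turn that JSON into the DISTINCT ON / re-ordered subquery, so each document appears at most once and the de-duplicated rows are returned ranked by their combined score.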
diff --git a/pgml-sdks/pgml/src/vector_search_query_builder.rs b/pgml-sdks/pgml/src/vector_search_query_builder.rs index 3dbb7c468..67154c75d 100644 --- a/pgml-sdks/pgml/src/vector_search_query_builder.rs +++ b/pgml-sdks/pgml/src/vector_search_query_builder.rs @@ -4,7 +4,7 @@ use std::collections::HashMap; use sea_query::{ Alias, CommonTableExpression, Expr, Func, JoinType, Order, PostgresQueryBuilder, Query, - QueryStatementWriter, SimpleExpr, WithClause, + QueryStatementWriter, WithClause, }; use sea_query_binder::{SqlxBinder, SqlxValues}; @@ -169,6 +169,7 @@ pub async fn build_vector_search_query( query .column((SIden::Str("embeddings"), SIden::Str("document_id"))) .column((SIden::Str("chunks"), SIden::Str("chunk"))) + .column((SIden::Str("documents"), SIden::Str("document"))) .from_as(embeddings_table.to_table_tuple(), Alias::new("embeddings")) .join_as( JoinType::InnerJoin, @@ -177,18 +178,18 @@ pub async fn build_vector_search_query( Expr::col((SIden::Str("chunks"), SIden::Str("id"))) .equals((SIden::Str("embeddings"), SIden::Str("chunk_id"))), ) - .limit(limit); - - if let Some(filter) = &valid_query.query.filter { - let filter = FilterBuilder::new(filter.clone().0, "documents", "document").build()?; - query.cond_where(filter); - query.join_as( + .join_as( JoinType::InnerJoin, documents_table.to_table_tuple(), Alias::new("documents"), Expr::col((SIden::Str("documents"), SIden::Str("id"))) .equals((SIden::Str("embeddings"), SIden::Str("document_id"))), - ); + ) + .limit(limit); + + if let Some(filter) = &valid_query.query.filter { + let filter = FilterBuilder::new(filter.clone().0, "documents", "document").build()?; + query.cond_where(filter); } if let Some(full_text_search) = &vf.full_text_search { @@ -213,7 +214,7 @@ pub async fn build_vector_search_query( let mut wrapper_query = Query::select(); wrapper_query .columns([ - SIden::Str("document_id"), + SIden::Str("document"), SIden::Str("chunk"), SIden::Str("score"), ]) From b04ead6ec70bd2e8d919372919aec28dccd0e87d Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 17 Jan 2024 09:33:17 -0800 Subject: [PATCH 04/72] Clean up vector search --- pgml-sdks/pgml/src/collection.rs | 22 ++--- pgml-sdks/pgml/src/lib.rs | 84 +++++++++---------- .../pgml/src/vector_search_query_builder.rs | 11 ++- 3 files changed, 53 insertions(+), 64 deletions(-) diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index 1f1202a9e..11239068e 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -776,20 +776,14 @@ impl Collection { #[allow(clippy::type_complexity)] pub async fn vector_search( &mut self, - query: &str, + query: Json, pipeline: &mut MultiFieldPipeline, - query_parameters: Option, top_k: Option, ) -> anyhow::Result> { let pool = get_or_initialize_pool(&self.database_url).await?; - let (built_query, values) = build_vector_search_query( - query, - self, - query_parameters.clone().unwrap_or_default(), - pipeline, - ) - .await?; + let (built_query, values) = + build_vector_search_query(query.clone(), self, pipeline).await?; let results: Result, _> = sqlx::query_as_with(&built_query, values) .fetch_all(&pool) @@ -817,13 +811,8 @@ impl Collection { .project_info; pipeline.set_project_info(project_info.to_owned()); pipeline.verify_in_database(false).await?; - let (built_query, values) = build_vector_search_query( - query, - self, - query_parameters.clone().unwrap_or_default(), - pipeline, - ) - .await?; + let (built_query, values) = + 
build_vector_search_query(query, self, pipeline).await?; let results: Vec<(Json, String, f64)> = sqlx::query_as_with(&built_query, values) .fetch_all(&pool) @@ -862,6 +851,7 @@ impl Collection { .bind(&self.name) .execute(&mut *transaciton) .await?; + // TODO: Alter pipeline schema sqlx::query(&query_builder!( "ALTER SCHEMA %s RENAME TO %s", &self.name, diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 96e1318cc..28bfbfce5 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -606,11 +606,11 @@ mod tests { #[sqlx::test] async fn can_vector_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cvs_3"; + let collection_name = "test_r_c_cvswle_3"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; - let pipeline_name = "test_r_p_cvs_0"; + let pipeline_name = "test_r_p_cvswle_0"; let mut pipeline = MultiFieldPipeline::new( pipeline_name, Some( @@ -638,27 +638,27 @@ mod tests { collection.add_pipeline(&mut pipeline).await?; let results = collection .vector_search( - "Test document: 2", - &mut pipeline, - Some( - json!({ - "query": { - "fields": { - "title": { - "full_text_search": "test", - }, - "body": {}, + json!({ + "query": { + "fields": { + "title": { + "query": "Test document: 2", + "full_text_search": "test" + }, + "body": { + "query": "Test document: 2" }, - "filter": { - "id": { - "$gt": 3 - } - } }, - "limit": 5 - }) - .into(), - ), + "filter": { + "id": { + "$gt": 3 + } + } + }, + "limit": 5 + }) + .into(), + &mut pipeline, None, ) .await?; @@ -674,11 +674,11 @@ mod tests { #[sqlx::test] async fn can_vector_search_with_remote_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cvs_4"; + let collection_name = "test_r_c_cvswre_4"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; - let pipeline_name = "test_r_p_cvs_0"; + let pipeline_name = "test_r_p_cvswre_0"; let mut pipeline = MultiFieldPipeline::new( pipeline_name, Some( @@ -708,27 +708,27 @@ mod tests { let mut pipeline = MultiFieldPipeline::new(pipeline_name, None)?; let results = collection .vector_search( - "Test document: 2", - &mut pipeline, - Some( - json!({ - "query": { - "fields": { - "title": { - "full_text_search": "test", - }, - "body": {}, + json!({ + "query": { + "fields": { + "title": { + "full_text_search": "test", + "query": "Test document: 2" + }, + "body": { + "query": "Test document: 2" }, - "filter": { - "id": { - "$gt": 3 - } - } }, - "limit": 5 - }) - .into(), - ), + "filter": { + "id": { + "$gt": 3 + } + } + }, + "limit": 5 + }) + .into(), + &mut pipeline, None, ) .await?; diff --git a/pgml-sdks/pgml/src/vector_search_query_builder.rs b/pgml-sdks/pgml/src/vector_search_query_builder.rs index 67154c75d..4a6feec9b 100644 --- a/pgml-sdks/pgml/src/vector_search_query_builder.rs +++ b/pgml-sdks/pgml/src/vector_search_query_builder.rs @@ -26,6 +26,7 @@ struct ValidFullTextSearchAction { #[derive(Debug, Deserialize)] struct ValidField { + query: String, model_parameters: Option, full_text_search: Option, } @@ -43,9 +44,8 @@ struct ValidQuery { } pub async fn build_vector_search_query( - query_text: &str, - collection: &Collection, query: Json, + collection: &Collection, pipeline: &MultiFieldPipeline, ) -> 
anyhow::Result<(String, SqlxValues)> { let valid_query: ValidQuery = serde_json::from_value(query.0)?; @@ -107,7 +107,7 @@ pub async fn build_vector_search_query( Expr::cust(format!( "transformer => (SELECT schema #>> '{{{key},embed,model}}' FROM pipeline)", )), - Expr::cust_with_values("text => $1", [query_text]), + Expr::cust_with_values("text => $1", [vf.query]), Expr::cust(format!("kwargs => COALESCE((SELECT schema #> '{{{key},embed,model_parameters}}' FROM pipeline), '{{}}'::jsonb)")), ]), Alias::new("embedding"), @@ -144,9 +144,8 @@ pub async fn build_vector_search_query( &model.name, vf.model_parameters.as_ref(), )?; - let mut embeddings = remote_embeddings - .embed(vec![query_text.to_string()]) - .await?; + let mut embeddings = + remote_embeddings.embed(vec![vf.query.to_string()]).await?; std::mem::take(&mut embeddings[0]) }; From 44ab0ed3931d6a3dc96a310b896bcce8a7fd887d Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 17 Jan 2024 11:42:24 -0800 Subject: [PATCH 05/72] Switched to a transactional version of upsert documents and syncing pipelines --- pgml-sdks/pgml/src/collection.rs | 104 ++++++---- pgml-sdks/pgml/src/lib.rs | 175 ++++++++++++++-- pgml-sdks/pgml/src/multi_field_pipeline.rs | 223 +++++++-------------- pgml-sdks/pgml/src/queries.rs | 4 +- pgml-sdks/pgml/src/remote_embeddings.rs | 18 +- 5 files changed, 315 insertions(+), 209 deletions(-) diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index 11239068e..cb65c5d1b 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -11,7 +11,9 @@ use sqlx::Executor; use sqlx::PgConnection; use std::borrow::Cow; use std::path::Path; +use std::sync::Arc; use std::time::SystemTime; +use tokio::sync::Mutex; use tracing::{instrument, warn}; use walkdir::WalkDir; @@ -282,7 +284,7 @@ impl Collection { pipeline.verify_in_database(true).await?; let mp = MultiProgress::new(); mp.println(format!("Added Pipeline {}, Now Syncing...", pipeline.name))?; - pipeline.execute(&None, mp).await?; + self.sync_pipeline(pipeline).await?; eprintln!("Done Syncing {}\n", pipeline.name); Ok(()) } @@ -445,21 +447,20 @@ impl Collection { pub async fn upsert_documents( &mut self, documents: Vec, - args: Option, + _args: Option, ) -> anyhow::Result<()> { let pool = get_or_initialize_pool(&self.database_url).await?; self.verify_in_database(false).await?; - - // TODO: Work on this - let args = args.unwrap_or_default(); - - let mut document_ids = vec![]; + let mut pipelines = self.get_pipelines().await?; + for pipeline in &mut pipelines { + pipeline.create_tables().await?; + } let progress_bar = utils::default_progress_bar(documents.len() as u64); progress_bar.println("Upserting Documents..."); - let mut transaction = pool.begin().await?; for document in documents { + let mut transaction = pool.begin().await?; let id = document .get("id") .context("`id` must be a key in document")? 
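The per-document transaction handling introduced in this patch hands a single transaction to several pipelines running concurrently. A condensed sketch of that pattern, assuming the same sqlx, tokio, and futures dependencies the SDK already uses; `do_work` stands in for `pipeline.execute(...)`, and the real code drives the pipelines with `futures::stream::for_each_concurrent` rather than `try_join_all`:

use std::sync::Arc;
use anyhow::Context;
use sqlx::{PgPool, Postgres, Transaction};
use tokio::sync::Mutex;

// Illustrative stand-in for a pipeline step; it only holds the shared
// transaction's lock for the duration of each statement it runs.
async fn do_work(transaction: Arc<Mutex<Transaction<'static, Postgres>>>) -> anyhow::Result<()> {
    sqlx::query("SELECT 1")
        .execute(&mut *transaction.lock().await)
        .await?;
    Ok(())
}

async fn run(pool: &PgPool) -> anyhow::Result<()> {
    // One transaction shared by every task: either all of their writes commit
    // together or none of them do.
    let transaction = Arc::new(Mutex::new(pool.begin().await?));
    futures::future::try_join_all((0..10).map(|_| do_work(transaction.clone()))).await?;
    Arc::into_inner(transaction)
        .context("transaction still referenced")?
        .into_inner()
        .commit()
        .await?;
    Ok(())
}

Arc::into_inner only succeeds once every concurrent task has dropped its clone of the Arc, which is what the "Error transaction dangling" context message in the patch is guarding against before the commit.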
@@ -467,14 +468,33 @@ impl Collection { let md5_digest = md5::compute(id.as_bytes()); let source_uuid = uuid::Uuid::from_slice(&md5_digest.0)?; - let id: i64 = sqlx::query_scalar(&query_builder!("INSERT INTO %s (source_uuid, document) VALUES ($1, $2) ON CONFLICT (source_uuid) DO UPDATE SET document = $2 RETURNING id", self.documents_table_name)).bind(source_uuid).bind(document).fetch_one(&mut *transaction).await?; - document_ids.push(id); + let document_id: i64 = sqlx::query_scalar(&query_builder!("INSERT INTO %s (source_uuid, document) VALUES ($1, $2) ON CONFLICT (source_uuid) DO UPDATE SET document = $2 RETURNING id", self.documents_table_name)).bind(source_uuid).bind(document).fetch_one(&mut *transaction).await?; + + let transaction = Arc::new(Mutex::new(transaction)); + if !pipelines.is_empty() { + use futures::stream::StreamExt; + futures::stream::iter(&mut pipelines) + // Need this map to get around moving the transaction + .map(|pipeline| (pipeline, transaction.clone())) + .for_each_concurrent(10, |(pipeline, transaction)| async move { + pipeline + .execute(Some(document_id), transaction) + .await + .expect("Failed to execute pipeline"); + }) + .await; + } + + Arc::into_inner(transaction) + .context("Error transaction dangling")? + .into_inner() + .commit() + .await?; } - transaction.commit().await?; progress_bar.println("Done Upserting Documents\n"); progress_bar.finish(); - self.sync_pipelines(Some(document_ids)).await + Ok(()) } /// Gets the documents on a [Collection] @@ -686,28 +706,26 @@ impl Collection { } #[instrument(skip(self))] - pub(crate) async fn sync_pipelines( - &mut self, - document_ids: Option>, - ) -> anyhow::Result<()> { + async fn sync_pipeline(&mut self, pipeline: &mut MultiFieldPipeline) -> anyhow::Result<()> { self.verify_in_database(false).await?; - let pipelines = self.get_pipelines().await?; - if !pipelines.is_empty() { - let mp = MultiProgress::new(); - mp.println("Syncing Pipelines...")?; - use futures::stream::StreamExt; - futures::stream::iter(pipelines) - // Need this map to get around moving the document_ids and mp - .map(|pipeline| (pipeline, document_ids.clone(), mp.clone())) - .for_each_concurrent(10, |(mut pipeline, document_ids, mp)| async move { - pipeline - .execute(&document_ids, mp) - .await - .expect("Failed to execute pipeline"); - }) - .await; - mp.println("Done Syncing Pipelines\n")?; - } + let project_info = &self + .database_data + .as_ref() + .context("Database data must be set to get collection pipelines")? + .project_info; + pipeline.set_project_info(project_info.clone()); + pipeline.create_tables().await?; + + let pool = get_or_initialize_pool(&self.database_url).await?; + let transaction = pool.begin().await?; + let transaction = Arc::new(Mutex::new(transaction)); + pipeline.execute(None, transaction.clone()).await?; + + Arc::into_inner(transaction) + .context("Error transaction dangling")? 
+ .into_inner() + .commit() + .await?; Ok(()) } @@ -840,22 +858,34 @@ impl Collection { #[instrument(skip(self))] pub async fn archive(&mut self) -> anyhow::Result<()> { let pool = get_or_initialize_pool(&self.database_url).await?; + let pipelines = self.get_pipelines().await?; let timestamp = SystemTime::now() .duration_since(SystemTime::UNIX_EPOCH) .expect("Error getting system time") .as_secs(); - let archive_table_name = format!("{}_archive_{}", &self.name, timestamp); + let collection_archive_name = format!("{}_archive_{}", &self.name, timestamp); let mut transaciton = pool.begin().await?; + // Change name in pgml.collections sqlx::query("UPDATE pgml.collections SET name = $1, active = FALSE where name = $2") - .bind(&archive_table_name) + .bind(&collection_archive_name) .bind(&self.name) .execute(&mut *transaciton) .await?; - // TODO: Alter pipeline schema + // Change collection_pipeline schema + for pipeline in pipelines { + sqlx::query(&query_builder!( + "ALTER SCHEMA %s RENAME TO %s", + format!("{}_{}", self.name, pipeline.name), + format!("{}_{}", collection_archive_name, pipeline.name) + )) + .execute(&mut *transaciton) + .await?; + } + // Change collection schema sqlx::query(&query_builder!( "ALTER SCHEMA %s RENAME TO %s", &self.name, - archive_table_name + collection_archive_name )) .execute(&mut *transaciton) .await?; diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 28bfbfce5..e121e3914 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -293,7 +293,7 @@ mod tests { internal_init_logger(None, None).ok(); let mut pipeline1 = MultiFieldPipeline::new("test_r_p_carps_1", Some(json!({}).into()))?; let mut pipeline2 = MultiFieldPipeline::new("test_r_p_carps_2", Some(json!({}).into()))?; - let mut collection = Collection::new("test_r_c_carps_9", None); + let mut collection = Collection::new("test_r_c_carps_10", None); collection.add_pipeline(&mut pipeline1).await?; collection.add_pipeline(&mut pipeline2).await?; let pipelines = collection.get_pipelines().await?; @@ -309,7 +309,7 @@ mod tests { #[sqlx::test] async fn can_add_pipeline_and_upsert_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_capaud_36"; + let collection_name = "test_r_c_capaud_44"; let pipeline_name = "test_r_p_capaud_6"; let mut pipeline = MultiFieldPipeline::new( pipeline_name, @@ -360,25 +360,25 @@ mod tests { sqlx::query_as(&query_builder!("SELECT * FROM %s", chunks_table)) .fetch_all(&pool) .await?; - assert!(body_chunks.len() == 2); + assert!(body_chunks.len() == 4); collection.archive().await?; let tsvectors_table = format!("{}_{}.body_tsvectors", collection_name, pipeline_name); let tsvectors: Vec = sqlx::query_as(&query_builder!("SELECT * FROM %s", tsvectors_table)) .fetch_all(&pool) .await?; - assert!(tsvectors.len() == 2); + assert!(tsvectors.len() == 4); Ok(()) } #[sqlx::test] async fn can_upsert_documents_and_add_pipeline() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cudaap_38"; + let collection_name = "test_r_c_cudaap_42"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(2); collection.upsert_documents(documents.clone(), None).await?; - let pipeline_name = "test_r_p_cudaap_6"; + let pipeline_name = "test_r_p_cudaap_9"; let mut pipeline = MultiFieldPipeline::new( pipeline_name, Some( @@ -436,6 +436,158 @@ mod tests { Ok(()) } + #[sqlx::test] + async fn random_pipelines_documents_test() -> 
anyhow::Result<()> { + internal_init_logger(None, None).ok(); + let collection_name = "test_r_c_rpdt_3"; + let mut collection = Collection::new(collection_name, None); + let documents = generate_dummy_documents(6); + collection + .upsert_documents(documents[..2].to_owned(), None) + .await?; + let pipeline_name1 = "test_r_p_rpdt1_0"; + let mut pipeline = MultiFieldPipeline::new( + pipeline_name1, + Some( + json!({ + "title": { + "embed": { + "model": "intfloat/e5-small" + } + }, + "body": { + "splitter": { + "model": "recursive_character" + }, + "embed": { + "model": "intfloat/e5-small", + }, + "full_text_search": { + "configuration": "english" + } + } + }) + .into(), + ), + )?; + collection.add_pipeline(&mut pipeline).await?; + + collection + .upsert_documents(documents[2..4].to_owned(), None) + .await?; + + let pool = get_or_initialize_pool(&None).await?; + let chunks_table = format!("{}_{}.title_chunks", collection_name, pipeline_name1); + let title_chunks: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", chunks_table)) + .fetch_all(&pool) + .await?; + assert!(title_chunks.len() == 4); + let chunks_table = format!("{}_{}.body_chunks", collection_name, pipeline_name1); + let body_chunks: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", chunks_table)) + .fetch_all(&pool) + .await?; + assert!(body_chunks.len() == 8); + let tsvectors_table = format!("{}_{}.body_tsvectors", collection_name, pipeline_name1); + let tsvectors: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", tsvectors_table)) + .fetch_all(&pool) + .await?; + assert!(tsvectors.len() == 8); + + let pipeline_name2 = "test_r_p_rpdt2_0"; + let mut pipeline = MultiFieldPipeline::new( + pipeline_name2, + Some( + json!({ + "title": { + "embed": { + "model": "intfloat/e5-small" + } + }, + "body": { + "splitter": { + "model": "recursive_character" + }, + "embed": { + "model": "intfloat/e5-small", + }, + "full_text_search": { + "configuration": "english" + } + } + }) + .into(), + ), + )?; + collection.add_pipeline(&mut pipeline).await?; + + let chunks_table = format!("{}_{}.title_chunks", collection_name, pipeline_name2); + let title_chunks: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", chunks_table)) + .fetch_all(&pool) + .await?; + assert!(title_chunks.len() == 4); + let chunks_table = format!("{}_{}.body_chunks", collection_name, pipeline_name2); + let body_chunks: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", chunks_table)) + .fetch_all(&pool) + .await?; + assert!(body_chunks.len() == 8); + let tsvectors_table = format!("{}_{}.body_tsvectors", collection_name, pipeline_name2); + let tsvectors: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", tsvectors_table)) + .fetch_all(&pool) + .await?; + assert!(tsvectors.len() == 8); + + collection + .upsert_documents(documents[4..6].to_owned(), None) + .await?; + + let chunks_table = format!("{}_{}.title_chunks", collection_name, pipeline_name2); + let title_chunks: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", chunks_table)) + .fetch_all(&pool) + .await?; + assert!(title_chunks.len() == 6); + let chunks_table = format!("{}_{}.body_chunks", collection_name, pipeline_name2); + let body_chunks: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", chunks_table)) + .fetch_all(&pool) + .await?; + assert!(body_chunks.len() == 12); + let tsvectors_table = format!("{}_{}.body_tsvectors", collection_name, pipeline_name2); + let tsvectors: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", tsvectors_table)) + 
.fetch_all(&pool) + .await?; + assert!(tsvectors.len() == 12); + + let chunks_table = format!("{}_{}.title_chunks", collection_name, pipeline_name1); + let title_chunks: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", chunks_table)) + .fetch_all(&pool) + .await?; + assert!(title_chunks.len() == 6); + let chunks_table = format!("{}_{}.body_chunks", collection_name, pipeline_name1); + let body_chunks: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", chunks_table)) + .fetch_all(&pool) + .await?; + assert!(body_chunks.len() == 12); + let tsvectors_table = format!("{}_{}.body_tsvectors", collection_name, pipeline_name1); + let tsvectors: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", tsvectors_table)) + .fetch_all(&pool) + .await?; + assert!(tsvectors.len() == 12); + + collection.archive().await?; + Ok(()) + } + #[sqlx::test] async fn can_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); @@ -530,7 +682,7 @@ mod tests { #[sqlx::test] async fn can_search_with_remote_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cswre_51"; + let collection_name = "test_r_c_cswre_52"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; @@ -785,15 +937,6 @@ mod tests { Ok(()) } - // TODO: Test - // - remote embeddings - // - some kind of simlutaneous upload with async threads and join - // - test the splitting is working correctly - // - test that different splitters and models are working correctly - - // TODO: DO - // - update upsert_documents to not re run pipeline if it is not part of the schema - // #[sqlx::test] // async fn can_specify_custom_hnsw_parameters_for_pipelines() -> anyhow::Result<()> { // internal_init_logger(None, None).ok(); diff --git a/pgml-sdks/pgml/src/multi_field_pipeline.rs b/pgml-sdks/pgml/src/multi_field_pipeline.rs index 451746b12..67d5e48a9 100644 --- a/pgml-sdks/pgml/src/multi_field_pipeline.rs +++ b/pgml-sdks/pgml/src/multi_field_pipeline.rs @@ -2,10 +2,12 @@ use anyhow::Context; use indicatif::MultiProgress; use rust_bridge::{alias, alias_manual, alias_methods}; use serde::Deserialize; -use sqlx::{Executor, PgConnection, PgPool}; +use sqlx::{Executor, PgConnection, PgPool, Postgres, Transaction}; use std::sync::atomic::Ordering::Relaxed; +use std::sync::Arc; use std::{collections::HashMap, sync::atomic::AtomicBool}; use tokio::join; +use tokio::sync::Mutex; use tracing::instrument; use crate::{ @@ -453,11 +455,10 @@ impl MultiFieldPipeline { #[instrument(skip(self))] pub(crate) async fn execute( &mut self, - document_ids: &Option>, - mp: MultiProgress, + document_id: Option, + transaction: Arc>>, ) -> anyhow::Result<()> { - self.verify_in_database(false).await?; - self.create_tables().await?; + // We are assuming we have manually verified the pipeline before doing this let parsed_schema = self .parsed_schema @@ -469,17 +470,22 @@ impl MultiFieldPipeline { .sync_chunks( key, value.splitter.as_ref().map(|v| &v.model), - document_ids, - &mp, + document_id, + transaction.clone(), ) .await?; if let Some(embed) = &value.embed { - self.sync_embeddings(key, &embed.model, &chunk_ids, &mp) + self.sync_embeddings(key, &embed.model, &chunk_ids, transaction.clone()) .await?; } if let Some(full_text_search) = &value.full_text_search { - self.sync_tsvectors(key, &full_text_search.configuration, &chunk_ids, &mp) - .await?; + self.sync_tsvectors( + 
key, + &full_text_search.configuration, + &chunk_ids, + transaction.clone(), + ) + .await?; } } Ok(()) @@ -490,11 +496,9 @@ impl MultiFieldPipeline { &self, key: &str, splitter: Option<&Splitter>, - document_ids: &Option>, - mp: &MultiProgress, + document_id: Option, + transaction: Arc>>, ) -> anyhow::Result> { - let pool = self.get_pool().await?; - let project_info = self .project_info .as_ref() @@ -510,60 +514,37 @@ impl MultiFieldPipeline { .as_ref() .context("Splitter must be verified to sync chunks")?; - let progress_bar = mp - .add(utils::default_progress_spinner(1)) - .with_prefix(format!("{} - {}", self.name.clone(), key)) - .with_message("Generating chunks"); - - let is_done = AtomicBool::new(false); - let work = async { - let chunk_ids: Result, _> = if document_ids.is_some() { - sqlx::query(&query_builder!( - queries::GENERATE_CHUNKS_FOR_DOCUMENT_IDS, - &chunks_table_name, - &json_key_query, - documents_table_name, - &chunks_table_name - )) - .bind(splitter_database_data.id) - .bind(document_ids) - .execute(&pool) - .await - .map_err(|e| { - is_done.store(true, Relaxed); - e - })?; - sqlx::query_scalar(&query_builder!( - "SELECT id FROM %s WHERE document_id = ANY($1)", - &chunks_table_name - )) - .bind(document_ids) - .fetch_all(&pool) - .await - } else { - sqlx::query_scalar(&query_builder!( - queries::GENERATE_CHUNKS, - &chunks_table_name, - &json_key_query, - documents_table_name, - &chunks_table_name - )) - .bind(splitter_database_data.id) - .fetch_all(&pool) - .await - }; - is_done.store(true, Relaxed); - chunk_ids - }; - let progress_work = async { - while !is_done.load(Relaxed) { - progress_bar.inc(1); - tokio::time::sleep(std::time::Duration::from_millis(100)).await; - } + let chunk_ids: Result, _> = if document_id.is_some() { + sqlx::query(&query_builder!( + queries::GENERATE_CHUNKS_FOR_DOCUMENT_ID, + &chunks_table_name, + &json_key_query, + documents_table_name, + &chunks_table_name + )) + .bind(splitter_database_data.id) + .bind(document_id) + .execute(&mut *transaction.lock().await) + .await?; + sqlx::query_scalar(&query_builder!( + "SELECT id FROM %s WHERE document_id = $1", + &chunks_table_name + )) + .bind(document_id) + .fetch_all(&mut *transaction.lock().await) + .await + } else { + sqlx::query_scalar(&query_builder!( + queries::GENERATE_CHUNKS, + &chunks_table_name, + &json_key_query, + documents_table_name, + &chunks_table_name + )) + .bind(splitter_database_data.id) + .fetch_all(&mut *transaction.lock().await) + .await }; - let (chunk_ids, _) = join!(work, progress_work); - progress_bar.set_message("Done generating chunks"); - progress_bar.finish(); chunk_ids.map_err(anyhow::Error::msg) } else { sqlx::query_scalar(&query_builder!( @@ -583,7 +564,7 @@ impl MultiFieldPipeline { &json_key_query, &documents_table_name )) - .fetch_all(&pool) + .fetch_all(&mut *transaction.lock().await) .await .map_err(anyhow::Error::msg) } @@ -595,10 +576,8 @@ impl MultiFieldPipeline { key: &str, model: &Model, chunk_ids: &Vec, - mp: &MultiProgress, + transaction: Arc>>, ) -> anyhow::Result<()> { - let pool = self.get_pool().await?; - // Remove the stored name from the parameters let mut parameters = model.parameters.clone(); parameters @@ -611,22 +590,13 @@ impl MultiFieldPipeline { .as_ref() .context("Pipeline must have project info to sync chunks")?; - let progress_bar = mp - .add(utils::default_progress_spinner(1)) - .with_prefix(self.name.clone()) - .with_message("Generating emmbeddings"); - let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); 
let embeddings_table_name = format!("{}_{}.{}_embeddings", project_info.name, self.name, key); - let is_done = AtomicBool::new(false); - // We need to be careful about how we handle errors here. We do not want to return an error - // from the async block before setting is_done to true. If we do, the progress bar will - // will load forever. We also want to make sure to propogate any errors we have - let work = async { - let res = match model.runtime { - ModelRuntime::Python => sqlx::query(&query_builder!( + match model.runtime { + ModelRuntime::Python => { + sqlx::query(&query_builder!( queries::GENERATE_EMBEDDINGS_FOR_CHUNK_IDS, embeddings_table_name, chunks_table_name, @@ -635,37 +605,21 @@ impl MultiFieldPipeline { .bind(&model.name) .bind(¶meters) .bind(chunk_ids) - .execute(&pool) - .await - .map_err(|e| anyhow::anyhow!(e)) - .map(|_t| ()), - r => { - let remote_embeddings = - build_remote_embeddings(r, &model.name, Some(¶meters))?; - remote_embeddings - .generate_embeddings( - &embeddings_table_name, - &chunks_table_name, - chunk_ids, - &pool, - ) - .await - .map(|_t| ()) - } - }; - is_done.store(true, Relaxed); - res - }; - let progress_work = async { - while !is_done.load(Relaxed) { - progress_bar.inc(1); - tokio::time::sleep(std::time::Duration::from_millis(100)).await; + .execute(&mut *transaction.lock().await) + .await?; } - }; - let (res, _) = join!(work, progress_work); - res?; - progress_bar.set_message("done generating embeddings"); - progress_bar.finish(); + r => { + let remote_embeddings = build_remote_embeddings(r, &model.name, Some(¶meters))?; + remote_embeddings + .generate_embeddings( + &embeddings_table_name, + &chunks_table_name, + chunk_ids, + transaction, + ) + .await?; + } + } Ok(()) } @@ -675,48 +629,25 @@ impl MultiFieldPipeline { key: &str, configuration: &str, chunk_ids: &Vec, - mp: &MultiProgress, + transaction: Arc>>, ) -> anyhow::Result<()> { - let pool = self.get_pool().await?; - let project_info = self .project_info .as_ref() .context("Pipeline must have project info to sync TSVectors")?; - let progress_bar = mp - .add(utils::default_progress_spinner(1)) - .with_prefix(self.name.clone()) - .with_message("Syncing TSVectors for full text search"); - let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); let tsvectors_table_name = format!("{}_{}.{}_tsvectors", project_info.name, self.name, key); - let is_done = AtomicBool::new(false); - let work = async { - let res = sqlx::query(&query_builder!( - queries::GENERATE_TSVECTORS_FOR_CHUNK_IDS, - tsvectors_table_name, - configuration, - chunks_table_name - )) - .bind(chunk_ids) - .execute(&pool) - .await; - is_done.store(true, Relaxed); - res.map(|_t| ()).map_err(|e| anyhow::anyhow!(e)) - }; - let progress_work = async { - while !is_done.load(Relaxed) { - progress_bar.inc(1); - tokio::time::sleep(std::time::Duration::from_millis(100)).await; - } - }; - let (res, _) = join!(work, progress_work); - res?; - progress_bar.set_message("Done syncing TSVectors for full text search"); - progress_bar.finish(); - + sqlx::query(&query_builder!( + queries::GENERATE_TSVECTORS_FOR_CHUNK_IDS, + tsvectors_table_name, + configuration, + chunks_table_name + )) + .bind(chunk_ids) + .execute(&mut *transaction.lock().await) + .await?; Ok(()) } diff --git a/pgml-sdks/pgml/src/queries.rs b/pgml-sdks/pgml/src/queries.rs index e15094987..4094c7b96 100644 --- a/pgml-sdks/pgml/src/queries.rs +++ b/pgml-sdks/pgml/src/queries.rs @@ -241,7 +241,7 @@ ON CONFLICT (document_id, chunk_index) DO NOTHING RETURNING id, 
document_id "#; -pub const GENERATE_CHUNKS_FOR_DOCUMENT_IDS: &str = r#" +pub const GENERATE_CHUNKS_FOR_DOCUMENT_ID: &str = r#" WITH splitter as ( SELECT name, @@ -275,7 +275,7 @@ FROM FROM %s WHERE - id = ANY($2) + id = $2 AND id NOT IN ( SELECT document_id diff --git a/pgml-sdks/pgml/src/remote_embeddings.rs b/pgml-sdks/pgml/src/remote_embeddings.rs index 54c7d2828..3a7ba98d0 100644 --- a/pgml-sdks/pgml/src/remote_embeddings.rs +++ b/pgml-sdks/pgml/src/remote_embeddings.rs @@ -1,6 +1,8 @@ use reqwest::{Client, RequestBuilder}; -use sqlx::postgres::PgPool; +use sqlx::{postgres::PgPool, Postgres, Transaction}; use std::env; +use std::sync::Arc; +use tokio::sync::Mutex; use tracing::instrument; use crate::{model::ModelRuntime, models, query_builder, types::Json}; @@ -41,13 +43,13 @@ pub trait RemoteEmbeddings<'a> { self.parse_response(response) } - #[instrument(skip(self, pool))] + #[instrument(skip(self, transaction))] async fn get_chunks( &self, embeddings_table_name: &str, chunks_table_name: &str, chunk_ids: &Vec, - pool: &PgPool, + transaction: Arc>>, limit: Option, ) -> anyhow::Result> { let limit = limit.unwrap_or(1000); @@ -59,7 +61,7 @@ pub trait RemoteEmbeddings<'a> { )) .bind(chunk_ids) .bind(limit) - .fetch_all(pool) + .fetch_all(&mut *transaction.lock().await) .await .map_err(|e| anyhow::anyhow!(e)) } @@ -87,13 +89,13 @@ pub trait RemoteEmbeddings<'a> { Ok(embeddings) } - #[instrument(skip(self, pool))] + #[instrument(skip(self, transaction))] async fn generate_embeddings( &self, embeddings_table_name: &str, chunks_table_name: &str, chunk_ids: &Vec, - pool: &PgPool, + transaction: Arc>>, ) -> anyhow::Result<()> { loop { let chunks = self @@ -101,7 +103,7 @@ pub trait RemoteEmbeddings<'a> { embeddings_table_name, chunks_table_name, chunk_ids, - pool, + transaction.clone(), None, ) .await?; @@ -138,7 +140,7 @@ pub trait RemoteEmbeddings<'a> { query = query.bind(chunk_ids[i]).bind(&embeddings[i]); } - query.execute(pool).await?; + query.execute(&mut *transaction.lock().await).await?; } Ok(()) } From 9aaa31b7caaf6d0e84706aaa0e36eca03e4dad9d Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 17 Jan 2024 17:56:54 -0800 Subject: [PATCH 06/72] Working conditional pipeline running on document upsert --- pgml-sdks/pgml/src/collection.rs | 205 ++++++++--- pgml-sdks/pgml/src/lib.rs | 16 +- pgml-sdks/pgml/src/multi_field_pipeline.rs | 383 +++++++++++---------- pgml-sdks/pgml/src/queries.rs | 34 +- 4 files changed, 378 insertions(+), 260 deletions(-) diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index cb65c5d1b..fb37e1125 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -9,6 +9,8 @@ use serde_json::json; use sqlx::postgres::PgPool; use sqlx::Executor; use sqlx::PgConnection; +use sqlx::Postgres; +use sqlx::Transaction; use std::borrow::Cow; use std::path::Path; use std::sync::Arc; @@ -274,6 +276,14 @@ impl Collection { /// ``` #[instrument(skip(self))] pub async fn add_pipeline(&mut self, pipeline: &mut MultiFieldPipeline) -> anyhow::Result<()> { + // The flow for this function: + // 1. Create collection if it does not exists + // 2. Create the pipeline if it does not exist and add it to the collection.pipelines table with ACTIVE = FALSE + // 3. Create the tables for the collection_pipeline schema + // 4. Start a transaction + // 5. Sync the pipeline + // 6. Set the pipeline ACTIVE = TRUE + // 7. 
Commit the transaction self.verify_in_database(false).await?; let project_info = &self .database_data @@ -281,11 +291,28 @@ impl Collection { .context("Database data must be set to add a pipeline to a collection")? .project_info; pipeline.set_project_info(project_info.clone()); - pipeline.verify_in_database(true).await?; + pipeline.verify_in_database(false).await?; + pipeline.create_tables().await?; + + let pool = get_or_initialize_pool(&self.database_url).await?; + let transaction = pool.begin().await?; + let transaction = Arc::new(Mutex::new(transaction)); + let mp = MultiProgress::new(); mp.println(format!("Added Pipeline {}, Now Syncing...", pipeline.name))?; - self.sync_pipeline(pipeline).await?; - eprintln!("Done Syncing {}\n", pipeline.name); + pipeline.execute(None, transaction.clone()).await?; + let mut transaction = Arc::into_inner(transaction) + .context("Error transaction dangling")? + .into_inner(); + sqlx::query(&query_builder!( + "UPDATE %s SET active = TRUE WHERE name = $1", + self.pipelines_table_name + )) + .bind(&pipeline.name) + .execute(&mut *transaction) + .await?; + transaction.commit().await?; + mp.println(format!("Done Syncing {}\n", pipeline.name))?; Ok(()) } @@ -308,20 +335,20 @@ impl Collection { /// } /// ``` #[instrument(skip(self))] - pub async fn remove_pipeline( - &mut self, - pipeline: &mut MultiFieldPipeline, - ) -> anyhow::Result<()> { - let pool = get_or_initialize_pool(&self.database_url).await?; + pub async fn remove_pipeline(&mut self, pipeline: &MultiFieldPipeline) -> anyhow::Result<()> { + // The flow for this function: + // Create collection if it does not exist + // Begin a transaction + // Drop the collection_pipeline schema + // Delete the pipeline from the collection.pipelines table + // Commit the transaction self.verify_in_database(false).await?; let project_info = &self .database_data .as_ref() - .context("Database data must be set to remove pipeline from collection")? + .context("Database data must be set to remove a pipeline from a collection")? .project_info; - pipeline.set_project_info(project_info.clone()); - pipeline.verify_in_database(false).await?; - + let pool = get_or_initialize_pool(&self.database_url).await?; let pipeline_schema = format!("{}_{}", project_info.name, pipeline.name); let mut transaction = pool.begin().await?; @@ -329,7 +356,7 @@ impl Collection { .execute(query_builder!("DROP SCHEMA IF EXISTS %s CASCADE", pipeline_schema).as_str()) .await?; sqlx::query(&query_builder!( - "UPDATE %s SET active = FALSE WHERE name = $1", + "DELETE FROM %s WHERE name = $1", self.pipelines_table_name )) .bind(&pipeline.name) @@ -344,7 +371,7 @@ impl Collection { /// /// # Arguments /// - /// * `pipeline` - The [Pipeline] to remove. + /// * `pipeline` - The [Pipeline] to enable /// /// # Example /// @@ -359,22 +386,18 @@ impl Collection { /// } /// ``` #[instrument(skip(self))] - pub async fn enable_pipeline(&self, pipeline: &Pipeline) -> anyhow::Result<()> { - sqlx::query(&query_builder!( - "UPDATE %s SET active = TRUE WHERE name = $1", - self.pipelines_table_name - )) - .bind(&pipeline.name) - .execute(&get_or_initialize_pool(&self.database_url).await?) - .await?; - Ok(()) + pub async fn enable_pipeline( + &mut self, + pipeline: &mut MultiFieldPipeline, + ) -> anyhow::Result<()> { + self.add_pipeline(pipeline).await } /// Disables a [Pipeline] on the [Collection] /// /// # Arguments /// - /// * `pipeline` - The [Pipeline] to remove. 
+ /// * `pipeline` - The [Pipeline] to disable /// /// # Example /// @@ -389,14 +412,38 @@ impl Collection { /// } /// ``` #[instrument(skip(self))] - pub async fn disable_pipeline(&self, pipeline: &Pipeline) -> anyhow::Result<()> { + pub async fn disable_pipeline(&mut self, pipeline: &MultiFieldPipeline) -> anyhow::Result<()> { + // Our current system for keeping documents, chunks, embeddings, and tsvectors in sync + // does not play nice with disabling and then re-enabling pipelines. + // For now, when disabling a pipeline, simply delete its schema and remake it later + // The flow for this function: + // 1. Create the collection if it does not exist + // 2. Begin a transaction + // 3. Set the pipelines ACTIVE = FALSE in the collection.pipelines table + // 4. Drop the collection_pipeline schema (this will get remade if they enable it again) + // 5. Commit the transaction + self.verify_in_database(false).await?; + let project_info = &self + .database_data + .as_ref() + .context("Database data must be set to remove a pipeline from a collection")? + .project_info; + let pool = get_or_initialize_pool(&self.database_url).await?; + let pipeline_schema = format!("{}_{}", project_info.name, pipeline.name); + + let mut transaction = pool.begin().await?; sqlx::query(&query_builder!( "UPDATE %s SET active = FALSE WHERE name = $1", self.pipelines_table_name )) .bind(&pipeline.name) - .execute(&get_or_initialize_pool(&self.database_url).await?) + .execute(&mut *transaction) .await?; + transaction + .execute(query_builder!("DROP SCHEMA IF EXISTS %s CASCADE", pipeline_schema).as_str()) + .await?; + transaction.commit().await?; + Ok(()) } @@ -442,13 +489,21 @@ impl Collection { /// Ok(()) /// } /// ``` - // TODO: Make it so if we upload the same documen twice it doesn't do anything #[instrument(skip(self, documents))] pub async fn upsert_documents( &mut self, documents: Vec, _args: Option, ) -> anyhow::Result<()> { + // The flow for this function + // 1. Create the collection if it does not exist + // 2. Get all pipelines where ACTIVE = TRUE + // 3. Create each pipeline and the collection_pipeline schema and tables if they don't already exist + // 4. 
Foreach document + // -> Begin a transaction returning the old document if it existed + // -> Insert the document + // -> Foreach pipeline check if we need to resync the document and if so sync the document + // -> Commit the transaction let pool = get_or_initialize_pool(&self.database_url).await?; self.verify_in_database(false).await?; let mut pipelines = self.get_pipelines().await?; @@ -468,20 +523,55 @@ impl Collection { let md5_digest = md5::compute(id.as_bytes()); let source_uuid = uuid::Uuid::from_slice(&md5_digest.0)?; - let document_id: i64 = sqlx::query_scalar(&query_builder!("INSERT INTO %s (source_uuid, document) VALUES ($1, $2) ON CONFLICT (source_uuid) DO UPDATE SET document = $2 RETURNING id", self.documents_table_name)).bind(source_uuid).bind(document).fetch_one(&mut *transaction).await?; + let (document_id, previous_document): (i64, Option) = sqlx::query_as(&query_builder!( + "WITH prev AS (SELECT document FROM %s WHERE source_uuid = $1) INSERT INTO %s (source_uuid, document) VALUES ($1, $2) ON CONFLICT (source_uuid) DO UPDATE SET document = EXCLUDED.document RETURNING id, (SELECT document FROM prev)", + self.documents_table_name, + self.documents_table_name + )) + .bind(&source_uuid) + .bind(&document) + .fetch_one(&mut *transaction) + .await?; let transaction = Arc::new(Mutex::new(transaction)); if !pipelines.is_empty() { use futures::stream::StreamExt; futures::stream::iter(&mut pipelines) // Need this map to get around moving the transaction - .map(|pipeline| (pipeline, transaction.clone())) - .for_each_concurrent(10, |(pipeline, transaction)| async move { - pipeline - .execute(Some(document_id), transaction) - .await - .expect("Failed to execute pipeline"); + .map(|pipeline| { + ( + pipeline, + previous_document.clone(), + document.clone(), + transaction.clone(), + ) }) + .for_each_concurrent( + 10, + |(pipeline, previous_document, document, transaction)| async move { + // Can unwrap here as we know it has parsed schema from the create_table call above + match previous_document { + Some(previous_document) => { + let should_run = + pipeline.parsed_schema.as_ref().unwrap().iter().any( + |(key, _)| document[key] != previous_document[key], + ); + if should_run { + pipeline + .execute(Some(document_id), transaction) + .await + .expect("Failed to execute pipeline"); + } + } + None => { + pipeline + .execute(Some(document_id), transaction) + .await + .expect("Failed to execute pipeline"); + } + } + }, + ) .await; } @@ -705,29 +795,30 @@ impl Collection { // Ok(()) } - #[instrument(skip(self))] - async fn sync_pipeline(&mut self, pipeline: &mut MultiFieldPipeline) -> anyhow::Result<()> { - self.verify_in_database(false).await?; - let project_info = &self - .database_data - .as_ref() - .context("Database data must be set to get collection pipelines")? - .project_info; - pipeline.set_project_info(project_info.clone()); - pipeline.create_tables().await?; - - let pool = get_or_initialize_pool(&self.database_url).await?; - let transaction = pool.begin().await?; - let transaction = Arc::new(Mutex::new(transaction)); - pipeline.execute(None, transaction.clone()).await?; - - Arc::into_inner(transaction) - .context("Error transaction dangling")? 
- .into_inner() - .commit() - .await?; - Ok(()) - } + // #[instrument(skip(self))] + // async fn sync_pipeline( + // &mut self, + // pipeline: &mut MultiFieldPipeline, + // transaction: Arc>>, + // ) -> anyhow::Result<()> { + // self.verify_in_database(false).await?; + // let project_info = &self + // .database_data + // .as_ref() + // .context("Database data must be set to get collection pipelines")? + // .project_info; + // pipeline.set_project_info(project_info.clone()); + // pipeline.create_tables().await?; + + // pipeline.execute(None, transaction).await?; + + // Arc::into_inner(transaction) + // .context("Error transaction dangling")? + // .into_inner() + // .commit() + // .await?; + // Ok(()) + // } #[instrument(skip(self))] pub async fn search( diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index e121e3914..3ccb65fae 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -309,7 +309,7 @@ mod tests { #[sqlx::test] async fn can_add_pipeline_and_upsert_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_capaud_44"; + let collection_name = "test_r_c_capaud_46"; let pipeline_name = "test_r_p_capaud_6"; let mut pipeline = MultiFieldPipeline::new( pipeline_name, @@ -361,13 +361,13 @@ mod tests { .fetch_all(&pool) .await?; assert!(body_chunks.len() == 4); - collection.archive().await?; let tsvectors_table = format!("{}_{}.body_tsvectors", collection_name, pipeline_name); let tsvectors: Vec = sqlx::query_as(&query_builder!("SELECT * FROM %s", tsvectors_table)) .fetch_all(&pool) .await?; assert!(tsvectors.len() == 4); + collection.archive().await?; Ok(()) } @@ -588,6 +588,18 @@ mod tests { Ok(()) } + #[sqlx::test] + async fn can_update_documents() -> anyhow::Result<()> { + let collection_name = "test_r_c_cud_0"; + let mut collection = Collection::new(collection_name, None); + let mut documents = generate_dummy_documents(1); + collection.upsert_documents(documents.clone(), None).await?; + documents[0]["body"] = json!("new body"); + collection.upsert_documents(documents, None).await?; + // collection.archive().await?; + Ok(()) + } + #[sqlx::test] async fn can_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); diff --git a/pgml-sdks/pgml/src/multi_field_pipeline.rs b/pgml-sdks/pgml/src/multi_field_pipeline.rs index 67d5e48a9..5160a34c2 100644 --- a/pgml-sdks/pgml/src/multi_field_pipeline.rs +++ b/pgml-sdks/pgml/src/multi_field_pipeline.rs @@ -270,185 +270,194 @@ impl MultiFieldPipeline { let schema = format!("{}_{}", collection_name, self.name); - let mut transaction = pool.begin().await?; - transaction - .execute(query_builder!("CREATE SCHEMA IF NOT EXISTS %s", schema).as_str()) - .await?; - - let parsed_schema = self - .parsed_schema - .as_ref() - .context("Pipeline must have schema to create_tables")?; + // If the schema already exists we don't want recreate all of the tables + let exists: bool = sqlx::query_scalar( + "SELECT EXISTS(SELECT schema_name FROM information_schema.schemata WHERE schema_name = $1)", + ) + .bind(&schema) + .fetch_one(&pool) + .await?; - for (key, value) in parsed_schema.iter() { - // Create the chunks table - let chunks_table_name = format!("{}.{}_chunks", schema, key); + if !exists { + let mut transaction = pool.begin().await?; transaction - .execute( - query_builder!( - queries::CREATE_CHUNKS_TABLE, - chunks_table_name, - documents_table_name - ) - .as_str(), - ) - .await?; - let index_name = 
format!("{}_pipeline_chunk_document_id_index", key); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX, - "", - index_name, - chunks_table_name, - "document_id" - ) - .as_str(), - ) + .execute(query_builder!("CREATE SCHEMA IF NOT EXISTS %s", schema).as_str()) .await?; - if let Some(embed) = &value.embed { - let embeddings_table_name = format!("{}.{}_embeddings", schema, key); - let exists: bool = sqlx::query_scalar( - "SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_schema = $1 AND table_name = $2)" + let parsed_schema = self + .parsed_schema + .as_ref() + .context("Pipeline must have schema to create_tables")?; + + for (key, value) in parsed_schema.iter() { + // Create the chunks table + let chunks_table_name = format!("{}.{}_chunks", schema, key); + transaction + .execute( + query_builder!( + queries::CREATE_CHUNKS_TABLE, + chunks_table_name, + documents_table_name + ) + .as_str(), + ) + .await?; + let index_name = format!("{}_pipeline_chunk_document_id_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX, + "", + index_name, + chunks_table_name, + "document_id" + ) + .as_str(), ) - .bind(&schema) - .bind(&embeddings_table_name).fetch_one(&pool).await?; - - if !exists { - let embedding_length = match &embed.model.runtime { - ModelRuntime::Python => { - let embedding: (Vec,) = sqlx::query_as( - "SELECT embedding from pgml.embed(transformer => $1, text => 'Hello, World!', kwargs => $2) as embedding") - .bind(&embed.model.name) - .bind(&embed.model.parameters) - .fetch_one(&pool).await?; - embedding.0.len() as i64 - } - t => { - let remote_embeddings = build_remote_embeddings( - t.to_owned(), - &embed.model.name, - Some(&embed.model.parameters), - )?; - remote_embeddings.get_embedding_size().await? - } - }; - - // Create the embeddings table - sqlx::query(&query_builder!( - queries::CREATE_EMBEDDINGS_TABLE, - &embeddings_table_name, - chunks_table_name, - documents_table_name, - embedding_length - )) - .execute(&mut *transaction) .await?; - let index_name = format!("{}_pipeline_embedding_chunk_id_index", key); + + if let Some(embed) = &value.embed { + let embeddings_table_name = format!("{}.{}_embeddings", schema, key); + let exists: bool = sqlx::query_scalar( + "SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_schema = $1 AND table_name = $2)" + ) + .bind(&schema) + .bind(&embeddings_table_name).fetch_one(&pool).await?; + + if !exists { + let embedding_length = match &embed.model.runtime { + ModelRuntime::Python => { + let embedding: (Vec,) = sqlx::query_as( + "SELECT embedding from pgml.embed(transformer => $1, text => 'Hello, World!', kwargs => $2) as embedding") + .bind(&embed.model.name) + .bind(&embed.model.parameters) + .fetch_one(&pool).await?; + embedding.0.len() as i64 + } + t => { + let remote_embeddings = build_remote_embeddings( + t.to_owned(), + &embed.model.name, + Some(&embed.model.parameters), + )?; + remote_embeddings.get_embedding_size().await? 
+ } + }; + + // Create the embeddings table + sqlx::query(&query_builder!( + queries::CREATE_EMBEDDINGS_TABLE, + &embeddings_table_name, + chunks_table_name, + documents_table_name, + embedding_length + )) + .execute(&mut *transaction) + .await?; + let index_name = format!("{}_pipeline_embedding_chunk_id_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX, + "", + index_name, + &embeddings_table_name, + "chunk_id" + ) + .as_str(), + ) + .await?; + let index_name = format!("{}_pipeline_embedding_document_id_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX, + "", + index_name, + &embeddings_table_name, + "document_id" + ) + .as_str(), + ) + .await?; + let index_with_parameters = format!( + "WITH (m = {}, ef_construction = {})", + embed.hnsw.m, embed.hnsw.ef_construction + ); + let index_name = format!("{}_pipeline_embedding_hnsw_vector_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX_USING_HNSW, + "", + index_name, + &embeddings_table_name, + "embedding vector_cosine_ops", + index_with_parameters + ) + .as_str(), + ) + .await?; + } + } + + // Create the tsvectors table + if value.full_text_search.is_some() { + let tsvectors_table_name = format!("{}.{}_tsvectors", schema, key); + transaction + .execute( + query_builder!( + queries::CREATE_CHUNKS_TSVECTORS_TABLE, + tsvectors_table_name, + chunks_table_name, + documents_table_name + ) + .as_str(), + ) + .await?; + let index_name = format!("{}_pipeline_tsvector_chunk_id_index", key); transaction .execute( query_builder!( queries::CREATE_INDEX, "", index_name, - &embeddings_table_name, + tsvectors_table_name, "chunk_id" ) .as_str(), ) .await?; - let index_name = format!("{}_pipeline_embedding_document_id_index", key); + let index_name = format!("{}_pipeline_tsvector_document_id_index", key); transaction .execute( query_builder!( queries::CREATE_INDEX, "", index_name, - &embeddings_table_name, + tsvectors_table_name, "document_id" ) .as_str(), ) .await?; - let index_with_parameters = format!( - "WITH (m = {}, ef_construction = {})", - embed.hnsw.m, embed.hnsw.ef_construction - ); - let index_name = format!("{}_pipeline_embedding_hnsw_vector_index", key); + let index_name = format!("{}_pipeline_tsvector_index", key); transaction .execute( query_builder!( - queries::CREATE_INDEX_USING_HNSW, + queries::CREATE_INDEX_USING_GIN, "", index_name, - &embeddings_table_name, - "embedding vector_cosine_ops", - index_with_parameters + tsvectors_table_name, + "ts" ) .as_str(), ) .await?; } } - - // Create the tsvectors table - if value.full_text_search.is_some() { - let tsvectors_table_name = format!("{}.{}_tsvectors", schema, key); - transaction - .execute( - query_builder!( - queries::CREATE_CHUNKS_TSVECTORS_TABLE, - tsvectors_table_name, - chunks_table_name, - documents_table_name - ) - .as_str(), - ) - .await?; - let index_name = format!("{}_pipeline_tsvector_chunk_id_index", key); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX, - "", - index_name, - tsvectors_table_name, - "chunk_id" - ) - .as_str(), - ) - .await?; - let index_name = format!("{}_pipeline_tsvector_document_id_index", key); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX, - "", - index_name, - tsvectors_table_name, - "document_id" - ) - .as_str(), - ) - .await?; - let index_name = format!("{}_pipeline_tsvector_index", key); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX_USING_GIN, - "", - index_name, - tsvectors_table_name, - "ts" - ) - 
.as_str(), - ) - .await?; - } + transaction.commit().await?; } - transaction.commit().await?; - Ok(()) } @@ -474,18 +483,20 @@ impl MultiFieldPipeline { transaction.clone(), ) .await?; - if let Some(embed) = &value.embed { - self.sync_embeddings(key, &embed.model, &chunk_ids, transaction.clone()) + if !chunk_ids.is_empty() { + if let Some(embed) = &value.embed { + self.sync_embeddings(key, &embed.model, &chunk_ids, transaction.clone()) + .await?; + } + if let Some(full_text_search) = &value.full_text_search { + self.sync_tsvectors( + key, + &full_text_search.configuration, + &chunk_ids, + transaction.clone(), + ) .await?; - } - if let Some(full_text_search) = &value.full_text_search { - self.sync_tsvectors( - key, - &full_text_search.configuration, - &chunk_ids, - transaction.clone(), - ) - .await?; + } } } Ok(()) @@ -519,8 +530,7 @@ impl MultiFieldPipeline { queries::GENERATE_CHUNKS_FOR_DOCUMENT_ID, &chunks_table_name, &json_key_query, - documents_table_name, - &chunks_table_name + documents_table_name )) .bind(splitter_database_data.id) .bind(document_id) @@ -547,26 +557,52 @@ impl MultiFieldPipeline { }; chunk_ids.map_err(anyhow::Error::msg) } else { - sqlx::query_scalar(&query_builder!( - r#" - INSERT INTO %s( - document_id, chunk_index, chunk - ) - SELECT - id, - 1, - %d - FROM %s - ON CONFLICT (document_id, chunk_index) DO NOTHING - RETURNING id - "#, - &chunks_table_name, - &json_key_query, - &documents_table_name - )) - .fetch_all(&mut *transaction.lock().await) - .await - .map_err(anyhow::Error::msg) + match document_id { + Some(document_id) => sqlx::query_scalar(&query_builder!( + r#" + INSERT INTO %s( + document_id, chunk_index, chunk + ) + SELECT + id, + 1, + %d + FROM %s + WHERE id = $1 + ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk + RETURNING id + "#, + &chunks_table_name, + &json_key_query, + &documents_table_name + )) + .bind(document_id) + .fetch_all(&mut *transaction.lock().await) + .await + .map_err(anyhow::Error::msg), + None => sqlx::query_scalar(&query_builder!( + r#" + INSERT INTO %s( + document_id, chunk_index, chunk + ) + SELECT + id, + 1, + %d + FROM %s + WHERE id NOT IN (SELECT document_id FROM %s) + ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk + RETURNING id + "#, + &chunks_table_name, + &json_key_query, + &documents_table_name, + &chunks_table_name + )) + .fetch_all(&mut *transaction.lock().await) + .await + .map_err(anyhow::Error::msg), + } } } @@ -599,8 +635,7 @@ impl MultiFieldPipeline { sqlx::query(&query_builder!( queries::GENERATE_EMBEDDINGS_FOR_CHUNK_IDS, embeddings_table_name, - chunks_table_name, - embeddings_table_name + chunks_table_name )) .bind(&model.name) .bind(¶meters) diff --git a/pgml-sdks/pgml/src/queries.rs b/pgml-sdks/pgml/src/queries.rs index 4094c7b96..e318fd2d9 100644 --- a/pgml-sdks/pgml/src/queries.rs +++ b/pgml-sdks/pgml/src/queries.rs @@ -20,7 +20,7 @@ CREATE TABLE IF NOT EXISTS %s ( created_at timestamp NOT NULL DEFAULT now(), model_id int8 NOT NULL REFERENCES pgml.models ON DELETE CASCADE ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED, splitter_id int8 NOT NULL REFERENCES pgml.splitters ON DELETE CASCADE ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED, - active BOOLEAN NOT NULL DEFAULT TRUE, + active BOOLEAN NOT NULL DEFAULT FALSE, parameters jsonb NOT NULL DEFAULT '{}', UNIQUE (name) ); @@ -115,7 +115,7 @@ SELECT FROM %s WHERE id = ANY ($1) -ON CONFLICT (chunk_id) DO NOTHING; +ON CONFLICT (chunk_id) DO UPDATE SET ts = EXCLUDED.ts; "#; pub const 
GENERATE_EMBEDDINGS_FOR_CHUNK_IDS: &str = r#" @@ -132,13 +132,7 @@ FROM %s WHERE id = ANY ($3) - AND id NOT IN ( - SELECT - chunk_id - from - %s - ) -ON CONFLICT (chunk_id) DO NOTHING; +ON CONFLICT (chunk_id) DO UPDATE SET embedding = EXCLUDED.embedding "#; pub const EMBED_AND_VECTOR_SEARCH: &str = r#" @@ -260,30 +254,16 @@ SELECT (chunk).chunk FROM ( - select + SELECT id AS document_id, pgml.chunk( (SELECT name FROM splitter), - text, + %d, (SELECT parameters FROM splitter) ) AS chunk FROM - ( - SELECT - id, - %d AS text - FROM - %s - WHERE - id = $2 - AND id NOT IN ( - SELECT - document_id - FROM - %s - ) - ) AS documents + %s WHERE id = $2 ) chunks -ON CONFLICT (document_id, chunk_index) DO NOTHING +ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk RETURNING id "#; From 6979f697870b854f713395e63ed88de7d1cad351 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Thu, 18 Jan 2024 13:11:41 -0800 Subject: [PATCH 07/72] Really good upsert documents --- pgml-sdks/pgml/src/collection.rs | 385 +++----- pgml-sdks/pgml/src/lib.rs | 999 ++++++++++----------- pgml-sdks/pgml/src/multi_field_pipeline.rs | 604 ++++++++----- pgml-sdks/pgml/src/queries.rs | 87 +- pgml-sdks/pgml/src/remote_embeddings.rs | 66 +- 5 files changed, 1029 insertions(+), 1112 deletions(-) diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index fb37e1125..7553e43f7 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -19,6 +19,7 @@ use tokio::sync::Mutex; use tracing::{instrument, warn}; use walkdir::WalkDir; +use crate::filter_builder::FilterBuilder; use crate::search_query_builder::build_search_query; use crate::vector_search_query_builder::build_vector_search_query; use crate::{ @@ -278,12 +279,8 @@ impl Collection { pub async fn add_pipeline(&mut self, pipeline: &mut MultiFieldPipeline) -> anyhow::Result<()> { // The flow for this function: // 1. Create collection if it does not exists - // 2. Create the pipeline if it does not exist and add it to the collection.pipelines table with ACTIVE = FALSE - // 3. Create the tables for the collection_pipeline schema - // 4. Start a transaction - // 5. Sync the pipeline - // 6. Set the pipeline ACTIVE = TRUE - // 7. Commit the transaction + // 2. Create the pipeline if it does not exist and add it to the collection.pipelines table with ACTIVE = TRUE + // 3. Sync the pipeline - this will delete all previous chunks, embeddings, and tsvectors self.verify_in_database(false).await?; let project_info = &self .database_data @@ -291,27 +288,13 @@ impl Collection { .context("Database data must be set to add a pipeline to a collection")? .project_info; pipeline.set_project_info(project_info.clone()); - pipeline.verify_in_database(false).await?; - pipeline.create_tables().await?; - - let pool = get_or_initialize_pool(&self.database_url).await?; - let transaction = pool.begin().await?; - let transaction = Arc::new(Mutex::new(transaction)); + // We want to intentially throw an error if they have already added this piepline + // as we don't want to casually resync + pipeline.verify_in_database(true).await?; let mp = MultiProgress::new(); mp.println(format!("Added Pipeline {}, Now Syncing...", pipeline.name))?; - pipeline.execute(None, transaction.clone()).await?; - let mut transaction = Arc::into_inner(transaction) - .context("Error transaction dangling")? 
- .into_inner(); - sqlx::query(&query_builder!( - "UPDATE %s SET active = TRUE WHERE name = $1", - self.pipelines_table_name - )) - .bind(&pipeline.name) - .execute(&mut *transaction) - .await?; - transaction.commit().await?; + pipeline.resync().await?; mp.println(format!("Done Syncing {}\n", pipeline.name))?; Ok(()) } @@ -337,11 +320,11 @@ impl Collection { #[instrument(skip(self))] pub async fn remove_pipeline(&mut self, pipeline: &MultiFieldPipeline) -> anyhow::Result<()> { // The flow for this function: - // Create collection if it does not exist - // Begin a transaction - // Drop the collection_pipeline schema - // Delete the pipeline from the collection.pipelines table - // Commit the transaction + // 1. Create collection if it does not exist + // 2. Begin a transaction + // 3. Drop the collection_pipeline schema + // 4. Delete the pipeline from the collection.pipelines table + // 5. Commit the transaction self.verify_in_database(false).await?; let project_info = &self .database_data @@ -363,7 +346,6 @@ impl Collection { .execute(&mut *transaction) .await?; transaction.commit().await?; - Ok(()) } @@ -390,7 +372,17 @@ impl Collection { &mut self, pipeline: &mut MultiFieldPipeline, ) -> anyhow::Result<()> { - self.add_pipeline(pipeline).await + // The flow for this function: + // 1. Set ACTIVE = TRUE for the pipeline in collection.pipelines + // 2. Resync the pipeline + sqlx::query(&query_builder!( + "UPDATE %s SET active = FALSE WHERE name = $1", + self.pipelines_table_name + )) + .bind(&pipeline.name) + .execute(&get_or_initialize_pool(&self.database_url).await?) + .await?; + pipeline.resync().await } /// Disables a [Pipeline] on the [Collection] @@ -412,38 +404,16 @@ impl Collection { /// } /// ``` #[instrument(skip(self))] - pub async fn disable_pipeline(&mut self, pipeline: &MultiFieldPipeline) -> anyhow::Result<()> { - // Our current system for keeping documents, chunks, embeddings, and tsvectors in sync - // does not play nice with disabling and then re-enabling pipelines. - // For now, when disabling a pipeline, simply delete its schema and remake it later + pub async fn disable_pipeline(&self, pipeline: &MultiFieldPipeline) -> anyhow::Result<()> { // The flow for this function: - // 1. Create the collection if it does not exist - // 2. Begin a transaction - // 3. Set the pipelines ACTIVE = FALSE in the collection.pipelines table - // 4. Drop the collection_pipeline schema (this will get remade if they enable it again) - // 5. Commit the transaction - self.verify_in_database(false).await?; - let project_info = &self - .database_data - .as_ref() - .context("Database data must be set to remove a pipeline from a collection")? - .project_info; - let pool = get_or_initialize_pool(&self.database_url).await?; - let pipeline_schema = format!("{}_{}", project_info.name, pipeline.name); - - let mut transaction = pool.begin().await?; + // 1. Set ACTIVE = FALSE for the pipeline in collection.pipelines sqlx::query(&query_builder!( "UPDATE %s SET active = FALSE WHERE name = $1", self.pipelines_table_name )) .bind(&pipeline.name) - .execute(&mut *transaction) + .execute(&get_or_initialize_pool(&self.database_url).await?) .await?; - transaction - .execute(query_builder!("DROP SCHEMA IF EXISTS %s CASCADE", pipeline_schema).as_str()) - .await?; - transaction.commit().await?; - Ok(()) } @@ -493,12 +463,11 @@ impl Collection { pub async fn upsert_documents( &mut self, documents: Vec, - _args: Option, + args: Option, ) -> anyhow::Result<()> { // The flow for this function // 1. 
Create the collection if it does not exist // 2. Get all pipelines where ACTIVE = TRUE - // 3. Create each pipeline and the collection_pipeline schema and tables if they don't already exist // 4. Foreach document // -> Begin a transaction returning the old document if it existed // -> Insert the document @@ -507,9 +476,9 @@ impl Collection { let pool = get_or_initialize_pool(&self.database_url).await?; self.verify_in_database(false).await?; let mut pipelines = self.get_pipelines().await?; - for pipeline in &mut pipelines { - pipeline.create_tables().await?; - } + + let args = args.unwrap_or_default(); + let args = args.as_object().context("args must be a JSON object")?; let progress_bar = utils::default_progress_bar(documents.len() as u64); progress_bar.println("Upserting Documents..."); @@ -523,15 +492,29 @@ impl Collection { let md5_digest = md5::compute(id.as_bytes()); let source_uuid = uuid::Uuid::from_slice(&md5_digest.0)?; - let (document_id, previous_document): (i64, Option) = sqlx::query_as(&query_builder!( + let query = if args + .get("merge") + .map(|v| v.as_bool().unwrap_or(false)) + .unwrap_or(false) + { + query_builder!( + "WITH prev AS (SELECT document FROM %s WHERE source_uuid = $1) INSERT INTO %s (source_uuid, document) VALUES ($1, $2) ON CONFLICT (source_uuid) DO UPDATE SET document = %s.document || EXCLUDED.document RETURNING id, (SELECT document FROM prev)", + self.documents_table_name, + self.documents_table_name, + self.documents_table_name + ) + } else { + query_builder!( "WITH prev AS (SELECT document FROM %s WHERE source_uuid = $1) INSERT INTO %s (source_uuid, document) VALUES ($1, $2) ON CONFLICT (source_uuid) DO UPDATE SET document = EXCLUDED.document RETURNING id, (SELECT document FROM prev)", self.documents_table_name, self.documents_table_name - )) - .bind(&source_uuid) - .bind(&document) - .fetch_one(&mut *transaction) - .await?; + ) + }; + let (document_id, previous_document): (i64, Option) = sqlx::query_as(&query) + .bind(&source_uuid) + .bind(&document) + .fetch_one(&mut *transaction) + .await?; let transaction = Arc::new(Mutex::new(transaction)); if !pipelines.is_empty() { @@ -549,23 +532,23 @@ impl Collection { .for_each_concurrent( 10, |(pipeline, previous_document, document, transaction)| async move { - // Can unwrap here as we know it has parsed schema from the create_table call above match previous_document { Some(previous_document) => { + // Can unwrap here as we know it has parsed schema from the create_table call above let should_run = pipeline.parsed_schema.as_ref().unwrap().iter().any( |(key, _)| document[key] != previous_document[key], ); if should_run { pipeline - .execute(Some(document_id), transaction) + .sync_document(document_id, transaction) .await .expect("Failed to execute pipeline"); } } None => { pipeline - .execute(Some(document_id), transaction) + .sync_document(document_id, transaction) .await .expect("Failed to execute pipeline"); } @@ -574,12 +557,12 @@ impl Collection { ) .await; } - Arc::into_inner(transaction) .context("Error transaction dangling")? 
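+        // every per-pipeline task spawned above has finished at this point, so the
+        // Arc can be unwrapped to reclaim the transaction and commit it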
.into_inner() .commit() .await?; + progress_bar.inc(1); } progress_bar.println("Done Upserting Documents\n"); @@ -605,107 +588,60 @@ impl Collection { /// } #[instrument(skip(self))] pub async fn get_documents(&self, args: Option) -> anyhow::Result> { - // TODO: If we want to filter on full text this needs to be part of a pipeline - unimplemented!() - - // let pool = get_or_initialize_pool(&self.database_url).await?; - - // let mut args = args.unwrap_or_default().0; - // let args = args.as_object_mut().context("args must be an object")?; - - // // Get limit or set it to 1000 - // let limit = args - // .remove("limit") - // .map(|l| l.try_to_u64()) - // .unwrap_or(Ok(1000))?; - - // let mut query = Query::select(); - // query - // .from_as( - // self.documents_table_name.to_table_tuple(), - // SIden::Str("documents"), - // ) - // .expr(Expr::cust("*")) // Adds the * in SELECT * FROM - // .limit(limit); - - // if let Some(order_by) = args.remove("order_by") { - // let order_by_builder = - // order_by_builder::OrderByBuilder::new(order_by, "documents", "metadata").build()?; - // for (order_by, order) in order_by_builder { - // query.order_by_expr_with_nulls(order_by, order, NullOrdering::Last); - // } - // } - // query.order_by((SIden::Str("documents"), SIden::Str("id")), Order::Asc); - - // // TODO: Make keyset based pagination work with custom order by - // if let Some(last_row_id) = args.remove("last_row_id") { - // let last_row_id = last_row_id - // .try_to_u64() - // .context("last_row_id must be an integer")?; - // query.and_where(Expr::col((SIden::Str("documents"), SIden::Str("id"))).gt(last_row_id)); - // } - - // if let Some(offset) = args.remove("offset") { - // let offset = offset.try_to_u64().context("offset must be an integer")?; - // query.offset(offset); - // } - - // if let Some(mut filter) = args.remove("filter") { - // let filter = filter - // .as_object_mut() - // .context("filter must be a Json object")?; - - // if let Some(f) = filter.remove("metadata") { - // query.cond_where( - // filter_builder::FilterBuilder::new(f, "documents", "metadata").build(), - // ); - // } - // if let Some(f) = filter.remove("full_text_search") { - // let f = f - // .as_object() - // .context("Full text filter must be a Json object")?; - // let configuration = f - // .get("configuration") - // .context("In full_text_search `configuration` is required")? - // .as_str() - // .context("In full_text_search `configuration` must be a string")?; - // let filter_text = f - // .get("text") - // .context("In full_text_search `text` is required")? 
- // .as_str() - // .context("In full_text_search `text` must be a string")?; - // query - // .join_as( - // JoinType::InnerJoin, - // self.documents_tsvectors_table_name.to_table_tuple(), - // Alias::new("documents_tsvectors"), - // Expr::col((SIden::Str("documents"), SIden::Str("id"))) - // .equals((SIden::Str("documents_tsvectors"), SIden::Str("document_id"))), - // ) - // .and_where( - // Expr::col(( - // SIden::Str("documents_tsvectors"), - // SIden::Str("configuration"), - // )) - // .eq(configuration), - // ) - // .and_where(Expr::cust_with_values( - // format!( - // "documents_tsvectors.ts @@ plainto_tsquery('{}', $1)", - // configuration - // ), - // [filter_text], - // )); - // } - // } - - // let (sql, values) = query.build_sqlx(PostgresQueryBuilder); - // let documents: Vec = - // sqlx::query_as_with(&sql, values).fetch_all(&pool).await?; - // Ok(documents - // .into_iter() - // .map(|d| d.into_user_friendly_json()) - // .collect()) + let pool = get_or_initialize_pool(&self.database_url).await?; + + let mut args = args.unwrap_or_default(); + let args = args.as_object_mut().context("args must be an object")?; + + // Get limit or set it to 1000 + let limit = args + .remove("limit") + .map(|l| l.try_to_u64()) + .unwrap_or(Ok(1000))?; + + let mut query = Query::select(); + query + .from_as( + self.documents_table_name.to_table_tuple(), + SIden::Str("documents"), + ) + .expr(Expr::cust("*")) // Adds the * in SELECT * FROM + .limit(limit); + + if let Some(order_by) = args.remove("order_by") { + let order_by_builder = + order_by_builder::OrderByBuilder::new(order_by, "documents", "document").build()?; + for (order_by, order) in order_by_builder { + query.order_by_expr_with_nulls(order_by, order, NullOrdering::Last); + } + } + query.order_by((SIden::Str("documents"), SIden::Str("id")), Order::Asc); + + // TODO: Make keyset based pagination work with custom order by + if let Some(last_row_id) = args.remove("last_row_id") { + let last_row_id = last_row_id + .try_to_u64() + .context("last_row_id must be an integer")?; + query.and_where(Expr::col((SIden::Str("documents"), SIden::Str("id"))).gt(last_row_id)); + } + + if let Some(offset) = args.remove("offset") { + let offset = offset.try_to_u64().context("offset must be an integer")?; + query.offset(offset); + } + + if let Some(filter) = args.remove("filter") { + let filter = FilterBuilder::new(filter, "documents", "document").build()?; + query.cond_where(filter); + } + + let (sql, values) = query.build_sqlx(PostgresQueryBuilder); + let documents: Vec = + sqlx::query_as_with(&sql, values).fetch_all(&pool).await?; + Ok(documents + .into_iter() + .map(|d| d.into_user_friendly_json()) + .collect()) } /// Deletes documents in a [Collection] @@ -722,103 +658,26 @@ impl Collection { /// async fn example() -> anyhow::Result<()> { /// let mut collection = Collection::new("my_collection", None); /// let documents = collection.delete_documents(serde_json::json!({ - /// "metadata": { - /// "id": { - /// "eq": 1 - /// } + /// "id": { + /// "eq": 1 /// } /// }).into()).await?; /// Ok(()) /// } #[instrument(skip(self))] - pub async fn delete_documents(&self, mut filter: Json) -> anyhow::Result<()> { - // TODO: If we want to filter on full text this needs to be part of a pipeline - unimplemented!() - - // let pool = get_or_initialize_pool(&self.database_url).await?; - - // let mut query = Query::delete(); - // query.from_table(self.documents_table_name.to_table_tuple()); - - // let filter = filter - // .as_object_mut() - // .context("filter must be a Json 
object")?; - - // if let Some(f) = filter.remove("metadata") { - // query - // .cond_where(filter_builder::FilterBuilder::new(f, "documents", "metadata").build()); - // } - - // if let Some(mut f) = filter.remove("full_text_search") { - // let f = f - // .as_object_mut() - // .context("Full text filter must be a Json object")?; - // let configuration = f - // .get("configuration") - // .context("In full_text_search `configuration` is required")? - // .as_str() - // .context("In full_text_search `configuration` must be a string")?; - // let filter_text = f - // .get("text") - // .context("In full_text_search `text` is required")? - // .as_str() - // .context("In full_text_search `text` must be a string")?; - // let mut inner_select_query = Query::select(); - // inner_select_query - // .from_as( - // self.documents_tsvectors_table_name.to_table_tuple(), - // SIden::Str("documents_tsvectors"), - // ) - // .column(SIden::Str("document_id")) - // .and_where(Expr::cust_with_values( - // format!( - // "documents_tsvectors.ts @@ plainto_tsquery('{}', $1)", - // configuration - // ), - // [filter_text], - // )) - // .and_where( - // Expr::col(( - // SIden::Str("documents_tsvectors"), - // SIden::Str("configuration"), - // )) - // .eq(configuration), - // ); - // query.and_where( - // Expr::col((SIden::Str("documents"), SIden::Str("id"))) - // .in_subquery(inner_select_query), - // ); - // } - - // let (sql, values) = query.build_sqlx(PostgresQueryBuilder); - // sqlx::query_with(&sql, values).fetch_all(&pool).await?; - // Ok(()) - } + pub async fn delete_documents(&self, filter: Json) -> anyhow::Result<()> { + let pool = get_or_initialize_pool(&self.database_url).await?; - // #[instrument(skip(self))] - // async fn sync_pipeline( - // &mut self, - // pipeline: &mut MultiFieldPipeline, - // transaction: Arc>>, - // ) -> anyhow::Result<()> { - // self.verify_in_database(false).await?; - // let project_info = &self - // .database_data - // .as_ref() - // .context("Database data must be set to get collection pipelines")? - // .project_info; - // pipeline.set_project_info(project_info.clone()); - // pipeline.create_tables().await?; - - // pipeline.execute(None, transaction).await?; - - // Arc::into_inner(transaction) - // .context("Error transaction dangling")? 
- // .into_inner() - // .commit() - // .await?; - // Ok(()) - // } + let mut query = Query::delete(); + query.from_table(self.documents_table_name.to_table_tuple()); + + let filter = FilterBuilder::new(filter.0, "documents", "document").build()?; + query.cond_where(filter); + + let (sql, values) = query.build_sqlx(PostgresQueryBuilder); + sqlx::query_with(&sql, values).fetch_all(&pool).await?; + Ok(()) + } #[instrument(skip(self))] pub async fn search( diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 3ccb65fae..94b21e590 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -309,7 +309,7 @@ mod tests { #[sqlx::test] async fn can_add_pipeline_and_upsert_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_capaud_46"; + let collection_name = "test_r_c_capaud_47"; let pipeline_name = "test_r_p_capaud_6"; let mut pipeline = MultiFieldPipeline::new( pipeline_name, @@ -374,7 +374,7 @@ mod tests { #[sqlx::test] async fn can_upsert_documents_and_add_pipeline() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cudaap_42"; + let collection_name = "test_r_c_cudaap_43"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(2); collection.upsert_documents(documents.clone(), None).await?; @@ -588,18 +588,6 @@ mod tests { Ok(()) } - #[sqlx::test] - async fn can_update_documents() -> anyhow::Result<()> { - let collection_name = "test_r_c_cud_0"; - let mut collection = Collection::new(collection_name, None); - let mut documents = generate_dummy_documents(1); - collection.upsert_documents(documents.clone(), None).await?; - documents[0]["body"] = json!("new body"); - collection.upsert_documents(documents, None).await?; - // collection.archive().await?; - Ok(()) - } - #[sqlx::test] async fn can_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); @@ -1417,550 +1405,483 @@ mod tests { // Ok(()) // } - // /////////////////////////////// - // // Working With Documents ///// - // /////////////////////////////// - - // #[sqlx::test] - // async fn can_upsert_and_filter_get_documents() -> anyhow::Result<()> { - // internal_init_logger(None, None).ok(); - // let model = Model::default(); - // let splitter = Splitter::default(); - // let mut pipeline = Pipeline::new( - // "test_r_p_cuafgd_1", - // Some(model), - // Some(splitter), - // Some( - // serde_json::json!({ - // "full_text_search": { - // "active": true, - // "configuration": "english" - // } - // }) - // .into(), - // ), - // ); - - // let mut collection = Collection::new("test_r_c_cuagd_2", None); - // collection.add_pipeline(&mut pipeline).await?; - - // // Test basic upsert - // let documents = vec![ - // serde_json::json!({"id": 1, "random_key": 10, "text": "hello world 1"}).into(), - // serde_json::json!({"id": 2, "random_key": 11, "text": "hello world 2"}).into(), - // serde_json::json!({"id": 3, "random_key": 12, "text": "hello world 3"}).into(), - // ]; - // collection.upsert_documents(documents.clone(), None).await?; - // let document = &collection.get_documents(None).await?[0]; - // assert_eq!(document["document"]["text"], "hello world 1"); - - // // Test upsert of text and metadata - // let documents = vec![ - // serde_json::json!({"id": 1, "text": "hello world new"}).into(), - // serde_json::json!({"id": 2, "random_key": 12}).into(), - // serde_json::json!({"id": 3, "random_key": 13}).into(), - // ]; - // 
collection.upsert_documents(documents.clone(), None).await?; - - // let documents = collection - // .get_documents(Some( - // serde_json::json!({ - // "filter": { - // "metadata": { - // "random_key": { - // "$eq": 12 - // } - // } - // } - // }) - // .into(), - // )) - // .await?; - // assert_eq!(documents[0]["document"]["text"], "hello world 2"); - - // let documents = collection - // .get_documents(Some( - // serde_json::json!({ - // "filter": { - // "metadata": { - // "random_key": { - // "$gte": 13 - // } - // } - // } - // }) - // .into(), - // )) - // .await?; - // assert_eq!(documents[0]["document"]["text"], "hello world 3"); - - // let documents = collection - // .get_documents(Some( - // serde_json::json!({ - // "filter": { - // "full_text_search": { - // "configuration": "english", - // "text": "new" - // } - // } - // }) - // .into(), - // )) - // .await?; - // assert_eq!(documents[0]["document"]["text"], "hello world new"); - // assert_eq!(documents[0]["document"]["id"].as_i64().unwrap(), 1); - - // collection.archive().await?; - // Ok(()) - // } - - // #[sqlx::test] - // async fn can_paginate_get_documents() -> anyhow::Result<()> { - // internal_init_logger(None, None).ok(); - // let mut collection = Collection::new("test_r_c_cpgd_2", None); - // collection - // .upsert_documents(generate_dummy_documents(10), None) - // .await?; + /////////////////////////////// + // Working With Documents ///// + /////////////////////////////// - // let documents = collection - // .get_documents(Some( - // serde_json::json!({ - // "limit": 5, - // "offset": 0 - // }) - // .into(), - // )) - // .await?; - // assert_eq!( - // documents - // .into_iter() - // .map(|d| d["row_id"].as_i64().unwrap()) - // .collect::>(), - // vec![1, 2, 3, 4, 5] - // ); + #[sqlx::test] + async fn can_upsert_and_filter_get_documents() -> anyhow::Result<()> { + internal_init_logger(None, None).ok(); + let mut collection = Collection::new("test_r_c_cuafgd_1", None); - // let documents = collection - // .get_documents(Some( - // serde_json::json!({ - // "limit": 2, - // "offset": 5 - // }) - // .into(), - // )) - // .await?; - // let last_row_id = documents.last().unwrap()["row_id"].as_i64().unwrap(); - // assert_eq!( - // documents - // .into_iter() - // .map(|d| d["row_id"].as_i64().unwrap()) - // .collect::>(), - // vec![6, 7] - // ); + let documents = vec![ + serde_json::json!({"id": 1, "random_key": 10, "text": "hello world 1"}).into(), + serde_json::json!({"id": 2, "random_key": 11, "text": "hello world 2"}).into(), + serde_json::json!({"id": 3, "random_key": 12, "text": "hello world 3"}).into(), + ]; + collection.upsert_documents(documents.clone(), None).await?; + let document = &collection.get_documents(None).await?[0]; + assert_eq!(document["document"]["text"], "hello world 1"); + + let documents = vec![ + serde_json::json!({"id": 1, "text": "hello world new"}).into(), + serde_json::json!({"id": 2, "random_key": 12}).into(), + serde_json::json!({"id": 3, "random_key": 13}).into(), + ]; + collection.upsert_documents(documents.clone(), None).await?; - // let documents = collection - // .get_documents(Some( - // serde_json::json!({ - // "limit": 2, - // "last_row_id": last_row_id - // }) - // .into(), - // )) - // .await?; - // let last_row_id = documents.last().unwrap()["row_id"].as_i64().unwrap(); - // assert_eq!( - // documents - // .into_iter() - // .map(|d| d["row_id"].as_i64().unwrap()) - // .collect::>(), - // vec![8, 9] - // ); + let documents = collection + .get_documents(Some( + serde_json::json!({ + 
"filter": { + "random_key": { + "$eq": 12 + } + } + }) + .into(), + )) + .await?; + assert_eq!(documents[0]["document"]["random_key"], 12); + + let documents = collection + .get_documents(Some( + serde_json::json!({ + "filter": { + "random_key": { + "$gte": 13 + } + } + }) + .into(), + )) + .await?; + assert_eq!(documents[0]["document"]["random_key"], 13); - // let documents = collection - // .get_documents(Some( - // serde_json::json!({ - // "limit": 1, - // "last_row_id": last_row_id - // }) - // .into(), - // )) - // .await?; - // assert_eq!( - // documents - // .into_iter() - // .map(|d| d["row_id"].as_i64().unwrap()) - // .collect::>(), - // vec![10] - // ); + collection.archive().await?; + Ok(()) + } - // collection.archive().await?; - // Ok(()) - // } + #[sqlx::test] + async fn can_paginate_get_documents() -> anyhow::Result<()> { + internal_init_logger(None, None).ok(); + let mut collection = Collection::new("test_r_c_cpgd_2", None); + collection + .upsert_documents(generate_dummy_documents(10), None) + .await?; - // #[sqlx::test] - // async fn can_filter_and_paginate_get_documents() -> anyhow::Result<()> { - // internal_init_logger(None, None).ok(); - // let model = Model::default(); - // let splitter = Splitter::default(); - // let mut pipeline = Pipeline::new( - // "test_r_p_cfapgd_1", - // Some(model), - // Some(splitter), - // Some( - // serde_json::json!({ - // "full_text_search": { - // "active": true, - // "configuration": "english" - // } - // }) - // .into(), - // ), - // ); + let documents = collection + .get_documents(Some( + serde_json::json!({ + "limit": 5, + "offset": 0 + }) + .into(), + )) + .await?; + assert_eq!( + documents + .into_iter() + .map(|d| d["row_id"].as_i64().unwrap()) + .collect::>(), + vec![1, 2, 3, 4, 5] + ); + + let documents = collection + .get_documents(Some( + serde_json::json!({ + "limit": 2, + "offset": 5 + }) + .into(), + )) + .await?; + let last_row_id = documents.last().unwrap()["row_id"].as_i64().unwrap(); + assert_eq!( + documents + .into_iter() + .map(|d| d["row_id"].as_i64().unwrap()) + .collect::>(), + vec![6, 7] + ); + + let documents = collection + .get_documents(Some( + serde_json::json!({ + "limit": 2, + "last_row_id": last_row_id + }) + .into(), + )) + .await?; + let last_row_id = documents.last().unwrap()["row_id"].as_i64().unwrap(); + assert_eq!( + documents + .into_iter() + .map(|d| d["row_id"].as_i64().unwrap()) + .collect::>(), + vec![8, 9] + ); + + let documents = collection + .get_documents(Some( + serde_json::json!({ + "limit": 1, + "last_row_id": last_row_id + }) + .into(), + )) + .await?; + assert_eq!( + documents + .into_iter() + .map(|d| d["row_id"].as_i64().unwrap()) + .collect::>(), + vec![10] + ); - // let mut collection = Collection::new("test_r_c_cfapgd_1", None); - // collection.add_pipeline(&mut pipeline).await?; + collection.archive().await?; + Ok(()) + } - // collection - // .upsert_documents(generate_dummy_documents(10), None) - // .await?; + #[sqlx::test] + async fn can_filter_and_paginate_get_documents() -> anyhow::Result<()> { + internal_init_logger(None, None).ok(); + let mut collection = Collection::new("test_r_c_cfapgd_1", None); - // let documents = collection - // .get_documents(Some( - // serde_json::json!({ - // "filter": { - // "metadata": { - // "id": { - // "$gte": 2 - // } - // } - // }, - // "limit": 2, - // "offset": 0 - // }) - // .into(), - // )) - // .await?; - // assert_eq!( - // documents - // .into_iter() - // .map(|d| d["document"]["id"].as_i64().unwrap()) - // .collect::>(), - // vec![2, 
3] - // ); + collection + .upsert_documents(generate_dummy_documents(10), None) + .await?; - // let documents = collection - // .get_documents(Some( - // serde_json::json!({ - // "filter": { - // "metadata": { - // "id": { - // "$lte": 5 - // } - // } - // }, - // "limit": 100, - // "offset": 4 - // }) - // .into(), - // )) - // .await?; - // let last_row_id = documents.last().unwrap()["row_id"].as_i64().unwrap(); - // assert_eq!( - // documents - // .into_iter() - // .map(|d| d["document"]["id"].as_i64().unwrap()) - // .collect::>(), - // vec![4, 5] - // ); + let documents = collection + .get_documents(Some( + serde_json::json!({ + "filter": { + "id": { + "$gte": 2 + } + }, + "limit": 2, + "offset": 0 + }) + .into(), + )) + .await?; + assert_eq!( + documents + .into_iter() + .map(|d| d["document"]["id"].as_i64().unwrap()) + .collect::>(), + vec![2, 3] + ); + + let documents = collection + .get_documents(Some( + serde_json::json!({ + "filter": { + "id": { + "$lte": 5 + } + }, + "limit": 100, + "offset": 4 + }) + .into(), + )) + .await?; + assert_eq!( + documents + .into_iter() + .map(|d| d["document"]["id"].as_i64().unwrap()) + .collect::>(), + vec![4, 5] + ); - // let documents = collection - // .get_documents(Some( - // serde_json::json!({ - // "filter": { - // "full_text_search": { - // "configuration": "english", - // "text": "document" - // } - // }, - // "limit": 100, - // "last_row_id": last_row_id - // }) - // .into(), - // )) - // .await?; - // assert_eq!( - // documents - // .into_iter() - // .map(|d| d["document"]["id"].as_i64().unwrap()) - // .collect::>(), - // vec![6, 7, 8, 9] - // ); + collection.archive().await?; + Ok(()) + } - // collection.archive().await?; - // Ok(()) - // } + #[sqlx::test] + async fn can_filter_and_delete_documents() -> anyhow::Result<()> { + internal_init_logger(None, None).ok(); + let mut collection = Collection::new("test_r_c_cfadd_1", None); + collection + .upsert_documents(generate_dummy_documents(10), None) + .await?; - // #[sqlx::test] - // async fn can_filter_and_delete_documents() -> anyhow::Result<()> { - // internal_init_logger(None, None).ok(); - // let model = Model::default(); - // let splitter = Splitter::default(); - // let mut pipeline = Pipeline::new( - // "test_r_p_cfadd_1", - // Some(model), - // Some(splitter), - // Some( - // serde_json::json!({ - // "full_text_search": { - // "active": true, - // "configuration": "english" - // } - // }) - // .into(), - // ), - // ); + collection + .delete_documents( + serde_json::json!({ + "id": { + "$lt": 2 + } + }) + .into(), + ) + .await?; + let documents = collection.get_documents(None).await?; + assert_eq!(documents.len(), 8); + assert!(documents + .iter() + .all(|d| d["document"]["id"].as_i64().unwrap() >= 2)); - // let mut collection = Collection::new("test_r_c_cfadd_1", None); - // collection.add_pipeline(&mut pipeline).await?; - // collection - // .upsert_documents(generate_dummy_documents(10), None) - // .await?; + collection + .delete_documents( + serde_json::json!({ + "id": { + "$gte": 6 + } + }) + .into(), + ) + .await?; + let documents = collection.get_documents(None).await?; + assert_eq!(documents.len(), 4); + assert!(documents + .iter() + .all(|d| d["document"]["id"].as_i64().unwrap() < 6)); - // collection - // .delete_documents( - // serde_json::json!({ - // "metadata": { - // "id": { - // "$lt": 2 - // } - // } - // }) - // .into(), - // ) - // .await?; - // let documents = collection.get_documents(None).await?; - // assert_eq!(documents.len(), 8); - // assert!(documents - // 
.iter() - // .all(|d| d["document"]["id"].as_i64().unwrap() >= 2)); + collection.archive().await?; + Ok(()) + } - // collection - // .delete_documents( - // serde_json::json!({ - // "full_text_search": { - // "configuration": "english", - // "text": "2" - // } - // }) - // .into(), - // ) - // .await?; - // let documents = collection.get_documents(None).await?; - // assert_eq!(documents.len(), 7); - // assert!(documents - // .iter() - // .all(|d| d["document"]["id"].as_i64().unwrap() > 2)); + #[sqlx::test] + fn can_order_documents() -> anyhow::Result<()> { + internal_init_logger(None, None).ok(); + let mut collection = Collection::new("test_r_c_cod_1", None); + collection + .upsert_documents( + vec![ + json!({ + "id": 1, + "text": "Test Document 1", + "number": 99, + "nested_number": { + "number": 3 + }, - // collection - // .delete_documents( - // serde_json::json!({ - // "metadata": { - // "id": { - // "$gte": 6 - // } - // }, - // "full_text_search": { - // "configuration": "english", - // "text": "6" - // } - // }) - // .into(), - // ) - // .await?; - // let documents = collection.get_documents(None).await?; - // assert_eq!(documents.len(), 6); - // assert!(documents - // .iter() - // .all(|d| d["document"]["id"].as_i64().unwrap() != 6)); + "tie": 2, + }) + .into(), + json!({ + "id": 2, + "text": "Test Document 1", + "number": 98, + "nested_number": { + "number": 2 + }, + "tie": 2, + }) + .into(), + json!({ + "id": 3, + "text": "Test Document 1", + "number": 97, + "nested_number": { + "number": 1 + }, + "tie": 2 + }) + .into(), + ], + None, + ) + .await?; + let documents = collection + .get_documents(Some(json!({"order_by": {"number": "asc"}}).into())) + .await?; + assert_eq!( + documents + .iter() + .map(|d| d["document"]["number"].as_i64().unwrap()) + .collect::>(), + vec![97, 98, 99] + ); + let documents = collection + .get_documents(Some( + json!({"order_by": {"nested_number": {"number": "asc"}}}).into(), + )) + .await?; + assert_eq!( + documents + .iter() + .map(|d| d["document"]["nested_number"]["number"].as_i64().unwrap()) + .collect::>(), + vec![1, 2, 3] + ); + let documents = collection + .get_documents(Some( + json!({"order_by": {"nested_number": {"number": "asc"}, "tie": "desc"}}).into(), + )) + .await?; + assert_eq!( + documents + .iter() + .map(|d| d["document"]["nested_number"]["number"].as_i64().unwrap()) + .collect::>(), + vec![1, 2, 3] + ); + collection.archive().await?; + Ok(()) + } - // collection.archive().await?; - // Ok(()) - // } + #[sqlx::test] + async fn can_update_documents() -> anyhow::Result<()> { + internal_init_logger(None, None).ok(); + let mut collection = Collection::new("test_r_c_cud_5", None); + collection + .upsert_documents( + vec![ + json!({ + "id": 1, + "text": "Test Document 1" + }) + .into(), + json!({ + "id": 2, + "text": "Test Document 1" + }) + .into(), + json!({ + "id": 3, + "text": "Test Document 1" + }) + .into(), + ], + None, + ) + .await?; + collection + .upsert_documents( + vec![ + json!({ + "id": 1, + "number": 0, + }) + .into(), + json!({ + "id": 2, + "number": 1, + }) + .into(), + json!({ + "id": 3, + "number": 2, + }) + .into(), + ], + None, + ) + .await?; + let documents = collection + .get_documents(Some(json!({"order_by": {"number": "asc"}}).into())) + .await?; + assert_eq!( + documents + .iter() + .map(|d| d["document"]["number"].as_i64().unwrap()) + .collect::>(), + vec![0, 1, 2] + ); + for document in documents { + assert!(document["document"]["text"].as_str().is_none()); + } + collection.archive().await?; + Ok(()) + } - // 
#[sqlx::test] - // fn can_order_documents() -> anyhow::Result<()> { - // internal_init_logger(None, None).ok(); - // let mut collection = Collection::new("test_r_c_cod_1", None); - // collection - // .upsert_documents( - // vec![ - // json!({ - // "id": 1, - // "text": "Test Document 1", - // "number": 99, - // "nested_number": { - // "number": 3 - // }, - - // "tie": 2, - // }) - // .into(), - // json!({ - // "id": 2, - // "text": "Test Document 1", - // "number": 98, - // "nested_number": { - // "number": 2 - // }, - // "tie": 2, - // }) - // .into(), - // json!({ - // "id": 3, - // "text": "Test Document 1", - // "number": 97, - // "nested_number": { - // "number": 1 - // }, - // "tie": 2 - // }) - // .into(), - // ], - // None, - // ) - // .await?; - // let documents = collection - // .get_documents(Some(json!({"order_by": {"number": "asc"}}).into())) - // .await?; - // assert_eq!( - // documents - // .iter() - // .map(|d| d["document"]["number"].as_i64().unwrap()) - // .collect::>(), - // vec![97, 98, 99] - // ); - // let documents = collection - // .get_documents(Some( - // json!({"order_by": {"nested_number": {"number": "asc"}}}).into(), - // )) - // .await?; - // assert_eq!( - // documents - // .iter() - // .map(|d| d["document"]["nested_number"]["number"].as_i64().unwrap()) - // .collect::>(), - // vec![1, 2, 3] - // ); - // let documents = collection - // .get_documents(Some( - // json!({"order_by": {"nested_number": {"number": "asc"}, "tie": "desc"}}).into(), - // )) - // .await?; - // assert_eq!( - // documents - // .iter() - // .map(|d| d["document"]["nested_number"]["number"].as_i64().unwrap()) - // .collect::>(), - // vec![1, 2, 3] - // ); - // collection.archive().await?; - // Ok(()) - // } + #[sqlx::test] + fn can_merge_metadata() -> anyhow::Result<()> { + internal_init_logger(None, None).ok(); + let mut collection = Collection::new("test_r_c_cmm_5", None); + collection + .upsert_documents( + vec![ + json!({ + "id": 1, + "text": "Test Document 1", + "number": 99, + "second_number": 10, + }) + .into(), + json!({ + "id": 2, + "text": "Test Document 1", + "number": 98, + "second_number": 11, + }) + .into(), + json!({ + "id": 3, + "text": "Test Document 1", + "number": 97, + "second_number": 12, + }) + .into(), + ], + None, + ) + .await?; + let documents = collection + .get_documents(Some(json!({"order_by": {"number": "asc"}}).into())) + .await?; + assert_eq!( + documents + .iter() + .map(|d| ( + d["document"]["number"].as_i64().unwrap(), + d["document"]["second_number"].as_i64().unwrap() + )) + .collect::>(), + vec![(97, 12), (98, 11), (99, 10)] + ); - // #[sqlx::test] - // fn can_merge_metadata() -> anyhow::Result<()> { - // internal_init_logger(None, None).ok(); - // let mut collection = Collection::new("test_r_c_cmm_4", None); - // collection - // .upsert_documents( - // vec![ - // json!({ - // "id": 1, - // "text": "Test Document 1", - // "number": 99, - // "second_number": 10, - // }) - // .into(), - // json!({ - // "id": 2, - // "text": "Test Document 1", - // "number": 98, - // "second_number": 11, - // }) - // .into(), - // json!({ - // "id": 3, - // "text": "Test Document 1", - // "number": 97, - // "second_number": 12, - // }) - // .into(), - // ], - // None, - // ) - // .await?; - // let documents = collection - // .get_documents(Some(json!({"order_by": {"number": "asc"}}).into())) - // .await?; - // assert_eq!( - // documents - // .iter() - // .map(|d| ( - // d["document"]["number"].as_i64().unwrap(), - // d["document"]["second_number"].as_i64().unwrap() - // )) - 
// .collect::>(), - // vec![(97, 12), (98, 11), (99, 10)] - // ); - // collection - // .upsert_documents( - // vec![ - // json!({ - // "id": 1, - // "number": 0, - // "another_number": 1 - // }) - // .into(), - // json!({ - // "id": 2, - // "number": 1, - // "another_number": 2 - // }) - // .into(), - // json!({ - // "id": 3, - // "number": 2, - // "another_number": 3 - // }) - // .into(), - // ], - // Some( - // json!({ - // "metadata": { - // "merge": true - // } - // }) - // .into(), - // ), - // ) - // .await?; - // let documents = collection - // .get_documents(Some( - // json!({"order_by": {"number": {"number": "asc"}}}).into(), - // )) - // .await?; + collection + .upsert_documents( + vec![ + json!({ + "id": 1, + "number": 0, + "another_number": 1 + }) + .into(), + json!({ + "id": 2, + "number": 1, + "another_number": 2 + }) + .into(), + json!({ + "id": 3, + "number": 2, + "another_number": 3 + }) + .into(), + ], + Some( + json!({ + "merge": true + }) + .into(), + ), + ) + .await?; + let documents = collection + .get_documents(Some(json!({"order_by": {"number": "asc"}}).into())) + .await?; - // assert_eq!( - // documents - // .iter() - // .map(|d| ( - // d["document"]["number"].as_i64().unwrap(), - // d["document"]["another_number"].as_i64().unwrap(), - // d["document"]["second_number"].as_i64().unwrap() - // )) - // .collect::>(), - // vec![(0, 1, 10), (1, 2, 11), (2, 3, 12)] - // ); - // collection.archive().await?; - // Ok(()) - // } + assert_eq!( + documents + .iter() + .map(|d| ( + d["document"]["number"].as_i64().unwrap(), + d["document"]["another_number"].as_i64().unwrap(), + d["document"]["second_number"].as_i64().unwrap() + )) + .collect::>(), + vec![(0, 1, 10), (1, 2, 11), (2, 3, 12)] + ); + collection.archive().await?; + Ok(()) + } } diff --git a/pgml-sdks/pgml/src/multi_field_pipeline.rs b/pgml-sdks/pgml/src/multi_field_pipeline.rs index 5160a34c2..d207c83b2 100644 --- a/pgml-sdks/pgml/src/multi_field_pipeline.rs +++ b/pgml-sdks/pgml/src/multi_field_pipeline.rs @@ -10,6 +10,7 @@ use tokio::join; use tokio::sync::Mutex; use tracing::instrument; +use crate::remote_embeddings::PoolOrArcMutextTransaction; use crate::{ collection::ProjectInfo, get_or_initialize_pool, @@ -201,7 +202,7 @@ impl MultiFieldPipeline { let pipeline = if let Some(pipeline) = pipeline { if throw_if_exists { - anyhow::bail!("Pipeline {} already exists", pipeline.name); + anyhow::bail!("Pipeline {} already exists. You do not need to add this pipeline to the collection as it has already been added.", pipeline.name); } let mut parsed_schema = json_to_schema(&pipeline.schema)?; @@ -239,14 +240,21 @@ impl MultiFieldPipeline { } self.parsed_schema = Some(parsed_schema); - sqlx::query_as(&query_builder!( + // Here we actually insert the pipeline into the collection.pipelines table + // and create the collection_pipeline schema and required tables + let mut transaction = pool.begin().await?; + let pipeline = sqlx::query_as(&query_builder!( "INSERT INTO %s (name, schema) VALUES ($1, $2) RETURNING *", format!("{}.pipelines", project_info.name) )) .bind(&self.name) .bind(&self.schema) - .fetch_one(&pool) - .await? 
+ .fetch_one(&mut *transaction) + .await?; + self.create_tables(&mut transaction).await?; + transaction.commit().await?; + + pipeline }; self.database_data = Some(MultiFieldPipelineDatabaseData { id: pipeline.id, @@ -257,10 +265,10 @@ impl MultiFieldPipeline { } #[instrument(skip(self))] - pub(crate) async fn create_tables(&mut self) -> anyhow::Result<()> { - self.verify_in_database(false).await?; - let pool = self.get_pool().await?; - + async fn create_tables( + &mut self, + transaction: &mut Transaction<'static, Postgres>, + ) -> anyhow::Result<()> { let project_info = self .project_info .as_ref() @@ -270,205 +278,185 @@ impl MultiFieldPipeline { let schema = format!("{}_{}", collection_name, self.name); - // If the schema already exists we don't want recreate all of the tables - let exists: bool = sqlx::query_scalar( - "SELECT EXISTS(SELECT schema_name FROM information_schema.schemata WHERE schema_name = $1)", - ) - .bind(&schema) - .fetch_one(&pool) - .await?; + transaction + .execute(query_builder!("CREATE SCHEMA IF NOT EXISTS %s", schema).as_str()) + .await?; + + let parsed_schema = self + .parsed_schema + .as_ref() + .context("Pipeline must have schema to create_tables")?; - if !exists { - let mut transaction = pool.begin().await?; + for (key, value) in parsed_schema.iter() { + let chunks_table_name = format!("{}.{}_chunks", schema, key); transaction - .execute(query_builder!("CREATE SCHEMA IF NOT EXISTS %s", schema).as_str()) + .execute( + query_builder!( + queries::CREATE_CHUNKS_TABLE, + chunks_table_name, + documents_table_name + ) + .as_str(), + ) + .await?; + let index_name = format!("{}_pipeline_chunk_document_id_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX, + "", + index_name, + chunks_table_name, + "document_id" + ) + .as_str(), + ) .await?; - let parsed_schema = self - .parsed_schema - .as_ref() - .context("Pipeline must have schema to create_tables")?; + if let Some(embed) = &value.embed { + let embeddings_table_name = format!("{}.{}_embeddings", schema, key); + let embedding_length = match &embed.model.runtime { + ModelRuntime::Python => { + let embedding: (Vec,) = sqlx::query_as( + "SELECT embedding from pgml.embed(transformer => $1, text => 'Hello, World!', kwargs => $2) as embedding") + .bind(&embed.model.name) + .bind(&embed.model.parameters) + .fetch_one(&mut *transaction).await?; + embedding.0.len() as i64 + } + t => { + let remote_embeddings = build_remote_embeddings( + t.to_owned(), + &embed.model.name, + Some(&embed.model.parameters), + )?; + remote_embeddings.get_embedding_size().await? 
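+                        // for non-Python runtimes the embedding dimension is reported by the
+                        // remote embeddings client rather than a local pgml.embed call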
+ } + }; - for (key, value) in parsed_schema.iter() { - // Create the chunks table - let chunks_table_name = format!("{}.{}_chunks", schema, key); + // Create the embeddings table + sqlx::query(&query_builder!( + queries::CREATE_EMBEDDINGS_TABLE, + &embeddings_table_name, + chunks_table_name, + documents_table_name, + embedding_length + )) + .execute(&mut *transaction) + .await?; + let index_name = format!("{}_pipeline_embedding_chunk_id_index", key); transaction .execute( query_builder!( - queries::CREATE_CHUNKS_TABLE, - chunks_table_name, - documents_table_name + queries::CREATE_INDEX, + "", + index_name, + &embeddings_table_name, + "chunk_id" ) .as_str(), ) .await?; - let index_name = format!("{}_pipeline_chunk_document_id_index", key); + let index_name = format!("{}_pipeline_embedding_document_id_index", key); transaction .execute( query_builder!( queries::CREATE_INDEX, "", index_name, - chunks_table_name, + &embeddings_table_name, "document_id" ) .as_str(), ) .await?; - - if let Some(embed) = &value.embed { - let embeddings_table_name = format!("{}.{}_embeddings", schema, key); - let exists: bool = sqlx::query_scalar( - "SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_schema = $1 AND table_name = $2)" + let index_with_parameters = format!( + "WITH (m = {}, ef_construction = {})", + embed.hnsw.m, embed.hnsw.ef_construction + ); + let index_name = format!("{}_pipeline_embedding_hnsw_vector_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX_USING_HNSW, + "", + index_name, + &embeddings_table_name, + "embedding vector_cosine_ops", + index_with_parameters ) - .bind(&schema) - .bind(&embeddings_table_name).fetch_one(&pool).await?; + .as_str(), + ) + .await?; + } - if !exists { - let embedding_length = match &embed.model.runtime { - ModelRuntime::Python => { - let embedding: (Vec,) = sqlx::query_as( - "SELECT embedding from pgml.embed(transformer => $1, text => 'Hello, World!', kwargs => $2) as embedding") - .bind(&embed.model.name) - .bind(&embed.model.parameters) - .fetch_one(&pool).await?; - embedding.0.len() as i64 - } - t => { - let remote_embeddings = build_remote_embeddings( - t.to_owned(), - &embed.model.name, - Some(&embed.model.parameters), - )?; - remote_embeddings.get_embedding_size().await? 
- } - }; - - // Create the embeddings table - sqlx::query(&query_builder!( - queries::CREATE_EMBEDDINGS_TABLE, - &embeddings_table_name, + // Create the tsvectors table + if value.full_text_search.is_some() { + let tsvectors_table_name = format!("{}.{}_tsvectors", schema, key); + transaction + .execute( + query_builder!( + queries::CREATE_CHUNKS_TSVECTORS_TABLE, + tsvectors_table_name, chunks_table_name, - documents_table_name, - embedding_length - )) - .execute(&mut *transaction) - .await?; - let index_name = format!("{}_pipeline_embedding_chunk_id_index", key); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX, - "", - index_name, - &embeddings_table_name, - "chunk_id" - ) - .as_str(), - ) - .await?; - let index_name = format!("{}_pipeline_embedding_document_id_index", key); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX, - "", - index_name, - &embeddings_table_name, - "document_id" - ) - .as_str(), - ) - .await?; - let index_with_parameters = format!( - "WITH (m = {}, ef_construction = {})", - embed.hnsw.m, embed.hnsw.ef_construction - ); - let index_name = format!("{}_pipeline_embedding_hnsw_vector_index", key); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX_USING_HNSW, - "", - index_name, - &embeddings_table_name, - "embedding vector_cosine_ops", - index_with_parameters - ) - .as_str(), - ) - .await?; - } - } - - // Create the tsvectors table - if value.full_text_search.is_some() { - let tsvectors_table_name = format!("{}.{}_tsvectors", schema, key); - transaction - .execute( - query_builder!( - queries::CREATE_CHUNKS_TSVECTORS_TABLE, - tsvectors_table_name, - chunks_table_name, - documents_table_name - ) - .as_str(), + documents_table_name ) - .await?; - let index_name = format!("{}_pipeline_tsvector_chunk_id_index", key); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX, - "", - index_name, - tsvectors_table_name, - "chunk_id" - ) - .as_str(), + .as_str(), + ) + .await?; + let index_name = format!("{}_pipeline_tsvector_chunk_id_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX, + "", + index_name, + tsvectors_table_name, + "chunk_id" ) - .await?; - let index_name = format!("{}_pipeline_tsvector_document_id_index", key); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX, - "", - index_name, - tsvectors_table_name, - "document_id" - ) - .as_str(), + .as_str(), + ) + .await?; + let index_name = format!("{}_pipeline_tsvector_document_id_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX, + "", + index_name, + tsvectors_table_name, + "document_id" ) - .await?; - let index_name = format!("{}_pipeline_tsvector_index", key); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX_USING_GIN, - "", - index_name, - tsvectors_table_name, - "ts" - ) - .as_str(), + .as_str(), + ) + .await?; + let index_name = format!("{}_pipeline_tsvector_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX_USING_GIN, + "", + index_name, + tsvectors_table_name, + "ts" ) - .await?; - } + .as_str(), + ) + .await?; } - transaction.commit().await?; } Ok(()) } #[instrument(skip(self))] - pub(crate) async fn execute( + pub(crate) async fn sync_document( &mut self, - document_id: Option, + document_id: i64, transaction: Arc>>, ) -> anyhow::Result<()> { - // We are assuming we have manually verified the pipeline before doing this + self.verify_in_database(false).await?; + // We are assuming we have manually verified the 
pipeline before doing this let parsed_schema = self .parsed_schema .as_ref() @@ -476,7 +464,7 @@ impl MultiFieldPipeline { for (key, value) in parsed_schema.iter() { let chunk_ids = self - .sync_chunks( + .sync_chunks_for_document( key, value.splitter.as_ref().map(|v| &v.model), document_id, @@ -485,11 +473,16 @@ impl MultiFieldPipeline { .await?; if !chunk_ids.is_empty() { if let Some(embed) = &value.embed { - self.sync_embeddings(key, &embed.model, &chunk_ids, transaction.clone()) - .await?; + self.sync_embeddings_for_chunks( + key, + &embed.model, + &chunk_ids, + transaction.clone(), + ) + .await?; } if let Some(full_text_search) = &value.full_text_search { - self.sync_tsvectors( + self.sync_tsvectors_for_chunks( key, &full_text_search.configuration, &chunk_ids, @@ -503,11 +496,11 @@ impl MultiFieldPipeline { } #[instrument(skip(self))] - async fn sync_chunks( + async fn sync_chunks_for_document( &self, key: &str, splitter: Option<&Splitter>, - document_id: Option, + document_id: i64, transaction: Arc>>, ) -> anyhow::Result> { let project_info = self @@ -525,41 +518,28 @@ impl MultiFieldPipeline { .as_ref() .context("Splitter must be verified to sync chunks")?; - let chunk_ids: Result, _> = if document_id.is_some() { - sqlx::query(&query_builder!( - queries::GENERATE_CHUNKS_FOR_DOCUMENT_ID, - &chunks_table_name, - &json_key_query, - documents_table_name - )) - .bind(splitter_database_data.id) - .bind(document_id) - .execute(&mut *transaction.lock().await) - .await?; - sqlx::query_scalar(&query_builder!( - "SELECT id FROM %s WHERE document_id = $1", - &chunks_table_name - )) - .bind(document_id) - .fetch_all(&mut *transaction.lock().await) - .await - } else { - sqlx::query_scalar(&query_builder!( - queries::GENERATE_CHUNKS, - &chunks_table_name, - &json_key_query, - documents_table_name, - &chunks_table_name - )) - .bind(splitter_database_data.id) - .fetch_all(&mut *transaction.lock().await) - .await - }; - chunk_ids.map_err(anyhow::Error::msg) + sqlx::query(&query_builder!( + queries::GENERATE_CHUNKS_FOR_DOCUMENT_ID, + &chunks_table_name, + &json_key_query, + documents_table_name + )) + .bind(splitter_database_data.id) + .bind(document_id) + .execute(&mut *transaction.lock().await) + .await?; + + sqlx::query_scalar(&query_builder!( + "SELECT id FROM %s WHERE document_id = $1", + &chunks_table_name + )) + .bind(document_id) + .fetch_all(&mut *transaction.lock().await) + .await + .map_err(anyhow::Error::msg) } else { - match document_id { - Some(document_id) => sqlx::query_scalar(&query_builder!( - r#" + sqlx::query_scalar(&query_builder!( + r#" INSERT INTO %s( document_id, chunk_index, chunk ) @@ -572,42 +552,19 @@ impl MultiFieldPipeline { ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk RETURNING id "#, - &chunks_table_name, - &json_key_query, - &documents_table_name - )) - .bind(document_id) - .fetch_all(&mut *transaction.lock().await) - .await - .map_err(anyhow::Error::msg), - None => sqlx::query_scalar(&query_builder!( - r#" - INSERT INTO %s( - document_id, chunk_index, chunk - ) - SELECT - id, - 1, - %d - FROM %s - WHERE id NOT IN (SELECT document_id FROM %s) - ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk - RETURNING id - "#, - &chunks_table_name, - &json_key_query, - &documents_table_name, - &chunks_table_name - )) - .fetch_all(&mut *transaction.lock().await) - .await - .map_err(anyhow::Error::msg), - } + &chunks_table_name, + &json_key_query, + &documents_table_name + )) + .bind(document_id) + .fetch_all(&mut 
*transaction.lock().await) + .await + .map_err(anyhow::Error::msg) } } #[instrument(skip(self))] - async fn sync_embeddings( + async fn sync_embeddings_for_chunks( &self, key: &str, model: &Model, @@ -649,8 +606,8 @@ impl MultiFieldPipeline { .generate_embeddings( &embeddings_table_name, &chunks_table_name, - chunk_ids, - transaction, + Some(chunk_ids), + PoolOrArcMutextTransaction::ArcMutextTransaction(transaction), ) .await?; } @@ -659,7 +616,7 @@ impl MultiFieldPipeline { } #[instrument(skip(self))] - async fn sync_tsvectors( + async fn sync_tsvectors_for_chunks( &self, key: &str, configuration: &str, @@ -686,6 +643,169 @@ impl MultiFieldPipeline { Ok(()) } + #[instrument(skip(self))] + pub async fn resync(&mut self) -> anyhow::Result<()> { + self.verify_in_database(false).await?; + + // We are assuming we have manually verified the pipeline before doing this + let project_info = self + .project_info + .as_ref() + .context("Pipeline must have project info to sync chunks")?; + let parsed_schema = self + .parsed_schema + .as_ref() + .context("Pipeline must have schema to execute")?; + + // Before doing any syncing, delete all old and potentially outdated documents + let pool = self.get_pool().await?; + for (key, _value) in parsed_schema.iter() { + let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); + pool.execute(query_builder!("DELETE FROM %s CASCADE", chunks_table_name).as_str()) + .await?; + } + + for (key, value) in parsed_schema.iter() { + self.resync_chunks(key, value.splitter.as_ref().map(|v| &v.model)) + .await?; + if let Some(embed) = &value.embed { + self.resync_embeddings(key, &embed.model).await?; + } + if let Some(full_text_search) = &value.full_text_search { + self.resync_tsvectors(key, &full_text_search.configuration) + .await?; + } + } + Ok(()) + } + + #[instrument(skip(self))] + async fn resync_chunks(&self, key: &str, splitter: Option<&Splitter>) -> anyhow::Result<()> { + let project_info = self + .project_info + .as_ref() + .context("Pipeline must have project info to sync chunks")?; + + let pool = self.get_pool().await?; + + let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); + let documents_table_name = format!("{}.documents", project_info.name); + let json_key_query = format!("document->>'{}'", key); + + if let Some(splitter) = splitter { + let splitter_database_data = splitter + .database_data + .as_ref() + .context("Splitter must be verified to sync chunks")?; + + sqlx::query(&query_builder!( + queries::GENERATE_CHUNKS, + &chunks_table_name, + &json_key_query, + documents_table_name, + &chunks_table_name + )) + .bind(splitter_database_data.id) + .execute(&pool) + .await?; + } else { + sqlx::query(&query_builder!( + r#" + INSERT INTO %s( + document_id, chunk_index, chunk + ) + SELECT + id, + 1, + %d + FROM %s + WHERE id NOT IN (SELECT document_id FROM %s) + ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk + RETURNING id + "#, + &chunks_table_name, + &json_key_query, + &documents_table_name, + &chunks_table_name + )) + .execute(&pool) + .await?; + } + Ok(()) + } + + #[instrument(skip(self))] + async fn resync_embeddings(&self, key: &str, model: &Model) -> anyhow::Result<()> { + let pool = self.get_pool().await?; + + // Remove the stored name from the parameters + let mut parameters = model.parameters.clone(); + parameters + .as_object_mut() + .context("Model parameters must be an object")? 
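+            // the model name is bound separately as the transformer argument below,
+            // so it is stripped from the kwargs passed to pgml.embed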
+ .remove("name"); + + let project_info = self + .project_info + .as_ref() + .context("Pipeline must have project info to sync chunks")?; + + let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); + let embeddings_table_name = + format!("{}_{}.{}_embeddings", project_info.name, self.name, key); + + match model.runtime { + ModelRuntime::Python => { + sqlx::query(&query_builder!( + queries::GENERATE_EMBEDDINGS, + embeddings_table_name, + chunks_table_name, + embeddings_table_name + )) + .bind(&model.name) + .bind(¶meters) + .execute(&pool) + .await?; + } + r => { + let remote_embeddings = build_remote_embeddings(r, &model.name, Some(¶meters))?; + remote_embeddings + .generate_embeddings( + &embeddings_table_name, + &chunks_table_name, + None, + PoolOrArcMutextTransaction::Pool(pool), + ) + .await?; + } + } + Ok(()) + } + + #[instrument(skip(self))] + async fn resync_tsvectors(&self, key: &str, configuration: &str) -> anyhow::Result<()> { + let project_info = self + .project_info + .as_ref() + .context("Pipeline must have project info to sync TSVectors")?; + + let pool = self.get_pool().await?; + + let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); + let tsvectors_table_name = format!("{}_{}.{}_tsvectors", project_info.name, self.name, key); + + sqlx::query(&query_builder!( + queries::GENERATE_TSVECTORS, + tsvectors_table_name, + configuration, + chunks_table_name, + tsvectors_table_name + )) + .execute(&pool) + .await?; + Ok(()) + } + async fn get_pool(&self) -> anyhow::Result { let database_url = &self .project_info diff --git a/pgml-sdks/pgml/src/queries.rs b/pgml-sdks/pgml/src/queries.rs index e318fd2d9..0f38f584f 100644 --- a/pgml-sdks/pgml/src/queries.rs +++ b/pgml-sdks/pgml/src/queries.rs @@ -118,6 +118,24 @@ WHERE id = ANY ($1) ON CONFLICT (chunk_id) DO UPDATE SET ts = EXCLUDED.ts; "#; +pub const GENERATE_TSVECTORS: &str = r#" +INSERT INTO %s (chunk_id, document_id, ts) +SELECT + id, + document_id, + to_tsvector('%d', chunk) ts +FROM + %s +WHERE + id NOT IN ( + SELECT + chunk_id + FROM + %s + ) +ON CONFLICT (chunk_id) DO NOTHING; +"#; + pub const GENERATE_EMBEDDINGS_FOR_CHUNK_IDS: &str = r#" INSERT INTO %s (chunk_id, document_id, embedding) SELECT @@ -135,58 +153,26 @@ WHERE ON CONFLICT (chunk_id) DO UPDATE SET embedding = EXCLUDED.embedding "#; -pub const EMBED_AND_VECTOR_SEARCH: &str = r#" -WITH pipeline AS ( - SELECT - model_id - FROM - %s - WHERE - name = $1 -), -model AS ( - SELECT - hyperparams - FROM - pgml.models - WHERE - id = (SELECT model_id FROM pipeline) -), -embedding AS ( - SELECT - pgml.embed( - transformer => (SELECT hyperparams->>'name' FROM model), - text => $2, - kwargs => $3 - )::vector AS embedding -) -SELECT - embeddings.embedding <=> (SELECT embedding FROM embedding) score, - chunks.chunk, - documents.metadata -FROM - %s embeddings - INNER JOIN %s chunks ON chunks.id = embeddings.chunk_id - INNER JOIN %s documents ON documents.id = chunks.document_id - ORDER BY - score ASC - LIMIT - $4; -"#; - -pub const VECTOR_SEARCH: &str = r#" +pub const GENERATE_EMBEDDINGS: &str = r#" +INSERT INTO %s (chunk_id, document_id, embedding) SELECT - embeddings.embedding <=> $1::vector score, - chunks.chunk, - documents.metadata + id, + document_id, + pgml.embed( + text => chunk, + transformer => $1, + kwargs => $2 + ) FROM - %s embeddings - INNER JOIN %s chunks ON chunks.id = embeddings.chunk_id - INNER JOIN %s documents ON documents.id = chunks.document_id - ORDER BY - score ASC - LIMIT - $2; + %s +WHERE + id 
NOT IN ( + SELECT + chunk_id + FROM + %s + ) +ON CONFLICT (chunk_id) DO NOTHING; "#; pub const GENERATE_CHUNKS: &str = r#" @@ -232,7 +218,6 @@ FROM ) AS documents ) chunks ON CONFLICT (document_id, chunk_index) DO NOTHING -RETURNING id, document_id "#; pub const GENERATE_CHUNKS_FOR_DOCUMENT_ID: &str = r#" diff --git a/pgml-sdks/pgml/src/remote_embeddings.rs b/pgml-sdks/pgml/src/remote_embeddings.rs index 3a7ba98d0..c4ea98469 100644 --- a/pgml-sdks/pgml/src/remote_embeddings.rs +++ b/pgml-sdks/pgml/src/remote_embeddings.rs @@ -7,6 +7,12 @@ use tracing::instrument; use crate::{model::ModelRuntime, models, query_builder, types::Json}; +#[derive(Clone, Debug)] +pub enum PoolOrArcMutextTransaction { + Pool(PgPool), + ArcMutextTransaction(Arc>>), +} + pub fn build_remote_embeddings<'a>( source: ModelRuntime, model_name: &'a str, @@ -43,26 +49,46 @@ pub trait RemoteEmbeddings<'a> { self.parse_response(response) } - #[instrument(skip(self, transaction))] + #[instrument(skip(self))] async fn get_chunks( &self, embeddings_table_name: &str, chunks_table_name: &str, - chunk_ids: &Vec, - transaction: Arc>>, + chunk_ids: Option<&Vec>, + mut db_executor: PoolOrArcMutextTransaction, limit: Option, ) -> anyhow::Result> { let limit = limit.unwrap_or(1000); - sqlx::query_as(&query_builder!( - "SELECT * FROM %s WHERE id NOT IN (SELECT chunk_id FROM %s) AND id = ANY ($1) LIMIT $2", - chunks_table_name, - embeddings_table_name - )) - .bind(chunk_ids) - .bind(limit) - .fetch_all(&mut *transaction.lock().await) - .await + // Requires _query_text be declared out here so it lives long enough + let mut _query_text = "".to_string(); + let query = match chunk_ids { + Some(chunk_ids) => { + _query_text = query_builder!( + "SELECT * FROM %s WHERE id = ANY ($1) LIMIT $2", + chunks_table_name, + embeddings_table_name + ); + sqlx::query_as(_query_text.as_str()) + .bind(chunk_ids) + .bind(limit) + } + None => { + _query_text = query_builder!( + "SELECT * FROM %s WHERE id NOT IN (SELECT chunk_id FROM %s) LIMIT $1", + chunks_table_name, + embeddings_table_name + ); + sqlx::query_as(_query_text.as_str()).bind(limit) + } + }; + + match &mut db_executor { + PoolOrArcMutextTransaction::Pool(pool) => query.fetch_all(&*pool).await, + PoolOrArcMutextTransaction::ArcMutextTransaction(transaction) => { + query.fetch_all(&mut *transaction.lock().await).await + } + } .map_err(|e| anyhow::anyhow!(e)) } @@ -89,13 +115,13 @@ pub trait RemoteEmbeddings<'a> { Ok(embeddings) } - #[instrument(skip(self, transaction))] + #[instrument(skip(self))] async fn generate_embeddings( &self, embeddings_table_name: &str, chunks_table_name: &str, - chunk_ids: &Vec, - transaction: Arc>>, + chunk_ids: Option<&Vec>, + mut db_executor: PoolOrArcMutextTransaction, ) -> anyhow::Result<()> { loop { let chunks = self @@ -103,7 +129,7 @@ pub trait RemoteEmbeddings<'a> { embeddings_table_name, chunks_table_name, chunk_ids, - transaction.clone(), + db_executor.clone(), None, ) .await?; @@ -140,7 +166,13 @@ pub trait RemoteEmbeddings<'a> { query = query.bind(chunk_ids[i]).bind(&embeddings[i]); } - query.execute(&mut *transaction.lock().await).await?; + // query.execute(&mut *transaction.lock().await).await?; + match &mut db_executor { + PoolOrArcMutextTransaction::Pool(pool) => query.execute(&*pool).await, + PoolOrArcMutextTransaction::ArcMutextTransaction(transaction) => { + query.execute(&mut *transaction.lock().await).await + } + }?; } Ok(()) } From c8e1af8abc035f3ed4c35503e0afb74d6dfbea58 Mon Sep 17 00:00:00 2001 From: Silas Marvin 
<19626586+SilasMarvin@users.noreply.github.com> Date: Thu, 18 Jan 2024 13:58:24 -0800 Subject: [PATCH 08/72] Cleaned up some tests --- pgml-sdks/pgml/src/collection.rs | 3 +- pgml-sdks/pgml/src/lib.rs | 533 +++++++------------------------ 2 files changed, 115 insertions(+), 421 deletions(-) diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index 7553e43f7..575c88858 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -9,7 +9,6 @@ use serde_json::json; use sqlx::postgres::PgPool; use sqlx::Executor; use sqlx::PgConnection; -use sqlx::Postgres; use sqlx::Transaction; use std::borrow::Cow; use std::path::Path; @@ -376,7 +375,7 @@ impl Collection { // 1. Set ACTIVE = TRUE for the pipeline in collection.pipelines // 2. Resync the pipeline sqlx::query(&query_builder!( - "UPDATE %s SET active = FALSE WHERE name = $1", + "UPDATE %s SET active = TRUE WHERE name = $1", self.pipelines_table_name )) .bind(&pipeline.name) diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 94b21e590..0f0e4db18 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -436,6 +436,64 @@ mod tests { Ok(()) } + #[sqlx::test] + async fn disable_enable_pipeline() -> anyhow::Result<()> { + let mut pipeline = MultiFieldPipeline::new("test_p_dep_1", Some(json!({}).into()))?; + let mut collection = Collection::new("test_r_c_dep_1", None); + collection.add_pipeline(&mut pipeline).await?; + let queried_pipeline = &collection.get_pipelines().await?[0]; + assert_eq!(pipeline.name, queried_pipeline.name); + collection.disable_pipeline(&pipeline).await?; + let queried_pipelines = &collection.get_pipelines().await?; + assert!(queried_pipelines.is_empty()); + collection.enable_pipeline(&mut pipeline).await?; + let queried_pipeline = &collection.get_pipelines().await?[0]; + assert_eq!(pipeline.name, queried_pipeline.name); + collection.archive().await?; + Ok(()) + } + + #[sqlx::test] + async fn can_upsert_documents_and_enable_pipeline() -> anyhow::Result<()> { + internal_init_logger(None, None).ok(); + let collection_name = "test_r_c_cudaap_43"; + let mut collection = Collection::new(collection_name, None); + let pipeline_name = "test_r_p_cudaap_9"; + let mut pipeline = MultiFieldPipeline::new( + pipeline_name, + Some( + json!({ + "title": { + "embed": { + "model": "intfloat/e5-small" + } + } + }) + .into(), + ), + )?; + collection.add_pipeline(&mut pipeline).await?; + collection.disable_pipeline(&pipeline).await?; + let documents = generate_dummy_documents(2); + collection.upsert_documents(documents, None).await?; + let pool = get_or_initialize_pool(&None).await?; + let chunks_table = format!("{}_{}.title_chunks", collection_name, pipeline_name); + let title_chunks: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", chunks_table)) + .fetch_all(&pool) + .await?; + assert!(title_chunks.len() == 0); + collection.enable_pipeline(&mut pipeline).await?; + let chunks_table = format!("{}_{}.title_chunks", collection_name, pipeline_name); + let title_chunks: Vec = + sqlx::query_as(&query_builder!("SELECT * FROM %s", chunks_table)) + .fetch_all(&pool) + .await?; + assert!(title_chunks.len() == 2); + collection.archive().await?; + Ok(()) + } + #[sqlx::test] async fn random_pipelines_documents_test() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); @@ -588,6 +646,10 @@ mod tests { Ok(()) } + /////////////////////////////// + // Searches /////////////////// + /////////////////////////////// + #[sqlx::test] async fn 
can_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); @@ -755,6 +817,10 @@ mod tests { Ok(()) } + /////////////////////////////// + // Vector Searches ///////////// + /////////////////////////////// + #[sqlx::test] async fn can_vector_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); @@ -893,50 +959,6 @@ mod tests { Ok(()) } - #[sqlx::test] - async fn generate_er_diagram() -> anyhow::Result<()> { - internal_init_logger(None, None).ok(); - let mut pipeline = MultiFieldPipeline::new( - "test_p_ged_57", - Some( - json!({ - "title": { - "embed": { - "model": "intfloat/e5-small" - }, - "full_text_search": { - "configuration": "english" - } - }, - "body": { - "splitter": { - "model": "recursive_character" - }, - "embed": { - "model": "intfloat/e5-small" - }, - "full_text_search": { - "configuration": "english" - } - }, - "notes": { - "embed": { - "model": "intfloat/e5-small" - } - } - }) - .into(), - ), - )?; - let mut collection = Collection::new("test_r_c_ged_2", None); - collection.add_pipeline(&mut pipeline).await?; - let diagram = collection.generate_er_diagram(&mut pipeline).await?; - assert!(!diagram.is_empty()); - println!("{diagram}"); - collection.archive().await?; - Ok(()) - } - // #[sqlx::test] // async fn can_specify_custom_hnsw_parameters_for_pipelines() -> anyhow::Result<()> { // internal_init_logger(None, None).ok(); @@ -977,25 +999,6 @@ mod tests { // Ok(()) // } - // #[sqlx::test] - // async fn disable_enable_pipeline() -> anyhow::Result<()> { - // let model = Model::default(); - // let splitter = Splitter::default(); - // let mut pipeline = Pipeline::new("test_p_dep_0", Some(model), Some(splitter), None); - // let mut collection = Collection::new("test_r_c_dep_1", None); - // collection.add_pipeline(&mut pipeline).await?; - // let queried_pipeline = &collection.get_pipelines().await?[0]; - // assert_eq!(pipeline.name, queried_pipeline.name); - // collection.disable_pipeline(&pipeline).await?; - // let queried_pipelines = &collection.get_pipelines().await?; - // assert!(queried_pipelines.is_empty()); - // collection.enable_pipeline(&pipeline).await?; - // let queried_pipeline = &collection.get_pipelines().await?[0]; - // assert_eq!(pipeline.name, queried_pipeline.name); - // collection.archive().await?; - // Ok(()) - // } - // #[sqlx::test] // async fn sync_multiple_pipelines() -> anyhow::Result<()> { // internal_init_logger(None, None).ok(); @@ -1049,362 +1052,6 @@ mod tests { // Ok(()) // } - // /////////////////////////////// - // // Various Searches /////////// - // /////////////////////////////// - - // #[sqlx::test] - // async fn can_vector_search_with_local_embeddings() -> anyhow::Result<()> { - // internal_init_logger(None, None).ok(); - // let model = Model::default(); - // let splitter = Splitter::default(); - // let mut pipeline = Pipeline::new( - // "test_r_p_cvswle_1", - // Some(model), - // Some(splitter), - // Some( - // serde_json::json!({ - // "full_text_search": { - // "active": true, - // "configuration": "english" - // } - // }) - // .into(), - // ), - // ); - // let mut collection = Collection::new("test_r_c_cvswle_28", None); - // collection.add_pipeline(&mut pipeline).await?; - - // // Recreate the pipeline to replicate a more accurate example - // let mut pipeline = Pipeline::new("test_r_p_cvswle_1", None, None, None); - // collection - // .upsert_documents(generate_dummy_documents(3), None) - // .await?; - // let results = collection - // .vector_search("Here is some 
query", &mut pipeline, None, None) - // .await?; - // assert!(results.len() == 3); - // collection.archive().await?; - // Ok(()) - // } - - // #[sqlx::test] - // async fn can_vector_search_with_remote_embeddings() -> anyhow::Result<()> { - // internal_init_logger(None, None).ok(); - // let model = Model::new( - // Some("text-embedding-ada-002".to_string()), - // Some("openai".to_string()), - // None, - // ); - // let splitter = Splitter::default(); - // let mut pipeline = Pipeline::new( - // "test_r_p_cvswre_1", - // Some(model), - // Some(splitter), - // Some( - // serde_json::json!({ - // "full_text_search": { - // "active": true, - // "configuration": "english" - // } - // }) - // .into(), - // ), - // ); - // let mut collection = Collection::new("test_r_c_cvswre_21", None); - // collection.add_pipeline(&mut pipeline).await?; - - // // Recreate the pipeline to replicate a more accurate example - // let mut pipeline = Pipeline::new("test_r_p_cvswre_1", None, None, None); - // collection - // .upsert_documents(generate_dummy_documents(3), None) - // .await?; - // let results = collection - // .vector_search("Here is some query", &mut pipeline, None, Some(10)) - // .await?; - // assert!(results.len() == 3); - // collection.archive().await?; - // Ok(()) - // } - - // #[sqlx::test] - // async fn can_vector_search_with_query_builder() -> anyhow::Result<()> { - // internal_init_logger(None, None).ok(); - // let model = Model::default(); - // let splitter = Splitter::default(); - // let mut pipeline = Pipeline::new( - // "test_r_p_cvswqb_1", - // Some(model), - // Some(splitter), - // Some( - // serde_json::json!({ - // "full_text_search": { - // "active": true, - // "configuration": "english" - // } - // }) - // .into(), - // ), - // ); - // let mut collection = Collection::new("test_r_c_cvswqb_4", None); - // collection.add_pipeline(&mut pipeline).await?; - - // // Recreate the pipeline to replicate a more accurate example - // let pipeline = Pipeline::new("test_r_p_cvswqb_1", None, None, None); - // collection - // .upsert_documents(generate_dummy_documents(4), None) - // .await?; - // let results = collection - // .query() - // .vector_recall("Here is some query", &pipeline, None) - // .limit(3) - // .fetch_all() - // .await?; - // assert!(results.len() == 3); - // collection.archive().await?; - // Ok(()) - // } - - // #[sqlx::test] - // async fn can_vector_search_with_query_builder_and_pass_model_parameters_in_search( - // ) -> anyhow::Result<()> { - // internal_init_logger(None, None).ok(); - // let model = Model::new( - // Some("hkunlp/instructor-base".to_string()), - // Some("python".to_string()), - // Some(json!({"instruction": "Represent the Wikipedia document for retrieval: "}).into()), - // ); - // let splitter = Splitter::default(); - // let mut pipeline = Pipeline::new( - // "test_r_p_cvswqbapmpis_1", - // Some(model), - // Some(splitter), - // Some( - // serde_json::json!({ - // "full_text_search": { - // "active": true, - // "configuration": "english" - // } - // }) - // .into(), - // ), - // ); - // let mut collection = Collection::new("test_r_c_cvswqbapmpis_4", None); - // collection.add_pipeline(&mut pipeline).await?; - - // // Recreate the pipeline to replicate a more accurate example - // let pipeline = Pipeline::new("test_r_p_cvswqbapmpis_1", None, None, None); - // collection - // .upsert_documents(generate_dummy_documents(3), None) - // .await?; - // let results = collection - // .query() - // .vector_recall( - // "Here is some query", - // &pipeline, - // Some( - // 
json!({ - // "instruction": "Represent the Wikipedia document for retrieval: " - // }) - // .into(), - // ), - // ) - // .limit(10) - // .fetch_all() - // .await?; - // assert!(results.len() == 3); - // collection.archive().await?; - // Ok(()) - // } - - // #[sqlx::test] - // async fn can_vector_search_with_query_builder_with_remote_embeddings() -> anyhow::Result<()> { - // internal_init_logger(None, None).ok(); - // let model = Model::new( - // Some("text-embedding-ada-002".to_string()), - // Some("openai".to_string()), - // None, - // ); - // let splitter = Splitter::default(); - // let mut pipeline = Pipeline::new( - // "test_r_p_cvswqbwre_1", - // Some(model), - // Some(splitter), - // Some( - // serde_json::json!({ - // "full_text_search": { - // "active": true, - // "configuration": "english" - // } - // }) - // .into(), - // ), - // ); - // let mut collection = Collection::new("test_r_c_cvswqbwre_5", None); - // collection.add_pipeline(&mut pipeline).await?; - - // // Recreate the pipeline to replicate a more accurate example - // let pipeline = Pipeline::new("test_r_p_cvswqbwre_1", None, None, None); - // collection - // .upsert_documents(generate_dummy_documents(4), None) - // .await?; - // let results = collection - // .query() - // .vector_recall("Here is some query", &pipeline, None) - // .limit(3) - // .fetch_all() - // .await?; - // assert!(results.len() == 3); - // collection.archive().await?; - // Ok(()) - // } - - // #[sqlx::test] - // async fn can_vector_search_with_query_builder_and_custom_hnsw_ef_search_value( - // ) -> anyhow::Result<()> { - // internal_init_logger(None, None).ok(); - // let model = Model::default(); - // let splitter = Splitter::default(); - // let mut pipeline = - // Pipeline::new("test_r_p_cvswqbachesv_1", Some(model), Some(splitter), None); - // let mut collection = Collection::new("test_r_c_cvswqbachesv_3", None); - // collection.add_pipeline(&mut pipeline).await?; - - // // Recreate the pipeline to replicate a more accurate example - // let pipeline = Pipeline::new("test_r_p_cvswqbachesv_1", None, None, None); - // collection - // .upsert_documents(generate_dummy_documents(3), None) - // .await?; - // let results = collection - // .query() - // .vector_recall( - // "Here is some query", - // &pipeline, - // Some( - // json!({ - // "hnsw": { - // "ef_search": 2 - // } - // }) - // .into(), - // ), - // ) - // .fetch_all() - // .await?; - // assert!(results.len() == 3); - // collection.archive().await?; - // Ok(()) - // } - - // #[sqlx::test] - // async fn can_vector_search_with_query_builder_and_custom_hnsw_ef_search_value_and_remote_embeddings( - // ) -> anyhow::Result<()> { - // internal_init_logger(None, None).ok(); - // let model = Model::new( - // Some("text-embedding-ada-002".to_string()), - // Some("openai".to_string()), - // None, - // ); - // let splitter = Splitter::default(); - // let mut pipeline = Pipeline::new( - // "test_r_p_cvswqbachesvare_2", - // Some(model), - // Some(splitter), - // None, - // ); - // let mut collection = Collection::new("test_r_c_cvswqbachesvare_7", None); - // collection.add_pipeline(&mut pipeline).await?; - - // // Recreate the pipeline to replicate a more accurate example - // let pipeline = Pipeline::new("test_r_p_cvswqbachesvare_2", None, None, None); - // collection - // .upsert_documents(generate_dummy_documents(3), None) - // .await?; - // let results = collection - // .query() - // .vector_recall( - // "Here is some query", - // &pipeline, - // Some( - // json!({ - // "hnsw": { - // "ef_search": 2 - 
// } - // }) - // .into(), - // ), - // ) - // .fetch_all() - // .await?; - // assert!(results.len() == 3); - // collection.archive().await?; - // Ok(()) - // } - - // #[sqlx::test] - // async fn can_filter_vector_search() -> anyhow::Result<()> { - // internal_init_logger(None, None).ok(); - // let model = Model::default(); - // let splitter = Splitter::default(); - // let mut pipeline = Pipeline::new( - // "test_r_p_cfd_1", - // Some(model), - // Some(splitter), - // Some( - // serde_json::json!({ - // "full_text_search": { - // "active": true, - // "configuration": "english" - // } - // }) - // .into(), - // ), - // ); - // let mut collection = Collection::new("test_r_c_cfd_2", None); - // collection.add_pipeline(&mut pipeline).await?; - // collection - // .upsert_documents(generate_dummy_documents(5), None) - // .await?; - - // let filters = vec![ - // (5, json!({}).into()), - // ( - // 3, - // json!({ - // "metadata": { - // "id": { - // "$lt": 3 - // } - // } - // }) - // .into(), - // ), - // ( - // 1, - // json!({ - // "full_text_search": { - // "configuration": "english", - // "text": "1", - // } - // }) - // .into(), - // ), - // ]; - - // for (expected_result_count, filter) in filters { - // let results = collection - // .query() - // .vector_recall("Here is some query", &pipeline, None) - // .filter(filter) - // .fetch_all() - // .await?; - // assert_eq!(results.len(), expected_result_count); - // } - - // collection.archive().await?; - // Ok(()) - // } - /////////////////////////////// // Working With Documents ///// /////////////////////////////// @@ -1884,4 +1531,52 @@ mod tests { collection.archive().await?; Ok(()) } + + /////////////////////////////// + // ER Diagram ///////////////// + /////////////////////////////// + + #[sqlx::test] + async fn generate_er_diagram() -> anyhow::Result<()> { + internal_init_logger(None, None).ok(); + let mut pipeline = MultiFieldPipeline::new( + "test_p_ged_57", + Some( + json!({ + "title": { + "embed": { + "model": "intfloat/e5-small" + }, + "full_text_search": { + "configuration": "english" + } + }, + "body": { + "splitter": { + "model": "recursive_character" + }, + "embed": { + "model": "intfloat/e5-small" + }, + "full_text_search": { + "configuration": "english" + } + }, + "notes": { + "embed": { + "model": "intfloat/e5-small" + } + } + }) + .into(), + ), + )?; + let mut collection = Collection::new("test_r_c_ged_2", None); + collection.add_pipeline(&mut pipeline).await?; + let diagram = collection.generate_er_diagram(&mut pipeline).await?; + assert!(!diagram.is_empty()); + println!("{diagram}"); + collection.archive().await?; + Ok(()) + } } From 9df12b571148ce19786e43de1d6418553abd5c78 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Fri, 19 Jan 2024 12:48:20 -0800 Subject: [PATCH 09/72] Switching old pipeline to be a pass through for the new multi field pipeline --- pgml-sdks/pgml/src/lib.rs | 2 +- pgml-sdks/pgml/src/multi_field_pipeline.rs | 3 +- pgml-sdks/pgml/src/pipeline.rs | 678 +-------------------- pgml-sdks/pgml/src/query_builder.rs | 207 +++---- 4 files changed, 137 insertions(+), 753 deletions(-) diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 0f0e4db18..0765b020f 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -818,7 +818,7 @@ mod tests { } /////////////////////////////// - // Vector Searches ///////////// + // Vector Searches //////////// /////////////////////////////// #[sqlx::test] diff --git 
a/pgml-sdks/pgml/src/multi_field_pipeline.rs b/pgml-sdks/pgml/src/multi_field_pipeline.rs index d207c83b2..d3138b4f6 100644 --- a/pgml-sdks/pgml/src/multi_field_pipeline.rs +++ b/pgml-sdks/pgml/src/multi_field_pipeline.rs @@ -142,9 +142,8 @@ pub struct MultiFieldPipelineDatabaseData { pub created_at: DateTime, } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct MultiFieldPipeline { - // TODO: Make the schema and parsed_schema optional fields only required if they try to save a new pipeline that does not exist pub name: String, pub schema: Option, pub parsed_schema: Option, diff --git a/pgml-sdks/pgml/src/pipeline.rs b/pgml-sdks/pgml/src/pipeline.rs index 395729ac9..ea76a51c2 100644 --- a/pgml-sdks/pgml/src/pipeline.rs +++ b/pgml-sdks/pgml/src/pipeline.rs @@ -11,7 +11,8 @@ use crate::{ collection::ProjectInfo, get_or_initialize_pool, model::{Model, ModelRuntime}, - models, queries, query_builder, + multi_field_pipeline::MultiFieldPipeline, + queries, query_builder, remote_embeddings::build_remote_embeddings, splitter::Splitter, types::{DateTime, Json, TryToNumeric}, @@ -126,16 +127,35 @@ impl Pipeline { model: Option, splitter: Option, parameters: Option, - ) -> Self { - let parameters = Some(parameters.unwrap_or_default()); - Self { - name: name.to_string(), - model, - splitter, - parameters, - project_info: None, - database_data: None, - } + ) -> MultiFieldPipeline { + // let schema = serde_json::json!({ + // "text": { + // "embed": { + // "model": model.na + // }); + let schema = if let Some(model) = model { + Some(serde_json::json!({ + "text": { + "embed": { + "model": model.name + } + } + })) + } else { + None + }; + MultiFieldPipeline::new(name, schema.map(|v| v.into())) + .expect("Error conerting pipeline into new multifield pipeline") + + // let parameters = Some(parameters.unwrap_or_default()); + // Self { + // name: name.to_string(), + // model, + // splitter, + // parameters, + // project_info: None, + // database_data: None, + // } } /// Gets the status of the [Pipeline] @@ -226,640 +246,4 @@ impl Pipeline { // tsvectors_status, // }) } - - #[instrument(skip(self))] - pub(crate) async fn verify_in_database(&mut self, throw_if_exists: bool) -> anyhow::Result<()> { - unimplemented!() - // if self.database_data.is_none() { - // let pool = self.get_pool().await?; - - // let project_info = self - // .project_info - // .as_ref() - // .expect("Cannot verify pipeline without project info"); - - // let pipeline: Option = sqlx::query_as(&query_builder!( - // "SELECT * FROM %s WHERE name = $1", - // format!("{}.pipelines", project_info.name) - // )) - // .bind(&self.name) - // .fetch_optional(&pool) - // .await?; - - // let pipeline = if let Some(p) = pipeline { - // if throw_if_exists { - // anyhow::bail!("Pipeline {} already exists", p.name); - // } - // let model: models::Model = sqlx::query_as( - // "SELECT id, created_at, runtime::TEXT, hyperparams FROM pgml.models WHERE id = $1", - // ) - // .bind(p.model_id) - // .fetch_one(&pool) - // .await?; - // let mut model: Model = model.into(); - // model.set_project_info(project_info.clone()); - // self.model = Some(model); - - // let splitter: models::Splitter = - // sqlx::query_as("SELECT * FROM pgml.splitters WHERE id = $1") - // .bind(p.splitter_id) - // .fetch_one(&pool) - // .await?; - // let mut splitter: Splitter = splitter.into(); - // splitter.set_project_info(project_info.clone()); - // self.splitter = Some(splitter); - - // p - // } else { - // let model = self - // .model - // .as_mut() - // .expect("Cannot save pipeline 
without model"); - // model.set_project_info(project_info.clone()); - // model.verify_in_database(false).await?; - - // let splitter = self - // .splitter - // .as_mut() - // .expect("Cannot save pipeline without splitter"); - // splitter.set_project_info(project_info.clone()); - // splitter.verify_in_database(false).await?; - - // sqlx::query_as(&query_builder!( - // "INSERT INTO %s (name, model_id, splitter_id, parameters) VALUES ($1, $2, $3, $4) RETURNING *", - // format!("{}.pipelines", project_info.name) - // )) - // .bind(&self.name) - // .bind( - // model - // .database_data - // .as_ref() - // .context("Cannot save pipeline without model")? - // .id, - // ) - // .bind( - // splitter - // .database_data - // .as_ref() - // .context("Cannot save pipeline without splitter")? - // .id, - // ) - // .bind(&self.parameters) - // .fetch_one(&pool) - // .await? - // }; - - // self.database_data = Some(PipelineDatabaseData { - // id: pipeline.id, - // created_at: pipeline.created_at, - // model_id: pipeline.model_id, - // splitter_id: pipeline.splitter_id, - // }); - // self.parameters = Some(pipeline.parameters); - // } - // Ok(()) - } - - #[instrument(skip(self, mp))] - pub(crate) async fn execute( - &mut self, - document_ids: &Option>, - mp: MultiProgress, - ) -> anyhow::Result<()> { - unimplemented!() - // // TODO: Chunk document_ids if there are too many - - // // A couple notes on the following methods - // // - Atomic bools are required to work nicely with pyo3 otherwise we would use cells - // // - We use green threads because they are cheap, but we want to be super careful to not - // // return an error before stopping the green thread. To meet that end, we map errors and - // // return types often - // let chunk_ids = self.sync_chunks(document_ids, &mp).await?; - // self.sync_embeddings(chunk_ids, &mp).await?; - // self.sync_tsvectors(document_ids, &mp).await?; - // Ok(()) - } - - #[instrument(skip(self, mp))] - async fn sync_chunks( - &mut self, - document_ids: &Option>, - mp: &MultiProgress, - ) -> anyhow::Result>> { - unimplemented!() - // self.verify_in_database(false).await?; - // let pool = self.get_pool().await?; - - // let database_data = self - // .database_data - // .as_mut() - // .context("Pipeline must be verified to generate chunks")?; - - // let project_info = self - // .project_info - // .as_ref() - // .context("Pipeline must have project info to generate chunks")?; - - // let progress_bar = mp - // .add(utils::default_progress_spinner(1)) - // .with_prefix(self.name.clone()) - // .with_message("generating chunks"); - - // // This part is a bit tricky - // // We want to return the ids for all chunks we inserted OR would have inserted if they didn't already exist - // // The query is structured in such a way to not insert any chunks that already exist so we - // // can't rely on the data returned from the inset queries, we need to query the chunks table - // // It is important we return the ids for chunks we would have inserted if they didn't already exist so we are robust to random crashes - // let is_done = AtomicBool::new(false); - // let work = async { - // let chunk_ids: Result>, _> = if document_ids.is_some() { - // sqlx::query(&query_builder!( - // queries::GENERATE_CHUNKS_FOR_DOCUMENT_IDS, - // &format!("{}.chunks", project_info.name), - // &format!("{}.documents", project_info.name), - // &format!("{}.chunks", project_info.name) - // )) - // .bind(database_data.splitter_id) - // .bind(document_ids) - // .execute(&pool) - // .await - // .map_err(|e| { - // 
is_done.store(true, Relaxed); - // e - // })?; - // sqlx::query_scalar(&query_builder!( - // "SELECT id FROM %s WHERE document_id = ANY($1)", - // &format!("{}.chunks", project_info.name) - // )) - // .bind(document_ids) - // .fetch_all(&pool) - // .await - // .map(Some) - // } else { - // sqlx::query(&query_builder!( - // queries::GENERATE_CHUNKS, - // &format!("{}.chunks", project_info.name), - // &format!("{}.documents", project_info.name), - // &format!("{}.chunks", project_info.name) - // )) - // .bind(database_data.splitter_id) - // .execute(&pool) - // .await - // .map(|_t| None) - // }; - // is_done.store(true, Relaxed); - // chunk_ids - // }; - // let progress_work = async { - // while !is_done.load(Relaxed) { - // progress_bar.inc(1); - // tokio::time::sleep(std::time::Duration::from_millis(100)).await; - // } - // }; - // let (chunk_ids, _) = join!(work, progress_work); - // progress_bar.set_message("done generating chunks"); - // progress_bar.finish(); - // Ok(chunk_ids?) - } - - #[instrument(skip(self, mp))] - async fn sync_embeddings( - &mut self, - chunk_ids: Option>, - mp: &MultiProgress, - ) -> anyhow::Result<()> { - unimplemented!() - // self.verify_in_database(false).await?; - // let pool = self.get_pool().await?; - - // let embeddings_table_name = self.create_or_get_embeddings_table().await?; - - // let model = self - // .model - // .as_ref() - // .context("Pipeline must be verified to generate embeddings")?; - - // let database_data = self - // .database_data - // .as_mut() - // .context("Pipeline must be verified to generate embeddings")?; - - // let project_info = self - // .project_info - // .as_ref() - // .context("Pipeline must have project info to generate embeddings")?; - - // // Remove the stored name from the parameters - // let mut parameters = model.parameters.clone(); - // parameters - // .as_object_mut() - // .context("Model parameters must be an object")? - // .remove("name"); - - // let progress_bar = mp - // .add(utils::default_progress_spinner(1)) - // .with_prefix(self.name.clone()) - // .with_message("generating emmbeddings"); - - // let is_done = AtomicBool::new(false); - // // We need to be careful about how we handle errors here. We do not want to return an error - // // from the async block before setting is_done to true. If we do, the progress bar will - // // will load forever. 
We also want to make sure to propogate any errors we have - // let work = async { - // let res = match model.runtime { - // ModelRuntime::Python => if chunk_ids.is_some() { - // sqlx::query(&query_builder!( - // queries::GENERATE_EMBEDDINGS_FOR_CHUNK_IDS, - // embeddings_table_name, - // &format!("{}.chunks", project_info.name), - // embeddings_table_name - // )) - // .bind(&model.name) - // .bind(¶meters) - // .bind(database_data.splitter_id) - // .bind(chunk_ids) - // .execute(&pool) - // .await - // } else { - // sqlx::query(&query_builder!( - // queries::GENERATE_EMBEDDINGS, - // embeddings_table_name, - // &format!("{}.chunks", project_info.name), - // embeddings_table_name - // )) - // .bind(&model.name) - // .bind(¶meters) - // .bind(database_data.splitter_id) - // .execute(&pool) - // .await - // } - // .map_err(|e| anyhow::anyhow!(e)) - // .map(|_t| ()), - // r => { - // let remote_embeddings = build_remote_embeddings(r, &model.name, ¶meters)?; - // remote_embeddings - // .generate_embeddings( - // &embeddings_table_name, - // &format!("{}.chunks", project_info.name), - // database_data.splitter_id, - // chunk_ids, - // &pool, - // ) - // .await - // .map(|_t| ()) - // } - // }; - // is_done.store(true, Relaxed); - // res - // }; - // let progress_work = async { - // while !is_done.load(Relaxed) { - // progress_bar.inc(1); - // tokio::time::sleep(std::time::Duration::from_millis(100)).await; - // } - // }; - // let (res, _) = join!(work, progress_work); - // progress_bar.set_message("done generating embeddings"); - // progress_bar.finish(); - // res - } - - #[instrument(skip(self))] - async fn sync_tsvectors( - &mut self, - document_ids: &Option>, - mp: &MultiProgress, - ) -> anyhow::Result<()> { - unimplemented!() - // self.verify_in_database(false).await?; - // let pool = self.get_pool().await?; - - // let parameters = self - // .parameters - // .as_ref() - // .context("Pipeline must be verified to generate tsvectors")?; - - // if parameters["full_text_search"]["active"] != serde_json::Value::Bool(true) { - // return Ok(()); - // } - - // let project_info = self - // .project_info - // .as_ref() - // .context("Pipeline must have project info to generate tsvectors")?; - - // let progress_bar = mp - // .add(utils::default_progress_spinner(1)) - // .with_prefix(self.name.clone()) - // .with_message("generating tsvectors for full text search"); - - // let configuration = parameters["full_text_search"]["configuration"] - // .as_str() - // .context("Full text search configuration must be a string")?; - - // let is_done = AtomicBool::new(false); - // let work = async { - // let res = if document_ids.is_some() { - // sqlx::query(&query_builder!( - // queries::GENERATE_TSVECTORS_FOR_DOCUMENT_IDS, - // format!("{}.documents_tsvectors", project_info.name), - // configuration, - // configuration, - // format!("{}.documents", project_info.name) - // )) - // .bind(document_ids) - // .execute(&pool) - // .await - // } else { - // sqlx::query(&query_builder!( - // queries::GENERATE_TSVECTORS, - // format!("{}.documents_tsvectors", project_info.name), - // configuration, - // configuration, - // format!("{}.documents", project_info.name) - // )) - // .execute(&pool) - // .await - // }; - // is_done.store(true, Relaxed); - // res.map(|_t| ()).map_err(|e| anyhow::anyhow!(e)) - // }; - // let progress_work = async { - // while !is_done.load(Relaxed) { - // progress_bar.inc(1); - // tokio::time::sleep(std::time::Duration::from_millis(100)).await; - // } - // }; - // let (res, _) = join!(work, 
progress_work); - // progress_bar.set_message("done generating tsvectors for full text search"); - // progress_bar.finish(); - // res - } - - #[instrument(skip(self))] - pub(crate) async fn create_or_get_embeddings_table(&mut self) -> anyhow::Result { - unimplemented!() - // self.verify_in_database(false).await?; - // let pool = self.get_pool().await?; - - // let collection_name = &self - // .project_info - // .as_ref() - // .context("Pipeline must have project info to get the embeddings table name")? - // .name; - // let embeddings_table_name = format!("{}.{}_embeddings", collection_name, self.name); - - // // Notice that we actually check for existence of the table in the database instead of - // // blindly creating it with `CREATE TABLE IF NOT EXISTS`. This is because we want to avoid - // // generating embeddings just to get the length if we don't need to - // let exists: bool = sqlx::query_scalar( - // "SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_schema = $1 AND table_name = $2)" - // ) - // .bind(&self - // .project_info - // .as_ref() - // .context("Pipeline must have project info to get the embeddings table name")?.name) - // .bind(format!("{}_embeddings", self.name)).fetch_one(&pool).await?; - - // if !exists { - // let model = self - // .model - // .as_ref() - // .context("Pipeline must be verified to create embeddings table")?; - - // // Remove the stored name from the model parameters - // let mut model_parameters = model.parameters.clone(); - // model_parameters - // .as_object_mut() - // .context("Model parameters must be an object")? - // .remove("name"); - - // let embedding_length = match &model.runtime { - // ModelRuntime::Python => { - // let embedding: (Vec,) = sqlx::query_as( - // "SELECT embedding from pgml.embed(transformer => $1, text => 'Hello, World!', kwargs => $2) as embedding") - // .bind(&model.name) - // .bind(model_parameters) - // .fetch_one(&pool).await?; - // embedding.0.len() as i64 - // } - // t => { - // let remote_embeddings = - // build_remote_embeddings(t.to_owned(), &model.name, &model_parameters)?; - // remote_embeddings.get_embedding_size().await? - // } - // }; - - // let mut transaction = pool.begin().await?; - // sqlx::query(&query_builder!( - // queries::CREATE_EMBEDDINGS_TABLE, - // &embeddings_table_name, - // &format!( - // "{}.chunks", - // self.project_info - // .as_ref() - // .context("Pipeline must have project info to create the embeddings table")? - // .name - // ), - // embedding_length - // )) - // .execute(&mut *transaction) - // .await?; - // let index_name = format!("{}_pipeline_created_at_index", self.name); - // transaction - // .execute( - // query_builder!( - // queries::CREATE_INDEX, - // "", - // index_name, - // &embeddings_table_name, - // "created_at" - // ) - // .as_str(), - // ) - // .await?; - // let index_name = format!("{}_pipeline_chunk_id_index", self.name); - // transaction - // .execute( - // query_builder!( - // queries::CREATE_INDEX, - // "", - // index_name, - // &embeddings_table_name, - // "chunk_id" - // ) - // .as_str(), - // ) - // .await?; - // // See: https://github.com/pgvector/pgvector - // let (m, ef_construction) = match &self.parameters { - // Some(p) => { - // let m = if !p["hnsw"]["m"].is_null() { - // p["hnsw"]["m"] - // .try_to_u64() - // .context("hnsw.m must be an integer")? 
- // } else { - // 16 - // }; - // let ef_construction = if !p["hnsw"]["ef_construction"].is_null() { - // p["hnsw"]["ef_construction"] - // .try_to_u64() - // .context("hnsw.ef_construction must be an integer")? - // } else { - // 64 - // }; - // (m, ef_construction) - // } - // None => (16, 64), - // }; - // let index_with_parameters = - // format!("WITH (m = {}, ef_construction = {})", m, ef_construction); - // let index_name = format!("{}_pipeline_hnsw_vector_index", self.name); - // transaction - // .execute( - // query_builder!( - // queries::CREATE_INDEX_USING_HNSW, - // "", - // index_name, - // &embeddings_table_name, - // "embedding vector_cosine_ops", - // index_with_parameters - // ) - // .as_str(), - // ) - // .await?; - // transaction.commit().await?; - // } - - // Ok(embeddings_table_name) - } - - #[instrument(skip(self))] - pub(crate) fn set_project_info(&mut self, project_info: ProjectInfo) { - unimplemented!() - // if self.model.is_some() { - // self.model - // .as_mut() - // .unwrap() - // .set_project_info(project_info.clone()); - // } - // if self.splitter.is_some() { - // self.splitter - // .as_mut() - // .unwrap() - // .set_project_info(project_info.clone()); - // } - // self.project_info = Some(project_info); - } - - /// Convert the [Pipeline] to [Json] - /// - /// # Example: - /// - /// ``` - /// use pgml::Collection; - /// - /// async fn example() -> anyhow::Result<()> { - /// let mut collection = Collection::new("my_collection", None); - /// let mut pipeline = collection.get_pipeline("my_pipeline").await?; - /// let pipeline_dict = pipeline.to_dict().await?; - /// Ok(()) - /// } - /// ``` - #[instrument(skip(self))] - pub async fn to_dict(&mut self) -> anyhow::Result { - unimplemented!() - // self.verify_in_database(false).await?; - - // let status = self.get_status().await?; - - // let model_dict = self - // .model - // .as_mut() - // .context("Pipeline must be verified to call to_dict")? - // .to_dict() - // .await?; - - // let splitter_dict = self - // .splitter - // .as_mut() - // .context("Pipeline must be verified to call to_dict")? - // .to_dict() - // .await?; - - // let database_data = self - // .database_data - // .as_ref() - // .context("Pipeline must be verified to call to_dict")?; - - // let parameters = self - // .parameters - // .as_ref() - // .context("Pipeline must be verified to call to_dict")?; - - // Ok(serde_json::json!({ - // "id": database_data.id, - // "name": self.name, - // "model": *model_dict, - // "splitter": *splitter_dict, - // "parameters": *parameters, - // "status": *Json::from(status), - // }) - // .into()) - } - - async fn get_pool(&self) -> anyhow::Result { - unimplemented!() - // let database_url = &self - // .project_info - // .as_ref() - // .context("Project info required to call method pipeline.get_pool()")? 
- // .database_url; - // get_or_initialize_pool(database_url).await - } - - pub(crate) async fn create_pipelines_table( - project_info: &ProjectInfo, - conn: &mut PgConnection, - ) -> anyhow::Result<()> { - unimplemented!() - // let pipelines_table_name = format!("{}.pipelines", project_info.name); - // sqlx::query(&query_builder!( - // queries::CREATE_PIPELINES_TABLE, - // pipelines_table_name - // )) - // .execute(&mut *conn) - // .await?; - // conn.execute( - // query_builder!( - // queries::CREATE_INDEX, - // "", - // "pipeline_name_index", - // pipelines_table_name, - // "name" - // ) - // .as_str(), - // ) - // .await?; - // Ok(()) - } -} - -impl From for Pipeline { - fn from(x: models::PipelineWithModelAndSplitter) -> Self { - unimplemented!() - // Self { - // model: Some(x.clone().into()), - // splitter: Some(x.clone().into()), - // name: x.pipeline_name, - // project_info: None, - // database_data: Some(PipelineDatabaseData { - // id: x.pipeline_id, - // created_at: x.pipeline_created_at, - // model_id: x.model_id, - // splitter_id: x.splitter_id, - // }), - // parameters: Some(x.pipeline_parameters), - // } - } } diff --git a/pgml-sdks/pgml/src/query_builder.rs b/pgml-sdks/pgml/src/query_builder.rs index 5ebc7ef8a..8bb1b8b81 100644 --- a/pgml-sdks/pgml/src/query_builder.rs +++ b/pgml-sdks/pgml/src/query_builder.rs @@ -12,7 +12,7 @@ use crate::{ filter_builder, get_or_initialize_pool, model::ModelRuntime, models, - pipeline::Pipeline, + multi_field_pipeline::MultiFieldPipeline, query_builder, remote_embeddings::build_remote_embeddings, types::{IntoTableNameAndSchema, Json, SIden, TryToNumeric}, @@ -20,7 +20,7 @@ use crate::{ }; #[cfg(feature = "python")] -use crate::{pipeline::PipelinePython, types::JsonPython}; +use crate::{multi_field_pipeline::MultiFieldPipelinePython, types::JsonPython}; #[derive(Clone, Debug)] struct QueryBuilderState {} @@ -31,7 +31,7 @@ pub struct QueryBuilder { with: WithClause, collection: Collection, query_string: Option, - pipeline: Option, + pipeline: Option, query_parameters: Option, } @@ -123,7 +123,7 @@ impl QueryBuilder { pub fn vector_recall( mut self, query: &str, - pipeline: &Pipeline, + pipeline: &MultiFieldPipeline, query_parameters: Option, ) -> Self { unimplemented!() @@ -148,8 +148,8 @@ impl QueryBuilder { // self.collection.pipelines_table_name.to_table_tuple(), // SIden::Str("pipeline"), // ) - // .columns([models::PipelineIden::ModelId]) - // .and_where(Expr::col(models::PipelineIden::Name).eq(&pipeline.name)); + // .columns([models::MultiFieldPipelineIden::ModelId]) + // .and_where(Expr::col(models::MultiFieldPipelineIden::Name).eq(&pipeline.name)); // let mut pipeline_cte = CommonTableExpression::from_select(pipeline_cte); // pipeline_cte.table_name(Alias::new("pipeline")); @@ -222,114 +222,115 @@ impl QueryBuilder { #[instrument(skip(self))] pub async fn fetch_all(mut self) -> anyhow::Result> { - let pool = get_or_initialize_pool(&self.collection.database_url).await?; + unimplemented!() + // let pool = get_or_initialize_pool(&self.collection.database_url).await?; - let mut query_parameters = self.query_parameters.unwrap_or_default(); + // let mut query_parameters = self.query_parameters.unwrap_or_default(); - let (sql, values) = self - .query - .clone() - .with(self.with.clone()) - .build_sqlx(PostgresQueryBuilder); + // let (sql, values) = self + // .query + // .clone() + // .with(self.with.clone()) + // .build_sqlx(PostgresQueryBuilder); - let result: Result, _> = - if !query_parameters["hnsw"]["ef_search"].is_null() { - let mut 
transaction = pool.begin().await?; - let ef_search = query_parameters["hnsw"]["ef_search"] - .try_to_i64() - .context("ef_search must be an integer")?; - sqlx::query(&query_builder!("SET LOCAL hnsw.ef_search = %d", ef_search)) - .execute(&mut *transaction) - .await?; - let results = sqlx::query_as_with(&sql, values) - .fetch_all(&mut *transaction) - .await; - transaction.commit().await?; - results - } else { - sqlx::query_as_with(&sql, values).fetch_all(&pool).await - }; + // let result: Result, _> = + // if !query_parameters["hnsw"]["ef_search"].is_null() { + // let mut transaction = pool.begin().await?; + // let ef_search = query_parameters["hnsw"]["ef_search"] + // .try_to_i64() + // .context("ef_search must be an integer")?; + // sqlx::query(&query_builder!("SET LOCAL hnsw.ef_search = %d", ef_search)) + // .execute(&mut *transaction) + // .await?; + // let results = sqlx::query_as_with(&sql, values) + // .fetch_all(&mut *transaction) + // .await; + // transaction.commit().await?; + // results + // } else { + // sqlx::query_as_with(&sql, values).fetch_all(&pool).await + // }; - match result { - Ok(r) => Ok(r), - Err(e) => match e.as_database_error() { - Some(d) => { - if d.code() == Some(Cow::from("XX000")) { - // Explicitly get and set the model - let project_info = self.collection.get_project_info().await?; - let pipeline = self - .pipeline - .as_mut() - .context("Need pipeline to call fetch_all on query builder with remote embeddings")?; - pipeline.set_project_info(project_info); - pipeline.verify_in_database(false).await?; - let model = pipeline - .model - .as_ref() - .context("Pipeline must be verified to perform vector search with remote embeddings")?; + // match result { + // Ok(r) => Ok(r), + // Err(e) => match e.as_database_error() { + // Some(d) => { + // if d.code() == Some(Cow::from("XX000")) { + // // Explicitly get and set the model + // let project_info = self.collection.get_project_info().await?; + // let pipeline = self + // .pipeline + // .as_mut() + // .context("Need pipeline to call fetch_all on query builder with remote embeddings")?; + // pipeline.set_project_info(project_info); + // pipeline.verify_in_database(false).await?; + // let model = pipeline + // .model + // .as_ref() + // .context("MultiFieldPipeline must be verified to perform vector search with remote embeddings")?; - // If the model runtime is python, the error was not caused by an unsupported runtime - if model.runtime == ModelRuntime::Python { - return Err(anyhow::anyhow!(e)); - } + // // If the model runtime is python, the error was not caused by an unsupported runtime + // if model.runtime == ModelRuntime::Python { + // return Err(anyhow::anyhow!(e)); + // } - let hnsw_parameters = query_parameters - .as_object_mut() - .context("Query parameters must be a Json object")? - .remove("hnsw"); + // let hnsw_parameters = query_parameters + // .as_object_mut() + // .context("Query parameters must be a Json object")? 
+ // .remove("hnsw"); - let remote_embeddings = - build_remote_embeddings(model.runtime, &model.name, Some(&query_parameters))?; - let mut embeddings = remote_embeddings - .embed(vec![self - .query_string - .to_owned() - .context("Must have query_string to call fetch_all on query_builder with remote embeddings")?]) - .await?; - let embedding = std::mem::take(&mut embeddings[0]); + // let remote_embeddings = + // build_remote_embeddings(model.runtime, &model.name, Some(&query_parameters))?; + // let mut embeddings = remote_embeddings + // .embed(vec![self + // .query_string + // .to_owned() + // .context("Must have query_string to call fetch_all on query_builder with remote embeddings")?]) + // .await?; + // let embedding = std::mem::take(&mut embeddings[0]); - let mut embedding_cte = Query::select(); - embedding_cte - .expr(Expr::cust_with_values("$1::vector embedding", [embedding])); + // let mut embedding_cte = Query::select(); + // embedding_cte + // .expr(Expr::cust_with_values("$1::vector embedding", [embedding])); - let mut embedding_cte = CommonTableExpression::from_select(embedding_cte); - embedding_cte.table_name(Alias::new("embedding")); - let mut with_clause = WithClause::new(); - with_clause.cte(embedding_cte); + // let mut embedding_cte = CommonTableExpression::from_select(embedding_cte); + // embedding_cte.table_name(Alias::new("embedding")); + // let mut with_clause = WithClause::new(); + // with_clause.cte(embedding_cte); - let (sql, values) = self - .query - .clone() - .with(with_clause) - .build_sqlx(PostgresQueryBuilder); + // let (sql, values) = self + // .query + // .clone() + // .with(with_clause) + // .build_sqlx(PostgresQueryBuilder); - if let Some(parameters) = hnsw_parameters { - let mut transaction = pool.begin().await?; - let ef_search = parameters["ef_search"] - .try_to_i64() - .context("ef_search must be an integer")?; - sqlx::query(&query_builder!( - "SET LOCAL hnsw.ef_search = %d", - ef_search - )) - .execute(&mut *transaction) - .await?; - let results = sqlx::query_as_with(&sql, values) - .fetch_all(&mut *transaction) - .await; - transaction.commit().await?; - results - } else { - sqlx::query_as_with(&sql, values).fetch_all(&pool).await - } - .map_err(|e| anyhow::anyhow!(e)) - } else { - Err(anyhow::anyhow!(e)) - } - } - None => Err(anyhow::anyhow!(e)), - }, - }.map(|r| r.into_iter().map(|(score, id, metadata)| (1. - score, id, metadata)).collect()) + // if let Some(parameters) = hnsw_parameters { + // let mut transaction = pool.begin().await?; + // let ef_search = parameters["ef_search"] + // .try_to_i64() + // .context("ef_search must be an integer")?; + // sqlx::query(&query_builder!( + // "SET LOCAL hnsw.ef_search = %d", + // ef_search + // )) + // .execute(&mut *transaction) + // .await?; + // let results = sqlx::query_as_with(&sql, values) + // .fetch_all(&mut *transaction) + // .await; + // transaction.commit().await?; + // results + // } else { + // sqlx::query_as_with(&sql, values).fetch_all(&pool).await + // } + // .map_err(|e| anyhow::anyhow!(e)) + // } else { + // Err(anyhow::anyhow!(e)) + // } + // } + // None => Err(anyhow::anyhow!(e)), + // }, + // }.map(|r| r.into_iter().map(|(score, id, metadata)| (1. 
- score, id, metadata)).collect()) } // This is mostly so our SDKs in other languages have some way to debug From f75a2ec8eec34c3893d732c3743d79e24b8a8710 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Mon, 22 Jan 2024 13:28:20 -0800 Subject: [PATCH 10/72] Finished pipeline as a pass through and more tests --- pgml-sdks/pgml/src/lib.rs | 313 +++++++++++++++------ pgml-sdks/pgml/src/multi_field_pipeline.rs | 135 ++++++++- pgml-sdks/pgml/src/pipeline.rs | 206 ++------------ 3 files changed, 374 insertions(+), 280 deletions(-) diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 0765b020f..bc4266b17 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -646,6 +646,158 @@ mod tests { Ok(()) } + #[sqlx::test] + async fn pipeline_sync_status() -> anyhow::Result<()> { + internal_init_logger(None, None).ok(); + let collection_name = "test_r_c_pss_5"; + let mut collection = Collection::new(collection_name, None); + let pipeline_name = "test_r_p_pss_0"; + let mut pipeline = MultiFieldPipeline::new( + pipeline_name, + Some( + json!({ + "title": { + "embed": { + "model": "intfloat/e5-small" + }, + "full_text_search": { + "configuration": "english" + }, + "splitter": { + "model": "recursive_character" + } + } + }) + .into(), + ), + )?; + collection.add_pipeline(&mut pipeline).await?; + let documents = generate_dummy_documents(4); + collection + .upsert_documents(documents[..2].to_owned(), None) + .await?; + let status = pipeline.get_status().await?; + assert_eq!( + status.0, + json!({ + "title": { + "chunks": { + "not_synced": 0, + "synced": 2, + "total": 2 + }, + "embeddings": { + "not_synced": 0, + "synced": 2, + "total": 2 + }, + "tsvectors": { + "not_synced": 0, + "synced": 2, + "total": 2 + }, + } + }) + ); + collection.disable_pipeline(&mut pipeline).await?; + collection + .upsert_documents(documents[2..4].to_owned(), None) + .await?; + let status = pipeline.get_status().await?; + assert_eq!( + status.0, + json!({ + "title": { + "chunks": { + "not_synced": 2, + "synced": 2, + "total": 4 + }, + "embeddings": { + "not_synced": 0, + "synced": 2, + "total": 2 + }, + "tsvectors": { + "not_synced": 0, + "synced": 2, + "total": 2 + }, + } + }) + ); + collection.enable_pipeline(&mut pipeline).await?; + let status = pipeline.get_status().await?; + assert_eq!( + status.0, + json!({ + "title": { + "chunks": { + "not_synced": 0, + "synced": 4, + "total": 4 + }, + "embeddings": { + "not_synced": 0, + "synced": 4, + "total": 4 + }, + "tsvectors": { + "not_synced": 0, + "synced": 4, + "total": 4 + }, + } + }) + ); + collection.archive().await?; + Ok(()) + } + + #[sqlx::test] + async fn can_specify_custom_hnsw_parameters_for_pipelines() -> anyhow::Result<()> { + internal_init_logger(None, None).ok(); + let collection_name = "test_r_c_cschpfp_4"; + let mut collection = Collection::new(collection_name, None); + let pipeline_name = "test_r_p_cschpfp_0"; + let mut pipeline = MultiFieldPipeline::new( + pipeline_name, + Some( + json!({ + "title": { + "embed": { + "model": "intfloat/e5-small", + "hnsw": { + "m": 100, + "ef_construction": 200 + } + } + } + }) + .into(), + ), + )?; + collection.add_pipeline(&mut pipeline).await?; + let schema = format!("{collection_name}_{pipeline_name}"); + let full_embeddings_table_name = format!("{schema}.title_embeddings"); + let embeddings_table_name = full_embeddings_table_name.split('.').collect::>()[1]; + let pool = get_or_initialize_pool(&None).await?; + let results: Vec<(String, String)> 
= sqlx::query_as(&query_builder!( + "select indexname, indexdef from pg_indexes where tablename = '%d' and schemaname = '%d'", + embeddings_table_name, + schema + )).fetch_all(&pool).await?; + let names = results.iter().map(|(name, _)| name).collect::>(); + let definitions = results + .iter() + .map(|(_, definition)| definition) + .collect::>(); + assert!(names.contains(&&"title_pipeline_embedding_hnsw_vector_index".to_string())); + assert!(definitions.contains(&&format!("CREATE INDEX title_pipeline_embedding_hnsw_vector_index ON {full_embeddings_table_name} USING hnsw (embedding vector_cosine_ops) WITH (m='100', ef_construction='200')"))); + collection.archive().await?; + Ok(()) + } + /////////////////////////////// // Searches /////////////////// /////////////////////////////// @@ -959,99 +1111,6 @@ mod tests { Ok(()) } - // #[sqlx::test] - // async fn can_specify_custom_hnsw_parameters_for_pipelines() -> anyhow::Result<()> { - // internal_init_logger(None, None).ok(); - // let model = Model::default(); - // let splitter = Splitter::default(); - // let mut pipeline = Pipeline::new( - // "test_r_p_cschpfp_0", - // Some(model), - // Some(splitter), - // Some( - // serde_json::json!({ - // "hnsw": { - // "m": 100, - // "ef_construction": 200 - // } - // }) - // .into(), - // ), - // ); - // let collection_name = "test_r_c_cschpfp_1"; - // let mut collection = Collection::new(collection_name, None); - // collection.add_pipeline(&mut pipeline).await?; - // let full_embeddings_table_name = pipeline.create_or_get_embeddings_table().await?; - // let embeddings_table_name = full_embeddings_table_name.split('.').collect::>()[1]; - // let pool = get_or_initialize_pool(&None).await?; - // let results: Vec<(String, String)> = sqlx::query_as(&query_builder!( - // "select indexname, indexdef from pg_indexes where tablename = '%d' and schemaname = '%d'", - // embeddings_table_name, - // collection_name - // )).fetch_all(&pool).await?; - // let names = results.iter().map(|(name, _)| name).collect::>(); - // let definitions = results - // .iter() - // .map(|(_, definition)| definition) - // .collect::>(); - // assert!(names.contains(&&format!("{}_pipeline_hnsw_vector_index", pipeline.name))); - // assert!(definitions.contains(&&format!("CREATE INDEX {}_pipeline_hnsw_vector_index ON {} USING hnsw (embedding vector_cosine_ops) WITH (m='100', ef_construction='200')", pipeline.name, full_embeddings_table_name))); - // Ok(()) - // } - - // #[sqlx::test] - // async fn sync_multiple_pipelines() -> anyhow::Result<()> { - // internal_init_logger(None, None).ok(); - // let model = Model::default(); - // let splitter = Splitter::default(); - // let mut pipeline1 = Pipeline::new( - // "test_r_p_smp_0", - // Some(model.clone()), - // Some(splitter.clone()), - // Some( - // serde_json::json!({ - // "full_text_search": { - // "active": true, - // "configuration": "english" - // } - // }) - // .into(), - // ), - // ); - // let mut pipeline2 = Pipeline::new( - // "test_r_p_smp_1", - // Some(model), - // Some(splitter), - // Some( - // serde_json::json!({ - // "full_text_search": { - // "active": true, - // "configuration": "english" - // } - // }) - // .into(), - // ), - // ); - // let mut collection = Collection::new("test_r_c_smp_3", None); - // collection.add_pipeline(&mut pipeline1).await?; - // collection.add_pipeline(&mut pipeline2).await?; - // collection - // .upsert_documents(generate_dummy_documents(3), None) - // .await?; - // let status_1 = pipeline1.get_status().await?; - // let status_2 = 
pipeline2.get_status().await?; - // assert!( - // status_1.chunks_status.synced == status_1.chunks_status.total - // && status_1.chunks_status.not_synced == 0 - // ); - // assert!( - // status_2.chunks_status.synced == status_2.chunks_status.total - // && status_2.chunks_status.not_synced == 0 - // ); - // collection.archive().await?; - // Ok(()) - // } - /////////////////////////////// // Working With Documents ///// /////////////////////////////// @@ -1532,6 +1591,74 @@ mod tests { Ok(()) } + /////////////////////////////// + // Pipeline -> MultiFieldPIpeline + /////////////////////////////// + + #[test] + fn pipeline_to_multi_field_pipeline() -> anyhow::Result<()> { + let model = Model::new( + Some("test_model".to_string()), + Some("pgml".to_string()), + Some( + json!({ + "test_parameter": 10 + }) + .into(), + ), + ); + let splitter = Splitter::new( + Some("test_splitter".to_string()), + Some( + json!({ + "test_parameter": 11 + }) + .into(), + ), + ); + let parameters = json!({ + "full_text_search": { + "active": true, + "configuration": "test_configuration" + }, + "hnsw": { + "m": 16, + "ef_construction": 64 + } + }); + let multi_field_pipeline = Pipeline::new( + "test_name", + Some(model), + Some(splitter), + Some(parameters.into()), + ); + let schema = json!({ + "text": { + "splitter": { + "model": "test_splitter", + "parameters": { + "test_parameter": 11 + } + }, + "embed": { + "model": "test_model", + "parameters": { + "test_parameter": 10 + }, + "hnsw": { + "m": 16, + "ef_construction": 64 + } + }, + "full_text_search": { + "configuration": "test_configuration" + } + } + }); + assert_eq!(schema, multi_field_pipeline.schema.unwrap().0); + Ok(()) + } + /////////////////////////////// // ER Diagram ///////////////// /////////////////////////////// diff --git a/pgml-sdks/pgml/src/multi_field_pipeline.rs b/pgml-sdks/pgml/src/multi_field_pipeline.rs index d3138b4f6..bba53fd48 100644 --- a/pgml-sdks/pgml/src/multi_field_pipeline.rs +++ b/pgml-sdks/pgml/src/multi_field_pipeline.rs @@ -2,6 +2,7 @@ use anyhow::Context; use indicatif::MultiProgress; use rust_bridge::{alias, alias_manual, alias_methods}; use serde::Deserialize; +use serde_json::json; use sqlx::{Executor, PgConnection, PgPool, Postgres, Transaction}; use std::sync::atomic::Ordering::Relaxed; use std::sync::Arc; @@ -71,15 +72,15 @@ impl Default for HNSW { impl TryFrom for HNSW { type Error = anyhow::Error; fn try_from(value: Json) -> anyhow::Result { - let m = if !value["hnsw"]["m"].is_null() { - value["hnsw"]["m"] + let m = if !value["m"].is_null() { + value["m"] .try_to_u64() .context("hnsw.m must be an integer")? } else { 16 }; - let ef_construction = if !value["hnsw"]["ef_construction"].is_null() { - value["hnsw"]["ef_construction"] + let ef_construction = if !value["ef_construction"].is_null() { + value["ef_construction"] .try_to_u64() .context("hnsw.ef_construction must be an integer")? 
} else { @@ -136,6 +137,40 @@ impl TryFrom for FieldAction { } } +#[derive(Debug, Clone)] +pub struct InvividualSyncStatus { + pub synced: i64, + pub not_synced: i64, + pub total: i64, +} + +impl From for Json { + fn from(value: InvividualSyncStatus) -> Self { + serde_json::json!({ + "synced": value.synced, + "not_synced": value.not_synced, + "total": value.total, + }) + .into() + } +} + +impl From for InvividualSyncStatus { + fn from(value: Json) -> Self { + Self { + synced: value["synced"] + .as_i64() + .expect("The synced field is not an integer"), + not_synced: value["not_synced"] + .as_i64() + .expect("The not_synced field is not an integer"), + total: value["total"] + .as_i64() + .expect("The total field is not an integer"), + } + } +} + #[derive(Debug, Clone)] pub struct MultiFieldPipelineDatabaseData { pub id: i64, @@ -181,6 +216,94 @@ impl MultiFieldPipeline { }) } + /// Gets the status of the [Pipeline] + /// This includes the status of the chunks, embeddings, and tsvectors + /// + /// # Example + /// + /// ``` + /// use pgml::Collection; + /// + /// async fn example() -> anyhow::Result<()> { + /// let mut collection = Collection::new("my_collection", None); + /// let mut pipeline = collection.get_pipeline("my_pipeline").await?; + /// let status = pipeline.get_status().await?; + /// Ok(()) + /// } + /// ``` + #[instrument(skip(self))] + pub async fn get_status(&mut self) -> anyhow::Result { + self.verify_in_database(false).await?; + let parsed_schema = self + .parsed_schema + .as_ref() + .context("Pipeline must have schema to get status")?; + let project_info = self + .project_info + .as_ref() + .context("Pipeline must have project info to get status")?; + let pool = self.get_pool().await?; + + let mut results = json!({}); + + let schema = format!("{}_{}", project_info.name, self.name); + let documents_table_name = format!("{}.documents", project_info.name); + for (key, value) in parsed_schema.iter() { + let chunks_table_name = format!("{schema}.{key}_chunks"); + + results[key] = json!({}); + + if let Some(_) = value.splitter { + let chunks_status: (Option, Option) = sqlx::query_as(&query_builder!( + "SELECT (SELECT COUNT(DISTINCT document_id) FROM %s), COUNT(id) FROM %s", + chunks_table_name, + documents_table_name + )) + .fetch_one(&pool) + .await?; + results[key]["chunks"] = json!({ + "synced": chunks_status.0.unwrap_or(0), + "not_synced": chunks_status.1.unwrap_or(0) - chunks_status.0.unwrap_or(0), + "total": chunks_status.1.unwrap_or(0), + }); + } + + if let Some(_) = value.embed { + let embeddings_table_name = format!("{schema}.{key}_embeddings"); + let embeddings_status: (Option, Option) = + sqlx::query_as(&query_builder!( + "SELECT (SELECT count(*) FROM %s), (SELECT count(*) FROM %s)", + embeddings_table_name, + chunks_table_name + )) + .fetch_one(&pool) + .await?; + results[key]["embeddings"] = json!({ + "synced": embeddings_status.0.unwrap_or(0), + "not_synced": embeddings_status.1.unwrap_or(0) - embeddings_status.0.unwrap_or(0), + "total": embeddings_status.1.unwrap_or(0), + }); + } + + if let Some(_) = value.full_text_search { + let tsvectors_table_name = format!("{schema}.{key}_tsvectors"); + let tsvectors_status: (Option, Option) = sqlx::query_as(&query_builder!( + "SELECT (SELECT count(*) FROM %s), (SELECT count(*) FROM %s)", + tsvectors_table_name, + chunks_table_name + )) + .fetch_one(&pool) + .await?; + results[key]["tsvectors"] = json!({ + "synced": tsvectors_status.0.unwrap_or(0), + "not_synced": tsvectors_status.1.unwrap_or(0) - 
tsvectors_status.0.unwrap_or(0), + "total": tsvectors_status.1.unwrap_or(0), + }); + } + } + Ok(results.into()) + } + #[instrument(skip(self))] pub(crate) async fn verify_in_database(&mut self, throw_if_exists: bool) -> anyhow::Result<()> { if self.database_data.is_none() { @@ -189,7 +312,7 @@ impl MultiFieldPipeline { let project_info = self .project_info .as_ref() - .context("Cannot verify pipeline wihtout project info")?; + .context("Cannot verify pipeline without project info")?; let pipeline: Option = sqlx::query_as(&query_builder!( "SELECT * FROM %s WHERE name = $1", @@ -643,7 +766,7 @@ impl MultiFieldPipeline { } #[instrument(skip(self))] - pub async fn resync(&mut self) -> anyhow::Result<()> { + pub(crate) async fn resync(&mut self) -> anyhow::Result<()> { self.verify_in_database(false).await?; // We are assuming we have manually verified the pipeline before doing this diff --git a/pgml-sdks/pgml/src/pipeline.rs b/pgml-sdks/pgml/src/pipeline.rs index ea76a51c2..854e55714 100644 --- a/pgml-sdks/pgml/src/pipeline.rs +++ b/pgml-sdks/pgml/src/pipeline.rs @@ -1,6 +1,7 @@ use anyhow::Context; use indicatif::MultiProgress; use rust_bridge::{alias, alias_manual, alias_methods}; +use serde_json::json; use sqlx::{Executor, PgConnection, PgPool}; use std::sync::atomic::AtomicBool; use std::sync::atomic::Ordering::Relaxed; @@ -22,85 +23,14 @@ use crate::{ #[cfg(feature = "python")] use crate::{model::ModelPython, splitter::SplitterPython, types::JsonPython}; -#[derive(Debug, Clone)] -pub struct InvividualSyncStatus { - pub synced: i64, - pub not_synced: i64, - pub total: i64, -} - -impl From for Json { - fn from(value: InvividualSyncStatus) -> Self { - serde_json::json!({ - "synced": value.synced, - "not_synced": value.not_synced, - "total": value.total, - }) - .into() - } -} - -impl From for InvividualSyncStatus { - fn from(value: Json) -> Self { - Self { - synced: value["synced"] - .as_i64() - .expect("The synced field is not an integer"), - not_synced: value["not_synced"] - .as_i64() - .expect("The not_synced field is not an integer"), - total: value["total"] - .as_i64() - .expect("The total field is not an integer"), - } - } -} - -#[derive(alias_manual, Debug, Clone)] -pub struct PipelineSyncData { - pub chunks_status: InvividualSyncStatus, - pub embeddings_status: InvividualSyncStatus, - pub tsvectors_status: InvividualSyncStatus, -} - -impl From for Json { - fn from(value: PipelineSyncData) -> Self { - serde_json::json!({ - "chunks_status": *Json::from(value.chunks_status), - "embeddings_status": *Json::from(value.embeddings_status), - "tsvectors_status": *Json::from(value.tsvectors_status), - }) - .into() - } -} - -impl From for PipelineSyncData { - fn from(mut value: Json) -> Self { - Self { - chunks_status: Json::from(std::mem::take(&mut value["chunks_status"])).into(), - embeddings_status: Json::from(std::mem::take(&mut value["embeddings_status"])).into(), - tsvectors_status: Json::from(std::mem::take(&mut value["tsvectors_status"])).into(), - } - } -} - -#[derive(Debug, Clone)] -pub struct PipelineDatabaseData { - pub id: i64, - pub created_at: DateTime, - pub model_id: i64, - pub splitter_id: i64, -} - /// A pipeline that processes documents +/// This has been deprecated in favor of [MultiFieldPipeline] #[derive(alias, Debug, Clone)] pub struct Pipeline { pub name: String, pub model: Option, pub splitter: Option, pub parameters: Option, - project_info: Option, - pub(crate) database_data: Option, } #[alias_methods(new, get_status, to_dict)] @@ -128,122 +58,36 @@ impl Pipeline { 
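// The legacy constructor is now a pass-through: it maps the old (model, splitter, parameters) arguments onto an equivalent MultiFieldPipeline schema and delegates to MultiFieldPipeline::new.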
splitter: Option, parameters: Option, ) -> MultiFieldPipeline { - // let schema = serde_json::json!({ - // "text": { - // "embed": { - // "model": model.na - // }); + let parameters = parameters.unwrap_or_default(); let schema = if let Some(model) = model { - Some(serde_json::json!({ + let mut schema = json!({ "text": { "embed": { - "model": model.name + "model": model.name, + "parameters": model.parameters, + "hnsw": parameters["hnsw"] } } - })) + }); + if let Some(splitter) = splitter { + schema["text"]["splitter"] = json!({ + "model": splitter.name, + "parameters": splitter.parameters + }); + } + if parameters["full_text_search"]["active"] + .as_bool() + .unwrap_or_default() + { + schema["text"]["full_text_search"] = json!({ + "configuration": parameters["full_text_search"]["configuration"].as_str().map(|v| v.to_string()).unwrap_or_else(|| "english".to_string()) + }); + } + Some(schema.into()) } else { None }; - MultiFieldPipeline::new(name, schema.map(|v| v.into())) - .expect("Error conerting pipeline into new multifield pipeline") - - // let parameters = Some(parameters.unwrap_or_default()); - // Self { - // name: name.to_string(), - // model, - // splitter, - // parameters, - // project_info: None, - // database_data: None, - // } - } - - /// Gets the status of the [Pipeline] - /// This includes the status of the chunks, embeddings, and tsvectors - /// - /// # Example - /// - /// ``` - /// use pgml::Collection; - /// - /// async fn example() -> anyhow::Result<()> { - /// let mut collection = Collection::new("my_collection", None); - /// let mut pipeline = collection.get_pipeline("my_pipeline").await?; - /// let status = pipeline.get_status().await?; - /// Ok(()) - /// } - /// ``` - #[instrument(skip(self))] - pub async fn get_status(&mut self) -> anyhow::Result { - unimplemented!() - // let pool = self.get_pool().await?; - - // self.verify_in_database(false).await?; - // let embeddings_table_name = self.create_or_get_embeddings_table().await?; - - // let database_data = self - // .database_data - // .as_ref() - // .context("Pipeline must be verified to get status")?; - - // let parameters = self - // .parameters - // .as_ref() - // .context("Pipeline must be verified to get status")?; - - // let project_name = &self.project_info.as_ref().unwrap().name; - - // // TODO: Maybe combine all of these into one query so it is faster - // let chunks_status: (Option, Option) = sqlx::query_as(&query_builder!( - // "SELECT (SELECT COUNT(DISTINCT document_id) FROM %s WHERE splitter_id = $1), COUNT(id) FROM %s", - // format!("{}.chunks", project_name), - // format!("{}.documents", project_name) - // )) - // .bind(database_data.splitter_id) - // .fetch_one(&pool).await?; - // let chunks_status = InvividualSyncStatus { - // synced: chunks_status.0.unwrap_or(0), - // not_synced: chunks_status.1.unwrap_or(0) - chunks_status.0.unwrap_or(0), - // total: chunks_status.1.unwrap_or(0), - // }; - - // let embeddings_status: (Option, Option) = sqlx::query_as(&query_builder!( - // "SELECT (SELECT count(*) FROM %s), (SELECT count(*) FROM %s WHERE splitter_id = $1)", - // embeddings_table_name, - // format!("{}.chunks", project_name) - // )) - // .bind(database_data.splitter_id) - // .fetch_one(&pool) - // .await?; - // let embeddings_status = InvividualSyncStatus { - // synced: embeddings_status.0.unwrap_or(0), - // not_synced: embeddings_status.1.unwrap_or(0) - embeddings_status.0.unwrap_or(0), - // total: embeddings_status.1.unwrap_or(0), - // }; - - // let tsvectors_status = if 
parameters["full_text_search"]["active"] - // == serde_json::Value::Bool(true) - // { - // sqlx::query_as(&query_builder!( - // "SELECT (SELECT COUNT(*) FROM %s WHERE configuration = $1), (SELECT COUNT(*) FROM %s)", - // format!("{}.documents_tsvectors", project_name), - // format!("{}.documents", project_name) - // )) - // .bind(parameters["full_text_search"]["configuration"].as_str()) - // .fetch_one(&pool).await? - // } else { - // (Some(0), Some(0)) - // }; - // let tsvectors_status = InvividualSyncStatus { - // synced: tsvectors_status.0.unwrap_or(0), - // not_synced: tsvectors_status.1.unwrap_or(0) - tsvectors_status.0.unwrap_or(0), - // total: tsvectors_status.1.unwrap_or(0), - // }; - - // Ok(PipelineSyncData { - // chunks_status, - // embeddings_status, - // tsvectors_status, - // }) + MultiFieldPipeline::new(name, schema) + .expect("Error converting pipeline into new multifield pipeline") } } From 59f44192f6fde39d223f96a66f3c8a3f5c61a0f4 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Mon, 22 Jan 2024 15:56:35 -0800 Subject: [PATCH 11/72] Working site search with doc type filtering --- pgml-dashboard/src/api/cms.rs | 16 +- pgml-dashboard/src/main.rs | 12 +- pgml-dashboard/src/utils/markdown.rs | 615 +++++---------------------- 3 files changed, 116 insertions(+), 527 deletions(-) diff --git a/pgml-dashboard/src/api/cms.rs b/pgml-dashboard/src/api/cms.rs index d2a7c767f..ee1060d02 100644 --- a/pgml-dashboard/src/api/cms.rs +++ b/pgml-dashboard/src/api/cms.rs @@ -559,16 +559,8 @@ impl Collection { } #[get("/search?", rank = 20)] -async fn search( - query: &str, - site_search: &State, -) -> ResponseOk { - eprintln!("\n\nWE IN HERE\n\n"); - let results = site_search - .search(query) - .await - .expect("Error performing search"); - +async fn search(query: &str, site_search: &State) -> ResponseOk { + let results = site_search.search(query, None).await.expect("Error performing search"); ResponseOk( Template(Search { query: query.to_string(), @@ -718,9 +710,9 @@ pub fn routes() -> Vec { #[cfg(test)] mod test { use super::*; - use crate::utils::markdown::{options, MarkdownHeadings, SyntaxHighlighter}; + use crate::utils::markdown::options; use regex::Regex; - use rocket::http::{ContentType, Cookie, Status}; + use rocket::http::Status; use rocket::local::asynchronous::Client; use rocket::{Build, Rocket}; diff --git a/pgml-dashboard/src/main.rs b/pgml-dashboard/src/main.rs index 275e9c5df..13830dd0f 100644 --- a/pgml-dashboard/src/main.rs +++ b/pgml-dashboard/src/main.rs @@ -92,11 +92,10 @@ async fn main() { // it's important to hang on to sentry so it isn't dropped and stops reporting let _sentry = configure_reporting().await; - // markdown::SearchIndex::build().await.unwrap(); - - let site_search = markdown::SiteSearch::new() + let mut site_search = markdown::SiteSearch::new() .await .expect("Error initializing site search"); + site_search.build().await.expect("Error building site search"); pgml_dashboard::migrate(guards::Cluster::default(None).pool()) .await @@ -135,8 +134,13 @@ mod test { pgml_dashboard::migrate(Cluster::default(None).pool()).await.unwrap(); + let mut site_search = markdown::SiteSearch::new() + .await + .expect("Error initializing site search"); + site_search.build().await.expect("Error building site search"); + rocket::build() - .manage(markdown::SearchIndex::open().unwrap()) + .manage(site_search) .mount("/", rocket::routes![index, error]) .mount("/dashboard/static", FileServer::from(config::static_dir())) 
.mount("/dashboard", pgml_dashboard::routes()) diff --git a/pgml-dashboard/src/utils/markdown.rs b/pgml-dashboard/src/utils/markdown.rs index ee19c606c..285246add 100644 --- a/pgml-dashboard/src/utils/markdown.rs +++ b/pgml-dashboard/src/utils/markdown.rs @@ -1,8 +1,9 @@ +use crate::api::cms::{DocType, Document}; use crate::{templates::docs::TocLink, utils::config}; use std::cell::RefCell; -use std::collections::{HashMap, HashSet}; -use std::path::{Path, PathBuf}; +use std::collections::HashMap; +use std::path::PathBuf; use std::sync::Arc; use anyhow::Result; @@ -10,22 +11,15 @@ use comrak::{ adapters::{HeadingAdapter, HeadingMeta, SyntaxHighlighterAdapter}, arena_tree::Node, nodes::{Ast, AstNode, NodeValue}, - parse_document, Arena, ComrakExtensionOptions, ComrakOptions, ComrakRenderOptions, + Arena, ComrakExtensionOptions, ComrakOptions, ComrakRenderOptions, }; use convert_case; use itertools::Itertools; use regex::Regex; -use serde::{Deserialize, Serialize}; -use tantivy::collector::TopDocs; -use tantivy::query::{QueryParser, RegexQuery}; -use tantivy::schema::*; -use tantivy::tokenizer::{LowerCaser, NgramTokenizer, TextAnalyzer}; -use tantivy::{Index, IndexReader, SnippetGenerator}; -use url::Url; - -use std::sync::Mutex; - +use serde::Deserialize; use std::fmt; +use std::sync::Mutex; +use url::Url; pub struct MarkdownHeadings { header_map: Arc>>, @@ -1224,25 +1218,16 @@ pub async fn get_document(path: &PathBuf) -> anyhow::Result { } #[derive(Deserialize)] -pub struct SearchResult { - pub title: String, - pub body: String, - pub path: String, - pub snippet: String, -} - -#[derive(Serialize)] -struct Document { - id: String, +struct SearchResultWithoutSnippet { title: String, - body: String, + contents: String, path: String, } -impl Document { - fn new(id: String, title: String, body: String, path: String) -> Self { - Self { id, title, body, path } - } +pub struct SearchResult { + pub title: String, + pub path: String, + pub snippet: String, } pub struct SiteSearch { @@ -1253,15 +1238,41 @@ pub struct SiteSearch { impl SiteSearch { pub async fn new() -> anyhow::Result { let collection = pgml::Collection::new( - "hypercloud-site-search-c-1", + "hypercloud-site-search-c-4", Some(std::env::var("SITE_SEARCH_DATABASE_URL")?), ); - let pipeline = pgml::MultiFieldPipeline::new("hypercloud-site-search-p-1", serde_json::json!({}).into()); + let pipeline = pgml::MultiFieldPipeline::new( + "hypercloud-site-search-p-1", + Some( + serde_json::json!({ + "title": { + "full_text_search": { + "configuration": "english" + }, + "embed": { + "model": "intfloat/e5-small" + } + }, + "contents": { + "splitter": { + "model": "recursive_character" + }, + "full_text_search": { + "configuration": "english" + }, + "embed": { + "model": "intfloat/e5-small" + } + } + }) + .into(), + ), + )?; Ok(Self { collection, pipeline }) } pub fn documents() -> Vec { - // TODO imrpove this .display().to_string() + // TODO improve this .display().to_string() let guides = glob::glob(&config::cms_dir().join("docs/**/*.md").display().to_string()).expect("glob failed"); let blogs = glob::glob(&config::cms_dir().join("blog/**/*.md").display().to_string()).expect("glob failed"); guides @@ -1270,256 +1281,84 @@ impl SiteSearch { .collect() } - pub async fn search(&self, query: &str) -> anyhow::Result> { - self.collection - .search( - serde_json::json!({ - "query": { - "semantic_search": { - "title": { - "query": query, - "boost": 2.0, - }, - "body": { - "query": query, - } - } + pub async fn search(&self, query: &str, doc_type: Option) 
-> anyhow::Result> { + let mut search = serde_json::json!({ + "query": { + "full_text_search": { + "title": { + "query": query, + "boost": 2. + }, + "contents": { + "query": query } - }) - .into(), - &self.pipeline, - ) + }, + "semantic_search": { + "title": { + "query": query, + "boost": 2.0, + }, + "contents": { + "query": query, + } + } + }, + "limit": 10 + }); + if let Some(doc_type) = doc_type { + search["query"]["filter"] = serde_json::json!({ + "doc_type": { + "$eq": doc_type + } + }); + } + self.collection + .search_local(search.into(), &self.pipeline) .await? .into_iter() - .map(|r| serde_json::from_value(r.0).map_err(anyhow::Error::msg)) + .map(|r| { + let SearchResultWithoutSnippet { title, contents, path } = + serde_json::from_value(r["document"].clone())?; + let path = path + .replace(".md", "") + .replace(&config::static_dir().display().to_string(), ""); + Ok(SearchResult { + title, + path, + snippet: contents.split(' ').take(20).collect::>().join(" ") + " ...", + }) + }) + .collect() } pub async fn build(&mut self) -> anyhow::Result<()> { - let documents: Vec = - futures::future::try_join_all(Self::get_document_paths()?.into_iter().map(|path| async move { - let text = get_document(&path).await?; - - let arena = Arena::new(); - let root = parse_document(&arena, &text, &options()); - let title_text = get_title(root)?; - let body_text = get_text(root)?.into_iter().join(" "); - - let path = path - .to_str() + self.collection.add_pipeline(&mut self.pipeline).await.ok(); + let documents: Vec = futures::future::try_join_all( + Self::get_document_paths()? + .into_iter() + .map(|path| async move { Document::from_path(&path).await }), + ) + .await?; + let documents: Vec = documents + .into_iter() + .map(|d| { + let mut document_json = serde_json::to_value(d).unwrap(); + document_json["id"] = document_json["path"].clone(); + document_json["path"] = serde_json::json!(document_json["path"] + .as_str() .unwrap() - .to_string() .split("content") .last() .unwrap() .to_string() .replace("README", "") - .replace(&config::cms_dir().display().to_string(), ""); - - anyhow::Ok(Document::new(path.clone(), title_text, body_text, path)) - })) - .await?; - let documents: Vec = documents - .into_iter() - .map(|d| serde_json::to_value(d).unwrap().into()) + .replace(&config::cms_dir().display().to_string(), "")); + document_json.into() + }) .collect(); self.collection.upsert_documents(documents, None).await } - pub async fn build() -> tantivy::Result<()> { - // Remove existing index. 
- let _ = std::fs::remove_dir_all(Self::path()); - std::fs::create_dir(Self::path()).unwrap(); - - let index = tokio::task::spawn_blocking(move || -> tantivy::Result { - Index::create_in_dir(Self::path(), Self::schema()) - }) - .await - .unwrap()?; - - let ngram = TextAnalyzer::from(NgramTokenizer::new(3, 3, false)).filter(LowerCaser); - - index.tokenizers().register("ngram3", ngram); - - let schema = Self::schema(); - let mut index_writer = index.writer(50_000_000)?; - - for path in Self::documents().into_iter() { - let text = get_document(&path).await.unwrap(); - - let arena = Arena::new(); - let root = parse_document(&arena, &text, &options()); - let title_text = get_title(root).unwrap(); - let body_text = get_text(root).unwrap().into_iter().join(" "); - - let title_field = schema.get_field("title").unwrap(); - let body_field = schema.get_field("body").unwrap(); - let path_field = schema.get_field("path").unwrap(); - let title_regex_field = schema.get_field("title_regex").unwrap(); - - info!("found path: {path}", path = path.display()); - let path = path - .to_str() - .unwrap() - .to_string() - .split("content") - .last() - .unwrap() - .to_string() - .replace("README", "") - .replace(&config::cms_dir().display().to_string(), ""); - let mut doc = Document::default(); - doc.add_text(title_field, &title_text); - doc.add_text(body_field, &body_text); - doc.add_text(path_field, &path); - doc.add_text(title_regex_field, &title_text); - - index_writer.add_document(doc)?; - } - - tokio::task::spawn_blocking(move || -> tantivy::Result { index_writer.commit() }) - .await - .unwrap()?; - - Ok(()) - } - - pub fn open() -> tantivy::Result { - let path = Self::path(); - - if !path.exists() { - std::fs::create_dir(&path).expect("failed to create search_index directory, is the filesystem writable?"); - } - - let index = match tantivy::Index::open_in_dir(&path) { - Ok(index) => index, - Err(err) => { - warn!( - "Failed to open Tantivy index in '{}', creating an empty one, error: {}", - path.display(), - err - ); - Index::create_in_dir(&path, Self::schema())? - } - }; - - let reader = index.reader_builder().try_into()?; - - let ngram = TextAnalyzer::from(NgramTokenizer::new(3, 3, false)).filter(LowerCaser); - - index.tokenizers().register("ngram3", ngram); - - Ok(SearchIndex { - index: Arc::new(index), - schema: Arc::new(Self::schema()), - reader: Arc::new(reader), - }) - } - - pub fn search(&self, query_string: &str) -> tantivy::Result> { - let mut results = Vec::new(); - let searcher = self.reader.searcher(); - let title_field = self.schema.get_field("title").unwrap(); - let body_field = self.schema.get_field("body").unwrap(); - let path_field = self.schema.get_field("path").unwrap(); - let title_regex_field = self.schema.get_field("title_regex").unwrap(); - - // Search using: - // - // 1. Full text search on the body - // 2. Trigrams on the title - let query_parser = QueryParser::for_index(&self.index, vec![title_field, body_field]); - let query = match query_parser.parse_query(query_string) { - Ok(query) => query, - Err(err) => { - warn!("Query parse error: {}", err); - return Ok(Vec::new()); - } - }; - - let mut top_docs = searcher.search(&query, &TopDocs::with_limit(10)).unwrap(); - - // If that's not enough, search using prefix search on the title. 
- if top_docs.len() < 10 { - let query = match RegexQuery::from_pattern(&format!("{}.*", query_string), title_regex_field) { - Ok(query) => query, - Err(err) => { - warn!("Query regex error: {}", err); - return Ok(Vec::new()); - } - }; - - let more_results = searcher.search(&query, &TopDocs::with_limit(10)).unwrap(); - top_docs.extend(more_results); - } - - // Oh jeez ok - if top_docs.len() < 10 { - let query = match RegexQuery::from_pattern(&format!("{}.*", query_string), body_field) { - Ok(query) => query, - Err(err) => { - warn!("Query regex error: {}", err); - return Ok(Vec::new()); - } - }; - - let more_results = searcher.search(&query, &TopDocs::with_limit(10)).unwrap(); - top_docs.extend(more_results); - } - - // Generate snippets for the FTS query. - let snippet_generator = SnippetGenerator::create(&searcher, &*query, body_field)?; - - let mut dedup = HashSet::new(); - - for (_score, doc_address) in top_docs { - let retrieved_doc = searcher.doc(doc_address)?; - let snippet = snippet_generator.snippet_from_doc(&retrieved_doc); - let path = retrieved_doc - .get_first(path_field) - .unwrap() - .as_text() - .unwrap() - .to_string() - .replace(".md", "") - .replace(&config::static_dir().display().to_string(), ""); - - // Dedup results from prefix search and full text search. - let new = dedup.insert(path.clone()); - - if !new { - continue; - } - - let title = retrieved_doc - .get_first(title_field) - .unwrap() - .as_text() - .unwrap() - .to_string(); - let body = retrieved_doc - .get_first(body_field) - .unwrap() - .as_text() - .unwrap() - .to_string(); - - let snippet = if snippet.is_empty() { - body.split(' ').take(20).collect::>().join(" ") + " ..." - } else { - "... ".to_string() + &snippet.to_html() + " ..." - }; - - results.push(SearchResult { - title, - body, - path, - snippet, - }); - } - - Ok(results) - } - fn get_document_paths() -> anyhow::Result> { // TODO imrpove this .display().to_string() let guides = glob::glob(&config::cms_dir().join("docs/**/*.md").display().to_string())?; @@ -1531,254 +1370,8 @@ impl SiteSearch { } } -// pub struct SearchIndex { -// // The index. -// pub index: Arc, - -// // Index schema (fields). -// pub schema: Arc, - -// // The index reader, supports concurrent access. -// pub reader: Arc, -// } - -// impl SearchIndex { -// pub fn path() -> PathBuf { -// Path::new(&config::search_index_dir()).to_owned() -// } - -// pub fn documents() -> Vec { -// // TODO imrpove this .display().to_string() -// let guides = glob::glob(&config::cms_dir().join("docs/**/*.md").display().to_string()) -// .expect("glob failed"); -// let blogs = glob::glob(&config::cms_dir().join("blog/**/*.md").display().to_string()) -// .expect("glob failed"); -// guides -// .chain(blogs) -// .map(|path| path.expect("glob path failed")) -// .collect() -// } - -// pub fn schema() -> Schema { -// // TODO: Make trigram title index -// // and full text body index, and use trigram only if body gets nothing. 
-// let mut schema_builder = Schema::builder(); -// let title_field_indexing = TextFieldIndexing::default() -// .set_tokenizer("ngram3") -// .set_index_option(IndexRecordOption::WithFreqsAndPositions); -// let title_options = TextOptions::default() -// .set_indexing_options(title_field_indexing) -// .set_stored(); - -// schema_builder.add_text_field("title", title_options.clone()); -// schema_builder.add_text_field("title_regex", TEXT | STORED); -// schema_builder.add_text_field("body", TEXT | STORED); -// schema_builder.add_text_field("path", STORED); - -// schema_builder.build() -// } - -// pub async fn build() -> tantivy::Result<()> { -// // Remove existing index. -// let _ = std::fs::remove_dir_all(Self::path()); -// std::fs::create_dir(Self::path()).unwrap(); - -// let index = tokio::task::spawn_blocking(move || -> tantivy::Result { -// Index::create_in_dir(Self::path(), Self::schema()) -// }) -// .await -// .unwrap()?; - -// let ngram = TextAnalyzer::from(NgramTokenizer::new(3, 3, false)).filter(LowerCaser); - -// index.tokenizers().register("ngram3", ngram); - -// let schema = Self::schema(); -// let mut index_writer = index.writer(50_000_000)?; - -// for path in Self::documents().into_iter() { -// let text = get_document(&path).await.unwrap(); - -// let arena = Arena::new(); -// let root = parse_document(&arena, &text, &options()); -// let title_text = get_title(root).unwrap(); -// let body_text = get_text(root).unwrap().into_iter().join(" "); - -// let title_field = schema.get_field("title").unwrap(); -// let body_field = schema.get_field("body").unwrap(); -// let path_field = schema.get_field("path").unwrap(); -// let title_regex_field = schema.get_field("title_regex").unwrap(); - -// info!("found path: {path}", path = path.display()); -// let path = path -// .to_str() -// .unwrap() -// .to_string() -// .split("content") -// .last() -// .unwrap() -// .to_string() -// .replace("README", "") -// .replace(&config::cms_dir().display().to_string(), ""); -// let mut doc = Document::default(); -// doc.add_text(title_field, &title_text); -// doc.add_text(body_field, &body_text); -// doc.add_text(path_field, &path); -// doc.add_text(title_regex_field, &title_text); - -// index_writer.add_document(doc)?; -// } - -// tokio::task::spawn_blocking(move || -> tantivy::Result { index_writer.commit() }) -// .await -// .unwrap()?; - -// Ok(()) -// } - -// pub fn open() -> tantivy::Result { -// let path = Self::path(); - -// if !path.exists() { -// std::fs::create_dir(&path) -// .expect("failed to create search_index directory, is the filesystem writable?"); -// } - -// let index = match tantivy::Index::open_in_dir(&path) { -// Ok(index) => index, -// Err(err) => { -// warn!( -// "Failed to open Tantivy index in '{}', creating an empty one, error: {}", -// path.display(), -// err -// ); -// Index::create_in_dir(&path, Self::schema())? 
-// } -// }; - -// let reader = index.reader_builder().try_into()?; - -// let ngram = TextAnalyzer::from(NgramTokenizer::new(3, 3, false)).filter(LowerCaser); - -// index.tokenizers().register("ngram3", ngram); - -// Ok(SearchIndex { -// index: Arc::new(index), -// schema: Arc::new(Self::schema()), -// reader: Arc::new(reader), -// }) -// } - -// pub fn search(&self, query_string: &str) -> tantivy::Result> { -// let mut results = Vec::new(); -// let searcher = self.reader.searcher(); -// let title_field = self.schema.get_field("title").unwrap(); -// let body_field = self.schema.get_field("body").unwrap(); -// let path_field = self.schema.get_field("path").unwrap(); -// let title_regex_field = self.schema.get_field("title_regex").unwrap(); - -// // Search using: -// // -// // 1. Full text search on the body -// // 2. Trigrams on the title -// let query_parser = QueryParser::for_index(&self.index, vec![title_field, body_field]); -// let query = match query_parser.parse_query(query_string) { -// Ok(query) => query, -// Err(err) => { -// warn!("Query parse error: {}", err); -// return Ok(Vec::new()); -// } -// }; - -// let mut top_docs = searcher.search(&query, &TopDocs::with_limit(10)).unwrap(); - -// // If that's not enough, search using prefix search on the title. -// if top_docs.len() < 10 { -// let query = -// match RegexQuery::from_pattern(&format!("{}.*", query_string), title_regex_field) { -// Ok(query) => query, -// Err(err) => { -// warn!("Query regex error: {}", err); -// return Ok(Vec::new()); -// } -// }; - -// let more_results = searcher.search(&query, &TopDocs::with_limit(10)).unwrap(); -// top_docs.extend(more_results); -// } - -// // Oh jeez ok -// if top_docs.len() < 10 { -// let query = match RegexQuery::from_pattern(&format!("{}.*", query_string), body_field) { -// Ok(query) => query, -// Err(err) => { -// warn!("Query regex error: {}", err); -// return Ok(Vec::new()); -// } -// }; - -// let more_results = searcher.search(&query, &TopDocs::with_limit(10)).unwrap(); -// top_docs.extend(more_results); -// } - -// // Generate snippets for the FTS query. -// let snippet_generator = SnippetGenerator::create(&searcher, &*query, body_field)?; - -// let mut dedup = HashSet::new(); - -// for (_score, doc_address) in top_docs { -// let retrieved_doc = searcher.doc(doc_address)?; -// let snippet = snippet_generator.snippet_from_doc(&retrieved_doc); -// let path = retrieved_doc -// .get_first(path_field) -// .unwrap() -// .as_text() -// .unwrap() -// .to_string() -// .replace(".md", "") -// .replace(&config::static_dir().display().to_string(), ""); - -// // Dedup results from prefix search and full text search. -// let new = dedup.insert(path.clone()); - -// if !new { -// continue; -// } - -// let title = retrieved_doc -// .get_first(title_field) -// .unwrap() -// .as_text() -// .unwrap() -// .to_string(); -// let body = retrieved_doc -// .get_first(body_field) -// .unwrap() -// .as_text() -// .unwrap() -// .to_string(); - -// let snippet = if snippet.is_empty() { -// body.split(' ').take(20).collect::>().join(" ") + " ..." -// } else { -// "... ".to_string() + &snippet.to_html() + " ..." 
-// }; - -// results.push(SearchResult { -// title, -// body, -// path, -// snippet, -// }); -// } - -// Ok(results) -// } -// } - #[cfg(test)] mod test { - use super::*; use crate::utils::markdown::parser; #[test] From ec351ffce4548681c9a57920bd2c93e225e26925 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Tue, 23 Jan 2024 09:36:40 -0800 Subject: [PATCH 12/72] Working site search with doc type filtering --- pgml-sdks/pgml/src/collection.rs | 13 +++++++++++++ pgml-sdks/pgml/src/search_query_builder.rs | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index 575c88858..23ca51bc9 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -718,6 +718,19 @@ impl Collection { } } + #[instrument(skip(self))] + pub async fn search_local( + &self, + query: Json, + pipeline: &MultiFieldPipeline, + ) -> anyhow::Result> { + let pool = get_or_initialize_pool(&self.database_url).await?; + let (built_query, values) = build_search_query(self, query.clone(), pipeline).await?; + let results: Vec<(Json,)> = sqlx::query_as_with(&built_query, values) + .fetch_all(&pool) + .await?; + Ok(results.into_iter().map(|v| v.0).collect()) + } /// Performs vector search on the [Collection] /// /// # Arguments diff --git a/pgml-sdks/pgml/src/search_query_builder.rs b/pgml-sdks/pgml/src/search_query_builder.rs index 0dd2b94d9..5f5c207d6 100644 --- a/pgml-sdks/pgml/src/search_query_builder.rs +++ b/pgml-sdks/pgml/src/search_query_builder.rs @@ -307,7 +307,7 @@ pub async fn build_search_query( .expr(Expr::cust("*")) .from_subquery(query, Alias::new("q1")) .order_by(SIden::Str("score"), Order::Desc) - .limit(5); + .limit(limit); let mut combined_query = Query::select(); combined_query From 027080fa503b307067c7ca43ecfabbb4274c7197 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Tue, 23 Jan 2024 14:55:34 -0800 Subject: [PATCH 13/72] collection query_builder now a wrapper around collection.vector_search --- pgml-sdks/pgml/src/collection.rs | 22 +- pgml-sdks/pgml/src/languages/javascript.rs | 15 +- pgml-sdks/pgml/src/languages/python.rs | 24 +- pgml-sdks/pgml/src/lib.rs | 160 +++++--- pgml-sdks/pgml/src/model.rs | 1 + pgml-sdks/pgml/src/models.rs | 2 +- pgml-sdks/pgml/src/multi_field_pipeline.rs | 37 +- pgml-sdks/pgml/src/pipeline.rs | 26 +- pgml-sdks/pgml/src/queries.rs | 13 - pgml-sdks/pgml/src/query_builder.rs | 345 +++--------------- pgml-sdks/pgml/src/search_query_builder.rs | 14 +- pgml-sdks/pgml/src/splitter.rs | 1 + pgml-sdks/pgml/src/utils.rs | 7 - .../pgml/src/vector_search_query_builder.rs | 20 +- 14 files changed, 208 insertions(+), 479 deletions(-) diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index 23ca51bc9..b70c23c56 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -3,13 +3,11 @@ use indicatif::MultiProgress; use itertools::Itertools; use regex::Regex; use rust_bridge::{alias, alias_methods}; -use sea_query::{Alias, Expr, JoinType, NullOrdering, Order, PostgresQueryBuilder, Query}; +use sea_query::{Expr, NullOrdering, Order, PostgresQueryBuilder, Query}; use sea_query_binder::SqlxBinder; use serde_json::json; -use sqlx::postgres::PgPool; use sqlx::Executor; use sqlx::PgConnection; -use sqlx::Transaction; use std::borrow::Cow; use std::path::Path; use std::sync::Arc; @@ -22,22 +20,20 @@ use crate::filter_builder::FilterBuilder; 
use crate::search_query_builder::build_search_query; use crate::vector_search_query_builder::build_vector_search_query; use crate::{ - filter_builder, get_or_initialize_pool, - model::ModelRuntime, - models, + get_or_initialize_pool, models, multi_field_pipeline::MultiFieldPipeline, - order_by_builder, - pipeline::Pipeline, - queries, query_builder, + order_by_builder, queries, query_builder, query_builder::QueryBuilder, - remote_embeddings::build_remote_embeddings, splitter::Splitter, types::{DateTime, IntoTableNameAndSchema, Json, SIden, TryToNumeric}, utils, }; #[cfg(feature = "python")] -use crate::{pipeline::PipelinePython, query_builder::QueryBuilderPython, types::JsonPython}; +use crate::{ + multi_field_pipeline::MultiFieldPipelinePython, query_builder::QueryBuilderPython, + types::JsonPython, +}; /// Our project tasks #[derive(Debug, Clone)] @@ -738,7 +734,6 @@ impl Collection { /// * `query` - The query to search for /// * `pipeline` - The [Pipeline] used for the search /// * `query_paramaters` - The query parameters passed to the model for search - /// * `top_k` - How many results to limit on. /// /// # Example /// @@ -758,7 +753,6 @@ impl Collection { &mut self, query: Json, pipeline: &mut MultiFieldPipeline, - top_k: Option, ) -> anyhow::Result> { let pool = get_or_initialize_pool(&self.database_url).await?; @@ -1113,7 +1107,7 @@ documents ||..|{{ {nice_name_key}_chunks ); uml_relations.push_str(&relations); - if let Some(_embed_action) = &field_action.embed { + if let Some(_embed_action) = &field_action.semantic_search { let entites = format!( r#" entity "{schema}.{key}_chunks" as {nice_name_key}_chunks {{ diff --git a/pgml-sdks/pgml/src/languages/javascript.rs b/pgml-sdks/pgml/src/languages/javascript.rs index c49b5c493..f8de14587 100644 --- a/pgml-sdks/pgml/src/languages/javascript.rs +++ b/pgml-sdks/pgml/src/languages/javascript.rs @@ -4,10 +4,7 @@ use rust_bridge::javascript::{FromJsType, IntoJsResult}; use std::cell::RefCell; use std::sync::Arc; -use crate::{ - pipeline::PipelineSyncData, - types::{DateTime, GeneralJsonAsyncIterator, GeneralJsonIterator, Json}, -}; +use crate::types::{DateTime, GeneralJsonAsyncIterator, GeneralJsonIterator, Json}; //////////////////////////////////////////////////////////////////////////////// // Rust to JS ////////////////////////////////////////////////////////////////// @@ -63,16 +60,6 @@ impl IntoJsResult for Json { } } -impl IntoJsResult for PipelineSyncData { - type Output = JsValue; - fn into_js_result<'a, 'b, 'c: 'b, C: Context<'c>>( - self, - cx: &mut C, - ) -> JsResult<'b, Self::Output> { - Json::from(self).into_js_result(cx) - } -} - #[derive(Clone)] struct GeneralJsonAsyncIteratorJavaScript(Arc>); diff --git a/pgml-sdks/pgml/src/languages/python.rs b/pgml-sdks/pgml/src/languages/python.rs index 9d19b16bd..dba8c5179 100644 --- a/pgml-sdks/pgml/src/languages/python.rs +++ b/pgml-sdks/pgml/src/languages/python.rs @@ -6,10 +6,7 @@ use std::sync::Arc; use rust_bridge::python::CustomInto; -use crate::{ - pipeline::PipelineSyncData, - types::{GeneralJsonAsyncIterator, GeneralJsonIterator, Json}, -}; +use crate::types::{GeneralJsonAsyncIterator, GeneralJsonIterator, Json}; //////////////////////////////////////////////////////////////////////////////// // Rust to PY ////////////////////////////////////////////////////////////////// @@ -50,12 +47,6 @@ impl IntoPy for Json { } } -impl IntoPy for PipelineSyncData { - fn into_py(self, py: Python) -> PyObject { - Json::from(self).into_py(py) - } -} - #[pyclass] #[derive(Clone)] struct 
GeneralJsonAsyncIteratorPython { @@ -177,13 +168,6 @@ impl FromPyObject<'_> for Json { } } -impl FromPyObject<'_> for PipelineSyncData { - fn extract(ob: &PyAny) -> PyResult { - let json = Json::extract(ob)?; - Ok(json.into()) - } -} - impl FromPyObject<'_> for GeneralJsonAsyncIterator { fn extract(_ob: &PyAny) -> PyResult { panic!("We must implement this, but this is impossible to be reached") @@ -199,9 +183,3 @@ impl FromPyObject<'_> for GeneralJsonIterator { //////////////////////////////////////////////////////////////////////////////// // Rust to Rust ////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// - -impl CustomInto for PipelineSyncData { - fn custom_into(self) -> Json { - Json::from(self) - } -} diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index bc4266b17..ab2f12315 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -247,6 +247,7 @@ mod tests { "id": i, "title": format!("Test document: {}", i), "body": body_text, + "text": "here is some test text", "notes": format!("Here are some notes or something for test document {}", i), "metadata": { "uuid": i * 10, @@ -262,7 +263,7 @@ mod tests { // Collection & Pipelines ///// /////////////////////////////// - #[sqlx::test] + #[tokio::test] async fn can_create_collection() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let mut collection = Collection::new("test_r_c_ccc_0", None); @@ -273,10 +274,10 @@ mod tests { Ok(()) } - #[sqlx::test] + #[tokio::test] async fn can_add_remove_pipeline() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let mut pipeline = MultiFieldPipeline::new("test_p_cap_57", Some(json!({}).into()))?; + let mut pipeline = MultiFieldPipeline::new("test_p_cap_58", Some(json!({}).into()))?; let mut collection = Collection::new("test_r_c_carp_1", None); assert!(collection.database_data.is_none()); collection.add_pipeline(&mut pipeline).await?; @@ -288,12 +289,12 @@ mod tests { Ok(()) } - #[sqlx::test] + #[tokio::test] async fn can_add_remove_pipelines() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let mut pipeline1 = MultiFieldPipeline::new("test_r_p_carps_1", Some(json!({}).into()))?; let mut pipeline2 = MultiFieldPipeline::new("test_r_p_carps_2", Some(json!({}).into()))?; - let mut collection = Collection::new("test_r_c_carps_10", None); + let mut collection = Collection::new("test_r_c_carps_11", None); collection.add_pipeline(&mut pipeline1).await?; collection.add_pipeline(&mut pipeline2).await?; let pipelines = collection.get_pipelines().await?; @@ -306,7 +307,7 @@ mod tests { Ok(()) } - #[sqlx::test] + #[tokio::test] async fn can_add_pipeline_and_upsert_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let collection_name = "test_r_c_capaud_47"; @@ -316,7 +317,7 @@ mod tests { Some( json!({ "title": { - "embed": { + "semantic_search": { "model": "intfloat/e5-small" } }, @@ -324,7 +325,7 @@ mod tests { "splitter": { "model": "recursive_character" }, - "embed": { + "semantic_search": { "model": "intfloat/e5-small", }, "full_text_search": { @@ -371,10 +372,10 @@ mod tests { Ok(()) } - #[sqlx::test] + #[tokio::test] async fn can_upsert_documents_and_add_pipeline() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cudaap_43"; + let collection_name = "test_r_c_cudaap_44"; let mut collection = Collection::new(collection_name, None); let documents = 
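// Upsert documents before any pipeline exists; add_pipeline below should backfill chunks, embeddings and tsvectors for them.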
generate_dummy_documents(2); collection.upsert_documents(documents.clone(), None).await?; @@ -384,7 +385,7 @@ mod tests { Some( json!({ "title": { - "embed": { + "semantic_search": { "model": "intfloat/e5-small" } }, @@ -392,7 +393,7 @@ mod tests { "splitter": { "model": "recursive_character" }, - "embed": { + "semantic_search": { "model": "intfloat/e5-small", }, "full_text_search": { @@ -436,7 +437,7 @@ mod tests { Ok(()) } - #[sqlx::test] + #[tokio::test] async fn disable_enable_pipeline() -> anyhow::Result<()> { let mut pipeline = MultiFieldPipeline::new("test_p_dep_1", Some(json!({}).into()))?; let mut collection = Collection::new("test_r_c_dep_1", None); @@ -453,7 +454,7 @@ mod tests { Ok(()) } - #[sqlx::test] + #[tokio::test] async fn can_upsert_documents_and_enable_pipeline() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let collection_name = "test_r_c_cudaap_43"; @@ -464,7 +465,7 @@ mod tests { Some( json!({ "title": { - "embed": { + "semantic_search": { "model": "intfloat/e5-small" } } @@ -494,7 +495,7 @@ mod tests { Ok(()) } - #[sqlx::test] + #[tokio::test] async fn random_pipelines_documents_test() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let collection_name = "test_r_c_rpdt_3"; @@ -509,7 +510,7 @@ mod tests { Some( json!({ "title": { - "embed": { + "semantic_search": { "model": "intfloat/e5-small" } }, @@ -517,7 +518,7 @@ mod tests { "splitter": { "model": "recursive_character" }, - "embed": { + "semantic_search": { "model": "intfloat/e5-small", }, "full_text_search": { @@ -560,7 +561,7 @@ mod tests { Some( json!({ "title": { - "embed": { + "semantic_search": { "model": "intfloat/e5-small" } }, @@ -568,7 +569,7 @@ mod tests { "splitter": { "model": "recursive_character" }, - "embed": { + "semantic_search": { "model": "intfloat/e5-small", }, "full_text_search": { @@ -646,7 +647,7 @@ mod tests { Ok(()) } - #[sqlx::test] + #[tokio::test] async fn pipeline_sync_status() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let collection_name = "test_r_c_pss_5"; @@ -657,7 +658,7 @@ mod tests { Some( json!({ "title": { - "embed": { + "semantic_search": { "model": "intfloat/e5-small" }, "full_text_search": { @@ -754,7 +755,7 @@ mod tests { Ok(()) } - #[sqlx::test] + #[tokio::test] async fn can_specify_custom_hnsw_parameters_for_pipelines() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let collection_name = "test_r_c_cschpfp_4"; @@ -765,7 +766,7 @@ mod tests { Some( json!({ "title": { - "embed": { + "semantic_search": { "model": "intfloat/e5-small", "hnsw": { "m": 100, @@ -802,7 +803,7 @@ mod tests { // Searches /////////////////// /////////////////////////////// - #[sqlx::test] + #[tokio::test] async fn can_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let collection_name = "test_r_c_cs_67"; @@ -815,7 +816,7 @@ mod tests { Some( json!({ "title": { - "embed": { + "semantic_search": { "model": "intfloat/e5-small" }, "full_text_search": { @@ -826,7 +827,7 @@ mod tests { "splitter": { "model": "recursive_character" }, - "embed": { + "semantic_search": { "model": "intfloat/e5-small" }, "full_text_search": { @@ -834,7 +835,7 @@ mod tests { } }, "notes": { - "embed": { + "semantic_search": { "model": "intfloat/e5-small" } } @@ -893,7 +894,7 @@ mod tests { Ok(()) } - #[sqlx::test] + #[tokio::test] async fn can_search_with_remote_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let collection_name = "test_r_c_cswre_52"; @@ -906,7 +907,7 @@ mod tests { 
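// Mixed runtimes in one pipeline: the title field embeds with a local intfloat/e5-small model while the body field uses OpenAI's text-embedding-ada-002 via the remote runtime.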
Some( json!({ "title": { - "embed": { + "semantic_search": { "model": "intfloat/e5-small" } }, @@ -914,7 +915,7 @@ mod tests { "splitter": { "model": "recursive_character" }, - "embed": { + "semantic_search": { "model": "text-embedding-ada-002", "source": "openai", }, @@ -973,7 +974,7 @@ mod tests { // Vector Searches //////////// /////////////////////////////// - #[sqlx::test] + #[tokio::test] async fn can_vector_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let collection_name = "test_r_c_cvswle_3"; @@ -986,7 +987,7 @@ mod tests { Some( json!({ "title": { - "embed": { + "semantic_search": { "model": "intfloat/e5-small" }, "full_text_search": { @@ -997,7 +998,7 @@ mod tests { "splitter": { "model": "recursive_character" }, - "embed": { + "semantic_search": { "model": "intfloat/e5-small" }, }, @@ -1029,7 +1030,6 @@ mod tests { }) .into(), &mut pipeline, - None, ) .await?; let ids: Vec = results @@ -1041,7 +1041,7 @@ mod tests { Ok(()) } - #[sqlx::test] + #[tokio::test] async fn can_vector_search_with_remote_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let collection_name = "test_r_c_cvswre_4"; @@ -1054,7 +1054,7 @@ mod tests { Some( json!({ "title": { - "embed": { + "semantic_search": { "model": "intfloat/e5-small" }, "full_text_search": { @@ -1065,7 +1065,7 @@ mod tests { "splitter": { "model": "recursive_character" }, - "embed": { + "semantic_search": { "source": "openai", "model": "text-embedding-ada-002" }, @@ -1082,7 +1082,7 @@ mod tests { "query": { "fields": { "title": { - "full_text_search": "test", + "full_text_filter": "test", "query": "Test document: 2" }, "body": { @@ -1099,7 +1099,6 @@ mod tests { }) .into(), &mut pipeline, - None, ) .await?; let ids: Vec = results @@ -1111,11 +1110,64 @@ mod tests { Ok(()) } + #[tokio::test] + async fn can_vector_search_with_query_builder() -> anyhow::Result<()> { + internal_init_logger(None, None).ok(); + let mut collection = Collection::new("test_r_c_cvswqb_7", None); + let mut pipeline = MultiFieldPipeline::new( + "test_r_p_cvswqb_0", + Some( + json!({ + "text": { + "semantic_search": { + "model": "intfloat/e5-small" + }, + "full_text_search": { + "configuration": "english" + } + }, + }) + .into(), + ), + )?; + collection + .upsert_documents(generate_dummy_documents(10), None) + .await?; + collection.add_pipeline(&mut pipeline).await?; + let results = collection + .query() + .vector_recall("test query", &pipeline, None) + .limit(3) + .filter( + json!({ + "metadata": { + "id": { + "$gt": 3 + } + }, + "full_text": { + "configuration": "english", + "text": "test" + } + }) + .into(), + ) + .fetch_all() + .await?; + let ids: Vec = results + .into_iter() + .map(|r| r.2["id"].as_u64().unwrap()) + .collect(); + assert_eq!(ids, vec![4, 5, 6]); + collection.archive().await?; + Ok(()) + } + /////////////////////////////// // Working With Documents ///// /////////////////////////////// - #[sqlx::test] + #[tokio::test] async fn can_upsert_and_filter_get_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let mut collection = Collection::new("test_r_c_cuafgd_1", None); @@ -1168,7 +1220,7 @@ mod tests { Ok(()) } - #[sqlx::test] + #[tokio::test] async fn can_paginate_get_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let mut collection = Collection::new("test_r_c_cpgd_2", None); @@ -1250,7 +1302,7 @@ mod tests { Ok(()) } - #[sqlx::test] + #[tokio::test] async fn can_filter_and_paginate_get_documents() -> anyhow::Result<()> { 
internal_init_logger(None, None).ok(); let mut collection = Collection::new("test_r_c_cfapgd_1", None); @@ -1307,7 +1359,7 @@ mod tests { Ok(()) } - #[sqlx::test] + #[tokio::test] async fn can_filter_and_delete_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let mut collection = Collection::new("test_r_c_cfadd_1", None); @@ -1351,8 +1403,8 @@ mod tests { Ok(()) } - #[sqlx::test] - fn can_order_documents() -> anyhow::Result<()> { + #[tokio::test] + async fn can_order_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let mut collection = Collection::new("test_r_c_cod_1", None); collection @@ -1431,7 +1483,7 @@ mod tests { Ok(()) } - #[sqlx::test] + #[tokio::test] async fn can_update_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let mut collection = Collection::new("test_r_c_cud_5", None); @@ -1496,8 +1548,8 @@ mod tests { Ok(()) } - #[sqlx::test] - fn can_merge_metadata() -> anyhow::Result<()> { + #[tokio::test] + async fn can_merge_metadata() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let mut collection = Collection::new("test_r_c_cmm_5", None); collection @@ -1640,7 +1692,7 @@ mod tests { "test_parameter": 11 } }, - "embed": { + "semantic_search": { "model": "test_model", "parameters": { "test_parameter": 10 @@ -1663,7 +1715,7 @@ mod tests { // ER Diagram ///////////////// /////////////////////////////// - #[sqlx::test] + #[tokio::test] async fn generate_er_diagram() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let mut pipeline = MultiFieldPipeline::new( @@ -1671,7 +1723,7 @@ mod tests { Some( json!({ "title": { - "embed": { + "semantic_search": { "model": "intfloat/e5-small" }, "full_text_search": { @@ -1682,7 +1734,7 @@ mod tests { "splitter": { "model": "recursive_character" }, - "embed": { + "semantic_search": { "model": "intfloat/e5-small" }, "full_text_search": { @@ -1690,7 +1742,7 @@ mod tests { } }, "notes": { - "embed": { + "semantic_search": { "model": "intfloat/e5-small" } } diff --git a/pgml-sdks/pgml/src/model.rs b/pgml-sdks/pgml/src/model.rs index 49197ecf1..576bfbc65 100644 --- a/pgml-sdks/pgml/src/model.rs +++ b/pgml-sdks/pgml/src/model.rs @@ -45,6 +45,7 @@ impl From<&ModelRuntime> for &'static str { } } +#[allow(dead_code)] #[derive(Debug, Clone)] pub(crate) struct ModelDatabaseData { pub id: i64, diff --git a/pgml-sdks/pgml/src/models.rs b/pgml-sdks/pgml/src/models.rs index 634fff369..81d0f488c 100644 --- a/pgml-sdks/pgml/src/models.rs +++ b/pgml-sdks/pgml/src/models.rs @@ -78,7 +78,7 @@ pub struct Document { } impl Document { - pub fn into_user_friendly_json(mut self) -> Json { + pub fn into_user_friendly_json(self) -> Json { serde_json::json!({ "row_id": self.id, "created_at": self.created_at, diff --git a/pgml-sdks/pgml/src/multi_field_pipeline.rs b/pgml-sdks/pgml/src/multi_field_pipeline.rs index bba53fd48..00630da0e 100644 --- a/pgml-sdks/pgml/src/multi_field_pipeline.rs +++ b/pgml-sdks/pgml/src/multi_field_pipeline.rs @@ -1,13 +1,10 @@ use anyhow::Context; -use indicatif::MultiProgress; -use rust_bridge::{alias, alias_manual, alias_methods}; +use rust_bridge::{alias, alias_methods}; use serde::Deserialize; use serde_json::json; use sqlx::{Executor, PgConnection, PgPool, Postgres, Transaction}; -use std::sync::atomic::Ordering::Relaxed; +use std::collections::HashMap; use std::sync::Arc; -use std::{collections::HashMap, sync::atomic::AtomicBool}; -use tokio::join; use tokio::sync::Mutex; use tracing::instrument; @@ -20,7 +17,6 @@ use crate::{ 
remote_embeddings::build_remote_embeddings, splitter::Splitter, types::{DateTime, Json, TryToNumeric}, - utils, }; #[cfg(feature = "python")] @@ -50,7 +46,7 @@ pub struct FullTextSearchAction { #[derive(Deserialize)] struct ValidFieldAction { splitter: Option, - embed: Option, + semantic_search: Option, full_text_search: Option, } @@ -96,7 +92,7 @@ pub struct SplitterAction { } #[derive(Debug, Clone)] -pub struct EmbedAction { +pub struct SemanticSearchAction { pub model: Model, pub hnsw: HNSW, } @@ -104,7 +100,7 @@ pub struct EmbedAction { #[derive(Debug, Clone)] pub struct FieldAction { pub splitter: Option, - pub embed: Option, + pub semantic_search: Option, pub full_text_search: Option, } @@ -112,14 +108,14 @@ impl TryFrom for FieldAction { type Error = anyhow::Error; fn try_from(value: ValidFieldAction) -> Result { let embed = value - .embed + .semantic_search .map(|v| { let model = Model::new(Some(v.model), v.source, v.parameters); let hnsw = v .hnsw .map(|v2| HNSW::try_from(v2)) .unwrap_or_else(|| Ok(HNSW::default()))?; - anyhow::Ok(EmbedAction { model, hnsw }) + anyhow::Ok(SemanticSearchAction { model, hnsw }) }) .transpose()?; let splitter = value @@ -131,7 +127,7 @@ impl TryFrom for FieldAction { .transpose()?; Ok(Self { splitter, - embed, + semantic_search: embed, full_text_search: value.full_text_search, }) } @@ -177,7 +173,7 @@ pub struct MultiFieldPipelineDatabaseData { pub created_at: DateTime, } -#[derive(Debug, Clone)] +#[derive(alias, Debug, Clone)] pub struct MultiFieldPipeline { pub name: String, pub schema: Option, @@ -204,6 +200,7 @@ fn json_to_schema(schema: &Json) -> anyhow::Result { }) } +#[alias_methods(new, get_status)] impl MultiFieldPipeline { pub fn new(name: &str, schema: Option) -> anyhow::Result { let parsed_schema = schema.as_ref().map(|s| json_to_schema(s)).transpose()?; @@ -268,7 +265,7 @@ impl MultiFieldPipeline { }); } - if let Some(_) = value.embed { + if let Some(_) = value.semantic_search { let embeddings_table_name = format!("{schema}.{key}_embeddings"); let embeddings_status: (Option, Option) = sqlx::query_as(&query_builder!( @@ -334,7 +331,7 @@ impl MultiFieldPipeline { splitter.model.set_project_info(project_info.clone()); splitter.model.verify_in_database(false).await?; } - if let Some(embed) = &mut value.embed { + if let Some(embed) = &mut value.semantic_search { embed.model.set_project_info(project_info.clone()); embed.model.verify_in_database(false).await?; } @@ -355,7 +352,7 @@ impl MultiFieldPipeline { splitter.model.set_project_info(project_info.clone()); splitter.model.verify_in_database(false).await?; } - if let Some(embed) = &mut value.embed { + if let Some(embed) = &mut value.semantic_search { embed.model.set_project_info(project_info.clone()); embed.model.verify_in_database(false).await?; } @@ -435,7 +432,7 @@ impl MultiFieldPipeline { ) .await?; - if let Some(embed) = &value.embed { + if let Some(embed) = &value.semantic_search { let embeddings_table_name = format!("{}.{}_embeddings", schema, key); let embedding_length = match &embed.model.runtime { ModelRuntime::Python => { @@ -594,7 +591,7 @@ impl MultiFieldPipeline { ) .await?; if !chunk_ids.is_empty() { - if let Some(embed) = &value.embed { + if let Some(embed) = &value.semantic_search { self.sync_embeddings_for_chunks( key, &embed.model, @@ -790,7 +787,7 @@ impl MultiFieldPipeline { for (key, value) in parsed_schema.iter() { self.resync_chunks(key, value.splitter.as_ref().map(|v| &v.model)) .await?; - if let Some(embed) = &value.embed { + if let Some(embed) = 
&value.semantic_search { self.resync_embeddings(key, &embed.model).await?; } if let Some(full_text_search) = &value.full_text_search { @@ -944,7 +941,7 @@ impl MultiFieldPipeline { if let Some(splitter) = &mut value.splitter { splitter.model.set_project_info(project_info.clone()); } - if let Some(embed) = &mut value.embed { + if let Some(embed) = &mut value.semantic_search { embed.model.set_project_info(project_info.clone()); } } diff --git a/pgml-sdks/pgml/src/pipeline.rs b/pgml-sdks/pgml/src/pipeline.rs index 854e55714..b9a67b805 100644 --- a/pgml-sdks/pgml/src/pipeline.rs +++ b/pgml-sdks/pgml/src/pipeline.rs @@ -1,27 +1,15 @@ -use anyhow::Context; -use indicatif::MultiProgress; -use rust_bridge::{alias, alias_manual, alias_methods}; +use rust_bridge::{alias, alias_methods}; use serde_json::json; -use sqlx::{Executor, PgConnection, PgPool}; -use std::sync::atomic::AtomicBool; -use std::sync::atomic::Ordering::Relaxed; -use tokio::join; -use tracing::instrument; use crate::{ - collection::ProjectInfo, - get_or_initialize_pool, - model::{Model, ModelRuntime}, - multi_field_pipeline::MultiFieldPipeline, - queries, query_builder, - remote_embeddings::build_remote_embeddings, - splitter::Splitter, - types::{DateTime, Json, TryToNumeric}, - utils, + model::Model, multi_field_pipeline::MultiFieldPipeline, splitter::Splitter, types::Json, }; #[cfg(feature = "python")] -use crate::{model::ModelPython, splitter::SplitterPython, types::JsonPython}; +use crate::{ + model::ModelPython, multi_field_pipeline::MultiFieldPipelinePython, splitter::SplitterPython, + types::JsonPython, +}; /// A pipeline that processes documents /// This has been deprecated in favor of [MultiFieldPipeline] @@ -33,7 +21,7 @@ pub struct Pipeline { pub parameters: Option, } -#[alias_methods(new, get_status, to_dict)] +#[alias_methods(new)] impl Pipeline { /// Creates a new [Pipeline] /// diff --git a/pgml-sdks/pgml/src/queries.rs b/pgml-sdks/pgml/src/queries.rs index 0f38f584f..4d682ea48 100644 --- a/pgml-sdks/pgml/src/queries.rs +++ b/pgml-sdks/pgml/src/queries.rs @@ -13,19 +13,6 @@ CREATE TABLE IF NOT EXISTS pgml.collections ( ); "#; -pub const CREATE_PIPELINES_TABLE: &str = r#" -CREATE TABLE IF NOT EXISTS %s ( - id serial8 PRIMARY KEY, - name text NOT NULL, - created_at timestamp NOT NULL DEFAULT now(), - model_id int8 NOT NULL REFERENCES pgml.models ON DELETE CASCADE ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED, - splitter_id int8 NOT NULL REFERENCES pgml.splitters ON DELETE CASCADE ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED, - active BOOLEAN NOT NULL DEFAULT FALSE, - parameters jsonb NOT NULL DEFAULT '{}', - UNIQUE (name) -); -"#; - pub const CREATE_MULTI_FIELD_PIPELINES_TABLE: &str = r#" CREATE TABLE IF NOT EXISTS %s ( id serial8 PRIMARY KEY, diff --git a/pgml-sdks/pgml/src/query_builder.rs b/pgml-sdks/pgml/src/query_builder.rs index 8bb1b8b81..f0fd708e2 100644 --- a/pgml-sdks/pgml/src/query_builder.rs +++ b/pgml-sdks/pgml/src/query_builder.rs @@ -1,56 +1,47 @@ +// NOTE: DEPRECATED +// This whole file is legacy and is only here to be backwards compatible with collection.query() +// No new things should be added here, instead add new items to collection.vector_search + use anyhow::Context; use rust_bridge::{alias, alias_methods}; -use sea_query::{ - query::SelectStatement, Alias, CommonTableExpression, Expr, Func, JoinType, Order, - PostgresQueryBuilder, Query, QueryStatementWriter, WithClause, -}; -use sea_query_binder::SqlxBinder; -use std::borrow::Cow; +use serde_json::json; use tracing::instrument; -use 
crate::{ - filter_builder, get_or_initialize_pool, - model::ModelRuntime, - models, - multi_field_pipeline::MultiFieldPipeline, - query_builder, - remote_embeddings::build_remote_embeddings, - types::{IntoTableNameAndSchema, Json, SIden, TryToNumeric}, - Collection, -}; +use crate::{multi_field_pipeline::MultiFieldPipeline, types::Json, Collection}; #[cfg(feature = "python")] use crate::{multi_field_pipeline::MultiFieldPipelinePython, types::JsonPython}; -#[derive(Clone, Debug)] -struct QueryBuilderState {} - #[derive(alias, Clone, Debug)] pub struct QueryBuilder { - query: SelectStatement, - with: WithClause, collection: Collection, - query_string: Option, + query: Json, pipeline: Option, - query_parameters: Option, } #[alias_methods(limit, filter, vector_recall, to_full_string, fetch_all)] impl QueryBuilder { pub fn new(collection: Collection) -> Self { + let query = json!({ + "query": { + "fields": { + "text": { + + } + } + } + }) + .into(); Self { - query: SelectStatement::new(), - with: WithClause::new(), collection, - query_string: None, + query, pipeline: None, - query_parameters: None, } } #[instrument(skip(self))] pub fn limit(mut self, limit: u64) -> Self { - self.query.limit(limit); + self.query["limit"] = json!(limit); self } @@ -61,64 +52,15 @@ impl QueryBuilder { .as_object_mut() .expect("Filter must be a Json object"); if let Some(f) = filter.remove("metadata") { - self = self.filter_metadata(f); + self.query["query"]["filter"] = f; } - if let Some(f) = filter.remove("full_text_search") { - self = self.filter_full_text(f); + if let Some(mut f) = filter.remove("full_text") { + self.query["query"]["fields"]["text"]["full_text_filter"] = + std::mem::take(&mut f["text"]); } self } - #[instrument(skip(self))] - fn filter_metadata(mut self, filter: serde_json::Value) -> Self { - let filter = filter_builder::FilterBuilder::new(filter, "documents", "metadata") - .build() - .expect("Error building filter"); - self.query.cond_where(filter); - self - } - - #[instrument(skip(self))] - fn filter_full_text(mut self, mut filter: serde_json::Value) -> Self { - let filter = filter - .as_object_mut() - .expect("Full text filter must be a Json object"); - let configuration = match filter.get("configuration") { - Some(config) => config.as_str().expect("Configuration must be a string"), - None => "english", - }; - let filter_text = filter - .get("text") - .expect("Filter must contain a text field") - .as_str() - .expect("Text must be a string"); - self.query - .join_as( - JoinType::InnerJoin, - self.collection - .documents_tsvectors_table_name - .to_table_tuple(), - Alias::new("documents_tsvectors"), - Expr::col((SIden::Str("documents"), SIden::Str("id"))) - .equals((SIden::Str("documents_tsvectors"), SIden::Str("document_id"))), - ) - .and_where( - Expr::col(( - SIden::Str("documents_tsvectors"), - SIden::Str("configuration"), - )) - .eq(configuration), - ) - .and_where(Expr::cust_with_values( - format!( - "documents_tsvectors.ts @@ plainto_tsquery('{}', $1)", - configuration - ), - [filter_text], - )); - self - } - #[instrument(skip(self))] pub fn vector_recall( mut self, @@ -126,222 +68,37 @@ impl QueryBuilder { pipeline: &MultiFieldPipeline, query_parameters: Option, ) -> Self { - unimplemented!() - // // Save these in case of failure - // self.pipeline = Some(pipeline.clone()); - // self.query_string = Some(query.to_owned()); - // self.query_parameters = query_parameters.clone(); - - // let mut query_parameters = query_parameters.unwrap_or_default().0; - // // If they did set hnsw, remove 
it before we pass it to the model - // query_parameters - // .as_object_mut() - // .expect("Query parameters must be a Json object") - // .remove("hnsw"); - // let embeddings_table_name = - // format!("{}.{}_embeddings", self.collection.name, pipeline.name); - - // // Build the pipeline CTE - // let mut pipeline_cte = Query::select(); - // pipeline_cte - // .from_as( - // self.collection.pipelines_table_name.to_table_tuple(), - // SIden::Str("pipeline"), - // ) - // .columns([models::MultiFieldPipelineIden::ModelId]) - // .and_where(Expr::col(models::MultiFieldPipelineIden::Name).eq(&pipeline.name)); - // let mut pipeline_cte = CommonTableExpression::from_select(pipeline_cte); - // pipeline_cte.table_name(Alias::new("pipeline")); - - // // Build the model CTE - // let mut model_cte = Query::select(); - // model_cte - // .from_as( - // (SIden::Str("pgml"), SIden::Str("models")), - // SIden::Str("model"), - // ) - // .columns([models::ModelIden::Hyperparams]) - // .and_where(Expr::cust("id = (SELECT model_id FROM pipeline)")); - // let mut model_cte = CommonTableExpression::from_select(model_cte); - // model_cte.table_name(Alias::new("model")); - - // // Build the embedding CTE - // let mut embedding_cte = Query::select(); - // embedding_cte.expr_as( - // Func::cast_as( - // Func::cust(SIden::Str("pgml.embed")).args([ - // Expr::cust("transformer => (SELECT hyperparams->>'name' FROM model)"), - // Expr::cust_with_values("text => $1", [query]), - // Expr::cust_with_values("kwargs => $1", [query_parameters]), - // ]), - // Alias::new("vector"), - // ), - // Alias::new("embedding"), - // ); - // let mut embedding_cte = CommonTableExpression::from_select(embedding_cte); - // embedding_cte.table_name(Alias::new("embedding")); - - // // Build the where clause - // let mut with_clause = WithClause::new(); - // self.with = with_clause - // .cte(pipeline_cte) - // .cte(model_cte) - // .cte(embedding_cte) - // .to_owned(); - - // // Build the query - // self.query - // .expr(Expr::cust( - // "(embeddings.embedding <=> (SELECT embedding from embedding)) score", - // )) - // .columns([ - // (SIden::Str("chunks"), SIden::Str("chunk")), - // (SIden::Str("documents"), SIden::Str("metadata")), - // ]) - // .from_as( - // embeddings_table_name.to_table_tuple(), - // SIden::Str("embeddings"), - // ) - // .join_as( - // JoinType::InnerJoin, - // self.collection.chunks_table_name.to_table_tuple(), - // Alias::new("chunks"), - // Expr::col((SIden::Str("chunks"), SIden::Str("id"))) - // .equals((SIden::Str("embeddings"), SIden::Str("chunk_id"))), - // ) - // .join_as( - // JoinType::InnerJoin, - // self.collection.documents_table_name.to_table_tuple(), - // Alias::new("documents"), - // Expr::col((SIden::Str("documents"), SIden::Str("id"))) - // .equals((SIden::Str("chunks"), SIden::Str("document_id"))), - // ) - // .order_by(SIden::Str("score"), Order::Asc); - // self + self.pipeline = Some(pipeline.clone()); + self.query["query"]["fields"]["text"]["query"] = json!(query); + if let Some(query_parameters) = query_parameters { + self.query["query"]["fields"]["text"]["model_parameters"] = query_parameters.0; + } + self } #[instrument(skip(self))] pub async fn fetch_all(mut self) -> anyhow::Result> { - unimplemented!() - // let pool = get_or_initialize_pool(&self.collection.database_url).await?; - - // let mut query_parameters = self.query_parameters.unwrap_or_default(); - - // let (sql, values) = self - // .query - // .clone() - // .with(self.with.clone()) - // .build_sqlx(PostgresQueryBuilder); - - // let result: 
Result, _> = - // if !query_parameters["hnsw"]["ef_search"].is_null() { - // let mut transaction = pool.begin().await?; - // let ef_search = query_parameters["hnsw"]["ef_search"] - // .try_to_i64() - // .context("ef_search must be an integer")?; - // sqlx::query(&query_builder!("SET LOCAL hnsw.ef_search = %d", ef_search)) - // .execute(&mut *transaction) - // .await?; - // let results = sqlx::query_as_with(&sql, values) - // .fetch_all(&mut *transaction) - // .await; - // transaction.commit().await?; - // results - // } else { - // sqlx::query_as_with(&sql, values).fetch_all(&pool).await - // }; - - // match result { - // Ok(r) => Ok(r), - // Err(e) => match e.as_database_error() { - // Some(d) => { - // if d.code() == Some(Cow::from("XX000")) { - // // Explicitly get and set the model - // let project_info = self.collection.get_project_info().await?; - // let pipeline = self - // .pipeline - // .as_mut() - // .context("Need pipeline to call fetch_all on query builder with remote embeddings")?; - // pipeline.set_project_info(project_info); - // pipeline.verify_in_database(false).await?; - // let model = pipeline - // .model - // .as_ref() - // .context("MultiFieldPipeline must be verified to perform vector search with remote embeddings")?; - - // // If the model runtime is python, the error was not caused by an unsupported runtime - // if model.runtime == ModelRuntime::Python { - // return Err(anyhow::anyhow!(e)); - // } - - // let hnsw_parameters = query_parameters - // .as_object_mut() - // .context("Query parameters must be a Json object")? - // .remove("hnsw"); - - // let remote_embeddings = - // build_remote_embeddings(model.runtime, &model.name, Some(&query_parameters))?; - // let mut embeddings = remote_embeddings - // .embed(vec![self - // .query_string - // .to_owned() - // .context("Must have query_string to call fetch_all on query_builder with remote embeddings")?]) - // .await?; - // let embedding = std::mem::take(&mut embeddings[0]); - - // let mut embedding_cte = Query::select(); - // embedding_cte - // .expr(Expr::cust_with_values("$1::vector embedding", [embedding])); - - // let mut embedding_cte = CommonTableExpression::from_select(embedding_cte); - // embedding_cte.table_name(Alias::new("embedding")); - // let mut with_clause = WithClause::new(); - // with_clause.cte(embedding_cte); - - // let (sql, values) = self - // .query - // .clone() - // .with(with_clause) - // .build_sqlx(PostgresQueryBuilder); - - // if let Some(parameters) = hnsw_parameters { - // let mut transaction = pool.begin().await?; - // let ef_search = parameters["ef_search"] - // .try_to_i64() - // .context("ef_search must be an integer")?; - // sqlx::query(&query_builder!( - // "SET LOCAL hnsw.ef_search = %d", - // ef_search - // )) - // .execute(&mut *transaction) - // .await?; - // let results = sqlx::query_as_with(&sql, values) - // .fetch_all(&mut *transaction) - // .await; - // transaction.commit().await?; - // results - // } else { - // sqlx::query_as_with(&sql, values).fetch_all(&pool).await - // } - // .map_err(|e| anyhow::anyhow!(e)) - // } else { - // Err(anyhow::anyhow!(e)) - // } - // } - // None => Err(anyhow::anyhow!(e)), - // }, - // }.map(|r| r.into_iter().map(|(score, id, metadata)| (1. 
- score, id, metadata)).collect()) - } - - // This is mostly so our SDKs in other languages have some way to debug - pub fn to_full_string(&self) -> String { - self.to_string() - } -} - -impl std::fmt::Display for QueryBuilder { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let query = self.query.clone().with(self.with.clone()); - write!(f, "{}", query.to_string(PostgresQueryBuilder)) + let results = self + .collection + .vector_search( + self.query, + self.pipeline + .as_mut() + .context("cannot fetch all without first calling vector_recall")?, + ) + .await?; + results + .into_iter() + .map(|mut v| { + Ok(( + v["score"].as_f64().context("Error converting core")?, + v["chunk"] + .as_str() + .context("Error converting chunk")? + .to_string(), + std::mem::take(&mut v["document"]).into(), + )) + }) + .collect() } } diff --git a/pgml-sdks/pgml/src/search_query_builder.rs b/pgml-sdks/pgml/src/search_query_builder.rs index 5f5c207d6..7244dfed5 100644 --- a/pgml-sdks/pgml/src/search_query_builder.rs +++ b/pgml-sdks/pgml/src/search_query_builder.rs @@ -26,14 +26,14 @@ struct ValidSemanticSearchAction { } #[derive(Debug, Deserialize)] -struct ValidMatchAction { +struct ValidFullTextSearchAction { query: String, boost: Option, } #[derive(Debug, Deserialize)] struct ValidQueryActions { - full_text_search: Option>, + full_text_search: Option>, semantic_search: Option>, filter: Option, } @@ -79,10 +79,10 @@ pub async fn build_search_query( s.get(&key) .as_ref() .context(format!("Bad query - {key} does not exist in schema"))? - .embed + .semantic_search .as_ref() .context(format!( - "Bad query - {key} does not have any directive to embed" + "Bad query - {key} does not have any directive to semantic_search" ))? .model .runtime, @@ -102,10 +102,10 @@ pub async fn build_search_query( embedding_cte.expr_as( Func::cust(SIden::Str("pgml.embed")).args([ Expr::cust(format!( - "transformer => (SELECT schema #>> '{{{key},embed,model}}' FROM pipeline)", + "transformer => (SELECT schema #>> '{{{key},semantic_search,model}}' FROM pipeline)", )), Expr::cust_with_values("text => $1", [&vsa.query]), - Expr::cust(format!("kwargs => COALESCE((SELECT schema #> '{{{key},embed,model_parameters}}' FROM pipeline), '{{}}'::jsonb)")), + Expr::cust(format!("kwargs => COALESCE((SELECT schema #> '{{{key},semantic_search,model_parameters}}' FROM pipeline), '{{}}'::jsonb)")), ]), Alias::new("embedding"), ); @@ -131,7 +131,7 @@ pub async fn build_search_query( .unwrap() .get(&key) .unwrap() - .embed + .semantic_search .as_ref() .unwrap() .model; diff --git a/pgml-sdks/pgml/src/splitter.rs b/pgml-sdks/pgml/src/splitter.rs index 85e85e3a8..7a7503fe2 100644 --- a/pgml-sdks/pgml/src/splitter.rs +++ b/pgml-sdks/pgml/src/splitter.rs @@ -12,6 +12,7 @@ use crate::{ #[cfg(feature = "python")] use crate::types::JsonPython; +#[allow(dead_code)] #[derive(Debug, Clone)] pub(crate) struct SplitterDatabaseData { pub id: i64, diff --git a/pgml-sdks/pgml/src/utils.rs b/pgml-sdks/pgml/src/utils.rs index a8c040bc9..05ae14e28 100644 --- a/pgml-sdks/pgml/src/utils.rs +++ b/pgml-sdks/pgml/src/utils.rs @@ -25,13 +25,6 @@ macro_rules! 
query_builder { }}; } -pub fn default_progress_spinner(size: u64) -> ProgressBar { - ProgressBar::new(size).with_style( - ProgressStyle::with_template("[{elapsed_precise}] {spinner:0.cyan/blue} {prefix}: {msg}") - .unwrap(), - ) -} - pub fn default_progress_bar(size: u64) -> ProgressBar { ProgressBar::new(size).with_style( ProgressStyle::with_template("[{elapsed_precise}] {bar:40.cyan/blue} {pos:>7}/{len:7} ") diff --git a/pgml-sdks/pgml/src/vector_search_query_builder.rs b/pgml-sdks/pgml/src/vector_search_query_builder.rs index 4a6feec9b..f28dfbecf 100644 --- a/pgml-sdks/pgml/src/vector_search_query_builder.rs +++ b/pgml-sdks/pgml/src/vector_search_query_builder.rs @@ -18,17 +18,11 @@ use crate::{ types::{IntoTableNameAndSchema, Json, SIden}, }; -#[derive(Debug, Deserialize)] -struct ValidFullTextSearchAction { - configuration: String, - text: String, -} - #[derive(Debug, Deserialize)] struct ValidField { query: String, model_parameters: Option, - full_text_search: Option, + full_text_filter: Option, } #[derive(Debug, Deserialize)] @@ -81,10 +75,10 @@ pub async fn build_vector_search_query( s.get(&key) .as_ref() .context(format!("Bad query - {key} does not exist in schema"))? - .embed + .semantic_search .as_ref() .context(format!( - "Bad query - {key} does not have any directive to embed" + "Bad query - {key} does not have any directive to semantic_search" ))? .model .runtime, @@ -105,10 +99,10 @@ pub async fn build_vector_search_query( embedding_cte.expr_as( Func::cust(SIden::Str("pgml.embed")).args([ Expr::cust(format!( - "transformer => (SELECT schema #>> '{{{key},embed,model}}' FROM pipeline)", + "transformer => (SELECT schema #>> '{{{key},semantic_search,model}}' FROM pipeline)", )), Expr::cust_with_values("text => $1", [vf.query]), - Expr::cust(format!("kwargs => COALESCE((SELECT schema #> '{{{key},embed,model_parameters}}' FROM pipeline), '{{}}'::jsonb)")), + Expr::cust(format!("kwargs => COALESCE((SELECT schema #> '{{{key},semantic_search,model_parameters}}' FROM pipeline), '{{}}'::jsonb)")), ]), Alias::new("embedding"), ); @@ -132,7 +126,7 @@ pub async fn build_vector_search_query( .unwrap() .get(&key) .unwrap() - .embed + .semantic_search .as_ref() .unwrap() .model; @@ -191,7 +185,7 @@ pub async fn build_vector_search_query( query.cond_where(filter); } - if let Some(full_text_search) = &vf.full_text_search { + if let Some(full_text_search) = &vf.full_text_filter { let full_text_table = format!("{}_{}.{}_tsvectors", collection.name, pipeline.name, key); query From 44cc8a0c063a253098aaa8a4eac2fb035b10b0f9 Mon Sep 17 00:00:00 2001 From: Silas Marvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 24 Jan 2024 08:57:59 -0800 Subject: [PATCH 14/72] Verifying on Python and JavaScript --- pgml-sdks/pgml/Cargo.lock | 2 +- pgml-sdks/pgml/Cargo.toml | 2 +- pgml-sdks/pgml/pyproject.toml | 2 +- pgml-sdks/pgml/python/tests/test.py | 5 +++++ pgml-sdks/pgml/src/languages/python.rs | 2 -- pgml-sdks/pgml/src/lib.rs | 7 +++++-- pgml-sdks/pgml/src/multi_field_pipeline.rs | 2 +- pgml-sdks/pgml/src/pipeline.rs | 4 ++-- pgml-sdks/rust-bridge/rust-bridge-macros/src/python.rs | 8 +++++--- 9 files changed, 21 insertions(+), 13 deletions(-) diff --git a/pgml-sdks/pgml/Cargo.lock b/pgml-sdks/pgml/Cargo.lock index a78a3f0a3..c208c4233 100644 --- a/pgml-sdks/pgml/Cargo.lock +++ b/pgml-sdks/pgml/Cargo.lock @@ -1439,7 +1439,7 @@ checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" [[package]] name = "pgml" -version = "0.10.1" +version = "0.11.0" dependencies = [ "anyhow", 
"async-trait", diff --git a/pgml-sdks/pgml/Cargo.toml b/pgml-sdks/pgml/Cargo.toml index cc126e8cf..55d9d3cf0 100644 --- a/pgml-sdks/pgml/Cargo.toml +++ b/pgml-sdks/pgml/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pgml" -version = "0.10.1" +version = "0.11.0" edition = "2021" authors = ["PosgresML "] homepage = "https://postgresml.org/" diff --git a/pgml-sdks/pgml/pyproject.toml b/pgml-sdks/pgml/pyproject.toml index c7b5b4c08..89d25773c 100644 --- a/pgml-sdks/pgml/pyproject.toml +++ b/pgml-sdks/pgml/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "maturin" [project] name = "pgml" requires-python = ">=3.7" -version = "0.10.1" +version = "0.11.0" description = "Python SDK is designed to facilitate the development of scalable vector search applications on PostgreSQL databases." authors = [ {name = "PostgresML", email = "team@postgresml.org"}, diff --git a/pgml-sdks/pgml/python/tests/test.py b/pgml-sdks/pgml/python/tests/test.py index 748367867..b818586c5 100644 --- a/pgml-sdks/pgml/python/tests/test.py +++ b/pgml-sdks/pgml/python/tests/test.py @@ -66,6 +66,11 @@ def test_can_create_pipeline(): assert pipeline is not None +def test_can_create_multi_field_pipeline(): + pipeline = pgml.MultiFieldPipeline("test_p_p_tccmfp_0", {}) + assert pipeline is not None + + def test_can_create_builtins(): builtins = pgml.Builtins() assert builtins is not None diff --git a/pgml-sdks/pgml/src/languages/python.rs b/pgml-sdks/pgml/src/languages/python.rs index dba8c5179..300091500 100644 --- a/pgml-sdks/pgml/src/languages/python.rs +++ b/pgml-sdks/pgml/src/languages/python.rs @@ -4,8 +4,6 @@ use pyo3::types::{PyDict, PyFloat, PyInt, PyList, PyString}; use pyo3::{prelude::*, types::PyBool}; use std::sync::Arc; -use rust_bridge::python::CustomInto; - use crate::types::{GeneralJsonAsyncIterator, GeneralJsonIterator, Json}; //////////////////////////////////////////////////////////////////////////////// diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index ab2f12315..48c821fe7 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -47,7 +47,9 @@ pub use splitter::Splitter; pub use transformer_pipeline::TransformerPipeline; // This is use when inserting collections to set the sdk_version used during creation -static SDK_VERSION: &str = "0.9.2"; +// This doesn't actually mean the verion of the SDK it was created on, it means the +// version it is compatible with +static SDK_VERSION: &str = "0.11.0"; // Store the database(s) in a global variable so that we can access them from anywhere // This is not necessarily idiomatic Rust, but it is a good way to acomplish what we need @@ -161,7 +163,8 @@ fn pgml(_py: pyo3::Python, m: &pyo3::types::PyModule) -> pyo3::PyResult<()> { m.add_function(pyo3::wrap_pyfunction!(init_logger, m)?)?; m.add_function(pyo3::wrap_pyfunction!(migrate, m)?)?; m.add_function(pyo3::wrap_pyfunction!(cli::cli, m)?)?; - m.add_class::()?; + // m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/pgml-sdks/pgml/src/multi_field_pipeline.rs b/pgml-sdks/pgml/src/multi_field_pipeline.rs index 00630da0e..b4cce4d8b 100644 --- a/pgml-sdks/pgml/src/multi_field_pipeline.rs +++ b/pgml-sdks/pgml/src/multi_field_pipeline.rs @@ -20,7 +20,7 @@ use crate::{ }; #[cfg(feature = "python")] -use crate::{model::ModelPython, splitter::SplitterPython, types::JsonPython}; +use crate::types::JsonPython; type ParsedSchema = HashMap; diff --git a/pgml-sdks/pgml/src/pipeline.rs b/pgml-sdks/pgml/src/pipeline.rs index b9a67b805..2e2db2d2c 100644 
--- a/pgml-sdks/pgml/src/pipeline.rs +++ b/pgml-sdks/pgml/src/pipeline.rs @@ -13,7 +13,7 @@ use crate::{ /// A pipeline that processes documents /// This has been deprecated in favor of [MultiFieldPipeline] -#[derive(alias, Debug, Clone)] +// #[derive(alias, Debug, Clone)] pub struct Pipeline { pub name: String, pub model: Option, @@ -21,7 +21,7 @@ pub struct Pipeline { pub parameters: Option, } -#[alias_methods(new)] +// #[alias_methods(new)] impl Pipeline { /// Creates a new [Pipeline] /// diff --git a/pgml-sdks/rust-bridge/rust-bridge-macros/src/python.rs b/pgml-sdks/rust-bridge/rust-bridge-macros/src/python.rs index cf4f04316..a453bf14f 100644 --- a/pgml-sdks/rust-bridge/rust-bridge-macros/src/python.rs +++ b/pgml-sdks/rust-bridge/rust-bridge-macros/src/python.rs @@ -221,8 +221,9 @@ pub fn generate_python_methods( let st = r.to_string(); Some(if st.contains('&') { let st = st.replace("self", &wrapped_type_ident.to_string()); - let s = syn::parse_str::(&st).unwrap_or_else(|_| panic!("Error converting self type to necessary syn type: {:?}", - r)); + let s = syn::parse_str::(&st).unwrap_or_else(|_| { + panic!("Error converting self type to necessary syn type: {:?}", r) + }); s.to_token_stream() } else { quote! { #wrapped_type_ident } @@ -265,6 +266,7 @@ pub fn generate_python_methods( }; // The new function for pyO3 requires some unique syntax + // The way we use the #convert_from assumes that new has a return type let (signature, middle) = if method_ident == "new" { let signature = quote! { #[new] @@ -296,7 +298,7 @@ pub fn generate_python_methods( use rust_bridge::python::CustomInto; #prepared_wrapper_arguments #middle - let x: Self = x.custom_into(); + let x: #convert_from = x.custom_into(); Ok(x) }; (signature, middle) From 6a9fd14fafe9bc8d44cbeeaee884ddfce831504b Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 24 Jan 2024 16:05:55 -0800 Subject: [PATCH 15/72] Working with JavaScript and Python --- pgml-sdks/pgml/Cargo.lock | 61 +++- pgml-sdks/pgml/build.rs | 1 + pgml-sdks/pgml/javascript/package-lock.json | 4 +- .../javascript/tests/typescript-tests/test.ts | 344 ++++++++---------- pgml-sdks/pgml/python/tests/test.py | 261 +++++-------- pgml-sdks/pgml/src/collection.rs | 1 + pgml-sdks/pgml/src/lib.rs | 6 +- pgml-sdks/pgml/src/multi_field_pipeline.rs | 11 +- pgml-sdks/pgml/src/remote_embeddings.rs | 19 +- pgml-sdks/pgml/src/search_query_builder.rs | 2 + pgml-sdks/pgml/src/utils.rs | 41 +++ .../pgml/src/vector_search_query_builder.rs | 2 + 12 files changed, 366 insertions(+), 387 deletions(-) diff --git a/pgml-sdks/pgml/Cargo.lock b/pgml-sdks/pgml/Cargo.lock index c208c4233..46311b399 100644 --- a/pgml-sdks/pgml/Cargo.lock +++ b/pgml-sdks/pgml/Cargo.lock @@ -21,13 +21,14 @@ dependencies = [ [[package]] name = "ahash" -version = "0.8.3" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01" dependencies = [ "cfg-if", "once_cell", "version_check", + "zerocopy", ] [[package]] @@ -122,7 +123,7 @@ checksum = "a564d521dd56509c4c47480d00b80ee55f7e385ae48db5744c67ad50c92d2ebf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -249,7 +250,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -667,7 +668,7 @@ checksum = 
"89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -748,11 +749,11 @@ checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" [[package]] name = "hashbrown" -version = "0.14.0" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" +checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" dependencies = [ - "ahash 0.8.3", + "ahash 0.8.7", "allocator-api2", ] @@ -762,7 +763,7 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "312f66718a2d7789ffef4f4b7b213138ed9f1eb3aa1d0d82fc99f88fb3ffd26f" dependencies = [ - "hashbrown 0.14.0", + "hashbrown 0.14.3", ] [[package]] @@ -960,7 +961,7 @@ checksum = "ce243b1bfa62ffc028f1cc3b6034ec63d649f3031bc8a4fbbb004e1ac17d1f68" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -1340,7 +1341,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -1770,7 +1771,7 @@ dependencies = [ "anyhow", "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -1971,7 +1972,7 @@ checksum = "be02f6cb0cd3a5ec20bbcfbcbd749f57daddb1a0882dc2e46a6c236c90b977ed" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -2231,9 +2232,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.28" +version = "2.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567" +checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" dependencies = [ "proc-macro2", "quote", @@ -2288,7 +2289,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -2379,7 +2380,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -2454,7 +2455,7 @@ checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", ] [[package]] @@ -2665,7 +2666,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", "wasm-bindgen-shared", ] @@ -2699,7 +2700,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.28", + "syn 2.0.32", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -2935,3 +2936,23 @@ checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d" dependencies = [ "winapi", ] + +[[package]] +name = "zerocopy" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.32", +] diff --git a/pgml-sdks/pgml/build.rs b/pgml-sdks/pgml/build.rs index 
f017a04db..dd596b208 100644 --- a/pgml-sdks/pgml/build.rs +++ b/pgml-sdks/pgml/build.rs @@ -26,6 +26,7 @@ export function newModel(name?: string, source?: string, parameters?: Json): Mod export function newSplitter(name?: string, parameters?: Json): Splitter; export function newBuiltins(database_url?: string): Builtins; export function newPipeline(name: string, model?: Model, splitter?: Splitter, parameters?: Json): Pipeline; +export function newMultiFieldPipeline(name: string, schema?: Json): MultiFieldPipeline; export function newTransformerPipeline(task: string, model?: string, args?: Json, database_url?: string): TransformerPipeline; export function newOpenSourceAI(database_url?: string): OpenSourceAI; "#; diff --git a/pgml-sdks/pgml/javascript/package-lock.json b/pgml-sdks/pgml/javascript/package-lock.json index 9ab5f611e..d2c5df253 100644 --- a/pgml-sdks/pgml/javascript/package-lock.json +++ b/pgml-sdks/pgml/javascript/package-lock.json @@ -1,12 +1,12 @@ { "name": "pgml", - "version": "0.9.6", + "version": "0.10.1", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "pgml", - "version": "0.9.6", + "version": "0.10.1", "license": "MIT", "devDependencies": { "@types/node": "^20.3.1", diff --git a/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts b/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts index ad0c9cd78..c3cbafd76 100644 --- a/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts +++ b/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts @@ -17,6 +17,8 @@ const generate_dummy_documents = (count: number) => { for (let i = 0; i < count; i++) { docs.push({ id: i, + title: `Test Document ${i}`, + body: `Test body ${i}`, text: `This is a test document: ${i}`, project: "a10", uuid: i * 10, @@ -56,151 +58,133 @@ it("can create pipeline", () => { expect(pipeline).toBeTruthy(); }); +it("can create multi_field_pipeline", () => { + let pipeline = pgml.newMultiFieldPipeline("test_j_p_ccmfp", {}); + expect(pipeline).toBeTruthy(); +}); + it("can create builtins", () => { let builtins = pgml.newBuiltins(); expect(builtins).toBeTruthy(); }); /////////////////////////////////////////////////// -// Test various vector searches /////////////////// +// Test various searches /////////////////// /////////////////////////////////////////////////// -it("can vector search with local embeddings", async () => { - let model = pgml.newModel(); - let splitter = pgml.newSplitter(); - let pipeline = pgml.newPipeline("test_j_p_cvswle_0", model, splitter); - let collection = pgml.newCollection("test_j_c_cvswle_3"); - await collection.upsert_documents(generate_dummy_documents(3)); - await collection.add_pipeline(pipeline); - let results = await collection.vector_search("Here is some query", pipeline); - expect(results).toHaveLength(3); - await collection.archive(); -}); - -it("can vector search with remote embeddings", async () => { - let model = pgml.newModel("text-embedding-ada-002", "openai"); - let splitter = pgml.newSplitter(); - let pipeline = pgml.newPipeline("test_j_p_cvswre_0", model, splitter); - let collection = pgml.newCollection("test_j_c_cvswre_1"); - await collection.upsert_documents(generate_dummy_documents(3)); - await collection.add_pipeline(pipeline); - let results = await collection.vector_search("Here is some query", pipeline); - expect(results).toHaveLength(3); +it("can search", async () => { + let pipeline = pgml.newMultiFieldPipeline("test_j_p_cs", { + title: { semantic_search: { model: "intfloat/e5-small" } }, + body: { + splitter: { model: 
"recursive_character" }, + semantic_search: { + model: "text-embedding-ada-002", + source: "openai", + }, + full_text_search: { configuration: "english" }, + }, + }); + let collection = pgml.newCollection("test_j_c_tsc_12") + await collection.add_pipeline(pipeline) + await collection.upsert_documents(generate_dummy_documents(5)) + let results = await collection.search( + { + query: { + full_text_search: { body: { query: "Test", boost: 1.2 } }, + semantic_search: { + title: { query: "This is a test", boost: 2.0 }, + body: { query: "This is the body test", boost: 1.01 }, + }, + filter: { id: { $gt: 1 } }, + }, + limit: 10 + }, + pipeline, + ); + let ids = results.map(r => r["id"]); + expect(ids).toEqual([5, 4, 3]); await collection.archive(); }); -it("can vector search with query builder", async () => { - let model = pgml.newModel(); - let splitter = pgml.newSplitter(); - let pipeline = pgml.newPipeline("test_j_p_cvswqb_0", model, splitter); - let collection = pgml.newCollection("test_j_c_cvswqb_1"); - await collection.upsert_documents(generate_dummy_documents(3)); - await collection.add_pipeline(pipeline); - let results = await collection - .query() - .vector_recall("Here is some query", pipeline) - .limit(10) - .fetch_all(); - expect(results).toHaveLength(3); - await collection.archive(); -}); +/////////////////////////////////////////////////// +// Test various vector searches /////////////////// +/////////////////////////////////////////////////// -it("can vector search with query builder with remote embeddings", async () => { - let model = pgml.newModel("text-embedding-ada-002", "openai"); - let splitter = pgml.newSplitter(); - let pipeline = pgml.newPipeline("test_j_p_cvswqbwre_0", model, splitter); - let collection = pgml.newCollection("test_j_c_cvswqbwre_1"); - await collection.upsert_documents(generate_dummy_documents(3)); - await collection.add_pipeline(pipeline); - let results = await collection - .query() - .vector_recall("Here is some query", pipeline) - .limit(10) - .fetch_all(); - expect(results).toHaveLength(3); - await collection.archive(); -}); -it("can vector search with query builder and metadata filtering", async () => { - let model = pgml.newModel(); - let splitter = pgml.newSplitter(); - let pipeline = pgml.newPipeline("test_j_p_cvswqbamf_0", model, splitter); - let collection = pgml.newCollection("test_j_c_cvswqbamf_4"); - await collection.upsert_documents(generate_dummy_documents(3)); - await collection.add_pipeline(pipeline); - let results = await collection - .query() - .vector_recall("Here is some query", pipeline) - .filter({ - metadata: { - $or: [{ uuid: { $eq: 0 } }, { floating_uuid: { $lt: 2 } }], - project: { $eq: "a10" }, +it("can vector search", async () => { + let pipeline = pgml.newMultiFieldPipeline("test_j_p_cvs_0", { + title: { + semantic_search: { model: "intfloat/e5-small" }, + full_text_search: { configuration: "english" }, + }, + body: { + splitter: { model: "recursive_character" }, + semantic_search: { + model: "text-embedding-ada-002", + source: "openai", }, - }) - .limit(10) - .fetch_all(); - expect(results).toHaveLength(2); - await collection.archive(); -}); - -it("can vector search with query builder and custom hnsfw ef_search value", async () => { - let model = pgml.newModel(); - let splitter = pgml.newSplitter(); - let pipeline = pgml.newPipeline("test_j_p_cvswqbachesv_0", model, splitter); - let collection = pgml.newCollection("test_j_c_cvswqbachesv_0"); - await collection.upsert_documents(generate_dummy_documents(3)); - await 
collection.add_pipeline(pipeline); - let results = await collection - .query() - .vector_recall("Here is some query", pipeline) - .filter({ - hnsw: { - ef_search: 2, + }, + }); + let collection = pgml.newCollection("test_j_c_cvs_4") + await collection.add_pipeline(pipeline) + await collection.upsert_documents(generate_dummy_documents(5)) + let results = await collection.vector_search( + { + query: { + fields: { + title: { query: "Test document: 2", full_text_filter: "test" }, + body: { query: "Test document: 2" }, + }, + filter: { id: { "$gt": 2 } }, }, - }) - .limit(10) - .fetch_all(); - expect(results).toHaveLength(3); - await collection.archive(); -}); - -it("can vector search with query builder and custom hnsfw ef_search value and remote embeddings", async () => { - let model = pgml.newModel("text-embedding-ada-002", "openai"); - let splitter = pgml.newSplitter(); - let pipeline = pgml.newPipeline( - "test_j_p_cvswqbachesvare_0", - model, - splitter, + limit: 5, + }, + pipeline, ); - let collection = pgml.newCollection("test_j_c_cvswqbachesvare_0"); - await collection.upsert_documents(generate_dummy_documents(3)); - await collection.add_pipeline(pipeline); - let results = await collection - .query() - .vector_recall("Here is some query", pipeline) - .filter({ - hnsw: { - ef_search: 2, - }, - }) - .limit(10) - .fetch_all(); - expect(results).toHaveLength(3); + let ids = results.map(r => r["document"]["id"]); + expect(ids).toEqual([3, 4, 4, 3]); await collection.archive(); }); +// it("can vector search with query builder", async () => { +// let model = pgml.newModel(); +// let splitter = pgml.newSplitter(); +// let pipeline = pgml.newPipeline("test_j_p_cvswqb_0", model, splitter); +// let collection = pgml.newCollection("test_j_c_cvswqb_1"); +// await collection.upsert_documents(generate_dummy_documents(3)); +// await collection.add_pipeline(pipeline); +// let results = await collection +// .query() +// .vector_recall("Here is some query", pipeline) +// .limit(10) +// .fetch_all(); +// expect(results).toHaveLength(3); +// await collection.archive(); +// }); + /////////////////////////////////////////////////// // Test user output facing functions ////////////// /////////////////////////////////////////////////// it("pipeline to dict", async () => { - let model = pgml.newModel("text-embedding-ada-002", "openai"); - let splitter = pgml.newSplitter(); - let pipeline = pgml.newPipeline("test_j_p_ptd_0", model, splitter); + const pipeline_schema = { + "title": { + "semantic_search": { "model": "intfloat/e5-small" }, + "full_text_search": { "configuration": "english" }, + }, + "body": { + "splitter": { "model": "recursive_character" }, + "semantic_search": { + "model": "text-embedding-ada-002", + "source": "openai", + }, + }, + } + let pipeline = pgml.newMultiFieldPipeline("test_j_p_ptd_0", pipeline_schema); let collection = pgml.newCollection("test_j_c_ptd_2"); await collection.add_pipeline(pipeline); let pipeline_dict = await pipeline.to_dict(); - expect(pipeline_dict["name"]).toBe("test_j_p_ptd_0"); + expect(pipeline_dict).toEqual(pipeline_schema); await collection.archive(); }); @@ -209,60 +193,38 @@ it("pipeline to dict", async () => { /////////////////////////////////////////////////// it("can upsert and get documents", async () => { - let model = pgml.newModel(); - let splitter = pgml.newSplitter(); - let pipeline = pgml.newPipeline("test_p_p_cuagd_0", model, splitter, { - full_text_search: { active: true, configuration: "english" }, - }); let collection = 
pgml.newCollection("test_p_c_cuagd_1"); - await collection.add_pipeline(pipeline); await collection.upsert_documents(generate_dummy_documents(10)); - let documents = await collection.get_documents(); expect(documents).toHaveLength(10); - documents = await collection.get_documents({ offset: 1, limit: 2, - filter: { metadata: { id: { $gt: 0 } } }, + filter: { id: { $gt: 0 } }, }); expect(documents).toHaveLength(2); expect(documents[0]["document"]["id"]).toBe(2); let last_row_id = documents[1]["row_id"]; - documents = await collection.get_documents({ filter: { - metadata: { id: { $gt: 3 } }, - full_text_search: { configuration: "english", text: "4" }, + id: { $lt: 7 }, }, last_row_id: last_row_id, }); - expect(documents).toHaveLength(1); + expect(documents).toHaveLength(3); expect(documents[0]["document"]["id"]).toBe(4); - await collection.archive(); }); it("can delete documents", async () => { - let model = pgml.newModel(); - let splitter = pgml.newSplitter(); - let pipeline = pgml.newPipeline( - "test_p_p_cdd_0", - model, - splitter, - - { full_text_search: { active: true, configuration: "english" } }, - ); let collection = pgml.newCollection("test_p_c_cdd_2"); - await collection.add_pipeline(pipeline); await collection.upsert_documents(generate_dummy_documents(3)); await collection.delete_documents({ - metadata: { id: { $gte: 0 } }, - full_text_search: { configuration: "english", text: "0" }, + id: { $gte: 2 }, }); let documents = await collection.get_documents(); expect(documents).toHaveLength(2); - expect(documents[0]["document"]["id"]).toBe(1); + expect(documents[0]["document"]["id"]).toBe(0); await collection.archive(); }); @@ -286,13 +248,13 @@ it("can order documents", async () => { it("can transformer pipeline", async () => { const t = pgml.newTransformerPipeline("text-generation"); - const it = await t.transform(["AI is going to"], {max_new_tokens: 5}); + const it = await t.transform(["AI is going to"], { max_new_tokens: 5 }); expect(it.length).toBeGreaterThan(0) }); it("can transformer pipeline stream", async () => { const t = pgml.newTransformerPipeline("text-generation"); - const it = await t.transform_stream("AI is going to", {max_new_tokens: 5}); + const it = await t.transform_stream("AI is going to", { max_new_tokens: 5 }); let result = await it.next(); let output = []; while (!result.done) { @@ -309,17 +271,17 @@ it("can transformer pipeline stream", async () => { it("can open source ai create", () => { const client = pgml.newOpenSourceAI(); const results = client.chat_completions_create( - "HuggingFaceH4/zephyr-7b-beta", - [ - { - role: "system", - content: "You are a friendly chatbot who always responds in the style of a pirate", - }, - { - role: "user", - content: "How many helicopters can a human eat in one sitting?", - }, - ], + "HuggingFaceH4/zephyr-7b-beta", + [ + { + role: "system", + content: "You are a friendly chatbot who always responds in the style of a pirate", + }, + { + role: "user", + content: "How many helicopters can a human eat in one sitting?", + }, + ], ); expect(results.choices.length).toBeGreaterThan(0); }); @@ -328,17 +290,17 @@ it("can open source ai create", () => { it("can open source ai create async", async () => { const client = pgml.newOpenSourceAI(); const results = await client.chat_completions_create_async( - "HuggingFaceH4/zephyr-7b-beta", - [ - { - role: "system", - content: "You are a friendly chatbot who always responds in the style of a pirate", - }, - { - role: "user", - content: "How many helicopters can a human eat in one sitting?", 
- }, - ], + "HuggingFaceH4/zephyr-7b-beta", + [ + { + role: "system", + content: "You are a friendly chatbot who always responds in the style of a pirate", + }, + { + role: "user", + content: "How many helicopters can a human eat in one sitting?", + }, + ], ); expect(results.choices.length).toBeGreaterThan(0); }); @@ -347,17 +309,17 @@ it("can open source ai create async", async () => { it("can open source ai create stream", () => { const client = pgml.newOpenSourceAI(); const it = client.chat_completions_create_stream( - "HuggingFaceH4/zephyr-7b-beta", - [ - { - role: "system", - content: "You are a friendly chatbot who always responds in the style of a pirate", - }, - { - role: "user", - content: "How many helicopters can a human eat in one sitting?", - }, - ], + "HuggingFaceH4/zephyr-7b-beta", + [ + { + role: "system", + content: "You are a friendly chatbot who always responds in the style of a pirate", + }, + { + role: "user", + content: "How many helicopters can a human eat in one sitting?", + }, + ], ); let result = it.next(); while (!result.done) { @@ -369,17 +331,17 @@ it("can open source ai create stream", () => { it("can open source ai create stream async", async () => { const client = pgml.newOpenSourceAI(); const it = await client.chat_completions_create_stream_async( - "HuggingFaceH4/zephyr-7b-beta", - [ - { - role: "system", - content: "You are a friendly chatbot who always responds in the style of a pirate", - }, - { - role: "user", - content: "How many helicopters can a human eat in one sitting?", - }, - ], + "HuggingFaceH4/zephyr-7b-beta", + [ + { + role: "system", + content: "You are a friendly chatbot who always responds in the style of a pirate", + }, + { + role: "user", + content: "How many helicopters can a human eat in one sitting?", + }, + ], ); let result = await it.next(); while (!result.done) { diff --git a/pgml-sdks/pgml/python/tests/test.py b/pgml-sdks/pgml/python/tests/test.py index b818586c5..beda20a55 100644 --- a/pgml-sdks/pgml/python/tests/test.py +++ b/pgml-sdks/pgml/python/tests/test.py @@ -28,6 +28,8 @@ def generate_dummy_documents(count: int) -> List[Dict[str, Any]]: dummy_documents.append( { "id": i, + "title": "Test Document {}".format(i), + "body": "Test body {}".format(i), "text": "This is a test document: {}".format(i), "project": "a10", "floating_uuid": i * 1.01, @@ -77,132 +79,110 @@ def test_can_create_builtins(): ################################################### -## Test various vector searches ################### +## Test searches ################################## ################################################### @pytest.mark.asyncio -async def test_can_vector_search_with_local_embeddings(): - model = pgml.Model() - splitter = pgml.Splitter() - pipeline = pgml.Pipeline("test_p_p_tcvs_0", model, splitter) - collection = pgml.Collection(name="test_p_c_tcvs_4") - await collection.upsert_documents(generate_dummy_documents(3)) - await collection.add_pipeline(pipeline) - results = await collection.vector_search("Here is some query", pipeline) - assert len(results) == 3 - await collection.archive() - - -@pytest.mark.asyncio -async def test_can_vector_search_with_remote_embeddings(): - model = pgml.Model(name="text-embedding-ada-002", source="openai") - splitter = pgml.Splitter() - pipeline = pgml.Pipeline("test_p_p_tcvswre_0", model, splitter) - collection = pgml.Collection(name="test_p_c_tcvswre_3") - await collection.upsert_documents(generate_dummy_documents(3)) - await collection.add_pipeline(pipeline) - results = await 
collection.vector_search("Here is some query", pipeline) - assert len(results) == 3 - await collection.archive() - - -@pytest.mark.asyncio -async def test_can_vector_search_with_query_builder(): - model = pgml.Model() - splitter = pgml.Splitter() - pipeline = pgml.Pipeline("test_p_p_tcvswqb_1", model, splitter) - collection = pgml.Collection(name="test_p_c_tcvswqb_5") - await collection.upsert_documents(generate_dummy_documents(3)) +async def test_can_search(): + pipeline = pgml.MultiFieldPipeline( + "test_p_p_tcs_0", + { + "title": {"semantic_search": {"model": "intfloat/e5-small"}}, + "body": { + "splitter": {"model": "recursive_character"}, + "semantic_search": { + "model": "text-embedding-ada-002", + "source": "openai", + }, + "full_text_search": {"configuration": "english"}, + }, + }, + ) + collection = pgml.Collection("test_p_c_tsc_13") await collection.add_pipeline(pipeline) - results = ( - await collection.query() - .vector_recall("Here is some query", pipeline) - .limit(10) - .fetch_all() + await collection.upsert_documents(generate_dummy_documents(5)) + results = await collection.search( + { + "query": { + "full_text_search": {"body": {"query": "Test", "boost": 1.2}}, + "semantic_search": { + "title": {"query": "This is a test", "boost": 2.0}, + "body": {"query": "This is the body test", "boost": 1.01}, + }, + "filter": {"id": {"$gt": 1}}, + }, + "limit": 5, + }, + pipeline, ) - assert len(results) == 3 + ids = [result["id"] for result in results] + assert ids == [5, 4, 3] await collection.archive() -@pytest.mark.asyncio -async def test_can_vector_search_with_query_builder_with_remote_embeddings(): - model = pgml.Model(name="text-embedding-ada-002", source="openai") - splitter = pgml.Splitter() - pipeline = pgml.Pipeline("test_p_p_tcvswqbwre_1", model, splitter) - collection = pgml.Collection(name="test_p_c_tcvswqbwre_1") - await collection.upsert_documents(generate_dummy_documents(3)) - await collection.add_pipeline(pipeline) - results = ( - await collection.query() - .vector_recall("Here is some query", pipeline) - .limit(10) - .fetch_all() - ) - assert len(results) == 3 - await collection.archive() +################################################### +## Test various vector searches ################### +################################################### @pytest.mark.asyncio -async def test_can_vector_search_with_query_builder_and_metadata_filtering(): - model = pgml.Model() - splitter = pgml.Splitter() - pipeline = pgml.Pipeline("test_p_p_tcvswqbamf_1", model, splitter) - collection = pgml.Collection(name="test_p_c_tcvswqbamf_2") - await collection.upsert_documents(generate_dummy_documents(3)) - await collection.add_pipeline(pipeline) - results = ( - await collection.query() - .vector_recall("Here is some query", pipeline) - .filter( - { - "metadata": { - "$or": [{"uuid": {"$eq": 0}}, {"floating_uuid": {"$lt": 2}}], - "project": {"$eq": "a10"}, +async def test_can_vector_search(): + pipeline = pgml.MultiFieldPipeline( + "test_p_p_tcvs_0", + { + "title": { + "semantic_search": {"model": "intfloat/e5-small"}, + "full_text_search": {"configuration": "english"}, + }, + "body": { + "splitter": {"model": "recursive_character"}, + "semantic_search": { + "model": "text-embedding-ada-002", + "source": "openai", }, - } - ) - .limit(10) - .fetch_all() + }, + }, ) - assert len(results) == 2 - await collection.archive() - - -@pytest.mark.asyncio -async def test_can_vector_search_with_query_builder_and_custom_hnsw_ef_search_value(): - model = pgml.Model() - splitter = pgml.Splitter() - 
pipeline = pgml.Pipeline("test_p_p_tcvswqbachesv_0", model, splitter) - collection = pgml.Collection(name="test_p_c_tcvswqbachesv_0") - await collection.upsert_documents(generate_dummy_documents(3)) + collection = pgml.Collection("test_p_c_tcvs_2") await collection.add_pipeline(pipeline) - results = ( - await collection.query() - .vector_recall("Here is some query", pipeline) - .filter({"hnsw": {"ef_search": 2}}) - .limit(10) - .fetch_all() + await collection.upsert_documents(generate_dummy_documents(5)) + results = await collection.vector_search( + { + "query": { + "fields": { + "title": {"query": "Test document: 2", "full_text_filter": "test"}, + "body": {"query": "Test document: 2"}, + }, + "filter": {"id": {"$gt": 2}}, + }, + "limit": 5, + }, + pipeline, ) - assert len(results) == 3 + ids = [result["document"]["id"] for result in results] + assert ids == [3, 4, 4, 3] await collection.archive() @pytest.mark.asyncio -async def test_can_vector_search_with_query_builder_and_custom_hnsw_ef_search_value_and_remote_embeddings(): - model = pgml.Model(name="text-embedding-ada-002", source="openai") +async def test_can_vector_search_with_query_builder(): + model = pgml.Model() splitter = pgml.Splitter() - pipeline = pgml.Pipeline("test_p_p_tcvswqbachesvare_0", model, splitter) - collection = pgml.Collection(name="test_p_c_tcvswqbachesvare_0") + pipeline = pgml.Pipeline("test_p_p_tcvswqb_1", model, splitter) + collection = pgml.Collection(name="test_p_c_tcvswqb_5") await collection.upsert_documents(generate_dummy_documents(3)) await collection.add_pipeline(pipeline) results = ( await collection.query() .vector_recall("Here is some query", pipeline) - .filter({"hnsw": {"ef_search": 2}}) .limit(10) .fetch_all() ) + for result in results: + print() + print(result) + print() assert len(results) == 3 await collection.archive() @@ -214,14 +194,24 @@ async def test_can_vector_search_with_query_builder_and_custom_hnsw_ef_search_va @pytest.mark.asyncio async def test_pipeline_to_dict(): - model = pgml.Model(name="text-embedding-ada-002", source="openai") - splitter = pgml.Splitter() - pipeline = pgml.Pipeline("test_p_p_tptd_1", model, splitter) - collection = pgml.Collection(name="test_p_c_tptd_1") + pipeline_schema = { + "title": { + "semantic_search": {"model": "intfloat/e5-small"}, + "full_text_search": {"configuration": "english"}, + }, + "body": { + "splitter": {"model": "recursive_character"}, + "semantic_search": { + "model": "text-embedding-ada-002", + "source": "openai", + }, + }, + } + pipeline = pgml.MultiFieldPipeline("test_p_p_tptd_0", pipeline_schema) + collection = pgml.Collection("test_p_c_tptd_3") await collection.add_pipeline(pipeline) pipeline_dict = await pipeline.to_dict() - assert pipeline_dict["name"] == "test_p_p_tptd_1" - await collection.remove_pipeline(pipeline) + assert pipeline_schema == pipeline_dict await collection.archive() @@ -232,64 +222,38 @@ async def test_pipeline_to_dict(): @pytest.mark.asyncio async def test_upsert_and_get_documents(): - model = pgml.Model() - splitter = pgml.Splitter() - pipeline = pgml.Pipeline( - "test_p_p_tuagd_0", - model, - splitter, - {"full_text_search": {"active": True, "configuration": "english"}}, - ) - collection = pgml.Collection(name="test_p_c_tuagd_2") - await collection.add_pipeline( - pipeline, - ) + collection = pgml.Collection("test_p_c_tuagd_2") await collection.upsert_documents(generate_dummy_documents(10)) - documents = await collection.get_documents() assert len(documents) == 10 - documents = await collection.get_documents( - 
{"offset": 1, "limit": 2, "filter": {"metadata": {"id": {"$gt": 0}}}} + {"offset": 1, "limit": 2, "filter": {"id": {"$gt": 0}}} ) assert len(documents) == 2 and documents[0]["document"]["id"] == 2 last_row_id = documents[-1]["row_id"] - documents = await collection.get_documents( { "filter": { - "metadata": {"id": {"$gt": 3}}, - "full_text_search": {"configuration": "english", "text": "4"}, + "id": {"$lt": 7}, }, "last_row_id": last_row_id, } ) - assert len(documents) == 1 and documents[0]["document"]["id"] == 4 - + assert len(documents) == 3 and documents[0]["document"]["id"] == 4 await collection.archive() @pytest.mark.asyncio async def test_delete_documents(): - model = pgml.Model() - splitter = pgml.Splitter() - pipeline = pgml.Pipeline( - "test_p_p_tdd_0", - model, - splitter, - {"full_text_search": {"active": True, "configuration": "english"}}, - ) collection = pgml.Collection("test_p_c_tdd_1") - await collection.add_pipeline(pipeline) await collection.upsert_documents(generate_dummy_documents(3)) await collection.delete_documents( { - "metadata": {"id": {"$gte": 0}}, - "full_text_search": {"configuration": "english", "text": "0"}, + "id": {"$gte": 2}, } ) documents = await collection.get_documents() - assert len(documents) == 2 and documents[0]["document"]["id"] == 1 + assert len(documents) == 2 and documents[0]["document"]["id"] == 0 await collection.archive() @@ -462,30 +426,3 @@ async def test_migrate(): # assert len(x) == 3 # # await collection.archive() - - -################################################### -## Manual tests ################################### -################################################### - - -# async def test_add_pipeline(): -# model = pgml.Model() -# splitter = pgml.Splitter() -# pipeline = pgml.Pipeline("silas_test_p_1", model, splitter) -# collection = pgml.Collection(name="silas_test_c_10") -# await collection.add_pipeline(pipeline) -# -# async def test_upsert_documents(): -# collection = pgml.Collection(name="silas_test_c_9") -# await collection.upsert_documents(generate_dummy_documents(10)) -# -# async def test_vector_search(): -# pipeline = pgml.Pipeline("silas_test_p_1") -# collection = pgml.Collection(name="silas_test_c_9") -# results = await collection.vector_search("Here is some query", pipeline) -# print(results) - -# asyncio.run(test_add_pipeline()) -# asyncio.run(test_upsert_documents()) -# asyncio.run(test_vector_search()) diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index b70c23c56..a842db17e 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -124,6 +124,7 @@ pub struct Collection { remove_pipeline, enable_pipeline, disable_pipeline, + search, vector_search, query, exists, diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 48c821fe7..d502ec82c 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -220,7 +220,11 @@ fn main(mut cx: neon::context::ModuleContext) -> neon::result::NeonResult<()> { "newTransformerPipeline", transformer_pipeline::TransformerPipelineJavascript::new, )?; - cx.export_function("newPipeline", pipeline::PipelineJavascript::new)?; + cx.export_function( + "newMultiFieldPipeline", + multi_field_pipeline::MultiFieldPipelineJavascript::new, + )?; + // cx.export_function("newPipeline", pipeline::PipelineJavascript::new)?; cx.export_function( "newOpenSourceAI", open_source_ai::OpenSourceAIJavascript::new, diff --git a/pgml-sdks/pgml/src/multi_field_pipeline.rs b/pgml-sdks/pgml/src/multi_field_pipeline.rs index 
b4cce4d8b..0a2c74f7d 100644 --- a/pgml-sdks/pgml/src/multi_field_pipeline.rs +++ b/pgml-sdks/pgml/src/multi_field_pipeline.rs @@ -200,7 +200,7 @@ fn json_to_schema(schema: &Json) -> anyhow::Result { }) } -#[alias_methods(new, get_status)] +#[alias_methods(new, get_status, to_dict)] impl MultiFieldPipeline { pub fn new(name: &str, schema: Option) -> anyhow::Result { let parsed_schema = schema.as_ref().map(|s| json_to_schema(s)).transpose()?; @@ -925,6 +925,15 @@ impl MultiFieldPipeline { Ok(()) } + #[instrument(skip(self))] + pub async fn to_dict(&mut self) -> anyhow::Result { + self.verify_in_database(false).await?; + self.schema + .as_ref() + .context("Pipeline must have schema set to call to_dict") + .map(|v| v.to_owned()) + } + async fn get_pool(&self) -> anyhow::Result { let database_url = &self .project_info diff --git a/pgml-sdks/pgml/src/remote_embeddings.rs b/pgml-sdks/pgml/src/remote_embeddings.rs index c4ea98469..36e661f9a 100644 --- a/pgml-sdks/pgml/src/remote_embeddings.rs +++ b/pgml-sdks/pgml/src/remote_embeddings.rs @@ -58,22 +58,18 @@ pub trait RemoteEmbeddings<'a> { mut db_executor: PoolOrArcMutextTransaction, limit: Option, ) -> anyhow::Result> { - let limit = limit.unwrap_or(1000); - // Requires _query_text be declared out here so it lives long enough let mut _query_text = "".to_string(); let query = match chunk_ids { Some(chunk_ids) => { - _query_text = query_builder!( - "SELECT * FROM %s WHERE id = ANY ($1) LIMIT $2", - chunks_table_name, - embeddings_table_name - ); + _query_text = + query_builder!("SELECT * FROM %s WHERE id = ANY ($1)", chunks_table_name); sqlx::query_as(_query_text.as_str()) .bind(chunk_ids) .bind(limit) } None => { + let limit = limit.unwrap_or(1000); _query_text = query_builder!( "SELECT * FROM %s WHERE id NOT IN (SELECT chunk_id FROM %s) LIMIT $1", chunks_table_name, @@ -120,7 +116,7 @@ pub trait RemoteEmbeddings<'a> { &self, embeddings_table_name: &str, chunks_table_name: &str, - chunk_ids: Option<&Vec>, + mut chunk_ids: Option<&Vec>, mut db_executor: PoolOrArcMutextTransaction, ) -> anyhow::Result<()> { loop { @@ -136,7 +132,7 @@ pub trait RemoteEmbeddings<'a> { if chunks.is_empty() { break; } - let (chunk_ids, chunk_texts): (Vec, Vec) = chunks + let (retrieved_chunk_ids, chunk_texts): (Vec, Vec) = chunks .into_iter() .map(|chunk| (chunk.id, chunk.chunk)) .unzip(); @@ -163,7 +159,7 @@ pub trait RemoteEmbeddings<'a> { let mut query = sqlx::query(&query); for i in 0..embeddings.len() { - query = query.bind(chunk_ids[i]).bind(&embeddings[i]); + query = query.bind(retrieved_chunk_ids[i]).bind(&embeddings[i]); } // query.execute(&mut *transaction.lock().await).await?; @@ -173,6 +169,9 @@ pub trait RemoteEmbeddings<'a> { query.execute(&mut *transaction.lock().await).await } }?; + + // Set it to none so if it is not None, we don't just retrived the same chunks over and over + chunk_ids = None; } Ok(()) } diff --git a/pgml-sdks/pgml/src/search_query_builder.rs b/pgml-sdks/pgml/src/search_query_builder.rs index 7244dfed5..dc6981b3c 100644 --- a/pgml-sdks/pgml/src/search_query_builder.rs +++ b/pgml-sdks/pgml/src/search_query_builder.rs @@ -41,6 +41,8 @@ struct ValidQueryActions { #[derive(Debug, Deserialize)] struct ValidQuery { query: ValidQueryActions, + // Need this when coming from JavaScript as everything is an f64 from JS + #[serde(default, deserialize_with = "crate::utils::deserialize_u64")] limit: Option, } diff --git a/pgml-sdks/pgml/src/utils.rs b/pgml-sdks/pgml/src/utils.rs index 05ae14e28..08e9e120c 100644 --- a/pgml-sdks/pgml/src/utils.rs +++ 
b/pgml-sdks/pgml/src/utils.rs @@ -4,6 +4,10 @@ use lopdf::Document; use std::fs; use std::path::Path; +use serde::de::{self, Visitor}; +use serde::Deserializer; +use std::fmt; + /// A more type flexible version of format! #[macro_export] macro_rules! query_builder { @@ -56,3 +60,40 @@ pub fn get_file_contents(path: &Path) -> anyhow::Result { .with_context(|| format!("Error reading file: {}", path.display()))?, }) } + +struct U64Visitor; +impl<'de> Visitor<'de> for U64Visitor { + type Value = u64; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("some number") + } + + fn visit_i32(self, value: i32) -> Result + where + E: de::Error, + { + Ok(value as u64) + } + + fn visit_u64(self, value: u64) -> Result + where + E: de::Error, + { + Ok(value) + } + + fn visit_f64(self, value: f64) -> Result + where + E: de::Error, + { + Ok(value as u64) + } +} + +pub fn deserialize_u64<'de, D>(deserializer: D) -> Result, D::Error> +where + D: Deserializer<'de>, +{ + deserializer.deserialize_u64(U64Visitor).map(Some) +} diff --git a/pgml-sdks/pgml/src/vector_search_query_builder.rs b/pgml-sdks/pgml/src/vector_search_query_builder.rs index f28dfbecf..7b609de7b 100644 --- a/pgml-sdks/pgml/src/vector_search_query_builder.rs +++ b/pgml-sdks/pgml/src/vector_search_query_builder.rs @@ -34,6 +34,8 @@ struct ValidQueryActions { #[derive(Debug, Deserialize)] struct ValidQuery { query: ValidQueryActions, + // Need this when coming from JavaScript as everything is an f64 from JS + #[serde(default, deserialize_with = "crate::utils::deserialize_u64")] limit: Option, } From 099ea60f0cbb5988d8375bf9500530a1aef1231e Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Thu, 25 Jan 2024 10:05:59 -0800 Subject: [PATCH 16/72] Cleaned up --- pgml-sdks/pgml/src/collection.rs | 2 +- pgml-sdks/pgml/src/lib.rs | 29 ++++++++++++++-------- pgml-sdks/pgml/src/multi_field_pipeline.rs | 11 ++++---- pgml-sdks/pgml/src/search_query_builder.rs | 6 ++--- 4 files changed, 29 insertions(+), 19 deletions(-) diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index a842db17e..99c115e2b 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -507,7 +507,7 @@ impl Collection { ) }; let (document_id, previous_document): (i64, Option) = sqlx::query_as(&query) - .bind(&source_uuid) + .bind(source_uuid) .bind(&document) .fetch_one(&mut *transaction) .await?; diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index d502ec82c..87fe40a64 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -75,7 +75,7 @@ async fn get_or_initialize_pool(database_url: &Option) -> anyhow::Result let pool = PgPoolOptions::new() .acquire_timeout(std::time::Duration::from_millis(timeout)) - .connect_lazy(&url)?; + .connect_lazy(url)?; pools.insert(url.to_string(), pool.clone()); Ok(pool) @@ -289,7 +289,7 @@ mod tests { assert!(collection.database_data.is_none()); collection.add_pipeline(&mut pipeline).await?; assert!(collection.database_data.is_some()); - collection.remove_pipeline(&mut pipeline).await?; + collection.remove_pipeline(&pipeline).await?; let pipelines = collection.get_pipelines().await?; assert!(pipelines.is_empty()); collection.archive().await?; @@ -306,7 +306,7 @@ mod tests { collection.add_pipeline(&mut pipeline2).await?; let pipelines = collection.get_pipelines().await?; assert!(pipelines.len() == 2); - collection.remove_pipeline(&mut pipeline1).await?; + 
collection.remove_pipeline(&pipeline1).await?; let pipelines = collection.get_pipelines().await?; assert!(pipelines.len() == 1); assert!(collection.get_pipeline("test_r_p_carps_1").await.is_err()); @@ -317,7 +317,7 @@ mod tests { #[tokio::test] async fn can_add_pipeline_and_upsert_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_capaud_47"; + let collection_name = "test_r_c_capaud_48"; let pipeline_name = "test_r_p_capaud_6"; let mut pipeline = MultiFieldPipeline::new( pipeline_name, @@ -333,7 +333,10 @@ mod tests { "model": "recursive_character" }, "semantic_search": { - "model": "intfloat/e5-small", + "model": "hkunlp/instructor-base", + "parameters": { + "instruction": "Represent the Wikipedia document for retrieval" + } }, "full_text_search": { "configuration": "english" @@ -490,7 +493,7 @@ mod tests { sqlx::query_as(&query_builder!("SELECT * FROM %s", chunks_table)) .fetch_all(&pool) .await?; - assert!(title_chunks.len() == 0); + assert!(title_chunks.is_empty()); collection.enable_pipeline(&mut pipeline).await?; let chunks_table = format!("{}_{}.title_chunks", collection_name, pipeline_name); let title_chunks: Vec = @@ -707,7 +710,7 @@ mod tests { } }) ); - collection.disable_pipeline(&mut pipeline).await?; + collection.disable_pipeline(&pipeline).await?; collection .upsert_documents(documents[2..4].to_owned(), None) .await?; @@ -813,7 +816,7 @@ mod tests { #[tokio::test] async fn can_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cs_67"; + let collection_name = "test_r_c_cs_70"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; @@ -835,7 +838,10 @@ mod tests { "model": "recursive_character" }, "semantic_search": { - "model": "intfloat/e5-small" + "model": "hkunlp/instructor-base", + "parameters": { + "instruction": "Represent the Wikipedia document for retrieval" + } }, "full_text_search": { "configuration": "english" @@ -872,6 +878,9 @@ mod tests { }, "body": { "query": "This is the body test", + "parameters": { + "instruction": "Represent the Wikipedia question for retrieving supporting documents: ", + }, "boost": 1.01 }, "notes": { @@ -896,7 +905,7 @@ mod tests { .into_iter() .map(|r| r["document"]["id"].as_u64().unwrap()) .collect(); - assert_eq!(ids, vec![3, 8, 2, 7, 4]); + assert_eq!(ids, vec![7, 8, 2, 3, 4]); collection.archive().await?; Ok(()) } diff --git a/pgml-sdks/pgml/src/multi_field_pipeline.rs b/pgml-sdks/pgml/src/multi_field_pipeline.rs index 0a2c74f7d..1219e2903 100644 --- a/pgml-sdks/pgml/src/multi_field_pipeline.rs +++ b/pgml-sdks/pgml/src/multi_field_pipeline.rs @@ -50,6 +50,7 @@ struct ValidFieldAction { full_text_search: Option, } +#[allow(clippy::upper_case_acronyms)] #[derive(Debug, Clone)] pub struct HNSW { m: u64, @@ -113,7 +114,7 @@ impl TryFrom for FieldAction { let model = Model::new(Some(v.model), v.source, v.parameters); let hnsw = v .hnsw - .map(|v2| HNSW::try_from(v2)) + .map(HNSW::try_from) .unwrap_or_else(|| Ok(HNSW::default()))?; anyhow::Ok(SemanticSearchAction { model, hnsw }) }) @@ -203,7 +204,7 @@ fn json_to_schema(schema: &Json) -> anyhow::Result { #[alias_methods(new, get_status, to_dict)] impl MultiFieldPipeline { pub fn new(name: &str, schema: Option) -> anyhow::Result { - let parsed_schema = schema.as_ref().map(|s| json_to_schema(s)).transpose()?; + let parsed_schema = 
schema.as_ref().map(json_to_schema).transpose()?; Ok(Self { name: name.to_string(), schema, @@ -250,7 +251,7 @@ impl MultiFieldPipeline { results[key] = json!({}); - if let Some(_) = value.splitter { + if value.splitter.is_some() { let chunks_status: (Option, Option) = sqlx::query_as(&query_builder!( "SELECT (SELECT COUNT(DISTINCT document_id) FROM %s), COUNT(id) FROM %s", chunks_table_name, @@ -265,7 +266,7 @@ impl MultiFieldPipeline { }); } - if let Some(_) = value.semantic_search { + if value.semantic_search.is_some() { let embeddings_table_name = format!("{schema}.{key}_embeddings"); let embeddings_status: (Option, Option) = sqlx::query_as(&query_builder!( @@ -282,7 +283,7 @@ impl MultiFieldPipeline { }); } - if let Some(_) = value.full_text_search { + if value.full_text_search.is_some() { let tsvectors_table_name = format!("{schema}.{key}_tsvectors"); let tsvectors_status: (Option, Option) = sqlx::query_as(&query_builder!( "SELECT (SELECT count(*) FROM %s), (SELECT count(*) FROM %s)", diff --git a/pgml-sdks/pgml/src/search_query_builder.rs b/pgml-sdks/pgml/src/search_query_builder.rs index dc6981b3c..afae9db46 100644 --- a/pgml-sdks/pgml/src/search_query_builder.rs +++ b/pgml-sdks/pgml/src/search_query_builder.rs @@ -21,7 +21,7 @@ use crate::{ #[derive(Debug, Deserialize)] struct ValidSemanticSearchAction { query: String, - model_parameters: Option, + parameters: Option, boost: Option, } @@ -107,7 +107,7 @@ pub async fn build_search_query( "transformer => (SELECT schema #>> '{{{key},semantic_search,model}}' FROM pipeline)", )), Expr::cust_with_values("text => $1", [&vsa.query]), - Expr::cust(format!("kwargs => COALESCE((SELECT schema #> '{{{key},semantic_search,model_parameters}}' FROM pipeline), '{{}}'::jsonb)")), + Expr::cust_with_values("kwargs => $1", [vsa.parameters.unwrap_or_default().0]), ]), Alias::new("embedding"), ); @@ -143,7 +143,7 @@ pub async fn build_search_query( let remote_embeddings = build_remote_embeddings( model.runtime, &model.name, - vsa.model_parameters.as_ref(), + vsa.parameters.as_ref(), )?; let mut embeddings = remote_embeddings.embed(vec![vsa.query]).await?; std::mem::take(&mut embeddings[0]) From 412fb571682ac758fb8fe05b04a8dc7fea672daf Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Thu, 25 Jan 2024 13:07:20 -0800 Subject: [PATCH 17/72] Move MultiFieldPipeline to Pipeline and added batch uploads for documents --- pgml-sdks/pgml/build.rs | 3 +- pgml-sdks/pgml/src/collection.rs | 172 ++- pgml-sdks/pgml/src/lib.rs | 287 +++-- pgml-sdks/pgml/src/model.rs | 15 - pgml-sdks/pgml/src/models.rs | 31 +- pgml-sdks/pgml/src/multi_field_pipeline.rs | 1003 ---------------- pgml-sdks/pgml/src/pipeline.rs | 1044 ++++++++++++++++- pgml-sdks/pgml/src/queries.rs | 6 +- pgml-sdks/pgml/src/query_builder.rs | 8 +- pgml-sdks/pgml/src/search_query_builder.rs | 8 +- pgml-sdks/pgml/src/single_field_pipeline.rs | 81 ++ pgml-sdks/pgml/src/splitter.rs | 14 - .../pgml/src/vector_search_query_builder.rs | 7 +- 13 files changed, 1309 insertions(+), 1370 deletions(-) delete mode 100644 pgml-sdks/pgml/src/multi_field_pipeline.rs create mode 100644 pgml-sdks/pgml/src/single_field_pipeline.rs diff --git a/pgml-sdks/pgml/build.rs b/pgml-sdks/pgml/build.rs index dd596b208..ccb6f3a22 100644 --- a/pgml-sdks/pgml/build.rs +++ b/pgml-sdks/pgml/build.rs @@ -25,8 +25,7 @@ export function newCollection(name: string, database_url?: string): Collection; export function newModel(name?: string, source?: string, parameters?: Json): Model; export function 
newSplitter(name?: string, parameters?: Json): Splitter; export function newBuiltins(database_url?: string): Builtins; -export function newPipeline(name: string, model?: Model, splitter?: Splitter, parameters?: Json): Pipeline; -export function newMultiFieldPipeline(name: string, schema?: Json): MultiFieldPipeline; +export function newPipeline(name: string, schema?: Json): Pipeline; export function newTransformerPipeline(task: string, model?: string, args?: Json, database_url?: string): TransformerPipeline; export function newOpenSourceAI(database_url?: string): OpenSourceAI; "#; diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index 99c115e2b..be8eb64a2 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -20,9 +20,9 @@ use crate::filter_builder::FilterBuilder; use crate::search_query_builder::build_search_query; use crate::vector_search_query_builder::build_vector_search_query; use crate::{ - get_or_initialize_pool, models, - multi_field_pipeline::MultiFieldPipeline, - order_by_builder, queries, query_builder, + get_or_initialize_pool, models, order_by_builder, + pipeline::Pipeline, + queries, query_builder, query_builder::QueryBuilder, splitter::Splitter, types::{DateTime, IntoTableNameAndSchema, Json, SIden, TryToNumeric}, @@ -30,10 +30,7 @@ use crate::{ }; #[cfg(feature = "python")] -use crate::{ - multi_field_pipeline::MultiFieldPipelinePython, query_builder::QueryBuilderPython, - types::JsonPython, -}; +use crate::{pipeline::PipelinePython, query_builder::QueryBuilderPython, types::JsonPython}; /// Our project tasks #[derive(Debug, Clone)] @@ -238,7 +235,7 @@ impl Collection { // Splitters table is not unique to a collection or pipeline. It exists in the pgml schema Splitter::create_splitters_table(&mut transaction).await?; self.create_documents_table(&mut transaction).await?; - MultiFieldPipeline::create_multi_field_pipelines_table( + Pipeline::create_pipelines_table( &collection_database_data.project_info, &mut transaction, ) @@ -272,7 +269,7 @@ impl Collection { /// } /// ``` #[instrument(skip(self))] - pub async fn add_pipeline(&mut self, pipeline: &mut MultiFieldPipeline) -> anyhow::Result<()> { + pub async fn add_pipeline(&mut self, pipeline: &mut Pipeline) -> anyhow::Result<()> { // The flow for this function: // 1. Create collection if it does not exists // 2. Create the pipeline if it does not exist and add it to the collection.pipelines table with ACTIVE = TRUE @@ -314,7 +311,7 @@ impl Collection { /// } /// ``` #[instrument(skip(self))] - pub async fn remove_pipeline(&mut self, pipeline: &MultiFieldPipeline) -> anyhow::Result<()> { + pub async fn remove_pipeline(&mut self, pipeline: &Pipeline) -> anyhow::Result<()> { // The flow for this function: // 1. Create collection if it does not exist // 2. Begin a transaction @@ -364,10 +361,7 @@ impl Collection { /// } /// ``` #[instrument(skip(self))] - pub async fn enable_pipeline( - &mut self, - pipeline: &mut MultiFieldPipeline, - ) -> anyhow::Result<()> { + pub async fn enable_pipeline(&mut self, pipeline: &mut Pipeline) -> anyhow::Result<()> { // The flow for this function: // 1. Set ACTIVE = TRUE for the pipeline in collection.pipelines // 2. Resync the pipeline @@ -400,7 +394,7 @@ impl Collection { /// } /// ``` #[instrument(skip(self))] - pub async fn disable_pipeline(&self, pipeline: &MultiFieldPipeline) -> anyhow::Result<()> { + pub async fn disable_pipeline(&self, pipeline: &Pipeline) -> anyhow::Result<()> { // The flow for this function: // 1. 
Set ACTIVE = FALSE for the pipeline in collection.pipelines sqlx::query(&query_builder!( @@ -464,7 +458,7 @@ impl Collection { // The flow for this function // 1. Create the collection if it does not exist // 2. Get all pipelines where ACTIVE = TRUE - // 4. Foreach document + // 4. Foreach n documents // -> Begin a transaction returning the old document if it existed // -> Insert the document // -> Foreach pipeline check if we need to resync the document and if so sync the document @@ -479,80 +473,78 @@ impl Collection { let progress_bar = utils::default_progress_bar(documents.len() as u64); progress_bar.println("Upserting Documents..."); - for document in documents { + let batch_size = args + .get("batch_size") + .map(TryToNumeric::try_to_u64) + .unwrap_or(Ok(10))?; + + for batch in documents.chunks(batch_size as usize) { let mut transaction = pool.begin().await?; - let id = document - .get("id") - .context("`id` must be a key in document")? - .to_string(); - let md5_digest = md5::compute(id.as_bytes()); - let source_uuid = uuid::Uuid::from_slice(&md5_digest.0)?; - - let query = if args - .get("merge") - .map(|v| v.as_bool().unwrap_or(false)) - .unwrap_or(false) - { - query_builder!( - "WITH prev AS (SELECT document FROM %s WHERE source_uuid = $1) INSERT INTO %s (source_uuid, document) VALUES ($1, $2) ON CONFLICT (source_uuid) DO UPDATE SET document = %s.document || EXCLUDED.document RETURNING id, (SELECT document FROM prev)", - self.documents_table_name, - self.documents_table_name, - self.documents_table_name - ) - } else { - query_builder!( - "WITH prev AS (SELECT document FROM %s WHERE source_uuid = $1) INSERT INTO %s (source_uuid, document) VALUES ($1, $2) ON CONFLICT (source_uuid) DO UPDATE SET document = EXCLUDED.document RETURNING id, (SELECT document FROM prev)", - self.documents_table_name, - self.documents_table_name - ) - }; - let (document_id, previous_document): (i64, Option) = sqlx::query_as(&query) - .bind(source_uuid) - .bind(&document) - .fetch_one(&mut *transaction) - .await?; + + let mut dp = vec![]; + for document in batch { + let id = document + .get("id") + .context("`id` must be a key in document")? 
+ .to_string(); + let md5_digest = md5::compute(id.as_bytes()); + let source_uuid = uuid::Uuid::from_slice(&md5_digest.0)?; + + let query = if args + .get("merge") + .map(|v| v.as_bool().unwrap_or(false)) + .unwrap_or(false) + { + query_builder!( + "WITH prev AS (SELECT document FROM %s WHERE source_uuid = $1) INSERT INTO %s (source_uuid, document) VALUES ($1, $2) ON CONFLICT (source_uuid) DO UPDATE SET document = %s.document || EXCLUDED.document RETURNING id, (SELECT document FROM prev)", + self.documents_table_name, + self.documents_table_name, + self.documents_table_name + ) + } else { + query_builder!( + "WITH prev AS (SELECT document FROM %s WHERE source_uuid = $1) INSERT INTO %s (source_uuid, document) VALUES ($1, $2) ON CONFLICT (source_uuid) DO UPDATE SET document = EXCLUDED.document RETURNING id, (SELECT document FROM prev)", + self.documents_table_name, + self.documents_table_name + ) + }; + let (document_id, previous_document): (i64, Option) = sqlx::query_as(&query) + .bind(source_uuid) + .bind(document) + .fetch_one(&mut *transaction) + .await?; + dp.push((document_id, document, previous_document)); + } let transaction = Arc::new(Mutex::new(transaction)); if !pipelines.is_empty() { use futures::stream::StreamExt; futures::stream::iter(&mut pipelines) // Need this map to get around moving the transaction - .map(|pipeline| { - ( - pipeline, - previous_document.clone(), - document.clone(), - transaction.clone(), - ) + .map(|pipeline| (pipeline, dp.clone(), transaction.clone())) + .for_each_concurrent(10, |(pipeline, db, transaction)| async move { + let parsed_schema = pipeline + .get_parsed_schema() + .await + .expect("Error getting parsed schema for pipeline"); + let ids_to_run_on: Vec = db + .into_iter() + .filter(|(_, document, previous_document)| match previous_document { + Some(previous_document) => parsed_schema + .iter() + .any(|(key, _)| document[key] != previous_document[key]), + None => true, + }) + .map(|(document_id, _, _)| document_id) + .collect(); + pipeline + .sync_documents(ids_to_run_on, transaction) + .await + .expect("Failed to execute pipeline"); }) - .for_each_concurrent( - 10, - |(pipeline, previous_document, document, transaction)| async move { - match previous_document { - Some(previous_document) => { - // Can unwrap here as we know it has parsed schema from the create_table call above - let should_run = - pipeline.parsed_schema.as_ref().unwrap().iter().any( - |(key, _)| document[key] != previous_document[key], - ); - if should_run { - pipeline - .sync_document(document_id, transaction) - .await - .expect("Failed to execute pipeline"); - } - } - None => { - pipeline - .sync_document(document_id, transaction) - .await - .expect("Failed to execute pipeline"); - } - } - }, - ) .await; } + Arc::into_inner(transaction) .context("Error transaction dangling")? 
.into_inner() @@ -560,7 +552,6 @@ impl Collection { .await?; progress_bar.inc(1); } - progress_bar.println("Done Upserting Documents\n"); progress_bar.finish(); Ok(()) @@ -679,7 +670,7 @@ impl Collection { pub async fn search( &mut self, query: Json, - pipeline: &mut MultiFieldPipeline, + pipeline: &mut Pipeline, ) -> anyhow::Result> { let pool = get_or_initialize_pool(&self.database_url).await?; let (built_query, values) = build_search_query(self, query.clone(), pipeline).await?; @@ -719,7 +710,7 @@ impl Collection { pub async fn search_local( &self, query: Json, - pipeline: &MultiFieldPipeline, + pipeline: &Pipeline, ) -> anyhow::Result> { let pool = get_or_initialize_pool(&self.database_url).await?; let (built_query, values) = build_search_query(self, query.clone(), pipeline).await?; @@ -753,7 +744,7 @@ impl Collection { pub async fn vector_search( &mut self, query: Json, - pipeline: &mut MultiFieldPipeline, + pipeline: &mut Pipeline, ) -> anyhow::Result> { let pool = get_or_initialize_pool(&self.database_url).await?; @@ -869,7 +860,7 @@ impl Collection { /// } /// ``` #[instrument(skip(self))] - pub async fn get_pipelines(&mut self) -> anyhow::Result> { + pub async fn get_pipelines(&mut self) -> anyhow::Result> { self.verify_in_database(false).await?; let project_info = &self .database_data @@ -887,7 +878,7 @@ impl Collection { pipelines .into_iter() .map(|p| { - let mut p: MultiFieldPipeline = p.try_into()?; + let mut p: Pipeline = p.try_into()?; p.set_project_info(project_info.clone()); Ok(p) }) @@ -908,7 +899,7 @@ impl Collection { /// } /// ``` #[instrument(skip(self))] - pub async fn get_pipeline(&mut self, name: &str) -> anyhow::Result { + pub async fn get_pipeline(&mut self, name: &str) -> anyhow::Result { self.verify_in_database(false).await?; let project_info = &self .database_data @@ -923,7 +914,7 @@ impl Collection { .bind(name) .fetch_one(&pool) .await?; - let mut pipeline: MultiFieldPipeline = pipeline.try_into()?; + let mut pipeline: Pipeline = pipeline.try_into()?; pipeline.set_project_info(project_info.clone()); Ok(pipeline) } @@ -1039,10 +1030,7 @@ impl Collection { Ok(()) } - pub async fn generate_er_diagram( - &mut self, - pipeline: &mut MultiFieldPipeline, - ) -> anyhow::Result { + pub async fn generate_er_diagram(&mut self, pipeline: &mut Pipeline) -> anyhow::Result { self.verify_in_database(false).await?; pipeline.verify_in_database(false).await?; diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 87fe40a64..568800bc7 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -21,7 +21,6 @@ mod languages; pub mod migrations; mod model; pub mod models; -mod multi_field_pipeline; mod open_source_ai; mod order_by_builder; mod pipeline; @@ -40,7 +39,6 @@ mod vector_search_query_builder; pub use builtins::Builtins; pub use collection::Collection; pub use model::Model; -pub use multi_field_pipeline::MultiFieldPipeline; pub use open_source_ai::OpenSourceAI; pub use pipeline::Pipeline; pub use splitter::Splitter; @@ -163,8 +161,7 @@ fn pgml(_py: pyo3::Python, m: &pyo3::types::PyModule) -> pyo3::PyResult<()> { m.add_function(pyo3::wrap_pyfunction!(init_logger, m)?)?; m.add_function(pyo3::wrap_pyfunction!(migrate, m)?)?; m.add_function(pyo3::wrap_pyfunction!(cli::cli, m)?)?; - // m.add_class::()?; - m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; @@ -220,11 +217,7 @@ fn main(mut cx: neon::context::ModuleContext) -> neon::result::NeonResult<()> { "newTransformerPipeline", 
transformer_pipeline::TransformerPipelineJavascript::new, )?; - cx.export_function( - "newMultiFieldPipeline", - multi_field_pipeline::MultiFieldPipelineJavascript::new, - )?; - // cx.export_function("newPipeline", pipeline::PipelineJavascript::new)?; + cx.export_function("newPipeline", pipeline::PipelineJavascript::new)?; cx.export_function( "newOpenSourceAI", open_source_ai::OpenSourceAIJavascript::new, @@ -284,7 +277,7 @@ mod tests { #[tokio::test] async fn can_add_remove_pipeline() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let mut pipeline = MultiFieldPipeline::new("test_p_cap_58", Some(json!({}).into()))?; + let mut pipeline = Pipeline::new("test_p_carp_58", Some(json!({}).into()))?; let mut collection = Collection::new("test_r_c_carp_1", None); assert!(collection.database_data.is_none()); collection.add_pipeline(&mut pipeline).await?; @@ -299,8 +292,8 @@ mod tests { #[tokio::test] async fn can_add_remove_pipelines() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let mut pipeline1 = MultiFieldPipeline::new("test_r_p_carps_1", Some(json!({}).into()))?; - let mut pipeline2 = MultiFieldPipeline::new("test_r_p_carps_2", Some(json!({}).into()))?; + let mut pipeline1 = Pipeline::new("test_r_p_carps_1", Some(json!({}).into()))?; + let mut pipeline2 = Pipeline::new("test_r_p_carps_2", Some(json!({}).into()))?; let mut collection = Collection::new("test_r_c_carps_11", None); collection.add_pipeline(&mut pipeline1).await?; collection.add_pipeline(&mut pipeline2).await?; @@ -317,9 +310,9 @@ mod tests { #[tokio::test] async fn can_add_pipeline_and_upsert_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_capaud_48"; + let collection_name = "test_r_c_capaud_51"; let pipeline_name = "test_r_p_capaud_6"; - let mut pipeline = MultiFieldPipeline::new( + let mut pipeline = Pipeline::new( pipeline_name, Some( json!({ @@ -390,7 +383,7 @@ mod tests { let documents = generate_dummy_documents(2); collection.upsert_documents(documents.clone(), None).await?; let pipeline_name = "test_r_p_cudaap_9"; - let mut pipeline = MultiFieldPipeline::new( + let mut pipeline = Pipeline::new( pipeline_name, Some( json!({ @@ -449,7 +442,7 @@ mod tests { #[tokio::test] async fn disable_enable_pipeline() -> anyhow::Result<()> { - let mut pipeline = MultiFieldPipeline::new("test_p_dep_1", Some(json!({}).into()))?; + let mut pipeline = Pipeline::new("test_p_dep_1", Some(json!({}).into()))?; let mut collection = Collection::new("test_r_c_dep_1", None); collection.add_pipeline(&mut pipeline).await?; let queried_pipeline = &collection.get_pipelines().await?[0]; @@ -467,10 +460,10 @@ mod tests { #[tokio::test] async fn can_upsert_documents_and_enable_pipeline() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cudaap_43"; + let collection_name = "test_r_c_cudaep_43"; let mut collection = Collection::new(collection_name, None); - let pipeline_name = "test_r_p_cudaap_9"; - let mut pipeline = MultiFieldPipeline::new( + let pipeline_name = "test_r_p_cudaep_9"; + let mut pipeline = Pipeline::new( pipeline_name, Some( json!({ @@ -515,7 +508,7 @@ mod tests { .upsert_documents(documents[..2].to_owned(), None) .await?; let pipeline_name1 = "test_r_p_rpdt1_0"; - let mut pipeline = MultiFieldPipeline::new( + let mut pipeline = Pipeline::new( pipeline_name1, Some( json!({ @@ -566,7 +559,7 @@ mod tests { assert!(tsvectors.len() == 8); let pipeline_name2 = "test_r_p_rpdt2_0"; - let mut pipeline = 
MultiFieldPipeline::new( + let mut pipeline = Pipeline::new( pipeline_name2, Some( json!({ @@ -663,7 +656,7 @@ mod tests { let collection_name = "test_r_c_pss_5"; let mut collection = Collection::new(collection_name, None); let pipeline_name = "test_r_p_pss_0"; - let mut pipeline = MultiFieldPipeline::new( + let mut pipeline = Pipeline::new( pipeline_name, Some( json!({ @@ -771,7 +764,7 @@ mod tests { let collection_name = "test_r_c_cschpfp_4"; let mut collection = Collection::new(collection_name, None); let pipeline_name = "test_r_p_cschpfp_0"; - let mut pipeline = MultiFieldPipeline::new( + let mut pipeline = Pipeline::new( pipeline_name, Some( json!({ @@ -816,12 +809,12 @@ mod tests { #[tokio::test] async fn can_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cs_70"; + let collection_name = "test_r_c_cswle_72"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; - let pipeline_name = "test_r_p_cs_9"; - let mut pipeline = MultiFieldPipeline::new( + let pipeline_name = "test_r_p_cswle_9"; + let mut pipeline = Pipeline::new( pipeline_name, Some( json!({ @@ -918,7 +911,7 @@ mod tests { let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; let pipeline_name = "test_r_p_cswre_8"; - let mut pipeline = MultiFieldPipeline::new( + let mut pipeline = Pipeline::new( pipeline_name, Some( json!({ @@ -944,7 +937,7 @@ mod tests { ), )?; collection.add_pipeline(&mut pipeline).await?; - let mut pipeline = MultiFieldPipeline::new(pipeline_name, None)?; + let mut pipeline = Pipeline::new(pipeline_name, None)?; let results = collection .search( json!({ @@ -998,7 +991,7 @@ mod tests { let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; let pipeline_name = "test_r_p_cvswle_0"; - let mut pipeline = MultiFieldPipeline::new( + let mut pipeline = Pipeline::new( pipeline_name, Some( json!({ @@ -1065,7 +1058,7 @@ mod tests { let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; let pipeline_name = "test_r_p_cvswre_0"; - let mut pipeline = MultiFieldPipeline::new( + let mut pipeline = Pipeline::new( pipeline_name, Some( json!({ @@ -1091,7 +1084,7 @@ mod tests { ), )?; collection.add_pipeline(&mut pipeline).await?; - let mut pipeline = MultiFieldPipeline::new(pipeline_name, None)?; + let mut pipeline = Pipeline::new(pipeline_name, None)?; let results = collection .vector_search( json!({ @@ -1126,58 +1119,58 @@ mod tests { Ok(()) } - #[tokio::test] - async fn can_vector_search_with_query_builder() -> anyhow::Result<()> { - internal_init_logger(None, None).ok(); - let mut collection = Collection::new("test_r_c_cvswqb_7", None); - let mut pipeline = MultiFieldPipeline::new( - "test_r_p_cvswqb_0", - Some( - json!({ - "text": { - "semantic_search": { - "model": "intfloat/e5-small" - }, - "full_text_search": { - "configuration": "english" - } - }, - }) - .into(), - ), - )?; - collection - .upsert_documents(generate_dummy_documents(10), None) - .await?; - collection.add_pipeline(&mut pipeline).await?; - let results = collection - .query() - .vector_recall("test query", &pipeline, None) - .limit(3) - .filter( - json!({ - "metadata": { - "id": { - "$gt": 3 - } - }, - "full_text": { - "configuration": "english", - "text": "test" - } - }) - .into(), - ) - .fetch_all() - 
.await?; - let ids: Vec = results - .into_iter() - .map(|r| r.2["id"].as_u64().unwrap()) - .collect(); - assert_eq!(ids, vec![4, 5, 6]); - collection.archive().await?; - Ok(()) - } + // #[tokio::test] + // async fn can_vector_search_with_query_builder() -> anyhow::Result<()> { + // internal_init_logger(None, None).ok(); + // let mut collection = Collection::new("test_r_c_cvswqb_7", None); + // let mut pipeline = Pipeline::new( + // "test_r_p_cvswqb_0", + // Some( + // json!({ + // "text": { + // "semantic_search": { + // "model": "intfloat/e5-small" + // }, + // "full_text_search": { + // "configuration": "english" + // } + // }, + // }) + // .into(), + // ), + // )?; + // collection + // .upsert_documents(generate_dummy_documents(10), None) + // .await?; + // collection.add_pipeline(&mut pipeline).await?; + // let results = collection + // .query() + // .vector_recall("test query", &pipeline, None) + // .limit(3) + // .filter( + // json!({ + // "metadata": { + // "id": { + // "$gt": 3 + // } + // }, + // "full_text": { + // "configuration": "english", + // "text": "test" + // } + // }) + // .into(), + // ) + // .fetch_all() + // .await?; + // let ids: Vec = results + // .into_iter() + // .map(|r| r.2["id"].as_u64().unwrap()) + // .collect(); + // assert_eq!(ids, vec![4, 5, 6]); + // collection.archive().await?; + // Ok(()) + // } /////////////////////////////// // Working With Documents ///// @@ -1663,69 +1656,69 @@ mod tests { // Pipeline -> MultiFieldPIpeline /////////////////////////////// - #[test] - fn pipeline_to_multi_field_pipeline() -> anyhow::Result<()> { - let model = Model::new( - Some("test_model".to_string()), - Some("pgml".to_string()), - Some( - json!({ - "test_parameter": 10 - }) - .into(), - ), - ); - let splitter = Splitter::new( - Some("test_splitter".to_string()), - Some( - json!({ - "test_parameter": 11 - }) - .into(), - ), - ); - let parameters = json!({ - "full_text_search": { - "active": true, - "configuration": "test_configuration" - }, - "hnsw": { - "m": 16, - "ef_construction": 64 - } - }); - let multi_field_pipeline = Pipeline::new( - "test_name", - Some(model), - Some(splitter), - Some(parameters.into()), - ); - let schema = json!({ - "text": { - "splitter": { - "model": "test_splitter", - "parameters": { - "test_parameter": 11 - } - }, - "semantic_search": { - "model": "test_model", - "parameters": { - "test_parameter": 10 - }, - "hnsw": { - "m": 16, - "ef_construction": 64 - } - }, - "full_text_search": { - "configuration": "test_configuration" - } - } - }); - assert_eq!(schema, multi_field_pipeline.schema.unwrap().0); - Ok(()) - } + // #[test] + // fn pipeline_to_pipeline() -> anyhow::Result<()> { + // let model = Model::new( + // Some("test_model".to_string()), + // Some("pgml".to_string()), + // Some( + // json!({ + // "test_parameter": 10 + // }) + // .into(), + // ), + // ); + // let splitter = Splitter::new( + // Some("test_splitter".to_string()), + // Some( + // json!({ + // "test_parameter": 11 + // }) + // .into(), + // ), + // ); + // let parameters = json!({ + // "full_text_search": { + // "active": true, + // "configuration": "test_configuration" + // }, + // "hnsw": { + // "m": 16, + // "ef_construction": 64 + // } + // }); + // let pipeline = SingleFieldPipeline::new( + // "test_name", + // Some(model), + // Some(splitter), + // Some(parameters.into()), + // ); + // let schema = json!({ + // "text": { + // "splitter": { + // "model": "test_splitter", + // "parameters": { + // "test_parameter": 11 + // } + // }, + // "semantic_search": { + // 
"model": "test_model", + // "parameters": { + // "test_parameter": 10 + // }, + // "hnsw": { + // "m": 16, + // "ef_construction": 64 + // } + // }, + // "full_text_search": { + // "configuration": "test_configuration" + // } + // } + // }); + // assert_eq!(schema, pipeline.schema.unwrap().0); + // Ok(()) + // } /////////////////////////////// // ER Diagram ///////////////// @@ -1734,7 +1727,7 @@ mod tests { #[tokio::test] async fn generate_er_diagram() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let mut pipeline = MultiFieldPipeline::new( + let mut pipeline = Pipeline::new( "test_p_ged_57", Some( json!({ diff --git a/pgml-sdks/pgml/src/model.rs b/pgml-sdks/pgml/src/model.rs index 576bfbc65..1f585368b 100644 --- a/pgml-sdks/pgml/src/model.rs +++ b/pgml-sdks/pgml/src/model.rs @@ -183,21 +183,6 @@ impl Model { } } -impl From for Model { - fn from(x: models::PipelineWithModelAndSplitter) -> Self { - Self { - name: x.model_hyperparams["name"].as_str().unwrap().to_string(), - runtime: x.model_runtime.as_str().into(), - parameters: x.model_hyperparams, - project_info: None, - database_data: Some(ModelDatabaseData { - id: x.model_id, - created_at: x.model_created_at, - }), - } - } -} - impl From for Model { fn from(model: models::Model) -> Self { Self { diff --git a/pgml-sdks/pgml/src/models.rs b/pgml-sdks/pgml/src/models.rs index 81d0f488c..8972a9c57 100644 --- a/pgml-sdks/pgml/src/models.rs +++ b/pgml-sdks/pgml/src/models.rs @@ -5,21 +5,10 @@ use sqlx::FromRow; use crate::types::{DateTime, Json}; -// A pipeline -#[enum_def] -#[derive(FromRow)] -pub struct Pipeline { - pub id: i64, - pub name: String, - pub created_at: DateTime, - pub schema: Json, - pub active: bool, -} - // A multi field pipeline #[enum_def] #[derive(FromRow)] -pub struct MultiFieldPipeline { +pub struct Pipeline { pub id: i64, pub name: String, pub created_at: DateTime, @@ -47,24 +36,6 @@ pub struct Splitter { pub parameters: Json, } -// A pipeline with its model and splitter -#[derive(FromRow, Clone)] -pub struct PipelineWithModelAndSplitter { - pub pipeline_id: i64, - pub pipeline_name: String, - pub pipeline_created_at: DateTime, - pub pipeline_active: bool, - pub pipeline_parameters: Json, - pub model_id: i64, - pub model_created_at: DateTime, - pub model_runtime: String, - pub model_hyperparams: Json, - pub splitter_id: i64, - pub splitter_created_at: DateTime, - pub splitter_name: String, - pub splitter_parameters: Json, -} - // A document #[enum_def] #[derive(FromRow, Serialize)] diff --git a/pgml-sdks/pgml/src/multi_field_pipeline.rs b/pgml-sdks/pgml/src/multi_field_pipeline.rs deleted file mode 100644 index 1219e2903..000000000 --- a/pgml-sdks/pgml/src/multi_field_pipeline.rs +++ /dev/null @@ -1,1003 +0,0 @@ -use anyhow::Context; -use rust_bridge::{alias, alias_methods}; -use serde::Deserialize; -use serde_json::json; -use sqlx::{Executor, PgConnection, PgPool, Postgres, Transaction}; -use std::collections::HashMap; -use std::sync::Arc; -use tokio::sync::Mutex; -use tracing::instrument; - -use crate::remote_embeddings::PoolOrArcMutextTransaction; -use crate::{ - collection::ProjectInfo, - get_or_initialize_pool, - model::{Model, ModelRuntime}, - models, queries, query_builder, - remote_embeddings::build_remote_embeddings, - splitter::Splitter, - types::{DateTime, Json, TryToNumeric}, -}; - -#[cfg(feature = "python")] -use crate::types::JsonPython; - -type ParsedSchema = HashMap; - -#[derive(Deserialize)] -struct ValidSplitterAction { - model: Option, - parameters: Option, -} - 
-#[derive(Deserialize)] -struct ValidEmbedAction { - model: String, - source: Option, - parameters: Option, - hnsw: Option, -} - -#[derive(Deserialize, Debug, Clone)] -pub struct FullTextSearchAction { - configuration: String, -} - -#[derive(Deserialize)] -struct ValidFieldAction { - splitter: Option, - semantic_search: Option, - full_text_search: Option, -} - -#[allow(clippy::upper_case_acronyms)] -#[derive(Debug, Clone)] -pub struct HNSW { - m: u64, - ef_construction: u64, -} - -impl Default for HNSW { - fn default() -> Self { - Self { - m: 16, - ef_construction: 64, - } - } -} - -impl TryFrom for HNSW { - type Error = anyhow::Error; - fn try_from(value: Json) -> anyhow::Result { - let m = if !value["m"].is_null() { - value["m"] - .try_to_u64() - .context("hnsw.m must be an integer")? - } else { - 16 - }; - let ef_construction = if !value["ef_construction"].is_null() { - value["ef_construction"] - .try_to_u64() - .context("hnsw.ef_construction must be an integer")? - } else { - 64 - }; - Ok(Self { m, ef_construction }) - } -} - -#[derive(Debug, Clone)] -pub struct SplitterAction { - pub model: Splitter, -} - -#[derive(Debug, Clone)] -pub struct SemanticSearchAction { - pub model: Model, - pub hnsw: HNSW, -} - -#[derive(Debug, Clone)] -pub struct FieldAction { - pub splitter: Option, - pub semantic_search: Option, - pub full_text_search: Option, -} - -impl TryFrom for FieldAction { - type Error = anyhow::Error; - fn try_from(value: ValidFieldAction) -> Result { - let embed = value - .semantic_search - .map(|v| { - let model = Model::new(Some(v.model), v.source, v.parameters); - let hnsw = v - .hnsw - .map(HNSW::try_from) - .unwrap_or_else(|| Ok(HNSW::default()))?; - anyhow::Ok(SemanticSearchAction { model, hnsw }) - }) - .transpose()?; - let splitter = value - .splitter - .map(|v| { - let splitter = Splitter::new(v.model, v.parameters); - anyhow::Ok(SplitterAction { model: splitter }) - }) - .transpose()?; - Ok(Self { - splitter, - semantic_search: embed, - full_text_search: value.full_text_search, - }) - } -} - -#[derive(Debug, Clone)] -pub struct InvividualSyncStatus { - pub synced: i64, - pub not_synced: i64, - pub total: i64, -} - -impl From for Json { - fn from(value: InvividualSyncStatus) -> Self { - serde_json::json!({ - "synced": value.synced, - "not_synced": value.not_synced, - "total": value.total, - }) - .into() - } -} - -impl From for InvividualSyncStatus { - fn from(value: Json) -> Self { - Self { - synced: value["synced"] - .as_i64() - .expect("The synced field is not an integer"), - not_synced: value["not_synced"] - .as_i64() - .expect("The not_synced field is not an integer"), - total: value["total"] - .as_i64() - .expect("The total field is not an integer"), - } - } -} - -#[derive(Debug, Clone)] -pub struct MultiFieldPipelineDatabaseData { - pub id: i64, - pub created_at: DateTime, -} - -#[derive(alias, Debug, Clone)] -pub struct MultiFieldPipeline { - pub name: String, - pub schema: Option, - pub parsed_schema: Option, - project_info: Option, - database_data: Option, -} - -fn json_to_schema(schema: &Json) -> anyhow::Result { - schema - .as_object() - .context("Schema object must be a JSON object")? 
- .iter() - .try_fold(ParsedSchema::new(), |mut acc, (key, value)| { - if acc.contains_key(key) { - Err(anyhow::anyhow!("Schema contains duplicate keys")) - } else { - // First lets deserialize it normally - let action: ValidFieldAction = serde_json::from_value(value.to_owned())?; - // Now lets actually build the models and splitters - acc.insert(key.to_owned(), action.try_into()?); - Ok(acc) - } - }) -} - -#[alias_methods(new, get_status, to_dict)] -impl MultiFieldPipeline { - pub fn new(name: &str, schema: Option) -> anyhow::Result { - let parsed_schema = schema.as_ref().map(json_to_schema).transpose()?; - Ok(Self { - name: name.to_string(), - schema, - parsed_schema, - project_info: None, - database_data: None, - }) - } - - /// Gets the status of the [Pipeline] - /// This includes the status of the chunks, embeddings, and tsvectors - /// - /// # Example - /// - /// ``` - /// use pgml::Collection; - /// - /// async fn example() -> anyhow::Result<()> { - /// let mut collection = Collection::new("my_collection", None); - /// let mut pipeline = collection.get_pipeline("my_pipeline").await?; - /// let status = pipeline.get_status().await?; - /// Ok(()) - /// } - /// ``` - #[instrument(skip(self))] - pub async fn get_status(&mut self) -> anyhow::Result { - self.verify_in_database(false).await?; - let parsed_schema = self - .parsed_schema - .as_ref() - .context("Pipeline must have schema to get status")?; - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to get status")?; - let pool = self.get_pool().await?; - - let mut results = json!({}); - - let schema = format!("{}_{}", project_info.name, self.name); - let documents_table_name = format!("{}.documents", project_info.name); - for (key, value) in parsed_schema.iter() { - let chunks_table_name = format!("{schema}.{key}_chunks"); - - results[key] = json!({}); - - if value.splitter.is_some() { - let chunks_status: (Option, Option) = sqlx::query_as(&query_builder!( - "SELECT (SELECT COUNT(DISTINCT document_id) FROM %s), COUNT(id) FROM %s", - chunks_table_name, - documents_table_name - )) - .fetch_one(&pool) - .await?; - results[key]["chunks"] = json!({ - "synced": chunks_status.0.unwrap_or(0), - "not_synced": chunks_status.1.unwrap_or(0) - chunks_status.0.unwrap_or(0), - "total": chunks_status.1.unwrap_or(0), - }); - } - - if value.semantic_search.is_some() { - let embeddings_table_name = format!("{schema}.{key}_embeddings"); - let embeddings_status: (Option, Option) = - sqlx::query_as(&query_builder!( - "SELECT (SELECT count(*) FROM %s), (SELECT count(*) FROM %s)", - embeddings_table_name, - chunks_table_name - )) - .fetch_one(&pool) - .await?; - results[key]["embeddings"] = json!({ - "synced": embeddings_status.0.unwrap_or(0), - "not_synced": embeddings_status.1.unwrap_or(0) - embeddings_status.0.unwrap_or(0), - "total": embeddings_status.1.unwrap_or(0), - }); - } - - if value.full_text_search.is_some() { - let tsvectors_table_name = format!("{schema}.{key}_tsvectors"); - let tsvectors_status: (Option, Option) = sqlx::query_as(&query_builder!( - "SELECT (SELECT count(*) FROM %s), (SELECT count(*) FROM %s)", - tsvectors_table_name, - chunks_table_name - )) - .fetch_one(&pool) - .await?; - results[key]["tsvectors"] = json!({ - "synced": tsvectors_status.0.unwrap_or(0), - "not_synced": tsvectors_status.1.unwrap_or(0) - tsvectors_status.0.unwrap_or(0), - "total": tsvectors_status.1.unwrap_or(0), - }); - } - } - Ok(results.into()) - } - - #[instrument(skip(self))] - pub(crate) async fn 
verify_in_database(&mut self, throw_if_exists: bool) -> anyhow::Result<()> { - if self.database_data.is_none() { - let pool = self.get_pool().await?; - - let project_info = self - .project_info - .as_ref() - .context("Cannot verify pipeline without project info")?; - - let pipeline: Option = sqlx::query_as(&query_builder!( - "SELECT * FROM %s WHERE name = $1", - format!("{}.pipelines", project_info.name) - )) - .bind(&self.name) - .fetch_optional(&pool) - .await?; - - let pipeline = if let Some(pipeline) = pipeline { - if throw_if_exists { - anyhow::bail!("Pipeline {} already exists. You do not need to add this pipeline to the collection as it has already been added.", pipeline.name); - } - - let mut parsed_schema = json_to_schema(&pipeline.schema)?; - - for (_key, value) in parsed_schema.iter_mut() { - if let Some(splitter) = &mut value.splitter { - splitter.model.set_project_info(project_info.clone()); - splitter.model.verify_in_database(false).await?; - } - if let Some(embed) = &mut value.semantic_search { - embed.model.set_project_info(project_info.clone()); - embed.model.verify_in_database(false).await?; - } - } - self.schema = Some(pipeline.schema.clone()); - self.parsed_schema = Some(parsed_schema.clone()); - - pipeline - } else { - let schema = self - .schema - .as_ref() - .context("Pipeline must have schema to store in database")?; - let mut parsed_schema = json_to_schema(schema)?; - - for (_key, value) in parsed_schema.iter_mut() { - if let Some(splitter) = &mut value.splitter { - splitter.model.set_project_info(project_info.clone()); - splitter.model.verify_in_database(false).await?; - } - if let Some(embed) = &mut value.semantic_search { - embed.model.set_project_info(project_info.clone()); - embed.model.verify_in_database(false).await?; - } - } - self.parsed_schema = Some(parsed_schema); - - // Here we actually insert the pipeline into the collection.pipelines table - // and create the collection_pipeline schema and required tables - let mut transaction = pool.begin().await?; - let pipeline = sqlx::query_as(&query_builder!( - "INSERT INTO %s (name, schema) VALUES ($1, $2) RETURNING *", - format!("{}.pipelines", project_info.name) - )) - .bind(&self.name) - .bind(&self.schema) - .fetch_one(&mut *transaction) - .await?; - self.create_tables(&mut transaction).await?; - transaction.commit().await?; - - pipeline - }; - self.database_data = Some(MultiFieldPipelineDatabaseData { - id: pipeline.id, - created_at: pipeline.created_at, - }) - } - Ok(()) - } - - #[instrument(skip(self))] - async fn create_tables( - &mut self, - transaction: &mut Transaction<'static, Postgres>, - ) -> anyhow::Result<()> { - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to create_or_get_tables")?; - let collection_name = &project_info.name; - let documents_table_name = format!("{}.documents", collection_name); - - let schema = format!("{}_{}", collection_name, self.name); - - transaction - .execute(query_builder!("CREATE SCHEMA IF NOT EXISTS %s", schema).as_str()) - .await?; - - let parsed_schema = self - .parsed_schema - .as_ref() - .context("Pipeline must have schema to create_tables")?; - - for (key, value) in parsed_schema.iter() { - let chunks_table_name = format!("{}.{}_chunks", schema, key); - transaction - .execute( - query_builder!( - queries::CREATE_CHUNKS_TABLE, - chunks_table_name, - documents_table_name - ) - .as_str(), - ) - .await?; - let index_name = format!("{}_pipeline_chunk_document_id_index", key); - transaction - .execute( - 
query_builder!( - queries::CREATE_INDEX, - "", - index_name, - chunks_table_name, - "document_id" - ) - .as_str(), - ) - .await?; - - if let Some(embed) = &value.semantic_search { - let embeddings_table_name = format!("{}.{}_embeddings", schema, key); - let embedding_length = match &embed.model.runtime { - ModelRuntime::Python => { - let embedding: (Vec,) = sqlx::query_as( - "SELECT embedding from pgml.embed(transformer => $1, text => 'Hello, World!', kwargs => $2) as embedding") - .bind(&embed.model.name) - .bind(&embed.model.parameters) - .fetch_one(&mut *transaction).await?; - embedding.0.len() as i64 - } - t => { - let remote_embeddings = build_remote_embeddings( - t.to_owned(), - &embed.model.name, - Some(&embed.model.parameters), - )?; - remote_embeddings.get_embedding_size().await? - } - }; - - // Create the embeddings table - sqlx::query(&query_builder!( - queries::CREATE_EMBEDDINGS_TABLE, - &embeddings_table_name, - chunks_table_name, - documents_table_name, - embedding_length - )) - .execute(&mut *transaction) - .await?; - let index_name = format!("{}_pipeline_embedding_chunk_id_index", key); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX, - "", - index_name, - &embeddings_table_name, - "chunk_id" - ) - .as_str(), - ) - .await?; - let index_name = format!("{}_pipeline_embedding_document_id_index", key); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX, - "", - index_name, - &embeddings_table_name, - "document_id" - ) - .as_str(), - ) - .await?; - let index_with_parameters = format!( - "WITH (m = {}, ef_construction = {})", - embed.hnsw.m, embed.hnsw.ef_construction - ); - let index_name = format!("{}_pipeline_embedding_hnsw_vector_index", key); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX_USING_HNSW, - "", - index_name, - &embeddings_table_name, - "embedding vector_cosine_ops", - index_with_parameters - ) - .as_str(), - ) - .await?; - } - - // Create the tsvectors table - if value.full_text_search.is_some() { - let tsvectors_table_name = format!("{}.{}_tsvectors", schema, key); - transaction - .execute( - query_builder!( - queries::CREATE_CHUNKS_TSVECTORS_TABLE, - tsvectors_table_name, - chunks_table_name, - documents_table_name - ) - .as_str(), - ) - .await?; - let index_name = format!("{}_pipeline_tsvector_chunk_id_index", key); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX, - "", - index_name, - tsvectors_table_name, - "chunk_id" - ) - .as_str(), - ) - .await?; - let index_name = format!("{}_pipeline_tsvector_document_id_index", key); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX, - "", - index_name, - tsvectors_table_name, - "document_id" - ) - .as_str(), - ) - .await?; - let index_name = format!("{}_pipeline_tsvector_index", key); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX_USING_GIN, - "", - index_name, - tsvectors_table_name, - "ts" - ) - .as_str(), - ) - .await?; - } - } - Ok(()) - } - - #[instrument(skip(self))] - pub(crate) async fn sync_document( - &mut self, - document_id: i64, - transaction: Arc>>, - ) -> anyhow::Result<()> { - self.verify_in_database(false).await?; - - // We are assuming we have manually verified the pipeline before doing this - let parsed_schema = self - .parsed_schema - .as_ref() - .context("Pipeline must have schema to execute")?; - - for (key, value) in parsed_schema.iter() { - let chunk_ids = self - .sync_chunks_for_document( - key, - value.splitter.as_ref().map(|v| &v.model), - document_id, - transaction.clone(), - 
) - .await?; - if !chunk_ids.is_empty() { - if let Some(embed) = &value.semantic_search { - self.sync_embeddings_for_chunks( - key, - &embed.model, - &chunk_ids, - transaction.clone(), - ) - .await?; - } - if let Some(full_text_search) = &value.full_text_search { - self.sync_tsvectors_for_chunks( - key, - &full_text_search.configuration, - &chunk_ids, - transaction.clone(), - ) - .await?; - } - } - } - Ok(()) - } - - #[instrument(skip(self))] - async fn sync_chunks_for_document( - &self, - key: &str, - splitter: Option<&Splitter>, - document_id: i64, - transaction: Arc>>, - ) -> anyhow::Result> { - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to sync chunks")?; - - let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); - let documents_table_name = format!("{}.documents", project_info.name); - let json_key_query = format!("document->>'{}'", key); - - if let Some(splitter) = splitter { - let splitter_database_data = splitter - .database_data - .as_ref() - .context("Splitter must be verified to sync chunks")?; - - sqlx::query(&query_builder!( - queries::GENERATE_CHUNKS_FOR_DOCUMENT_ID, - &chunks_table_name, - &json_key_query, - documents_table_name - )) - .bind(splitter_database_data.id) - .bind(document_id) - .execute(&mut *transaction.lock().await) - .await?; - - sqlx::query_scalar(&query_builder!( - "SELECT id FROM %s WHERE document_id = $1", - &chunks_table_name - )) - .bind(document_id) - .fetch_all(&mut *transaction.lock().await) - .await - .map_err(anyhow::Error::msg) - } else { - sqlx::query_scalar(&query_builder!( - r#" - INSERT INTO %s( - document_id, chunk_index, chunk - ) - SELECT - id, - 1, - %d - FROM %s - WHERE id = $1 - ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk - RETURNING id - "#, - &chunks_table_name, - &json_key_query, - &documents_table_name - )) - .bind(document_id) - .fetch_all(&mut *transaction.lock().await) - .await - .map_err(anyhow::Error::msg) - } - } - - #[instrument(skip(self))] - async fn sync_embeddings_for_chunks( - &self, - key: &str, - model: &Model, - chunk_ids: &Vec, - transaction: Arc>>, - ) -> anyhow::Result<()> { - // Remove the stored name from the parameters - let mut parameters = model.parameters.clone(); - parameters - .as_object_mut() - .context("Model parameters must be an object")? 
- .remove("name"); - - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to sync chunks")?; - - let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); - let embeddings_table_name = - format!("{}_{}.{}_embeddings", project_info.name, self.name, key); - - match model.runtime { - ModelRuntime::Python => { - sqlx::query(&query_builder!( - queries::GENERATE_EMBEDDINGS_FOR_CHUNK_IDS, - embeddings_table_name, - chunks_table_name - )) - .bind(&model.name) - .bind(¶meters) - .bind(chunk_ids) - .execute(&mut *transaction.lock().await) - .await?; - } - r => { - let remote_embeddings = build_remote_embeddings(r, &model.name, Some(¶meters))?; - remote_embeddings - .generate_embeddings( - &embeddings_table_name, - &chunks_table_name, - Some(chunk_ids), - PoolOrArcMutextTransaction::ArcMutextTransaction(transaction), - ) - .await?; - } - } - Ok(()) - } - - #[instrument(skip(self))] - async fn sync_tsvectors_for_chunks( - &self, - key: &str, - configuration: &str, - chunk_ids: &Vec, - transaction: Arc>>, - ) -> anyhow::Result<()> { - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to sync TSVectors")?; - - let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); - let tsvectors_table_name = format!("{}_{}.{}_tsvectors", project_info.name, self.name, key); - - sqlx::query(&query_builder!( - queries::GENERATE_TSVECTORS_FOR_CHUNK_IDS, - tsvectors_table_name, - configuration, - chunks_table_name - )) - .bind(chunk_ids) - .execute(&mut *transaction.lock().await) - .await?; - Ok(()) - } - - #[instrument(skip(self))] - pub(crate) async fn resync(&mut self) -> anyhow::Result<()> { - self.verify_in_database(false).await?; - - // We are assuming we have manually verified the pipeline before doing this - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to sync chunks")?; - let parsed_schema = self - .parsed_schema - .as_ref() - .context("Pipeline must have schema to execute")?; - - // Before doing any syncing, delete all old and potentially outdated documents - let pool = self.get_pool().await?; - for (key, _value) in parsed_schema.iter() { - let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); - pool.execute(query_builder!("DELETE FROM %s CASCADE", chunks_table_name).as_str()) - .await?; - } - - for (key, value) in parsed_schema.iter() { - self.resync_chunks(key, value.splitter.as_ref().map(|v| &v.model)) - .await?; - if let Some(embed) = &value.semantic_search { - self.resync_embeddings(key, &embed.model).await?; - } - if let Some(full_text_search) = &value.full_text_search { - self.resync_tsvectors(key, &full_text_search.configuration) - .await?; - } - } - Ok(()) - } - - #[instrument(skip(self))] - async fn resync_chunks(&self, key: &str, splitter: Option<&Splitter>) -> anyhow::Result<()> { - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to sync chunks")?; - - let pool = self.get_pool().await?; - - let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); - let documents_table_name = format!("{}.documents", project_info.name); - let json_key_query = format!("document->>'{}'", key); - - if let Some(splitter) = splitter { - let splitter_database_data = splitter - .database_data - .as_ref() - .context("Splitter must be verified to sync chunks")?; - - sqlx::query(&query_builder!( - queries::GENERATE_CHUNKS, 
- &chunks_table_name, - &json_key_query, - documents_table_name, - &chunks_table_name - )) - .bind(splitter_database_data.id) - .execute(&pool) - .await?; - } else { - sqlx::query(&query_builder!( - r#" - INSERT INTO %s( - document_id, chunk_index, chunk - ) - SELECT - id, - 1, - %d - FROM %s - WHERE id NOT IN (SELECT document_id FROM %s) - ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk - RETURNING id - "#, - &chunks_table_name, - &json_key_query, - &documents_table_name, - &chunks_table_name - )) - .execute(&pool) - .await?; - } - Ok(()) - } - - #[instrument(skip(self))] - async fn resync_embeddings(&self, key: &str, model: &Model) -> anyhow::Result<()> { - let pool = self.get_pool().await?; - - // Remove the stored name from the parameters - let mut parameters = model.parameters.clone(); - parameters - .as_object_mut() - .context("Model parameters must be an object")? - .remove("name"); - - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to sync chunks")?; - - let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); - let embeddings_table_name = - format!("{}_{}.{}_embeddings", project_info.name, self.name, key); - - match model.runtime { - ModelRuntime::Python => { - sqlx::query(&query_builder!( - queries::GENERATE_EMBEDDINGS, - embeddings_table_name, - chunks_table_name, - embeddings_table_name - )) - .bind(&model.name) - .bind(¶meters) - .execute(&pool) - .await?; - } - r => { - let remote_embeddings = build_remote_embeddings(r, &model.name, Some(¶meters))?; - remote_embeddings - .generate_embeddings( - &embeddings_table_name, - &chunks_table_name, - None, - PoolOrArcMutextTransaction::Pool(pool), - ) - .await?; - } - } - Ok(()) - } - - #[instrument(skip(self))] - async fn resync_tsvectors(&self, key: &str, configuration: &str) -> anyhow::Result<()> { - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to sync TSVectors")?; - - let pool = self.get_pool().await?; - - let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); - let tsvectors_table_name = format!("{}_{}.{}_tsvectors", project_info.name, self.name, key); - - sqlx::query(&query_builder!( - queries::GENERATE_TSVECTORS, - tsvectors_table_name, - configuration, - chunks_table_name, - tsvectors_table_name - )) - .execute(&pool) - .await?; - Ok(()) - } - - #[instrument(skip(self))] - pub async fn to_dict(&mut self) -> anyhow::Result { - self.verify_in_database(false).await?; - self.schema - .as_ref() - .context("Pipeline must have schema set to call to_dict") - .map(|v| v.to_owned()) - } - - async fn get_pool(&self) -> anyhow::Result { - let database_url = &self - .project_info - .as_ref() - .context("Project info required to call method pipeline.get_pool()")? 
- .database_url; - get_or_initialize_pool(database_url).await - } - - #[instrument(skip(self))] - pub(crate) fn set_project_info(&mut self, project_info: ProjectInfo) { - if let Some(parsed_schema) = &mut self.parsed_schema { - for (_key, value) in parsed_schema.iter_mut() { - if let Some(splitter) = &mut value.splitter { - splitter.model.set_project_info(project_info.clone()); - } - if let Some(embed) = &mut value.semantic_search { - embed.model.set_project_info(project_info.clone()); - } - } - } - self.project_info = Some(project_info); - } - - #[instrument] - pub(crate) async fn create_multi_field_pipelines_table( - project_info: &ProjectInfo, - conn: &mut PgConnection, - ) -> anyhow::Result<()> { - let pipelines_table_name = format!("{}.pipelines", project_info.name); - sqlx::query(&query_builder!( - queries::CREATE_MULTI_FIELD_PIPELINES_TABLE, - pipelines_table_name - )) - .execute(&mut *conn) - .await?; - conn.execute( - query_builder!( - queries::CREATE_INDEX, - "", - "pipeline_name_index", - pipelines_table_name, - "name" - ) - .as_str(), - ) - .await?; - Ok(()) - } -} - -impl TryFrom for MultiFieldPipeline { - type Error = anyhow::Error; - fn try_from(value: models::Pipeline) -> anyhow::Result { - let parsed_schema = json_to_schema(&value.schema).unwrap(); - // NOTE: We do not set the database data here even though we have it - // self.verify_in_database() also verifies all models in the schema so we don't want to set it here - Ok(Self { - name: value.name, - schema: Some(value.schema), - parsed_schema: Some(parsed_schema), - project_info: None, - database_data: None, - }) - } -} diff --git a/pgml-sdks/pgml/src/pipeline.rs b/pgml-sdks/pgml/src/pipeline.rs index 2e2db2d2c..1c79cc81c 100644 --- a/pgml-sdks/pgml/src/pipeline.rs +++ b/pgml-sdks/pgml/src/pipeline.rs @@ -1,81 +1,1013 @@ +use anyhow::Context; use rust_bridge::{alias, alias_methods}; +use serde::Deserialize; use serde_json::json; +use sqlx::{Executor, PgConnection, PgPool, Postgres, Transaction}; +use std::collections::HashMap; +use std::sync::Arc; +use tokio::sync::Mutex; +use tracing::instrument; +use crate::remote_embeddings::PoolOrArcMutextTransaction; use crate::{ - model::Model, multi_field_pipeline::MultiFieldPipeline, splitter::Splitter, types::Json, + collection::ProjectInfo, + get_or_initialize_pool, + model::{Model, ModelRuntime}, + models, queries, query_builder, + remote_embeddings::build_remote_embeddings, + splitter::Splitter, + types::{DateTime, Json, TryToNumeric}, }; #[cfg(feature = "python")] -use crate::{ - model::ModelPython, multi_field_pipeline::MultiFieldPipelinePython, splitter::SplitterPython, - types::JsonPython, -}; +use crate::types::JsonPython; + +type ParsedSchema = HashMap; + +#[derive(Deserialize)] +#[serde(deny_unknown_fields)] +struct ValidSplitterAction { + model: Option, + parameters: Option, +} + +#[derive(Deserialize)] +#[serde(deny_unknown_fields)] +struct ValidEmbedAction { + model: String, + source: Option, + parameters: Option, + hnsw: Option, +} + +#[derive(Deserialize, Debug, Clone)] +#[serde(deny_unknown_fields)] +pub struct FullTextSearchAction { + configuration: String, +} + +#[derive(Deserialize)] +#[serde(deny_unknown_fields)] +struct ValidFieldAction { + splitter: Option, + semantic_search: Option, + full_text_search: Option, +} + +#[allow(clippy::upper_case_acronyms)] +#[derive(Debug, Clone)] +pub struct HNSW { + m: u64, + ef_construction: u64, +} + +impl Default for HNSW { + fn default() -> Self { + Self { + m: 16, + ef_construction: 64, + } + } +} + +impl TryFrom for 
HNSW { + type Error = anyhow::Error; + fn try_from(value: Json) -> anyhow::Result { + let m = if !value["m"].is_null() { + value["m"] + .try_to_u64() + .context("hnsw.m must be an integer")? + } else { + 16 + }; + let ef_construction = if !value["ef_construction"].is_null() { + value["ef_construction"] + .try_to_u64() + .context("hnsw.ef_construction must be an integer")? + } else { + 64 + }; + Ok(Self { m, ef_construction }) + } +} + +#[derive(Debug, Clone)] +pub struct SplitterAction { + pub model: Splitter, +} + +#[derive(Debug, Clone)] +pub struct SemanticSearchAction { + pub model: Model, + pub hnsw: HNSW, +} + +#[derive(Debug, Clone)] +pub struct FieldAction { + pub splitter: Option, + pub semantic_search: Option, + pub full_text_search: Option, +} + +impl TryFrom for FieldAction { + type Error = anyhow::Error; + fn try_from(value: ValidFieldAction) -> Result { + let embed = value + .semantic_search + .map(|v| { + let model = Model::new(Some(v.model), v.source, v.parameters); + let hnsw = v + .hnsw + .map(HNSW::try_from) + .unwrap_or_else(|| Ok(HNSW::default()))?; + anyhow::Ok(SemanticSearchAction { model, hnsw }) + }) + .transpose()?; + let splitter = value + .splitter + .map(|v| { + let splitter = Splitter::new(v.model, v.parameters); + anyhow::Ok(SplitterAction { model: splitter }) + }) + .transpose()?; + Ok(Self { + splitter, + semantic_search: embed, + full_text_search: value.full_text_search, + }) + } +} + +#[derive(Debug, Clone)] +pub struct InvividualSyncStatus { + pub synced: i64, + pub not_synced: i64, + pub total: i64, +} + +impl From for Json { + fn from(value: InvividualSyncStatus) -> Self { + serde_json::json!({ + "synced": value.synced, + "not_synced": value.not_synced, + "total": value.total, + }) + .into() + } +} + +impl From for InvividualSyncStatus { + fn from(value: Json) -> Self { + Self { + synced: value["synced"] + .as_i64() + .expect("The synced field is not an integer"), + not_synced: value["not_synced"] + .as_i64() + .expect("The not_synced field is not an integer"), + total: value["total"] + .as_i64() + .expect("The total field is not an integer"), + } + } +} -/// A pipeline that processes documents -/// This has been deprecated in favor of [MultiFieldPipeline] -// #[derive(alias, Debug, Clone)] +#[derive(Debug, Clone)] +pub struct PipelineDatabaseData { + pub id: i64, + pub created_at: DateTime, +} + +#[derive(alias, Debug, Clone)] pub struct Pipeline { pub name: String, - pub model: Option, - pub splitter: Option, - pub parameters: Option, + pub schema: Option, + pub parsed_schema: Option, + project_info: Option, + database_data: Option, +} + +fn json_to_schema(schema: &Json) -> anyhow::Result { + schema + .as_object() + .context("Schema object must be a JSON object")? + .iter() + .try_fold(ParsedSchema::new(), |mut acc, (key, value)| { + if acc.contains_key(key) { + Err(anyhow::anyhow!("Schema contains duplicate keys")) + } else { + // First lets deserialize it normally + let action: ValidFieldAction = serde_json::from_value(value.to_owned())?; + // Now lets actually build the models and splitters + acc.insert(key.to_owned(), action.try_into()?); + Ok(acc) + } + }) } -// #[alias_methods(new)] +#[alias_methods(new, get_status, to_dict)] impl Pipeline { - /// Creates a new [Pipeline] - /// - /// # Arguments - /// - /// * `name` - The name of the pipeline - /// * `model` - The pipeline [Model] - /// * `splitter` - The pipeline [Splitter] - /// * `parameters` - The parameters to the pipeline. 
Defaults to None + pub fn new(name: &str, schema: Option) -> anyhow::Result { + let parsed_schema = schema.as_ref().map(json_to_schema).transpose()?; + Ok(Self { + name: name.to_string(), + schema, + parsed_schema, + project_info: None, + database_data: None, + }) + } + + /// Gets the status of the [Pipeline] + /// This includes the status of the chunks, embeddings, and tsvectors /// /// # Example /// /// ``` - /// use pgml::{Pipeline, Model, Splitter}; - /// let model = Model::new(None, None, None); - /// let splitter = Splitter::new(None, None); - /// let pipeline = Pipeline::new("my_splitter", Some(model), Some(splitter), None); + /// use pgml::Collection; + /// + /// async fn example() -> anyhow::Result<()> { + /// let mut collection = Collection::new("my_collection", None); + /// let mut pipeline = collection.get_pipeline("my_pipeline").await?; + /// let status = pipeline.get_status().await?; + /// Ok(()) + /// } /// ``` - pub fn new( - name: &str, - model: Option, - splitter: Option, - parameters: Option, - ) -> MultiFieldPipeline { - let parameters = parameters.unwrap_or_default(); - let schema = if let Some(model) = model { - let mut schema = json!({ - "text": { - "embed": { - "model": model.name, - "parameters": model.parameters, - "hnsw": parameters["hnsw"] - } - } - }); - if let Some(splitter) = splitter { - schema["text"]["splitter"] = json!({ - "model": splitter.name, - "parameters": splitter.parameters + #[instrument(skip(self))] + pub async fn get_status(&mut self) -> anyhow::Result { + self.verify_in_database(false).await?; + let parsed_schema = self + .parsed_schema + .as_ref() + .context("Pipeline must have schema to get status")?; + let project_info = self + .project_info + .as_ref() + .context("Pipeline must have project info to get status")?; + let pool = self.get_pool().await?; + + let mut results = json!({}); + + let schema = format!("{}_{}", project_info.name, self.name); + let documents_table_name = format!("{}.documents", project_info.name); + for (key, value) in parsed_schema.iter() { + let chunks_table_name = format!("{schema}.{key}_chunks"); + + results[key] = json!({}); + + if value.splitter.is_some() { + let chunks_status: (Option, Option) = sqlx::query_as(&query_builder!( + "SELECT (SELECT COUNT(DISTINCT document_id) FROM %s), COUNT(id) FROM %s", + chunks_table_name, + documents_table_name + )) + .fetch_one(&pool) + .await?; + results[key]["chunks"] = json!({ + "synced": chunks_status.0.unwrap_or(0), + "not_synced": chunks_status.1.unwrap_or(0) - chunks_status.0.unwrap_or(0), + "total": chunks_status.1.unwrap_or(0), }); } - if parameters["full_text_search"]["active"] - .as_bool() - .unwrap_or_default() - { - schema["text"]["full_text_search"] = json!({ - "configuration": parameters["full_text_search"]["configuration"].as_str().map(|v| v.to_string()).unwrap_or_else(|| "english".to_string()) + + if value.semantic_search.is_some() { + let embeddings_table_name = format!("{schema}.{key}_embeddings"); + let embeddings_status: (Option, Option) = + sqlx::query_as(&query_builder!( + "SELECT (SELECT count(*) FROM %s), (SELECT count(*) FROM %s)", + embeddings_table_name, + chunks_table_name + )) + .fetch_one(&pool) + .await?; + results[key]["embeddings"] = json!({ + "synced": embeddings_status.0.unwrap_or(0), + "not_synced": embeddings_status.1.unwrap_or(0) - embeddings_status.0.unwrap_or(0), + "total": embeddings_status.1.unwrap_or(0), + }); + } + + if value.full_text_search.is_some() { + let tsvectors_table_name = format!("{schema}.{key}_tsvectors"); + let 
tsvectors_status: (Option, Option) = sqlx::query_as(&query_builder!( + "SELECT (SELECT count(*) FROM %s), (SELECT count(*) FROM %s)", + tsvectors_table_name, + chunks_table_name + )) + .fetch_one(&pool) + .await?; + results[key]["tsvectors"] = json!({ + "synced": tsvectors_status.0.unwrap_or(0), + "not_synced": tsvectors_status.1.unwrap_or(0) - tsvectors_status.0.unwrap_or(0), + "total": tsvectors_status.1.unwrap_or(0), }); } - Some(schema.into()) + } + Ok(results.into()) + } + + #[instrument(skip(self))] + pub(crate) async fn verify_in_database(&mut self, throw_if_exists: bool) -> anyhow::Result<()> { + if self.database_data.is_none() { + let pool = self.get_pool().await?; + + let project_info = self + .project_info + .as_ref() + .context("Cannot verify pipeline without project info")?; + + let pipeline: Option = sqlx::query_as(&query_builder!( + "SELECT * FROM %s WHERE name = $1", + format!("{}.pipelines", project_info.name) + )) + .bind(&self.name) + .fetch_optional(&pool) + .await?; + + let pipeline = if let Some(pipeline) = pipeline { + if throw_if_exists { + anyhow::bail!("Pipeline {} already exists. You do not need to add this pipeline to the collection as it has already been added.", pipeline.name); + } + + let mut parsed_schema = json_to_schema(&pipeline.schema)?; + + for (_key, value) in parsed_schema.iter_mut() { + if let Some(splitter) = &mut value.splitter { + splitter.model.set_project_info(project_info.clone()); + splitter.model.verify_in_database(false).await?; + } + if let Some(embed) = &mut value.semantic_search { + embed.model.set_project_info(project_info.clone()); + embed.model.verify_in_database(false).await?; + } + } + self.schema = Some(pipeline.schema.clone()); + self.parsed_schema = Some(parsed_schema.clone()); + + pipeline + } else { + let schema = self + .schema + .as_ref() + .context("Pipeline must have schema to store in database")?; + let mut parsed_schema = json_to_schema(schema)?; + + for (_key, value) in parsed_schema.iter_mut() { + if let Some(splitter) = &mut value.splitter { + splitter.model.set_project_info(project_info.clone()); + splitter.model.verify_in_database(false).await?; + } + if let Some(embed) = &mut value.semantic_search { + embed.model.set_project_info(project_info.clone()); + embed.model.verify_in_database(false).await?; + } + } + self.parsed_schema = Some(parsed_schema); + + // Here we actually insert the pipeline into the collection.pipelines table + // and create the collection_pipeline schema and required tables + let mut transaction = pool.begin().await?; + let pipeline = sqlx::query_as(&query_builder!( + "INSERT INTO %s (name, schema) VALUES ($1, $2) RETURNING *", + format!("{}.pipelines", project_info.name) + )) + .bind(&self.name) + .bind(&self.schema) + .fetch_one(&mut *transaction) + .await?; + self.create_tables(&mut transaction).await?; + transaction.commit().await?; + + pipeline + }; + self.database_data = Some(PipelineDatabaseData { + id: pipeline.id, + created_at: pipeline.created_at, + }) + } + Ok(()) + } + + #[instrument(skip(self))] + async fn create_tables( + &mut self, + transaction: &mut Transaction<'static, Postgres>, + ) -> anyhow::Result<()> { + let project_info = self + .project_info + .as_ref() + .context("Pipeline must have project info to create_or_get_tables")?; + let collection_name = &project_info.name; + let documents_table_name = format!("{}.documents", collection_name); + + let schema = format!("{}_{}", collection_name, self.name); + + transaction + .execute(query_builder!("CREATE SCHEMA IF NOT EXISTS 
%s", schema).as_str()) + .await?; + + let parsed_schema = self + .parsed_schema + .as_ref() + .context("Pipeline must have schema to create_tables")?; + + for (key, value) in parsed_schema.iter() { + let chunks_table_name = format!("{}.{}_chunks", schema, key); + transaction + .execute( + query_builder!( + queries::CREATE_CHUNKS_TABLE, + chunks_table_name, + documents_table_name + ) + .as_str(), + ) + .await?; + let index_name = format!("{}_pipeline_chunk_document_id_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX, + "", + index_name, + chunks_table_name, + "document_id" + ) + .as_str(), + ) + .await?; + + if let Some(embed) = &value.semantic_search { + let embeddings_table_name = format!("{}.{}_embeddings", schema, key); + let embedding_length = match &embed.model.runtime { + ModelRuntime::Python => { + let embedding: (Vec,) = sqlx::query_as( + "SELECT embedding from pgml.embed(transformer => $1, text => 'Hello, World!', kwargs => $2) as embedding") + .bind(&embed.model.name) + .bind(&embed.model.parameters) + .fetch_one(&mut *transaction).await?; + embedding.0.len() as i64 + } + t => { + let remote_embeddings = build_remote_embeddings( + t.to_owned(), + &embed.model.name, + Some(&embed.model.parameters), + )?; + remote_embeddings.get_embedding_size().await? + } + }; + + // Create the embeddings table + sqlx::query(&query_builder!( + queries::CREATE_EMBEDDINGS_TABLE, + &embeddings_table_name, + chunks_table_name, + documents_table_name, + embedding_length + )) + .execute(&mut *transaction) + .await?; + let index_name = format!("{}_pipeline_embedding_chunk_id_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX, + "", + index_name, + &embeddings_table_name, + "chunk_id" + ) + .as_str(), + ) + .await?; + let index_name = format!("{}_pipeline_embedding_document_id_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX, + "", + index_name, + &embeddings_table_name, + "document_id" + ) + .as_str(), + ) + .await?; + let index_with_parameters = format!( + "WITH (m = {}, ef_construction = {})", + embed.hnsw.m, embed.hnsw.ef_construction + ); + let index_name = format!("{}_pipeline_embedding_hnsw_vector_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX_USING_HNSW, + "", + index_name, + &embeddings_table_name, + "embedding vector_cosine_ops", + index_with_parameters + ) + .as_str(), + ) + .await?; + } + + // Create the tsvectors table + if value.full_text_search.is_some() { + let tsvectors_table_name = format!("{}.{}_tsvectors", schema, key); + transaction + .execute( + query_builder!( + queries::CREATE_CHUNKS_TSVECTORS_TABLE, + tsvectors_table_name, + chunks_table_name, + documents_table_name + ) + .as_str(), + ) + .await?; + let index_name = format!("{}_pipeline_tsvector_chunk_id_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX, + "", + index_name, + tsvectors_table_name, + "chunk_id" + ) + .as_str(), + ) + .await?; + let index_name = format!("{}_pipeline_tsvector_document_id_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX, + "", + index_name, + tsvectors_table_name, + "document_id" + ) + .as_str(), + ) + .await?; + let index_name = format!("{}_pipeline_tsvector_index", key); + transaction + .execute( + query_builder!( + queries::CREATE_INDEX_USING_GIN, + "", + index_name, + tsvectors_table_name, + "ts" + ) + .as_str(), + ) + .await?; + } + } + Ok(()) + } + + #[instrument(skip(self))] + pub(crate) async fn 
sync_documents( + &mut self, + document_ids: Vec, + transaction: Arc>>, + ) -> anyhow::Result<()> { + self.verify_in_database(false).await?; + + // We are assuming we have manually verified the pipeline before doing this + let parsed_schema = self + .parsed_schema + .as_ref() + .context("Pipeline must have schema to execute")?; + + for (key, value) in parsed_schema.iter() { + let chunk_ids = self + .sync_chunks_for_documents( + key, + value.splitter.as_ref().map(|v| &v.model), + &document_ids, + transaction.clone(), + ) + .await?; + if !chunk_ids.is_empty() { + if let Some(embed) = &value.semantic_search { + self.sync_embeddings_for_chunks( + key, + &embed.model, + &chunk_ids, + transaction.clone(), + ) + .await?; + } + if let Some(full_text_search) = &value.full_text_search { + self.sync_tsvectors_for_chunks( + key, + &full_text_search.configuration, + &chunk_ids, + transaction.clone(), + ) + .await?; + } + } + } + Ok(()) + } + + #[instrument(skip(self))] + async fn sync_chunks_for_documents( + &self, + key: &str, + splitter: Option<&Splitter>, + document_ids: &Vec, + transaction: Arc>>, + ) -> anyhow::Result> { + let project_info = self + .project_info + .as_ref() + .context("Pipeline must have project info to sync chunks")?; + + let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); + let documents_table_name = format!("{}.documents", project_info.name); + let json_key_query = format!("document->>'{}'", key); + + if let Some(splitter) = splitter { + let splitter_database_data = splitter + .database_data + .as_ref() + .context("Splitter must be verified to sync chunks")?; + + sqlx::query(&query_builder!( + queries::GENERATE_CHUNKS_FOR_DOCUMENT_IDS, + &chunks_table_name, + &json_key_query, + documents_table_name + )) + .bind(splitter_database_data.id) + .bind(document_ids) + .execute(&mut *transaction.lock().await) + .await?; + + sqlx::query_scalar(&query_builder!( + "SELECT id FROM %s WHERE document_id = ANY($1)", + &chunks_table_name + )) + .bind(document_ids) + .fetch_all(&mut *transaction.lock().await) + .await + .map_err(anyhow::Error::msg) } else { - None - }; - MultiFieldPipeline::new(name, schema) - .expect("Error converting pipeline into new multifield pipeline") + sqlx::query_scalar(&query_builder!( + r#" + INSERT INTO %s( + document_id, chunk_index, chunk + ) + SELECT + id, + 1, + %d + FROM %s + WHERE id = ANY($1) + ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk + RETURNING id + "#, + &chunks_table_name, + &json_key_query, + &documents_table_name + )) + .bind(document_ids) + .fetch_all(&mut *transaction.lock().await) + .await + .map_err(anyhow::Error::msg) + } + } + + #[instrument(skip(self))] + async fn sync_embeddings_for_chunks( + &self, + key: &str, + model: &Model, + chunk_ids: &Vec, + transaction: Arc>>, + ) -> anyhow::Result<()> { + // Remove the stored name from the parameters + let mut parameters = model.parameters.clone(); + parameters + .as_object_mut() + .context("Model parameters must be an object")? 
+ .remove("name"); + + let project_info = self + .project_info + .as_ref() + .context("Pipeline must have project info to sync chunks")?; + + let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); + let embeddings_table_name = + format!("{}_{}.{}_embeddings", project_info.name, self.name, key); + + match model.runtime { + ModelRuntime::Python => { + sqlx::query(&query_builder!( + queries::GENERATE_EMBEDDINGS_FOR_CHUNK_IDS, + embeddings_table_name, + chunks_table_name + )) + .bind(&model.name) + .bind(¶meters) + .bind(chunk_ids) + .execute(&mut *transaction.lock().await) + .await?; + } + r => { + let remote_embeddings = build_remote_embeddings(r, &model.name, Some(¶meters))?; + remote_embeddings + .generate_embeddings( + &embeddings_table_name, + &chunks_table_name, + Some(chunk_ids), + PoolOrArcMutextTransaction::ArcMutextTransaction(transaction), + ) + .await?; + } + } + Ok(()) + } + + #[instrument(skip(self))] + async fn sync_tsvectors_for_chunks( + &self, + key: &str, + configuration: &str, + chunk_ids: &Vec, + transaction: Arc>>, + ) -> anyhow::Result<()> { + let project_info = self + .project_info + .as_ref() + .context("Pipeline must have project info to sync TSVectors")?; + + let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); + let tsvectors_table_name = format!("{}_{}.{}_tsvectors", project_info.name, self.name, key); + + sqlx::query(&query_builder!( + queries::GENERATE_TSVECTORS_FOR_CHUNK_IDS, + tsvectors_table_name, + configuration, + chunks_table_name + )) + .bind(chunk_ids) + .execute(&mut *transaction.lock().await) + .await?; + Ok(()) + } + + #[instrument(skip(self))] + pub(crate) async fn resync(&mut self) -> anyhow::Result<()> { + self.verify_in_database(false).await?; + + // We are assuming we have manually verified the pipeline before doing this + let project_info = self + .project_info + .as_ref() + .context("Pipeline must have project info to sync chunks")?; + let parsed_schema = self + .parsed_schema + .as_ref() + .context("Pipeline must have schema to execute")?; + + // Before doing any syncing, delete all old and potentially outdated documents + let pool = self.get_pool().await?; + for (key, _value) in parsed_schema.iter() { + let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); + pool.execute(query_builder!("DELETE FROM %s CASCADE", chunks_table_name).as_str()) + .await?; + } + + for (key, value) in parsed_schema.iter() { + self.resync_chunks(key, value.splitter.as_ref().map(|v| &v.model)) + .await?; + if let Some(embed) = &value.semantic_search { + self.resync_embeddings(key, &embed.model).await?; + } + if let Some(full_text_search) = &value.full_text_search { + self.resync_tsvectors(key, &full_text_search.configuration) + .await?; + } + } + Ok(()) + } + + #[instrument(skip(self))] + async fn resync_chunks(&self, key: &str, splitter: Option<&Splitter>) -> anyhow::Result<()> { + let project_info = self + .project_info + .as_ref() + .context("Pipeline must have project info to sync chunks")?; + + let pool = self.get_pool().await?; + + let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); + let documents_table_name = format!("{}.documents", project_info.name); + let json_key_query = format!("document->>'{}'", key); + + if let Some(splitter) = splitter { + let splitter_database_data = splitter + .database_data + .as_ref() + .context("Splitter must be verified to sync chunks")?; + + sqlx::query(&query_builder!( + queries::GENERATE_CHUNKS, 
+ &chunks_table_name, + &json_key_query, + documents_table_name, + &chunks_table_name + )) + .bind(splitter_database_data.id) + .execute(&pool) + .await?; + } else { + sqlx::query(&query_builder!( + r#" + INSERT INTO %s( + document_id, chunk_index, chunk + ) + SELECT + id, + 1, + %d + FROM %s + WHERE id NOT IN (SELECT document_id FROM %s) + ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk + RETURNING id + "#, + &chunks_table_name, + &json_key_query, + &documents_table_name, + &chunks_table_name + )) + .execute(&pool) + .await?; + } + Ok(()) + } + + #[instrument(skip(self))] + async fn resync_embeddings(&self, key: &str, model: &Model) -> anyhow::Result<()> { + let pool = self.get_pool().await?; + + // Remove the stored name from the parameters + let mut parameters = model.parameters.clone(); + parameters + .as_object_mut() + .context("Model parameters must be an object")? + .remove("name"); + + let project_info = self + .project_info + .as_ref() + .context("Pipeline must have project info to sync chunks")?; + + let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); + let embeddings_table_name = + format!("{}_{}.{}_embeddings", project_info.name, self.name, key); + + match model.runtime { + ModelRuntime::Python => { + sqlx::query(&query_builder!( + queries::GENERATE_EMBEDDINGS, + embeddings_table_name, + chunks_table_name, + embeddings_table_name + )) + .bind(&model.name) + .bind(¶meters) + .execute(&pool) + .await?; + } + r => { + let remote_embeddings = build_remote_embeddings(r, &model.name, Some(¶meters))?; + remote_embeddings + .generate_embeddings( + &embeddings_table_name, + &chunks_table_name, + None, + PoolOrArcMutextTransaction::Pool(pool), + ) + .await?; + } + } + Ok(()) + } + + #[instrument(skip(self))] + async fn resync_tsvectors(&self, key: &str, configuration: &str) -> anyhow::Result<()> { + let project_info = self + .project_info + .as_ref() + .context("Pipeline must have project info to sync TSVectors")?; + + let pool = self.get_pool().await?; + + let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); + let tsvectors_table_name = format!("{}_{}.{}_tsvectors", project_info.name, self.name, key); + + sqlx::query(&query_builder!( + queries::GENERATE_TSVECTORS, + tsvectors_table_name, + configuration, + chunks_table_name, + tsvectors_table_name + )) + .execute(&pool) + .await?; + Ok(()) + } + + #[instrument(skip(self))] + pub async fn to_dict(&mut self) -> anyhow::Result { + self.verify_in_database(false).await?; + self.schema + .as_ref() + .context("Pipeline must have schema set to call to_dict") + .map(|v| v.to_owned()) + } + + async fn get_pool(&self) -> anyhow::Result { + let database_url = &self + .project_info + .as_ref() + .context("Project info required to call method pipeline.get_pool()")? 
+ .database_url; + get_or_initialize_pool(database_url).await + } + + #[instrument(skip(self))] + pub(crate) fn set_project_info(&mut self, project_info: ProjectInfo) { + if let Some(parsed_schema) = &mut self.parsed_schema { + for (_key, value) in parsed_schema.iter_mut() { + if let Some(splitter) = &mut value.splitter { + splitter.model.set_project_info(project_info.clone()); + } + if let Some(embed) = &mut value.semantic_search { + embed.model.set_project_info(project_info.clone()); + } + } + } + self.project_info = Some(project_info); + } + + #[instrument(skip(self))] + pub(crate) async fn get_parsed_schema(&mut self) -> anyhow::Result { + self.verify_in_database(false).await?; + Ok(self.parsed_schema.as_ref().unwrap().clone()) + } + + #[instrument] + pub(crate) async fn create_pipelines_table( + project_info: &ProjectInfo, + conn: &mut PgConnection, + ) -> anyhow::Result<()> { + let pipelines_table_name = format!("{}.pipelines", project_info.name); + sqlx::query(&query_builder!( + queries::PIPELINES_TABLE, + pipelines_table_name + )) + .execute(&mut *conn) + .await?; + conn.execute( + query_builder!( + queries::CREATE_INDEX, + "", + "pipeline_name_index", + pipelines_table_name, + "name" + ) + .as_str(), + ) + .await?; + Ok(()) + } +} + +impl TryFrom for Pipeline { + type Error = anyhow::Error; + fn try_from(value: models::Pipeline) -> anyhow::Result { + let parsed_schema = json_to_schema(&value.schema).unwrap(); + // NOTE: We do not set the database data here even though we have it + // self.verify_in_database() also verifies all models in the schema so we don't want to set it here + Ok(Self { + name: value.name, + schema: Some(value.schema), + parsed_schema: Some(parsed_schema), + project_info: None, + database_data: None, + }) } } diff --git a/pgml-sdks/pgml/src/queries.rs b/pgml-sdks/pgml/src/queries.rs index 4d682ea48..c84513c75 100644 --- a/pgml-sdks/pgml/src/queries.rs +++ b/pgml-sdks/pgml/src/queries.rs @@ -13,7 +13,7 @@ CREATE TABLE IF NOT EXISTS pgml.collections ( ); "#; -pub const CREATE_MULTI_FIELD_PIPELINES_TABLE: &str = r#" +pub const PIPELINES_TABLE: &str = r#" CREATE TABLE IF NOT EXISTS %s ( id serial8 PRIMARY KEY, name text NOT NULL, @@ -207,7 +207,7 @@ FROM ON CONFLICT (document_id, chunk_index) DO NOTHING "#; -pub const GENERATE_CHUNKS_FOR_DOCUMENT_ID: &str = r#" +pub const GENERATE_CHUNKS_FOR_DOCUMENT_IDS: &str = r#" WITH splitter as ( SELECT name, @@ -234,7 +234,7 @@ FROM (SELECT parameters FROM splitter) ) AS chunk FROM - %s WHERE id = $2 + %s WHERE id = ANY($2) ) chunks ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk RETURNING id diff --git a/pgml-sdks/pgml/src/query_builder.rs b/pgml-sdks/pgml/src/query_builder.rs index f0fd708e2..4250f9db1 100644 --- a/pgml-sdks/pgml/src/query_builder.rs +++ b/pgml-sdks/pgml/src/query_builder.rs @@ -7,16 +7,16 @@ use rust_bridge::{alias, alias_methods}; use serde_json::json; use tracing::instrument; -use crate::{multi_field_pipeline::MultiFieldPipeline, types::Json, Collection}; +use crate::{pipeline::Pipeline, types::Json, Collection}; #[cfg(feature = "python")] -use crate::{multi_field_pipeline::MultiFieldPipelinePython, types::JsonPython}; +use crate::{pipeline::PipelinePython, types::JsonPython}; #[derive(alias, Clone, Debug)] pub struct QueryBuilder { collection: Collection, query: Json, - pipeline: Option, + pipeline: Option, } #[alias_methods(limit, filter, vector_recall, to_full_string, fetch_all)] @@ -65,7 +65,7 @@ impl QueryBuilder { pub fn vector_recall( mut self, query: &str, - pipeline: 
&MultiFieldPipeline, + pipeline: &Pipeline, query_parameters: Option, ) -> Self { self.pipeline = Some(pipeline.clone()); diff --git a/pgml-sdks/pgml/src/search_query_builder.rs b/pgml-sdks/pgml/src/search_query_builder.rs index afae9db46..ca0dbb645 100644 --- a/pgml-sdks/pgml/src/search_query_builder.rs +++ b/pgml-sdks/pgml/src/search_query_builder.rs @@ -13,12 +13,13 @@ use crate::{ filter_builder::FilterBuilder, model::ModelRuntime, models, - multi_field_pipeline::MultiFieldPipeline, + pipeline::Pipeline, remote_embeddings::build_remote_embeddings, types::{IntoTableNameAndSchema, Json, SIden}, }; #[derive(Debug, Deserialize)] +#[serde(deny_unknown_fields)] struct ValidSemanticSearchAction { query: String, parameters: Option, @@ -26,12 +27,14 @@ struct ValidSemanticSearchAction { } #[derive(Debug, Deserialize)] +#[serde(deny_unknown_fields)] struct ValidFullTextSearchAction { query: String, boost: Option, } #[derive(Debug, Deserialize)] +#[serde(deny_unknown_fields)] struct ValidQueryActions { full_text_search: Option>, semantic_search: Option>, @@ -39,6 +42,7 @@ struct ValidQueryActions { } #[derive(Debug, Deserialize)] +#[serde(deny_unknown_fields)] struct ValidQuery { query: ValidQueryActions, // Need this when coming from JavaScript as everything is an f64 from JS @@ -49,7 +53,7 @@ struct ValidQuery { pub async fn build_search_query( collection: &Collection, query: Json, - pipeline: &MultiFieldPipeline, + pipeline: &Pipeline, ) -> anyhow::Result<(String, SqlxValues)> { let valid_query: ValidQuery = serde_json::from_value(query.0)?; let limit = valid_query.limit.unwrap_or(10); diff --git a/pgml-sdks/pgml/src/single_field_pipeline.rs b/pgml-sdks/pgml/src/single_field_pipeline.rs new file mode 100644 index 000000000..24285cbea --- /dev/null +++ b/pgml-sdks/pgml/src/single_field_pipeline.rs @@ -0,0 +1,81 @@ +use rust_bridge::{alias, alias_methods}; +use serde_json::json; + +use crate::{ + model::Model, pipeline::Pipeline, splitter::Splitter, types::Json, +}; + +#[cfg(feature = "python")] +use crate::{ + model::ModelPython, pipeline::PipelinePython, splitter::SplitterPython, + types::JsonPython, +}; + +/// A pipeline that processes documents +/// This has been deprecated in favor of [Pipeline] +// #[derive(alias, Debug, Clone)] +pub struct SingleFieldPipeline { + pub name: String, + pub model: Option, + pub splitter: Option, + pub parameters: Option, +} + +// #[alias_methods(new)] +impl SingleFieldPipeline { + /// Creates a new [Pipeline] + /// + /// # Arguments + /// + /// * `name` - The name of the pipeline + /// * `model` - The pipeline [Model] + /// * `splitter` - The pipeline [Splitter] + /// * `parameters` - The parameters to the pipeline. 
Defaults to None + /// + /// # Example + /// + /// ``` + /// use pgml::{Pipeline, Model, Splitter}; + /// let model = Model::new(None, None, None); + /// let splitter = Splitter::new(None, None); + /// let pipeline = Pipeline::new("my_splitter", Some(model), Some(splitter), None); + /// ``` + pub fn new( + name: &str, + model: Option, + splitter: Option, + parameters: Option, + ) -> Pipeline { + let parameters = parameters.unwrap_or_default(); + let schema = if let Some(model) = model { + let mut schema = json!({ + "text": { + "embed": { + "model": model.name, + "parameters": model.parameters, + "hnsw": parameters["hnsw"] + } + } + }); + if let Some(splitter) = splitter { + schema["text"]["splitter"] = json!({ + "model": splitter.name, + "parameters": splitter.parameters + }); + } + if parameters["full_text_search"]["active"] + .as_bool() + .unwrap_or_default() + { + schema["text"]["full_text_search"] = json!({ + "configuration": parameters["full_text_search"]["configuration"].as_str().map(|v| v.to_string()).unwrap_or_else(|| "english".to_string()) + }); + } + Some(schema.into()) + } else { + None + }; + Pipeline::new(name, schema) + .expect("Error converting pipeline into new multifield pipeline") + } +} diff --git a/pgml-sdks/pgml/src/splitter.rs b/pgml-sdks/pgml/src/splitter.rs index 7a7503fe2..b15368af9 100644 --- a/pgml-sdks/pgml/src/splitter.rs +++ b/pgml-sdks/pgml/src/splitter.rs @@ -140,20 +140,6 @@ impl Splitter { } } -impl From for Splitter { - fn from(x: models::PipelineWithModelAndSplitter) -> Self { - Self { - name: x.splitter_name, - parameters: x.splitter_parameters, - project_info: None, - database_data: Some(SplitterDatabaseData { - id: x.splitter_id, - created_at: x.splitter_created_at, - }), - } - } -} - impl From for Splitter { fn from(splitter: models::Splitter) -> Self { Self { diff --git a/pgml-sdks/pgml/src/vector_search_query_builder.rs b/pgml-sdks/pgml/src/vector_search_query_builder.rs index 7b609de7b..2af42b9bc 100644 --- a/pgml-sdks/pgml/src/vector_search_query_builder.rs +++ b/pgml-sdks/pgml/src/vector_search_query_builder.rs @@ -13,12 +13,13 @@ use crate::{ filter_builder::FilterBuilder, model::ModelRuntime, models, - multi_field_pipeline::MultiFieldPipeline, + pipeline::Pipeline, remote_embeddings::build_remote_embeddings, types::{IntoTableNameAndSchema, Json, SIden}, }; #[derive(Debug, Deserialize)] +#[serde(deny_unknown_fields)] struct ValidField { query: String, model_parameters: Option, @@ -26,12 +27,14 @@ struct ValidField { } #[derive(Debug, Deserialize)] +#[serde(deny_unknown_fields)] struct ValidQueryActions { fields: Option>, filter: Option, } #[derive(Debug, Deserialize)] +#[serde(deny_unknown_fields)] struct ValidQuery { query: ValidQueryActions, // Need this when coming from JavaScript as everything is an f64 from JS @@ -42,7 +45,7 @@ struct ValidQuery { pub async fn build_vector_search_query( query: Json, collection: &Collection, - pipeline: &MultiFieldPipeline, + pipeline: &Pipeline, ) -> anyhow::Result<(String, SqlxValues)> { let valid_query: ValidQuery = serde_json::from_value(query.0)?; let limit = valid_query.limit.unwrap_or(10); From 978176651b785cbaf675ac10cde36391a313f0d0 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Thu, 25 Jan 2024 15:29:08 -0800 Subject: [PATCH 18/72] Added SingleFieldPipeline function shoutout to Lev --- pgml-sdks/pgml/build.rs | 2 + .../javascript/tests/typescript-tests/test.ts | 49 ++-- pgml-sdks/pgml/python/tests/test.py | 38 ++-- pgml-sdks/pgml/src/lib.rs | 77 
+------ pgml-sdks/pgml/src/single_field_pipeline.rs | 212 ++++++++++++------ 5 files changed, 192 insertions(+), 186 deletions(-) diff --git a/pgml-sdks/pgml/build.rs b/pgml-sdks/pgml/build.rs index ccb6f3a22..06e66271e 100644 --- a/pgml-sdks/pgml/build.rs +++ b/pgml-sdks/pgml/build.rs @@ -4,6 +4,7 @@ use std::io::Write; const ADDITIONAL_DEFAULTS_FOR_PYTHON: &[u8] = br#" def init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None +def SingleFieldPipeline(name: str, model: Optional[Model] = None, splitter: Optional[Splitter] = None, parameters: Optional[Json] = Any) -> MultiFieldPipeline async def migrate() -> None Json = Any @@ -14,6 +15,7 @@ GeneralJsonAsyncIterator = Any const ADDITIONAL_DEFAULTS_FOR_JAVASCRIPT: &[u8] = br#" export function init_logger(level?: string, format?: string): void; +export function newSingleFieldPipeline(name: string, model?: Model, splitter?: Splitter, parameters?: Json): MultiFieldPipeline; export function migrate(): Promise; export type Json = any; diff --git a/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts b/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts index c3cbafd76..72fc7bfda 100644 --- a/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts +++ b/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts @@ -52,14 +52,14 @@ it("can create splitter", () => { }); it("can create pipeline", () => { - let model = pgml.newModel(); - let splitter = pgml.newSplitter(); - let pipeline = pgml.newPipeline("test_j_p_ccc_0", model, splitter); + let pipeline = pgml.newPipeline("test_j_p_ccp"); expect(pipeline).toBeTruthy(); }); -it("can create multi_field_pipeline", () => { - let pipeline = pgml.newMultiFieldPipeline("test_j_p_ccmfp", {}); +it("can create single field pipeline", () => { + let model = pgml.newModel(); + let splitter = pgml.newSplitter(); + let pipeline = pgml.newSingleFieldPipeline("test_j_p_ccsfp", model, splitter); expect(pipeline).toBeTruthy(); }); @@ -73,7 +73,7 @@ it("can create builtins", () => { /////////////////////////////////////////////////// it("can search", async () => { - let pipeline = pgml.newMultiFieldPipeline("test_j_p_cs", { + let pipeline = pgml.newPipeline("test_j_p_cs", { title: { semantic_search: { model: "intfloat/e5-small" } }, body: { splitter: { model: "recursive_character" }, @@ -96,7 +96,7 @@ it("can search", async () => { body: { query: "This is the body test", boost: 1.01 }, }, filter: { id: { $gt: 1 } }, - }, + }, limit: 10 }, pipeline, @@ -112,7 +112,7 @@ it("can search", async () => { it("can vector search", async () => { - let pipeline = pgml.newMultiFieldPipeline("test_j_p_cvs_0", { + let pipeline = pgml.newPipeline("test_j_p_cvs_0", { title: { semantic_search: { model: "intfloat/e5-small" }, full_text_search: { configuration: "english" }, @@ -146,21 +146,22 @@ it("can vector search", async () => { await collection.archive(); }); -// it("can vector search with query builder", async () => { -// let model = pgml.newModel(); -// let splitter = pgml.newSplitter(); -// let pipeline = pgml.newPipeline("test_j_p_cvswqb_0", model, splitter); -// let collection = pgml.newCollection("test_j_c_cvswqb_1"); -// await collection.upsert_documents(generate_dummy_documents(3)); -// await collection.add_pipeline(pipeline); -// let results = await collection -// .query() -// .vector_recall("Here is some query", pipeline) -// .limit(10) -// .fetch_all(); -// expect(results).toHaveLength(3); -// await collection.archive(); -// }); +it("can vector search with query builder", async () => { + let 
model = pgml.newModel(); + let splitter = pgml.newSplitter(); + let pipeline = pgml.newSingleFieldPipeline("test_j_p_cvswqb_0", model, splitter); + let collection = pgml.newCollection("test_j_c_cvswqb_2"); + await collection.upsert_documents(generate_dummy_documents(3)); + await collection.add_pipeline(pipeline); + let results = await collection + .query() + .vector_recall("Here is some query", pipeline) + .limit(10) + .fetch_all(); + let ids = results.map(r => r[2]["id"]); + expect(ids).toEqual([2, 1, 0]); + await collection.archive(); +}); /////////////////////////////////////////////////// // Test user output facing functions ////////////// @@ -180,7 +181,7 @@ it("pipeline to dict", async () => { }, }, } - let pipeline = pgml.newMultiFieldPipeline("test_j_p_ptd_0", pipeline_schema); + let pipeline = pgml.newPipeline("test_j_p_ptd_0", pipeline_schema); let collection = pgml.newCollection("test_j_c_ptd_2"); await collection.add_pipeline(pipeline); let pipeline_dict = await pipeline.to_dict(); diff --git a/pgml-sdks/pgml/python/tests/test.py b/pgml-sdks/pgml/python/tests/test.py index beda20a55..a0d4d6031 100644 --- a/pgml-sdks/pgml/python/tests/test.py +++ b/pgml-sdks/pgml/python/tests/test.py @@ -62,14 +62,14 @@ def test_can_create_splitter(): def test_can_create_pipeline(): - model = pgml.Model() - splitter = pgml.Splitter() - pipeline = pgml.Pipeline("test_p_p_tccp_0", model, splitter) + pipeline = pgml.Pipeline("test_p_p_tccp_0", {}) assert pipeline is not None - -def test_can_create_multi_field_pipeline(): - pipeline = pgml.MultiFieldPipeline("test_p_p_tccmfp_0", {}) + +def test_can_create_single_field_pipeline(): + model = pgml.Model() + splitter = pgml.Splitter() + pipeline = pgml.SingleFieldPipeline("test_p_p_tccsfp_0", model, splitter, {}) assert pipeline is not None @@ -85,7 +85,7 @@ def test_can_create_builtins(): @pytest.mark.asyncio async def test_can_search(): - pipeline = pgml.MultiFieldPipeline( + pipeline = pgml.Pipeline( "test_p_p_tcs_0", { "title": {"semantic_search": {"model": "intfloat/e5-small"}}, @@ -128,19 +128,12 @@ async def test_can_search(): @pytest.mark.asyncio async def test_can_vector_search(): - pipeline = pgml.MultiFieldPipeline( + pipeline = pgml.Pipeline( "test_p_p_tcvs_0", { - "title": { - "semantic_search": {"model": "intfloat/e5-small"}, - "full_text_search": {"configuration": "english"}, - }, - "body": { + "text": { "splitter": {"model": "recursive_character"}, - "semantic_search": { - "model": "text-embedding-ada-002", - "source": "openai", - }, + "semantic_search": {"model": "intfloat/e5-small"}, }, }, ) @@ -169,7 +162,7 @@ async def test_can_vector_search(): async def test_can_vector_search_with_query_builder(): model = pgml.Model() splitter = pgml.Splitter() - pipeline = pgml.Pipeline("test_p_p_tcvswqb_1", model, splitter) + pipeline = pgml.SingleFieldPipeline("test_p_p_tcvswqb_1", model, splitter) collection = pgml.Collection(name="test_p_c_tcvswqb_5") await collection.upsert_documents(generate_dummy_documents(3)) await collection.add_pipeline(pipeline) @@ -179,11 +172,8 @@ async def test_can_vector_search_with_query_builder(): .limit(10) .fetch_all() ) - for result in results: - print() - print(result) - print() - assert len(results) == 3 + ids = [document["id"] for (_, _, document) in results] + assert ids == [2, 1, 0] await collection.archive() @@ -207,7 +197,7 @@ async def test_pipeline_to_dict(): }, }, } - pipeline = pgml.MultiFieldPipeline("test_p_p_tptd_0", pipeline_schema) + pipeline = pgml.Pipeline("test_p_p_tptd_0", pipeline_schema) 
collection = pgml.Collection("test_p_c_tptd_3") await collection.add_pipeline(pipeline) pipeline_dict = await pipeline.to_dict() diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 568800bc7..e0f5240a0 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -29,6 +29,7 @@ mod query_builder; mod query_runner; mod remote_embeddings; mod search_query_builder; +mod single_field_pipeline; mod splitter; pub mod transformer_pipeline; pub mod types; @@ -161,6 +162,10 @@ fn pgml(_py: pyo3::Python, m: &pyo3::types::PyModule) -> pyo3::PyResult<()> { m.add_function(pyo3::wrap_pyfunction!(init_logger, m)?)?; m.add_function(pyo3::wrap_pyfunction!(migrate, m)?)?; m.add_function(pyo3::wrap_pyfunction!(cli::cli, m)?)?; + m.add_function(pyo3::wrap_pyfunction!( + single_field_pipeline::SingleFieldPipeline, + m + )?)?; m.add_class::()?; m.add_class::()?; m.add_class::()?; @@ -208,6 +213,10 @@ fn migrate( fn main(mut cx: neon::context::ModuleContext) -> neon::result::NeonResult<()> { cx.export_function("init_logger", init_logger)?; cx.export_function("migrate", migrate)?; + cx.export_function( + "newSingleFieldPipeline", + single_field_pipeline::SingleFieldPipeline, + )?; cx.export_function("cli", cli::cli)?; cx.export_function("newCollection", collection::CollectionJavascript::new)?; cx.export_function("newModel", model::ModelJavascript::new)?; @@ -1652,74 +1661,6 @@ mod tests { Ok(()) } - /////////////////////////////// - // Pipeline -> MultiFieldPIpeline - /////////////////////////////// - - // #[test] - // fn pipeline_to_pipeline() -> anyhow::Result<()> { - // let model = Model::new( - // Some("test_model".to_string()), - // Some("pgml".to_string()), - // Some( - // json!({ - // "test_parameter": 10 - // }) - // .into(), - // ), - // ); - // let splitter = Splitter::new( - // Some("test_splitter".to_string()), - // Some( - // json!({ - // "test_parameter": 11 - // }) - // .into(), - // ), - // ); - // let parameters = json!({ - // "full_text_search": { - // "active": true, - // "configuration": "test_configuration" - // }, - // "hnsw": { - // "m": 16, - // "ef_construction": 64 - // } - // }); - // let pipeline = SingleFieldPipeline::new( - // "test_name", - // Some(model), - // Some(splitter), - // Some(parameters.into()), - // ); - // let schema = json!({ - // "text": { - // "splitter": { - // "model": "test_splitter", - // "parameters": { - // "test_parameter": 11 - // } - // }, - // "semantic_search": { - // "model": "test_model", - // "parameters": { - // "test_parameter": 10 - // }, - // "hnsw": { - // "m": 16, - // "ef_construction": 64 - // } - // }, - // "full_text_search": { - // "configuration": "test_configuration" - // } - // } - // }); - // assert_eq!(schema, pipeline.schema.unwrap().0); - // Ok(()) - // } - /////////////////////////////// // ER Diagram ///////////////// /////////////////////////////// diff --git a/pgml-sdks/pgml/src/single_field_pipeline.rs b/pgml-sdks/pgml/src/single_field_pipeline.rs index 24285cbea..4acba800f 100644 --- a/pgml-sdks/pgml/src/single_field_pipeline.rs +++ b/pgml-sdks/pgml/src/single_field_pipeline.rs @@ -1,81 +1,153 @@ -use rust_bridge::{alias, alias_methods}; -use serde_json::json; +use crate::model::Model; +use crate::splitter::Splitter; +use crate::types::Json; +use crate::Pipeline; -use crate::{ - model::Model, pipeline::Pipeline, splitter::Splitter, types::Json, -}; +#[cfg(feature = "python")] +use crate::{model::ModelPython, splitter::SplitterPython, types::JsonPython}; + +#[allow(dead_code)] +fn build_pipeline( + 
name: &str, + model: Option, + splitter: Option, + parameters: Option, +) -> Pipeline { + let parameters = parameters.unwrap_or_default(); + let schema = if let Some(model) = model { + let mut schema = serde_json::json!({ + "text": { + "semantic_search": { + "model": model.name, + "parameters": model.parameters, + "hnsw": parameters["hnsw"] + } + } + }); + if let Some(splitter) = splitter { + schema["text"]["splitter"] = serde_json::json!({ + "model": splitter.name, + "parameters": splitter.parameters + }); + } + if parameters["full_text_search"]["active"] + .as_bool() + .unwrap_or_default() + { + schema["text"]["full_text_search"] = serde_json::json!({ + "configuration": parameters["full_text_search"]["configuration"].as_str().map(|v| v.to_string()).unwrap_or_else(|| "english".to_string()) + }); + } + Some(schema.into()) + } else { + None + }; + Pipeline::new(name, schema).expect("Error converting pipeline into new multifield pipeline") +} #[cfg(feature = "python")] -use crate::{ - model::ModelPython, pipeline::PipelinePython, splitter::SplitterPython, - types::JsonPython, -}; +#[pyo3::prelude::pyfunction] +#[allow(non_snake_case)] // This doesn't seem to be working +pub fn SingleFieldPipeline( + name: &str, + model: Option, + splitter: Option, + parameters: Option, +) -> Pipeline { + let model = model.map(|m| *m.wrapped); + let splitter = splitter.map(|s| *s.wrapped); + let parameters = parameters.map(|p| p.wrapped); + build_pipeline(name, model, splitter, parameters) +} + +#[cfg(feature = "javascript")] +#[allow(non_snake_case)] +pub fn SingleFieldPipeline<'a>( + mut cx: neon::context::FunctionContext<'a>, +) -> neon::result::JsResult<'a, neon::types::JsValue> { + use rust_bridge::javascript::{FromJsType, IntoJsResult}; + let name = cx.argument(0)?; + let name = String::from_js_type(&mut cx, name)?; -/// A pipeline that processes documents -/// This has been deprecated in favor of [Pipeline] -// #[derive(alias, Debug, Clone)] -pub struct SingleFieldPipeline { - pub name: String, - pub model: Option, - pub splitter: Option, - pub parameters: Option, + let model = cx.argument_opt(1); + let model = >::from_option_js_type(&mut cx, model)?; + + let splitter = cx.argument_opt(2); + let splitter = >::from_option_js_type(&mut cx, splitter)?; + + let parameters = cx.argument_opt(3); + let parameters = >::from_option_js_type(&mut cx, parameters)?; + + let pipeline = build_pipeline(&name, model, splitter, parameters); + let x = crate::pipeline::PipelineJavascript::from(pipeline); + x.into_js_result(&mut cx) } -// #[alias_methods(new)] -impl SingleFieldPipeline { - /// Creates a new [Pipeline] - /// - /// # Arguments - /// - /// * `name` - The name of the pipeline - /// * `model` - The pipeline [Model] - /// * `splitter` - The pipeline [Splitter] - /// * `parameters` - The parameters to the pipeline. 
Defaults to None - /// - /// # Example - /// - /// ``` - /// use pgml::{Pipeline, Model, Splitter}; - /// let model = Model::new(None, None, None); - /// let splitter = Splitter::new(None, None); - /// let pipeline = Pipeline::new("my_splitter", Some(model), Some(splitter), None); - /// ``` - pub fn new( - name: &str, - model: Option, - splitter: Option, - parameters: Option, - ) -> Pipeline { - let parameters = parameters.unwrap_or_default(); - let schema = if let Some(model) = model { - let mut schema = json!({ - "text": { - "embed": { - "model": model.name, - "parameters": model.parameters, - "hnsw": parameters["hnsw"] +mod tests { + #[test] + fn pipeline_to_pipeline() -> anyhow::Result<()> { + use super::*; + use serde_json::json; + + let model = Model::new( + Some("test_model".to_string()), + Some("pgml".to_string()), + Some( + json!({ + "test_parameter": 10 + }) + .into(), + ), + ); + let splitter = Splitter::new( + Some("test_splitter".to_string()), + Some( + json!({ + "test_parameter": 11 + }) + .into(), + ), + ); + let parameters = json!({ + "full_text_search": { + "active": true, + "configuration": "test_configuration" + }, + "hnsw": { + "m": 16, + "ef_construction": 64 + } + }); + let pipeline = build_pipeline( + "test_name", + Some(model), + Some(splitter), + Some(parameters.into()), + ); + let schema = json!({ + "text": { + "splitter": { + "model": "test_splitter", + "parameters": { + "test_parameter": 11 + } + }, + "semantic_search": { + "model": "test_model", + "parameters": { + "test_parameter": 10 + }, + "hnsw": { + "m": 16, + "ef_construction": 64 } + }, + "full_text_search": { + "configuration": "test_configuration" } - }); - if let Some(splitter) = splitter { - schema["text"]["splitter"] = json!({ - "model": splitter.name, - "parameters": splitter.parameters - }); - } - if parameters["full_text_search"]["active"] - .as_bool() - .unwrap_or_default() - { - schema["text"]["full_text_search"] = json!({ - "configuration": parameters["full_text_search"]["configuration"].as_str().map(|v| v.to_string()).unwrap_or_else(|| "english".to_string()) - }); } - Some(schema.into()) - } else { - None - }; - Pipeline::new(name, schema) - .expect("Error converting pipeline into new multifield pipeline") + }); + assert_eq!(schema, pipeline.schema.unwrap().0); + Ok(()) } } From b87a654d64b6df00ae80c8a8ebe57e2a9a807ddb Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Fri, 26 Jan 2024 21:50:32 -0800 Subject: [PATCH 19/72] Working on fixing query --- pgml-sdks/pgml/src/collection.rs | 4 +- pgml-sdks/pgml/src/lib.rs | 193 +++++++++++---------- pgml-sdks/pgml/src/search_query_builder.rs | 154 +++++++++------- 3 files changed, 193 insertions(+), 158 deletions(-) diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index be8eb64a2..78547b600 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -476,7 +476,7 @@ impl Collection { let batch_size = args .get("batch_size") .map(TryToNumeric::try_to_u64) - .unwrap_or(Ok(10))?; + .unwrap_or(Ok(100))?; for batch in documents.chunks(batch_size as usize) { let mut transaction = pool.begin().await?; @@ -550,7 +550,7 @@ impl Collection { .into_inner() .commit() .await?; - progress_bar.inc(1); + progress_bar.inc(batch_size); } progress_bar.println("Done Upserting Documents\n"); progress_bar.finish(); diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index e0f5240a0..416ae3fc4 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ 
b/pgml-sdks/pgml/src/lib.rs @@ -48,7 +48,7 @@ pub use transformer_pipeline::TransformerPipeline; // This is use when inserting collections to set the sdk_version used during creation // This doesn't actually mean the verion of the SDK it was created on, it means the // version it is compatible with -static SDK_VERSION: &str = "0.11.0"; +static SDK_VERSION: &str = "1.0.0"; // Store the database(s) in a global variable so that we can access them from anywhere // This is not necessarily idiomatic Rust, but it is a good way to acomplish what we need @@ -818,77 +818,80 @@ mod tests { #[tokio::test] async fn can_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cswle_72"; + let collection_name = "test_r_c_cswle_78"; let mut collection = Collection::new(collection_name, None); - let documents = generate_dummy_documents(10); - collection.upsert_documents(documents.clone(), None).await?; + // let documents = generate_dummy_documents(10000); + // collection.upsert_documents(documents.clone(), None).await?; let pipeline_name = "test_r_p_cswle_9"; let mut pipeline = Pipeline::new( pipeline_name, Some( json!({ - "title": { - "semantic_search": { - "model": "intfloat/e5-small" - }, - "full_text_search": { - "configuration": "english" - } - }, + // "title": { + // "semantic_search": { + // "model": "intfloat/e5-small" + // }, + // "full_text_search": { + // "configuration": "english" + // } + // }, "body": { "splitter": { "model": "recursive_character" }, "semantic_search": { - "model": "hkunlp/instructor-base", - "parameters": { - "instruction": "Represent the Wikipedia document for retrieval" - } + "model": "intfloat/e5-small" }, + // "semantic_search": { + // "model": "hkunlp/instructor-base", + // "parameters": { + // "instruction": "Represent the Wikipedia document for retrieval" + // } + // }, "full_text_search": { "configuration": "english" } }, - "notes": { - "semantic_search": { - "model": "intfloat/e5-small" - } - } + // "notes": { + // "semantic_search": { + // "model": "intfloat/e5-small" + // } + // } }) .into(), ), )?; - collection.add_pipeline(&mut pipeline).await?; + // collection.add_pipeline(&mut pipeline).await?; let results = collection .search( json!({ "query": { - "full_text_search": { - "title": { - "query": "test 9", - "boost": 4.0 - }, - "body": { - "query": "Test", - "boost": 1.2 - } - }, + // "full_text_search": { + // "title": { + // "query": "test 9", + // "boost": 4.0 + // }, + // "body": { + // "query": "Test", + // "boost": 1.2 + // } + // }, "semantic_search": { - "title": { - "query": "This is a test", - "boost": 2.0 - }, + // "title": { + // "query": "This is a test", + // "boost": 2.0 + // }, "body": { "query": "This is the body test", - "parameters": { - "instruction": "Represent the Wikipedia question for retrieving supporting documents: ", - }, + // "parameters": { + // "instruction": "Represent the Wikipedia question for retrieving supporting documents: ", + // }, "boost": 1.01 }, - "notes": { - "query": "This is the notes test", - "boost": 1.01 - } + // "notes": { + // "query": "This is the notes test", + // "boost": 1.01 + // } }, "filter": { "id": { @@ -1128,58 +1131,58 @@ mod tests { Ok(()) } - // #[tokio::test] - // async fn can_vector_search_with_query_builder() -> anyhow::Result<()> { - // internal_init_logger(None, None).ok(); - // let mut collection = Collection::new("test_r_c_cvswqb_7", None); - // let mut pipeline = Pipeline::new( - // "test_r_p_cvswqb_0", - // Some( - // 
json!({ - // "text": { - // "semantic_search": { - // "model": "intfloat/e5-small" - // }, - // "full_text_search": { - // "configuration": "english" - // } - // }, - // }) - // .into(), - // ), - // )?; - // collection - // .upsert_documents(generate_dummy_documents(10), None) - // .await?; - // collection.add_pipeline(&mut pipeline).await?; - // let results = collection - // .query() - // .vector_recall("test query", &pipeline, None) - // .limit(3) - // .filter( - // json!({ - // "metadata": { - // "id": { - // "$gt": 3 - // } - // }, - // "full_text": { - // "configuration": "english", - // "text": "test" - // } - // }) - // .into(), - // ) - // .fetch_all() - // .await?; - // let ids: Vec = results - // .into_iter() - // .map(|r| r.2["id"].as_u64().unwrap()) - // .collect(); - // assert_eq!(ids, vec![4, 5, 6]); - // collection.archive().await?; - // Ok(()) - // } + #[tokio::test] + async fn can_vector_search_with_query_builder() -> anyhow::Result<()> { + internal_init_logger(None, None).ok(); + let mut collection = Collection::new("test_r_c_cvswqb_7", None); + let mut pipeline = Pipeline::new( + "test_r_p_cvswqb_0", + Some( + json!({ + "text": { + "semantic_search": { + "model": "intfloat/e5-small" + }, + "full_text_search": { + "configuration": "english" + } + }, + }) + .into(), + ), + )?; + collection + .upsert_documents(generate_dummy_documents(10), None) + .await?; + collection.add_pipeline(&mut pipeline).await?; + let results = collection + .query() + .vector_recall("test query", &pipeline, None) + .limit(3) + .filter( + json!({ + "metadata": { + "id": { + "$gt": 3 + } + }, + "full_text": { + "configuration": "english", + "text": "test" + } + }) + .into(), + ) + .fetch_all() + .await?; + let ids: Vec = results + .into_iter() + .map(|r| r.2["id"].as_u64().unwrap()) + .collect(); + assert_eq!(ids, vec![4, 5, 6]); + collection.archive().await?; + Ok(()) + } /////////////////////////////// // Working With Documents ///// diff --git a/pgml-sdks/pgml/src/search_query_builder.rs b/pgml-sdks/pgml/src/search_query_builder.rs index ca0dbb645..f4838d2c8 100644 --- a/pgml-sdks/pgml/src/search_query_builder.rs +++ b/pgml-sdks/pgml/src/search_query_builder.rs @@ -63,7 +63,7 @@ pub async fn build_search_query( let mut query = Query::select(); let mut score_table_names = Vec::new(); - let mut with_clause = WithClause::new(); + let mut with_clause = WithClause::new().recursive(true).to_owned(); let mut sum_expression: Option = None; let mut pipeline_cte = Query::select(); @@ -73,7 +73,7 @@ pub async fn build_search_query( .and_where(Expr::col(models::PipelineIden::Name).eq(&pipeline.name)); let mut pipeline_cte = CommonTableExpression::from_select(pipeline_cte); pipeline_cte.table_name(Alias::new("pipeline")); - with_clause.cte(pipeline_cte); + // with_clause.cte(pipeline_cte); for (key, vsa) in valid_query.query.semantic_search.unwrap_or_default() { let model_runtime = pipeline @@ -100,7 +100,8 @@ pub async fn build_search_query( // Build the CTE we actually use later let embeddings_table = format!("{}_{}.{}_embeddings", collection.name, pipeline.name, key); let cte_name = format!("{key}_embedding_score"); - let mut score_cte = Query::select(); + let mut score_cte_non_recursive = Query::select(); + let mut score_cte_recurisive = Query::select(); match model_runtime { ModelRuntime::Python => { // Build the embedding CTE @@ -117,77 +118,108 @@ pub async fn build_search_query( ); let mut embedding_cte = CommonTableExpression::from_select(embedding_cte); 
embedding_cte.table_name(Alias::new(format!("{key}_embedding"))); - with_clause.cte(embedding_cte); + // with_clause.cte(embedding_cte); // Build the score CTE - score_cte + // score_cte + // .from_as(embeddings_table.to_table_tuple(), Alias::new("embeddings")) + // .column((SIden::Str("embeddings"), SIden::Str("document_id"))) + // .expr(Expr::cust(r#"ARRAY[embeddings.document_id] as previous_document_ids"#)) + // .expr(Expr::cust(format!( + // r#"MIN(embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector) AS score"# + // ))) + // .order_by_expr(Expr::cust(format!( + // r#"embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector"# + // )), Order::Asc ) + // .limit(1) + } + ModelRuntime::OpenAI => { + unimplemented!() + // We can unwrap here as we know this is all set from above + // let model = &pipeline + // .parsed_schema + // .as_ref() + // .unwrap() + // .get(&key) + // .unwrap() + // .semantic_search + // .as_ref() + // .unwrap() + // .model; + + // // Get the remote embedding + // let embedding = { + // let remote_embeddings = build_remote_embeddings( + // model.runtime, + // &model.name, + // vsa.parameters.as_ref(), + // )?; + // let mut embeddings = remote_embeddings.embed(vec![vsa.query]).await?; + // std::mem::take(&mut embeddings[0]) + // }; + + // // Build the score CTE + // score_cte + // .column((SIden::Str("embeddings"), SIden::Str("document_id"))) + // .expr(Expr::cust_with_values( + // r#"MIN(embeddings.embedding <=> $1::vector) AS score"#, + // [embedding.clone()], + // )) + // .order_by_expr( + // Expr::cust_with_values( + // r#"embeddings.embedding <=> $1::vector"#, + // [embedding], + // ), + // Order::Asc, + // ) + } + }; + + score_cte_non_recursive + .from_as(embeddings_table.to_table_tuple(), Alias::new("embeddings")) .column((SIden::Str("embeddings"), SIden::Str("document_id"))) + .expr(Expr::cust(r#"ARRAY[embeddings.document_id] as previous_document_ids"#)) .expr(Expr::cust(format!( - r#"MIN(embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector) AS score"# + r#"(embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector) AS score"# ))) .order_by_expr(Expr::cust(format!( r#"embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector"# )), Order::Asc ) - } - ModelRuntime::OpenAI => { - // We can unwrap here as we know this is all set from above - let model = &pipeline - .parsed_schema - .as_ref() - .unwrap() - .get(&key) - .unwrap() - .semantic_search - .as_ref() - .unwrap() - .model; - - // Get the remote embedding - let embedding = { - let remote_embeddings = build_remote_embeddings( - model.runtime, - &model.name, - vsa.parameters.as_ref(), - )?; - let mut embeddings = remote_embeddings.embed(vec![vsa.query]).await?; - std::mem::take(&mut embeddings[0]) - }; + .limit(1); - // Build the score CTE - score_cte + score_cte_recurisive + .from_as(embeddings_table.to_table_tuple(), Alias::new("embeddings")) .column((SIden::Str("embeddings"), SIden::Str("document_id"))) - .expr(Expr::cust_with_values( - r#"MIN(embeddings.embedding <=> $1::vector) AS score"#, - [embedding.clone()], - )) - .order_by_expr( - Expr::cust_with_values( - r#"embeddings.embedding <=> $1::vector"#, - [embedding], - ), - Order::Asc, - ) - } - }; + .expr(Expr::cust(format!(r#""{cte_name}".previous_document_ids || embeddings.document_id"#))) + .expr(Expr::cust(format!( + r#"(embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector) AS score"# + ))) + .and_where(Expr::cust(format!(r#"NOT 
embeddings.document_id = ANY("{cte_name}".previous_document_ids)"#))) + .order_by_expr(Expr::cust(format!( + r#"embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector"# + )), Order::Asc ) + .limit(1); - score_cte - .from_as(embeddings_table.to_table_tuple(), Alias::new("embeddings")) - .group_by_col((SIden::Str("embeddings"), SIden::Str("id"))) - .limit(limit); + score_cte_non_recursive.union(sea_query::UnionType::All, score_cte_recurisive); - if let Some(filter) = &valid_query.query.filter { - let filter = FilterBuilder::new(filter.clone().0, "documents", "document").build()?; - score_cte.cond_where(filter); - score_cte.join_as( - JoinType::InnerJoin, - documents_table.to_table_tuple(), - Alias::new("documents"), - Expr::col((SIden::Str("documents"), SIden::Str("id"))) - .equals((SIden::Str("embeddings"), SIden::Str("document_id"))), - ); - } + // score_cte + // .from_as(embeddings_table.to_table_tuple(), Alias::new("embeddings")) + // .group_by_col((SIden::Str("embeddings"), SIden::Str("id"))) + // .limit(limit); - let mut score_cte = CommonTableExpression::from_select(score_cte); + // if let Some(filter) = &valid_query.query.filter { + // let filter = FilterBuilder::new(filter.clone().0, "documents", "document").build()?; + // score_cte.cond_where(filter); + // score_cte.join_as( + // JoinType::InnerJoin, + // documents_table.to_table_tuple(), + // Alias::new("documents"), + // Expr::col((SIden::Str("documents"), SIden::Str("id"))) + // .equals((SIden::Str("embeddings"), SIden::Str("document_id"))), + // ); + // } + + let mut score_cte = CommonTableExpression::from_select(score_cte_non_recursive); score_cte.table_name(Alias::new(&cte_name)); with_clause.cte(score_cte); From 17b81e703c9869d539b5b1a434aeef7598b1439a Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Mon, 5 Feb 2024 12:57:41 -0800 Subject: [PATCH 20/72] Working recursive query --- pgml-sdks/pgml/src/lib.rs | 92 +++---- pgml-sdks/pgml/src/search_query_builder.rs | 264 ++++++++++++++------- 2 files changed, 220 insertions(+), 136 deletions(-) diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 416ae3fc4..4b8abc201 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -818,23 +818,23 @@ mod tests { #[tokio::test] async fn can_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cswle_78"; + let collection_name = "test_r_c_cswle_80"; let mut collection = Collection::new(collection_name, None); - // let documents = generate_dummy_documents(10000); - // collection.upsert_documents(documents.clone(), None).await?; + let documents = generate_dummy_documents(10); + collection.upsert_documents(documents.clone(), None).await?; let pipeline_name = "test_r_p_cswle_9"; let mut pipeline = Pipeline::new( pipeline_name, Some( json!({ - // "title": { - // "semantic_search": { - // "model": "intfloat/e5-small" - // }, - // "full_text_search": { - // "configuration": "english" - // } - // }, + "title": { + "semantic_search": { + "model": "intfloat/e5-small" + }, + "full_text_search": { + "configuration": "english" + } + }, "body": { "splitter": { "model": "recursive_character" @@ -842,56 +842,56 @@ mod tests { "semantic_search": { "model": "intfloat/e5-small" }, - // "semantic_search": { - // "model": "hkunlp/instructor-base", - // "parameters": { - // "instruction": "Represent the Wikipedia document for retrieval" - // } - // }, + "semantic_search": { + "model": 
"hkunlp/instructor-base", + "parameters": { + "instruction": "Represent the Wikipedia document for retrieval" + } + }, "full_text_search": { "configuration": "english" } }, - // "notes": { - // "semantic_search": { - // "model": "intfloat/e5-small" - // } - // } + "notes": { + "semantic_search": { + "model": "intfloat/e5-small" + } + } }) .into(), ), )?; - // collection.add_pipeline(&mut pipeline).await?; + collection.add_pipeline(&mut pipeline).await?; let results = collection .search( json!({ "query": { - // "full_text_search": { - // "title": { - // "query": "test 9", - // "boost": 4.0 - // }, - // "body": { - // "query": "Test", - // "boost": 1.2 - // } - // }, + "full_text_search": { + "title": { + "query": "test 9", + "boost": 4.0 + }, + "body": { + "query": "Test", + "boost": 1.2 + } + }, "semantic_search": { - // "title": { - // "query": "This is a test", - // "boost": 2.0 - // }, + "title": { + "query": "This is a test", + "boost": 2.0 + }, "body": { "query": "This is the body test", - // "parameters": { - // "instruction": "Represent the Wikipedia question for retrieving supporting documents: ", - // }, + "parameters": { + "instruction": "Represent the Wikipedia question for retrieving supporting documents: ", + }, "boost": 1.01 }, - // "notes": { - // "query": "This is the notes test", - // "boost": 1.01 - // } + "notes": { + "query": "This is the notes test", + "boost": 1.01 + } }, "filter": { "id": { @@ -910,7 +910,7 @@ mod tests { .into_iter() .map(|r| r["document"]["id"].as_u64().unwrap()) .collect(); - assert_eq!(ids, vec![7, 8, 2, 3, 4]); + assert_eq!(ids, vec![9, 2, 7, 8, 3]); collection.archive().await?; Ok(()) } @@ -918,7 +918,7 @@ mod tests { #[tokio::test] async fn can_search_with_remote_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cswre_52"; + let collection_name = "test_r_c_cswre_62"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; diff --git a/pgml-sdks/pgml/src/search_query_builder.rs b/pgml-sdks/pgml/src/search_query_builder.rs index f4838d2c8..7da69c311 100644 --- a/pgml-sdks/pgml/src/search_query_builder.rs +++ b/pgml-sdks/pgml/src/search_query_builder.rs @@ -63,7 +63,8 @@ pub async fn build_search_query( let mut query = Query::select(); let mut score_table_names = Vec::new(); - let mut with_clause = WithClause::new().recursive(true).to_owned(); + // let mut with_clause = WithClause::new().recursive(true).to_owned(); + let mut with_clause = WithClause::new(); let mut sum_expression: Option = None; let mut pipeline_cte = Query::select(); @@ -73,7 +74,7 @@ pub async fn build_search_query( .and_where(Expr::col(models::PipelineIden::Name).eq(&pipeline.name)); let mut pipeline_cte = CommonTableExpression::from_select(pipeline_cte); pipeline_cte.table_name(Alias::new("pipeline")); - // with_clause.cte(pipeline_cte); + with_clause.cte(pipeline_cte); for (key, vsa) in valid_query.query.semantic_search.unwrap_or_default() { let model_runtime = pipeline @@ -118,64 +119,9 @@ pub async fn build_search_query( ); let mut embedding_cte = CommonTableExpression::from_select(embedding_cte); embedding_cte.table_name(Alias::new(format!("{key}_embedding"))); - // with_clause.cte(embedding_cte); - - // Build the score CTE - // score_cte - // .from_as(embeddings_table.to_table_tuple(), Alias::new("embeddings")) - // .column((SIden::Str("embeddings"), SIden::Str("document_id"))) - // 
.expr(Expr::cust(r#"ARRAY[embeddings.document_id] as previous_document_ids"#)) - // .expr(Expr::cust(format!( - // r#"MIN(embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector) AS score"# - // ))) - // .order_by_expr(Expr::cust(format!( - // r#"embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector"# - // )), Order::Asc ) - // .limit(1) - } - ModelRuntime::OpenAI => { - unimplemented!() - // We can unwrap here as we know this is all set from above - // let model = &pipeline - // .parsed_schema - // .as_ref() - // .unwrap() - // .get(&key) - // .unwrap() - // .semantic_search - // .as_ref() - // .unwrap() - // .model; - - // // Get the remote embedding - // let embedding = { - // let remote_embeddings = build_remote_embeddings( - // model.runtime, - // &model.name, - // vsa.parameters.as_ref(), - // )?; - // let mut embeddings = remote_embeddings.embed(vec![vsa.query]).await?; - // std::mem::take(&mut embeddings[0]) - // }; - - // // Build the score CTE - // score_cte - // .column((SIden::Str("embeddings"), SIden::Str("document_id"))) - // .expr(Expr::cust_with_values( - // r#"MIN(embeddings.embedding <=> $1::vector) AS score"#, - // [embedding.clone()], - // )) - // .order_by_expr( - // Expr::cust_with_values( - // r#"embeddings.embedding <=> $1::vector"#, - // [embedding], - // ), - // Order::Asc, - // ) - } - }; + with_clause.cte(embedding_cte); - score_cte_non_recursive + score_cte_non_recursive .from_as(embeddings_table.to_table_tuple(), Alias::new("embeddings")) .column((SIden::Str("embeddings"), SIden::Str("document_id"))) .expr(Expr::cust(r#"ARRAY[embeddings.document_id] as previous_document_ids"#)) @@ -187,8 +133,13 @@ pub async fn build_search_query( )), Order::Asc ) .limit(1); - score_cte_recurisive + score_cte_recurisive .from_as(embeddings_table.to_table_tuple(), Alias::new("embeddings")) + .join( + JoinType::Join, + SIden::String(cte_name.clone()), + Expr::cust("1 = 1"), + ) .column((SIden::Str("embeddings"), SIden::Str("document_id"))) .expr(Expr::cust(format!(r#""{cte_name}".previous_document_ids || embeddings.document_id"#))) .expr(Expr::cust(format!( @@ -199,27 +150,106 @@ pub async fn build_search_query( r#"embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector"# )), Order::Asc ) .limit(1); + } + ModelRuntime::OpenAI => { + // We can unwrap here as we know this is all set from above + let model = &pipeline + .parsed_schema + .as_ref() + .unwrap() + .get(&key) + .unwrap() + .semantic_search + .as_ref() + .unwrap() + .model; + + // Get the remote embedding + let embedding = { + let remote_embeddings = build_remote_embeddings( + model.runtime, + &model.name, + vsa.parameters.as_ref(), + )?; + let mut embeddings = remote_embeddings.embed(vec![vsa.query]).await?; + std::mem::take(&mut embeddings[0]) + }; + + score_cte_non_recursive + .from_as(embeddings_table.to_table_tuple(), Alias::new("embeddings")) + .column((SIden::Str("embeddings"), SIden::Str("document_id"))) + .expr(Expr::cust( + "ARRAY[embeddings.document_id] as previous_document_ids", + )) + .expr(Expr::cust_with_values( + "embeddings.embedding <=> $1::vector AS score", + [embedding.clone()], + )) + .order_by_expr( + Expr::cust_with_values( + "embeddings.embedding <=> $1::vector", + [embedding.clone()], + ), + Order::Asc, + ) + .limit(1); + + score_cte_recurisive + .from_as(embeddings_table.to_table_tuple(), Alias::new("embeddings")) + .join( + JoinType::Join, + SIden::String(cte_name.clone()), + Expr::cust("1 = 1"), + ) + .column((SIden::Str("embeddings"), 
SIden::Str("document_id"))) + .expr(Expr::cust(format!( + r#""{cte_name}".previous_document_ids || embeddings.document_id"# + ))) + .expr(Expr::cust_with_values( + "embeddings.embedding <=> $1::vector AS score", + [embedding.clone()], + )) + .and_where(Expr::cust(format!( + r#"NOT embeddings.document_id = ANY("{cte_name}".previous_document_ids)"# + ))) + .order_by_expr( + Expr::cust_with_values( + "embeddings.embedding <=> $1::vector", + [embedding.clone()], + ), + Order::Asc, + ) + .limit(1); + } + } + + if let Some(filter) = &valid_query.query.filter { + let filter = FilterBuilder::new(filter.clone().0, "documents", "document").build()?; + score_cte_non_recursive.cond_where(filter.clone()); + score_cte_recurisive.cond_where(filter); + score_cte_non_recursive.join_as( + JoinType::InnerJoin, + documents_table.to_table_tuple(), + Alias::new("documents"), + Expr::col((SIden::Str("documents"), SIden::Str("id"))) + .equals((SIden::Str("embeddings"), SIden::Str("document_id"))), + ); + score_cte_recurisive.join_as( + JoinType::InnerJoin, + documents_table.to_table_tuple(), + Alias::new("documents"), + Expr::col((SIden::Str("documents"), SIden::Str("id"))) + .equals((SIden::Str("embeddings"), SIden::Str("document_id"))), + ); + } + + let score_cte = Query::select() + .expr(Expr::cust("*")) + .from_subquery(score_cte_non_recursive, Alias::new("non_recursive")) + .union(sea_query::UnionType::All, score_cte_recurisive) + .to_owned(); - score_cte_non_recursive.union(sea_query::UnionType::All, score_cte_recurisive); - - // score_cte - // .from_as(embeddings_table.to_table_tuple(), Alias::new("embeddings")) - // .group_by_col((SIden::Str("embeddings"), SIden::Str("id"))) - // .limit(limit); - - // if let Some(filter) = &valid_query.query.filter { - // let filter = FilterBuilder::new(filter.clone().0, "documents", "document").build()?; - // score_cte.cond_where(filter); - // score_cte.join_as( - // JoinType::InnerJoin, - // documents_table.to_table_tuple(), - // Alias::new("documents"), - // Expr::col((SIden::Str("documents"), SIden::Str("id"))) - // .equals((SIden::Str("embeddings"), SIden::Str("document_id"))), - // ); - // } - - let mut score_cte = CommonTableExpression::from_select(score_cte_non_recursive); + let mut score_cte = CommonTableExpression::from_select(score_cte); score_cte.table_name(Alias::new(&cte_name)); with_clause.cte(score_cte); @@ -242,18 +272,21 @@ pub async fn build_search_query( // Build the score CTE let cte_name = format!("{key}_tsvectors_score"); - let mut score_cte = Query::select(); - score_cte - .column(SIden::Str("document_id")) + + let mut score_cte_non_recursive = Query::select() + .column((SIden::Str("tsvectors"), SIden::Str("document_id"))) .expr_as( Expr::cust_with_values( format!( - r#"MAX(ts_rank(tsvectors.ts, plainto_tsquery((SELECT oid FROM pg_ts_config WHERE cfgname = (SELECT schema #>> '{{{key},full_text_search,configuration}}' FROM pipeline)), $1), 32))"#, + r#"ts_rank(tsvectors.ts, plainto_tsquery((SELECT oid FROM pg_ts_config WHERE cfgname = (SELECT schema #>> '{{{key},full_text_search,configuration}}' FROM pipeline)), $1), 32)"#, ), [&vma.query], ), Alias::new("score") ) + .expr(Expr::cust( + "ARRAY[tsvectors.document_id] as previous_document_ids", + )) .from_as( full_text_table.to_table_tuple(), Alias::new("tsvectors"), @@ -264,14 +297,58 @@ pub async fn build_search_query( ), [&vma.query], )) - .group_by_col(SIden::Str("document_id")) .order_by(SIden::Str("score"), Order::Desc) - .limit(limit); + .limit(limit). 
+ to_owned(); + + let mut score_cte_recursive = Query::select() + .column((SIden::Str("tsvectors"), SIden::Str("document_id"))) + .expr_as( + Expr::cust_with_values( + format!( + r#"ts_rank(tsvectors.ts, plainto_tsquery((SELECT oid FROM pg_ts_config WHERE cfgname = (SELECT schema #>> '{{{key},full_text_search,configuration}}' FROM pipeline)), $1), 32)"#, + ), + [&vma.query], + ), + Alias::new("score") + ) + .expr(Expr::cust(format!( + r#""{cte_name}".previous_document_ids || tsvectors.document_id"# + ))) + .from_as( + full_text_table.to_table_tuple(), + Alias::new("tsvectors"), + ) + .join( + JoinType::Join, + SIden::String(cte_name.clone()), + Expr::cust("1 = 1"), + ) + .and_where(Expr::cust(format!( + r#"NOT tsvectors.document_id = ANY("{cte_name}".previous_document_ids)"# + ))) + .and_where(Expr::cust_with_values( + format!( + r#"tsvectors.ts @@ plainto_tsquery((SELECT oid FROM pg_ts_config WHERE cfgname = (SELECT schema #>> '{{{key},full_text_search,configuration}}' FROM pipeline)), $1)"#, + ), + [&vma.query], + )) + .order_by(SIden::Str("score"), Order::Desc) + .limit(limit) + .to_owned(); if let Some(filter) = &valid_query.query.filter { let filter = FilterBuilder::new(filter.clone().0, "documents", "document").build()?; - score_cte.cond_where(filter); - score_cte.join_as( + score_cte_recursive.cond_where(filter.clone()); + score_cte_non_recursive.cond_where(filter); + score_cte_recursive.join_as( + JoinType::InnerJoin, + documents_table.to_table_tuple(), + Alias::new("documents"), + Expr::col((SIden::Str("documents"), SIden::Str("id"))) + .equals((SIden::Str("tsvectors"), SIden::Str("document_id"))), + ); + score_cte_non_recursive.join_as( JoinType::InnerJoin, documents_table.to_table_tuple(), Alias::new("documents"), @@ -280,6 +357,12 @@ pub async fn build_search_query( ); } + let score_cte = Query::select() + .expr(Expr::cust("*")) + .from_subquery(score_cte_non_recursive, Alias::new("non_recursive")) + .union(sea_query::UnionType::All, score_cte_recursive) + .to_owned(); + let mut score_cte = CommonTableExpression::from_select(score_cte); score_cte.table_name(Alias::new(&cte_name)); with_clause.cte(score_cte); @@ -319,7 +402,6 @@ pub async fn build_search_query( let sum_expression = sum_expression .context("query requires some scoring through full_text_search or semantic_search")?; query - // .expr_as(id_select_expression.clone(), Alias::new("id")) .expr(Expr::cust_with_expr( "DISTINCT ON ($1) $1 as id", id_select_expression.clone(), @@ -338,7 +420,6 @@ pub async fn build_search_query( Expr::cust_with_expr("$1, score", id_select_expression), Order::Desc, ); - // .order_by(SIden::Str("score"), Order::Desc); let mut re_ordered_query = Query::select(); re_ordered_query @@ -362,8 +443,11 @@ pub async fn build_search_query( .clone() .with(with_clause.clone()) .to_string(PostgresQueryBuilder); + let query_string = query_string.replace("WITH ", "WITH RECURSIVE "); println!("\nTHE QUERY: \n{query_string}\n"); + // For whatever reason, sea query does not like ctes if the cte is recursive let (sql, values) = query.with(with_clause).build_sqlx(PostgresQueryBuilder); + let sql = sql.replace("WITH ", "WITH RECURSIVE "); Ok((sql, values)) } From 7339cd54c3515965fbf4b9dfb8fd5129d5212da5 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Mon, 5 Feb 2024 15:53:24 -0800 Subject: [PATCH 21/72] Added smarter chunking and search results table --- pgml-sdks/pgml/src/collection.rs | 31 +++++++++++- pgml-sdks/pgml/src/lib.rs | 2 +- 
pgml-sdks/pgml/src/pipeline.rs | 81 ++++++++++++++++++++++---------- pgml-sdks/pgml/src/queries.rs | 75 ++++++++++++++++++++--------- 4 files changed, 140 insertions(+), 49 deletions(-) diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index 78547b600..50a852c6d 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -9,9 +9,11 @@ use serde_json::json; use sqlx::Executor; use sqlx::PgConnection; use std::borrow::Cow; +use std::collections::HashMap; use std::path::Path; use std::sync::Arc; use std::time::SystemTime; +use std::time::UNIX_EPOCH; use tokio::sync::Mutex; use tracing::{instrument, warn}; use walkdir::WalkDir; @@ -490,20 +492,44 @@ impl Collection { let md5_digest = md5::compute(id.as_bytes()); let source_uuid = uuid::Uuid::from_slice(&md5_digest.0)?; + // Compute the md5 of each of the fields + let start = SystemTime::now(); + let timestamp = start + .duration_since(UNIX_EPOCH) + .expect("Time went backwards") + .as_millis(); + + let versions: HashMap = document + .as_object() + .context("document must be an object")? + .iter() + .try_fold(HashMap::new(), |mut acc, (key, value)| { + let md5_digest = md5::compute(serde_json::to_string(value)?.as_bytes()); + let md5_digest = format!("{md5_digest:x}"); + acc.insert( + key.to_owned(), + serde_json::json!({ + "last_updated": timestamp, + "md5": md5_digest + }), + ); + anyhow::Ok(acc) + })?; + let query = if args .get("merge") .map(|v| v.as_bool().unwrap_or(false)) .unwrap_or(false) { query_builder!( - "WITH prev AS (SELECT document FROM %s WHERE source_uuid = $1) INSERT INTO %s (source_uuid, document) VALUES ($1, $2) ON CONFLICT (source_uuid) DO UPDATE SET document = %s.document || EXCLUDED.document RETURNING id, (SELECT document FROM prev)", + "WITH prev AS (SELECT document FROM %s WHERE source_uuid = $1) INSERT INTO %s (source_uuid, document, version) VALUES ($1, $2, $3) ON CONFLICT (source_uuid) DO UPDATE SET document = %s.document || EXCLUDED.document, version = EXCLUDED.version RETURNING id, (SELECT document FROM prev)", self.documents_table_name, self.documents_table_name, self.documents_table_name ) } else { query_builder!( - "WITH prev AS (SELECT document FROM %s WHERE source_uuid = $1) INSERT INTO %s (source_uuid, document) VALUES ($1, $2) ON CONFLICT (source_uuid) DO UPDATE SET document = EXCLUDED.document RETURNING id, (SELECT document FROM prev)", + "WITH prev AS (SELECT document FROM %s WHERE source_uuid = $1) INSERT INTO %s (source_uuid, document, version) VALUES ($1, $2, $3) ON CONFLICT (source_uuid) DO UPDATE SET document = EXCLUDED.document, version = EXCLUDED.version RETURNING id, (SELECT document FROM prev)", self.documents_table_name, self.documents_table_name ) @@ -511,6 +537,7 @@ impl Collection { let (document_id, previous_document): (i64, Option) = sqlx::query_as(&query) .bind(source_uuid) .bind(document) + .bind(serde_json::to_value(versions)?) 
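            // Illustrative only: the `versions` map bound just above serializes to a JSONB
            // object keyed by each top-level document field; the timestamps and hashes below
            // are made-up examples, not real output:
            //
            //   {
            //     "title": { "last_updated": 1707168000000, "md5": "9e107d9d372bb6826bd81d3542a419d6" },
            //     "body":  { "last_updated": 1707168000000, "md5": "e4d909c290d0fb1ca068ffaddf22cbd0" }
            //   }
            //
            // This is what lands in the new `version` column, presumably so later syncs can
            // tell which fields of a document actually changed.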
.fetch_one(&mut *transaction) .await?; dp.push((document_id, document, previous_document)); diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 4b8abc201..03a3e1edf 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -319,7 +319,7 @@ mod tests { #[tokio::test] async fn can_add_pipeline_and_upsert_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_capaud_51"; + let collection_name = "test_r_c_capaud_73"; let pipeline_name = "test_r_p_capaud_6"; let mut pipeline = Pipeline::new( pipeline_name, diff --git a/pgml-sdks/pgml/src/pipeline.rs b/pgml-sdks/pgml/src/pipeline.rs index 1c79cc81c..b89c2cd9d 100644 --- a/pgml-sdks/pgml/src/pipeline.rs +++ b/pgml-sdks/pgml/src/pipeline.rs @@ -411,6 +411,42 @@ impl Pipeline { .as_ref() .context("Pipeline must have schema to create_tables")?; + let searches_table_name = format!("{schema}.searches"); + transaction + .execute( + query_builder!( + queries::CREATE_PIPELINES_SEARCHES_TABLE, + searches_table_name + ) + .as_str(), + ) + .await?; + + let search_results_table_name = format!("{schema}.search_results"); + transaction + .execute( + query_builder!( + queries::CREATE_PIPELINES_SEARCH_RESULTS_TABLE, + search_results_table_name, + &searches_table_name, + &documents_table_name + ) + .as_str(), + ) + .await?; + + let search_events_table_name = format!("{schema}.search_events"); + transaction + .execute( + query_builder!( + queries::CREATE_PIPELINES_SEARCH_EVENTS_TABLE, + search_events_table_name, + &searches_table_name + ) + .as_str(), + ) + .await?; + for (key, value) in parsed_schema.iter() { let chunks_table_name = format!("{}.{}_chunks", schema, key); transaction @@ -642,21 +678,15 @@ impl Pipeline { .as_ref() .context("Splitter must be verified to sync chunks")?; - sqlx::query(&query_builder!( + sqlx::query_scalar(&query_builder!( queries::GENERATE_CHUNKS_FOR_DOCUMENT_IDS, - &chunks_table_name, &json_key_query, - documents_table_name - )) - .bind(splitter_database_data.id) - .bind(document_ids) - .execute(&mut *transaction.lock().await) - .await?; - - sqlx::query_scalar(&query_builder!( - "SELECT id FROM %s WHERE document_id = ANY($1)", + documents_table_name, + &chunks_table_name, + &chunks_table_name, &chunks_table_name )) + .bind(splitter_database_data.id) .bind(document_ids) .fetch_all(&mut *transaction.lock().await) .await @@ -664,21 +694,24 @@ impl Pipeline { } else { sqlx::query_scalar(&query_builder!( r#" - INSERT INTO %s( - document_id, chunk_index, chunk - ) - SELECT - id, - 1, - %d - FROM %s - WHERE id = ANY($1) - ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk - RETURNING id - "#, + INSERT INTO %s( + document_id, chunk_index, chunk + ) + SELECT + id, + 1, + %d + FROM %s documents + WHERE id = ANY($1) + AND %d <> COALESCE((SELECT chunk FROM %s chunks WHERE chunks.document_id = documents.id AND chunks.chunk_index = 1), '') + ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk + RETURNING id + "#, &chunks_table_name, &json_key_query, - &documents_table_name + &documents_table_name, + &json_key_query, + &chunks_table_name )) .bind(document_ids) .fetch_all(&mut *transaction.lock().await) diff --git a/pgml-sdks/pgml/src/queries.rs b/pgml-sdks/pgml/src/queries.rs index c84513c75..cfb541599 100644 --- a/pgml-sdks/pgml/src/queries.rs +++ b/pgml-sdks/pgml/src/queries.rs @@ -30,6 +30,7 @@ CREATE TABLE IF NOT EXISTS %s ( created_at timestamp NOT NULL DEFAULT now(), source_uuid uuid NOT NULL, document 
jsonb NOT NULL, + version jsonb NOT NULL DEFAULT '{}'::jsonb, UNIQUE (source_uuid) ); "#; @@ -75,6 +76,31 @@ CREATE TABLE IF NOT EXISTS %s ( ); "#; +pub const CREATE_PIPELINES_SEARCHES_TABLE: &str = r#" +CREATE TABLE IF NOT EXISTS %s ( + id serial8 PRIMARY KEY, + query jsonb +); +"#; + +pub const CREATE_PIPELINES_SEARCH_RESULTS_TABLE: &str = r#" +CREATE TABLE IF NOT EXISTS %s ( + id serial8 PRIMARY KEY, + search_id int8 NOT NULL REFERENCES %s ON DELETE CASCADE, + document_id int8 NOT NULL REFERENCES %s ON DELETE CASCADE, + scores jsonb NOT NULL, + rank integer NOT NULL +); +"#; + +pub const CREATE_PIPELINES_SEARCH_EVENTS_TABLE: &str = r#" +CREATE TABLE IF NOT EXISTS %s ( + id serial8 PRIMARY KEY, + search_result int8 NOT NULL REFERENCES %s ON DELETE CASCADE, + event jsonb NOT NULL +); +"#; + ///////////////////////////// // CREATE INDICES /////////// ///////////////////////////// @@ -216,26 +242,31 @@ WITH splitter as ( pgml.splitters WHERE id = $1 -) -INSERT INTO %s( - document_id, chunk_index, chunk -) -SELECT - document_id, - (chunk).chunk_index, - (chunk).chunk -FROM - ( - SELECT - id AS document_id, - pgml.chunk( - (SELECT name FROM splitter), - %d, - (SELECT parameters FROM splitter) - ) AS chunk - FROM - %s WHERE id = ANY($2) - ) chunks -ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk -RETURNING id +), new as ( + SELECT + document_id, + (chunk).chunk_index, + (chunk).chunk + FROM + ( + SELECT + id AS document_id, + pgml.chunk( + (SELECT name FROM splitter), + %d, + (SELECT parameters FROM splitter) + ) AS chunk + FROM + %s WHERE id = ANY($2) + ) chunks +), ins as ( + INSERT INTO %s( + document_id, chunk_index, chunk + ) SELECT * FROM new + WHERE new.chunk <> COALESCE((SELECT chunk FROM %s chunks WHERE chunks.document_id = new.document_id AND chunks.chunk_index = new.chunk_index), '') + ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk + RETURNING id +), del as ( + DELETE FROM %s chunks WHERE chunk_index < (SELECT MAX(new.chunk_index) FROM new WHERE new.document_id = chunks.document_id GROUP BY new.document_id) +) SELECT id FROM ins; "#; From 84e621abb396da1203d6db1e0924ad4efac610e1 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Fri, 9 Feb 2024 10:43:38 -0800 Subject: [PATCH 22/72] Updated deps, added debugger for queries --- pgml-sdks/pgml/Cargo.lock | 1562 ++++++++++------- pgml-sdks/pgml/Cargo.toml | 6 +- pgml-sdks/pgml/src/collection.rs | 37 +- pgml-sdks/pgml/src/filter_builder.rs | 6 +- pgml-sdks/pgml/src/lib.rs | 29 +- pgml-sdks/pgml/src/pipeline.rs | 177 +- pgml-sdks/pgml/src/queries.rs | 293 +++- pgml-sdks/pgml/src/remote_embeddings.rs | 5 +- pgml-sdks/pgml/src/search_query_builder.rs | 56 +- pgml-sdks/pgml/src/transformer_pipeline.rs | 4 +- pgml-sdks/pgml/src/utils.rs | 37 + .../pgml/src/vector_search_query_builder.rs | 15 +- 12 files changed, 1340 insertions(+), 887 deletions(-) diff --git a/pgml-sdks/pgml/Cargo.lock b/pgml-sdks/pgml/Cargo.lock index 46311b399..81c863909 100644 --- a/pgml-sdks/pgml/Cargo.lock +++ b/pgml-sdks/pgml/Cargo.lock @@ -3,21 +3,19 @@ version = 3 [[package]] -name = "adler" -version = "1.0.2" +name = "addr2line" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +dependencies = [ + "gimli", +] [[package]] -name = "ahash" -version = "0.7.6" +name = "adler" 
+version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" -dependencies = [ - "getrandom", - "once_cell", - "version_check", -] +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" @@ -26,6 +24,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01" dependencies = [ "cfg-if", + "getrandom", "once_cell", "version_check", "zerocopy", @@ -33,18 +32,18 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.0.2" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" dependencies = [ "memchr", ] [[package]] name = "allocator-api2" -version = "0.2.14" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4f263788a35611fba42eb41ff811c5d0360c58b97402570312a350736e2542e" +checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" [[package]] name = "android-tzdata" @@ -63,9 +62,9 @@ dependencies = [ [[package]] name = "anstream" -version = "0.6.4" +version = "0.6.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ab91ebe16eb252986481c5b62f6098f3b698a45e34b5b98200cf20dd2484a44" +checksum = "6e2e1ebcb11de5c03c67de28a7df593d32191b44939c482e97702baaaa6ab6a5" dependencies = [ "anstyle", "anstyle-parse", @@ -77,64 +76,74 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.4" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" +checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" [[package]] name = "anstyle-parse" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "317b9a89c1868f5ea6ff1d9539a69f45dffc21ce321ac1fd1160dfa48c8e2140" +checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.0.0" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" +checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] name = "anstyle-wincon" -version = "3.0.1" +version = "3.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0699d10d2f4d628a98ee7b57b289abbc98ff3bad977cb3152709d4bf2330628" +checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" dependencies = [ "anstyle", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] name = "anyhow" -version = "1.0.71" +version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8" +checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" [[package]] name = "async-trait" -version = "0.1.71" +version = "0.1.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a564d521dd56509c4c47480d00b80ee55f7e385ae48db5744c67ad50c92d2ebf" 
+checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.48", ] [[package]] name = "atoi" -version = "1.0.0" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7c57d12312ff59c811c0643f4d80830505833c9ffaebd193d819392b265be8e" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" dependencies = [ "num-traits", ] +[[package]] +name = "atomic-write-file" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edcdbedc2236483ab103a53415653d6b4442ea6141baf1ffa85df29635e88436" +dependencies = [ + "nix", + "rand", +] + [[package]] name = "autocfg" version = "1.1.0" @@ -142,16 +151,31 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] -name = "base64" -version = "0.13.1" +name = "backtrace" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" +checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] [[package]] name = "base64" -version = "0.21.2" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + +[[package]] +name = "base64ct" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "604178f6c5c21f02dc555784810edfb88d34ac2c73b2eae109655649ee73ce3d" +checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" [[package]] name = "bitflags" @@ -161,9 +185,12 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.1" +version = "2.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" +dependencies = [ + "serde", +] [[package]] name = "block-buffer" @@ -176,27 +203,30 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.13.0" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" +checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" [[package]] name = "byteorder" -version = "1.4.3" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" [[package]] name = "cc" -version = "1.0.79" +version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + 
"libc", +] [[package]] name = "cfg-if" @@ -206,24 +236,23 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.26" +version = "0.4.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec837a71355b28f6556dbd569b37b3f363091c0bd4b2e735674521b4c5fd9bc5" +checksum = "9f13690e35a5e4ace198e7beea2895d29f3a9cc55015fcebe6336bd2010af9eb" dependencies = [ "android-tzdata", "iana-time-zone", "js-sys", "num-traits", - "time 0.1.45", "wasm-bindgen", - "winapi", + "windows-targets 0.52.0", ] [[package]] name = "clap" -version = "4.4.10" +version = "4.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41fffed7514f420abec6d183b1d3acfd9099c79c3a10a06ade4f8203f1411272" +checksum = "1e578d6ec4194633722ccf9544794b71b1385c3c027efe0c55db226fc880865c" dependencies = [ "clap_builder", "clap_derive", @@ -231,9 +260,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.4.9" +version = "4.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63361bae7eef3771745f02d8d892bec2fee5f6e34af316ba556e7f97a7069ff1" +checksum = "4df4df40ec50c46000231c914968278b1eb05098cf8f1b3a518a95030e71d1c7" dependencies = [ "anstream", "anstyle", @@ -250,7 +279,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.48", ] [[package]] @@ -267,33 +296,38 @@ checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" [[package]] name = "colored" -version = "2.0.4" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2674ec482fbc38012cf31e6c42ba0177b431a0cb6f15fe40efa5aab1bda516f6" +checksum = "cbf2150cce219b664a8a70df7a1f933836724b503f8a413af9365b4dcc4d90b8" dependencies = [ - "is-terminal", "lazy_static", "windows-sys 0.48.0", ] [[package]] name = "console" -version = "0.15.7" +version = "0.15.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c926e00cc70edefdc64d3a5ff31cc65bb97a3460097762bd23afb4d8145fccf8" +checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb" dependencies = [ "encode_unicode", "lazy_static", "libc", "unicode-width", - "windows-sys 0.45.0", + "windows-sys 0.52.0", ] +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + [[package]] name = "core-foundation" -version = "0.9.3" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" dependencies = [ "core-foundation-sys", "libc", @@ -301,15 +335,15 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.4" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" [[package]] name = "cpufeatures" -version = "0.2.7" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" dependencies = [ "libc", ] @@ -325,9 +359,9 @@ dependencies = [ [[package]] name = 
"crc-catalog" -version = "2.2.0" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cace84e55f07e7301bae1c519df89cdad8cc3cd868413d3fdbdeca9ff3db484" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" [[package]] name = "crc32fast" @@ -340,46 +374,37 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ - "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" -version = "0.9.15" +version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ - "autocfg", - "cfg-if", "crossbeam-utils", - "memoffset 0.9.0", - "scopeguard", ] [[package]] name = "crossbeam-queue" -version = "0.3.8" +version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add" +checksum = "df0346b5d5e76ac2fe4e327c5fd1118d6be7c51dfb18f9b7922923f287471e35" dependencies = [ - "cfg-if", "crossbeam-utils", ] [[package]] name = "crossbeam-utils" -version = "0.8.16" +version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" -dependencies = [ - "cfg-if", -] +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "crossterm" @@ -391,7 +416,7 @@ dependencies = [ "crossterm_winapi", "libc", "mio", - "parking_lot 0.12.1", + "parking_lot", "signal-hook", "signal-hook-mio", "winapi", @@ -418,12 +443,12 @@ dependencies = [ [[package]] name = "ctrlc" -version = "3.4.0" +version = "3.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a011bbe2c35ce9c1f143b7af6f94f29a167beb4cd1d29e6740ce836f723120e" +checksum = "b467862cc8610ca6fc9a1532d7777cee0804e678ab45410897b9396495994a0b" dependencies = [ "nix", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -462,34 +487,35 @@ dependencies = [ ] [[package]] -name = "digest" -version = "0.10.7" +name = "der" +version = "0.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" dependencies = [ - "block-buffer", - "crypto-common", - "subtle", + "const-oid", + "pem-rfc7468", + "zeroize", ] [[package]] -name = "dirs" -version = "4.0.0" +name = "deranged" +version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3aa72a6f96ea37bbc5aa912f6788242832f75369bdfdadcb0e38423f100059" +checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" dependencies = [ - "dirs-sys", + "powerfmt", ] [[package]] -name = "dirs-sys" -version = "0.3.7" +name = "digest" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ - "libc", - 
"redox_users", - "winapi", + "block-buffer", + "const-oid", + "crypto-common", + "subtle", ] [[package]] @@ -506,9 +532,12 @@ checksum = "545b22097d44f8a9581187cdf93de7a71e4722bf51200cfaba810865b49a495d" [[package]] name = "either" -version = "1.8.1" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" +dependencies = [ + "serde", +] [[package]] name = "encode_unicode" @@ -518,32 +547,38 @@ checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" [[package]] name = "encoding_rs" -version = "0.8.32" +version = "0.8.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394" +checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1" dependencies = [ "cfg-if", ] +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + [[package]] name = "errno" -version = "0.3.1" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ - "errno-dragonfly", "libc", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] -name = "errno-dragonfly" -version = "0.1.2" +name = "etcetera" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +checksum = "136d1b5283a1ab77bd9257427ffd09d8667ced0570b6f938942bc7568ed5b943" dependencies = [ - "cc", - "libc", + "cfg-if", + "home", + "windows-sys 0.48.0", ] [[package]] @@ -554,23 +589,37 @@ checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" [[package]] name = "fastrand" -version = "1.9.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" -dependencies = [ - "instant", -] +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" + +[[package]] +name = "finl_unicode" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" [[package]] name = "flate2" -version = "1.0.27" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6c98ee8095e9d1dcbf2fcc6d95acccb90d1c81db1e44725c6a984b1dbdfb010" +checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" dependencies = [ "crc32fast", "miniz_oxide", ] +[[package]] +name = "flume" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55ac459de2512911e4b674ce33cf20befaba382d05b62b008afc1c8b57cbf181" +dependencies = [ + "futures-core", + "futures-sink", + "spin 0.9.8", +] + [[package]] name = "fnv" version = "1.0.7" @@ -594,18 +643,18 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" [[package]] name = "form_urlencoded" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" dependencies = [ "percent-encoding", ] [[package]] name = "futures" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" +checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" dependencies = [ "futures-channel", "futures-core", @@ -618,9 +667,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" dependencies = [ "futures-core", "futures-sink", @@ -628,15 +677,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" [[package]] name = "futures-executor" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" +checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" dependencies = [ "futures-core", "futures-task", @@ -645,49 +694,49 @@ dependencies = [ [[package]] name = "futures-intrusive" -version = "0.4.2" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a604f7a68fbf8103337523b1fadc8ade7361ee3f112f7c680ad179651616aed5" +checksum = "1d930c203dd0b6ff06e0201a4a2fe9149b43c684fd4420555b26d21b1a02956f" dependencies = [ "futures-core", "lock_api", - "parking_lot 0.11.2", + "parking_lot", ] [[package]] name = "futures-io" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" [[package]] name = "futures-macro" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.48", ] [[package]] name = "futures-sink" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" [[package]] name = "futures-task" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" [[package]] name = "futures-util" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" +checksum = 
"3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" dependencies = [ "futures-channel", "futures-core", @@ -713,20 +762,26 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.10" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" dependencies = [ "cfg-if", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", ] +[[package]] +name = "gimli" +version = "0.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" + [[package]] name = "h2" -version = "0.3.20" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97ec8491ebaf99c8eaa73058b045fe58073cd6be7f596ac993ced0b0a0c01049" +checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9" dependencies = [ "bytes", "fnv", @@ -741,29 +796,23 @@ dependencies = [ "tracing", ] -[[package]] -name = "hashbrown" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" - [[package]] name = "hashbrown" version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" dependencies = [ - "ahash 0.8.7", + "ahash", "allocator-api2", ] [[package]] name = "hashlink" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "312f66718a2d7789ffef4f4b7b213138ed9f1eb3aa1d0d82fc99f88fb3ffd26f" +checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7" dependencies = [ - "hashbrown 0.14.3", + "hashbrown", ] [[package]] @@ -777,18 +826,9 @@ dependencies = [ [[package]] name = "hermit-abi" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" -dependencies = [ - "libc", -] - -[[package]] -name = "hermit-abi" -version = "0.3.2" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" +checksum = "d0c62115964e08cb8039170eb33c1d0e2388a256930279edca206fff675f82c3" [[package]] name = "hex" @@ -798,9 +838,9 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "hkdf" -version = "0.12.3" +version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "791a029f6b9fc27657f6f188ec6e5e43f6911f6f878e0dc5501396e09809d437" +checksum = "7b5f8eb2ad728638ea2c7d47a21db23b7b58a72ed6a38256b8a1849f15fbbdf7" dependencies = [ "hmac", ] @@ -814,11 +854,20 @@ dependencies = [ "digest", ] +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "http" -version = "0.2.9" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd6effc99afb63425aff9b05836f029929e345a6148a14b7ecd5ab67af944482" +checksum = "8947b1a6fad4393052c7ba1f4cd97bed3e953a95c79c92ad9b051a04611d9fbb" dependencies = [ "bytes", "fnv", @@ -827,9 +876,9 
@@ dependencies = [ [[package]] name = "http-body" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" dependencies = [ "bytes", "http", @@ -844,15 +893,15 @@ checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" [[package]] name = "httpdate" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" [[package]] name = "hyper" -version = "0.14.27" +version = "0.14.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffb1cfd654a8219eaef89881fdb3bb3b1cdc5fa75ded05d6933b2b382e395468" +checksum = "bf96e135eb83a2a8ddf766e426a841d8ddd7449d5f00d34ea02b41d2f19eef80" dependencies = [ "bytes", "futures-channel", @@ -887,16 +936,16 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.57" +version = "0.1.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613" +checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" dependencies = [ "android_system_properties", "core-foundation-sys", "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows", + "windows-core", ] [[package]] @@ -916,9 +965,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "idna" -version = "0.4.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" dependencies = [ "unicode-bidi", "unicode-normalization", @@ -926,19 +975,19 @@ dependencies = [ [[package]] name = "indexmap" -version = "1.9.3" +version = "2.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +checksum = "824b2ae422412366ba479e8111fd301f7b5faece8149317bb81925979a53f520" dependencies = [ - "autocfg", - "hashbrown 0.12.3", + "equivalent", + "hashbrown", ] [[package]] name = "indicatif" -version = "0.17.6" +version = "0.17.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b297dc40733f23a0e52728a58fa9489a5b7638a324932de16b41adc3ef80730" +checksum = "fb28741c9db9a713d93deb3bb9515c20788cef5815265bee4980e87bde7e0f25" dependencies = [ "console", "instant", @@ -955,13 +1004,13 @@ checksum = "bfa799dd5ed20a7e349f3b4639aa80d74549c81716d9ec4f994c9b5815598306" [[package]] name = "inherent" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce243b1bfa62ffc028f1cc3b6034ec63d649f3031bc8a4fbbb004e1ac17d1f68" +checksum = "0122b7114117e64a63ac49f752a5ca4624d534c7b1c7de796ac196381cd2d947" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.48", ] [[package]] @@ -989,32 +1038,21 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "io-lifetimes" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" -dependencies = [ - "hermit-abi 0.3.2", - "libc", - 
"windows-sys 0.48.0", -] - [[package]] name = "ipnet" -version = "2.8.0" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" +checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" [[package]] name = "is-terminal" -version = "0.4.9" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" +checksum = "0bad00257d07be169d870ab665980b06cdb366d792ad690bf2e76876dc503455" dependencies = [ - "hermit-abi 0.3.2", - "rustix 0.38.3", - "windows-sys 0.48.0", + "hermit-abi", + "rustix", + "windows-sys 0.52.0", ] [[package]] @@ -1026,17 +1064,26 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itoa" -version = "1.0.6" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" [[package]] name = "js-sys" -version = "0.3.64" +version = "0.3.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" +checksum = "406cda4b368d531c842222cf9d2600a9a4acce8d29423695379c6868a143a9ee" dependencies = [ "wasm-bindgen", ] @@ -1046,12 +1093,15 @@ name = "lazy_static" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +dependencies = [ + "spin 0.5.2", +] [[package]] name = "libc" -version = "0.2.146" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "libloading" @@ -1064,28 +1114,39 @@ dependencies = [ ] [[package]] -name = "linked-hash-map" -version = "0.5.6" +name = "libm" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] -name = "linux-raw-sys" -version = "0.3.8" +name = "libsqlite3-sys" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf4e226dcd58b4be396f7bd3c20da8fdee2911400705297ba7d2d7cc2c30f716" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "linked-hash-map" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" +checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" [[package]] name = "linux-raw-sys" -version = "0.4.11" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "969488b55f8ac402214f3f5fd243ebb7206cf82de60d3172994707a4bcc2b829" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] name = "lock_api" -version = "0.4.10" +version = "0.4.11" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" dependencies = [ "autocfg", "scopeguard", @@ -1093,9 +1154,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.19" +version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" [[package]] name = "lopdf" @@ -1112,16 +1173,17 @@ dependencies = [ "md5", "nom", "rayon", - "time 0.3.22", + "time", "weezl", ] [[package]] name = "md-5" -version = "0.10.5" +version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6365506850d44bff6e2fbcb5176cf63650e48bd45ef2fe2665ae1570e0f4b9ca" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" dependencies = [ + "cfg-if", "digest", ] @@ -1133,9 +1195,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "memchr" -version = "2.5.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] name = "memoffset" @@ -1146,15 +1208,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "memoffset" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" -dependencies = [ - "autocfg", -] - [[package]] name = "mime" version = "0.3.17" @@ -1169,22 +1222,22 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" dependencies = [ "adler", ] [[package]] name = "mio" -version = "0.8.8" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" +checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09" dependencies = [ "libc", "log", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", "windows-sys 0.48.0", ] @@ -1258,11 +1311,11 @@ dependencies = [ [[package]] name = "nix" -version = "0.26.4" +version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b" +checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.4.2", "cfg-if", "libc", ] @@ -1287,22 +1340,67 @@ dependencies = [ "winapi", ] +[[package]] +name = "num-bigint-dig" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151" +dependencies = [ + "byteorder", + "lazy_static", + "libm", + "num-integer", + "num-iter", + "num-traits", + "rand", + "smallvec", + "zeroize", +] + +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + +[[package]] +name = "num-integer" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" -version = "0.2.15" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" +checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" dependencies = [ "autocfg", + "libm", ] [[package]] name = "num_cpus" -version = "1.15.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi 0.2.6", + "hermit-abi", "libc", ] @@ -1313,18 +1411,27 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" [[package]] -name = "once_cell" -version = "1.18.0" +name = "object" +version = "0.32.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +dependencies = [ + "memchr", +] [[package]] -name = "openssl" -version = "0.10.55" +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "openssl" +version = "0.10.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d" +checksum = "15c9d69dd87a29568d4d017cfe8ec518706046a05184e5aea92d0af890b803c8" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.4.2", "cfg-if", "foreign-types", "libc", @@ -1341,7 +1448,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.48", ] [[package]] @@ -1352,18 +1459,18 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-src" -version = "111.26.0+1.1.1u" +version = "300.2.2+3.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efc62c9f12b22b8f5208c23a7200a442b2e5999f8bdf80233852122b5a4f6f37" +checksum = "8bbfad0063610ac26ee79f7484739e2b07555a75c42453b89263830b5c8103bc" dependencies = [ "cc", ] [[package]] name = "openssl-sys" -version = "0.9.90" +version = "0.9.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6" +checksum = "22e1bf214306098e4832460f797824c05d25aacdf896f64a985fb0fd992454ae" dependencies = [ "cc", "libc", @@ -1378,17 +1485,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" -[[package]] -name = 
"parking_lot" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" -dependencies = [ - "instant", - "lock_api", - "parking_lot_core 0.8.6", -] - [[package]] name = "parking_lot" version = "0.12.1" @@ -1396,47 +1492,42 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", - "parking_lot_core 0.9.8", + "parking_lot_core", ] [[package]] name = "parking_lot_core" -version = "0.8.6" +version = "0.9.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc" +checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" dependencies = [ "cfg-if", - "instant", "libc", - "redox_syscall 0.2.16", + "redox_syscall", "smallvec", - "winapi", + "windows-targets 0.48.5", ] [[package]] -name = "parking_lot_core" -version = "0.9.8" +name = "paste" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall 0.3.5", - "smallvec", - "windows-targets 0.48.0", -] +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" [[package]] -name = "paste" -version = "1.0.12" +name = "pem-rfc7468" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f746c4065a8fa3fe23974dd82f15431cc8d40779821001404d10d2e79ca7d79" +checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] [[package]] name = "percent-encoding" -version = "2.3.0" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "pgml" @@ -1452,11 +1543,11 @@ dependencies = [ "indicatif", "inquire", "is-terminal", - "itertools", + "itertools 0.10.5", "lopdf", "md5", "neon", - "parking_lot 0.12.1", + "parking_lot", "pyo3", "pyo3-asyncio", "regex", @@ -1476,9 +1567,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.9" +version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" +checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" [[package]] name = "pin-utils" @@ -1486,17 +1577,44 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkcs1" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" +dependencies = [ + "der", + "pkcs8", + "spki", +] + +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der", + "spki", +] + [[package]] name = "pkg-config" -version = "0.3.27" +version = "0.3.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" +checksum = "2900ede94e305130c13ddd391e0ab7cbaeb783945ae07a279c268cb05109c6cb" [[package]] name = "portable-atomic" -version = "1.4.2" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f32154ba0af3a075eefa1eda8bb414ee928f62303a54ea85b8d6638ff1a6ee9e" +checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" [[package]] name = "ppv-lite86" @@ -1506,9 +1624,9 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.64" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78803b62cbf1f46fde80d7c0e803111524b9877184cfe7c3033659490ac7a7da" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] @@ -1523,8 +1641,8 @@ dependencies = [ "cfg-if", "indoc", "libc", - "memoffset 0.8.0", - "parking_lot 0.12.1", + "memoffset", + "parking_lot", "pyo3-build-config", "pyo3-ffi", "pyo3-macros", @@ -1601,9 +1719,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.29" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "573015e8ab27661678357f27dc26460738fd2b6c86e46f386fde94cb5d913105" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] @@ -1640,9 +1758,9 @@ dependencies = [ [[package]] name = "rayon" -version = "1.8.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1" +checksum = "fa7237101a77a10773db45d62004a272517633fbcc3df19d96455ede1122e051" dependencies = [ "either", "rayon-core", @@ -1650,9 +1768,9 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.12.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ "crossbeam-deque", "crossbeam-utils", @@ -1660,38 +1778,30 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" -dependencies = [ - "bitflags 1.3.2", -] - -[[package]] -name = "redox_syscall" -version = "0.3.5" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" dependencies = [ "bitflags 1.3.2", ] [[package]] -name = "redox_users" -version = "0.4.3" +name = "regex" +version = "1.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" +checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" dependencies = [ - "getrandom", - "redox_syscall 0.2.16", - "thiserror", + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", ] [[package]] -name = "regex" -version = "1.8.4" +name = 
"regex-automata" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f" +checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" dependencies = [ "aho-corasick", "memchr", @@ -1700,17 +1810,17 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.7.2" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" [[package]] name = "reqwest" -version = "0.11.18" +version = "0.11.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cde824a14b7c14f85caff81225f411faacc04a2013f41670f41443742b1c1c55" +checksum = "c6920094eb85afde5e4a138be3f2de8bbdf28000f0029e72c45025a56b042251" dependencies = [ - "base64 0.21.2", + "base64", "bytes", "encoding_rs", "futures-core", @@ -1728,9 +1838,12 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", + "rustls-pemfile", "serde", "serde_json", "serde_urlencoded", + "sync_wrapper", + "system-configuration", "tokio", "tokio-native-tls", "tower-service", @@ -1743,17 +1856,36 @@ dependencies = [ [[package]] name = "ring" -version = "0.16.20" +version = "0.17.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" +checksum = "688c63d65483050968b2a8937f7995f443e27041a0f7700aa59b0822aedebb74" dependencies = [ "cc", + "getrandom", "libc", - "once_cell", - "spin", + "spin 0.9.8", "untrusted", - "web-sys", - "winapi", + "windows-sys 0.48.0", +] + +[[package]] +name = "rsa" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0e5124fcb30e76a7e79bfee683a2746db83784b86289f6251b54b7950a0dfc" +dependencies = [ + "const-oid", + "digest", + "num-bigint-dig", + "num-integer", + "num-traits", + "pkcs1", + "pkcs8", + "rand_core", + "signature", + "spki", + "subtle", + "zeroize", ] [[package]] @@ -1771,7 +1903,7 @@ dependencies = [ "anyhow", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.48", ] [[package]] @@ -1782,58 +1914,59 @@ dependencies = [ ] [[package]] -name = "rustix" -version = "0.37.26" +name = "rustc-demangle" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84f3f8f960ed3b5a59055428714943298bf3fa2d4a1d53135084e0544829d995" -dependencies = [ - "bitflags 1.3.2", - "errno", - "io-lifetimes", - "libc", - "linux-raw-sys 0.3.8", - "windows-sys 0.48.0", -] +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" [[package]] name = "rustix" -version = "0.38.3" +version = "0.38.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac5ffa1efe7548069688cd7028f32591853cd7b5b756d41bcffd2353e4fc75b4" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.4.2", "errno", "libc", - "linux-raw-sys 0.4.11", - "windows-sys 0.48.0", + "linux-raw-sys", + "windows-sys 0.52.0", ] [[package]] name = "rustls" -version = "0.20.9" +version = "0.21.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b80e3dec595989ea8510028f30c408a4630db12c9cbb8de34203b89d6577e99" +checksum = "f9d5a6813c0759e4609cd494e8e725babae6a2ca7b62a5536a13daaec6fcb7ba" dependencies = [ - "log", "ring", + 
"rustls-webpki", "sct", - "webpki", ] [[package]] name = "rustls-pemfile" -version = "1.0.2" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" +dependencies = [ + "base64", +] + +[[package]] +name = "rustls-webpki" +version = "0.101.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b" +checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" dependencies = [ - "base64 0.21.2", + "ring", + "untrusted", ] [[package]] name = "ryu" -version = "1.0.13" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" +checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" [[package]] name = "same-file" @@ -1846,24 +1979,24 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c3733bf4cf7ea0880754e19cb5a462007c4a8c1914bff372ccc95b464f1df88" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] name = "scopeguard" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "sct" -version = "0.7.0" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4" +checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" dependencies = [ "ring", "untrusted", @@ -1871,14 +2004,15 @@ dependencies = [ [[package]] name = "sea-query" -version = "0.29.1" +version = "0.30.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "332375aa0c555318544beec038b285c75f2dbeecaecb844383419ccf2663868e" +checksum = "4166a1e072292d46dc91f31617c2a1cdaf55a8be4b5c9f4bf2ba248e3ac4999b" dependencies = [ "inherent", "sea-query-attr", "sea-query-derive", "serde_json", + "uuid", ] [[package]] @@ -1895,33 +2029,34 @@ dependencies = [ [[package]] name = "sea-query-binder" -version = "0.4.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "420eb97201b8a5c76351af7b4925ce5571c2ec3827063a0fb8285d239e1621a0" +checksum = "36bbb68df92e820e4d5aeb17b4acd5cc8b5d18b2c36a4dd6f4626aabfa7ab1b9" dependencies = [ "sea-query", "serde_json", "sqlx", + "uuid", ] [[package]] name = "sea-query-derive" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd78f2e0ee8e537e9195d1049b752e0433e2cac125426bccb7b5c3e508096117" +checksum = "25a82fcb49253abcb45cdcb2adf92956060ec0928635eb21b4f7a6d8f25ab0bc" dependencies = [ "heck", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.48", "thiserror", ] [[package]] name = "security-framework" -version = "2.9.1" +version = "2.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fc758eb7bffce5b308734e9b0c1468893cae9ff70ebf13e7090be8dcbcc83a8" +checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" dependencies = [ "bitflags 1.3.2", 
"core-foundation", @@ -1932,9 +2067,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.9.0" +version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f51d0c0d83bec45f16480d0ce0058397a69e48fcdc52d1dc8855fb68acbd31a7" +checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" dependencies = [ "core-foundation-sys", "libc", @@ -1957,29 +2092,29 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" [[package]] name = "serde" -version = "1.0.181" +version = "1.0.196" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d3e73c93c3240c0bda063c239298e633114c69a888c3e37ca8bb33f343e9890" +checksum = "870026e60fa08c69f064aa766c10f10b1d62db9ccd4d0abb206472bee0ce3b32" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.181" +version = "1.0.196" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be02f6cb0cd3a5ec20bbcfbcbd749f57daddb1a0882dc2e46a6c236c90b977ed" +checksum = "33c85360c95e7d137454dc81d9a4ed2b8efd8fbe19cee57357b32b9771fccb67" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.48", ] [[package]] name = "serde_json" -version = "1.0.96" +version = "1.0.113" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1" +checksum = "69801b70b1c3dac963ecb03a364ba0ceda9cf60c71cfe475e99864759c8b8a79" dependencies = [ "itoa", "ryu", @@ -2000,9 +2135,9 @@ dependencies = [ [[package]] name = "sha1" -version = "0.10.5" +version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ "cfg-if", "cpufeatures", @@ -2011,9 +2146,9 @@ dependencies = [ [[package]] name = "sha2" -version = "0.10.6" +version = "0.10.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" dependencies = [ "cfg-if", "cpufeatures", @@ -2022,9 +2157,9 @@ dependencies = [ [[package]] name = "sharded-slab" -version = "0.1.4" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" dependencies = [ "lazy_static", ] @@ -2059,29 +2194,39 @@ dependencies = [ "libc", ] +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "digest", + "rand_core", +] + [[package]] name = "slab" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" dependencies = [ "autocfg", ] [[package]] name = "smallvec" -version = "1.10.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" +checksum = 
"e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "socket2" -version = "0.4.9" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" +checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9" dependencies = [ "libc", - "winapi", + "windows-sys 0.48.0", ] [[package]] @@ -2090,119 +2235,251 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +dependencies = [ + "lock_api", +] + +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der", +] + [[package]] name = "sqlformat" -version = "0.2.1" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c12bc9199d1db8234678b7051747c07f517cdcf019262d1847b94ec8b1aee3e" +checksum = "ce81b7bd7c4493975347ef60d8c7e8b742d4694f4c49f93e0a12ea263938176c" dependencies = [ - "itertools", + "itertools 0.12.1", "nom", "unicode_categories", ] [[package]] name = "sqlx" -version = "0.6.3" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8de3b03a925878ed54a954f621e64bf55a3c1bd29652d0d1a17830405350188" +checksum = "dba03c279da73694ef99763320dea58b51095dfe87d001b1d4b5fe78ba8763cf" dependencies = [ "sqlx-core", "sqlx-macros", + "sqlx-mysql", + "sqlx-postgres", + "sqlx-sqlite", ] [[package]] name = "sqlx-core" -version = "0.6.3" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa8241483a83a3f33aa5fff7e7d9def398ff9990b2752b6c6112b83c6d246029" +checksum = "d84b0a3c3739e220d94b3239fd69fb1f74bc36e16643423bd99de3b43c21bfbd" dependencies = [ - "ahash 0.7.6", + "ahash", "atoi", - "base64 0.13.1", - "bitflags 1.3.2", "byteorder", "bytes", "crc", "crossbeam-queue", - "dirs", "dotenvy", "either", "event-listener", "futures-channel", "futures-core", "futures-intrusive", + "futures-io", "futures-util", "hashlink", "hex", - "hkdf", - "hmac", "indexmap", - "itoa", - "libc", "log", - "md-5", "memchr", "once_cell", "paste", "percent-encoding", - "rand", "rustls", "rustls-pemfile", "serde", "serde_json", - "sha1", "sha2", "smallvec", "sqlformat", - "sqlx-rt", - "stringprep", "thiserror", - "time 0.3.22", + "time", + "tokio", "tokio-stream", + "tracing", "url", "uuid", "webpki-roots", - "whoami", ] [[package]] name = "sqlx-macros" -version = "0.6.3" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89961c00dc4d7dffb7aee214964b065072bff69e36ddb9e2c107541f75e4f2a5" +dependencies = [ + "proc-macro2", + "quote", + "sqlx-core", + "sqlx-macros-core", + "syn 1.0.109", +] + +[[package]] +name = "sqlx-macros-core" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9966e64ae989e7e575b19d7265cb79d7fc3cbbdf179835cb0d716f294c2049c9" +checksum = "d0bd4519486723648186a08785143599760f7cc81c52334a55d6a83ea1e20841" dependencies = [ + "atomic-write-file", "dotenvy", "either", "heck", + "hex", "once_cell", "proc-macro2", "quote", + "serde", 
"serde_json", "sha2", "sqlx-core", - "sqlx-rt", + "sqlx-mysql", + "sqlx-postgres", + "sqlx-sqlite", "syn 1.0.109", + "tempfile", + "tokio", "url", ] [[package]] -name = "sqlx-rt" -version = "0.6.3" +name = "sqlx-mysql" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e37195395df71fd068f6e2082247891bc11e3289624bbc776a0cdfa1ca7f1ea4" +dependencies = [ + "atoi", + "base64", + "bitflags 2.4.2", + "byteorder", + "bytes", + "crc", + "digest", + "dotenvy", + "either", + "futures-channel", + "futures-core", + "futures-io", + "futures-util", + "generic-array", + "hex", + "hkdf", + "hmac", + "itoa", + "log", + "md-5", + "memchr", + "once_cell", + "percent-encoding", + "rand", + "rsa", + "serde", + "sha1", + "sha2", + "smallvec", + "sqlx-core", + "stringprep", + "thiserror", + "time", + "tracing", + "uuid", + "whoami", +] + +[[package]] +name = "sqlx-postgres" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "804d3f245f894e61b1e6263c84b23ca675d96753b5abfd5cc8597d86806e8024" +checksum = "d6ac0ac3b7ccd10cc96c7ab29791a7dd236bd94021f31eec7ba3d46a74aa1c24" dependencies = [ + "atoi", + "base64", + "bitflags 2.4.2", + "byteorder", + "crc", + "dotenvy", + "etcetera", + "futures-channel", + "futures-core", + "futures-io", + "futures-util", + "hex", + "hkdf", + "hmac", + "home", + "itoa", + "log", + "md-5", + "memchr", "once_cell", - "tokio", - "tokio-rustls", + "rand", + "serde", + "serde_json", + "sha1", + "sha2", + "smallvec", + "sqlx-core", + "stringprep", + "thiserror", + "time", + "tracing", + "uuid", + "whoami", +] + +[[package]] +name = "sqlx-sqlite" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "210976b7d948c7ba9fced8ca835b11cbb2d677c59c79de41ac0d397e14547490" +dependencies = [ + "atoi", + "flume", + "futures-channel", + "futures-core", + "futures-executor", + "futures-intrusive", + "futures-util", + "libsqlite3-sys", + "log", + "percent-encoding", + "serde", + "sqlx-core", + "time", + "tracing", + "url", + "urlencoding", + "uuid", ] [[package]] name = "stringprep" -version = "0.1.2" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ee348cb74b87454fff4b551cbf727025810a004f88aeacae7f85b87f4e9a1c1" +checksum = "bb41d74e231a107a1b4ee36bd1214b11285b77768d2e3824aedafa988fd36ee6" dependencies = [ + "finl_unicode", "unicode-bidi", "unicode-normalization", ] @@ -2232,9 +2509,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.32" +version = "2.0.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" +checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f" dependencies = [ "proc-macro2", "quote", @@ -2243,53 +2520,78 @@ dependencies = [ [[package]] name = "syn-mid" -version = "0.5.3" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baa8e7560a164edb1621a55d18a0c59abf49d360f47aa7b821061dd7eea7fac9" +checksum = "fea305d57546cc8cd04feb14b62ec84bf17f50e3f7b12560d7bfa9265f39d9ed" dependencies = [ "proc-macro2", "quote", "syn 1.0.109", ] +[[package]] +name = "sync_wrapper" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" + +[[package]] +name = "system-configuration" +version = "0.5.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "target-lexicon" -version = "0.12.7" +version = "0.12.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd1ba337640d60c3e96bc6f0638a939b9c9a7f2c316a1598c279828b3d1dc8c5" +checksum = "69758bda2e78f098e4ccb393021a0963bb3442eac05f135c30f61b7370bbafae" [[package]] name = "tempfile" -version = "3.6.0" +version = "3.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31c0432476357e58790aaa47a8efb0c5138f137343f3b5f23bd36a27e3b0a6d6" +checksum = "a365e8cd18e44762ef95d87f284f4b5cd04107fec2ff3052bd6a3e6069669e67" dependencies = [ - "autocfg", "cfg-if", "fastrand", - "redox_syscall 0.3.5", - "rustix 0.37.26", - "windows-sys 0.48.0", + "rustix", + "windows-sys 0.52.0", ] [[package]] name = "thiserror" -version = "1.0.40" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" +checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.40" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" +checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.48", ] [[package]] @@ -2304,22 +2606,14 @@ dependencies = [ [[package]] name = "time" -version = "0.1.45" +version = "0.3.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" -dependencies = [ - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", - "winapi", -] - -[[package]] -name = "time" -version = "0.3.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea9e1b3cf1243ae005d9e74085d4d542f3125458f3a81af210d901dcd7411efd" +checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749" dependencies = [ + "deranged", "itoa", + "num-conv", + "powerfmt", "serde", "time-core", "time-macros", @@ -2327,16 +2621,17 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.9" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b" +checksum = "7ba3a3ef41e6672a2f0f001392bb5dcd3ff0a9992d618ca761a11c3121547774" dependencies = [ + "num-conv", "time-core", ] @@ -2357,11 +2652,11 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.28.2" +version = "1.36.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "94d7b1cfd2aa4011f2de74c2c4c63665e27a71006b0a192dcd2710272e73dfa2" +checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" dependencies = [ - "autocfg", + "backtrace", "bytes", "libc", "mio", @@ -2374,13 +2669,13 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.48", ] [[package]] @@ -2393,17 +2688,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "tokio-rustls" -version = "0.23.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" -dependencies = [ - "rustls", - "tokio", - "webpki", -] - [[package]] name = "tokio-stream" version = "0.1.14" @@ -2417,9 +2701,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.8" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d" +checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" dependencies = [ "bytes", "futures-core", @@ -2437,11 +2721,11 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" -version = "0.1.37" +version = "0.1.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" dependencies = [ - "cfg-if", + "log", "pin-project-lite", "tracing-attributes", "tracing-core", @@ -2449,20 +2733,20 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.26" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.48", ] [[package]] name = "tracing-core" -version = "0.1.31" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" dependencies = [ "once_cell", "valuable", @@ -2470,12 +2754,12 @@ dependencies = [ [[package]] name = "tracing-log" -version = "0.1.3" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78ddad33d2d10b1ed7eb9d1f518a5674713876e97e5bb9b7345a7984fbb4f922" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" dependencies = [ - "lazy_static", "log", + "once_cell", "tracing-core", ] @@ -2491,9 +2775,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.17" +version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" +checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" dependencies = [ "nu-ansi-term", "serde", @@ -2508,27 +2792,27 @@ dependencies = [ [[package]] name = 
"try-lock" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "typenum" -version = "1.16.0" +version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "unicode-bidi" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" [[package]] name = "unicode-ident" -version = "1.0.9" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-normalization" @@ -2541,15 +2825,15 @@ dependencies = [ [[package]] name = "unicode-segmentation" -version = "1.10.1" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" +checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" [[package]] name = "unicode-width" -version = "0.1.10" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" +checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" [[package]] name = "unicode_categories" @@ -2565,21 +2849,27 @@ checksum = "e1766d682d402817b5ac4490b3c3002d91dfa0d22812f341609f97b08757359c" [[package]] name = "untrusted" -version = "0.7.1" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50bff7831e19200a85b17131d085c25d7811bc4e186efdaf54bbd132994a88cb" +checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" dependencies = [ "form_urlencoded", "idna", "percent-encoding", ] +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + [[package]] name = "utf8parse" version = "0.2.1" @@ -2588,9 +2878,9 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "uuid" -version = "1.3.4" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa2982af2eec27de306107c027578ff7f423d65f7250e40ce0fea8f45248b81" +checksum = "f00cc9702ca12d3c81455259621e676d0f7251cec66a21e98fe2e9a37db93b2a" dependencies = [ "getrandom", "serde", @@ -2633,12 +2923,6 @@ dependencies = [ "try-lock", ] -[[package]] -name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -2647,9 +2931,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.87" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" +checksum = "c1e124130aee3fb58c5bdd6b639a0509486b0338acaaae0c84a5124b0f588b7f" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -2657,24 +2941,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.87" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" +checksum = "c9e7e1900c352b609c8488ad12639a311045f40a35491fb69ba8c12f758af70b" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.48", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.37" +version = "0.4.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c02dbc21516f9f1f04f187958890d7e6026df8d16540b7ad9492bc34a67cea03" +checksum = "877b9c3f61ceea0e56331985743b13f3d25c406a7098d45180fb5f09bc19ed97" dependencies = [ "cfg-if", "js-sys", @@ -2684,9 +2968,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.87" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" +checksum = "b30af9e2d358182b5c7449424f017eba305ed32a7010509ede96cdc4696c46ed" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2694,67 +2978,50 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.87" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" +checksum = "642f325be6301eb8107a83d12a8ac6c1e1c54345a7ef1a9261962dfefda09e66" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.48", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.87" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" +checksum = "4f186bd2dcf04330886ce82d6f33dd75a7bfcf69ecf5763b89fcde53b6ac9838" [[package]] name = "web-sys" -version = "0.3.64" +version = "0.3.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" +checksum = "96565907687f7aceb35bc5fc03770a8a0471d82e479f25832f54a0e3f4b28446" dependencies = [ "js-sys", "wasm-bindgen", ] -[[package]] -name = "webpki" -version = "0.22.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0e74f82d49d545ad128049b7e88f6576df2da6b02e9ce565c6f533be576957e" -dependencies = [ - "ring", - "untrusted", -] - [[package]] name = "webpki-roots" -version = "0.22.6" +version = "0.25.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6c71e40d7d2c34a5106301fb632274ca37242cd0c9d3e64dbece371a40a2d87" -dependencies = [ - "webpki", -] +checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1" [[package]] name = "weezl" -version = "0.1.7" +version = "0.1.8" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9193164d4de03a926d909d3bc7c30543cecb35400c02114792c2cae20d5e2dbb" +checksum = "53a85b86a771b1c87058196170769dd264f66c0782acf1ae6cc51bfd64b39082" [[package]] name = "whoami" -version = "1.4.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c70234412ca409cc04e864e89523cb0fc37f5e1344ebed5a3ebf4192b6b9f68" -dependencies = [ - "wasm-bindgen", - "web-sys", -] +checksum = "22fc3756b8a9133049b26c7f61ab35416c130e8c09b660f5b3958b446f52cc50" [[package]] name = "winapi" @@ -2788,153 +3055,154 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] -name = "windows" -version = "0.48.0" +name = "windows-core" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.48.0", + "windows-targets 0.52.0", ] [[package]] name = "windows-sys" -version = "0.45.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets 0.42.2", + "windows-targets 0.48.5", ] [[package]] name = "windows-sys" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.48.0", + "windows-targets 0.52.0", ] [[package]] name = "windows-targets" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", ] [[package]] name = "windows-targets" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" +checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" dependencies = [ - "windows_aarch64_gnullvm 0.48.0", - "windows_aarch64_msvc 0.48.0", - "windows_i686_gnu 0.48.0", - "windows_i686_msvc 0.48.0", - "windows_x86_64_gnu 0.48.0", - "windows_x86_64_gnullvm 0.48.0", - "windows_x86_64_msvc 0.48.0", + "windows_aarch64_gnullvm 0.52.0", + "windows_aarch64_msvc 0.52.0", + "windows_i686_gnu 0.52.0", + "windows_i686_msvc 0.52.0", + "windows_x86_64_gnu 0.52.0", + "windows_x86_64_gnullvm 0.52.0", + "windows_x86_64_msvc 0.52.0", ] [[package]] name = "windows_aarch64_gnullvm" -version = "0.42.2" +version = "0.48.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" [[package]] name = "windows_aarch64_msvc" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" [[package]] name = "windows_i686_gnu" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" [[package]] name = "windows_i686_msvc" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" [[package]] name = "windows_x86_64_gnu" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" [[package]] name = "windows_x86_64_gnullvm" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +checksum = 
"1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" [[package]] name = "windows_x86_64_msvc" -version = "0.42.2" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" [[package]] name = "winreg" -version = "0.10.1" +version = "0.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d" +checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" dependencies = [ - "winapi", + "cfg-if", + "windows-sys 0.48.0", ] [[package]] @@ -2954,5 +3222,11 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.48", ] + +[[package]] +name = "zeroize" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" diff --git a/pgml-sdks/pgml/Cargo.toml b/pgml-sdks/pgml/Cargo.toml index 55d9d3cf0..cd0304cdf 100644 --- a/pgml-sdks/pgml/Cargo.toml +++ b/pgml-sdks/pgml/Cargo.toml @@ -15,7 +15,7 @@ crate-type = ["lib", "cdylib"] [dependencies] rust_bridge = {path = "../rust-bridge/rust-bridge", version = "0.1.0"} -sqlx = { version = "0.6.3", features = [ "runtime-tokio-rustls", "postgres", "json", "time", "uuid"] } +sqlx = { version = "0.7.3", features = [ "runtime-tokio-rustls", "postgres", "json", "time", "uuid"] } serde_json = "1.0.9" anyhow = "1.0.9" tokio = { version = "1.28.2", features = [ "macros" ] } @@ -26,8 +26,8 @@ neon = { version = "0.10", optional = true, default-features = false, features = itertools = "0.10.5" uuid = {version = "1.3.3", features = ["v4", "serde"] } md5 = "0.7.0" -sea-query = { version = "0.29.1", features = ["attr", "thread-safe", "with-json", "postgres-array"] } -sea-query-binder = { version = "0.4.0", features = ["sqlx-postgres", "with-json", "postgres-array"] } +sea-query = { version = "0.30.7", features = ["attr", "thread-safe", "with-json", "with-uuid", "postgres-array"] } +sea-query-binder = { version = "0.5.0", features = ["sqlx-postgres", "with-json", "with-uuid", "postgres-array"] } regex = "1.8.4" reqwest = { version = "0.11", features = ["json", "native-tls-vendored"] } async-trait = "0.1.71" diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index 50a852c6d..ee30d1be6 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -18,6 +18,7 @@ use tokio::sync::Mutex; use tracing::{instrument, warn}; use walkdir::WalkDir; +use crate::debug_sqlx_query; use crate::filter_builder::FilterBuilder; use crate::search_query_builder::build_search_query; use crate::vector_search_query_builder::build_vector_search_query; @@ -515,29 +516,39 @@ impl Collection { ); anyhow::Ok(acc) })?; - + let versions = serde_json::to_value(versions)?; let query = if args .get("merge") .map(|v| v.as_bool().unwrap_or(false)) .unwrap_or(false) { - query_builder!( - "WITH prev AS (SELECT document FROM %s WHERE source_uuid = $1) INSERT INTO %s 
(source_uuid, document, version) VALUES ($1, $2, $3) ON CONFLICT (source_uuid) DO UPDATE SET document = %s.document || EXCLUDED.document, version = EXCLUDED.version RETURNING id, (SELECT document FROM prev)", - self.documents_table_name, - self.documents_table_name, - self.documents_table_name - ) + let query = query_builder!( + queries::UPSERT_DOCUMENT_AND_MERGE_METADATA, + self.documents_table_name, + self.documents_table_name, + self.documents_table_name + ); + debug_sqlx_query!( + UPSERT_DOCUMENT_AND_MERGE_METADATA, + query, + source_uuid, + document.0, + versions + ); + query } else { - query_builder!( - "WITH prev AS (SELECT document FROM %s WHERE source_uuid = $1) INSERT INTO %s (source_uuid, document, version) VALUES ($1, $2, $3) ON CONFLICT (source_uuid) DO UPDATE SET document = EXCLUDED.document, version = EXCLUDED.version RETURNING id, (SELECT document FROM prev)", - self.documents_table_name, - self.documents_table_name - ) + let query = query_builder!( + queries::UPSERT_DOCUMENT, + self.documents_table_name, + self.documents_table_name + ); + debug_sqlx_query!(UPSERT_DOCUMENT, query, source_uuid, document.0, versions); + query }; let (document_id, previous_document): (i64, Option) = sqlx::query_as(&query) .bind(source_uuid) .bind(document) - .bind(serde_json::to_value(versions)?) + .bind(versions) .fetch_one(&mut *transaction) .await?; dp.push((document_id, document, previous_document)); diff --git a/pgml-sdks/pgml/src/filter_builder.rs b/pgml-sdks/pgml/src/filter_builder.rs index f820441a8..93b053897 100644 --- a/pgml-sdks/pgml/src/filter_builder.rs +++ b/pgml-sdks/pgml/src/filter_builder.rs @@ -111,9 +111,9 @@ fn build_recursive<'a>( expression .contains(Expr::val(serde_value_to_sea_query_value(&json))) } else { - expression - .not() - .contains(Expr::val(serde_value_to_sea_query_value(&json))) + let expression = expression + .contains(Expr::val(serde_value_to_sea_query_value(&json))); + expression.not() } } else { let expression = Expr::cust( diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 03a3e1edf..ecc8a271c 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -8,7 +8,7 @@ use parking_lot::RwLock; use sqlx::{postgres::PgPoolOptions, PgPool}; use std::collections::HashMap; use std::env; -use tokio::runtime::Runtime; +use tokio::runtime::{Builder, Runtime}; use tracing::Level; use tracing_subscriber::FmtSubscriber; @@ -133,7 +133,11 @@ fn get_or_set_runtime<'a>() -> &'a Runtime { if let Some(r) = &RUNTIME { r } else { - let runtime = Runtime::new().unwrap(); + // TODO: Have some discussion about whether we want single or multi thread here + let runtime = Builder::new_current_thread() + .enable_all() + .build() + .expect("Error creating tokio runtime"); RUNTIME = Some(runtime); get_or_set_runtime() } @@ -319,7 +323,7 @@ mod tests { #[tokio::test] async fn can_add_pipeline_and_upsert_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_capaud_73"; + let collection_name = "test_r_c_capaud_106"; let pipeline_name = "test_r_p_capaud_6"; let mut pipeline = Pipeline::new( pipeline_name, @@ -387,9 +391,9 @@ mod tests { #[tokio::test] async fn can_upsert_documents_and_add_pipeline() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cudaap_44"; + let collection_name = "test_r_c_cudaap_49"; let mut collection = Collection::new(collection_name, None); - let documents = generate_dummy_documents(2); + let documents = 
generate_dummy_documents(100); collection.upsert_documents(documents.clone(), None).await?; let pipeline_name = "test_r_p_cudaap_9"; let mut pipeline = Pipeline::new( @@ -445,7 +449,7 @@ mod tests { .fetch_all(&pool) .await?; assert!(tsvectors.len() == 4); - collection.archive().await?; + // collection.archive().await?; Ok(()) } @@ -462,7 +466,7 @@ mod tests { collection.enable_pipeline(&mut pipeline).await?; let queried_pipeline = &collection.get_pipelines().await?[0]; assert_eq!(pipeline.name, queried_pipeline.name); - collection.archive().await?; + // collection.archive().await?; Ok(()) } @@ -818,9 +822,9 @@ mod tests { #[tokio::test] async fn can_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cswle_80"; + let collection_name = "test_r_c_cswle_84"; let mut collection = Collection::new(collection_name, None); - let documents = generate_dummy_documents(10); + let documents = generate_dummy_documents(11); collection.upsert_documents(documents.clone(), None).await?; let pipeline_name = "test_r_p_cswle_9"; let mut pipeline = Pipeline::new( @@ -911,6 +915,9 @@ mod tests { .map(|r| r["document"]["id"].as_u64().unwrap()) .collect(); assert_eq!(ids, vec![9, 2, 7, 8, 3]); + + // Do some checks on the search results tables + collection.archive().await?; Ok(()) } @@ -998,7 +1005,7 @@ mod tests { #[tokio::test] async fn can_vector_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cvswle_3"; + let collection_name = "test_r_c_cvswle_5"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; @@ -1035,7 +1042,7 @@ mod tests { "fields": { "title": { "query": "Test document: 2", - "full_text_search": "test" + "full_text_filter": "test" }, "body": { "query": "Test document: 2" diff --git a/pgml-sdks/pgml/src/pipeline.rs b/pgml-sdks/pgml/src/pipeline.rs index b89c2cd9d..61b9f04cf 100644 --- a/pgml-sdks/pgml/src/pipeline.rs +++ b/pgml-sdks/pgml/src/pipeline.rs @@ -8,6 +8,7 @@ use std::sync::Arc; use tokio::sync::Mutex; use tracing::instrument; +use crate::debug_sqlx_query; use crate::remote_embeddings::PoolOrArcMutextTransaction; use crate::{ collection::ProjectInfo, @@ -391,7 +392,7 @@ impl Pipeline { #[instrument(skip(self))] async fn create_tables( &mut self, - transaction: &mut Transaction<'static, Postgres>, + transaction: &mut Transaction<'_, Postgres>, ) -> anyhow::Result<()> { let project_info = self .project_info @@ -481,7 +482,7 @@ impl Pipeline { "SELECT embedding from pgml.embed(transformer => $1, text => 'Hello, World!', kwargs => $2) as embedding") .bind(&embed.model.name) .bind(&embed.model.parameters) - .fetch_one(&mut *transaction).await?; + .fetch_one(&mut **transaction).await?; embedding.0.len() as i64 } t => { @@ -502,7 +503,7 @@ impl Pipeline { documents_table_name, embedding_length )) - .execute(&mut *transaction) + .execute(&mut **transaction) .await?; let index_name = format!("{}_pipeline_embedding_chunk_id_index", key); transaction @@ -677,46 +678,41 @@ impl Pipeline { .database_data .as_ref() .context("Splitter must be verified to sync chunks")?; - - sqlx::query_scalar(&query_builder!( - queries::GENERATE_CHUNKS_FOR_DOCUMENT_IDS, + let query = query_builder!( + queries::GENERATE_CHUNKS_FOR_DOCUMENT_IDS_WITH_SPLITTER, &json_key_query, documents_table_name, &chunks_table_name, &chunks_table_name, &chunks_table_name - 
)) - .bind(splitter_database_data.id) - .bind(document_ids) - .fetch_all(&mut *transaction.lock().await) - .await - .map_err(anyhow::Error::msg) + ); + debug_sqlx_query!( + GENERATE_CHUNKS_FOR_DOCUMENT_IDS_WITH_SPLITTER, + query, + splitter_database_data.id, + document_ids + ); + sqlx::query_scalar(&query) + .bind(splitter_database_data.id) + .bind(document_ids) + .fetch_all(&mut **transaction.lock().await) + .await + .map_err(anyhow::Error::msg) } else { - sqlx::query_scalar(&query_builder!( - r#" - INSERT INTO %s( - document_id, chunk_index, chunk - ) - SELECT - id, - 1, - %d - FROM %s documents - WHERE id = ANY($1) - AND %d <> COALESCE((SELECT chunk FROM %s chunks WHERE chunks.document_id = documents.id AND chunks.chunk_index = 1), '') - ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk - RETURNING id - "#, + let query = query_builder!( + queries::GENERATE_CHUNKS_FOR_DOCUMENT_IDS, &chunks_table_name, &json_key_query, &documents_table_name, - &json_key_query, - &chunks_table_name - )) - .bind(document_ids) - .fetch_all(&mut *transaction.lock().await) - .await - .map_err(anyhow::Error::msg) + &chunks_table_name, + &json_key_query + ); + debug_sqlx_query!(GENERATE_CHUNKS_FOR_DOCUMENT_IDS, query, document_ids); + sqlx::query_scalar(&query) + .bind(document_ids) + .fetch_all(&mut **transaction.lock().await) + .await + .map_err(anyhow::Error::msg) } } @@ -746,16 +742,24 @@ impl Pipeline { match model.runtime { ModelRuntime::Python => { - sqlx::query(&query_builder!( + let query = query_builder!( queries::GENERATE_EMBEDDINGS_FOR_CHUNK_IDS, embeddings_table_name, chunks_table_name - )) - .bind(&model.name) - .bind(¶meters) - .bind(chunk_ids) - .execute(&mut *transaction.lock().await) - .await?; + ); + debug_sqlx_query!( + GENERATE_EMBEDDINGS_FOR_CHUNK_IDS, + query, + model.name, + parameters.0, + chunk_ids + ); + sqlx::query(&query) + .bind(&model.name) + .bind(¶meters) + .bind(chunk_ids) + .execute(&mut **transaction.lock().await) + .await?; } r => { let remote_embeddings = build_remote_embeddings(r, &model.name, Some(¶meters))?; @@ -784,26 +788,25 @@ impl Pipeline { .project_info .as_ref() .context("Pipeline must have project info to sync TSVectors")?; - let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); let tsvectors_table_name = format!("{}_{}.{}_tsvectors", project_info.name, self.name, key); - - sqlx::query(&query_builder!( + let query = query_builder!( queries::GENERATE_TSVECTORS_FOR_CHUNK_IDS, tsvectors_table_name, configuration, chunks_table_name - )) - .bind(chunk_ids) - .execute(&mut *transaction.lock().await) - .await?; + ); + debug_sqlx_query!(GENERATE_TSVECTORS_FOR_CHUNK_IDS, query, chunk_ids); + sqlx::query(&query) + .bind(chunk_ids) + .execute(&mut **transaction.lock().await) + .await?; Ok(()) } #[instrument(skip(self))] pub(crate) async fn resync(&mut self) -> anyhow::Result<()> { self.verify_in_database(false).await?; - // We are assuming we have manually verified the pipeline before doing this let project_info = self .project_info @@ -813,7 +816,6 @@ impl Pipeline { .parsed_schema .as_ref() .context("Pipeline must have schema to execute")?; - // Before doing any syncing, delete all old and potentially outdated documents let pool = self.get_pool().await?; for (key, _value) in parsed_schema.iter() { @@ -821,7 +823,6 @@ impl Pipeline { pool.execute(query_builder!("DELETE FROM %s CASCADE", chunks_table_name).as_str()) .await?; } - for (key, value) in parsed_schema.iter() { self.resync_chunks(key, 
value.splitter.as_ref().map(|v| &v.model)) .await?; @@ -842,7 +843,6 @@ impl Pipeline { .project_info .as_ref() .context("Pipeline must have project info to sync chunks")?; - let pool = self.get_pool().await?; let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); @@ -854,39 +854,31 @@ impl Pipeline { .database_data .as_ref() .context("Splitter must be verified to sync chunks")?; - - sqlx::query(&query_builder!( - queries::GENERATE_CHUNKS, - &chunks_table_name, + let query = query_builder!( + queries::GENERATE_CHUNKS_WITH_SPLITTER, &json_key_query, - documents_table_name, + &documents_table_name, + &chunks_table_name, &chunks_table_name - )) - .bind(splitter_database_data.id) - .execute(&pool) - .await?; + ); + debug_sqlx_query!( + GENERATE_CHUNKS_WITH_SPLITTER, + query, + splitter_database_data.id + ); + sqlx::query(&query) + .bind(splitter_database_data.id) + .execute(&pool) + .await?; } else { - sqlx::query(&query_builder!( - r#" - INSERT INTO %s( - document_id, chunk_index, chunk - ) - SELECT - id, - 1, - %d - FROM %s - WHERE id NOT IN (SELECT document_id FROM %s) - ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk - RETURNING id - "#, + let query = query_builder!( + queries::GENERATE_CHUNKS, &chunks_table_name, &json_key_query, - &documents_table_name, - &chunks_table_name - )) - .execute(&pool) - .await?; + &documents_table_name + ); + debug_sqlx_query!(GENERATE_CHUNKS, query); + sqlx::query(&query).execute(&pool).await?; } Ok(()) } @@ -913,16 +905,18 @@ impl Pipeline { match model.runtime { ModelRuntime::Python => { - sqlx::query(&query_builder!( + let query = query_builder!( queries::GENERATE_EMBEDDINGS, embeddings_table_name, chunks_table_name, embeddings_table_name - )) - .bind(&model.name) - .bind(¶meters) - .execute(&pool) - .await?; + ); + debug_sqlx_query!(GENERATE_EMBEDDINGS, query, model.name, parameters.0); + sqlx::query(&query) + .bind(&model.name) + .bind(¶meters) + .execute(&pool) + .await?; } r => { let remote_embeddings = build_remote_embeddings(r, &model.name, Some(¶meters))?; @@ -951,15 +945,14 @@ impl Pipeline { let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); let tsvectors_table_name = format!("{}_{}.{}_tsvectors", project_info.name, self.name, key); - sqlx::query(&query_builder!( + let query = query_builder!( queries::GENERATE_TSVECTORS, tsvectors_table_name, configuration, - chunks_table_name, - tsvectors_table_name - )) - .execute(&pool) - .await?; + chunks_table_name + ); + debug_sqlx_query!(GENERATE_TSVECTORS, query); + sqlx::query(&query).execute(&pool).await?; Ok(()) } diff --git a/pgml-sdks/pgml/src/queries.rs b/pgml-sdks/pgml/src/queries.rs index cfb541599..18342ce10 100644 --- a/pgml-sdks/pgml/src/queries.rs +++ b/pgml-sdks/pgml/src/queries.rs @@ -1,6 +1,7 @@ ///////////////////////////// // CREATE TABLE QUERIES ///// ///////////////////////////// + pub const CREATE_COLLECTIONS_TABLE: &str = r#" CREATE TABLE IF NOT EXISTS pgml.collections ( id serial8 PRIMARY KEY, @@ -104,6 +105,7 @@ CREATE TABLE IF NOT EXISTS %s ( ///////////////////////////// // CREATE INDICES /////////// ///////////////////////////// + pub const CREATE_INDEX: &str = r#" CREATE INDEX %d IF NOT EXISTS %s ON %s (%d); "#; @@ -117,8 +119,39 @@ CREATE INDEX %d IF NOT EXISTS %s on %s using hnsw (%d) %d; "#; ///////////////////////////// -// Other Big Queries //////// +// Upserting Documents ////// +///////////////////////////// + +// Tag: CRITICAL_QUERY +// Checked: True +// Trigger: Runs whenever 
a user upserts a document +// Required indexes: +// documents table | - "documents_source_uuid_key" UNIQUE CONSTRAINT, btree (source_uuid) +// Used to upsert a document and merge the previous metadata on conflict +pub const UPSERT_DOCUMENT_AND_MERGE_METADATA: &str = r#" +WITH prev AS (SELECT document FROM %s WHERE source_uuid = $1) INSERT INTO %s (source_uuid, document, version) VALUES ($1, $2, $3) ON CONFLICT (source_uuid) DO UPDATE SET document = %s.document || EXCLUDED.document, version = EXCLUDED.version RETURNING id, (SELECT document FROM prev) +"#; + +// Tag: CRITICAL_QUERY +// Checked: True +// Trigger: Runs whenever a user upserts a document +// Required indexes: +// - documents table | "documents_source_uuid_key" UNIQUE CONSTRAINT, btree (source_uuid) +// Used to upsert a document and over the previous document on conflict +pub const UPSERT_DOCUMENT: &str = r#" +WITH prev AS (SELECT document FROM %s WHERE source_uuid = $1) INSERT INTO %s (source_uuid, document, version) VALUES ($1, $2, $3) ON CONFLICT (source_uuid) DO UPDATE SET document = EXCLUDED.document, version = EXCLUDED.version RETURNING id, (SELECT document FROM prev) +"#; + +///////////////////////////// +// Generaiting TSVectors //// ///////////////////////////// + +// Tag: CRITICAL_QUERY +// Checked: True +// Trigger: Runs whenever a pipeline is syncing documents and does full_text_search +// Required indexes: +// - chunks table | "{key}_tsvectors_pkey" PRIMARY KEY, btree (id) +// Used to generate tsvectors for specific chunks pub const GENERATE_TSVECTORS_FOR_CHUNK_IDS: &str = r#" INSERT INTO %s (chunk_id, document_id, ts) SELECT @@ -131,6 +164,11 @@ WHERE id = ANY ($1) ON CONFLICT (chunk_id) DO UPDATE SET ts = EXCLUDED.ts; "#; +// Tag: CRITICAL_QUERY +// Checked: True +// Trigger: Runs whenever a pipeline is resyncing and does full_text_search +// Required indexes: None +// Used to generate tsvectors for an entire collection pub const GENERATE_TSVECTORS: &str = r#" INSERT INTO %s (chunk_id, document_id, ts) SELECT @@ -138,17 +176,20 @@ SELECT document_id, to_tsvector('%d', chunk) ts FROM - %s -WHERE - id NOT IN ( - SELECT - chunk_id - FROM - %s - ) -ON CONFLICT (chunk_id) DO NOTHING; + %s chunks +ON CONFLICT (chunk_id) DO UPDATE SET ts = EXCLUDED.ts; "#; +///////////////////////////// +// Generaiting Embeddings /// +///////////////////////////// + +// Tag: CRITICAL_QUERY +// Checked: True +// Trigger: Runs whenver a pipeline is syncing documents and does semantic_search +// Required indexes: +// - chunks table | "{key}_chunks_pkey" PRIMARY KEY, btree (id) +// Used to generate embeddings for specific chunks pub const GENERATE_EMBEDDINGS_FOR_CHUNK_IDS: &str = r#" INSERT INTO %s (chunk_id, document_id, embedding) SELECT @@ -166,6 +207,11 @@ WHERE ON CONFLICT (chunk_id) DO UPDATE SET embedding = EXCLUDED.embedding "#; +// Tag: CRITICAL_QUERY +// Checked: True +// Trigger: Runs whenever a pipeline is resyncing and does semantic_search +// Required indexes: None +// Used to generate embeddings for an entire collection pub const GENERATE_EMBEDDINGS: &str = r#" INSERT INTO %s (chunk_id, document_id, embedding) SELECT @@ -178,95 +224,166 @@ SELECT ) FROM %s -WHERE - id NOT IN ( - SELECT - chunk_id - FROM - %s - ) -ON CONFLICT (chunk_id) DO NOTHING; +ON CONFLICT (chunk_id) DO UPDATE set embedding = EXCLUDED.embedding; "#; -pub const GENERATE_CHUNKS: &str = r#" -WITH splitter as ( +///////////////////////////// +// Generating Chunks /////// +///////////////////////////// + +// Tag: CRITICAL_QUERY +// Checked: False +// Used to 
generate chunks for a specific documents with a splitter +pub const GENERATE_CHUNKS_FOR_DOCUMENT_IDS_WITH_SPLITTER: &str = r#" +WITH splitter AS ( SELECT - name, - parameters + name, + parameters FROM - pgml.splitters + pgml.splitters WHERE - id = $1 -) + id = $1 +), +new AS ( + SELECT + documents.id AS document_id, + pgml.chunk (( + SELECT + name + FROM splitter), %d, ( + SELECT + parameters + FROM splitter)) AS chunk_t +FROM + %s AS documents + WHERE + id = ANY ($2) +), +del AS ( + DELETE FROM %s chunks + WHERE chunk_index > ( + SELECT + MAX((chunk_t).chunk_index) + FROM + new + WHERE + new.document_id = chunks.document_id + GROUP BY + new.document_id) + AND chunks.document_id = ANY ( + SELECT + document_id + FROM + new)) + INSERT INTO %s (document_id, chunk_index, chunk) +SELECT + new.document_id, + (chunk_t).chunk_index, + (chunk_t).chunk +FROM + new + LEFT OUTER JOIN %s chunks ON chunks.document_id = new.document_id + AND chunks.chunk_index = (chunk_t).chunk_index +WHERE (chunk_t).chunk <> COALESCE(chunks.chunk, '') +ON CONFLICT (document_id, chunk_index) + DO UPDATE SET + chunk = EXCLUDED.chunk +RETURNING + id; +"#; + +// Tag: CRITICAL_QUERY +// Checked: True +// Trigger: Runs whenver a pipeline is syncing documents and the key does not have a splitter +// Required indexes: +// - documents table | "documents_pkey" PRIMARY KEY, btree (id) +// - chunks table | "{key}_pipeline_chunk_document_id_index" btree (document_id) +// Used to generate chunks for a specific documents without a splitter +// This query just copies the document key into the chunk +pub const GENERATE_CHUNKS_FOR_DOCUMENT_IDS: &str = r#" INSERT INTO %s( - document_id, chunk_index, chunk -) + document_id, chunk_index, chunk +) SELECT - document_id, - (chunk).chunk_index, - (chunk).chunk -FROM - ( - select - id AS document_id, - pgml.chunk( - (SELECT name FROM splitter), - text, - (SELECT parameters FROM splitter) - ) AS chunk - FROM - ( - SELECT - id, - %d as text - FROM - %s - WHERE - id NOT IN ( - SELECT - document_id - FROM - %s - ) - ) AS documents - ) chunks -ON CONFLICT (document_id, chunk_index) DO NOTHING + documents.id, + 1, + %d +FROM %s documents +LEFT OUTER JOIN %s chunks ON chunks.document_id = documents.id +WHERE documents.%d <> COALESCE(chunks.chunk, '') + AND documents.id = ANY($1) +ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk +RETURNING id "#; -pub const GENERATE_CHUNKS_FOR_DOCUMENT_IDS: &str = r#" -WITH splitter as ( +// Tag: CRITICAL_QUERY +// Checked: False +// Used to generate chunks for an entire collection with a splitter +pub const GENERATE_CHUNKS_WITH_SPLITTER: &str = r#" +WITH splitter AS ( SELECT - name, - parameters + name, + parameters FROM - pgml.splitters + pgml.splitters WHERE - id = $1 -), new as ( - SELECT - document_id, - (chunk).chunk_index, - (chunk).chunk - FROM - ( - SELECT - id AS document_id, - pgml.chunk( - (SELECT name FROM splitter), - %d, - (SELECT parameters FROM splitter) - ) AS chunk - FROM - %s WHERE id = ANY($2) - ) chunks -), ins as ( - INSERT INTO %s( + id = $1 +), +new AS ( + SELECT + documents.id AS document_id, + pgml.chunk (( + SELECT + name + FROM splitter), %d, ( + SELECT + parameters + FROM splitter)) AS chunk_t +FROM + %s AS documents +), +del AS ( + DELETE FROM %s chunks + WHERE chunk_index > ( + SELECT + MAX((chunk_t).chunk_index) + FROM + new + WHERE + new.document_id = chunks.document_id + GROUP BY + new.document_id) + AND chunks.document_id = ANY ( + SELECT + document_id + FROM + new)) +INSERT INTO %s (document_id, 
chunk_index, chunk) +SELECT + new.document_id, + (chunk_t).chunk_index, + (chunk_t).chunk +FROM + new +ON CONFLICT (document_id, chunk_index) + DO UPDATE SET + chunk = EXCLUDED.chunk; +"#; + +// Tag: CRITICAL_QUERY +// Trigger: Runs whenever a pipeline is resyncing +// Required indexes: None +// Checked: True +// Used to generate chunks for an entire collection +pub const GENERATE_CHUNKS: &str = r#" +INSERT INTO %s ( document_id, chunk_index, chunk - ) SELECT * FROM new - WHERE new.chunk <> COALESCE((SELECT chunk FROM %s chunks WHERE chunks.document_id = new.document_id AND chunks.chunk_index = new.chunk_index), '') - ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk - RETURNING id -), del as ( - DELETE FROM %s chunks WHERE chunk_index < (SELECT MAX(new.chunk_index) FROM new WHERE new.document_id = chunks.document_id GROUP BY new.document_id) -) SELECT id FROM ins; +) +SELECT + id, + 1, + %d +FROM %s +ON CONFLICT (document_id, chunk_index) DO UPDATE SET chunk = EXCLUDED.chunk +RETURNING id "#; diff --git a/pgml-sdks/pgml/src/remote_embeddings.rs b/pgml-sdks/pgml/src/remote_embeddings.rs index 36e661f9a..7b19e7366 100644 --- a/pgml-sdks/pgml/src/remote_embeddings.rs +++ b/pgml-sdks/pgml/src/remote_embeddings.rs @@ -82,7 +82,7 @@ pub trait RemoteEmbeddings<'a> { match &mut db_executor { PoolOrArcMutextTransaction::Pool(pool) => query.fetch_all(&*pool).await, PoolOrArcMutextTransaction::ArcMutextTransaction(transaction) => { - query.fetch_all(&mut *transaction.lock().await).await + query.fetch_all(&mut **transaction.lock().await).await } } .map_err(|e| anyhow::anyhow!(e)) @@ -162,11 +162,10 @@ pub trait RemoteEmbeddings<'a> { query = query.bind(retrieved_chunk_ids[i]).bind(&embeddings[i]); } - // query.execute(&mut *transaction.lock().await).await?; match &mut db_executor { PoolOrArcMutextTransaction::Pool(pool) => query.execute(&*pool).await, PoolOrArcMutextTransaction::ArcMutextTransaction(transaction) => { - query.execute(&mut *transaction.lock().await).await + query.execute(&mut **transaction.lock().await).await } }?; diff --git a/pgml-sdks/pgml/src/search_query_builder.rs b/pgml-sdks/pgml/src/search_query_builder.rs index 7da69c311..683b27983 100644 --- a/pgml-sdks/pgml/src/search_query_builder.rs +++ b/pgml-sdks/pgml/src/search_query_builder.rs @@ -4,12 +4,13 @@ use std::collections::HashMap; use sea_query::{ Alias, CommonTableExpression, Expr, Func, JoinType, Order, PostgresQueryBuilder, Query, - QueryStatementWriter, SimpleExpr, WithClause, + SimpleExpr, WithClause, }; use sea_query_binder::{SqlxBinder, SqlxValues}; use crate::{ collection::Collection, + debug_sea_query, filter_builder::FilterBuilder, model::ModelRuntime, models, @@ -55,15 +56,13 @@ pub async fn build_search_query( query: Json, pipeline: &Pipeline, ) -> anyhow::Result<(String, SqlxValues)> { - let valid_query: ValidQuery = serde_json::from_value(query.0)?; + let valid_query: ValidQuery = serde_json::from_value(query.0.clone())?; let limit = valid_query.limit.unwrap_or(10); let pipeline_table = format!("{}.pipelines", collection.name); let documents_table = format!("{}.documents", collection.name); - let mut query = Query::select(); let mut score_table_names = Vec::new(); - // let mut with_clause = WithClause::new().recursive(true).to_owned(); let mut with_clause = WithClause::new(); let mut sum_expression: Option = None; @@ -387,8 +386,9 @@ pub async fn build_search_query( .into_iter() .map(|t| Expr::col((SIden::String(t), SIden::Str("document_id"))).into()) .collect(); + let mut main_query = 
Query::select(); for i in 1..score_table_names_e.len() { - query.full_outer_join( + main_query.full_outer_join( SIden::String(score_table_names[i].to_string()), Expr::col(( SIden::String(score_table_names[i].to_string()), @@ -401,7 +401,7 @@ pub async fn build_search_query( let sum_expression = sum_expression .context("query requires some scoring through full_text_search or semantic_search")?; - query + main_query .expr(Expr::cust_with_expr( "DISTINCT ON ($1) $1 as id", id_select_expression.clone(), @@ -424,30 +424,46 @@ pub async fn build_search_query( let mut re_ordered_query = Query::select(); re_ordered_query .expr(Expr::cust("*")) - .from_subquery(query, Alias::new("q1")) + .from_subquery(main_query, Alias::new("q1")) .order_by(SIden::Str("score"), Order::Desc) .limit(limit); - let mut combined_query = Query::select(); - combined_query - .expr(Expr::cust("json_array_elements(json_agg(q2))")) - .from_subquery(re_ordered_query, Alias::new("q2")); - combined_query + let mut re_ordered_query = CommonTableExpression::from_select(re_ordered_query); + re_ordered_query.table_name(Alias::new("main")); + with_clause.cte(re_ordered_query); + + // Insert into searchs table + let searches_table = format!("{}_{}.searches", collection.name, pipeline.name); + let searches_insert_query = Query::insert() + .into_table(searches_table.to_table_tuple()) + .columns([SIden::Str("query")]) + .values([query.0.into()])? + .returning_col(SIden::Str("id")) + .to_owned(); + let mut searches_insert_query = CommonTableExpression::new() + .query(searches_insert_query) + .to_owned(); + searches_insert_query.table_name(Alias::new("searches_insert")); + with_clause.cte(searches_insert_query); + + Query::select() + .expr(Expr::cust("json_array_elements(json_agg(main.*))")) + .from(SIden::Str("main")) + .to_owned() + + // let mut combined_query = Query::select(); + // combined_query + // .expr(Expr::cust("json_array_elements(json_agg(q2))")) + // .from_subquery(re_ordered_query, Alias::new("q2")); + // combined_query } else { // TODO: Maybe let users filter documents only here? 
anyhow::bail!("If you are only looking to filter documents checkout the `get_documents` method on the Collection") }; - // TODO: Remove this - let query_string = query - .clone() - .with(with_clause.clone()) - .to_string(PostgresQueryBuilder); - let query_string = query_string.replace("WITH ", "WITH RECURSIVE "); - println!("\nTHE QUERY: \n{query_string}\n"); - // For whatever reason, sea query does not like ctes if the cte is recursive let (sql, values) = query.with(with_clause).build_sqlx(PostgresQueryBuilder); let sql = sql.replace("WITH ", "WITH RECURSIVE "); + debug_sea_query!(DOCUMENT_SEARCH, sql, values); Ok((sql, values)) } diff --git a/pgml-sdks/pgml/src/transformer_pipeline.rs b/pgml-sdks/pgml/src/transformer_pipeline.rs index 00dd556f7..d20089463 100644 --- a/pgml-sdks/pgml/src/transformer_pipeline.rs +++ b/pgml-sdks/pgml/src/transformer_pipeline.rs @@ -74,7 +74,7 @@ impl Stream for TransformerStream { let s: *mut Self = s; let s = Box::leak(Box::from_raw(s)); s.future = Some(Box::pin( - sqlx::query(&s.query).fetch_all(s.transaction.as_mut().unwrap()), + sqlx::query(&s.query).fetch_all(&mut **s.transaction.as_mut().unwrap()), )); } } @@ -94,7 +94,7 @@ impl Stream for TransformerStream { let s: *mut Self = s; let s = Box::leak(Box::from_raw(s)); s.future = Some(Box::pin( - sqlx::query(&s.query).fetch_all(s.transaction.as_mut().unwrap()), + sqlx::query(&s.query).fetch_all(&mut **s.transaction.as_mut().unwrap()), )); } } diff --git a/pgml-sdks/pgml/src/utils.rs b/pgml-sdks/pgml/src/utils.rs index 08e9e120c..843a9151a 100644 --- a/pgml-sdks/pgml/src/utils.rs +++ b/pgml-sdks/pgml/src/utils.rs @@ -29,6 +29,43 @@ macro_rules! query_builder { }}; } +/// Used to debug sqlx queries +#[macro_export] +macro_rules! debug_sqlx_query { + ($name:expr, $query:expr) => {{ + let name = stringify!($name); + let sql = $query.to_string(); + let sql = sea_query::Query::select().expr(sea_query::Expr::cust(sql)).to_string(sea_query::PostgresQueryBuilder); + let sql = sql.replacen("SELECT", "", 1); + let span = tracing::span!(tracing::Level::DEBUG, "debug_query"); + tracing::event!(parent: &span, tracing::Level::DEBUG, %name, %sql); + }}; + + ($name:expr, $query:expr, $( $x:expr ),*) => {{ + let name = stringify!($name); + let sql = $query.to_string(); + let sql = sea_query::Query::select().expr(sea_query::Expr::cust_with_values(sql, [$( + sea_query::Value::from($x.clone()), + )*])).to_string(sea_query::PostgresQueryBuilder); + let sql = sql.replacen("SELECT", "", 1); + let span = tracing::span!(tracing::Level::DEBUG, "debug_query"); + tracing::event!(parent: &span, tracing::Level::DEBUG, %name, %sql); + }}; +} + +/// Used to debug sea_query queries +#[macro_export] +macro_rules! 
debug_sea_query { + ($name:expr, $query:expr, $values:expr) => {{ + let name = stringify!($name); + let sql = $query.to_string(); + let sql = sea_query::Query::select().expr(sea_query::Expr::cust_with_values(sql, $values.clone().0)).to_string(sea_query::PostgresQueryBuilder); + let sql = sql.replacen("SELECT", "", 1); + let span = tracing::span!(tracing::Level::DEBUG, "debug_query"); + tracing::event!(parent: &span, tracing::Level::DEBUG, %name, %sql); + }}; +} + pub fn default_progress_bar(size: u64) -> ProgressBar { ProgressBar::new(size).with_style( ProgressStyle::with_template("[{elapsed_precise}] {bar:40.cyan/blue} {pos:>7}/{len:7} ") diff --git a/pgml-sdks/pgml/src/vector_search_query_builder.rs b/pgml-sdks/pgml/src/vector_search_query_builder.rs index 2af42b9bc..f2869e762 100644 --- a/pgml-sdks/pgml/src/vector_search_query_builder.rs +++ b/pgml-sdks/pgml/src/vector_search_query_builder.rs @@ -4,12 +4,13 @@ use std::collections::HashMap; use sea_query::{ Alias, CommonTableExpression, Expr, Func, JoinType, Order, PostgresQueryBuilder, Query, - QueryStatementWriter, WithClause, + WithClause, }; use sea_query_binder::{SqlxBinder, SqlxValues}; use crate::{ collection::Collection, + debug_sea_query, filter_builder::FilterBuilder, model::ModelRuntime, models, @@ -232,13 +233,11 @@ pub async fn build_vector_search_query( .order_by(SIden::Str("score"), Order::Desc) .limit(limit); - // TODO: Remove this - let query_string = query - .clone() - .with(with_clause.clone()) - .to_string(PostgresQueryBuilder); - println!("\nTHE QUERY: \n{query_string}\n"); - let (sql, values) = query.with(with_clause).build_sqlx(PostgresQueryBuilder); + + // Tag: CRITICAL_QUERY + // Checked: FALSE + // Used to do vector search + debug_sea_query!(VECTOR_SEARCH, sql, values); Ok((sql, values)) } From d745fc67825cafe9e23591d702ecdc0efffbf25c Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Fri, 9 Feb 2024 14:07:02 -0800 Subject: [PATCH 23/72] Logging search results done --- pgml-sdks/pgml/src/lib.rs | 109 ++++++++++++--------- pgml-sdks/pgml/src/pipeline.rs | 3 +- pgml-sdks/pgml/src/queries.rs | 2 + pgml-sdks/pgml/src/search_query_builder.rs | 70 ++++++++----- 4 files changed, 111 insertions(+), 73 deletions(-) diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index ecc8a271c..f628c2d09 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -822,9 +822,9 @@ mod tests { #[tokio::test] async fn can_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cswle_84"; + let collection_name = "test_r_c_cswle_102"; let mut collection = Collection::new(collection_name, None); - let documents = generate_dummy_documents(11); + let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; let pipeline_name = "test_r_p_cswle_9"; let mut pipeline = Pipeline::new( @@ -866,49 +866,46 @@ mod tests { ), )?; collection.add_pipeline(&mut pipeline).await?; - let results = collection - .search( - json!({ - "query": { - "full_text_search": { - "title": { - "query": "test 9", - "boost": 4.0 - }, - "body": { - "query": "Test", - "boost": 1.2 - } - }, - "semantic_search": { - "title": { - "query": "This is a test", - "boost": 2.0 - }, - "body": { - "query": "This is the body test", - "parameters": { - "instruction": "Represent the Wikipedia question for retrieving supporting documents: ", - }, - "boost": 1.01 - }, - "notes": { - "query": "This is 
the notes test", - "boost": 1.01 - } + let query = json!({ + "query": { + "full_text_search": { + "title": { + "query": "test 9", + "boost": 4.0 + }, + "body": { + "query": "Test", + "boost": 1.2 + } + }, + "semantic_search": { + "title": { + "query": "This is a test", + "boost": 2.0 + }, + "body": { + "query": "This is the body test", + "parameters": { + "instruction": "Represent the Wikipedia question for retrieving supporting documents: ", }, - "filter": { - "id": { - "$gt": 1 - } - } - + "boost": 1.01 }, - "limit": 5 - }) - .into(), - &mut pipeline, - ) + "notes": { + "query": "This is the notes test", + "boost": 1.01 + } + }, + "filter": { + "id": { + "$gt": 1 + } + } + + }, + "limit": 5 + }); + let results = collection + .search(query.clone().into(), &mut pipeline) .await?; let ids: Vec = results .into_iter() @@ -916,7 +913,31 @@ mod tests { .collect(); assert_eq!(ids, vec![9, 2, 7, 8, 3]); - // Do some checks on the search results tables + let pool = get_or_initialize_pool(&None).await?; + + let searches_table = format!("{}_{}.searches", collection_name, pipeline_name); + let searches: Vec<(i64, serde_json::Value)> = + sqlx::query_as(&query_builder!("SELECT id, query FROM %s", searches_table)) + .fetch_all(&pool) + .await?; + assert!(searches.len() == 1); + assert!(searches[0].0 == 1); + assert!(searches[0].1 == query); + + let search_results_table = format!("{}_{}.search_results", collection_name, pipeline_name); + let search_results: Vec<(i64, i64, i64, serde_json::Value, i64)> = + sqlx::query_as(&query_builder!( + "SELECT id, search_id, document_id, scores, rank FROM %s ORDER BY rank ASC", + search_results_table + )) + .fetch_all(&pool) + .await?; + assert!(search_results.len() == 5); + // Document ids are 1 based in the db not 0 based like they are here + assert_eq!( + search_results.iter().map(|sr| sr.2).collect::>(), + vec![10, 3, 7, 8, 4] + ); collection.archive().await?; Ok(()) diff --git a/pgml-sdks/pgml/src/pipeline.rs b/pgml-sdks/pgml/src/pipeline.rs index 61b9f04cf..2192e9163 100644 --- a/pgml-sdks/pgml/src/pipeline.rs +++ b/pgml-sdks/pgml/src/pipeline.rs @@ -908,8 +908,7 @@ impl Pipeline { let query = query_builder!( queries::GENERATE_EMBEDDINGS, embeddings_table_name, - chunks_table_name, - embeddings_table_name + chunks_table_name ); debug_sqlx_query!(GENERATE_EMBEDDINGS, query, model.name, parameters.0); sqlx::query(&query) diff --git a/pgml-sdks/pgml/src/queries.rs b/pgml-sdks/pgml/src/queries.rs index 18342ce10..97e5aa244 100644 --- a/pgml-sdks/pgml/src/queries.rs +++ b/pgml-sdks/pgml/src/queries.rs @@ -80,6 +80,7 @@ CREATE TABLE IF NOT EXISTS %s ( pub const CREATE_PIPELINES_SEARCHES_TABLE: &str = r#" CREATE TABLE IF NOT EXISTS %s ( id serial8 PRIMARY KEY, + created_at timestamp NOT NULL DEFAULT now(), query jsonb ); "#; @@ -97,6 +98,7 @@ CREATE TABLE IF NOT EXISTS %s ( pub const CREATE_PIPELINES_SEARCH_EVENTS_TABLE: &str = r#" CREATE TABLE IF NOT EXISTS %s ( id serial8 PRIMARY KEY, + created_at timestamp NOT NULL DEFAULT now(), search_result int8 NOT NULL REFERENCES %s ON DELETE CASCADE, event jsonb NOT NULL ); diff --git a/pgml-sdks/pgml/src/search_query_builder.rs b/pgml-sdks/pgml/src/search_query_builder.rs index 683b27983..7516bc1c8 100644 --- a/pgml-sdks/pgml/src/search_query_builder.rs +++ b/pgml-sdks/pgml/src/search_query_builder.rs @@ -100,6 +100,7 @@ pub async fn build_search_query( // Build the CTE we actually use later let embeddings_table = format!("{}_{}.{}_embeddings", collection.name, pipeline.name, key); let cte_name = 
format!("{key}_embedding_score"); + let boost = vsa.boost.unwrap_or(1.); let mut score_cte_non_recursive = Query::select(); let mut score_cte_recurisive = Query::select(); match model_runtime { @@ -125,7 +126,7 @@ pub async fn build_search_query( .column((SIden::Str("embeddings"), SIden::Str("document_id"))) .expr(Expr::cust(r#"ARRAY[embeddings.document_id] as previous_document_ids"#)) .expr(Expr::cust(format!( - r#"(embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector) AS score"# + r#"(1 - (embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector)) * {boost} AS score"# ))) .order_by_expr(Expr::cust(format!( r#"embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector"# @@ -142,7 +143,7 @@ pub async fn build_search_query( .column((SIden::Str("embeddings"), SIden::Str("document_id"))) .expr(Expr::cust(format!(r#""{cte_name}".previous_document_ids || embeddings.document_id"#))) .expr(Expr::cust(format!( - r#"(embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector) AS score"# + r#"(1 - (embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector)) * {boost} AS score"# ))) .and_where(Expr::cust(format!(r#"NOT embeddings.document_id = ANY("{cte_name}".previous_document_ids)"#))) .order_by_expr(Expr::cust(format!( @@ -181,7 +182,7 @@ pub async fn build_search_query( "ARRAY[embeddings.document_id] as previous_document_ids", )) .expr(Expr::cust_with_values( - "embeddings.embedding <=> $1::vector AS score", + "(1 - (embeddings.embedding <=> $1::vector)) * {boost} AS score", [embedding.clone()], )) .order_by_expr( @@ -205,7 +206,7 @@ pub async fn build_search_query( r#""{cte_name}".previous_document_ids || embeddings.document_id"# ))) .expr(Expr::cust_with_values( - "embeddings.embedding <=> $1::vector AS score", + "(1 - (embeddings.embedding <=> $1::vector)) * {boost} AS score", [embedding.clone()], )) .and_where(Expr::cust(format!( @@ -253,21 +254,17 @@ pub async fn build_search_query( with_clause.cte(score_cte); // Add to the sum expression - let boost = vsa.boost.unwrap_or(1.); sum_expression = if let Some(expr) = sum_expression { - Some(expr.add(Expr::cust(format!( - r#"COALESCE((1 - "{cte_name}".score) * {boost}, 0.0)"# - )))) + Some(expr.add(Expr::cust(format!(r#"COALESCE("{cte_name}".score, 0.0)"#)))) } else { - Some(Expr::cust(format!( - r#"COALESCE((1 - "{cte_name}".score) * {boost}, 0.0)"# - ))) + Some(Expr::cust(format!(r#"COALESCE("{cte_name}".score, 0.0)"#))) }; score_table_names.push(cte_name); } for (key, vma) in valid_query.query.full_text_search.unwrap_or_default() { let full_text_table = format!("{}_{}.{}_tsvectors", collection.name, pipeline.name, key); + let boost = vma.boost.unwrap_or(1.0); // Build the score CTE let cte_name = format!("{key}_tsvectors_score"); @@ -277,7 +274,7 @@ pub async fn build_search_query( .expr_as( Expr::cust_with_values( format!( - r#"ts_rank(tsvectors.ts, plainto_tsquery((SELECT oid FROM pg_ts_config WHERE cfgname = (SELECT schema #>> '{{{key},full_text_search,configuration}}' FROM pipeline)), $1), 32)"#, + r#"ts_rank(tsvectors.ts, plainto_tsquery((SELECT oid FROM pg_ts_config WHERE cfgname = (SELECT schema #>> '{{{key},full_text_search,configuration}}' FROM pipeline)), $1), 32) * {boost}"#, ), [&vma.query], ), @@ -305,7 +302,7 @@ pub async fn build_search_query( .expr_as( Expr::cust_with_values( format!( - r#"ts_rank(tsvectors.ts, plainto_tsquery((SELECT oid FROM pg_ts_config WHERE cfgname = (SELECT schema #>> '{{{key},full_text_search,configuration}}' FROM 
pipeline)), $1), 32)"#, + r#"ts_rank(tsvectors.ts, plainto_tsquery((SELECT oid FROM pg_ts_config WHERE cfgname = (SELECT schema #>> '{{{key},full_text_search,configuration}}' FROM pipeline)), $1), 32) * {boost}"#, ), [&vma.query], ), @@ -367,15 +364,10 @@ pub async fn build_search_query( with_clause.cte(score_cte); // Add to the sum expression - let boost = vma.boost.unwrap_or(1.0); sum_expression = if let Some(expr) = sum_expression { - Some(expr.add(Expr::cust(format!( - r#"COALESCE("{cte_name}".score * {boost}, 0.0)"# - )))) + Some(expr.add(Expr::cust(format!(r#"COALESCE("{cte_name}".score, 0.0)"#)))) } else { - Some(Expr::cust(format!( - r#"COALESCE("{cte_name}".score * {boost}, 0.0)"# - ))) + Some(Expr::cust(format!(r#"COALESCE("{cte_name}".score, 0.0)"#))) }; score_table_names.push(cte_name); } @@ -432,7 +424,7 @@ pub async fn build_search_query( re_ordered_query.table_name(Alias::new("main")); with_clause.cte(re_ordered_query); - // Insert into searchs table + // Insert into searches table let searches_table = format!("{}_{}.searches", collection.name, pipeline.name); let searches_insert_query = Query::insert() .into_table(searches_table.to_table_tuple()) @@ -446,16 +438,40 @@ pub async fn build_search_query( searches_insert_query.table_name(Alias::new("searches_insert")); with_clause.cte(searches_insert_query); + // Insert into search_results table + let search_results_table = format!("{}_{}.search_results", collection.name, pipeline.name); + let jsonb_builder = score_table_names.iter().fold(String::new(), |acc, t| { + format!("{acc}, '{t}', (SELECT score FROM {t} WHERE document_id = main.id)") + }); + let jsonb_builder = format!("JSONB_BUILD_OBJECT('total', score{jsonb_builder})"); + let search_results_insert_query = Query::insert() + .into_table(search_results_table.to_table_tuple()) + .columns([ + SIden::Str("search_id"), + SIden::Str("document_id"), + SIden::Str("scores"), + SIden::Str("rank"), + ]) + .select_from( + Query::select() + .expr(Expr::cust("(SELECT id FROM searches_insert)")) + .column(SIden::Str("id")) + .expr(Expr::cust(jsonb_builder)) + .expr(Expr::cust("row_number() over()")) + .from(SIden::Str("main")) + .to_owned(), + )? + .to_owned(); + let mut search_results_insert_query = CommonTableExpression::new() + .query(search_results_insert_query) + .to_owned(); + search_results_insert_query.table_name(Alias::new("search_results")); + with_clause.cte(search_results_insert_query); + Query::select() .expr(Expr::cust("json_array_elements(json_agg(main.*))")) .from(SIden::Str("main")) .to_owned() - - // let mut combined_query = Query::select(); - // combined_query - // .expr(Expr::cust("json_array_elements(json_agg(q2))")) - // .from_subquery(re_ordered_query, Alias::new("q2")); - // combined_query } else { // TODO: Maybe let users filter documents only here? 
anyhow::bail!("If you are only looking to filter documents checkout the `get_documents` method on the Collection") From 2d75d98aa56295f5c56aba1b1d8938bc8baf893a Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Fri, 9 Feb 2024 15:16:11 -0800 Subject: [PATCH 24/72] Correct return type with search inserts --- pgml-sdks/pgml/src/collection.rs | 30 ++++++++-------------- pgml-sdks/pgml/src/lib.rs | 20 +++++++++------ pgml-sdks/pgml/src/search_query_builder.rs | 8 +++--- 3 files changed, 28 insertions(+), 30 deletions(-) diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index ee30d1be6..ed4ba3636 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -705,19 +705,15 @@ impl Collection { } #[instrument(skip(self))] - pub async fn search( - &mut self, - query: Json, - pipeline: &mut Pipeline, - ) -> anyhow::Result> { + pub async fn search(&mut self, query: Json, pipeline: &mut Pipeline) -> anyhow::Result { let pool = get_or_initialize_pool(&self.database_url).await?; let (built_query, values) = build_search_query(self, query.clone(), pipeline).await?; - let results: Result, _> = sqlx::query_as_with(&built_query, values) - .fetch_all(&pool) + let results: Result<(Json,), _> = sqlx::query_as_with(&built_query, values) + .fetch_one(&pool) .await; match results { - Ok(r) => Ok(r.into_iter().map(|r| r.0).collect()), + Ok(r) => Ok(r.0), Err(e) => match e.as_database_error() { Some(d) => { if d.code() == Some(Cow::from("XX000")) { @@ -731,10 +727,10 @@ impl Collection { pipeline.verify_in_database(false).await?; let (built_query, values) = build_search_query(self, query, pipeline).await?; - let results: Vec<(Json,)> = sqlx::query_as_with(&built_query, values) - .fetch_all(&pool) + let results: (Json,) = sqlx::query_as_with(&built_query, values) + .fetch_one(&pool) .await?; - Ok(results.into_iter().map(|r| r.0).collect()) + Ok(results.0) } else { Err(anyhow::anyhow!(e)) } @@ -745,17 +741,13 @@ impl Collection { } #[instrument(skip(self))] - pub async fn search_local( - &self, - query: Json, - pipeline: &Pipeline, - ) -> anyhow::Result> { + pub async fn search_local(&self, query: Json, pipeline: &Pipeline) -> anyhow::Result { let pool = get_or_initialize_pool(&self.database_url).await?; let (built_query, values) = build_search_query(self, query.clone(), pipeline).await?; - let results: Vec<(Json,)> = sqlx::query_as_with(&built_query, values) - .fetch_all(&pool) + let results: (Json,) = sqlx::query_as_with(&built_query, values) + .fetch_one(&pool) .await?; - Ok(results.into_iter().map(|v| v.0).collect()) + Ok(results.0) } /// Performs vector search on the [Collection] /// diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index f628c2d09..c6143411e 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -822,7 +822,7 @@ mod tests { #[tokio::test] async fn can_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cswle_102"; + let collection_name = "test_r_c_cswle_112"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; @@ -907,8 +907,10 @@ mod tests { let results = collection .search(query.clone().into(), &mut pipeline) .await?; - let ids: Vec = results - .into_iter() + let ids: Vec = results["results"] + .as_array() + .unwrap() + .iter() .map(|r| 
r["document"]["id"].as_u64().unwrap()) .collect(); assert_eq!(ids, vec![9, 2, 7, 8, 3]); @@ -921,11 +923,11 @@ mod tests { .fetch_all(&pool) .await?; assert!(searches.len() == 1); - assert!(searches[0].0 == 1); + assert!(searches[0].0 == results["search_id"].as_i64().unwrap()); assert!(searches[0].1 == query); let search_results_table = format!("{}_{}.search_results", collection_name, pipeline_name); - let search_results: Vec<(i64, i64, i64, serde_json::Value, i64)> = + let search_results: Vec<(i64, i64, i64, serde_json::Value, i32)> = sqlx::query_as(&query_builder!( "SELECT id, search_id, document_id, scores, rank FROM %s ORDER BY rank ASC", search_results_table @@ -936,7 +938,7 @@ mod tests { // Document ids are 1 based in the db not 0 based like they are here assert_eq!( search_results.iter().map(|sr| sr.2).collect::>(), - vec![10, 3, 7, 8, 4] + vec![10, 3, 8, 9, 4] ); collection.archive().await?; @@ -1010,8 +1012,10 @@ mod tests { &mut pipeline, ) .await?; - let ids: Vec = results - .into_iter() + let ids: Vec = results["results"] + .as_array() + .unwrap() + .iter() .map(|r| r["document"]["id"].as_u64().unwrap()) .collect(); assert_eq!(ids, vec![2, 3, 7, 4, 8]); diff --git a/pgml-sdks/pgml/src/search_query_builder.rs b/pgml-sdks/pgml/src/search_query_builder.rs index 7516bc1c8..7e91c3ba4 100644 --- a/pgml-sdks/pgml/src/search_query_builder.rs +++ b/pgml-sdks/pgml/src/search_query_builder.rs @@ -465,11 +465,13 @@ pub async fn build_search_query( let mut search_results_insert_query = CommonTableExpression::new() .query(search_results_insert_query) .to_owned(); - search_results_insert_query.table_name(Alias::new("search_results")); + search_results_insert_query.table_name(Alias::new("search_results_insert")); with_clause.cte(search_results_insert_query); Query::select() - .expr(Expr::cust("json_array_elements(json_agg(main.*))")) + .expr(Expr::cust( + "JSONB_BUILD_OBJECT('search_id', (SELECT id FROM searches_insert), 'results', JSON_AGG(main.*))", + )) .from(SIden::Str("main")) .to_owned() } else { @@ -477,7 +479,7 @@ pub async fn build_search_query( anyhow::bail!("If you are only looking to filter documents checkout the `get_documents` method on the Collection") }; - // For whatever reason, sea query does not like ctes if the cte is recursive + // For whatever reason, sea query does not like multiple ctes if the cte is recursive let (sql, values) = query.with(with_clause).build_sqlx(PostgresQueryBuilder); let sql = sql.replace("WITH ", "WITH RECURSIVE "); debug_sea_query!(DOCUMENT_SEARCH, sql, values); From bed7144b04aedad98e990db457ffd8e7d360f0db Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Fri, 9 Feb 2024 15:31:42 -0800 Subject: [PATCH 25/72] Updated tests to pass with new sqlx version --- pgml-sdks/pgml/src/filter_builder.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pgml-sdks/pgml/src/filter_builder.rs b/pgml-sdks/pgml/src/filter_builder.rs index 93b053897..947f04bfc 100644 --- a/pgml-sdks/pgml/src/filter_builder.rs +++ b/pgml-sdks/pgml/src/filter_builder.rs @@ -220,7 +220,7 @@ mod tests { .to_valid_sql_query(); assert_eq!( sql, - r#"SELECT "id" FROM "test_table" WHERE "test_table"."metadata" @> E'{\"id\":1}' AND "test_table"."metadata" @> E'{\"id2\":{\"id3\":\"test\"}}' AND "test_table"."metadata" @> E'{\"id4\":{\"id5\":{\"id6\":true}}}' AND "test_table"."metadata" @> E'{\"id7\":{\"id8\":{\"id9\":{\"id10\":[1,2,3]}}}}'"# + r#"SELECT "id" FROM "test_table" WHERE 
("test_table"."metadata") @> E'{\"id\":1}' AND ("test_table"."metadata") @> E'{\"id2\":{\"id3\":\"test\"}}' AND ("test_table"."metadata") @> E'{\"id4\":{\"id5\":{\"id6\":true}}}' AND ("test_table"."metadata") @> E'{\"id7\":{\"id8\":{\"id9\":{\"id10\":[1,2,3]}}}}'"# ); Ok(()) } @@ -237,7 +237,7 @@ mod tests { .to_valid_sql_query(); assert_eq!( sql, - r#"SELECT "id" FROM "test_table" WHERE NOT "test_table"."metadata" @> E'{\"id\":1}' AND NOT "test_table"."metadata" @> E'{\"id2\":{\"id3\":\"test\"}}' AND NOT "test_table"."metadata" @> E'{\"id4\":{\"id5\":{\"id6\":true}}}' AND NOT "test_table"."metadata" @> E'{\"id7\":{\"id8\":{\"id9\":{\"id10\":[1,2,3]}}}}'"# + r#"SELECT "id" FROM "test_table" WHERE (NOT ("test_table"."metadata") @> E'{\"id\":1}') AND (NOT ("test_table"."metadata") @> E'{\"id2\":{\"id3\":\"test\"}}') AND (NOT ("test_table"."metadata") @> E'{\"id4\":{\"id5\":{\"id6\":true}}}') AND (NOT ("test_table"."metadata") @> E'{\"id7\":{\"id8\":{\"id9\":{\"id10\":[1,2,3]}}}}')"# ); Ok(()) } @@ -260,7 +260,7 @@ mod tests { assert_eq!( sql, format!( - r##"SELECT "id" FROM "test_table" WHERE "test_table"."metadata"#>'{{id}}' {} '1' AND "test_table"."metadata"#>'{{id2,id3}}' {} '1'"##, + r##"SELECT "id" FROM "test_table" WHERE ("test_table"."metadata"#>'{{id}}') {} '1' AND ("test_table"."metadata"#>'{{id2,id3}}') {} '1'"##, operator, operator ) ); @@ -285,7 +285,7 @@ mod tests { assert_eq!( sql, format!( - r##"SELECT "id" FROM "test_table" WHERE "test_table"."metadata"#>'{{id}}' {} ('1') AND "test_table"."metadata"#>'{{id2,id3}}' {} ('1')"##, + r##"SELECT "id" FROM "test_table" WHERE ("test_table"."metadata"#>'{{id}}') {} ('1') AND ("test_table"."metadata"#>'{{id2,id3}}') {} ('1')"##, operator, operator ) ); @@ -305,7 +305,7 @@ mod tests { .to_valid_sql_query(); assert_eq!( sql, - r#"SELECT "id" FROM "test_table" WHERE "test_table"."metadata" @> E'{\"id\":1}' AND "test_table"."metadata" @> E'{\"id2\":{\"id3\":1}}'"# + r#"SELECT "id" FROM "test_table" WHERE ("test_table"."metadata") @> E'{\"id\":1}' AND ("test_table"."metadata") @> E'{\"id2\":{\"id3\":1}}'"# ); Ok(()) } @@ -322,7 +322,7 @@ mod tests { .to_valid_sql_query(); assert_eq!( sql, - r#"SELECT "id" FROM "test_table" WHERE "test_table"."metadata" @> E'{\"id\":1}' OR "test_table"."metadata" @> E'{\"id2\":{\"id3\":1}}'"# + r#"SELECT "id" FROM "test_table" WHERE ("test_table"."metadata") @> E'{\"id\":1}' OR ("test_table"."metadata") @> E'{\"id2\":{\"id3\":1}}'"# ); Ok(()) } @@ -339,13 +339,13 @@ mod tests { .to_valid_sql_query(); assert_eq!( sql, - r#"SELECT "id" FROM "test_table" WHERE NOT ("test_table"."metadata" @> E'{\"id\":1}' AND "test_table"."metadata" @> E'{\"id2\":{\"id3\":1}}')"# + r#"SELECT "id" FROM "test_table" WHERE NOT (("test_table"."metadata") @> E'{\"id\":1}' AND ("test_table"."metadata") @> E'{\"id2\":{\"id3\":1}}')"# ); Ok(()) } #[test] - fn random_difficult_tests() -> anyhow::Result<()> { + fn filter_builder_random_difficult_tests() -> anyhow::Result<()> { let sql = construct_filter_builder_with_json(json!({ "$and": [ {"$or": [ @@ -360,7 +360,7 @@ mod tests { .to_valid_sql_query(); assert_eq!( sql, - r#"SELECT "id" FROM "test_table" WHERE ("test_table"."metadata" @> E'{\"id\":1}' OR "test_table"."metadata" @> E'{\"id2\":{\"id3\":1}}') AND "test_table"."metadata" @> E'{\"id4\":1}'"# + r#"SELECT "id" FROM "test_table" WHERE (("test_table"."metadata") @> E'{\"id\":1}' OR ("test_table"."metadata") @> E'{\"id2\":{\"id3\":1}}') AND ("test_table"."metadata") @> E'{\"id4\":1}'"# ); let sql = 
construct_filter_builder_with_json(json!({ "$or": [ @@ -376,7 +376,7 @@ mod tests { .to_valid_sql_query(); assert_eq!( sql, - r#"SELECT "id" FROM "test_table" WHERE ("test_table"."metadata" @> E'{\"id\":1}' AND "test_table"."metadata" @> E'{\"id2\":{\"id3\":1}}') OR "test_table"."metadata" @> E'{\"id4\":1}'"# + r#"SELECT "id" FROM "test_table" WHERE (("test_table"."metadata") @> E'{\"id\":1}' AND ("test_table"."metadata") @> E'{\"id2\":{\"id3\":1}}') OR ("test_table"."metadata") @> E'{\"id4\":1}'"# ); let sql = construct_filter_builder_with_json(json!({ "metadata": {"$or": [ @@ -388,7 +388,7 @@ mod tests { .to_valid_sql_query(); assert_eq!( sql, - r#"SELECT "id" FROM "test_table" WHERE "test_table"."metadata" @> E'{\"metadata\":{\"uuid\":\"1\"}}' OR "test_table"."metadata" @> E'{\"metadata\":{\"uuid2\":\"2\"}}'"# + r#"SELECT "id" FROM "test_table" WHERE ("test_table"."metadata") @> E'{\"metadata\":{\"uuid\":\"1\"}}' OR ("test_table"."metadata") @> E'{\"metadata\":{\"uuid2\":\"2\"}}'"# ); Ok(()) } From 0e06ce1f6b099ad749cf049fe236d2e085569f11 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Mon, 12 Feb 2024 12:15:11 -0800 Subject: [PATCH 26/72] Added a way for users to provide search_events --- pgml-sdks/pgml/src/collection.rs | 35 ++++++++++++++++++++++++++++++++ pgml-sdks/pgml/src/lib.rs | 23 +++++++++++++++++++-- pgml-sdks/pgml/src/pipeline.rs | 14 ++++++++++++- pgml-sdks/pgml/src/queries.rs | 14 +++++++++++++ 4 files changed, 83 insertions(+), 3 deletions(-) diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index ed4ba3636..1a04a87b1 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -125,6 +125,7 @@ pub struct Collection { enable_pipeline, disable_pipeline, search, + add_search_event, vector_search, query, exists, @@ -749,6 +750,40 @@ impl Collection { .await?; Ok(results.0) } + + #[instrument(skip(self))] + pub async fn add_search_event( + &self, + search_id: i64, + search_result: i64, + event: Json, + pipeline: &Pipeline, + ) -> anyhow::Result<()> { + let pool = get_or_initialize_pool(&self.database_url).await?; + let search_events_table = format!("{}_{}.search_events", self.name, pipeline.name); + let search_results_table = format!("{}_{}.search_results", self.name, pipeline.name); + + let query = query_builder!( + queries::INSERT_SEARCH_EVENT, + search_events_table, + search_results_table + ); + debug_sqlx_query!( + INSERT_SEARCH_EVENT, + query, + search_id, + search_result, + event.0 + ); + sqlx::query(&query) + .bind(search_id) + .bind(search_result) + .bind(event.0) + .execute(&pool) + .await?; + Ok(()) + } + /// Performs vector search on the [Collection] /// /// # Arguments diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index c6143411e..1bd2470a7 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -466,7 +466,7 @@ mod tests { collection.enable_pipeline(&mut pipeline).await?; let queried_pipeline = &collection.get_pipelines().await?[0]; assert_eq!(pipeline.name, queried_pipeline.name); - // collection.archive().await?; + collection.archive().await?; Ok(()) } @@ -822,7 +822,7 @@ mod tests { #[tokio::test] async fn can_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cswle_112"; + let collection_name = "test_r_c_cswle_117"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(10); 
collection.upsert_documents(documents.clone(), None).await?; @@ -941,6 +941,25 @@ mod tests { vec![10, 3, 8, 9, 4] ); + let event = json!({"clicked": true}); + collection + .add_search_event( + results["search_id"].as_i64().unwrap(), + 2, + event.clone().into(), + &pipeline, + ) + .await?; + let search_events_table = format!("{}_{}.search_events", collection_name, pipeline_name); + let (search_result, retrieved_event): (i64, Json) = sqlx::query_as(&query_builder!( + "SELECT search_result, event FROM %s LIMIT 1", + search_events_table + )) + .fetch_one(&pool) + .await?; + assert_eq!(search_result, 2); + assert_eq!(event, retrieved_event.0); + collection.archive().await?; Ok(()) } diff --git a/pgml-sdks/pgml/src/pipeline.rs b/pgml-sdks/pgml/src/pipeline.rs index 2192e9163..a4bdffbea 100644 --- a/pgml-sdks/pgml/src/pipeline.rs +++ b/pgml-sdks/pgml/src/pipeline.rs @@ -435,6 +435,18 @@ impl Pipeline { .as_str(), ) .await?; + transaction + .execute( + query_builder!( + queries::CREATE_INDEX, + "", + "search_results_search_id_rank_index", + search_results_table_name, + "search_id, rank" + ) + .as_str(), + ) + .await?; let search_events_table_name = format!("{schema}.search_events"); transaction @@ -442,7 +454,7 @@ impl Pipeline { query_builder!( queries::CREATE_PIPELINES_SEARCH_EVENTS_TABLE, search_events_table_name, - &searches_table_name + &search_results_table_name ) .as_str(), ) diff --git a/pgml-sdks/pgml/src/queries.rs b/pgml-sdks/pgml/src/queries.rs index 97e5aa244..b0be88c41 100644 --- a/pgml-sdks/pgml/src/queries.rs +++ b/pgml-sdks/pgml/src/queries.rs @@ -120,6 +120,20 @@ pub const CREATE_INDEX_USING_HNSW: &str = r#" CREATE INDEX %d IF NOT EXISTS %s on %s using hnsw (%d) %d; "#; +///////////////////////////// +// Inserting Search Events // +///////////////////////////// + +// Tag: CRITICAL_QUERY +// Checked: True +// Trigger: Runs whenever a user calls collection.add_search_event +// Required indexes: +// search_results table | "search_results_search_id_rank_index" btree (search_id, rank) +// Used to insert a search event +pub const INSERT_SEARCH_EVENT: &str = r#" +INSERT INTO %s (search_result, event) VALUES ((SELECT id FROM %s WHERE search_id = $1 AND rank = $2), $3) +"#; + ///////////////////////////// // Upserting Documents ////// ///////////////////////////// From 1677a512904401c90c362026d27a1477d97a5f05 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Mon, 12 Feb 2024 12:38:22 -0800 Subject: [PATCH 27/72] Quick fix on remote embeddings search --- pgml-sdks/pgml/src/lib.rs | 10 +++++----- pgml-sdks/pgml/src/search_query_builder.rs | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 1bd2470a7..4271d9007 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -391,9 +391,9 @@ mod tests { #[tokio::test] async fn can_upsert_documents_and_add_pipeline() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cudaap_49"; + let collection_name = "test_r_c_cudaap_51"; let mut collection = Collection::new(collection_name, None); - let documents = generate_dummy_documents(100); + let documents = generate_dummy_documents(2); collection.upsert_documents(documents.clone(), None).await?; let pipeline_name = "test_r_p_cudaap_9"; let mut pipeline = Pipeline::new( @@ -449,7 +449,7 @@ mod tests { .fetch_all(&pool) .await?; assert!(tsvectors.len() == 4); - // collection.archive().await?; + collection.archive().await?; 
Ok(()) } @@ -967,7 +967,7 @@ mod tests { #[tokio::test] async fn can_search_with_remote_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cswre_62"; + let collection_name = "test_r_c_cswre_66"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; @@ -1116,7 +1116,7 @@ mod tests { #[tokio::test] async fn can_vector_search_with_remote_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cvswre_4"; + let collection_name = "test_r_c_cvswre_5"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; diff --git a/pgml-sdks/pgml/src/search_query_builder.rs b/pgml-sdks/pgml/src/search_query_builder.rs index 7e91c3ba4..46d120594 100644 --- a/pgml-sdks/pgml/src/search_query_builder.rs +++ b/pgml-sdks/pgml/src/search_query_builder.rs @@ -182,7 +182,7 @@ pub async fn build_search_query( "ARRAY[embeddings.document_id] as previous_document_ids", )) .expr(Expr::cust_with_values( - "(1 - (embeddings.embedding <=> $1::vector)) * {boost} AS score", + format!("(1 - (embeddings.embedding <=> $1::vector)) * {boost} AS score"), [embedding.clone()], )) .order_by_expr( @@ -206,7 +206,7 @@ pub async fn build_search_query( r#""{cte_name}".previous_document_ids || embeddings.document_id"# ))) .expr(Expr::cust_with_values( - "(1 - (embeddings.embedding <=> $1::vector)) * {boost} AS score", + format!("(1 - (embeddings.embedding <=> $1::vector)) * {boost} AS score"), [embedding.clone()], )) .and_where(Expr::cust(format!( From a5599e53d4a9a480c28530b98e595946b421b794 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Tue, 13 Feb 2024 09:31:46 -0800 Subject: [PATCH 28/72] Quick fix and change the upsert query to be more efficient --- pgml-sdks/pgml/src/collection.rs | 129 ++++++++++++------ pgml-sdks/pgml/src/lib.rs | 14 +- pgml-sdks/pgml/src/queries.rs | 22 ++- .../pgml/src/vector_search_query_builder.rs | 6 +- 4 files changed, 114 insertions(+), 57 deletions(-) diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index 1a04a87b1..eabfb2b20 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -285,14 +285,30 @@ impl Collection { .context("Database data must be set to add a pipeline to a collection")? 
             .project_info;
         pipeline.set_project_info(project_info.clone());
-        // We want to intentially throw an error if they have already added this piepline
-        // as we don't want to casually resync
-        pipeline.verify_in_database(true).await?;
-
-        let mp = MultiProgress::new();
-        mp.println(format!("Added Pipeline {}, Now Syncing...", pipeline.name))?;
-        pipeline.resync().await?;
-        mp.println(format!("Done Syncing {}\n", pipeline.name))?;
+
+        // Let's check if we already have it enabled
+        let pool = get_or_initialize_pool(&self.database_url).await?;
+        let pipelines_table_name = format!("{}.pipelines", project_info.name);
+        let exists: bool = sqlx::query_scalar(&query_builder!(
+            "SELECT EXISTS (SELECT id FROM %s WHERE name = $1 AND active = TRUE)",
+            pipelines_table_name
+        ))
+        .bind(&pipeline.name)
+        .fetch_one(&pool)
+        .await?;
+
+        if exists {
+            warn!("Pipeline {} already exists, not adding", pipeline.name);
+        } else {
+            // We want to intentionally throw an error if they have already added this pipeline
+            // as we don't want to casually resync
+            pipeline.verify_in_database(true).await?;
+
+            let mp = MultiProgress::new();
+            mp.println(format!("Added Pipeline {}, Now Syncing...", pipeline.name))?;
+            pipeline.resync().await?;
+            mp.println(format!("Done Syncing {}\n", pipeline.name))?;
+        }
         Ok(())
     }
 
@@ -477,6 +493,27 @@ impl Collection {
         let progress_bar = utils::default_progress_bar(documents.len() as u64);
         progress_bar.println("Upserting Documents...");
 
+        let query = if args
+            .get("merge")
+            .map(|v| v.as_bool().unwrap_or(false))
+            .unwrap_or(false)
+        {
+            query_builder!(
+                queries::UPSERT_DOCUMENT_AND_MERGE_METADATA,
+                self.documents_table_name,
+                self.documents_table_name,
+                self.documents_table_name,
+                self.documents_table_name
+            )
+        } else {
+            query_builder!(
+                queries::UPSERT_DOCUMENT,
+                self.documents_table_name,
+                self.documents_table_name,
+                self.documents_table_name
+            )
+        };
+
         let batch_size = args
             .get("batch_size")
             .map(TryToNumeric::try_to_u64)
@@ -485,7 +522,30 @@
 
         for batch in documents.chunks(batch_size as usize) {
             let mut transaction = pool.begin().await?;
-            let mut dp = vec![];
+
+            let mut query_values = String::new();
+            let mut binding_parameter_counter = 1;
+            for _ in 0..batch.len() {
+                query_values = format!(
+                    "{query_values}, (${}, ${}, ${})",
+                    binding_parameter_counter,
+                    binding_parameter_counter + 1,
+                    binding_parameter_counter + 2
+                );
+                binding_parameter_counter += 3;
+            }
+
+            let query = query.replace(
+                "{values_parameters}",
+                &query_values.chars().skip(1).collect::<String>(),
+            );
+            let query = query.replace(
+                "{binding_parameter}",
+                &format!("${binding_parameter_counter}"),
+            );
+
+            let mut query = sqlx::query_as(&query);
+
+            let mut source_uuids = vec![];
             for document in batch {
                 let id = document
                     .get("id")
@@ -493,8 +553,8 @@
                     .to_string();
                 let md5_digest = md5::compute(id.as_bytes());
                 let source_uuid = uuid::Uuid::from_slice(&md5_digest.0)?;
+                source_uuids.push(source_uuid);
 
-                // Compute the md5 of each of the fields
                 let start = SystemTime::now();
                 let timestamp = start
                     .duration_since(UNIX_EPOCH)
@@ -518,43 +578,22 @@
                     anyhow::Ok(acc)
                 })?;
                 let versions = serde_json::to_value(versions)?;
-                let query = if args
-                    .get("merge")
-                    .map(|v| v.as_bool().unwrap_or(false))
-                    .unwrap_or(false)
-                {
-                    let query = query_builder!(
-                        queries::UPSERT_DOCUMENT_AND_MERGE_METADATA,
-                        self.documents_table_name,
-                        self.documents_table_name,
-                        self.documents_table_name
-                    );
-                    debug_sqlx_query!(
-                        UPSERT_DOCUMENT_AND_MERGE_METADATA,
-                        query,
source_uuid, - document.0, - versions - ); - query - } else { - let query = query_builder!( - queries::UPSERT_DOCUMENT, - self.documents_table_name, - self.documents_table_name - ); - debug_sqlx_query!(UPSERT_DOCUMENT, query, source_uuid, document.0, versions); - query - }; - let (document_id, previous_document): (i64, Option) = sqlx::query_as(&query) - .bind(source_uuid) - .bind(document) - .bind(versions) - .fetch_one(&mut *transaction) - .await?; - dp.push((document_id, document, previous_document)); + + query = query.bind(source_uuid).bind(document).bind(versions); } + let results: Vec<(i64, Option)> = query + .bind(source_uuids) + .fetch_all(&mut *transaction) + .await?; + let dp: Vec<(i64, Json, Option)> = results + .into_iter() + .zip(batch) + .map(|((id, previous_document), document)| { + (id, document.to_owned(), previous_document) + }) + .collect(); + let transaction = Arc::new(Mutex::new(transaction)); if !pipelines.is_empty() { use futures::stream::StreamExt; diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 4271d9007..4d3b773bb 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -822,7 +822,7 @@ mod tests { #[tokio::test] async fn can_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cswle_117"; + let collection_name = "test_r_c_cswle_118"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; @@ -1049,7 +1049,7 @@ mod tests { #[tokio::test] async fn can_vector_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cvswle_5"; + let collection_name = "test_r_c_cvswle_7"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; @@ -1060,7 +1060,10 @@ mod tests { json!({ "title": { "semantic_search": { - "model": "intfloat/e5-small" + "model": "hkunlp/instructor-base", + "parameters": { + "instruction": "Represent the Wikipedia document for retrieval" + } }, "full_text_search": { "configuration": "english" @@ -1086,6 +1089,9 @@ mod tests { "fields": { "title": { "query": "Test document: 2", + "parameters": { + "instruction": "Represent the Wikipedia document for retrieval" + }, "full_text_filter": "test" }, "body": { @@ -1108,7 +1114,7 @@ mod tests { .into_iter() .map(|r| r["document"]["id"].as_u64().unwrap()) .collect(); - assert_eq!(ids, vec![4, 5, 6, 7, 9]); + assert_eq!(ids, vec![8, 4, 7, 6, 9]); collection.archive().await?; Ok(()) } diff --git a/pgml-sdks/pgml/src/queries.rs b/pgml-sdks/pgml/src/queries.rs index b0be88c41..040bd5f7c 100644 --- a/pgml-sdks/pgml/src/queries.rs +++ b/pgml-sdks/pgml/src/queries.rs @@ -60,7 +60,7 @@ CREATE TABLE IF NOT EXISTS %s ( id serial8 PRIMARY KEY, created_at timestamp NOT NULL DEFAULT now(), chunk_id int8 NOT NULL REFERENCES %s ON DELETE CASCADE ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED, - document_id int8 NOT NULL REFERENCES %s, + document_id int8 NOT NULL REFERENCES %s ON DELETE CASCADE ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED, embedding vector(%d) NOT NULL, UNIQUE (chunk_id) ); @@ -140,22 +140,34 @@ INSERT INTO %s (search_result, event) VALUES ((SELECT id FROM %s WHERE search_id // Tag: CRITICAL_QUERY // Checked: True -// Trigger: Runs whenever a user upserts a document +// Trigger: Runs whenever a user upserts 
 documents
 // Required indexes:
 //   documents table | - "documents_source_uuid_key" UNIQUE CONSTRAINT, btree (source_uuid)
 // Used to upsert a document and merge the previous metadata on conflict
+// The values of the query and the source_uuid binding are built when used
 pub const UPSERT_DOCUMENT_AND_MERGE_METADATA: &str = r#"
-WITH prev AS (SELECT document FROM %s WHERE source_uuid = $1) INSERT INTO %s (source_uuid, document, version) VALUES ($1, $2, $3) ON CONFLICT (source_uuid) DO UPDATE SET document = %s.document || EXCLUDED.document, version = EXCLUDED.version RETURNING id, (SELECT document FROM prev)
+WITH prev AS (
+    SELECT id, document FROM %s WHERE source_uuid = ANY({binding_parameter})
+) INSERT INTO %s (source_uuid, document, version)
+VALUES {values_parameters}
+ON CONFLICT (source_uuid) DO UPDATE SET document = %s.document || EXCLUDED.document, version = EXCLUDED.version
+RETURNING id, (SELECT document FROM prev WHERE prev.id = %s.id)
 "#;
 
 // Tag: CRITICAL_QUERY
 // Checked: True
-// Trigger: Runs whenever a user upserts a document
+// Trigger: Runs whenever a user upserts documents
 // Required indexes:
 //   - documents table | "documents_source_uuid_key" UNIQUE CONSTRAINT, btree (source_uuid)
 // Used to upsert a document and overwrite the previous document on conflict
+// The values of the query and the source_uuid binding are built when used
 pub const UPSERT_DOCUMENT: &str = r#"
-WITH prev AS (SELECT document FROM %s WHERE source_uuid = $1) INSERT INTO %s (source_uuid, document, version) VALUES ($1, $2, $3) ON CONFLICT (source_uuid) DO UPDATE SET document = EXCLUDED.document, version = EXCLUDED.version RETURNING id, (SELECT document FROM prev)
+WITH prev AS (
+    SELECT id, document FROM %s WHERE source_uuid = ANY({binding_parameter})
+) INSERT INTO %s (source_uuid, document, version)
+VALUES {values_parameters}
+ON CONFLICT (source_uuid) DO UPDATE SET document = EXCLUDED.document, version = EXCLUDED.version
+RETURNING id, (SELECT document FROM prev WHERE prev.id = %s.id)
 "#;
 
 /////////////////////////////
diff --git a/pgml-sdks/pgml/src/vector_search_query_builder.rs b/pgml-sdks/pgml/src/vector_search_query_builder.rs
index f2869e762..8a425a07c 100644
--- a/pgml-sdks/pgml/src/vector_search_query_builder.rs
+++ b/pgml-sdks/pgml/src/vector_search_query_builder.rs
@@ -23,7 +23,7 @@ use crate::{
 #[serde(deny_unknown_fields)]
 struct ValidField {
     query: String,
-    model_parameters: Option<Json>,
+    parameters: Option<Json>,
     full_text_filter: Option<String>,
 }
 
@@ -108,7 +108,7 @@ pub async fn build_vector_search_query(
                         "transformer => (SELECT schema #>> '{{{key},semantic_search,model}}' FROM pipeline)",
                     )),
                     Expr::cust_with_values("text => $1", [vf.query]),
-                    Expr::cust(format!("kwargs => COALESCE((SELECT schema #> '{{{key},semantic_search,model_parameters}}' FROM pipeline), '{{}}'::jsonb)")),
+                    Expr::cust_with_values("kwargs => $1", [vf.parameters.unwrap_or_default().0]),
                 ]),
                 Alias::new("embedding"),
             );
@@ -142,7 +142,7 @@
                 let remote_embeddings = build_remote_embeddings(
                     model.runtime,
                     &model.name,
-                    vf.model_parameters.as_ref(),
+                    vf.parameters.as_ref(),
                 )?;
                 let mut embeddings = remote_embeddings.embed(vec![vf.query.to_string()]).await?;
 
From f47002eb70291cebdfb582b89724f978cc7f09bb Mon Sep 17 00:00:00 2001
From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com>
Date: Tue, 13 Feb 2024 15:13:14 -0800
Subject: [PATCH 29/72] Fix for JS after updating tokio

---
 pgml-sdks/pgml/src/lib.rs   | 4 ++--
 pgml-sdks/pgml/src/utils.rs | 7 +++++--
 2 files changed, 7 insertions(+), 4
deletions(-) diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 4d3b773bb..3b5a13ed6 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -133,8 +133,8 @@ fn get_or_set_runtime<'a>() -> &'a Runtime { if let Some(r) = &RUNTIME { r } else { - // TODO: Have some discussion about whether we want single or multi thread here - let runtime = Builder::new_current_thread() + // Need to use multi thread for JavaScript + let runtime = Builder::new_multi_thread() .enable_all() .build() .expect("Error creating tokio runtime"); diff --git a/pgml-sdks/pgml/src/utils.rs b/pgml-sdks/pgml/src/utils.rs index 843a9151a..c1d447bb0 100644 --- a/pgml-sdks/pgml/src/utils.rs +++ b/pgml-sdks/pgml/src/utils.rs @@ -3,6 +3,7 @@ use indicatif::{ProgressBar, ProgressStyle}; use lopdf::Document; use std::fs; use std::path::Path; +use std::time::Duration; use serde::de::{self, Visitor}; use serde::Deserializer; @@ -67,10 +68,12 @@ macro_rules! debug_sea_query { } pub fn default_progress_bar(size: u64) -> ProgressBar { - ProgressBar::new(size).with_style( + let bar = ProgressBar::new(size).with_style( ProgressStyle::with_template("[{elapsed_precise}] {bar:40.cyan/blue} {pos:>7}/{len:7} ") .unwrap(), - ) + ); + bar.enable_steady_tick(Duration::from_millis(100)); + bar } pub fn get_file_contents(path: &Path) -> anyhow::Result { From f39b94c807002caaf466563d7072e069911d9788 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Tue, 13 Feb 2024 15:33:44 -0800 Subject: [PATCH 30/72] Updated extractive_question_answering example for Python --- .../examples/extractive_question_answering.py | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/pgml-sdks/pgml/python/examples/extractive_question_answering.py b/pgml-sdks/pgml/python/examples/extractive_question_answering.py index 21b5f2e67..21a0060f5 100644 --- a/pgml-sdks/pgml/python/examples/extractive_question_answering.py +++ b/pgml-sdks/pgml/python/examples/extractive_question_answering.py @@ -1,4 +1,4 @@ -from pgml import Collection, Model, Splitter, Pipeline, Builtins +from pgml import Collection, Pipeline, Builtins import json from datasets import load_dataset from time import time @@ -14,10 +14,16 @@ async def main(): # Initialize collection collection = Collection("squad_collection") - # Create a pipeline using the default model and splitter - model = Model() - splitter = Splitter() - pipeline = Pipeline("squadv1", model, splitter) + # Create and add pipeline + pipeline = Pipeline( + "squadv1", + { + "text": { + "splitter": {"model": "recursive_character"}, + "semantic_search": {"model": "intfloat/e5-small"}, + } + }, + ) await collection.add_pipeline(pipeline) # Prep documents for upserting @@ -36,8 +42,8 @@ async def main(): query = "Who won more than 20 grammy awards?" 
console.print("Querying for context ...") start = time() - results = ( - await collection.query().vector_recall(query, pipeline).limit(5).fetch_all() + results = await collection.vector_search( + {"query": {"fields": {"text": {"query": query}}}, "limit": 10}, pipeline ) end = time() console.print("\n Results for '%s' " % (query), style="bold") @@ -45,8 +51,8 @@ async def main(): console.print("Query time = %0.3f" % (end - start)) # Construct context from results - context = " ".join(results[0][1].strip().split()) - context = context.replace('"', '\\"').replace("'", "''") + chunks = [r["chunk"] for r in results] + context = "\n\n".join(chunks) # Query for answer builtins = Builtins() From f2c5f61fcbd911b4793b7bdaf1ba9a501fc27aab Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Tue, 13 Feb 2024 15:37:04 -0800 Subject: [PATCH 31/72] Updated question_answering for Python --- .../python/examples/question_answering.py | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/pgml-sdks/pgml/python/examples/question_answering.py b/pgml-sdks/pgml/python/examples/question_answering.py index 923eebc31..d4b2cc082 100644 --- a/pgml-sdks/pgml/python/examples/question_answering.py +++ b/pgml-sdks/pgml/python/examples/question_answering.py @@ -1,4 +1,4 @@ -from pgml import Collection, Model, Splitter, Pipeline +from pgml import Collection, Pipeline from datasets import load_dataset from time import time from dotenv import load_dotenv @@ -13,10 +13,16 @@ async def main(): # Initialize collection collection = Collection("squad_collection") - # Create a pipeline using the default model and splitter - model = Model() - splitter = Splitter() - pipeline = Pipeline("squadv1", model, splitter) + # Create and add pipeline + pipeline = Pipeline( + "squadv1", + { + "text": { + "splitter": {"model": "recursive_character"}, + "semantic_search": {"model": "intfloat/e5-small"}, + } + }, + ) await collection.add_pipeline(pipeline) # Prep documents for upserting @@ -31,12 +37,12 @@ async def main(): # Upsert documents await collection.upsert_documents(documents[:200]) - # Query - query = "Who won 20 grammy awards?" - console.print("Querying for %s..." % query) + # Query for answer + query = "Who won more than 20 grammy awards?" 
+ console.print("Querying for context ...") start = time() - results = ( - await collection.query().vector_recall(query, pipeline).limit(5).fetch_all() + results = await collection.vector_search( + {"query": {"fields": {"text": {"query": query}}}, "limit": 5}, pipeline ) end = time() console.print("\n Results for '%s' " % (query), style="bold") From 6ec6df54d7748437a701a4ee247410e57d5bbd0b Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Tue, 13 Feb 2024 15:40:25 -0800 Subject: [PATCH 32/72] Updated question_answering_instructor for Python --- .../examples/question_answering_instructor.py | 52 ++++++++++++------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/pgml-sdks/pgml/python/examples/question_answering_instructor.py b/pgml-sdks/pgml/python/examples/question_answering_instructor.py index 3ca71e429..ba0069837 100644 --- a/pgml-sdks/pgml/python/examples/question_answering_instructor.py +++ b/pgml-sdks/pgml/python/examples/question_answering_instructor.py @@ -1,4 +1,4 @@ -from pgml import Collection, Model, Splitter, Pipeline +from pgml import Collection, Pipeline from datasets import load_dataset from time import time from dotenv import load_dotenv @@ -11,15 +11,23 @@ async def main(): console = Console() # Initialize collection - collection = Collection("squad_collection_1") + collection = Collection("squad_collection") - # Create a pipeline using hkunlp/instructor-base - model = Model( - name="hkunlp/instructor-base", - parameters={"instruction": "Represent the Wikipedia document for retrieval: "}, + # Create and add pipeline + pipeline = Pipeline( + "squadv1", + { + "text": { + "splitter": {"model": "recursive_character"}, + "semantic_search": { + "model": "hkunlp/instructor-base", + "parameters": { + "instruction": "Represent the Wikipedia document for retrieval: " + }, + }, + } + }, ) - splitter = Splitter() - pipeline = Pipeline("squad_instruction", model, splitter) await collection.add_pipeline(pipeline) # Prep documents for upserting @@ -34,21 +42,25 @@ async def main(): # Upsert documents await collection.upsert_documents(documents[:200]) - # Query + # Query for answer query = "Who won more than 20 grammy awards?" - console.print("Querying for %s..." 
% query) + console.print("Querying for context ...") start = time() - results = ( - await collection.query() - .vector_recall( - query, - pipeline, - query_parameters={ - "instruction": "Represent the Wikipedia question for retrieving supporting documents: " + results = await collection.vector_search( + { + "query": { + "fields": { + "text": { + "query": query, + "parameters": { + "instruction": "Represent the Wikipedia question for retrieving supporting documents: " + }, + }, + } }, - ) - .limit(5) - .fetch_all() + "limit": 5, + }, + pipeline, ) end = time() console.print("\n Results for '%s' " % (query), style="bold") From c9a24e618d964dd3e690f683a4afe7e26776ffbf Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 14 Feb 2024 10:46:02 -0800 Subject: [PATCH 33/72] Updated semantic_search for Python --- .../pgml/python/examples/semantic_search.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/pgml-sdks/pgml/python/examples/semantic_search.py b/pgml-sdks/pgml/python/examples/semantic_search.py index df861502f..9a4e134e5 100644 --- a/pgml-sdks/pgml/python/examples/semantic_search.py +++ b/pgml-sdks/pgml/python/examples/semantic_search.py @@ -1,4 +1,4 @@ -from pgml import Collection, Model, Splitter, Pipeline +from pgml import Collection, Pipeline from datasets import load_dataset from time import time from dotenv import load_dotenv @@ -13,17 +13,24 @@ async def main(): # Initialize collection collection = Collection("quora_collection") - # Create a pipeline using the default model and splitter - model = Model() - splitter = Splitter() - pipeline = Pipeline("quorav1", model, splitter) + # Create and add pipeline + pipeline = Pipeline( + "quorav1", + { + "text": { + "splitter": {"model": "recursive_character"}, + "semantic_search": {"model": "intfloat/e5-small"}, + } + }, + ) await collection.add_pipeline(pipeline) - + # Prep documents for upserting dataset = load_dataset("quora", split="train") questions = [] for record in dataset["questions"]: questions.extend(record["text"]) + # Remove duplicates and add id documents = [] for i, question in enumerate(list(set(questions))): @@ -31,14 +38,14 @@ async def main(): documents.append({"id": i, "text": question}) # Upsert documents - await collection.upsert_documents(documents[:200]) + await collection.upsert_documents(documents[:2000]) # Query query = "What is a good mobile os?" console.print("Querying for %s..." 
% query) start = time() - results = ( - await collection.query().vector_recall(query, pipeline).limit(5).fetch_all() + results = await collection.vector_search( + {"query": {"fields": {"text": {"query": query}}}, "limit": 5}, pipeline ) end = time() console.print("\n Results for '%s' " % (query), style="bold") From 6c7f05ac931a2153e85c14e4ee3b3f32942b9d5d Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 14 Feb 2024 10:54:38 -0800 Subject: [PATCH 34/72] Updated summarizing_question_answering for Python --- .../summarizing_question_answering.py | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/pgml-sdks/pgml/python/examples/summarizing_question_answering.py b/pgml-sdks/pgml/python/examples/summarizing_question_answering.py index 3008b31a9..862830277 100644 --- a/pgml-sdks/pgml/python/examples/summarizing_question_answering.py +++ b/pgml-sdks/pgml/python/examples/summarizing_question_answering.py @@ -14,10 +14,16 @@ async def main(): # Initialize collection collection = Collection("squad_collection") - # Create a pipeline using the default model and splitter - model = Model() - splitter = Splitter() - pipeline = Pipeline("squadv1", model, splitter) + # Create and add pipeline + pipeline = Pipeline( + "squadv1", + { + "text": { + "splitter": {"model": "recursive_character"}, + "semantic_search": {"model": "intfloat/e5-small"}, + } + }, + ) await collection.add_pipeline(pipeline) # Prep documents for upserting @@ -32,12 +38,12 @@ async def main(): # Upsert documents await collection.upsert_documents(documents[:200]) - # Query for context + # Query for answer query = "Who won more than 20 grammy awards?" console.print("Querying for context ...") start = time() - results = ( - await collection.query().vector_recall(query, pipeline).limit(5).fetch_all() + results = await collection.vector_search( + {"query": {"fields": {"text": {"query": query}}}, "limit": 3}, pipeline ) end = time() console.print("\n Results for '%s' " % (query), style="bold") @@ -45,8 +51,8 @@ async def main(): console.print("Query time = %0.3f" % (end - start)) # Construct context from results - context = " ".join(results[0][1].strip().split()) - context = context.replace('"', '\\"').replace("'", "''") + chunks = [r["chunk"] for r in results] + context = "\n\n".join(chunks) # Query for summary builtins = Builtins() From 119807f6a28a8a8b22fd728019c8a676a0a7dbec Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 14 Feb 2024 11:01:39 -0800 Subject: [PATCH 35/72] Updated table question answering for Python --- .../python/examples/table_question_answering.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pgml-sdks/pgml/python/examples/table_question_answering.py b/pgml-sdks/pgml/python/examples/table_question_answering.py index 168a830b2..93385f358 100644 --- a/pgml-sdks/pgml/python/examples/table_question_answering.py +++ b/pgml-sdks/pgml/python/examples/table_question_answering.py @@ -15,11 +15,17 @@ async def main(): # Initialize collection collection = Collection("ott_qa_20k_collection") - # Create a pipeline using deepset/all-mpnet-base-v2-table - # A SentenceTransformer model trained specifically for embedding tabular data for retrieval - model = Model(name="deepset/all-mpnet-base-v2-table") - splitter = Splitter() - pipeline = Pipeline("ott_qa_20kv1", model, splitter) + # Create and add pipeline + pipeline = Pipeline( + "ott_qa_20kv1", + { + "text": { 
+ "splitter": {"model": "recursive_character"}, + # A SentenceTransformer model trained specifically for embedding tabular data for retrieval + "semantic_search": {"model": "deepset/all-mpnet-base-v2-table"}, + } + }, + ) await collection.add_pipeline(pipeline) # Prep documents for upserting From 71d4915953312262929a54b6c8ac9f44822dea43 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 14 Feb 2024 11:04:06 -0800 Subject: [PATCH 36/72] Updated table question answering for Python --- pgml-sdks/pgml/python/examples/table_question_answering.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgml-sdks/pgml/python/examples/table_question_answering.py b/pgml-sdks/pgml/python/examples/table_question_answering.py index 93385f358..243380647 100644 --- a/pgml-sdks/pgml/python/examples/table_question_answering.py +++ b/pgml-sdks/pgml/python/examples/table_question_answering.py @@ -52,8 +52,8 @@ async def main(): query = "Which country has the highest GDP in 2020?" console.print("Querying for %s..." % query) start = time() - results = ( - await collection.query().vector_recall(query, pipeline).limit(5).fetch_all() + results = await collection.vector_search( + {"query": {"fields": {"text": {"query": query}}}, "limit": 5}, pipeline ) end = time() console.print("\n Results for '%s' " % (query), style="bold") From 6dfd0d7537dbf1415918222fed2cb0eeca7319e5 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 14 Feb 2024 11:04:17 -0800 Subject: [PATCH 37/72] Updated rag question answering for Python --- .../python/examples/rag_question_answering.py | 36 +++++++++++-------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/pgml-sdks/pgml/python/examples/rag_question_answering.py b/pgml-sdks/pgml/python/examples/rag_question_answering.py index 94db6846c..2558287f6 100644 --- a/pgml-sdks/pgml/python/examples/rag_question_answering.py +++ b/pgml-sdks/pgml/python/examples/rag_question_answering.py @@ -1,4 +1,4 @@ -from pgml import Collection, Model, Splitter, Pipeline, Builtins, OpenSourceAI +from pgml import Collection, Pipeline, OpenSourceAI, init_logger import json from datasets import load_dataset from time import time @@ -7,6 +7,9 @@ import asyncio +init_logger() + + async def main(): load_dotenv() console = Console() @@ -14,10 +17,16 @@ async def main(): # Initialize collection collection = Collection("squad_collection") - # Create a pipeline using the default model and splitter - model = Model() - splitter = Splitter() - pipeline = Pipeline("squadv1", model, splitter) + # Create and add pipeline + pipeline = Pipeline( + "squadv1", + { + "text": { + "splitter": {"model": "recursive_character"}, + "semantic_search": {"model": "intfloat/e5-small"}, + } + }, + ) await collection.add_pipeline(pipeline) # Prep documents for upserting @@ -34,22 +43,19 @@ async def main(): # Query for context query = "Who won more than 20 grammy awards?" 
- - console.print("Question: %s"%query) console.print("Querying for context ...") - start = time() - results = ( - await collection.query().vector_recall(query, pipeline).limit(5).fetch_all() + results = await collection.vector_search( + {"query": {"fields": {"text": {"query": query}}}, "limit": 10}, pipeline ) end = time() - - #console.print("Query time = %0.3f" % (end - start)) + console.print("\n Results for '%s' " % (query), style="bold") + console.print(results) + console.print("Query time = %0.3f" % (end - start)) # Construct context from results - context = " ".join(results[0][1].strip().split()) - context = context.replace('"', '\\"').replace("'", "''") - console.print("Context is ready...") + chunks = [r["chunk"] for r in results] + context = "\n\n".join(chunks) # Query for answer system_prompt = """Use the following pieces of context to answer the question at the end. From 70f1ac0260f4b42a5cb00779daad5f0855d7956c Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 14 Feb 2024 11:17:18 -0800 Subject: [PATCH 38/72] Updated question_answering for JavaScript --- .../javascript/examples/question_answering.js | 47 ++++++++----------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/pgml-sdks/pgml/javascript/examples/question_answering.js b/pgml-sdks/pgml/javascript/examples/question_answering.js index f8f7f83f5..0d4e08844 100644 --- a/pgml-sdks/pgml/javascript/examples/question_answering.js +++ b/pgml-sdks/pgml/javascript/examples/question_answering.js @@ -3,16 +3,17 @@ require("dotenv").config(); const main = async () => { // Initialize the collection - const collection = pgml.newCollection("my_javascript_qa_collection"); + const collection = pgml.newCollection("qa_collection"); // Add a pipeline - const model = pgml.newModel(); - const splitter = pgml.newSplitter(); - const pipeline = pgml.newPipeline( - "my_javascript_qa_pipeline", - model, - splitter, - ); + const pipeline = pgml.newPipeline("qa_pipeline", { + text: { + splitter: { model: "recursive_character" }, + semantic_search: { + model: "intfloat/e5-small", + }, + }, + }); await collection.add_pipeline(pipeline); // Upsert documents, these documents are automatically split into chunks and embedded by our pipeline @@ -29,27 +30,19 @@ const main = async () => { await collection.upsert_documents(documents); // Perform vector search - const queryResults = await collection - .query() - .vector_recall("What is the best tool for machine learning?", pipeline) - .limit(1) - .fetch_all(); - - // Convert the results to an array of objects - const results = queryResults.map((result) => { - const [similarity, text, metadata] = result; - return { - similarity, - text, - metadata, - }; - }); + const query = "What is the best tool for building machine learning applications?"; + const queryResults = await collection.vector_search( + { + query: { + fields: { + text: { query: query } + } + }, limit: 1 + }, pipeline); + console.log(queryResults); // Archive the collection await collection.archive(); - return results; }; -main().then((results) => { - console.log("Vector search Results: \n", results); -}); +main().then(() => console.log("Done!")); From 67fae0470611db08ca1890960f313cf065b27628 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 14 Feb 2024 11:20:33 -0800 Subject: [PATCH 39/72] Updated question_answering_instructor for JavaScript --- .../examples/question_answering_instructor.js | 57 +++++++++---------- 1 file changed, 
27 insertions(+), 30 deletions(-) diff --git a/pgml-sdks/pgml/javascript/examples/question_answering_instructor.js b/pgml-sdks/pgml/javascript/examples/question_answering_instructor.js index 1e4c22164..238b8fc16 100644 --- a/pgml-sdks/pgml/javascript/examples/question_answering_instructor.js +++ b/pgml-sdks/pgml/javascript/examples/question_answering_instructor.js @@ -6,15 +6,17 @@ const main = async () => { const collection = pgml.newCollection("my_javascript_qai_collection"); // Add a pipeline - const model = pgml.newModel("hkunlp/instructor-base", "pgml", { - instruction: "Represent the Wikipedia document for retrieval: ", + const pipeline = pgml.newPipeline("qa_pipeline", { + text: { + splitter: { model: "recursive_character" }, + semantic_search: { + model: "hkunlp/instructor-base", + parameters: { + instruction: "Represent the Wikipedia document for retrieval: " + } + }, + }, }); - const splitter = pgml.newSplitter(); - const pipeline = pgml.newPipeline( - "my_javascript_qai_pipeline", - model, - splitter, - ); await collection.add_pipeline(pipeline); // Upsert documents, these documents are automatically split into chunks and embedded by our pipeline @@ -31,30 +33,25 @@ const main = async () => { await collection.upsert_documents(documents); // Perform vector search - const queryResults = await collection - .query() - .vector_recall("What is the best tool for machine learning?", pipeline, { - instruction: - "Represent the Wikipedia question for retrieving supporting documents: ", - }) - .limit(1) - .fetch_all(); - - // Convert the results to an array of objects - const results = queryResults.map((result) => { - const [similarity, text, metadata] = result; - return { - similarity, - text, - metadata, - }; - }); + const query = "What is the best tool for building machine learning applications?"; + const queryResults = await collection.vector_search( + { + query: { + fields: { + text: { + query: query, + parameters: { + instruction: + "Represent the Wikipedia question for retrieving supporting documents: ", + } + } + } + }, limit: 1 + }, pipeline); + console.log(queryResults); // Archive the collection await collection.archive(); - return results; }; -main().then((results) => { - console.log("Vector search Results: \n", results); -}); +main().then(() => console.log("Done!")); From 0dd002789e5b7896e182c1ae65ab5516be1210f3 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 14 Feb 2024 11:21:14 -0800 Subject: [PATCH 40/72] Updated question_answering_instructor for JavaScript --- .../pgml/javascript/examples/question_answering_instructor.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgml-sdks/pgml/javascript/examples/question_answering_instructor.js b/pgml-sdks/pgml/javascript/examples/question_answering_instructor.js index 238b8fc16..bb265cc6a 100644 --- a/pgml-sdks/pgml/javascript/examples/question_answering_instructor.js +++ b/pgml-sdks/pgml/javascript/examples/question_answering_instructor.js @@ -3,7 +3,7 @@ require("dotenv").config(); const main = async () => { // Initialize the collection - const collection = pgml.newCollection("my_javascript_qai_collection"); + const collection = pgml.newCollection("qa_pipeline"); // Add a pipeline const pipeline = pgml.newPipeline("qa_pipeline", { From 7afea013fc0d0e4bda30196a75778a97b60f3cb5 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 14 Feb 2024 11:24:52 -0800 Subject: [PATCH 41/72] Updated 
extractive_question_answering example for JavaScript --- .../examples/extractive_question_answering.js | 52 +++++++++---------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/pgml-sdks/pgml/javascript/examples/extractive_question_answering.js b/pgml-sdks/pgml/javascript/examples/extractive_question_answering.js index f70bf26b4..0ab69decb 100644 --- a/pgml-sdks/pgml/javascript/examples/extractive_question_answering.js +++ b/pgml-sdks/pgml/javascript/examples/extractive_question_answering.js @@ -1,19 +1,19 @@ const pgml = require("pgml"); require("dotenv").config(); - const main = async () => { // Initialize the collection - const collection = pgml.newCollection("my_javascript_eqa_collection_2"); + const collection = pgml.newCollection("qa_collection"); // Add a pipeline - const model = pgml.newModel(); - const splitter = pgml.newSplitter(); - const pipeline = pgml.newPipeline( - "my_javascript_eqa_pipeline_1", - model, - splitter, - ); + const pipeline = pgml.newPipeline("qa_pipeline", { + text: { + splitter: { model: "recursive_character" }, + semantic_search: { + model: "intfloat/e5-small", + }, + }, + }); await collection.add_pipeline(pipeline); // Upsert documents, these documents are automatically split into chunks and embedded by our pipeline @@ -29,33 +29,31 @@ const main = async () => { ]; await collection.upsert_documents(documents); - const query = "What is the best tool for machine learning?"; - // Perform vector search - const queryResults = await collection - .query() - .vector_recall(query, pipeline) - .limit(1) - .fetch_all(); - - // Construct context from results - const context = queryResults - .map((result) => { - return result[1]; - }) - .join("\n"); + const query = "What is the best tool for building machine learning applications?"; + const queryResults = await collection.vector_search( + { + query: { + fields: { + text: { query: query } + } + }, limit: 1 + }, pipeline); + console.log("The results"); + console.log(queryResults); + + const context = queryResults.map((result) => result["chunk"]).join("\n\n"); // Query for answer const builtins = pgml.newBuiltins(); const answer = await builtins.transform("question-answering", [ JSON.stringify({ question: query, context: context }), ]); + console.log("The answer"); + console.log(answer); // Archive the collection await collection.archive(); - return answer; }; -main().then((results) => { - console.log("Question answer: \n", results); -}); +main().then(() => console.log("Done!")); From 95188a456c5780d670e6ecf19f69d5085bbede15 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 14 Feb 2024 11:27:35 -0800 Subject: [PATCH 42/72] Updated summarizing_question_answering for JavaScript --- .../summarizing_question_answering.js | 51 +++++++++---------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/pgml-sdks/pgml/javascript/examples/summarizing_question_answering.js b/pgml-sdks/pgml/javascript/examples/summarizing_question_answering.js index f779cde60..5afeba45c 100644 --- a/pgml-sdks/pgml/javascript/examples/summarizing_question_answering.js +++ b/pgml-sdks/pgml/javascript/examples/summarizing_question_answering.js @@ -3,16 +3,17 @@ require("dotenv").config(); const main = async () => { // Initialize the collection - const collection = pgml.newCollection("my_javascript_sqa_collection"); + const collection = pgml.newCollection("qa_collection"); // Add a pipeline - const model = pgml.newModel(); - const splitter = pgml.newSplitter(); - const pipeline = 
pgml.newPipeline( - "my_javascript_sqa_pipeline", - model, - splitter, - ); + const pipeline = pgml.newPipeline("qa_pipeline", { + text: { + splitter: { model: "recursive_character" }, + semantic_search: { + model: "intfloat/e5-small", + }, + }, + }); await collection.add_pipeline(pipeline); // Upsert documents, these documents are automatically split into chunks and embedded by our pipeline @@ -28,21 +29,20 @@ const main = async () => { ]; await collection.upsert_documents(documents); - const query = "What is the best tool for machine learning?"; - // Perform vector search - const queryResults = await collection - .query() - .vector_recall(query, pipeline) - .limit(1) - .fetch_all(); - - // Construct context from results - const context = queryResults - .map((result) => { - return result[1]; - }) - .join("\n"); + const query = "What is the best tool for building machine learning applications?"; + const queryResults = await collection.vector_search( + { + query: { + fields: { + text: { query: query } + } + }, limit: 1 + }, pipeline); + console.log("The results"); + console.log(queryResults); + + const context = queryResults.map((result) => result["chunk"]).join("\n\n"); // Query for summarization const builtins = pgml.newBuiltins(); @@ -50,12 +50,11 @@ const main = async () => { { task: "summarization", model: "sshleifer/distilbart-cnn-12-6" }, [context], ); + console.log("The summary"); + console.log(answer); // Archive the collection await collection.archive(); - return answer; }; -main().then((results) => { - console.log("Question summary: \n", results); -}); +main().then(() => console.log("Done!")); From 8807489b6d52a4fc23b42adc2d8d86d9054aa125 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 14 Feb 2024 11:32:01 -0800 Subject: [PATCH 43/72] Updated semantic_search for JavaScript --- .../javascript/examples/semantic_search.js | 47 +++++++++---------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/pgml-sdks/pgml/javascript/examples/semantic_search.js b/pgml-sdks/pgml/javascript/examples/semantic_search.js index b1458e889..a40970768 100644 --- a/pgml-sdks/pgml/javascript/examples/semantic_search.js +++ b/pgml-sdks/pgml/javascript/examples/semantic_search.js @@ -3,12 +3,17 @@ require("dotenv").config(); const main = async () => { // Initialize the collection - const collection = pgml.newCollection("my_javascript_collection"); + const collection = pgml.newCollection("semantic_search_collection"); // Add a pipeline - const model = pgml.newModel(); - const splitter = pgml.newSplitter(); - const pipeline = pgml.newPipeline("my_javascript_pipeline", model, splitter); + const pipeline = pgml.newPipeline("semantic_search_pipeline", { + text: { + splitter: { model: "recursive_character" }, + semantic_search: { + model: "intfloat/e5-small", + }, + }, + }); await collection.add_pipeline(pipeline); // Upsert documents, these documents are automatically split into chunks and embedded by our pipeline @@ -25,30 +30,20 @@ const main = async () => { await collection.upsert_documents(documents); // Perform vector search - const queryResults = await collection - .query() - .vector_recall( - "Some user query that will match document one first", - pipeline, - ) - .limit(2) - .fetch_all(); - - // Convert the results to an array of objects - const results = queryResults.map((result) => { - const [similarity, text, metadata] = result; - return { - similarity, - text, - metadata, - }; - }); + const query = "Something that will match document 
one first"; + const queryResults = await collection.vector_search( + { + query: { + fields: { + text: { query: query } + } + }, limit: 2 + }, pipeline); + console.log("The results"); + console.log(queryResults); // Archive the collection await collection.archive(); - return results; }; -main().then((results) => { - console.log("Vector search Results: \n", results); -}); +main().then(() => console.log("Done!")); From c9e5d047062496f302f9d0f51255a7fd8b8044a7 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 14 Feb 2024 11:34:03 -0800 Subject: [PATCH 44/72] Updated versions and removed unused clone --- pgml-sdks/pgml/Cargo.toml | 4 ++-- pgml-sdks/pgml/javascript/package.json | 5 ++++- pgml-sdks/pgml/pyproject.toml | 2 +- pgml-sdks/pgml/src/pipeline.rs | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pgml-sdks/pgml/Cargo.toml b/pgml-sdks/pgml/Cargo.toml index cd0304cdf..633c9d30d 100644 --- a/pgml-sdks/pgml/Cargo.toml +++ b/pgml-sdks/pgml/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pgml" -version = "0.11.0" +version = "1.0.0" edition = "2021" authors = ["PosgresML "] homepage = "https://postgresml.org/" @@ -18,7 +18,7 @@ rust_bridge = {path = "../rust-bridge/rust-bridge", version = "0.1.0"} sqlx = { version = "0.7.3", features = [ "runtime-tokio-rustls", "postgres", "json", "time", "uuid"] } serde_json = "1.0.9" anyhow = "1.0.9" -tokio = { version = "1.28.2", features = [ "macros" ] } +tokio = { version = "1.28.2", features = [ "macros", "rt-multi-thread" ] } chrono = "0.4.9" pyo3 = { version = "0.18.3", optional = true, features = ["extension-module", "anyhow"] } pyo3-asyncio = { version = "0.18", features = ["attributes", "tokio-runtime"], optional = true } diff --git a/pgml-sdks/pgml/javascript/package.json b/pgml-sdks/pgml/javascript/package.json index 9b6502458..a6572d67f 100644 --- a/pgml-sdks/pgml/javascript/package.json +++ b/pgml-sdks/pgml/javascript/package.json @@ -1,6 +1,6 @@ { "name": "pgml", - "version": "0.10.1", + "version": "1.0.0", "description": "Open Source Alternative for Building End-to-End Vector Search Applications without OpenAI & Pinecone", "keywords": [ "postgres", @@ -26,5 +26,8 @@ "devDependencies": { "@types/node": "^20.3.1", "cargo-cp-artifact": "^0.1" + }, + "dependencies": { + "dotenv": "^16.4.4" } } diff --git a/pgml-sdks/pgml/pyproject.toml b/pgml-sdks/pgml/pyproject.toml index 89d25773c..7c3e14230 100644 --- a/pgml-sdks/pgml/pyproject.toml +++ b/pgml-sdks/pgml/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "maturin" [project] name = "pgml" requires-python = ">=3.7" -version = "0.11.0" +version = "1.0.0" description = "Python SDK is designed to facilitate the development of scalable vector search applications on PostgreSQL databases." 
authors = [ {name = "PostgresML", email = "team@postgresml.org"}, diff --git a/pgml-sdks/pgml/src/pipeline.rs b/pgml-sdks/pgml/src/pipeline.rs index a4bdffbea..7689ed7ea 100644 --- a/pgml-sdks/pgml/src/pipeline.rs +++ b/pgml-sdks/pgml/src/pipeline.rs @@ -343,7 +343,7 @@ impl Pipeline { } } self.schema = Some(pipeline.schema.clone()); - self.parsed_schema = Some(parsed_schema.clone()); + self.parsed_schema = Some(parsed_schema); pipeline } else { From c71143f45571001bdc1fbd4ae50a210bffd3ab99 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 14 Feb 2024 13:30:01 -0800 Subject: [PATCH 45/72] Cleaned up search query --- pgml-sdks/pgml/src/lib.rs | 2 +- pgml-sdks/pgml/src/search_query_builder.rs | 27 ++++++---------------- 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 3b5a13ed6..451ca5d06 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -822,7 +822,7 @@ mod tests { #[tokio::test] async fn can_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cswle_118"; + let collection_name = "test_r_c_cswle_119"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; diff --git a/pgml-sdks/pgml/src/search_query_builder.rs b/pgml-sdks/pgml/src/search_query_builder.rs index 46d120594..8575861c3 100644 --- a/pgml-sdks/pgml/src/search_query_builder.rs +++ b/pgml-sdks/pgml/src/search_query_builder.rs @@ -294,7 +294,7 @@ pub async fn build_search_query( [&vma.query], )) .order_by(SIden::Str("score"), Order::Desc) - .limit(limit). + .limit(1). 
to_owned(); let mut score_cte_recursive = Query::select() @@ -330,7 +330,7 @@ pub async fn build_search_query( [&vma.query], )) .order_by(SIden::Str("score"), Order::Desc) - .limit(limit) + .limit(1) .to_owned(); if let Some(filter) = &valid_query.query.filter { @@ -394,10 +394,7 @@ pub async fn build_search_query( let sum_expression = sum_expression .context("query requires some scoring through full_text_search or semantic_search")?; main_query - .expr(Expr::cust_with_expr( - "DISTINCT ON ($1) $1 as id", - id_select_expression.clone(), - )) + .expr_as(Expr::expr(id_select_expression.clone()), Alias::new("id")) .expr_as(sum_expression, Alias::new("score")) .column(SIden::Str("document")) .from(SIden::String(select_from.to_string())) @@ -405,24 +402,14 @@ pub async fn build_search_query( JoinType::InnerJoin, documents_table.to_table_tuple(), Alias::new("documents"), - Expr::col((SIden::Str("documents"), SIden::Str("id"))) - .eq(id_select_expression.clone()), + Expr::col((SIden::Str("documents"), SIden::Str("id"))).eq(id_select_expression), ) - .order_by_expr( - Expr::cust_with_expr("$1, score", id_select_expression), - Order::Desc, - ); - - let mut re_ordered_query = Query::select(); - re_ordered_query - .expr(Expr::cust("*")) - .from_subquery(main_query, Alias::new("q1")) .order_by(SIden::Str("score"), Order::Desc) .limit(limit); - let mut re_ordered_query = CommonTableExpression::from_select(re_ordered_query); - re_ordered_query.table_name(Alias::new("main")); - with_clause.cte(re_ordered_query); + let mut main_query = CommonTableExpression::from_select(main_query); + main_query.table_name(Alias::new("main")); + with_clause.cte(main_query); // Insert into searches table let searches_table = format!("{}_{}.searches", collection.name, pipeline.name); From f4d261e2a45106c462e7ae0c590e42eca9e55287 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 14 Feb 2024 13:37:09 -0800 Subject: [PATCH 46/72] Edit test --- pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts b/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts index 72fc7bfda..951946c38 100644 --- a/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts +++ b/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts @@ -84,7 +84,7 @@ it("can search", async () => { full_text_search: { configuration: "english" }, }, }); - let collection = pgml.newCollection("test_j_c_tsc_12") + let collection = pgml.newCollection("test_j_c_tsc_15") await collection.add_pipeline(pipeline) await collection.upsert_documents(generate_dummy_documents(5)) let results = await collection.search( @@ -101,7 +101,7 @@ it("can search", async () => { }, pipeline, ); - let ids = results.map(r => r["id"]); + let ids = results["results"].map((r: any) => r["id"]); expect(ids).toEqual([5, 4, 3]); await collection.archive(); }); From 3d1a6cef05362bb9bdebf78dae0be674d86aefd0 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 14 Feb 2024 13:47:12 -0800 Subject: [PATCH 47/72] Added the stress test --- pgml-sdks/pgml/python/tests/stress_test.py | 110 +++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 pgml-sdks/pgml/python/tests/stress_test.py diff --git a/pgml-sdks/pgml/python/tests/stress_test.py b/pgml-sdks/pgml/python/tests/stress_test.py new file mode 100644 index 000000000..93feacec5 --- /dev/null +++ 
b/pgml-sdks/pgml/python/tests/stress_test.py @@ -0,0 +1,110 @@ +import asyncio +import pgml +import time +from datasets import load_dataset + +pgml.init_logger() + +TOTAL_ROWS = 1000 +BATCH_SIZE = 1000 +OFFSET = 0 + +dataset = load_dataset( + "wikipedia", "20220301.en", trust_remote_code=True, split="train" +) + +collection = pgml.Collection("stress-test-collection-3") +pipeline = pgml.Pipeline( + "stress-test-pipeline-1", + { + "text": { + "splitter": { + "model": "recursive_character", + }, + "semantic_search": { + "model": "hkunlp/instructor-base", + "parameters": { + "instruction": "Represent the Wikipedia document for retrieval: " + }, + }, + }, + }, +) + + +async def upsert_data(): + print(f"\n\nUploading {TOTAL_ROWS} in batches of {BATCH_SIZE}") + total = 0 + batch = [] + tic = time.perf_counter() + for d in dataset: + total += 1 + if total < OFFSET: + continue + batch.append(d) + if len(batch) >= BATCH_SIZE or total >= TOTAL_ROWS: + await collection.upsert_documents(batch, {"batch_size": 1000}) + batch = [] + if total >= TOTAL_ROWS: + break + toc = time.perf_counter() + print(f"Done in {toc - tic:0.4f} seconds\n\n") + + +async def test_document_search(): + print("\n\nDoing document search") + tic = time.perf_counter() + + results = await collection.search( + { + "query": { + "semantic_search": { + "text": { + "query": "What is the best fruit?", + "parameters": { + "instruction": "Represent the Wikipedia question for retrieving supporting documents: " + }, + } + }, + "filter": {"title": {"$ne": "filler"}}, + }, + "limit": 1, + }, + pipeline, + ) + toc = time.perf_counter() + print(f"Done in {toc - tic:0.4f} seconds\n\n") + + +async def test_vector_search(): + print("\n\nDoing vector search") + tic = time.perf_counter() + results = await collection.vector_search( + { + "query": { + "fields": { + "text": { + "query": "What is the best fruit?", + "parameters": { + "instruction": "Represent the Wikipedia question for retrieving supporting documents: " + }, + }, + }, + "filter": {"title": {"$ne": "filler"}}, + }, + "limit": 5, + }, + pipeline, + ) + toc = time.perf_counter() + print(f"Done in {toc - tic:0.4f} seconds\n\n") + + +async def main(): + await collection.add_pipeline(pipeline) + await upsert_data() + await test_document_search() + await test_vector_search() + + +asyncio.run(main()) From 692c252ff12bfe780ba2c260188a53a9875aace8 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Wed, 14 Feb 2024 15:37:17 -0800 Subject: [PATCH 48/72] Updated to use new sdk --- pgml-dashboard/src/api/chatbot.rs | 113 +++++++++++++-------------- pgml-dashboard/src/utils/markdown.rs | 42 ++++++---- 2 files changed, 82 insertions(+), 73 deletions(-) diff --git a/pgml-dashboard/src/api/chatbot.rs b/pgml-dashboard/src/api/chatbot.rs index d5f439902..de10e9451 100644 --- a/pgml-dashboard/src/api/chatbot.rs +++ b/pgml-dashboard/src/api/chatbot.rs @@ -169,7 +169,6 @@ enum KnowledgeBase { } impl KnowledgeBase { - // The topic and knowledge base are the same for now but may be different later fn topic(&self) -> &'static str { match self { Self::PostgresML => "PostgresML", @@ -181,10 +180,10 @@ impl KnowledgeBase { fn collection(&self) -> &'static str { match self { - Self::PostgresML => "PostgresML", - Self::PyTorch => "PyTorch", - Self::Rust => "Rust", - Self::PostgreSQL => "PostgreSQL", + Self::PostgresML => "PostgresML_0", + Self::PyTorch => "PyTorch_0", + Self::Rust => "Rust_0", + Self::PostgreSQL => "PostgreSQL_0", } } } @@ -405,22 +404,20 @@ async fn 
do_chatbot_get_history(user: &User, limit: usize) -> anyhow::Result>() - .join("\n"); + .join(""); let history_collection = Collection::new( "ChatHistory", @@ -557,28 +556,26 @@ async fn process_message( "limit": 5, "order_by": {"timestamp": "desc"}, "filter": { - "metadata": { - "$and" : [ - { - "$or": - [ - {"role": {"$eq": ChatRole::Bot}}, - {"role": {"$eq": ChatRole::User}} - ] - }, - { - "user_id": { - "$eq": user.chatbot_session_id - } - }, - { - "knowledge_base": { - "$eq": knowledge_base - } - }, - // This is where we would match on the model if we wanted to - ] - } + "$and" : [ + { + "$or": + [ + {"role": {"$eq": ChatRole::Bot}}, + {"role": {"$eq": ChatRole::User}} + ] + }, + { + "user_id": { + "$eq": user.chatbot_session_id + } + }, + { + "knowledge_base": { + "$eq": knowledge_base + } + }, + // This is where we would match on the model if we wanted to + ] } }) diff --git a/pgml-dashboard/src/utils/markdown.rs b/pgml-dashboard/src/utils/markdown.rs index 285246add..d11e74a37 100644 --- a/pgml-dashboard/src/utils/markdown.rs +++ b/pgml-dashboard/src/utils/markdown.rs @@ -1,6 +1,6 @@ use crate::api::cms::{DocType, Document}; use crate::{templates::docs::TocLink, utils::config}; - +use anyhow::Context; use std::cell::RefCell; use std::collections::HashMap; use std::path::PathBuf; @@ -1232,25 +1232,28 @@ pub struct SearchResult { pub struct SiteSearch { collection: pgml::Collection, - pipeline: pgml::MultiFieldPipeline, + pipeline: pgml::Pipeline, } impl SiteSearch { pub async fn new() -> anyhow::Result { let collection = pgml::Collection::new( - "hypercloud-site-search-c-4", + "hypercloud-site-search-c-2", Some(std::env::var("SITE_SEARCH_DATABASE_URL")?), ); - let pipeline = pgml::MultiFieldPipeline::new( - "hypercloud-site-search-p-1", + let pipeline = pgml::Pipeline::new( + "hypercloud-site-search-p-0", Some( serde_json::json!({ "title": { "full_text_search": { "configuration": "english" }, - "embed": { - "model": "intfloat/e5-small" + "semantic_search": { + "model": "hkunlp/instructor-base", + "parameters": { + "instruction": "Represent the Wikipedia document for retrieval: " + }, } }, "contents": { @@ -1260,8 +1263,11 @@ impl SiteSearch { "full_text_search": { "configuration": "english" }, - "embed": { - "model": "intfloat/e5-small" + "semantic_search": { + "model": "hkunlp/instructor-base", + "parameters": { + "instruction": "Represent the Wikipedia document for retrieval: " + }, } } }) @@ -1287,7 +1293,6 @@ impl SiteSearch { "full_text_search": { "title": { "query": query, - "boost": 2. }, "contents": { "query": query @@ -1296,10 +1301,15 @@ impl SiteSearch { "semantic_search": { "title": { "query": query, - "boost": 2.0, + "parameters": { + "instruction": "Represent the Wikipedia question for retrieving supporting documents: " + }, }, "contents": { "query": query, + "parameters": { + "instruction": "Represent the Wikipedia question for retrieving supporting documents: " + }, } } }, @@ -1312,9 +1322,11 @@ impl SiteSearch { } }); } - self.collection - .search_local(search.into(), &self.pipeline) - .await? + let results = self.collection.search_local(search.into(), &self.pipeline).await?; + + results["results"] + .as_array() + .context("Error getting results from search")? 
.into_iter() .map(|r| { let SearchResultWithoutSnippet { title, contents, path } = @@ -1332,7 +1344,7 @@ impl SiteSearch { } pub async fn build(&mut self) -> anyhow::Result<()> { - self.collection.add_pipeline(&mut self.pipeline).await.ok(); + self.collection.add_pipeline(&mut self.pipeline).await?; let documents: Vec = futures::future::try_join_all( Self::get_document_paths()? .into_iter() From fc5658f7e4ec6738ebd5bd2f0342e17d21dbd2c4 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Thu, 15 Feb 2024 15:50:36 -0800 Subject: [PATCH 49/72] Updated test --- pgml-sdks/pgml/python/tests/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgml-sdks/pgml/python/tests/test.py b/pgml-sdks/pgml/python/tests/test.py index a0d4d6031..910b82a4c 100644 --- a/pgml-sdks/pgml/python/tests/test.py +++ b/pgml-sdks/pgml/python/tests/test.py @@ -116,7 +116,7 @@ async def test_can_search(): }, pipeline, ) - ids = [result["id"] for result in results] + ids = [result["id"] for result in results["results"]] assert ids == [5, 4, 3] await collection.archive() From 4c38aca84e62dce2be85848ec7b8abb79760b2ea Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Thu, 15 Feb 2024 16:26:36 -0800 Subject: [PATCH 50/72] Removed document_id --- pgml-sdks/pgml/Cargo.lock | 2 +- pgml-sdks/pgml/src/lib.rs | 2 +- pgml-sdks/pgml/src/models.rs | 1 - pgml-sdks/pgml/src/pipeline.rs | 30 +------------------ pgml-sdks/pgml/src/queries.rs | 14 +++------ .../pgml/src/vector_search_query_builder.rs | 4 +-- 6 files changed, 9 insertions(+), 44 deletions(-) diff --git a/pgml-sdks/pgml/Cargo.lock b/pgml-sdks/pgml/Cargo.lock index 81c863909..e651e5969 100644 --- a/pgml-sdks/pgml/Cargo.lock +++ b/pgml-sdks/pgml/Cargo.lock @@ -1531,7 +1531,7 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "pgml" -version = "0.11.0" +version = "1.0.0" dependencies = [ "anyhow", "async-trait", diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 451ca5d06..1c3a9159e 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -1051,7 +1051,7 @@ mod tests { internal_init_logger(None, None).ok(); let collection_name = "test_r_c_cvswle_7"; let mut collection = Collection::new(collection_name, None); - let documents = generate_dummy_documents(10); + let documents = generate_dummy_documents(1000); collection.upsert_documents(documents.clone(), None).await?; let pipeline_name = "test_r_p_cvswle_0"; let mut pipeline = Pipeline::new( diff --git a/pgml-sdks/pgml/src/models.rs b/pgml-sdks/pgml/src/models.rs index 8972a9c57..e5208d4d8 100644 --- a/pgml-sdks/pgml/src/models.rs +++ b/pgml-sdks/pgml/src/models.rs @@ -96,5 +96,4 @@ pub struct Chunk { pub struct TSVector { pub id: i64, pub created_at: DateTime, - pub document_id: i64, } diff --git a/pgml-sdks/pgml/src/pipeline.rs b/pgml-sdks/pgml/src/pipeline.rs index 7689ed7ea..e04541a71 100644 --- a/pgml-sdks/pgml/src/pipeline.rs +++ b/pgml-sdks/pgml/src/pipeline.rs @@ -512,7 +512,6 @@ impl Pipeline { queries::CREATE_EMBEDDINGS_TABLE, &embeddings_table_name, chunks_table_name, - documents_table_name, embedding_length )) .execute(&mut **transaction) @@ -530,19 +529,6 @@ impl Pipeline { .as_str(), ) .await?; - let index_name = format!("{}_pipeline_embedding_document_id_index", key); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX, - "", - index_name, - &embeddings_table_name, - "document_id" - ) - .as_str(), - ) - 
.await?; let index_with_parameters = format!( "WITH (m = {}, ef_construction = {})", embed.hnsw.m, embed.hnsw.ef_construction @@ -571,8 +557,7 @@ impl Pipeline { query_builder!( queries::CREATE_CHUNKS_TSVECTORS_TABLE, tsvectors_table_name, - chunks_table_name, - documents_table_name + chunks_table_name ) .as_str(), ) @@ -590,19 +575,6 @@ impl Pipeline { .as_str(), ) .await?; - let index_name = format!("{}_pipeline_tsvector_document_id_index", key); - transaction - .execute( - query_builder!( - queries::CREATE_INDEX, - "", - index_name, - tsvectors_table_name, - "document_id" - ) - .as_str(), - ) - .await?; let index_name = format!("{}_pipeline_tsvector_index", key); transaction .execute( diff --git a/pgml-sdks/pgml/src/queries.rs b/pgml-sdks/pgml/src/queries.rs index 040bd5f7c..1ea7001bf 100644 --- a/pgml-sdks/pgml/src/queries.rs +++ b/pgml-sdks/pgml/src/queries.rs @@ -60,7 +60,6 @@ CREATE TABLE IF NOT EXISTS %s ( id serial8 PRIMARY KEY, created_at timestamp NOT NULL DEFAULT now(), chunk_id int8 NOT NULL REFERENCES %s ON DELETE CASCADE ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED, - document_id int8 NOT NULL REFERENCES %s ON DELETE CASCADE ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED, embedding vector(%d) NOT NULL, UNIQUE (chunk_id) ); @@ -71,7 +70,6 @@ CREATE TABLE IF NOT EXISTS %s ( id serial8 PRIMARY KEY, created_at timestamp NOT NULL DEFAULT now(), chunk_id int8 NOT NULL REFERENCES %s ON DELETE CASCADE ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED, - document_id int8 NOT NULL REFERENCES %s, ts tsvector, UNIQUE (chunk_id) ); @@ -181,10 +179,9 @@ RETURNING id, (SELECT document FROM prev WHERE prev.id = %s.id) // - chunks table | "{key}_tsvectors_pkey" PRIMARY KEY, btree (id) // Used to generate tsvectors for specific chunks pub const GENERATE_TSVECTORS_FOR_CHUNK_IDS: &str = r#" -INSERT INTO %s (chunk_id, document_id, ts) +INSERT INTO %s (chunk_id, ts) SELECT id, - document_id, to_tsvector('%d', chunk) ts FROM %s @@ -198,10 +195,9 @@ ON CONFLICT (chunk_id) DO UPDATE SET ts = EXCLUDED.ts; // Required indexes: None // Used to generate tsvectors for an entire collection pub const GENERATE_TSVECTORS: &str = r#" -INSERT INTO %s (chunk_id, document_id, ts) +INSERT INTO %s (chunk_id, ts) SELECT id, - document_id, to_tsvector('%d', chunk) ts FROM %s chunks @@ -219,10 +215,9 @@ ON CONFLICT (chunk_id) DO UPDATE SET ts = EXCLUDED.ts; // - chunks table | "{key}_chunks_pkey" PRIMARY KEY, btree (id) // Used to generate embeddings for specific chunks pub const GENERATE_EMBEDDINGS_FOR_CHUNK_IDS: &str = r#" -INSERT INTO %s (chunk_id, document_id, embedding) +INSERT INTO %s (chunk_id, embedding) SELECT id, - document_id, pgml.embed( text => chunk, transformer => $1, @@ -241,10 +236,9 @@ ON CONFLICT (chunk_id) DO UPDATE SET embedding = EXCLUDED.embedding // Required indexes: None // Used to generate embeddings for an entire collection pub const GENERATE_EMBEDDINGS: &str = r#" -INSERT INTO %s (chunk_id, document_id, embedding) +INSERT INTO %s (chunk_id, embedding) SELECT id, - document_id, pgml.embed( text => chunk, transformer => $1, diff --git a/pgml-sdks/pgml/src/vector_search_query_builder.rs b/pgml-sdks/pgml/src/vector_search_query_builder.rs index 8a425a07c..9673d05db 100644 --- a/pgml-sdks/pgml/src/vector_search_query_builder.rs +++ b/pgml-sdks/pgml/src/vector_search_query_builder.rs @@ -166,7 +166,7 @@ pub async fn build_vector_search_query( } query - .column((SIden::Str("embeddings"), SIden::Str("document_id"))) + .column((SIden::Str("documents"), SIden::Str("id"))) 
.column((SIden::Str("chunks"), SIden::Str("chunk"))) .column((SIden::Str("documents"), SIden::Str("document"))) .from_as(embeddings_table.to_table_tuple(), Alias::new("embeddings")) @@ -182,7 +182,7 @@ pub async fn build_vector_search_query( documents_table.to_table_tuple(), Alias::new("documents"), Expr::col((SIden::Str("documents"), SIden::Str("id"))) - .equals((SIden::Str("embeddings"), SIden::Str("document_id"))), + .equals((SIden::Str("chunks"), SIden::Str("document_id"))), ) .limit(limit); From 4167e32a92728089f26d22452383e548a74e9dff Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Thu, 15 Feb 2024 17:07:20 -0800 Subject: [PATCH 51/72] Removed document_id and updated all searches to work without it --- pgml-sdks/pgml/src/lib.rs | 8 +- pgml-sdks/pgml/src/remote_embeddings.rs | 12 +- pgml-sdks/pgml/src/search_query_builder.rs | 158 ++++++++++++++------- 3 files changed, 113 insertions(+), 65 deletions(-) diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 1c3a9159e..a178daffc 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -822,7 +822,7 @@ mod tests { #[tokio::test] async fn can_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cswle_119"; + let collection_name = "test_r_c_cswle_121"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; @@ -1049,9 +1049,9 @@ mod tests { #[tokio::test] async fn can_vector_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cvswle_7"; + let collection_name = "test_r_c_cvswle_9"; let mut collection = Collection::new(collection_name, None); - let documents = generate_dummy_documents(1000); + let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; let pipeline_name = "test_r_p_cvswle_0"; let mut pipeline = Pipeline::new( @@ -1122,7 +1122,7 @@ mod tests { #[tokio::test] async fn can_vector_search_with_remote_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cvswre_5"; + let collection_name = "test_r_c_cvswre_7"; let mut collection = Collection::new(collection_name, None); let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; diff --git a/pgml-sdks/pgml/src/remote_embeddings.rs b/pgml-sdks/pgml/src/remote_embeddings.rs index 7b19e7366..c3e6e3f03 100644 --- a/pgml-sdks/pgml/src/remote_embeddings.rs +++ b/pgml-sdks/pgml/src/remote_embeddings.rs @@ -139,19 +139,11 @@ pub trait RemoteEmbeddings<'a> { let embeddings = self.embed(chunk_texts).await?; let query_string_values = (0..embeddings.len()) - .map(|i| { - query_builder!( - "($%d, $%d, (SELECT document_id FROM %s WHERE id = $%d))", - i * 2 + 1, - i * 2 + 2, - chunks_table_name, - i * 2 + 1 - ) - }) + .map(|i| query_builder!("($%d, $%d)", i * 2 + 1, i * 2 + 2)) .collect::>() .join(","); let query_string = format!( - "INSERT INTO %s (chunk_id, embedding, document_id) VALUES {}", + "INSERT INTO %s (chunk_id, embedding) VALUES {} ON CONFLICT (chunk_id) DO UPDATE SET embedding = EXCLUDED.embedding", query_string_values ); diff --git a/pgml-sdks/pgml/src/search_query_builder.rs b/pgml-sdks/pgml/src/search_query_builder.rs index 8575861c3..3fb6a0db4 100644 --- 
a/pgml-sdks/pgml/src/search_query_builder.rs +++ b/pgml-sdks/pgml/src/search_query_builder.rs @@ -99,6 +99,7 @@ pub async fn build_search_query( // Build the CTE we actually use later let embeddings_table = format!("{}_{}.{}_embeddings", collection.name, pipeline.name, key); + let chunks_table = format!("{}_{}.{}_chunks", collection.name, pipeline.name, key); let cte_name = format!("{key}_embedding_score"); let boost = vsa.boost.unwrap_or(1.); let mut score_cte_non_recursive = Query::select(); @@ -123,8 +124,22 @@ pub async fn build_search_query( score_cte_non_recursive .from_as(embeddings_table.to_table_tuple(), Alias::new("embeddings")) - .column((SIden::Str("embeddings"), SIden::Str("document_id"))) - .expr(Expr::cust(r#"ARRAY[embeddings.document_id] as previous_document_ids"#)) + .column((SIden::Str("documents"), SIden::Str("id"))) + .join_as( + JoinType::InnerJoin, + chunks_table.to_table_tuple(), + Alias::new("chunks"), + Expr::col((SIden::Str("chunks"), SIden::Str("id"))) + .equals((SIden::Str("embeddings"), SIden::Str("chunk_id"))), + ) + .join_as( + JoinType::InnerJoin, + documents_table.to_table_tuple(), + Alias::new("documents"), + Expr::col((SIden::Str("documents"), SIden::Str("id"))) + .equals((SIden::Str("chunks"), SIden::Str("document_id"))), + ) + .expr(Expr::cust(r#"ARRAY[documents.id] as previous_document_ids"#)) .expr(Expr::cust(format!( r#"(1 - (embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector)) * {boost} AS score"# ))) @@ -135,17 +150,31 @@ pub async fn build_search_query( score_cte_recurisive .from_as(embeddings_table.to_table_tuple(), Alias::new("embeddings")) + .column((SIden::Str("documents"), SIden::Str("id"))) + .expr(Expr::cust(format!(r#""{cte_name}".previous_document_ids || documents.id"#))) + .expr(Expr::cust(format!( + r#"(1 - (embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector)) * {boost} AS score"# + ))) + .and_where(Expr::cust(format!(r#"NOT documents.id = ANY("{cte_name}".previous_document_ids)"#))) .join( JoinType::Join, SIden::String(cte_name.clone()), Expr::cust("1 = 1"), ) - .column((SIden::Str("embeddings"), SIden::Str("document_id"))) - .expr(Expr::cust(format!(r#""{cte_name}".previous_document_ids || embeddings.document_id"#))) - .expr(Expr::cust(format!( - r#"(1 - (embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector)) * {boost} AS score"# - ))) - .and_where(Expr::cust(format!(r#"NOT embeddings.document_id = ANY("{cte_name}".previous_document_ids)"#))) + .join_as( + JoinType::InnerJoin, + chunks_table.to_table_tuple(), + Alias::new("chunks"), + Expr::col((SIden::Str("chunks"), SIden::Str("id"))) + .equals((SIden::Str("embeddings"), SIden::Str("chunk_id"))), + ) + .join_as( + JoinType::InnerJoin, + documents_table.to_table_tuple(), + Alias::new("documents"), + Expr::col((SIden::Str("documents"), SIden::Str("id"))) + .equals((SIden::Str("chunks"), SIden::Str("document_id"))), + ) .order_by_expr(Expr::cust(format!( r#"embeddings.embedding <=> (SELECT embedding FROM "{key}_embedding")::vector"# )), Order::Asc ) @@ -177,14 +206,26 @@ pub async fn build_search_query( score_cte_non_recursive .from_as(embeddings_table.to_table_tuple(), Alias::new("embeddings")) - .column((SIden::Str("embeddings"), SIden::Str("document_id"))) - .expr(Expr::cust( - "ARRAY[embeddings.document_id] as previous_document_ids", - )) + .column((SIden::Str("documents"), SIden::Str("id"))) + .expr(Expr::cust("ARRAY[documents.id] as previous_document_ids")) .expr(Expr::cust_with_values( format!("(1 - 
(embeddings.embedding <=> $1::vector)) * {boost} AS score"), [embedding.clone()], )) + .join_as( + JoinType::InnerJoin, + chunks_table.to_table_tuple(), + Alias::new("chunks"), + Expr::col((SIden::Str("chunks"), SIden::Str("id"))) + .equals((SIden::Str("embeddings"), SIden::Str("chunk_id"))), + ) + .join_as( + JoinType::InnerJoin, + documents_table.to_table_tuple(), + Alias::new("documents"), + Expr::col((SIden::Str("documents"), SIden::Str("id"))) + .equals((SIden::Str("chunks"), SIden::Str("document_id"))), + ) .order_by_expr( Expr::cust_with_values( "embeddings.embedding <=> $1::vector", @@ -201,17 +242,31 @@ pub async fn build_search_query( SIden::String(cte_name.clone()), Expr::cust("1 = 1"), ) - .column((SIden::Str("embeddings"), SIden::Str("document_id"))) + .column((SIden::Str("documents"), SIden::Str("id"))) .expr(Expr::cust(format!( - r#""{cte_name}".previous_document_ids || embeddings.document_id"# + r#""{cte_name}".previous_document_ids || documents.id"# ))) .expr(Expr::cust_with_values( format!("(1 - (embeddings.embedding <=> $1::vector)) * {boost} AS score"), [embedding.clone()], )) .and_where(Expr::cust(format!( - r#"NOT embeddings.document_id = ANY("{cte_name}".previous_document_ids)"# + r#"NOT documents.id = ANY("{cte_name}".previous_document_ids)"# ))) + .join_as( + JoinType::InnerJoin, + chunks_table.to_table_tuple(), + Alias::new("chunks"), + Expr::col((SIden::Str("chunks"), SIden::Str("id"))) + .equals((SIden::Str("embeddings"), SIden::Str("chunk_id"))), + ) + .join_as( + JoinType::InnerJoin, + documents_table.to_table_tuple(), + Alias::new("documents"), + Expr::col((SIden::Str("documents"), SIden::Str("id"))) + .equals((SIden::Str("chunks"), SIden::Str("document_id"))), + ) .order_by_expr( Expr::cust_with_values( "embeddings.embedding <=> $1::vector", @@ -227,20 +282,6 @@ pub async fn build_search_query( let filter = FilterBuilder::new(filter.clone().0, "documents", "document").build()?; score_cte_non_recursive.cond_where(filter.clone()); score_cte_recurisive.cond_where(filter); - score_cte_non_recursive.join_as( - JoinType::InnerJoin, - documents_table.to_table_tuple(), - Alias::new("documents"), - Expr::col((SIden::Str("documents"), SIden::Str("id"))) - .equals((SIden::Str("embeddings"), SIden::Str("document_id"))), - ); - score_cte_recurisive.join_as( - JoinType::InnerJoin, - documents_table.to_table_tuple(), - Alias::new("documents"), - Expr::col((SIden::Str("documents"), SIden::Str("id"))) - .equals((SIden::Str("embeddings"), SIden::Str("document_id"))), - ); } let score_cte = Query::select() @@ -264,13 +305,14 @@ pub async fn build_search_query( for (key, vma) in valid_query.query.full_text_search.unwrap_or_default() { let full_text_table = format!("{}_{}.{}_tsvectors", collection.name, pipeline.name, key); + let chunks_table = format!("{}_{}.{}_chunks", collection.name, pipeline.name, key); let boost = vma.boost.unwrap_or(1.0); // Build the score CTE let cte_name = format!("{key}_tsvectors_score"); let mut score_cte_non_recursive = Query::select() - .column((SIden::Str("tsvectors"), SIden::Str("document_id"))) + .column((SIden::Str("documents"), SIden::Str("id"))) .expr_as( Expr::cust_with_values( format!( @@ -281,7 +323,7 @@ pub async fn build_search_query( Alias::new("score") ) .expr(Expr::cust( - "ARRAY[tsvectors.document_id] as previous_document_ids", + "ARRAY[documents.id] as previous_document_ids", )) .from_as( full_text_table.to_table_tuple(), @@ -293,12 +335,26 @@ pub async fn build_search_query( ), [&vma.query], )) + .join_as( + JoinType::InnerJoin, + 
chunks_table.to_table_tuple(), + Alias::new("chunks"), + Expr::col((SIden::Str("chunks"), SIden::Str("id"))) + .equals((SIden::Str("tsvectors"), SIden::Str("chunk_id"))), + ) + .join_as( + JoinType::InnerJoin, + documents_table.to_table_tuple(), + Alias::new("documents"), + Expr::col((SIden::Str("documents"), SIden::Str("id"))) + .equals((SIden::Str("chunks"), SIden::Str("document_id"))), + ) .order_by(SIden::Str("score"), Order::Desc) .limit(1). to_owned(); let mut score_cte_recursive = Query::select() - .column((SIden::Str("tsvectors"), SIden::Str("document_id"))) + .column((SIden::Str("documents"), SIden::Str("id"))) .expr_as( Expr::cust_with_values( format!( @@ -309,7 +365,7 @@ pub async fn build_search_query( Alias::new("score") ) .expr(Expr::cust(format!( - r#""{cte_name}".previous_document_ids || tsvectors.document_id"# + r#""{cte_name}".previous_document_ids || documents.id"# ))) .from_as( full_text_table.to_table_tuple(), @@ -321,7 +377,7 @@ pub async fn build_search_query( Expr::cust("1 = 1"), ) .and_where(Expr::cust(format!( - r#"NOT tsvectors.document_id = ANY("{cte_name}".previous_document_ids)"# + r#"NOT documents.id = ANY("{cte_name}".previous_document_ids)"# ))) .and_where(Expr::cust_with_values( format!( @@ -329,6 +385,20 @@ pub async fn build_search_query( ), [&vma.query], )) + .join_as( + JoinType::InnerJoin, + chunks_table.to_table_tuple(), + Alias::new("chunks"), + Expr::col((SIden::Str("chunks"), SIden::Str("id"))) + .equals((SIden::Str("tsvectors"), SIden::Str("chunk_id"))), + ) + .join_as( + JoinType::InnerJoin, + documents_table.to_table_tuple(), + Alias::new("documents"), + Expr::col((SIden::Str("documents"), SIden::Str("id"))) + .equals((SIden::Str("chunks"), SIden::Str("document_id"))), + ) .order_by(SIden::Str("score"), Order::Desc) .limit(1) .to_owned(); @@ -337,20 +407,6 @@ pub async fn build_search_query( let filter = FilterBuilder::new(filter.clone().0, "documents", "document").build()?; score_cte_recursive.cond_where(filter.clone()); score_cte_non_recursive.cond_where(filter); - score_cte_recursive.join_as( - JoinType::InnerJoin, - documents_table.to_table_tuple(), - Alias::new("documents"), - Expr::col((SIden::Str("documents"), SIden::Str("id"))) - .equals((SIden::Str("tsvectors"), SIden::Str("document_id"))), - ); - score_cte_non_recursive.join_as( - JoinType::InnerJoin, - documents_table.to_table_tuple(), - Alias::new("documents"), - Expr::col((SIden::Str("documents"), SIden::Str("id"))) - .equals((SIden::Str("tsvectors"), SIden::Str("document_id"))), - ); } let score_cte = Query::select() @@ -376,7 +432,7 @@ pub async fn build_search_query( let score_table_names_e: Vec = score_table_names .clone() .into_iter() - .map(|t| Expr::col((SIden::String(t), SIden::Str("document_id"))).into()) + .map(|t| Expr::col((SIden::String(t), SIden::Str("id"))).into()) .collect(); let mut main_query = Query::select(); for i in 1..score_table_names_e.len() { @@ -384,7 +440,7 @@ pub async fn build_search_query( SIden::String(score_table_names[i].to_string()), Expr::col(( SIden::String(score_table_names[i].to_string()), - SIden::Str("document_id"), + SIden::Str("id"), )) .eq(Func::coalesce(score_table_names_e[0..i].to_vec())), ); @@ -428,7 +484,7 @@ pub async fn build_search_query( // Insert into search_results table let search_results_table = format!("{}_{}.search_results", collection.name, pipeline.name); let jsonb_builder = score_table_names.iter().fold(String::new(), |acc, t| { - format!("{acc}, '{t}', (SELECT score FROM {t} WHERE document_id = main.id)") + 
format!("{acc}, '{t}', (SELECT score FROM {t} WHERE {t}.id = main.id)") }); let jsonb_builder = format!("JSONB_BUILD_OBJECT('total', score{jsonb_builder})"); let search_results_insert_query = Query::insert() From 0cadd8ccee6b697b33cdfa7271d9cdffacddaaf9 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Thu, 15 Feb 2024 17:12:33 -0800 Subject: [PATCH 52/72] Fixed python test --- pgml-sdks/pgml/python/tests/test.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pgml-sdks/pgml/python/tests/test.py b/pgml-sdks/pgml/python/tests/test.py index 910b82a4c..874efc4cb 100644 --- a/pgml-sdks/pgml/python/tests/test.py +++ b/pgml-sdks/pgml/python/tests/test.py @@ -65,7 +65,7 @@ def test_can_create_pipeline(): pipeline = pgml.Pipeline("test_p_p_tccp_0", {}) assert pipeline is not None - + def test_can_create_single_field_pipeline(): model = pgml.Model() splitter = pgml.Splitter() @@ -131,13 +131,17 @@ async def test_can_vector_search(): pipeline = pgml.Pipeline( "test_p_p_tcvs_0", { + "title": { + "semantic_search": {"model": "intfloat/e5-small"}, + "full_text_search": {"configuration": "english"}, + }, "text": { "splitter": {"model": "recursive_character"}, "semantic_search": {"model": "intfloat/e5-small"}, }, }, ) - collection = pgml.Collection("test_p_c_tcvs_2") + collection = pgml.Collection("test_p_c_tcvs_3") await collection.add_pipeline(pipeline) await collection.upsert_documents(generate_dummy_documents(5)) results = await collection.vector_search( @@ -145,7 +149,7 @@ async def test_can_vector_search(): "query": { "fields": { "title": {"query": "Test document: 2", "full_text_filter": "test"}, - "body": {"query": "Test document: 2"}, + "text": {"query": "Test document: 2"}, }, "filter": {"id": {"$gt": 2}}, }, @@ -154,7 +158,7 @@ async def test_can_vector_search(): pipeline, ) ids = [result["document"]["id"] for result in results] - assert ids == [3, 4, 4, 3] + assert ids == [3, 3, 4, 4] await collection.archive() From 077ce1b285f85909d3be290c2a031a3794ef8b08 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Thu, 15 Feb 2024 18:36:11 -0800 Subject: [PATCH 53/72] Updated stress test --- pgml-sdks/pgml/python/tests/stress_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgml-sdks/pgml/python/tests/stress_test.py b/pgml-sdks/pgml/python/tests/stress_test.py index 93feacec5..552193690 100644 --- a/pgml-sdks/pgml/python/tests/stress_test.py +++ b/pgml-sdks/pgml/python/tests/stress_test.py @@ -5,7 +5,7 @@ pgml.init_logger() -TOTAL_ROWS = 1000 +TOTAL_ROWS = 10000 BATCH_SIZE = 1000 OFFSET = 0 From 7f53b9336fd15d8ba618564fe4295713d333ac34 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Fri, 16 Feb 2024 15:31:37 -0800 Subject: [PATCH 54/72] Updated to clean up pool access --- pgml-sdks/pgml/src/collection.rs | 161 +++++++-------- pgml-sdks/pgml/src/lib.rs | 19 +- pgml-sdks/pgml/src/model.rs | 58 +----- pgml-sdks/pgml/src/pipeline.rs | 255 ++++++++++-------------- pgml-sdks/pgml/src/remote_embeddings.rs | 34 +--- pgml-sdks/pgml/src/splitter.rs | 57 +----- 6 files changed, 215 insertions(+), 369 deletions(-) diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index eabfb2b20..ba6843339 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -11,10 +11,8 @@ use sqlx::PgConnection; use std::borrow::Cow; use std::collections::HashMap; use std::path::Path; 
-use std::sync::Arc; use std::time::SystemTime; use std::time::UNIX_EPOCH; -use tokio::sync::Mutex; use tracing::{instrument, warn}; use walkdir::WalkDir; @@ -284,7 +282,6 @@ impl Collection { .as_ref() .context("Database data must be set to add a pipeline to a collection")? .project_info; - pipeline.set_project_info(project_info.clone()); // Let's check if we already have it enabled let pool = get_or_initialize_pool(&self.database_url).await?; @@ -302,11 +299,15 @@ impl Collection { } else { // We want to intentially throw an error if they have already added this pipeline // as we don't want to casually resync - pipeline.verify_in_database(true).await?; + pipeline + .verify_in_database(project_info, true, &pool) + .await?; let mp = MultiProgress::new(); mp.println(format!("Added Pipeline {}, Now Syncing...", pipeline.name))?; - pipeline.resync().await?; + pipeline + .resync(project_info, pool.acquire().await?.as_mut()) + .await?; mp.println(format!("Done Syncing {}\n", pipeline.name))?; } Ok(()) @@ -339,11 +340,7 @@ impl Collection { // 4. Delete the pipeline from the collection.pipelines table // 5. Commit the transaction self.verify_in_database(false).await?; - let project_info = &self - .database_data - .as_ref() - .context("Database data must be set to remove a pipeline from a collection")? - .project_info; + let project_info = &self.database_data.as_ref().unwrap().project_info; let pool = get_or_initialize_pool(&self.database_url).await?; let pipeline_schema = format!("{}_{}", project_info.name, pipeline.name); @@ -385,14 +382,20 @@ impl Collection { // The flow for this function: // 1. Set ACTIVE = TRUE for the pipeline in collection.pipelines // 2. Resync the pipeline + // TOOD: Review this pattern + self.verify_in_database(false).await?; + let project_info = &self.database_data.as_ref().unwrap().project_info; + let pool = get_or_initialize_pool(&self.database_url).await?; sqlx::query(&query_builder!( "UPDATE %s SET active = TRUE WHERE name = $1", self.pipelines_table_name )) .bind(&pipeline.name) - .execute(&get_or_initialize_pool(&self.database_url).await?) + .execute(&pool) .await?; - pipeline.resync().await + pipeline + .resync(project_info, pool.acquire().await?.as_mut()) + .await } /// Disables a [Pipeline] on the [Collection] @@ -478,15 +481,28 @@ impl Collection { // The flow for this function // 1. Create the collection if it does not exist // 2. Get all pipelines where ACTIVE = TRUE + // -> Foreach pipeline get the parsed schema // 4. 
Foreach n documents // -> Begin a transaction returning the old document if it existed // -> Insert the document // -> Foreach pipeline check if we need to resync the document and if so sync the document // -> Commit the transaction - let pool = get_or_initialize_pool(&self.database_url).await?; self.verify_in_database(false).await?; let mut pipelines = self.get_pipelines().await?; + let pool = get_or_initialize_pool(&self.database_url).await?; + + let mut parsed_schemas = vec![]; + let project_info = &self.database_data.as_ref().unwrap().project_info; + for pipeline in &mut pipelines { + let parsed_schema = pipeline + .get_parsed_schema(project_info, &pool) + .await + .expect("Error getting parsed schema for pipeline"); + parsed_schemas.push(parsed_schema); + } + let mut pipelines: Vec<(Pipeline, _)> = pipelines.into_iter().zip(parsed_schemas).collect(); + let args = args.unwrap_or_default(); let args = args.as_object().context("args must be a JSON object")?; @@ -586,6 +602,7 @@ impl Collection { .bind(source_uuids) .fetch_all(&mut *transaction) .await?; + let dp: Vec<(i64, Json, Option)> = results .into_iter() .zip(batch) @@ -594,40 +611,24 @@ impl Collection { }) .collect(); - let transaction = Arc::new(Mutex::new(transaction)); - if !pipelines.is_empty() { - use futures::stream::StreamExt; - futures::stream::iter(&mut pipelines) - // Need this map to get around moving the transaction - .map(|pipeline| (pipeline, dp.clone(), transaction.clone())) - .for_each_concurrent(10, |(pipeline, db, transaction)| async move { - let parsed_schema = pipeline - .get_parsed_schema() - .await - .expect("Error getting parsed schema for pipeline"); - let ids_to_run_on: Vec = db - .into_iter() - .filter(|(_, document, previous_document)| match previous_document { - Some(previous_document) => parsed_schema - .iter() - .any(|(key, _)| document[key] != previous_document[key]), - None => true, - }) - .map(|(document_id, _, _)| document_id) - .collect(); - pipeline - .sync_documents(ids_to_run_on, transaction) - .await - .expect("Failed to execute pipeline"); + for (pipeline, parsed_schema) in &mut pipelines { + let ids_to_run_on: Vec = dp + .iter() + .filter(|(_, document, previous_document)| match previous_document { + Some(previous_document) => parsed_schema + .iter() + .any(|(key, _)| document[key] != previous_document[key]), + None => true, }) - .await; + .map(|(document_id, _, _)| *document_id) + .collect(); + pipeline + .sync_documents(ids_to_run_on, project_info, &mut transaction) + .await + .expect("Failed to execute pipeline"); } - Arc::into_inner(transaction) - .context("Error transaction dangling")? - .into_inner() - .commit() - .await?; + transaction.commit().await?; progress_bar.inc(batch_size); } progress_bar.println("Done Upserting Documents\n"); @@ -758,13 +759,10 @@ impl Collection { Some(d) => { if d.code() == Some(Cow::from("XX000")) { self.verify_in_database(false).await?; - let project_info = &self - .database_data - .as_ref() - .context("Database data must be set to do remote embeddings search")? 
- .project_info; - pipeline.set_project_info(project_info.to_owned()); - pipeline.verify_in_database(false).await?; + let project_info = &self.database_data.as_ref().unwrap().project_info; + pipeline + .verify_in_database(project_info, false, &pool) + .await?; let (built_query, values) = build_search_query(self, query, pipeline).await?; let results: (Json,) = sqlx::query_as_with(&built_query, values) @@ -874,13 +872,10 @@ impl Collection { Some(d) => { if d.code() == Some(Cow::from("XX000")) { self.verify_in_database(false).await?; - let project_info = &self - .database_data - .as_ref() - .context("Database data must be set to do remote embeddings search")? - .project_info; - pipeline.set_project_info(project_info.to_owned()); - pipeline.verify_in_database(false).await?; + let project_info = &self.database_data.as_ref().unwrap().project_info; + pipeline + .verify_in_database(project_info, false, &pool) + .await?; let (built_query, values) = build_vector_search_query(query, self, pipeline).await?; let results: Vec<(Json, String, f64)> = @@ -966,11 +961,6 @@ impl Collection { #[instrument(skip(self))] pub async fn get_pipelines(&mut self) -> anyhow::Result> { self.verify_in_database(false).await?; - let project_info = &self - .database_data - .as_ref() - .context("Database data must be set to get collection pipelines")? - .project_info; let pool = get_or_initialize_pool(&self.database_url).await?; let pipelines: Vec = sqlx::query_as(&query_builder!( "SELECT * FROM %s WHERE active = TRUE", @@ -978,15 +968,7 @@ impl Collection { )) .fetch_all(&pool) .await?; - - pipelines - .into_iter() - .map(|p| { - let mut p: Pipeline = p.try_into()?; - p.set_project_info(project_info.clone()); - Ok(p) - }) - .collect() + pipelines.into_iter().map(|p| p.try_into()).collect() } /// Gets a [Pipeline] by name @@ -1005,11 +987,6 @@ impl Collection { #[instrument(skip(self))] pub async fn get_pipeline(&mut self, name: &str) -> anyhow::Result { self.verify_in_database(false).await?; - let project_info = &self - .database_data - .as_ref() - .context("Database data must be set to get collection pipelines")? - .project_info; let pool = get_or_initialize_pool(&self.database_url).await?; let pipeline: models::Pipeline = sqlx::query_as(&query_builder!( "SELECT * FROM %s WHERE name = $1 AND active = TRUE LIMIT 1", @@ -1018,20 +995,7 @@ impl Collection { .bind(name) .fetch_one(&pool) .await?; - let mut pipeline: Pipeline = pipeline.try_into()?; - pipeline.set_project_info(project_info.clone()); - Ok(pipeline) - } - - #[instrument(skip(self))] - pub(crate) async fn get_project_info(&mut self) -> anyhow::Result { - self.verify_in_database(false).await?; - Ok(self - .database_data - .as_ref() - .context("Collection must be verified to get project info")? 
- .project_info - .clone()) + pipeline.try_into() } /// Check if the [Collection] exists in the database @@ -1134,9 +1098,20 @@ impl Collection { Ok(()) } + pub async fn get_pipeline_status(&mut self, pipeline: &mut Pipeline) -> anyhow::Result { + self.verify_in_database(false).await?; + let project_info = &self.database_data.as_ref().unwrap().project_info; + let pool = get_or_initialize_pool(&self.database_url).await?; + pipeline.get_status(project_info, &pool).await + } + pub async fn generate_er_diagram(&mut self, pipeline: &mut Pipeline) -> anyhow::Result { self.verify_in_database(false).await?; - pipeline.verify_in_database(false).await?; + let project_info = &self.database_data.as_ref().unwrap().project_info; + let pool = get_or_initialize_pool(&self.database_url).await?; + pipeline + .verify_in_database(project_info, false, &pool) + .await?; let parsed_schema = pipeline .parsed_schema @@ -1217,7 +1192,6 @@ entity "{schema}.{key}_embeddings" as {nice_name_key}_embeddings {{ -- created_at : timestamp without time zone chunk_id : bigint - document_id : bigint embedding : vector }} "# @@ -1233,7 +1207,6 @@ entity "{schema}.{key}_tsvectors" as {nice_name_key}_tsvectors {{ -- created_at : timestamp without time zone chunk_id : bigint - document_id : bigint tsvectors : tsvector }} "# diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index a178daffc..50f47da09 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -59,12 +59,11 @@ static DATABASE_POOLS: RwLock>> = RwLock::new(Non async fn get_or_initialize_pool(database_url: &Option) -> anyhow::Result { let mut pools = DATABASE_POOLS.write(); let pools = pools.get_or_insert_with(HashMap::new); - let environment_url = std::env::var("DATABASE_URL"); - let environment_url = environment_url.as_deref(); - let url = database_url - .as_deref() - .unwrap_or_else(|| environment_url.expect("Please set DATABASE_URL environment variable")); - if let Some(pool) = pools.get(url) { + let url = database_url.clone().unwrap_or_else(|| { + std::env::var("PGML_DATABASE_URL").unwrap_or_else(|_| + std::env::var("DATABASE_URL").expect("Please set PGML_DATABASE_URL environment variable or explicitly pass a database connection string to your collection")) + }); + if let Some(pool) = pools.get(&url) { Ok(pool.clone()) } else { let timeout = std::env::var("PGML_CHECKOUT_TIMEOUT") @@ -74,7 +73,7 @@ async fn get_or_initialize_pool(database_url: &Option) -> anyhow::Result let pool = PgPoolOptions::new() .acquire_timeout(std::time::Duration::from_millis(timeout)) - .connect_lazy(url)?; + .connect_lazy(&url)?; pools.insert(url.to_string(), pool.clone()); Ok(pool) @@ -693,7 +692,7 @@ mod tests { collection .upsert_documents(documents[..2].to_owned(), None) .await?; - let status = pipeline.get_status().await?; + let status = collection.get_pipeline_status(&mut pipeline).await?; assert_eq!( status.0, json!({ @@ -720,7 +719,7 @@ mod tests { collection .upsert_documents(documents[2..4].to_owned(), None) .await?; - let status = pipeline.get_status().await?; + let status = collection.get_pipeline_status(&mut pipeline).await?; assert_eq!( status.0, json!({ @@ -744,7 +743,7 @@ mod tests { }) ); collection.enable_pipeline(&mut pipeline).await?; - let status = pipeline.get_status().await?; + let status = collection.get_pipeline_status(&mut pipeline).await?; assert_eq!( status.0, json!({ diff --git a/pgml-sdks/pgml/src/model.rs b/pgml-sdks/pgml/src/model.rs index 1f585368b..ff320c0de 100644 --- a/pgml-sdks/pgml/src/model.rs +++ 
b/pgml-sdks/pgml/src/model.rs @@ -1,11 +1,10 @@ -use anyhow::Context; use rust_bridge::{alias, alias_methods}; -use sqlx::postgres::PgPool; +use sqlx::{Pool, Postgres}; use tracing::instrument; use crate::{ collection::ProjectInfo, - get_or_initialize_pool, models, + models, types::{DateTime, Json}, }; @@ -58,7 +57,6 @@ pub struct Model { pub name: String, pub runtime: ModelRuntime, pub parameters: Json, - project_info: Option, pub(crate) database_data: Option, } @@ -94,21 +92,18 @@ impl Model { name, runtime, parameters, - project_info: None, database_data: None, } } #[instrument(skip(self))] - pub(crate) async fn verify_in_database(&mut self, throw_if_exists: bool) -> anyhow::Result<()> { + pub(crate) async fn verify_in_database( + &mut self, + project_info: &ProjectInfo, + throw_if_exists: bool, + pool: &Pool, + ) -> anyhow::Result<()> { if self.database_data.is_none() { - let pool = self.get_pool().await?; - - let project_info = self - .project_info - .as_ref() - .expect("Cannot verify model without project info"); - let mut parameters = self.parameters.clone(); parameters .as_object_mut() @@ -121,7 +116,7 @@ impl Model { .bind(project_info.id) .bind(Into::<&str>::into(&self.runtime)) .bind(¶meters) - .fetch_optional(&pool) + .fetch_optional(pool) .await?; let model = if let Some(m) = model { @@ -137,7 +132,7 @@ impl Model { .bind("successful") .bind(serde_json::json!({})) .bind(serde_json::json!({})) - .fetch_one(&pool) + .fetch_one(pool) .await?; model }; @@ -149,38 +144,6 @@ impl Model { } Ok(()) } - - pub(crate) fn set_project_info(&mut self, project_info: ProjectInfo) { - self.project_info = Some(project_info); - } - - #[instrument(skip(self))] - pub(crate) async fn to_dict(&mut self) -> anyhow::Result { - self.verify_in_database(false).await?; - - let database_data = self - .database_data - .as_ref() - .context("Model must be verified to call to_dict")?; - - Ok(serde_json::json!({ - "id": database_data.id, - "created_at": database_data.created_at, - "name": self.name, - "runtime": Into::<&str>::into(&self.runtime), - "parameters": *self.parameters, - }) - .into()) - } - - async fn get_pool(&self) -> anyhow::Result { - let database_url = &self - .project_info - .as_ref() - .context("Project info required to call method model.get_pool()")? 
- .database_url; - get_or_initialize_pool(database_url).await - } } impl From for Model { @@ -189,7 +152,6 @@ impl From for Model { name: model.hyperparams["name"].as_str().unwrap().to_string(), runtime: model.runtime.as_str().into(), parameters: model.hyperparams, - project_info: None, database_data: Some(ModelDatabaseData { id: model.id, created_at: model.created_at, diff --git a/pgml-sdks/pgml/src/pipeline.rs b/pgml-sdks/pgml/src/pipeline.rs index e04541a71..8b48faa6d 100644 --- a/pgml-sdks/pgml/src/pipeline.rs +++ b/pgml-sdks/pgml/src/pipeline.rs @@ -2,17 +2,13 @@ use anyhow::Context; use rust_bridge::{alias, alias_methods}; use serde::Deserialize; use serde_json::json; -use sqlx::{Executor, PgConnection, PgPool, Postgres, Transaction}; +use sqlx::{Executor, PgConnection, Pool, Postgres, Transaction}; use std::collections::HashMap; -use std::sync::Arc; -use tokio::sync::Mutex; use tracing::instrument; use crate::debug_sqlx_query; -use crate::remote_embeddings::PoolOrArcMutextTransaction; use crate::{ collection::ProjectInfo, - get_or_initialize_pool, model::{Model, ModelRuntime}, models, queries, query_builder, remote_embeddings::build_remote_embeddings, @@ -184,7 +180,6 @@ pub struct Pipeline { pub name: String, pub schema: Option, pub parsed_schema: Option, - project_info: Option, database_data: Option, } @@ -206,7 +201,7 @@ fn json_to_schema(schema: &Json) -> anyhow::Result { }) } -#[alias_methods(new, get_status, to_dict)] +#[alias_methods(new)] impl Pipeline { pub fn new(name: &str, schema: Option) -> anyhow::Result { let parsed_schema = schema.as_ref().map(json_to_schema).transpose()?; @@ -214,7 +209,6 @@ impl Pipeline { name: name.to_string(), schema, parsed_schema, - project_info: None, database_data: None, }) } @@ -235,17 +229,15 @@ impl Pipeline { /// } /// ``` #[instrument(skip(self))] - pub async fn get_status(&mut self) -> anyhow::Result { - self.verify_in_database(false).await?; + pub async fn get_status( + &mut self, + project_info: &ProjectInfo, + pool: &Pool, + ) -> anyhow::Result { let parsed_schema = self .parsed_schema .as_ref() .context("Pipeline must have schema to get status")?; - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to get status")?; - let pool = self.get_pool().await?; let mut results = json!({}); @@ -262,7 +254,7 @@ impl Pipeline { chunks_table_name, documents_table_name )) - .fetch_one(&pool) + .fetch_one(pool) .await?; results[key]["chunks"] = json!({ "synced": chunks_status.0.unwrap_or(0), @@ -279,7 +271,7 @@ impl Pipeline { embeddings_table_name, chunks_table_name )) - .fetch_one(&pool) + .fetch_one(pool) .await?; results[key]["embeddings"] = json!({ "synced": embeddings_status.0.unwrap_or(0), @@ -295,7 +287,7 @@ impl Pipeline { tsvectors_table_name, chunks_table_name )) - .fetch_one(&pool) + .fetch_one(pool) .await?; results[key]["tsvectors"] = json!({ "synced": tsvectors_status.0.unwrap_or(0), @@ -308,21 +300,19 @@ impl Pipeline { } #[instrument(skip(self))] - pub(crate) async fn verify_in_database(&mut self, throw_if_exists: bool) -> anyhow::Result<()> { + pub(crate) async fn verify_in_database( + &mut self, + project_info: &ProjectInfo, + throw_if_exists: bool, + pool: &Pool, + ) -> anyhow::Result<()> { if self.database_data.is_none() { - let pool = self.get_pool().await?; - - let project_info = self - .project_info - .as_ref() - .context("Cannot verify pipeline without project info")?; - let pipeline: Option = sqlx::query_as(&query_builder!( "SELECT * FROM %s WHERE name = $1", 
format!("{}.pipelines", project_info.name) )) .bind(&self.name) - .fetch_optional(&pool) + .fetch_optional(pool) .await?; let pipeline = if let Some(pipeline) = pipeline { @@ -334,12 +324,16 @@ impl Pipeline { for (_key, value) in parsed_schema.iter_mut() { if let Some(splitter) = &mut value.splitter { - splitter.model.set_project_info(project_info.clone()); - splitter.model.verify_in_database(false).await?; + splitter + .model + .verify_in_database(project_info, false, pool) + .await?; } if let Some(embed) = &mut value.semantic_search { - embed.model.set_project_info(project_info.clone()); - embed.model.verify_in_database(false).await?; + embed + .model + .verify_in_database(project_info, false, pool) + .await?; } } self.schema = Some(pipeline.schema.clone()); @@ -355,12 +349,16 @@ impl Pipeline { for (_key, value) in parsed_schema.iter_mut() { if let Some(splitter) = &mut value.splitter { - splitter.model.set_project_info(project_info.clone()); - splitter.model.verify_in_database(false).await?; + splitter + .model + .verify_in_database(project_info, false, pool) + .await?; } if let Some(embed) = &mut value.semantic_search { - embed.model.set_project_info(project_info.clone()); - embed.model.verify_in_database(false).await?; + embed + .model + .verify_in_database(project_info, false, pool) + .await?; } } self.parsed_schema = Some(parsed_schema); @@ -376,7 +374,7 @@ impl Pipeline { .bind(&self.schema) .fetch_one(&mut *transaction) .await?; - self.create_tables(&mut transaction).await?; + self.create_tables(project_info, &mut transaction).await?; transaction.commit().await?; pipeline @@ -392,12 +390,9 @@ impl Pipeline { #[instrument(skip(self))] async fn create_tables( &mut self, + project_info: &ProjectInfo, transaction: &mut Transaction<'_, Postgres>, ) -> anyhow::Result<()> { - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to create_or_get_tables")?; let collection_name = &project_info.name; let documents_table_name = format!("{}.documents", collection_name); @@ -597,10 +592,9 @@ impl Pipeline { pub(crate) async fn sync_documents( &mut self, document_ids: Vec, - transaction: Arc>>, + project_info: &ProjectInfo, + transaction: &mut Transaction<'static, Postgres>, ) -> anyhow::Result<()> { - self.verify_in_database(false).await?; - // We are assuming we have manually verified the pipeline before doing this let parsed_schema = self .parsed_schema @@ -613,7 +607,8 @@ impl Pipeline { key, value.splitter.as_ref().map(|v| &v.model), &document_ids, - transaction.clone(), + project_info, + transaction, ) .await?; if !chunk_ids.is_empty() { @@ -622,7 +617,8 @@ impl Pipeline { key, &embed.model, &chunk_ids, - transaction.clone(), + project_info, + transaction, ) .await?; } @@ -631,7 +627,8 @@ impl Pipeline { key, &full_text_search.configuration, &chunk_ids, - transaction.clone(), + project_info, + transaction, ) .await?; } @@ -646,13 +643,9 @@ impl Pipeline { key: &str, splitter: Option<&Splitter>, document_ids: &Vec, - transaction: Arc>>, + project_info: &ProjectInfo, + transaction: &mut Transaction<'static, Postgres>, ) -> anyhow::Result> { - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to sync chunks")?; - let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); let documents_table_name = format!("{}.documents", project_info.name); let json_key_query = format!("document->>'{}'", key); @@ -679,7 +672,7 @@ impl Pipeline { sqlx::query_scalar(&query) 
.bind(splitter_database_data.id) .bind(document_ids) - .fetch_all(&mut **transaction.lock().await) + .fetch_all(&mut **transaction) .await .map_err(anyhow::Error::msg) } else { @@ -694,7 +687,7 @@ impl Pipeline { debug_sqlx_query!(GENERATE_CHUNKS_FOR_DOCUMENT_IDS, query, document_ids); sqlx::query_scalar(&query) .bind(document_ids) - .fetch_all(&mut **transaction.lock().await) + .fetch_all(&mut **transaction) .await .map_err(anyhow::Error::msg) } @@ -706,7 +699,8 @@ impl Pipeline { key: &str, model: &Model, chunk_ids: &Vec, - transaction: Arc>>, + project_info: &ProjectInfo, + transaction: &mut Transaction<'static, Postgres>, ) -> anyhow::Result<()> { // Remove the stored name from the parameters let mut parameters = model.parameters.clone(); @@ -715,11 +709,6 @@ impl Pipeline { .context("Model parameters must be an object")? .remove("name"); - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to sync chunks")?; - let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); let embeddings_table_name = format!("{}_{}.{}_embeddings", project_info.name, self.name, key); @@ -742,7 +731,7 @@ impl Pipeline { .bind(&model.name) .bind(¶meters) .bind(chunk_ids) - .execute(&mut **transaction.lock().await) + .execute(&mut **transaction) .await?; } r => { @@ -752,7 +741,7 @@ impl Pipeline { &embeddings_table_name, &chunks_table_name, Some(chunk_ids), - PoolOrArcMutextTransaction::ArcMutextTransaction(transaction), + transaction, ) .await?; } @@ -766,12 +755,9 @@ impl Pipeline { key: &str, configuration: &str, chunk_ids: &Vec, - transaction: Arc>>, + project_info: &ProjectInfo, + transaction: &mut Transaction<'static, Postgres>, ) -> anyhow::Result<()> { - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to sync TSVectors")?; let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); let tsvectors_table_name = format!("{}_{}.{}_tsvectors", project_info.name, self.name, key); let query = query_builder!( @@ -783,52 +769,63 @@ impl Pipeline { debug_sqlx_query!(GENERATE_TSVECTORS_FOR_CHUNK_IDS, query, chunk_ids); sqlx::query(&query) .bind(chunk_ids) - .execute(&mut **transaction.lock().await) + .execute(&mut **transaction) .await?; Ok(()) } #[instrument(skip(self))] - pub(crate) async fn resync(&mut self) -> anyhow::Result<()> { - self.verify_in_database(false).await?; + pub(crate) async fn resync( + &mut self, + project_info: &ProjectInfo, + // pool: &Pool, + connection: &mut PgConnection, + ) -> anyhow::Result<()> { // We are assuming we have manually verified the pipeline before doing this - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to sync chunks")?; let parsed_schema = self .parsed_schema .as_ref() .context("Pipeline must have schema to execute")?; // Before doing any syncing, delete all old and potentially outdated documents - let pool = self.get_pool().await?; for (key, _value) in parsed_schema.iter() { let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); - pool.execute(query_builder!("DELETE FROM %s CASCADE", chunks_table_name).as_str()) + connection + .execute(query_builder!("DELETE FROM %s CASCADE", chunks_table_name).as_str()) .await?; } for (key, value) in parsed_schema.iter() { - self.resync_chunks(key, value.splitter.as_ref().map(|v| &v.model)) - .await?; + self.resync_chunks( + key, + value.splitter.as_ref().map(|v| &v.model), + project_info, + connection, + ) 
+ .await?; if let Some(embed) = &value.semantic_search { - self.resync_embeddings(key, &embed.model).await?; + self.resync_embeddings(key, &embed.model, project_info, connection) + .await?; } if let Some(full_text_search) = &value.full_text_search { - self.resync_tsvectors(key, &full_text_search.configuration) - .await?; + self.resync_tsvectors( + key, + &full_text_search.configuration, + project_info, + connection, + ) + .await?; } } Ok(()) } #[instrument(skip(self))] - async fn resync_chunks(&self, key: &str, splitter: Option<&Splitter>) -> anyhow::Result<()> { - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to sync chunks")?; - let pool = self.get_pool().await?; - + async fn resync_chunks( + &self, + key: &str, + splitter: Option<&Splitter>, + project_info: &ProjectInfo, + connection: &mut PgConnection, + ) -> anyhow::Result<()> { let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); let documents_table_name = format!("{}.documents", project_info.name); let json_key_query = format!("document->>'{}'", key); @@ -852,7 +849,7 @@ impl Pipeline { ); sqlx::query(&query) .bind(splitter_database_data.id) - .execute(&pool) + .execute(connection) .await?; } else { let query = query_builder!( @@ -862,15 +859,19 @@ impl Pipeline { &documents_table_name ); debug_sqlx_query!(GENERATE_CHUNKS, query); - sqlx::query(&query).execute(&pool).await?; + sqlx::query(&query).execute(connection).await?; } Ok(()) } #[instrument(skip(self))] - async fn resync_embeddings(&self, key: &str, model: &Model) -> anyhow::Result<()> { - let pool = self.get_pool().await?; - + async fn resync_embeddings( + &self, + key: &str, + model: &Model, + project_info: &ProjectInfo, + connection: &mut PgConnection, + ) -> anyhow::Result<()> { // Remove the stored name from the parameters let mut parameters = model.parameters.clone(); parameters @@ -878,11 +879,6 @@ impl Pipeline { .context("Model parameters must be an object")? 
.remove("name"); - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to sync chunks")?; - let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); let embeddings_table_name = format!("{}_{}.{}_embeddings", project_info.name, self.name, key); @@ -898,7 +894,7 @@ impl Pipeline { sqlx::query(&query) .bind(&model.name) .bind(¶meters) - .execute(&pool) + .execute(connection) .await?; } r => { @@ -908,7 +904,7 @@ impl Pipeline { &embeddings_table_name, &chunks_table_name, None, - PoolOrArcMutextTransaction::Pool(pool), + connection, ) .await?; } @@ -917,14 +913,13 @@ impl Pipeline { } #[instrument(skip(self))] - async fn resync_tsvectors(&self, key: &str, configuration: &str) -> anyhow::Result<()> { - let project_info = self - .project_info - .as_ref() - .context("Pipeline must have project info to sync TSVectors")?; - - let pool = self.get_pool().await?; - + async fn resync_tsvectors( + &self, + key: &str, + configuration: &str, + project_info: &ProjectInfo, + connection: &mut PgConnection, + ) -> anyhow::Result<()> { let chunks_table_name = format!("{}_{}.{}_chunks", project_info.name, self.name, key); let tsvectors_table_name = format!("{}_{}.{}_tsvectors", project_info.name, self.name, key); @@ -935,46 +930,17 @@ impl Pipeline { chunks_table_name ); debug_sqlx_query!(GENERATE_TSVECTORS, query); - sqlx::query(&query).execute(&pool).await?; + sqlx::query(&query).execute(connection).await?; Ok(()) } #[instrument(skip(self))] - pub async fn to_dict(&mut self) -> anyhow::Result { - self.verify_in_database(false).await?; - self.schema - .as_ref() - .context("Pipeline must have schema set to call to_dict") - .map(|v| v.to_owned()) - } - - async fn get_pool(&self) -> anyhow::Result { - let database_url = &self - .project_info - .as_ref() - .context("Project info required to call method pipeline.get_pool()")? 
- .database_url; - get_or_initialize_pool(database_url).await - } - - #[instrument(skip(self))] - pub(crate) fn set_project_info(&mut self, project_info: ProjectInfo) { - if let Some(parsed_schema) = &mut self.parsed_schema { - for (_key, value) in parsed_schema.iter_mut() { - if let Some(splitter) = &mut value.splitter { - splitter.model.set_project_info(project_info.clone()); - } - if let Some(embed) = &mut value.semantic_search { - embed.model.set_project_info(project_info.clone()); - } - } - } - self.project_info = Some(project_info); - } - - #[instrument(skip(self))] - pub(crate) async fn get_parsed_schema(&mut self) -> anyhow::Result { - self.verify_in_database(false).await?; + pub(crate) async fn get_parsed_schema( + &mut self, + project_info: &ProjectInfo, + pool: &Pool, + ) -> anyhow::Result { + self.verify_in_database(project_info, false, pool).await?; Ok(self.parsed_schema.as_ref().unwrap().clone()) } @@ -1015,7 +981,6 @@ impl TryFrom for Pipeline { name: value.name, schema: Some(value.schema), parsed_schema: Some(parsed_schema), - project_info: None, database_data: None, }) } diff --git a/pgml-sdks/pgml/src/remote_embeddings.rs b/pgml-sdks/pgml/src/remote_embeddings.rs index c3e6e3f03..f010c6c50 100644 --- a/pgml-sdks/pgml/src/remote_embeddings.rs +++ b/pgml-sdks/pgml/src/remote_embeddings.rs @@ -1,18 +1,10 @@ use reqwest::{Client, RequestBuilder}; -use sqlx::{postgres::PgPool, Postgres, Transaction}; +use sqlx::PgConnection; use std::env; -use std::sync::Arc; -use tokio::sync::Mutex; use tracing::instrument; use crate::{model::ModelRuntime, models, query_builder, types::Json}; -#[derive(Clone, Debug)] -pub enum PoolOrArcMutextTransaction { - Pool(PgPool), - ArcMutextTransaction(Arc>>), -} - pub fn build_remote_embeddings<'a>( source: ModelRuntime, model_name: &'a str, @@ -55,7 +47,7 @@ pub trait RemoteEmbeddings<'a> { embeddings_table_name: &str, chunks_table_name: &str, chunk_ids: Option<&Vec>, - mut db_executor: PoolOrArcMutextTransaction, + connection: &mut PgConnection, limit: Option, ) -> anyhow::Result> { // Requires _query_text be declared out here so it lives long enough @@ -79,13 +71,10 @@ pub trait RemoteEmbeddings<'a> { } }; - match &mut db_executor { - PoolOrArcMutextTransaction::Pool(pool) => query.fetch_all(&*pool).await, - PoolOrArcMutextTransaction::ArcMutextTransaction(transaction) => { - query.fetch_all(&mut **transaction.lock().await).await - } - } - .map_err(|e| anyhow::anyhow!(e)) + query + .fetch_all(connection) + .await + .map_err(|e| anyhow::anyhow!(e)) } #[instrument(skip(self, response))] @@ -117,7 +106,7 @@ pub trait RemoteEmbeddings<'a> { embeddings_table_name: &str, chunks_table_name: &str, mut chunk_ids: Option<&Vec>, - mut db_executor: PoolOrArcMutextTransaction, + connection: &mut PgConnection, ) -> anyhow::Result<()> { loop { let chunks = self @@ -125,7 +114,7 @@ pub trait RemoteEmbeddings<'a> { embeddings_table_name, chunks_table_name, chunk_ids, - db_executor.clone(), + connection, None, ) .await?; @@ -154,12 +143,7 @@ pub trait RemoteEmbeddings<'a> { query = query.bind(retrieved_chunk_ids[i]).bind(&embeddings[i]); } - match &mut db_executor { - PoolOrArcMutextTransaction::Pool(pool) => query.execute(&*pool).await, - PoolOrArcMutextTransaction::ArcMutextTransaction(transaction) => { - query.execute(&mut **transaction.lock().await).await - } - }?; + query.execute(&mut *connection).await?; // Set it to none so if it is not None, we don't just retrived the same chunks over and over chunk_ids = None; diff --git a/pgml-sdks/pgml/src/splitter.rs 
b/pgml-sdks/pgml/src/splitter.rs index b15368af9..96b1ed9da 100644 --- a/pgml-sdks/pgml/src/splitter.rs +++ b/pgml-sdks/pgml/src/splitter.rs @@ -1,11 +1,10 @@ -use anyhow::Context; use rust_bridge::{alias, alias_methods}; -use sqlx::postgres::{PgConnection, PgPool}; +use sqlx::{postgres::PgConnection, Pool, Postgres}; use tracing::instrument; use crate::{ collection::ProjectInfo, - get_or_initialize_pool, models, queries, + models, queries, types::{DateTime, Json}, }; @@ -24,7 +23,6 @@ pub(crate) struct SplitterDatabaseData { pub struct Splitter { pub name: String, pub parameters: Json, - project_info: Option, pub(crate) database_data: Option, } @@ -55,28 +53,25 @@ impl Splitter { Self { name, parameters, - project_info: None, database_data: None, } } #[instrument(skip(self))] - pub(crate) async fn verify_in_database(&mut self, throw_if_exists: bool) -> anyhow::Result<()> { + pub(crate) async fn verify_in_database( + &mut self, + project_info: &ProjectInfo, + throw_if_exists: bool, + pool: &Pool, + ) -> anyhow::Result<()> { if self.database_data.is_none() { - let pool = self.get_pool().await?; - - let project_info = self - .project_info - .as_ref() - .expect("Cannot verify splitter without project info"); - let splitter: Option = sqlx::query_as( "SELECT * FROM pgml.splitters WHERE project_id = $1 AND name = $2 and parameters = $3", ) .bind(project_info.id) .bind(&self.name) .bind(&self.parameters) - .fetch_optional(&pool) + .fetch_optional(pool) .await?; let splitter = if let Some(s) = splitter { @@ -89,7 +84,7 @@ impl Splitter { .bind(project_info.id) .bind(&self.name) .bind(&self.parameters) - .fetch_one(&pool) + .fetch_one(pool) .await? }; @@ -107,37 +102,6 @@ impl Splitter { .await?; Ok(()) } - - pub(crate) fn set_project_info(&mut self, project_info: ProjectInfo) { - self.project_info = Some(project_info) - } - - #[instrument(skip(self))] - pub(crate) async fn to_dict(&mut self) -> anyhow::Result { - self.verify_in_database(false).await?; - - let database_data = self - .database_data - .as_ref() - .context("Splitter must be verified to call to_dict")?; - - Ok(serde_json::json!({ - "id": database_data.id, - "created_at": database_data.created_at, - "name": self.name, - "parameters": *self.parameters, - }) - .into()) - } - - async fn get_pool(&self) -> anyhow::Result { - let database_url = &self - .project_info - .as_ref() - .context("Project info required to call method splitter.get_pool()")? 
- .database_url; - get_or_initialize_pool(database_url).await - } } impl From for Splitter { @@ -145,7 +109,6 @@ impl From for Splitter { Self { name: splitter.name, parameters: splitter.parameters, - project_info: None, database_data: Some(SplitterDatabaseData { id: splitter.id, created_at: splitter.created_at, From 144da4283b1e04cb63c093bc5315e532e3df280c Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Fri, 16 Feb 2024 15:49:45 -0800 Subject: [PATCH 55/72] Added test for bad collection names --- pgml-sdks/pgml/src/collection.rs | 14 +++++++-- pgml-sdks/pgml/src/lib.rs | 52 ++++++++++++++++---------------- 2 files changed, 37 insertions(+), 29 deletions(-) diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index ba6843339..916374df3 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -146,14 +146,22 @@ impl Collection { /// use pgml::Collection; /// let collection = Collection::new("my_collection", None); /// ``` - pub fn new(name: &str, database_url: Option) -> Self { + pub fn new(name: &str, database_url: Option) -> anyhow::Result { + if !name + .chars() + .all(|c| c.is_alphanumeric() || c.is_whitespace() || c == '-' || c == '_') + { + anyhow::bail!( + "Name must only consist of letters, numebers, white space, and '-' or '_'" + ) + } let ( pipelines_table_name, documents_table_name, chunks_table_name, documents_tsvectors_table_name, ) = Self::generate_table_names(name); - Self { + Ok(Self { name: name.to_string(), database_url, pipelines_table_name, @@ -161,7 +169,7 @@ impl Collection { chunks_table_name, documents_tsvectors_table_name, database_data: None, - } + }) } #[instrument(skip(self))] diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 50f47da09..29a6d8251 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -278,7 +278,7 @@ mod tests { #[tokio::test] async fn can_create_collection() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let mut collection = Collection::new("test_r_c_ccc_0", None); + let mut collection = Collection::new("test_r_c_ccc_0", None)?; assert!(collection.database_data.is_none()); collection.verify_in_database(false).await?; assert!(collection.database_data.is_some()); @@ -290,7 +290,7 @@ mod tests { async fn can_add_remove_pipeline() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let mut pipeline = Pipeline::new("test_p_carp_58", Some(json!({}).into()))?; - let mut collection = Collection::new("test_r_c_carp_1", None); + let mut collection = Collection::new("test_r_c_carp_1", None)?; assert!(collection.database_data.is_none()); collection.add_pipeline(&mut pipeline).await?; assert!(collection.database_data.is_some()); @@ -306,7 +306,7 @@ mod tests { internal_init_logger(None, None).ok(); let mut pipeline1 = Pipeline::new("test_r_p_carps_1", Some(json!({}).into()))?; let mut pipeline2 = Pipeline::new("test_r_p_carps_2", Some(json!({}).into()))?; - let mut collection = Collection::new("test_r_c_carps_11", None); + let mut collection = Collection::new("test_r_c_carps_11", None)?; collection.add_pipeline(&mut pipeline1).await?; collection.add_pipeline(&mut pipeline2).await?; let pipelines = collection.get_pipelines().await?; @@ -351,7 +351,7 @@ mod tests { .into(), ), )?; - let mut collection = Collection::new(collection_name, None); + let mut collection = Collection::new(collection_name, None)?; collection.add_pipeline(&mut pipeline).await?; let documents = 
generate_dummy_documents(2); collection.upsert_documents(documents.clone(), None).await?; @@ -391,7 +391,7 @@ mod tests { async fn can_upsert_documents_and_add_pipeline() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let collection_name = "test_r_c_cudaap_51"; - let mut collection = Collection::new(collection_name, None); + let mut collection = Collection::new(collection_name, None)?; let documents = generate_dummy_documents(2); collection.upsert_documents(documents.clone(), None).await?; let pipeline_name = "test_r_p_cudaap_9"; @@ -455,7 +455,7 @@ mod tests { #[tokio::test] async fn disable_enable_pipeline() -> anyhow::Result<()> { let mut pipeline = Pipeline::new("test_p_dep_1", Some(json!({}).into()))?; - let mut collection = Collection::new("test_r_c_dep_1", None); + let mut collection = Collection::new("test_r_c_dep_1", None)?; collection.add_pipeline(&mut pipeline).await?; let queried_pipeline = &collection.get_pipelines().await?[0]; assert_eq!(pipeline.name, queried_pipeline.name); @@ -473,7 +473,7 @@ mod tests { async fn can_upsert_documents_and_enable_pipeline() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let collection_name = "test_r_c_cudaep_43"; - let mut collection = Collection::new(collection_name, None); + let mut collection = Collection::new(collection_name, None)?; let pipeline_name = "test_r_p_cudaep_9"; let mut pipeline = Pipeline::new( pipeline_name, @@ -514,7 +514,7 @@ mod tests { async fn random_pipelines_documents_test() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let collection_name = "test_r_c_rpdt_3"; - let mut collection = Collection::new(collection_name, None); + let mut collection = Collection::new(collection_name, None)?; let documents = generate_dummy_documents(6); collection .upsert_documents(documents[..2].to_owned(), None) @@ -666,7 +666,7 @@ mod tests { async fn pipeline_sync_status() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let collection_name = "test_r_c_pss_5"; - let mut collection = Collection::new(collection_name, None); + let mut collection = Collection::new(collection_name, None)?; let pipeline_name = "test_r_p_pss_0"; let mut pipeline = Pipeline::new( pipeline_name, @@ -774,7 +774,7 @@ mod tests { async fn can_specify_custom_hnsw_parameters_for_pipelines() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let collection_name = "test_r_c_cschpfp_4"; - let mut collection = Collection::new(collection_name, None); + let mut collection = Collection::new(collection_name, None)?; let pipeline_name = "test_r_p_cschpfp_0"; let mut pipeline = Pipeline::new( pipeline_name, @@ -822,7 +822,7 @@ mod tests { async fn can_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); let collection_name = "test_r_c_cswle_121"; - let mut collection = Collection::new(collection_name, None); + let mut collection = Collection::new(collection_name, None)?; let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; let pipeline_name = "test_r_p_cswle_9"; @@ -966,8 +966,8 @@ mod tests { #[tokio::test] async fn can_search_with_remote_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cswre_66"; - let mut collection = Collection::new(collection_name, None); + let collection_name = "test r_c_cswre_66"; + let mut collection = Collection::new(collection_name, None)?; let documents = generate_dummy_documents(10); 
collection.upsert_documents(documents.clone(), None).await?; let pipeline_name = "test_r_p_cswre_8"; @@ -1048,8 +1048,8 @@ mod tests { #[tokio::test] async fn can_vector_search_with_local_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cvswle_9"; - let mut collection = Collection::new(collection_name, None); + let collection_name = "test r_c_cvswle_9"; + let mut collection = Collection::new(collection_name, None)?; let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; let pipeline_name = "test_r_p_cvswle_0"; @@ -1121,8 +1121,8 @@ mod tests { #[tokio::test] async fn can_vector_search_with_remote_embeddings() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_cvswre_7"; - let mut collection = Collection::new(collection_name, None); + let collection_name = "test r_c_cvswre_7"; + let mut collection = Collection::new(collection_name, None)?; let documents = generate_dummy_documents(10); collection.upsert_documents(documents.clone(), None).await?; let pipeline_name = "test_r_p_cvswre_0"; @@ -1190,7 +1190,7 @@ mod tests { #[tokio::test] async fn can_vector_search_with_query_builder() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let mut collection = Collection::new("test_r_c_cvswqb_7", None); + let mut collection = Collection::new("test r_c_cvswqb_7", None)?; let mut pipeline = Pipeline::new( "test_r_p_cvswqb_0", Some( @@ -1247,7 +1247,7 @@ mod tests { #[tokio::test] async fn can_upsert_and_filter_get_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let mut collection = Collection::new("test_r_c_cuafgd_1", None); + let mut collection = Collection::new("test r_c_cuafgd_1", None)?; let documents = vec![ serde_json::json!({"id": 1, "random_key": 10, "text": "hello world 1"}).into(), @@ -1300,7 +1300,7 @@ mod tests { #[tokio::test] async fn can_paginate_get_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let mut collection = Collection::new("test_r_c_cpgd_2", None); + let mut collection = Collection::new("test_r_c_cpgd_2", None)?; collection .upsert_documents(generate_dummy_documents(10), None) .await?; @@ -1382,7 +1382,7 @@ mod tests { #[tokio::test] async fn can_filter_and_paginate_get_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let mut collection = Collection::new("test_r_c_cfapgd_1", None); + let mut collection = Collection::new("test_r_c_cfapgd_1", None)?; collection .upsert_documents(generate_dummy_documents(10), None) @@ -1439,7 +1439,7 @@ mod tests { #[tokio::test] async fn can_filter_and_delete_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let mut collection = Collection::new("test_r_c_cfadd_1", None); + let mut collection = Collection::new("test_r_c_cfadd_1", None)?; collection .upsert_documents(generate_dummy_documents(10), None) .await?; @@ -1483,7 +1483,7 @@ mod tests { #[tokio::test] async fn can_order_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let mut collection = Collection::new("test_r_c_cod_1", None); + let mut collection = Collection::new("test_r_c_cod_1", None)?; collection .upsert_documents( vec![ @@ -1563,7 +1563,7 @@ mod tests { #[tokio::test] async fn can_update_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let mut collection = Collection::new("test_r_c_cud_5", None); + let mut collection = 
Collection::new("test_r_c_cud_5", None)?; collection .upsert_documents( vec![ @@ -1628,7 +1628,7 @@ mod tests { #[tokio::test] async fn can_merge_metadata() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let mut collection = Collection::new("test_r_c_cmm_5", None); + let mut collection = Collection::new("test_r_c_cmm_5", None)?; collection .upsert_documents( vec![ @@ -1759,7 +1759,7 @@ mod tests { .into(), ), )?; - let mut collection = Collection::new("test_r_c_ged_2", None); + let mut collection = Collection::new("test_r_c_ged_2", None)?; collection.add_pipeline(&mut pipeline).await?; let diagram = collection.generate_er_diagram(&mut pipeline).await?; assert!(!diagram.is_empty()); From 039c9ccd200afe05cf2d7a2d57a146bfb83dfa1f Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Fri, 16 Feb 2024 15:57:36 -0800 Subject: [PATCH 56/72] Cleaned up tests --- .../javascript/tests/typescript-tests/test.ts | 26 --------------- pgml-sdks/pgml/python/tests/test.py | 33 ------------------- 2 files changed, 59 deletions(-) diff --git a/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts b/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts index 951946c38..9fa4e4954 100644 --- a/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts +++ b/pgml-sdks/pgml/javascript/tests/typescript-tests/test.ts @@ -163,32 +163,6 @@ it("can vector search with query builder", async () => { await collection.archive(); }); -/////////////////////////////////////////////////// -// Test user output facing functions ////////////// -/////////////////////////////////////////////////// - -it("pipeline to dict", async () => { - const pipeline_schema = { - "title": { - "semantic_search": { "model": "intfloat/e5-small" }, - "full_text_search": { "configuration": "english" }, - }, - "body": { - "splitter": { "model": "recursive_character" }, - "semantic_search": { - "model": "text-embedding-ada-002", - "source": "openai", - }, - }, - } - let pipeline = pgml.newPipeline("test_j_p_ptd_0", pipeline_schema); - let collection = pgml.newCollection("test_j_c_ptd_2"); - await collection.add_pipeline(pipeline); - let pipeline_dict = await pipeline.to_dict(); - expect(pipeline_dict).toEqual(pipeline_schema); - await collection.archive(); -}); - /////////////////////////////////////////////////// // Test document related functions //////////////// /////////////////////////////////////////////////// diff --git a/pgml-sdks/pgml/python/tests/test.py b/pgml-sdks/pgml/python/tests/test.py index 874efc4cb..e4186d4d3 100644 --- a/pgml-sdks/pgml/python/tests/test.py +++ b/pgml-sdks/pgml/python/tests/test.py @@ -14,11 +14,6 @@ #################################################################################### #################################################################################### -DATABASE_URL = os.environ.get("DATABASE_URL") -if DATABASE_URL is None: - print("No DATABASE_URL environment variable found. 
Please set one") - exit(1) - pgml.init_logger() @@ -181,34 +176,6 @@ async def test_can_vector_search_with_query_builder(): await collection.archive() -################################################### -## Test user output facing functions ############## -################################################### - - -@pytest.mark.asyncio -async def test_pipeline_to_dict(): - pipeline_schema = { - "title": { - "semantic_search": {"model": "intfloat/e5-small"}, - "full_text_search": {"configuration": "english"}, - }, - "body": { - "splitter": {"model": "recursive_character"}, - "semantic_search": { - "model": "text-embedding-ada-002", - "source": "openai", - }, - }, - } - pipeline = pgml.Pipeline("test_p_p_tptd_0", pipeline_schema) - collection = pgml.Collection("test_p_c_tptd_3") - await collection.add_pipeline(pipeline) - pipeline_dict = await pipeline.to_dict() - assert pipeline_schema == pipeline_dict - await collection.archive() - - ################################################### ## Test document related functions ################ ################################################### From bd983cfc78a345ed2f85642eecc1669fd5aca8e2 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Mon, 26 Feb 2024 09:08:56 -0800 Subject: [PATCH 57/72] Add migration error --- pgml-sdks/pgml/src/migrations/mod.rs | 9 +++++++-- pgml-sdks/pgml/src/migrations/pgml--0.9.2--1.0.0.rs | 9 +++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) create mode 100644 pgml-sdks/pgml/src/migrations/pgml--0.9.2--1.0.0.rs diff --git a/pgml-sdks/pgml/src/migrations/mod.rs b/pgml-sdks/pgml/src/migrations/mod.rs index b67dec8fa..6133ff1fc 100644 --- a/pgml-sdks/pgml/src/migrations/mod.rs +++ b/pgml-sdks/pgml/src/migrations/mod.rs @@ -8,6 +8,9 @@ use crate::get_or_initialize_pool; #[path = "pgml--0.9.1--0.9.2.rs"] mod pgml091_092; +#[path = "pgml--0.9.2--1.0.0.rs"] +mod pgml092_100; + // There is probably a better way to write this type and the version_migrations variable in the dispatch_migrations function type MigrateFn = Box) -> BoxFuture<'static, anyhow::Result> + Send + Sync>; @@ -48,8 +51,10 @@ pub fn migrate() -> BoxFuture<'static, anyhow::Result<()>> { async fn dispatch_migrations(pool: PgPool, collections: Vec<(String, i64)>) -> anyhow::Result<()> { // The version of the SDK that the migration was written for, and the migration function - let version_migrations: [(&'static str, MigrateFn); 1] = - [("0.9.1", Box::new(|p, c| pgml091_092::migrate(p, c).boxed()))]; + let version_migrations: [(&'static str, MigrateFn); 2] = [ + ("0.9.1", Box::new(|p, c| pgml091_092::migrate(p, c).boxed())), + ("0.9.2", Box::new(|p, c| pgml092_100::migrate(p, c).boxed())), + ]; let mut collections = collections.into_iter().into_group_map(); for (version, migration) in version_migrations.into_iter() { diff --git a/pgml-sdks/pgml/src/migrations/pgml--0.9.2--1.0.0.rs b/pgml-sdks/pgml/src/migrations/pgml--0.9.2--1.0.0.rs new file mode 100644 index 000000000..322bec637 --- /dev/null +++ b/pgml-sdks/pgml/src/migrations/pgml--0.9.2--1.0.0.rs @@ -0,0 +1,9 @@ +use sqlx::PgPool; +use tracing::instrument; + +#[instrument(skip(_pool))] +pub async fn migrate(_pool: PgPool, _: Vec) -> anyhow::Result { + anyhow::bail!( + "There is no automatic migration to SDK version 1.0. 
Please just upgrade the SDK and create a new collection", + ) +} From 4fb0149160929caf8174f0a2301fa904baa0058e Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Mon, 26 Feb 2024 11:46:34 -0800 Subject: [PATCH 58/72] Updated text --- pgml-sdks/pgml/src/migrations/pgml--0.9.2--1.0.0.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgml-sdks/pgml/src/migrations/pgml--0.9.2--1.0.0.rs b/pgml-sdks/pgml/src/migrations/pgml--0.9.2--1.0.0.rs index 322bec637..29e4f559a 100644 --- a/pgml-sdks/pgml/src/migrations/pgml--0.9.2--1.0.0.rs +++ b/pgml-sdks/pgml/src/migrations/pgml--0.9.2--1.0.0.rs @@ -4,6 +4,6 @@ use tracing::instrument; #[instrument(skip(_pool))] pub async fn migrate(_pool: PgPool, _: Vec) -> anyhow::Result { anyhow::bail!( - "There is no automatic migration to SDK version 1.0. Please just upgrade the SDK and create a new collection", + "There is no automatic migration to SDK version 1.0. Please upgrade the SDK and create a new collection, or contact your PostgresML support to create a migration plan.", ) } From b4f1edd8c55b67815a3ab4316dbca1a55475510d Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Mon, 26 Feb 2024 12:35:38 -0800 Subject: [PATCH 59/72] Add dockerfile to build javascript --- pgml-sdks/pgml/javascript/Dockerfile | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 pgml-sdks/pgml/javascript/Dockerfile diff --git a/pgml-sdks/pgml/javascript/Dockerfile b/pgml-sdks/pgml/javascript/Dockerfile new file mode 100644 index 000000000..b48446359 --- /dev/null +++ b/pgml-sdks/pgml/javascript/Dockerfile @@ -0,0 +1,19 @@ +FROM quay.io/pypa/manylinux2014_x86_64 + +# Install node and npm +RUN yum install -y nodejs +RUN yum install -y npm + +# Gives build errors if we don't have this +RUN yum install -y perl-IPC-Cmd + +# Create a new user. 
We need this or we run as root and this will cause permission issues +RUN groupadd --g 1000 groupcontainer +RUN useradd -u 1000 -G groupcontainer -m containeruser +USER containeruser + +# Install cargo +RUN curl --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + +# Add cargo to path +ENV PATH /root/.cargo/bin:$PATH From c41597a83e9913ddcdffe543d04f1ad6095ce960 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Mon, 26 Feb 2024 12:57:24 -0800 Subject: [PATCH 60/72] Working dockerfile for build --- pgml-sdks/pgml/javascript/Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pgml-sdks/pgml/javascript/Dockerfile b/pgml-sdks/pgml/javascript/Dockerfile index b48446359..31155379a 100644 --- a/pgml-sdks/pgml/javascript/Dockerfile +++ b/pgml-sdks/pgml/javascript/Dockerfile @@ -16,4 +16,6 @@ USER containeruser RUN curl --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y # Add cargo to path -ENV PATH /root/.cargo/bin:$PATH +ENV PATH /home/containeruser/.cargo/bin:$PATH + +ENTRYPOINT ["npm", "run"] From 3f53e9cbdd0c3eb1f875514bcb5bde8ab230824c Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Mon, 26 Feb 2024 13:26:07 -0800 Subject: [PATCH 61/72] Test github docker build --- .github/workflows/build-javascript-sdk.yml | 8 + .github/workflows/javascript-sdk.yml | 163 +++++++++++---------- pgml-sdks/pgml/javascript/Dockerfile | 12 +- 3 files changed, 97 insertions(+), 86 deletions(-) create mode 100644 .github/workflows/build-javascript-sdk.yml diff --git a/.github/workflows/build-javascript-sdk.yml b/.github/workflows/build-javascript-sdk.yml new file mode 100644 index 000000000..96a27c19a --- /dev/null +++ b/.github/workflows/build-javascript-sdk.yml @@ -0,0 +1,8 @@ +# action.yml +name: 'Build JavaScript SDK' +description: 'Builds the JavaScript SDK in a Docker Container' +runs: + using: 'docker' + image: './pgml-sdks/pgml/javascript/Dockerfile' + args: + - cd ./pgml-sdks/pgml/javascript && npm i && npm run build-release diff --git a/.github/workflows/javascript-sdk.yml b/.github/workflows/javascript-sdk.yml index 8e929976e..115154147 100644 --- a/.github/workflows/javascript-sdk.yml +++ b/.github/workflows/javascript-sdk.yml @@ -2,94 +2,97 @@ name: deploy javascript sdk on: workflow_dispatch: jobs: - build-javascript-sdk-macos-windows: - strategy: - matrix: - os: - [ - "macos-latest", - "windows-latest", - ] - include: - - neon-out-name: "x86_64-unknown-linux-gnu-index.node" - os: "ubuntu-22.04" - - neon-out-name: "aarch64-unknown-linux-gnu-index.node" - os: "buildjet-4vcpu-ubuntu-2204-arm" - - neon-out-name: "x86_64-apple-darwin-index.node" - os: "macos-latest" - - neon-out-name: "x86_64-pc-windows-gnu-index.node" - os: "windows-latest" - runs-on: ${{ matrix.os }} - defaults: - run: - working-directory: pgml-sdks/pgml/javascript - steps: - - uses: actions/checkout@v3 - - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - - name: Validate cargo is working - uses: postgresml/gh-actions-cargo@master - with: - command: version - - name: Do build - env: - TYPESCRIPT_DECLARATION_FILE: "javascript/index.d.ts" - run: | - npm i - npm run build-release - - name: Upload built .node file - uses: actions/upload-artifact@v3 - with: - name: node-artifacts - path: pgml-sdks/pgml/javascript/dist/${{ matrix.neon-out-name }} - retention-days: 1 + # build-javascript-sdk-macos-windows: + # strategy: + # matrix: + # os: + # [ + # "macos-latest", + # "windows-latest", + # ] + # include: + # - 
neon-out-name: "x86_64-unknown-linux-gnu-index.node" + # os: "ubuntu-22.04" + # - neon-out-name: "aarch64-unknown-linux-gnu-index.node" + # os: "buildjet-4vcpu-ubuntu-2204-arm" + # - neon-out-name: "x86_64-apple-darwin-index.node" + # os: "macos-latest" + # - neon-out-name: "x86_64-pc-windows-gnu-index.node" + # os: "windows-latest" + # runs-on: ${{ matrix.os }} + # defaults: + # run: + # working-directory: pgml-sdks/pgml/javascript + # steps: + # - uses: actions/checkout@v3 + # - uses: actions-rs/toolchain@v1 + # with: + # toolchain: stable + # - name: Validate cargo is working + # uses: postgresml/gh-actions-cargo@master + # with: + # command: version + # - name: Do build + # env: + # TYPESCRIPT_DECLARATION_FILE: "javascript/index.d.ts" + # run: | + # npm i + # npm run build-release + # - name: Upload built .node file + # uses: actions/upload-artifact@v3 + # with: + # name: node-artifacts + # path: pgml-sdks/pgml/javascript/dist/${{ matrix.neon-out-name }} + # retention-days: 1 build-javascript-sdk-linux: - strategy: - matrix: - os: - [ - "ubuntu-22.04", - "buildjet-4vcpu-ubuntu-2204-arm", - ] - include: - - neon-out-name: "x86_64-unknown-linux-gnu-index.node" - os: "ubuntu-22.04" - - neon-out-name: "aarch64-unknown-linux-gnu-index.node" - os: "buildjet-4vcpu-ubuntu-2204-arm" + # strategy: + # matrix: + # os: + # [ + # "ubuntu-22.04", + # "buildjet-4vcpu-ubuntu-2204-arm", + # ] + # include: + # - neon-out-name: "x86_64-unknown-linux-gnu-index.node" + # os: "ubuntu-22.04" + # - neon-out-name: "aarch64-unknown-linux-gnu-index.node" + # os: "buildjet-4vcpu-ubuntu-2204-arm" runs-on: ubuntu-latest - container: ubuntu:16.04 - defaults: - run: - working-directory: pgml-sdks/pgml/javascript + # container: ubuntu:16.04 + # defaults: + # run: + # working-directory: pgml-sdks/pgml/javascript steps: - uses: actions/checkout@v3 - - name: Install dependencies - run: | - apt update - apt-get -y install curl - apt-get -y install build-essential - - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - - name: Validate cargo is working - uses: postgresml/gh-actions-cargo@master - with: - command: version - - uses: actions/setup-node@v3 - with: - node-version: 16 - - name: Do build - env: - TYPESCRIPT_DECLARATION_FILE: "javascript/index.d.ts" - run: | - npm i - npm run build-release + # - name: Install dependencies + # run: | + # apt update + # apt-get -y install curl + # apt-get -y install build-essential + # - uses: actions-rs/toolchain@v1 + # with: + # toolchain: stable + # - name: Validate cargo is working + # uses: postgresml/gh-actions-cargo@master + # with: + # command: version + # - uses: actions/setup-node@v3 + # with: + # node-version: 16 + # - name: Do build + # env: + # TYPESCRIPT_DECLARATION_FILE: "javascript/index.d.ts" + # run: | + # npm i + # npm run build-release + - name: Build + uses: ./.github/workflows/build-javascript-sdk.yml - name: Upload built .node file uses: actions/upload-artifact@v3 with: name: node-artifacts - path: pgml-sdks/pgml/javascript/dist/${{ matrix.neon-out-name }} + # path: ${{ github.workspace }}/pgml-sdks/pgml/javascript/dist/${{ matrix.neon-out-name }} + path: ${{ github.workspace }}/pgml-sdks/pgml/javascript/dist/x86_64-unknown-linux-gnu-index.node retention-days: 1 # publish-javascript-sdk: # needs: build-javascript-sdk diff --git a/pgml-sdks/pgml/javascript/Dockerfile b/pgml-sdks/pgml/javascript/Dockerfile index 31155379a..8fade367c 100644 --- a/pgml-sdks/pgml/javascript/Dockerfile +++ b/pgml-sdks/pgml/javascript/Dockerfile @@ -7,15 +7,15 @@ RUN yum 
install -y npm # Gives build errors if we don't have this RUN yum install -y perl-IPC-Cmd +# Only need this when building locally # Create a new user. We need this or we run as root and this will cause permission issues -RUN groupadd --g 1000 groupcontainer -RUN useradd -u 1000 -G groupcontainer -m containeruser -USER containeruser +# RUN groupadd --g 1000 groupcontainer +# RUN useradd -u 1000 -G groupcontainer -m containeruser +# USER containeruser # Install cargo RUN curl --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y # Add cargo to path -ENV PATH /home/containeruser/.cargo/bin:$PATH - -ENTRYPOINT ["npm", "run"] +# ENV PATH /home/containeruser/.cargo/bin:$PATH +ENV PATH /root/.cargo/bin:$PATH From 679b995faa18e1939f83b4bb2a389271490f74cb Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Mon, 26 Feb 2024 13:33:18 -0800 Subject: [PATCH 62/72] Iterating on gh action --- .../action.yml} | 1 - .github/workflows/javascript-sdk.yml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) rename .github/workflows/{build-javascript-sdk.yml => build-javascript-sdk/action.yml} (94%) diff --git a/.github/workflows/build-javascript-sdk.yml b/.github/workflows/build-javascript-sdk/action.yml similarity index 94% rename from .github/workflows/build-javascript-sdk.yml rename to .github/workflows/build-javascript-sdk/action.yml index 96a27c19a..7ecf466dc 100644 --- a/.github/workflows/build-javascript-sdk.yml +++ b/.github/workflows/build-javascript-sdk/action.yml @@ -1,4 +1,3 @@ -# action.yml name: 'Build JavaScript SDK' description: 'Builds the JavaScript SDK in a Docker Container' runs: diff --git a/.github/workflows/javascript-sdk.yml b/.github/workflows/javascript-sdk.yml index 115154147..6076dbee4 100644 --- a/.github/workflows/javascript-sdk.yml +++ b/.github/workflows/javascript-sdk.yml @@ -86,7 +86,7 @@ jobs: # npm i # npm run build-release - name: Build - uses: ./.github/workflows/build-javascript-sdk.yml + uses: ./.github/workflows/build-javascript-sdk - name: Upload built .node file uses: actions/upload-artifact@v3 with: From c614e4e40ef087746b4b5b84bbcbca0c25230ab8 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Mon, 26 Feb 2024 13:38:28 -0800 Subject: [PATCH 63/72] Iterating on gh action --- .../workflows/build-javascript-sdk}/Dockerfile | 0 .github/workflows/build-javascript-sdk/action.yml | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename {pgml-sdks/pgml/javascript => .github/workflows/build-javascript-sdk}/Dockerfile (100%) diff --git a/pgml-sdks/pgml/javascript/Dockerfile b/.github/workflows/build-javascript-sdk/Dockerfile similarity index 100% rename from pgml-sdks/pgml/javascript/Dockerfile rename to .github/workflows/build-javascript-sdk/Dockerfile diff --git a/.github/workflows/build-javascript-sdk/action.yml b/.github/workflows/build-javascript-sdk/action.yml index 7ecf466dc..76c5c681f 100644 --- a/.github/workflows/build-javascript-sdk/action.yml +++ b/.github/workflows/build-javascript-sdk/action.yml @@ -2,6 +2,6 @@ name: 'Build JavaScript SDK' description: 'Builds the JavaScript SDK in a Docker Container' runs: using: 'docker' - image: './pgml-sdks/pgml/javascript/Dockerfile' + image: 'Dockerfile' args: - cd ./pgml-sdks/pgml/javascript && npm i && npm run build-release From 71695968e607b65ca93810aba5fd2ea6794b58e1 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Mon, 26 Feb 2024 13:47:51 -0800 Subject: [PATCH 64/72] Iterating 
on gh action --- .github/workflows/javascript-sdk.yml | 57 ++++++++++++++-------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/.github/workflows/javascript-sdk.yml b/.github/workflows/javascript-sdk.yml index 6076dbee4..33aa6fa77 100644 --- a/.github/workflows/javascript-sdk.yml +++ b/.github/workflows/javascript-sdk.yml @@ -59,40 +59,41 @@ jobs: # os: "buildjet-4vcpu-ubuntu-2204-arm" runs-on: ubuntu-latest # container: ubuntu:16.04 - # defaults: - # run: - # working-directory: pgml-sdks/pgml/javascript + container: quay.io/pypa/manylinux2014_x86_64 + defaults: + run: + working-directory: pgml-sdks/pgml/javascript steps: - uses: actions/checkout@v3 - # - name: Install dependencies - # run: | - # apt update - # apt-get -y install curl - # apt-get -y install build-essential - # - uses: actions-rs/toolchain@v1 - # with: - # toolchain: stable - # - name: Validate cargo is working - # uses: postgresml/gh-actions-cargo@master - # with: - # command: version - # - uses: actions/setup-node@v3 - # with: - # node-version: 16 - # - name: Do build - # env: - # TYPESCRIPT_DECLARATION_FILE: "javascript/index.d.ts" - # run: | - # npm i - # npm run build-release - - name: Build - uses: ./.github/workflows/build-javascript-sdk + - name: Install dependencies + run: | + apt update + apt-get -y install curl + apt-get -y install build-essential + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + - name: Validate cargo is working + uses: postgresml/gh-actions-cargo@master + with: + command: version + - uses: actions/setup-node@v3 + with: + node-version: 16 + - name: Do build + env: + TYPESCRIPT_DECLARATION_FILE: "javascript/index.d.ts" + run: | + npm i + npm run build-release + # - name: Build + # uses: ./.github/workflows/build-javascript-sdk - name: Upload built .node file uses: actions/upload-artifact@v3 with: name: node-artifacts - # path: ${{ github.workspace }}/pgml-sdks/pgml/javascript/dist/${{ matrix.neon-out-name }} - path: ${{ github.workspace }}/pgml-sdks/pgml/javascript/dist/x86_64-unknown-linux-gnu-index.node + path: ${{ github.workspace }}/pgml-sdks/pgml/javascript/dist/${{ matrix.neon-out-name }} + # path: ${{ github.workspace }}/pgml-sdks/pgml/javascript/dist/x86_64-unknown-linux-gnu-index.node retention-days: 1 # publish-javascript-sdk: # needs: build-javascript-sdk From 8de7727e2f5433dfa1dcf6cf9211ca19035a9c02 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Mon, 26 Feb 2024 13:50:16 -0800 Subject: [PATCH 65/72] Iterating on gh action --- .github/workflows/javascript-sdk.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/javascript-sdk.yml b/.github/workflows/javascript-sdk.yml index 33aa6fa77..759283e0f 100644 --- a/.github/workflows/javascript-sdk.yml +++ b/.github/workflows/javascript-sdk.yml @@ -67,9 +67,10 @@ jobs: - uses: actions/checkout@v3 - name: Install dependencies run: | - apt update - apt-get -y install curl - apt-get -y install build-essential + yum install -y perl-IPC-Cmd + # apt update + # apt-get -y install curl + # apt-get -y install build-essential - uses: actions-rs/toolchain@v1 with: toolchain: stable From 25fe41c11f9f611174f86a5493d7d299795ca948 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Mon, 26 Feb 2024 13:57:40 -0800 Subject: [PATCH 66/72] Iterating on gh action --- .../workflows/build-javascript-sdk/Dockerfile | 21 ---- .../workflows/build-javascript-sdk/action.yml | 7 -- 
.github/workflows/javascript-sdk.yml | 117 ++++++++---------- 3 files changed, 55 insertions(+), 90 deletions(-) delete mode 100644 .github/workflows/build-javascript-sdk/Dockerfile delete mode 100644 .github/workflows/build-javascript-sdk/action.yml diff --git a/.github/workflows/build-javascript-sdk/Dockerfile b/.github/workflows/build-javascript-sdk/Dockerfile deleted file mode 100644 index 8fade367c..000000000 --- a/.github/workflows/build-javascript-sdk/Dockerfile +++ /dev/null @@ -1,21 +0,0 @@ -FROM quay.io/pypa/manylinux2014_x86_64 - -# Install node and npm -RUN yum install -y nodejs -RUN yum install -y npm - -# Gives build errors if we don't have this -RUN yum install -y perl-IPC-Cmd - -# Only need this when building locally -# Create a new user. We need this or we run as root and this will cause permission issues -# RUN groupadd --g 1000 groupcontainer -# RUN useradd -u 1000 -G groupcontainer -m containeruser -# USER containeruser - -# Install cargo -RUN curl --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y - -# Add cargo to path -# ENV PATH /home/containeruser/.cargo/bin:$PATH -ENV PATH /root/.cargo/bin:$PATH diff --git a/.github/workflows/build-javascript-sdk/action.yml b/.github/workflows/build-javascript-sdk/action.yml deleted file mode 100644 index 76c5c681f..000000000 --- a/.github/workflows/build-javascript-sdk/action.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: 'Build JavaScript SDK' -description: 'Builds the JavaScript SDK in a Docker Container' -runs: - using: 'docker' - image: 'Dockerfile' - args: - - cd ./pgml-sdks/pgml/javascript && npm i && npm run build-release diff --git a/.github/workflows/javascript-sdk.yml b/.github/workflows/javascript-sdk.yml index 759283e0f..63d84e418 100644 --- a/.github/workflows/javascript-sdk.yml +++ b/.github/workflows/javascript-sdk.yml @@ -2,63 +2,62 @@ name: deploy javascript sdk on: workflow_dispatch: jobs: - # build-javascript-sdk-macos-windows: - # strategy: - # matrix: - # os: - # [ - # "macos-latest", - # "windows-latest", - # ] - # include: - # - neon-out-name: "x86_64-unknown-linux-gnu-index.node" - # os: "ubuntu-22.04" - # - neon-out-name: "aarch64-unknown-linux-gnu-index.node" - # os: "buildjet-4vcpu-ubuntu-2204-arm" - # - neon-out-name: "x86_64-apple-darwin-index.node" - # os: "macos-latest" - # - neon-out-name: "x86_64-pc-windows-gnu-index.node" - # os: "windows-latest" - # runs-on: ${{ matrix.os }} - # defaults: - # run: - # working-directory: pgml-sdks/pgml/javascript - # steps: - # - uses: actions/checkout@v3 - # - uses: actions-rs/toolchain@v1 - # with: - # toolchain: stable - # - name: Validate cargo is working - # uses: postgresml/gh-actions-cargo@master - # with: - # command: version - # - name: Do build - # env: - # TYPESCRIPT_DECLARATION_FILE: "javascript/index.d.ts" - # run: | - # npm i - # npm run build-release - # - name: Upload built .node file - # uses: actions/upload-artifact@v3 - # with: - # name: node-artifacts - # path: pgml-sdks/pgml/javascript/dist/${{ matrix.neon-out-name }} - # retention-days: 1 + build-javascript-sdk-macos-windows: + strategy: + matrix: + os: + [ + "macos-latest", + "windows-latest", + ] + include: + - neon-out-name: "x86_64-unknown-linux-gnu-index.node" + os: "ubuntu-22.04" + - neon-out-name: "aarch64-unknown-linux-gnu-index.node" + os: "buildjet-4vcpu-ubuntu-2204-arm" + - neon-out-name: "x86_64-apple-darwin-index.node" + os: "macos-latest" + - neon-out-name: "x86_64-pc-windows-gnu-index.node" + os: "windows-latest" + runs-on: ${{ matrix.os }} + defaults: + run: + working-directory: 
pgml-sdks/pgml/javascript + steps: + - uses: actions/checkout@v3 + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + - name: Validate cargo is working + uses: postgresml/gh-actions-cargo@master + with: + command: version + - name: Do build + env: + TYPESCRIPT_DECLARATION_FILE: "javascript/index.d.ts" + run: | + npm i + npm run build-release + - name: Upload built .node file + uses: actions/upload-artifact@v3 + with: + name: node-artifacts + path: pgml-sdks/pgml/javascript/dist/${{ matrix.neon-out-name }} + retention-days: 1 build-javascript-sdk-linux: - # strategy: - # matrix: - # os: - # [ - # "ubuntu-22.04", - # "buildjet-4vcpu-ubuntu-2204-arm", - # ] - # include: - # - neon-out-name: "x86_64-unknown-linux-gnu-index.node" - # os: "ubuntu-22.04" - # - neon-out-name: "aarch64-unknown-linux-gnu-index.node" - # os: "buildjet-4vcpu-ubuntu-2204-arm" + strategy: + matrix: + os: + [ + "ubuntu-22.04", + "buildjet-4vcpu-ubuntu-2204-arm", + ] + include: + - neon-out-name: "x86_64-unknown-linux-gnu-index.node" + os: "ubuntu-22.04" + - neon-out-name: "aarch64-unknown-linux-gnu-index.node" + os: "buildjet-4vcpu-ubuntu-2204-arm" runs-on: ubuntu-latest - # container: ubuntu:16.04 container: quay.io/pypa/manylinux2014_x86_64 defaults: run: @@ -68,9 +67,6 @@ jobs: - name: Install dependencies run: | yum install -y perl-IPC-Cmd - # apt update - # apt-get -y install curl - # apt-get -y install build-essential - uses: actions-rs/toolchain@v1 with: toolchain: stable @@ -87,14 +83,11 @@ jobs: run: | npm i npm run build-release - # - name: Build - # uses: ./.github/workflows/build-javascript-sdk - name: Upload built .node file uses: actions/upload-artifact@v3 with: name: node-artifacts - path: ${{ github.workspace }}/pgml-sdks/pgml/javascript/dist/${{ matrix.neon-out-name }} - # path: ${{ github.workspace }}/pgml-sdks/pgml/javascript/dist/x86_64-unknown-linux-gnu-index.node + path: pgml-sdks/pgml/javascript/dist/${{ matrix.neon-out-name }} retention-days: 1 # publish-javascript-sdk: # needs: build-javascript-sdk From 271e1e4aa6e44b3b49b42385622fa3e0bf437ae5 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Mon, 26 Feb 2024 15:05:00 -0800 Subject: [PATCH 67/72] Updated collection test --- pgml-sdks/pgml/src/collection.rs | 10 ++++++---- pgml-sdks/pgml/src/lib.rs | 12 ++++++++---- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index 916374df3..b8b92bfc3 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -630,10 +630,12 @@ impl Collection { }) .map(|(document_id, _, _)| *document_id) .collect(); - pipeline - .sync_documents(ids_to_run_on, project_info, &mut transaction) - .await - .expect("Failed to execute pipeline"); + if !ids_to_run_on.is_empty() { + pipeline + .sync_documents(ids_to_run_on, project_info, &mut transaction) + .await + .expect("Failed to execute pipeline"); + } } transaction.commit().await?; diff --git a/pgml-sdks/pgml/src/lib.rs b/pgml-sdks/pgml/src/lib.rs index 29a6d8251..50665ed93 100644 --- a/pgml-sdks/pgml/src/lib.rs +++ b/pgml-sdks/pgml/src/lib.rs @@ -322,7 +322,7 @@ mod tests { #[tokio::test] async fn can_add_pipeline_and_upsert_documents() -> anyhow::Result<()> { internal_init_logger(None, None).ok(); - let collection_name = "test_r_c_capaud_106"; + let collection_name = "test_r_c_capaud_107"; let pipeline_name = "test_r_p_capaud_6"; let mut pipeline = Pipeline::new( pipeline_name, @@ -335,7 +335,11 @@ mod tests 
{ }, "body": { "splitter": { - "model": "recursive_character" + "model": "recursive_character", + "parameters": { + "chunk_size": 1000, + "chunk_overlap": 40 + } }, "semantic_search": { "model": "hkunlp/instructor-base", @@ -376,13 +380,13 @@ mod tests { sqlx::query_as(&query_builder!("SELECT * FROM %s", chunks_table)) .fetch_all(&pool) .await?; - assert!(body_chunks.len() == 4); + assert!(body_chunks.len() == 12); let tsvectors_table = format!("{}_{}.body_tsvectors", collection_name, pipeline_name); let tsvectors: Vec = sqlx::query_as(&query_builder!("SELECT * FROM %s", tsvectors_table)) .fetch_all(&pool) .await?; - assert!(tsvectors.len() == 4); + assert!(tsvectors.len() == 12); collection.archive().await?; Ok(()) } From 9e4c2a1b9b3d96ab7dd576a29c945afb414e2d35 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Tue, 27 Feb 2024 10:22:36 -0800 Subject: [PATCH 68/72] Finished boosting and working with the new sdk --- pgml-dashboard/Cargo.lock | 286 ++++----------------------- pgml-dashboard/src/api/chatbot.rs | 6 +- pgml-dashboard/src/main.rs | 7 +- pgml-dashboard/src/utils/markdown.rs | 7 +- 4 files changed, 48 insertions(+), 258 deletions(-) diff --git a/pgml-dashboard/Cargo.lock b/pgml-dashboard/Cargo.lock index f633d6673..6d9483caf 100644 --- a/pgml-dashboard/Cargo.lock +++ b/pgml-dashboard/Cargo.lock @@ -212,15 +212,6 @@ dependencies = [ "syn 2.0.32", ] -[[package]] -name = "atoi" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7c57d12312ff59c811c0643f4d80830505833c9ffaebd193d819392b265be8e" -dependencies = [ - "num-traits", -] - [[package]] name = "atoi" version = "2.0.0" @@ -757,7 +748,7 @@ dependencies = [ "crossterm_winapi", "libc", "mio", - "parking_lot 0.12.1", + "parking_lot", "signal-hook", "signal-hook-mio", "winapi", @@ -989,26 +980,6 @@ dependencies = [ "subtle", ] -[[package]] -name = "dirs" -version = "4.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3aa72a6f96ea37bbc5aa912f6788242832f75369bdfdadcb0e38423f100059" -dependencies = [ - "dirs-sys", -] - -[[package]] -name = "dirs-sys" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6" -dependencies = [ - "libc", - "redox_users", - "winapi", -] - [[package]] name = "dotenv" version = "0.15.0" @@ -1345,17 +1316,6 @@ dependencies = [ "futures-util", ] -[[package]] -name = "futures-intrusive" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a604f7a68fbf8103337523b1fadc8ade7361ee3f112f7c680ad179651616aed5" -dependencies = [ - "futures-core", - "lock_api", - "parking_lot 0.11.2", -] - [[package]] name = "futures-intrusive" version = "0.5.0" @@ -1364,7 +1324,7 @@ checksum = "1d930c203dd0b6ff06e0201a4a2fe9149b43c684fd4420555b26d21b1a02956f" dependencies = [ "futures-core", "lock_api", - "parking_lot 0.12.1", + "parking_lot", ] [[package]] @@ -2515,17 +2475,6 @@ dependencies = [ "stable_deref_trait", ] -[[package]] -name = "parking_lot" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" -dependencies = [ - "instant", - "lock_api", - "parking_lot_core 0.8.6", -] - [[package]] name = "parking_lot" version = "0.12.1" @@ -2533,21 +2482,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", - "parking_lot_core 0.9.8", -] - -[[package]] -name = "parking_lot_core" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc" -dependencies = [ - "cfg-if", - "instant", - "libc", - "redox_syscall 0.2.16", - "smallvec", - "winapi", + "parking_lot_core", ] [[package]] @@ -2609,7 +2544,7 @@ checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" [[package]] name = "pgml" -version = "0.10.1" +version = "1.0.0" dependencies = [ "anyhow", "async-trait", @@ -2624,7 +2559,7 @@ dependencies = [ "itertools", "lopdf", "md5", - "parking_lot 0.12.1", + "parking_lot", "regex", "reqwest", "rust_bridge", @@ -2632,7 +2567,7 @@ dependencies = [ "sea-query-binder", "serde", "serde_json", - "sqlx 0.6.3", + "sqlx", "tokio", "tracing", "tracing-subscriber", @@ -2669,7 +2604,7 @@ dependencies = [ "markdown", "num-traits", "once_cell", - "parking_lot 0.12.1", + "parking_lot", "pgml", "pgml-components", "pgvector", @@ -2685,7 +2620,7 @@ dependencies = [ "sentry-log", "serde", "serde_json", - "sqlx 0.7.3", + "sqlx", "tantivy", "time", "tokio", @@ -2702,7 +2637,7 @@ checksum = "a1f4c0c07ceb64a0020f2f0e610cfe51122d2e72723499f0154877b7c76c8c31" dependencies = [ "bytes", "postgres", - "sqlx 0.7.3", + "sqlx", ] [[package]] @@ -3079,17 +3014,6 @@ dependencies = [ "bitflags 1.3.2", ] -[[package]] -name = "redox_users" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" -dependencies = [ - "getrandom", - "redox_syscall 0.2.16", - "thiserror", -] - [[package]] name = "ref-cast" version = "1.0.18" @@ -3239,7 +3163,7 @@ dependencies = [ "memchr", "multer", "num_cpus", - "parking_lot 0.12.1", + "parking_lot", "pin-project-lite", "rand", "ref-cast", @@ -3412,18 +3336,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "rustls" -version = "0.20.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fff78fc74d175294f4e83b28343315ffcfb114b156f0185e9741cb5570f50e2f" -dependencies = [ - "log", - "ring 0.16.20", - "sct", - "webpki", -] - [[package]] name = "rustls" version = "0.21.10" @@ -3569,14 +3481,15 @@ dependencies = [ [[package]] name = "sea-query" -version = "0.29.1" +version = "0.30.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "332375aa0c555318544beec038b285c75f2dbeecaecb844383419ccf2663868e" +checksum = "4166a1e072292d46dc91f31617c2a1cdaf55a8be4b5c9f4bf2ba248e3ac4999b" dependencies = [ "inherent", "sea-query-attr", "sea-query-derive", "serde_json", + "uuid", ] [[package]] @@ -3593,13 +3506,14 @@ dependencies = [ [[package]] name = "sea-query-binder" -version = "0.4.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "420eb97201b8a5c76351af7b4925ce5571c2ec3827063a0fb8285d239e1621a0" +checksum = "36bbb68df92e820e4d5aeb17b4acd5cc8b5d18b2c36a4dd6f4626aabfa7ab1b9" dependencies = [ "sea-query", "serde_json", - "sqlx 0.6.3", + "sqlx", + "uuid", ] [[package]] @@ -4031,84 +3945,19 @@ dependencies = [ "unicode_categories", ] -[[package]] -name = "sqlx" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8de3b03a925878ed54a954f621e64bf55a3c1bd29652d0d1a17830405350188" -dependencies = [ - "sqlx-core 0.6.3", - "sqlx-macros 0.6.3", -] 
- [[package]] name = "sqlx" version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dba03c279da73694ef99763320dea58b51095dfe87d001b1d4b5fe78ba8763cf" dependencies = [ - "sqlx-core 0.7.3", - "sqlx-macros 0.7.3", + "sqlx-core", + "sqlx-macros", "sqlx-mysql", "sqlx-postgres", "sqlx-sqlite", ] -[[package]] -name = "sqlx-core" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa8241483a83a3f33aa5fff7e7d9def398ff9990b2752b6c6112b83c6d246029" -dependencies = [ - "ahash 0.7.6", - "atoi 1.0.0", - "base64 0.13.1", - "bitflags 1.3.2", - "byteorder", - "bytes", - "crc", - "crossbeam-queue", - "dirs", - "dotenvy", - "either", - "event-listener", - "futures-channel", - "futures-core", - "futures-intrusive 0.4.2", - "futures-util", - "hashlink", - "hex", - "hkdf", - "hmac", - "indexmap 1.9.3", - "itoa", - "libc", - "log", - "md-5", - "memchr", - "once_cell", - "paste", - "percent-encoding", - "rand", - "rustls 0.20.8", - "rustls-pemfile", - "serde", - "serde_json", - "sha1", - "sha2", - "smallvec", - "sqlformat", - "sqlx-rt", - "stringprep", - "thiserror", - "time", - "tokio-stream", - "url", - "uuid", - "webpki-roots 0.22.6", - "whoami", -] - [[package]] name = "sqlx-core" version = "0.7.3" @@ -4116,7 +3965,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d84b0a3c3739e220d94b3239fd69fb1f74bc36e16643423bd99de3b43c21bfbd" dependencies = [ "ahash 0.8.7", - "atoi 2.0.0", + "atoi", "bigdecimal", "byteorder", "bytes", @@ -4127,7 +3976,7 @@ dependencies = [ "event-listener", "futures-channel", "futures-core", - "futures-intrusive 0.5.0", + "futures-intrusive", "futures-io", "futures-util", "hashlink", @@ -4138,7 +3987,7 @@ dependencies = [ "once_cell", "paste", "percent-encoding", - "rustls 0.21.10", + "rustls", "rustls-pemfile", "serde", "serde_json", @@ -4152,27 +4001,7 @@ dependencies = [ "tracing", "url", "uuid", - "webpki-roots 0.25.4", -] - -[[package]] -name = "sqlx-macros" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9966e64ae989e7e575b19d7265cb79d7fc3cbbdf179835cb0d716f294c2049c9" -dependencies = [ - "dotenvy", - "either", - "heck", - "once_cell", - "proc-macro2", - "quote", - "serde_json", - "sha2", - "sqlx-core 0.6.3", - "sqlx-rt", - "syn 1.0.109", - "url", + "webpki-roots", ] [[package]] @@ -4183,7 +4012,7 @@ checksum = "89961c00dc4d7dffb7aee214964b065072bff69e36ddb9e2c107541f75e4f2a5" dependencies = [ "proc-macro2", "quote", - "sqlx-core 0.7.3", + "sqlx-core", "sqlx-macros-core", "syn 1.0.109", ] @@ -4205,7 +4034,7 @@ dependencies = [ "serde", "serde_json", "sha2", - "sqlx-core 0.7.3", + "sqlx-core", "sqlx-mysql", "sqlx-postgres", "sqlx-sqlite", @@ -4221,7 +4050,7 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e37195395df71fd068f6e2082247891bc11e3289624bbc776a0cdfa1ca7f1ea4" dependencies = [ - "atoi 2.0.0", + "atoi", "base64 0.21.4", "bigdecimal", "bitflags 2.3.3", @@ -4251,7 +4080,7 @@ dependencies = [ "sha1", "sha2", "smallvec", - "sqlx-core 0.7.3", + "sqlx-core", "stringprep", "thiserror", "time", @@ -4266,7 +4095,7 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6ac0ac3b7ccd10cc96c7ab29791a7dd236bd94021f31eec7ba3d46a74aa1c24" dependencies = [ - "atoi 2.0.0", + "atoi", "base64 0.21.4", "bigdecimal", "bitflags 2.3.3", @@ -4294,7 +4123,7 @@ dependencies = [ "sha1", "sha2", "smallvec", - "sqlx-core 0.7.3", + "sqlx-core", 
"stringprep", "thiserror", "time", @@ -4303,35 +4132,24 @@ dependencies = [ "whoami", ] -[[package]] -name = "sqlx-rt" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "804d3f245f894e61b1e6263c84b23ca675d96753b5abfd5cc8597d86806e8024" -dependencies = [ - "once_cell", - "tokio", - "tokio-rustls", -] - [[package]] name = "sqlx-sqlite" version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "210976b7d948c7ba9fced8ca835b11cbb2d677c59c79de41ac0d397e14547490" dependencies = [ - "atoi 2.0.0", + "atoi", "flume", "futures-channel", "futures-core", "futures-executor", - "futures-intrusive 0.5.0", + "futures-intrusive", "futures-util", "libsqlite3-sys", "log", "percent-encoding", "serde", - "sqlx-core 0.7.3", + "sqlx-core", "time", "tracing", "url", @@ -4371,7 +4189,7 @@ checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" dependencies = [ "new_debug_unreachable", "once_cell", - "parking_lot 0.12.1", + "parking_lot", "phf_shared 0.10.0", "precomputed-hash", "serde", @@ -4714,7 +4532,7 @@ dependencies = [ "libc", "mio", "num_cpus", - "parking_lot 0.12.1", + "parking_lot", "pin-project-lite", "signal-hook-registry", "socket2 0.4.9", @@ -4767,7 +4585,7 @@ dependencies = [ "futures-channel", "futures-util", "log", - "parking_lot 0.12.1", + "parking_lot", "percent-encoding", "phf 0.11.2", "pin-project-lite", @@ -4778,17 +4596,6 @@ dependencies = [ "tokio-util", ] -[[package]] -name = "tokio-rustls" -version = "0.23.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" -dependencies = [ - "rustls 0.20.8", - "tokio", - "webpki", -] - [[package]] name = "tokio-stream" version = "0.1.14" @@ -5311,25 +5118,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "webpki" -version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f095d78192e208183081cc07bc5515ef55216397af48b873e5edcd72637fa1bd" -dependencies = [ - "ring 0.16.20", - "untrusted 0.7.1", -] - -[[package]] -name = "webpki-roots" -version = "0.22.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6c71e40d7d2c34a5106301fb632274ca37242cd0c9d3e64dbece371a40a2d87" -dependencies = [ - "webpki", -] - [[package]] name = "webpki-roots" version = "0.25.4" @@ -5347,10 +5135,6 @@ name = "whoami" version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22fc3756b8a9133049b26c7f61ab35416c130e8c09b660f5b3958b446f52cc50" -dependencies = [ - "wasm-bindgen", - "web-sys", -] [[package]] name = "winapi" diff --git a/pgml-dashboard/src/api/chatbot.rs b/pgml-dashboard/src/api/chatbot.rs index de10e9451..240debbe4 100644 --- a/pgml-dashboard/src/api/chatbot.rs +++ b/pgml-dashboard/src/api/chatbot.rs @@ -397,7 +397,7 @@ async fn do_chatbot_get_history(user: &User, limit: usize) -> anyhow::Result anyhow::Result { let collection = pgml::Collection::new( "hypercloud-site-search-c-2", - Some(std::env::var("SITE_SEARCH_DATABASE_URL")?), - ); + Some(std::env::var("SITE_SEARCH_DATABASE_URL").context("Please set the `SITE_SEARCH_DATABASE_URL` environment variable")?), + )?; let pipeline = pgml::Pipeline::new( "hypercloud-site-search-p-0", Some( @@ -1293,6 +1294,7 @@ impl SiteSearch { "full_text_search": { "title": { "query": query, + "boost": 4.0 }, "contents": { "query": query @@ -1304,6 +1306,7 @@ impl SiteSearch { "parameters": { "instruction": "Represent the 
Wikipedia question for retrieving supporting documents: " }, + "boost": 2.0 }, "contents": { "query": query, From c46957c0289ed92539e06a2bd808d67438c98b05 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Tue, 27 Feb 2024 11:00:42 -0800 Subject: [PATCH 69/72] Made document search just use semantic search and boosted title --- pgml-dashboard/src/utils/markdown.rs | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/pgml-dashboard/src/utils/markdown.rs b/pgml-dashboard/src/utils/markdown.rs index 21cabc9d0..9c2c12a76 100644 --- a/pgml-dashboard/src/utils/markdown.rs +++ b/pgml-dashboard/src/utils/markdown.rs @@ -1240,7 +1240,10 @@ impl SiteSearch { pub async fn new() -> anyhow::Result { let collection = pgml::Collection::new( "hypercloud-site-search-c-2", - Some(std::env::var("SITE_SEARCH_DATABASE_URL").context("Please set the `SITE_SEARCH_DATABASE_URL` environment variable")?), + Some( + std::env::var("SITE_SEARCH_DATABASE_URL") + .context("Please set the `SITE_SEARCH_DATABASE_URL` environment variable")?, + ), )?; let pipeline = pgml::Pipeline::new( "hypercloud-site-search-p-0", @@ -1291,22 +1294,22 @@ impl SiteSearch { pub async fn search(&self, query: &str, doc_type: Option) -> anyhow::Result> { let mut search = serde_json::json!({ "query": { - "full_text_search": { - "title": { - "query": query, - "boost": 4.0 - }, - "contents": { - "query": query - } - }, + // "full_text_search": { + // "title": { + // "query": query, + // "boost": 4.0 + // }, + // "contents": { + // "query": query + // } + // }, "semantic_search": { "title": { "query": query, "parameters": { "instruction": "Represent the Wikipedia question for retrieving supporting documents: " }, - "boost": 2.0 + "boost": 4.0 }, "contents": { "query": query, From 0d963a8418a2d82dd0adf30557383a56d5c4e1fe Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Tue, 27 Feb 2024 11:16:32 -0800 Subject: [PATCH 70/72] Updated the chatbot to use the new chat history --- pgml-dashboard/src/api/chatbot.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pgml-dashboard/src/api/chatbot.rs b/pgml-dashboard/src/api/chatbot.rs index 240debbe4..288b1df43 100644 --- a/pgml-dashboard/src/api/chatbot.rs +++ b/pgml-dashboard/src/api/chatbot.rs @@ -395,7 +395,7 @@ pub async fn chatbot_get_history(user: User) -> Json { async fn do_chatbot_get_history(user: &User, limit: usize) -> anyhow::Result> { let history_collection = Collection::new( - "ChatHistory", + "ChatHistory_0", Some(std::env::var("CHATBOT_DATABASE_URL").expect("CHATBOT_DATABASE_URL not set")), )?; let mut messages = history_collection @@ -547,7 +547,7 @@ async fn process_message( .join(""); let history_collection = Collection::new( - "ChatHistory", + "ChatHistory_0", Some(std::env::var("CHATBOT_DATABASE_URL").expect("CHATBOT_DATABASE_URL not set")), )?; let mut messages = history_collection From d9b241d6715126922506d9e0fafdb226294fc4db Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Tue, 27 Feb 2024 14:21:04 -0800 Subject: [PATCH 71/72] Small cleanups --- pgml-sdks/pgml/build.rs | 6 +- pgml-sdks/pgml/javascript/package-lock.json | 18 +- pgml-sdks/pgml/src/collection.rs | 188 +----------------- pgml-sdks/pgml/src/pipeline.rs | 15 -- .../pgml/src/vector_search_query_builder.rs | 3 - 5 files changed, 24 insertions(+), 206 deletions(-) diff --git a/pgml-sdks/pgml/build.rs b/pgml-sdks/pgml/build.rs 
index 06e66271e..7c989b3a4 100644 --- a/pgml-sdks/pgml/build.rs +++ b/pgml-sdks/pgml/build.rs @@ -4,7 +4,7 @@ use std::io::Write; const ADDITIONAL_DEFAULTS_FOR_PYTHON: &[u8] = br#" def init_logger(level: Optional[str] = "", format: Optional[str] = "") -> None -def SingleFieldPipeline(name: str, model: Optional[Model] = None, splitter: Optional[Splitter] = None, parameters: Optional[Json] = Any) -> MultiFieldPipeline +def SingleFieldPipeline(name: str, model: Optional[Model] = None, splitter: Optional[Splitter] = None, parameters: Optional[Json] = Any) -> Pipeline async def migrate() -> None Json = Any @@ -15,7 +15,7 @@ GeneralJsonAsyncIterator = Any const ADDITIONAL_DEFAULTS_FOR_JAVASCRIPT: &[u8] = br#" export function init_logger(level?: string, format?: string): void; -export function newSingleFieldPipeline(name: string, model?: Model, splitter?: Splitter, parameters?: Json): MultiFieldPipeline; +export function newSingleFieldPipeline(name: string, model?: Model, splitter?: Splitter, parameters?: Json): Pipeline; export function migrate(): Promise; export type Json = any; @@ -39,7 +39,6 @@ fn main() { remove_file(&path).ok(); let mut file = OpenOptions::new() .create(true) - .write(true) .append(true) .open(path) .unwrap(); @@ -53,7 +52,6 @@ fn main() { remove_file(&path).ok(); let mut file = OpenOptions::new() .create(true) - .write(true) .append(true) .open(path) .unwrap(); diff --git a/pgml-sdks/pgml/javascript/package-lock.json b/pgml-sdks/pgml/javascript/package-lock.json index d2c5df253..e3035d038 100644 --- a/pgml-sdks/pgml/javascript/package-lock.json +++ b/pgml-sdks/pgml/javascript/package-lock.json @@ -1,13 +1,16 @@ { "name": "pgml", - "version": "0.10.1", + "version": "1.0.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "pgml", - "version": "0.10.1", + "version": "1.0.0", "license": "MIT", + "dependencies": { + "dotenv": "^16.4.4" + }, "devDependencies": { "@types/node": "^20.3.1", "cargo-cp-artifact": "^0.1" @@ -27,6 +30,17 @@ "bin": { "cargo-cp-artifact": "bin/cargo-cp-artifact.js" } + }, + "node_modules/dotenv": { + "version": "16.4.5", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz", + "integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://dotenvx.com" + } } } } diff --git a/pgml-sdks/pgml/src/collection.rs b/pgml-sdks/pgml/src/collection.rs index b8b92bfc3..5d43c6a3d 100644 --- a/pgml-sdks/pgml/src/collection.rs +++ b/pgml-sdks/pgml/src/collection.rs @@ -129,7 +129,9 @@ pub struct Collection { exists, archive, upsert_directory, - upsert_file + upsert_file, + generate_er_diagram, + get_pipeline_status )] impl Collection { /// Creates a new [Collection] @@ -259,25 +261,6 @@ impl Collection { } /// Adds a new [Pipeline] to the [Collection] - /// - /// # Arguments - /// - /// * `pipeline` - The [Pipeline] to add. 
- /// - /// # Example - /// - /// ``` - /// use pgml::{Collection, Pipeline, Model, Splitter}; - /// - /// async fn example() -> anyhow::Result<()> { - /// let model = Model::new(None, None, None); - /// let splitter = Splitter::new(None, None); - /// let mut pipeline = Pipeline::new("my_pipeline", None, None, None); - /// let mut collection = Collection::new("my_collection", None); - /// collection.add_pipeline(&mut pipeline).await?; - /// Ok(()) - /// } - /// ``` #[instrument(skip(self))] pub async fn add_pipeline(&mut self, pipeline: &mut Pipeline) -> anyhow::Result<()> { // The flow for this function: @@ -322,23 +305,6 @@ impl Collection { } /// Removes a [Pipeline] from the [Collection] - /// - /// # Arguments - /// - /// * `pipeline` - The [Pipeline] to remove. - /// - /// # Example - /// - /// ``` - /// use pgml::{Collection, Pipeline}; - /// - /// async fn example() -> anyhow::Result<()> { - /// let mut pipeline = Pipeline::new("my_pipeline", None, None, None); - /// let mut collection = Collection::new("my_collection", None); - /// collection.remove_pipeline(&mut pipeline).await?; - /// Ok(()) - /// } - /// ``` #[instrument(skip(self))] pub async fn remove_pipeline(&mut self, pipeline: &Pipeline) -> anyhow::Result<()> { // The flow for this function: @@ -368,29 +334,12 @@ impl Collection { } /// Enables a [Pipeline] on the [Collection] - /// - /// # Arguments - /// - /// * `pipeline` - The [Pipeline] to enable - /// - /// # Example - /// - /// ``` - /// use pgml::{Collection, Pipeline}; - /// - /// async fn example() -> anyhow::Result<()> { - /// let pipeline = Pipeline::new("my_pipeline", None, None, None); - /// let collection = Collection::new("my_collection", None); - /// collection.enable_pipeline(&pipeline).await?; - /// Ok(()) - /// } - /// ``` #[instrument(skip(self))] pub async fn enable_pipeline(&mut self, pipeline: &mut Pipeline) -> anyhow::Result<()> { // The flow for this function: // 1. Set ACTIVE = TRUE for the pipeline in collection.pipelines // 2. 
Resync the pipeline - // TOOD: Review this pattern + // TODO: Review this pattern self.verify_in_database(false).await?; let project_info = &self.database_data.as_ref().unwrap().project_info; let pool = get_or_initialize_pool(&self.database_url).await?; @@ -407,23 +356,6 @@ impl Collection { } /// Disables a [Pipeline] on the [Collection] - /// - /// # Arguments - /// - /// * `pipeline` - The [Pipeline] to disable - /// - /// # Example - /// - /// ``` - /// use pgml::{Collection, Pipeline}; - /// - /// async fn example() -> anyhow::Result<()> { - /// let pipeline = Pipeline::new("my_pipeline", None, None, None); - /// let collection = Collection::new("my_collection", None); - /// collection.disable_pipeline(&pipeline).await?; - /// Ok(()) - /// } - /// ``` #[instrument(skip(self))] pub async fn disable_pipeline(&self, pipeline: &Pipeline) -> anyhow::Result<()> { // The flow for this function: @@ -459,27 +391,6 @@ impl Collection { } /// Upserts documents into the database - /// - /// # Arguments - /// - /// * `documents` - A vector of documents to upsert - /// * `strict` - Whether to throw an error if keys: `id` or `text` are missing from any documents - /// - /// # Example - /// - /// ``` - /// use pgml::Collection; - /// - /// async fn example() -> anyhow::Result<()> { - /// let mut collection = Collection::new("my_collection", None); - /// let documents = vec![ - /// serde_json::json!({"id": 1, "text": "hello world"}).into(), - /// serde_json::json!({"id": 2, "text": "hello world"}).into(), - /// ]; - /// collection.upsert_documents(documents, None).await?; - /// Ok(()) - /// } - /// ``` #[instrument(skip(self, documents))] pub async fn upsert_documents( &mut self, @@ -647,21 +558,6 @@ impl Collection { } /// Gets the documents on a [Collection] - /// - /// # Arguments - /// - /// * `args` - The filters and options to apply to the query - /// - /// # Example - /// - /// ``` - /// use pgml::Collection; - /// - /// async fn example() -> anyhow::Result<()> { - /// let mut collection = Collection::new("my_collection", None); - /// let documents = collection.get_documents(None).await?; - /// Ok(()) - /// } #[instrument(skip(self))] pub async fn get_documents(&self, args: Option) -> anyhow::Result> { let pool = get_or_initialize_pool(&self.database_url).await?; @@ -721,25 +617,6 @@ impl Collection { } /// Deletes documents in a [Collection] - /// - /// # Arguments - /// - /// * `filter` - The filters to apply - /// - /// # Example - /// - /// ``` - /// use pgml::Collection; - /// - /// async fn example() -> anyhow::Result<()> { - /// let mut collection = Collection::new("my_collection", None); - /// let documents = collection.delete_documents(serde_json::json!({ - /// "id": { - /// "eq": 1 - /// } - /// }).into()).await?; - /// Ok(()) - /// } #[instrument(skip(self))] pub async fn delete_documents(&self, filter: Json) -> anyhow::Result<()> { let pool = get_or_initialize_pool(&self.database_url).await?; @@ -832,25 +709,6 @@ impl Collection { } /// Performs vector search on the [Collection] - /// - /// # Arguments - /// - /// * `query` - The query to search for - /// * `pipeline` - The [Pipeline] used for the search - /// * `query_paramaters` - The query parameters passed to the model for search - /// - /// # Example - /// - /// ``` - /// use pgml::{Collection, Pipeline}; - /// - /// async fn example() -> anyhow::Result<()> { - /// let mut collection = Collection::new("my_collection", None); - /// let mut pipeline = Pipeline::new("my_pipeline", None, None, None); - /// let results = 
collection.vector_search("Query", &mut pipeline, None, None).await?; - /// Ok(()) - /// } - /// ``` #[instrument(skip(self))] #[allow(clippy::type_complexity)] pub async fn vector_search( @@ -956,18 +814,6 @@ impl Collection { } /// Gets all pipelines for the [Collection] - /// - /// # Example - /// - /// ``` - /// use pgml::Collection; - /// - /// async fn example() -> anyhow::Result<()> { - /// let mut collection = Collection::new("my_collection", None); - /// let pipelines = collection.get_pipelines().await?; - /// Ok(()) - /// } - /// ``` #[instrument(skip(self))] pub async fn get_pipelines(&mut self) -> anyhow::Result> { self.verify_in_database(false).await?; @@ -982,18 +828,6 @@ impl Collection { } /// Gets a [Pipeline] by name - /// - /// # Example - /// - /// ``` - /// use pgml::Collection; - /// - /// async fn example() -> anyhow::Result<()> { - /// let mut collection = Collection::new("my_collection", None); - /// let pipeline = collection.get_pipeline("my_pipeline").await?; - /// Ok(()) - /// } - /// ``` #[instrument(skip(self))] pub async fn get_pipeline(&mut self, name: &str) -> anyhow::Result { self.verify_in_database(false).await?; @@ -1009,18 +843,6 @@ impl Collection { } /// Check if the [Collection] exists in the database - /// - /// # Example - /// - /// ``` - /// use pgml::Collection; - /// - /// async fn example() -> anyhow::Result<()> { - /// let collection = Collection::new("my_collection", None); - /// let exists = collection.exists().await?; - /// Ok(()) - /// } - /// ``` #[instrument(skip(self))] pub async fn exists(&self) -> anyhow::Result { let pool = get_or_initialize_pool(&self.database_url).await?; @@ -1108,6 +930,7 @@ impl Collection { Ok(()) } + #[instrument(skip(self))] pub async fn get_pipeline_status(&mut self, pipeline: &mut Pipeline) -> anyhow::Result { self.verify_in_database(false).await?; let project_info = &self.database_data.as_ref().unwrap().project_info; @@ -1115,6 +938,7 @@ impl Collection { pipeline.get_status(project_info, &pool).await } + #[instrument(skip(self))] pub async fn generate_er_diagram(&mut self, pipeline: &mut Pipeline) -> anyhow::Result { self.verify_in_database(false).await?; let project_info = &self.database_data.as_ref().unwrap().project_info; diff --git a/pgml-sdks/pgml/src/pipeline.rs b/pgml-sdks/pgml/src/pipeline.rs index 8b48faa6d..6dada5159 100644 --- a/pgml-sdks/pgml/src/pipeline.rs +++ b/pgml-sdks/pgml/src/pipeline.rs @@ -214,20 +214,6 @@ impl Pipeline { } /// Gets the status of the [Pipeline] - /// This includes the status of the chunks, embeddings, and tsvectors - /// - /// # Example - /// - /// ``` - /// use pgml::Collection; - /// - /// async fn example() -> anyhow::Result<()> { - /// let mut collection = Collection::new("my_collection", None); - /// let mut pipeline = collection.get_pipeline("my_pipeline").await?; - /// let status = pipeline.get_status().await?; - /// Ok(()) - /// } - /// ``` #[instrument(skip(self))] pub async fn get_status( &mut self, @@ -778,7 +764,6 @@ impl Pipeline { pub(crate) async fn resync( &mut self, project_info: &ProjectInfo, - // pool: &Pool, connection: &mut PgConnection, ) -> anyhow::Result<()> { // We are assuming we have manually verified the pipeline before doing this diff --git a/pgml-sdks/pgml/src/vector_search_query_builder.rs b/pgml-sdks/pgml/src/vector_search_query_builder.rs index 9673d05db..df4f54e79 100644 --- a/pgml-sdks/pgml/src/vector_search_query_builder.rs +++ b/pgml-sdks/pgml/src/vector_search_query_builder.rs @@ -235,9 +235,6 @@ pub async fn 
build_vector_search_query( let (sql, values) = query.with(with_clause).build_sqlx(PostgresQueryBuilder); - // Tag: CRITICAL_QUERY - // Checked: FALSE - // Used to do vector search debug_sea_query!(VECTOR_SEARCH, sql, values); Ok((sql, values)) } From a34619b2d6285dd43e02eb77c40d5d51c0912284 Mon Sep 17 00:00:00 2001 From: SilasMarvin <19626586+SilasMarvin@users.noreply.github.com> Date: Tue, 27 Feb 2024 14:42:42 -0800 Subject: [PATCH 72/72] Adjust boosting --- pgml-dashboard/src/utils/markdown.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pgml-dashboard/src/utils/markdown.rs b/pgml-dashboard/src/utils/markdown.rs index 9c2c12a76..55c42b9b1 100644 --- a/pgml-dashboard/src/utils/markdown.rs +++ b/pgml-dashboard/src/utils/markdown.rs @@ -1309,13 +1309,14 @@ impl SiteSearch { "parameters": { "instruction": "Represent the Wikipedia question for retrieving supporting documents: " }, - "boost": 4.0 + "boost": 10.0 }, "contents": { "query": query, "parameters": { "instruction": "Represent the Wikipedia question for retrieving supporting documents: " }, + "boost": 1.0 } } },