From c462f516ad34d375a774e4a512348314ac5aaae9 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Wed, 24 May 2023 16:46:34 -0700 Subject: [PATCH 1/5] Add LangChain splitters --- pgml-extension/src/api.rs | 7 +++- pgml-extension/src/bindings/chunking.py | 29 ++++++++++++++++ pgml-extension/src/bindings/chunking.rs | 46 +++++++++++++++++++++++++ pgml-extension/src/bindings/mod.rs | 2 ++ 4 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 pgml-extension/src/bindings/chunking.py create mode 100644 pgml-extension/src/bindings/chunking.rs diff --git a/pgml-extension/src/api.rs b/pgml-extension/src/api.rs index e3ecff1e4..60339fd69 100644 --- a/pgml-extension/src/api.rs +++ b/pgml-extension/src/api.rs @@ -565,7 +565,12 @@ fn load_dataset( #[pg_extern(immutable, parallel_safe)] pub fn embed(transformer: &str, text: &str, kwargs: default!(JsonB, "'{}'")) -> Vec { - crate::bindings::transformers::embed(transformer, &text, &kwargs.0) + crate::bindings::transformers::embed(transformer, text, &kwargs.0) +} + +#[pg_extern(immutable, parallel_safe)] +pub fn chunk(splitter: &str, text: &str, kwargs: default!(JsonB, "'{}'")) -> Vec { + crate::bindings::chunking::chunk(splitter, text, &kwargs.0) } #[cfg(feature = "python")] diff --git a/pgml-extension/src/bindings/chunking.py b/pgml-extension/src/bindings/chunking.py new file mode 100644 index 000000000..7bd224230 --- /dev/null +++ b/pgml-extension/src/bindings/chunking.py @@ -0,0 +1,29 @@ +from langchain.text_splitter import ( + CharacterTextSplitter, + LatexTextSplitter, + MarkdownTextSplitter, + NLTKTextSplitter, + PythonCodeTextSplitter, + RecursiveCharacterTextSplitter, + SpacyTextSplitter, +) +import json + +SPLITTERS = { + "character": CharacterTextSplitter, + "latex": LatexTextSplitter, + "markdown": MarkdownTextSplitter, + "nltk": NLTKTextSplitter, + "python": PythonCodeTextSplitter, + "recursive_character": RecursiveCharacterTextSplitter, + "spacy": SpacyTextSplitter, +} + + +def chunk(splitter, text, args): + kwargs = json.loads(args) + + if splitter in SPLITTERS: + return SPLITTERS[splitter](**kwargs).split_text(text) + else: + raise ValueError("Unknown splitter: {}".format(splitter)) diff --git a/pgml-extension/src/bindings/chunking.rs b/pgml-extension/src/bindings/chunking.rs new file mode 100644 index 000000000..58c3bae9f --- /dev/null +++ b/pgml-extension/src/bindings/chunking.rs @@ -0,0 +1,46 @@ + +use once_cell::sync::Lazy; +use pgrx::*; +use pyo3::prelude::*; +use pyo3::types::PyTuple; + +static PY_MODULE: Lazy> = Lazy::new(|| { + Python::with_gil(|py| -> Py { + let src = include_str!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/src/bindings/chunking.py" + )); + + PyModule::from_code(py, src, "", "").unwrap().into() + }) +}); + +pub fn chunk( + splitter: &str, + text: &str, + kwargs: &serde_json::Value, +) -> Vec { + crate::bindings::venv::activate(); + + let kwargs = serde_json::to_string(kwargs).unwrap(); + + Python::with_gil(|py| -> Vec { + let chunk: Py = PY_MODULE.getattr(py, "chunk").unwrap().into(); + + chunk + .call1( + py, + PyTuple::new( + py, + &[ + splitter.into_py(py), + text.into_py(py), + kwargs.into_py(py), + ], + ), + ) + .unwrap() + .extract(py) + .unwrap() + }) +} diff --git a/pgml-extension/src/bindings/mod.rs b/pgml-extension/src/bindings/mod.rs index b147a8104..f1bdd101b 100644 --- a/pgml-extension/src/bindings/mod.rs +++ b/pgml-extension/src/bindings/mod.rs @@ -13,6 +13,8 @@ pub mod sklearn; pub mod transformers; #[cfg(feature = "python")] pub mod venv; +#[cfg(feature = "python")] +pub mod chunking; pub mod xgboost; pub type Fit = fn(dataset: &Dataset, hyperparams: &Hyperparams) -> Box; From 422bf6577e9c403b8d832c311d0a3c750b4a2861 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Thu, 25 May 2023 10:21:20 -0700 Subject: [PATCH 2/5] TableIterator & fmt --- pgml-extension/src/api.rs | 19 +++++++++++++++---- pgml-extension/src/bindings/chunking.rs | 13 ++----------- pgml-extension/src/bindings/mod.rs | 4 ++-- pgml-extension/src/bindings/transformers.rs | 6 +----- 4 files changed, 20 insertions(+), 22 deletions(-) diff --git a/pgml-extension/src/api.rs b/pgml-extension/src/api.rs index 60339fd69..71d3d3421 100644 --- a/pgml-extension/src/api.rs +++ b/pgml-extension/src/api.rs @@ -569,8 +569,19 @@ pub fn embed(transformer: &str, text: &str, kwargs: default!(JsonB, "'{}'")) -> } #[pg_extern(immutable, parallel_safe)] -pub fn chunk(splitter: &str, text: &str, kwargs: default!(JsonB, "'{}'")) -> Vec { - crate::bindings::chunking::chunk(splitter, text, &kwargs.0) +pub fn chunk( + splitter: &str, + text: &str, + kwargs: default!(JsonB, "'{}'"), +) -> TableIterator<'static, (name!(chunk_index, i64), name!(chunk, String))> { + let chunks = crate::bindings::chunking::chunk(splitter, text, &kwargs.0); + let chunks = chunks + .into_iter() + .enumerate() + .map(|(i, chunk)| (i as i64 + 1, chunk)) + .collect::>(); + + TableIterator::new(chunks.into_iter()) } #[cfg(feature = "python")] @@ -580,7 +591,7 @@ pub fn transform_json( task: JsonB, args: default!(JsonB, "'{}'"), inputs: default!(Vec, "ARRAY[]::TEXT[]"), - cache: default!(bool, false) + cache: default!(bool, false), ) -> JsonB { JsonB(crate::bindings::transformers::transform( &task.0, &args.0, &inputs, @@ -594,7 +605,7 @@ pub fn transform_string( task: String, args: default!(JsonB, "'{}'"), inputs: default!(Vec, "ARRAY[]::TEXT[]"), - cache: default!(bool, false) + cache: default!(bool, false), ) -> JsonB { let mut task_map = HashMap::new(); task_map.insert("task", task); diff --git a/pgml-extension/src/bindings/chunking.rs b/pgml-extension/src/bindings/chunking.rs index 58c3bae9f..da83a5b42 100644 --- a/pgml-extension/src/bindings/chunking.rs +++ b/pgml-extension/src/bindings/chunking.rs @@ -1,4 +1,3 @@ - use once_cell::sync::Lazy; use pgrx::*; use pyo3::prelude::*; @@ -15,11 +14,7 @@ static PY_MODULE: Lazy> = Lazy::new(|| { }) }); -pub fn chunk( - splitter: &str, - text: &str, - kwargs: &serde_json::Value, -) -> Vec { +pub fn chunk(splitter: &str, text: &str, kwargs: &serde_json::Value) -> Vec { crate::bindings::venv::activate(); let kwargs = serde_json::to_string(kwargs).unwrap(); @@ -32,11 +27,7 @@ pub fn chunk( py, PyTuple::new( py, - &[ - splitter.into_py(py), - text.into_py(py), - kwargs.into_py(py), - ], + &[splitter.into_py(py), text.into_py(py), kwargs.into_py(py)], ), ) .unwrap() diff --git a/pgml-extension/src/bindings/mod.rs b/pgml-extension/src/bindings/mod.rs index f1bdd101b..845ea20fa 100644 --- a/pgml-extension/src/bindings/mod.rs +++ b/pgml-extension/src/bindings/mod.rs @@ -5,6 +5,8 @@ use pgrx::*; use crate::orm::*; +#[cfg(feature = "python")] +pub mod chunking; pub mod lightgbm; pub mod linfa; #[cfg(feature = "python")] @@ -13,8 +15,6 @@ pub mod sklearn; pub mod transformers; #[cfg(feature = "python")] pub mod venv; -#[cfg(feature = "python")] -pub mod chunking; pub mod xgboost; pub type Fit = fn(dataset: &Dataset, hyperparams: &Hyperparams) -> Box; diff --git a/pgml-extension/src/bindings/transformers.rs b/pgml-extension/src/bindings/transformers.rs index 9ce3825f4..8f296d812 100644 --- a/pgml-extension/src/bindings/transformers.rs +++ b/pgml-extension/src/bindings/transformers.rs @@ -40,11 +40,7 @@ pub fn transform( py, PyTuple::new( py, - &[ - task.into_py(py), - args.into_py(py), - inputs.into_py(py), - ], + &[task.into_py(py), args.into_py(py), inputs.into_py(py)], ), ) .unwrap() From e6b2d1e7f79d12eeeb045bf813977e56bd3f33a6 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Thu, 25 May 2023 10:35:46 -0700 Subject: [PATCH 3/5] req and example --- pgml-extension/examples/test_chunking.sql | 60 +++++++++++++++++++++++ pgml-extension/requirements.txt | 1 + 2 files changed, 61 insertions(+) create mode 100644 pgml-extension/examples/test_chunking.sql diff --git a/pgml-extension/examples/test_chunking.sql b/pgml-extension/examples/test_chunking.sql new file mode 100644 index 000000000..1b5acc676 --- /dev/null +++ b/pgml-extension/examples/test_chunking.sql @@ -0,0 +1,60 @@ +--- Chunk text for LLM embeddings and vectorization. + +DROP TABLE documents CASCADE; +CREATE TABLE documents ( + id BIGSERIAL PRIMARY KEY, + document TEXT NOT NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +DROP TABLE splitters CASCADE; +CREATE TABLE splitters ( + id BIGSERIAL PRIMARY KEY, + splitter VARCHAR NOT NULL DEFAULT 'recursive_character' +); + +DROP TABLE document_chunks CASCADE; +CREATE TABLE document_chunks( + id BIGSERIAL PRIMARY KEY, + document_id BIGINT NOT NULL REFERENCES documents(id), + splitter_id BIGINT NOT NULL REFERENCES splitters(id), + chunk_index BIGINT NOT NULL, + chunk VARCHAR +); + +INSERT INTO documents VALUES ( + 1, + 'It was the best of times, it was the worst of times, it was the age of wisdom, + it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light, + it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us, + we had nothing before us, we were all going direct to Heaven, we were all going direct the other way—in short, the period was so far like + the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only.', + NOW() +); + +INSERT INTO splitters VALUES (1, 'recursive_character'); + +WITH document AS ( + SELECT id, document + FROM documents + WHERE id = 1 +), + +splitter AS ( + SELECT id, splitter + FROM splitters + WHERE id = 1 +) + +INSERT INTO document_chunks SELECT + nextval('document_chunks_id_seq'::regclass), + (SELECT id FROM document), + (SELECT id FROM splitter), + chunk_index, + chunk +FROM + pgml.chunk( + (SELECT splitter FROM splitter), + (SELECT document FROM document), + '{"chunk_size": 2, "chunk_overlap": 2}' + ); diff --git a/pgml-extension/requirements.txt b/pgml-extension/requirements.txt index 1d766a091..405dc0a70 100644 --- a/pgml-extension/requirements.txt +++ b/pgml-extension/requirements.txt @@ -18,3 +18,4 @@ torchvision==0.15.2 tqdm==4.65.0 transformers==4.29.2 xgboost==1.7.5 +langchain==0.0.180 From 90098e97a58f2c6b04b58b6ea939720bd9295bf5 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Thu, 25 May 2023 10:38:49 -0700 Subject: [PATCH 4/5] ok --- pgml-extension/examples/{test_chunking.sql => chunking.sql} | 2 ++ pgml-extension/tests/test.sql | 1 + 2 files changed, 3 insertions(+) rename pgml-extension/examples/{test_chunking.sql => chunking.sql} (97%) diff --git a/pgml-extension/examples/test_chunking.sql b/pgml-extension/examples/chunking.sql similarity index 97% rename from pgml-extension/examples/test_chunking.sql rename to pgml-extension/examples/chunking.sql index 1b5acc676..f8559ef7c 100644 --- a/pgml-extension/examples/test_chunking.sql +++ b/pgml-extension/examples/chunking.sql @@ -58,3 +58,5 @@ FROM (SELECT document FROM document), '{"chunk_size": 2, "chunk_overlap": 2}' ); + +SELECT * FROM document_chunks LIMIT 5; diff --git a/pgml-extension/tests/test.sql b/pgml-extension/tests/test.sql index 1c4dd614b..c1ef81d58 100644 --- a/pgml-extension/tests/test.sql +++ b/pgml-extension/tests/test.sql @@ -27,5 +27,6 @@ SELECT pgml.load_dataset('wine'); \i examples/multi_classification.sql \i examples/regression.sql \i examples/vectors.sql +\i examples/chunking.sql -- transformers are generally too slow to run in the test suite --\i examples/transformers.sql From 9439324eee9858d56d249bdc95d5859ccdf0dadc Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Thu, 25 May 2023 10:41:19 -0700 Subject: [PATCH 5/5] rename --- pgml-extension/src/api.rs | 2 +- pgml-extension/src/bindings/{chunking.py => langchain.py} | 0 pgml-extension/src/bindings/{chunking.rs => langchain.rs} | 2 +- pgml-extension/src/bindings/mod.rs | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename pgml-extension/src/bindings/{chunking.py => langchain.py} (100%) rename pgml-extension/src/bindings/{chunking.rs => langchain.rs} (95%) diff --git a/pgml-extension/src/api.rs b/pgml-extension/src/api.rs index 71d3d3421..914952e91 100644 --- a/pgml-extension/src/api.rs +++ b/pgml-extension/src/api.rs @@ -574,7 +574,7 @@ pub fn chunk( text: &str, kwargs: default!(JsonB, "'{}'"), ) -> TableIterator<'static, (name!(chunk_index, i64), name!(chunk, String))> { - let chunks = crate::bindings::chunking::chunk(splitter, text, &kwargs.0); + let chunks = crate::bindings::langchain::chunk(splitter, text, &kwargs.0); let chunks = chunks .into_iter() .enumerate() diff --git a/pgml-extension/src/bindings/chunking.py b/pgml-extension/src/bindings/langchain.py similarity index 100% rename from pgml-extension/src/bindings/chunking.py rename to pgml-extension/src/bindings/langchain.py diff --git a/pgml-extension/src/bindings/chunking.rs b/pgml-extension/src/bindings/langchain.rs similarity index 95% rename from pgml-extension/src/bindings/chunking.rs rename to pgml-extension/src/bindings/langchain.rs index da83a5b42..61b3d61ef 100644 --- a/pgml-extension/src/bindings/chunking.rs +++ b/pgml-extension/src/bindings/langchain.rs @@ -7,7 +7,7 @@ static PY_MODULE: Lazy> = Lazy::new(|| { Python::with_gil(|py| -> Py { let src = include_str!(concat!( env!("CARGO_MANIFEST_DIR"), - "/src/bindings/chunking.py" + "/src/bindings/langchain.py" )); PyModule::from_code(py, src, "", "").unwrap().into() diff --git a/pgml-extension/src/bindings/mod.rs b/pgml-extension/src/bindings/mod.rs index 845ea20fa..77a46e161 100644 --- a/pgml-extension/src/bindings/mod.rs +++ b/pgml-extension/src/bindings/mod.rs @@ -6,7 +6,7 @@ use pgrx::*; use crate::orm::*; #[cfg(feature = "python")] -pub mod chunking; +pub mod langchain; pub mod lightgbm; pub mod linfa; #[cfg(feature = "python")] pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy