Content-Length: 15289 | pFad | http://github.com/postgresml/postgresml/pull/655.patch

thub.com From c462f516ad34d375a774e4a512348314ac5aaae9 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Wed, 24 May 2023 16:46:34 -0700 Subject: [PATCH 1/5] Add LangChain splitters --- pgml-extension/src/api.rs | 7 +++- pgml-extension/src/bindings/chunking.py | 29 ++++++++++++++++ pgml-extension/src/bindings/chunking.rs | 46 +++++++++++++++++++++++++ pgml-extension/src/bindings/mod.rs | 2 ++ 4 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 pgml-extension/src/bindings/chunking.py create mode 100644 pgml-extension/src/bindings/chunking.rs diff --git a/pgml-extension/src/api.rs b/pgml-extension/src/api.rs index e3ecff1e4..60339fd69 100644 --- a/pgml-extension/src/api.rs +++ b/pgml-extension/src/api.rs @@ -565,7 +565,12 @@ fn load_dataset( #[pg_extern(immutable, parallel_safe)] pub fn embed(transformer: &str, text: &str, kwargs: default!(JsonB, "'{}'")) -> Vec { - crate::bindings::transformers::embed(transformer, &text, &kwargs.0) + crate::bindings::transformers::embed(transformer, text, &kwargs.0) +} + +#[pg_extern(immutable, parallel_safe)] +pub fn chunk(splitter: &str, text: &str, kwargs: default!(JsonB, "'{}'")) -> Vec { + crate::bindings::chunking::chunk(splitter, text, &kwargs.0) } #[cfg(feature = "python")] diff --git a/pgml-extension/src/bindings/chunking.py b/pgml-extension/src/bindings/chunking.py new file mode 100644 index 000000000..7bd224230 --- /dev/null +++ b/pgml-extension/src/bindings/chunking.py @@ -0,0 +1,29 @@ +from langchain.text_splitter import ( + CharacterTextSplitter, + LatexTextSplitter, + MarkdownTextSplitter, + NLTKTextSplitter, + PythonCodeTextSplitter, + RecursiveCharacterTextSplitter, + SpacyTextSplitter, +) +import json + +SPLITTERS = { + "character": CharacterTextSplitter, + "latex": LatexTextSplitter, + "markdown": MarkdownTextSplitter, + "nltk": NLTKTextSplitter, + "python": PythonCodeTextSplitter, + "recursive_character": RecursiveCharacterTextSplitter, + "spacy": SpacyTextSplitter, +} + + +def chunk(splitter, text, args): + kwargs = json.loads(args) + + if splitter in SPLITTERS: + return SPLITTERS[splitter](**kwargs).split_text(text) + else: + raise ValueError("Unknown splitter: {}".format(splitter)) diff --git a/pgml-extension/src/bindings/chunking.rs b/pgml-extension/src/bindings/chunking.rs new file mode 100644 index 000000000..58c3bae9f --- /dev/null +++ b/pgml-extension/src/bindings/chunking.rs @@ -0,0 +1,46 @@ + +use once_cell::sync::Lazy; +use pgrx::*; +use pyo3::prelude::*; +use pyo3::types::PyTuple; + +static PY_MODULE: Lazy> = Lazy::new(|| { + Python::with_gil(|py| -> Py { + let src = include_str!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/src/bindings/chunking.py" + )); + + PyModule::from_code(py, src, "", "").unwrap().into() + }) +}); + +pub fn chunk( + splitter: &str, + text: &str, + kwargs: &serde_json::Value, +) -> Vec { + crate::bindings::venv::activate(); + + let kwargs = serde_json::to_string(kwargs).unwrap(); + + Python::with_gil(|py| -> Vec { + let chunk: Py = PY_MODULE.getattr(py, "chunk").unwrap().into(); + + chunk + .call1( + py, + PyTuple::new( + py, + &[ + splitter.into_py(py), + text.into_py(py), + kwargs.into_py(py), + ], + ), + ) + .unwrap() + .extract(py) + .unwrap() + }) +} diff --git a/pgml-extension/src/bindings/mod.rs b/pgml-extension/src/bindings/mod.rs index b147a8104..f1bdd101b 100644 --- a/pgml-extension/src/bindings/mod.rs +++ b/pgml-extension/src/bindings/mod.rs @@ -13,6 +13,8 @@ pub mod sklearn; pub mod transformers; #[cfg(feature = "python")] pub mod venv; +#[cfg(feature = "python")] +pub mod chunking; pub mod xgboost; pub type Fit = fn(dataset: &Dataset, hyperparams: &Hyperparams) -> Box; From 422bf6577e9c403b8d832c311d0a3c750b4a2861 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Thu, 25 May 2023 10:21:20 -0700 Subject: [PATCH 2/5] TableIterator & fmt --- pgml-extension/src/api.rs | 19 +++++++++++++++---- pgml-extension/src/bindings/chunking.rs | 13 ++----------- pgml-extension/src/bindings/mod.rs | 4 ++-- pgml-extension/src/bindings/transformers.rs | 6 +----- 4 files changed, 20 insertions(+), 22 deletions(-) diff --git a/pgml-extension/src/api.rs b/pgml-extension/src/api.rs index 60339fd69..71d3d3421 100644 --- a/pgml-extension/src/api.rs +++ b/pgml-extension/src/api.rs @@ -569,8 +569,19 @@ pub fn embed(transformer: &str, text: &str, kwargs: default!(JsonB, "'{}'")) -> } #[pg_extern(immutable, parallel_safe)] -pub fn chunk(splitter: &str, text: &str, kwargs: default!(JsonB, "'{}'")) -> Vec { - crate::bindings::chunking::chunk(splitter, text, &kwargs.0) +pub fn chunk( + splitter: &str, + text: &str, + kwargs: default!(JsonB, "'{}'"), +) -> TableIterator<'static, (name!(chunk_index, i64), name!(chunk, String))> { + let chunks = crate::bindings::chunking::chunk(splitter, text, &kwargs.0); + let chunks = chunks + .into_iter() + .enumerate() + .map(|(i, chunk)| (i as i64 + 1, chunk)) + .collect::>(); + + TableIterator::new(chunks.into_iter()) } #[cfg(feature = "python")] @@ -580,7 +591,7 @@ pub fn transform_json( task: JsonB, args: default!(JsonB, "'{}'"), inputs: default!(Vec, "ARRAY[]::TEXT[]"), - cache: default!(bool, false) + cache: default!(bool, false), ) -> JsonB { JsonB(crate::bindings::transformers::transform( &task.0, &args.0, &inputs, @@ -594,7 +605,7 @@ pub fn transform_string( task: String, args: default!(JsonB, "'{}'"), inputs: default!(Vec, "ARRAY[]::TEXT[]"), - cache: default!(bool, false) + cache: default!(bool, false), ) -> JsonB { let mut task_map = HashMap::new(); task_map.insert("task", task); diff --git a/pgml-extension/src/bindings/chunking.rs b/pgml-extension/src/bindings/chunking.rs index 58c3bae9f..da83a5b42 100644 --- a/pgml-extension/src/bindings/chunking.rs +++ b/pgml-extension/src/bindings/chunking.rs @@ -1,4 +1,3 @@ - use once_cell::sync::Lazy; use pgrx::*; use pyo3::prelude::*; @@ -15,11 +14,7 @@ static PY_MODULE: Lazy> = Lazy::new(|| { }) }); -pub fn chunk( - splitter: &str, - text: &str, - kwargs: &serde_json::Value, -) -> Vec { +pub fn chunk(splitter: &str, text: &str, kwargs: &serde_json::Value) -> Vec { crate::bindings::venv::activate(); let kwargs = serde_json::to_string(kwargs).unwrap(); @@ -32,11 +27,7 @@ pub fn chunk( py, PyTuple::new( py, - &[ - splitter.into_py(py), - text.into_py(py), - kwargs.into_py(py), - ], + &[splitter.into_py(py), text.into_py(py), kwargs.into_py(py)], ), ) .unwrap() diff --git a/pgml-extension/src/bindings/mod.rs b/pgml-extension/src/bindings/mod.rs index f1bdd101b..845ea20fa 100644 --- a/pgml-extension/src/bindings/mod.rs +++ b/pgml-extension/src/bindings/mod.rs @@ -5,6 +5,8 @@ use pgrx::*; use crate::orm::*; +#[cfg(feature = "python")] +pub mod chunking; pub mod lightgbm; pub mod linfa; #[cfg(feature = "python")] @@ -13,8 +15,6 @@ pub mod sklearn; pub mod transformers; #[cfg(feature = "python")] pub mod venv; -#[cfg(feature = "python")] -pub mod chunking; pub mod xgboost; pub type Fit = fn(dataset: &Dataset, hyperparams: &Hyperparams) -> Box; diff --git a/pgml-extension/src/bindings/transformers.rs b/pgml-extension/src/bindings/transformers.rs index 9ce3825f4..8f296d812 100644 --- a/pgml-extension/src/bindings/transformers.rs +++ b/pgml-extension/src/bindings/transformers.rs @@ -40,11 +40,7 @@ pub fn transform( py, PyTuple::new( py, - &[ - task.into_py(py), - args.into_py(py), - inputs.into_py(py), - ], + &[task.into_py(py), args.into_py(py), inputs.into_py(py)], ), ) .unwrap() From e6b2d1e7f79d12eeeb045bf813977e56bd3f33a6 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Thu, 25 May 2023 10:35:46 -0700 Subject: [PATCH 3/5] req and example --- pgml-extension/examples/test_chunking.sql | 60 +++++++++++++++++++++++ pgml-extension/requirements.txt | 1 + 2 files changed, 61 insertions(+) create mode 100644 pgml-extension/examples/test_chunking.sql diff --git a/pgml-extension/examples/test_chunking.sql b/pgml-extension/examples/test_chunking.sql new file mode 100644 index 000000000..1b5acc676 --- /dev/null +++ b/pgml-extension/examples/test_chunking.sql @@ -0,0 +1,60 @@ +--- Chunk text for LLM embeddings and vectorization. + +DROP TABLE documents CASCADE; +CREATE TABLE documents ( + id BIGSERIAL PRIMARY KEY, + document TEXT NOT NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +DROP TABLE splitters CASCADE; +CREATE TABLE splitters ( + id BIGSERIAL PRIMARY KEY, + splitter VARCHAR NOT NULL DEFAULT 'recursive_character' +); + +DROP TABLE document_chunks CASCADE; +CREATE TABLE document_chunks( + id BIGSERIAL PRIMARY KEY, + document_id BIGINT NOT NULL REFERENCES documents(id), + splitter_id BIGINT NOT NULL REFERENCES splitters(id), + chunk_index BIGINT NOT NULL, + chunk VARCHAR +); + +INSERT INTO documents VALUES ( + 1, + 'It was the best of times, it was the worst of times, it was the age of wisdom, + it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light, + it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us, + we had nothing before us, we were all going direct to Heaven, we were all going direct the other way—in short, the period was so far like + the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only.', + NOW() +); + +INSERT INTO splitters VALUES (1, 'recursive_character'); + +WITH document AS ( + SELECT id, document + FROM documents + WHERE id = 1 +), + +splitter AS ( + SELECT id, splitter + FROM splitters + WHERE id = 1 +) + +INSERT INTO document_chunks SELECT + nextval('document_chunks_id_seq'::regclass), + (SELECT id FROM document), + (SELECT id FROM splitter), + chunk_index, + chunk +FROM + pgml.chunk( + (SELECT splitter FROM splitter), + (SELECT document FROM document), + '{"chunk_size": 2, "chunk_overlap": 2}' + ); diff --git a/pgml-extension/requirements.txt b/pgml-extension/requirements.txt index 1d766a091..405dc0a70 100644 --- a/pgml-extension/requirements.txt +++ b/pgml-extension/requirements.txt @@ -18,3 +18,4 @@ torchvision==0.15.2 tqdm==4.65.0 transformers==4.29.2 xgboost==1.7.5 +langchain==0.0.180 From 90098e97a58f2c6b04b58b6ea939720bd9295bf5 Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Thu, 25 May 2023 10:38:49 -0700 Subject: [PATCH 4/5] ok --- pgml-extension/examples/{test_chunking.sql => chunking.sql} | 2 ++ pgml-extension/tests/test.sql | 1 + 2 files changed, 3 insertions(+) rename pgml-extension/examples/{test_chunking.sql => chunking.sql} (97%) diff --git a/pgml-extension/examples/test_chunking.sql b/pgml-extension/examples/chunking.sql similarity index 97% rename from pgml-extension/examples/test_chunking.sql rename to pgml-extension/examples/chunking.sql index 1b5acc676..f8559ef7c 100644 --- a/pgml-extension/examples/test_chunking.sql +++ b/pgml-extension/examples/chunking.sql @@ -58,3 +58,5 @@ FROM (SELECT document FROM document), '{"chunk_size": 2, "chunk_overlap": 2}' ); + +SELECT * FROM document_chunks LIMIT 5; diff --git a/pgml-extension/tests/test.sql b/pgml-extension/tests/test.sql index 1c4dd614b..c1ef81d58 100644 --- a/pgml-extension/tests/test.sql +++ b/pgml-extension/tests/test.sql @@ -27,5 +27,6 @@ SELECT pgml.load_dataset('wine'); \i examples/multi_classification.sql \i examples/regression.sql \i examples/vectors.sql +\i examples/chunking.sql -- transformers are generally too slow to run in the test suite --\i examples/transformers.sql From 9439324eee9858d56d249bdc95d5859ccdf0dadc Mon Sep 17 00:00:00 2001 From: Lev Kokotov Date: Thu, 25 May 2023 10:41:19 -0700 Subject: [PATCH 5/5] rename --- pgml-extension/src/api.rs | 2 +- pgml-extension/src/bindings/{chunking.py => langchain.py} | 0 pgml-extension/src/bindings/{chunking.rs => langchain.rs} | 2 +- pgml-extension/src/bindings/mod.rs | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename pgml-extension/src/bindings/{chunking.py => langchain.py} (100%) rename pgml-extension/src/bindings/{chunking.rs => langchain.rs} (95%) diff --git a/pgml-extension/src/api.rs b/pgml-extension/src/api.rs index 71d3d3421..914952e91 100644 --- a/pgml-extension/src/api.rs +++ b/pgml-extension/src/api.rs @@ -574,7 +574,7 @@ pub fn chunk( text: &str, kwargs: default!(JsonB, "'{}'"), ) -> TableIterator<'static, (name!(chunk_index, i64), name!(chunk, String))> { - let chunks = crate::bindings::chunking::chunk(splitter, text, &kwargs.0); + let chunks = crate::bindings::langchain::chunk(splitter, text, &kwargs.0); let chunks = chunks .into_iter() .enumerate() diff --git a/pgml-extension/src/bindings/chunking.py b/pgml-extension/src/bindings/langchain.py similarity index 100% rename from pgml-extension/src/bindings/chunking.py rename to pgml-extension/src/bindings/langchain.py diff --git a/pgml-extension/src/bindings/chunking.rs b/pgml-extension/src/bindings/langchain.rs similarity index 95% rename from pgml-extension/src/bindings/chunking.rs rename to pgml-extension/src/bindings/langchain.rs index da83a5b42..61b3d61ef 100644 --- a/pgml-extension/src/bindings/chunking.rs +++ b/pgml-extension/src/bindings/langchain.rs @@ -7,7 +7,7 @@ static PY_MODULE: Lazy> = Lazy::new(|| { Python::with_gil(|py| -> Py { let src = include_str!(concat!( env!("CARGO_MANIFEST_DIR"), - "/src/bindings/chunking.py" + "/src/bindings/langchain.py" )); PyModule::from_code(py, src, "", "").unwrap().into() diff --git a/pgml-extension/src/bindings/mod.rs b/pgml-extension/src/bindings/mod.rs index 845ea20fa..77a46e161 100644 --- a/pgml-extension/src/bindings/mod.rs +++ b/pgml-extension/src/bindings/mod.rs @@ -6,7 +6,7 @@ use pgrx::*; use crate::orm::*; #[cfg(feature = "python")] -pub mod chunking; +pub mod langchain; pub mod lightgbm; pub mod linfa; #[cfg(feature = "python")]

pFad - (p)hone/(F)rame/(a)nonymizer/(d)eclutterfier! Saves Data!