Content-Length: 15289 | pFad | http://github.com/postgresml/postgresml/pull/655.patch
thub.com
From c462f516ad34d375a774e4a512348314ac5aaae9 Mon Sep 17 00:00:00 2001
From: Lev Kokotov
Date: Wed, 24 May 2023 16:46:34 -0700
Subject: [PATCH 1/5] Add LangChain splitters
---
pgml-extension/src/api.rs | 7 +++-
pgml-extension/src/bindings/chunking.py | 29 ++++++++++++++++
pgml-extension/src/bindings/chunking.rs | 46 +++++++++++++++++++++++++
pgml-extension/src/bindings/mod.rs | 2 ++
4 files changed, 83 insertions(+), 1 deletion(-)
create mode 100644 pgml-extension/src/bindings/chunking.py
create mode 100644 pgml-extension/src/bindings/chunking.rs
diff --git a/pgml-extension/src/api.rs b/pgml-extension/src/api.rs
index e3ecff1e4..60339fd69 100644
--- a/pgml-extension/src/api.rs
+++ b/pgml-extension/src/api.rs
@@ -565,7 +565,12 @@ fn load_dataset(
#[pg_extern(immutable, parallel_safe)]
pub fn embed(transformer: &str, text: &str, kwargs: default!(JsonB, "'{}'")) -> Vec {
- crate::bindings::transformers::embed(transformer, &text, &kwargs.0)
+ crate::bindings::transformers::embed(transformer, text, &kwargs.0)
+}
+
+#[pg_extern(immutable, parallel_safe)]
+pub fn chunk(splitter: &str, text: &str, kwargs: default!(JsonB, "'{}'")) -> Vec<String> {
+ crate::bindings::chunking::chunk(splitter, text, &kwargs.0)
}
#[cfg(feature = "python")]
diff --git a/pgml-extension/src/bindings/chunking.py b/pgml-extension/src/bindings/chunking.py
new file mode 100644
index 000000000..7bd224230
--- /dev/null
+++ b/pgml-extension/src/bindings/chunking.py
@@ -0,0 +1,29 @@
+from langchain.text_splitter import (
+ CharacterTextSplitter,
+ LatexTextSplitter,
+ MarkdownTextSplitter,
+ NLTKTextSplitter,
+ PythonCodeTextSplitter,
+ RecursiveCharacterTextSplitter,
+ SpacyTextSplitter,
+)
+import json
+
+SPLITTERS = {
+ "character": CharacterTextSplitter,
+ "latex": LatexTextSplitter,
+ "markdown": MarkdownTextSplitter,
+ "nltk": NLTKTextSplitter,
+ "python": PythonCodeTextSplitter,
+ "recursive_character": RecursiveCharacterTextSplitter,
+ "spacy": SpacyTextSplitter,
+}
+
+
+def chunk(splitter, text, args):
+ kwargs = json.loads(args)
+
+ if splitter in SPLITTERS:
+ return SPLITTERS[splitter](**kwargs).split_text(text)
+ else:
+ raise ValueError("Unknown splitter: {}".format(splitter))
diff --git a/pgml-extension/src/bindings/chunking.rs b/pgml-extension/src/bindings/chunking.rs
new file mode 100644
index 000000000..58c3bae9f
--- /dev/null
+++ b/pgml-extension/src/bindings/chunking.rs
@@ -0,0 +1,46 @@
+
+use once_cell::sync::Lazy;
+use pgrx::*;
+use pyo3::prelude::*;
+use pyo3::types::PyTuple;
+
+static PY_MODULE: Lazy<Py<PyModule>> = Lazy::new(|| {
+ Python::with_gil(|py| -> Py<PyModule> {
+ let src = include_str!(concat!(
+ env!("CARGO_MANIFEST_DIR"),
+ "/src/bindings/chunking.py"
+ ));
+
+ PyModule::from_code(py, src, "", "").unwrap().into()
+ })
+});
+
+pub fn chunk(
+ splitter: &str,
+ text: &str,
+ kwargs: &serde_json::Value,
+) -> Vec<String> {
+ crate::bindings::venv::activate();
+
+ let kwargs = serde_json::to_string(kwargs).unwrap();
+
+ Python::with_gil(|py| -> Vec<String> {
+ let chunk: Py<PyAny> = PY_MODULE.getattr(py, "chunk").unwrap().into();
+
+ chunk
+ .call1(
+ py,
+ PyTuple::new(
+ py,
+ &[
+ splitter.into_py(py),
+ text.into_py(py),
+ kwargs.into_py(py),
+ ],
+ ),
+ )
+ .unwrap()
+ .extract(py)
+ .unwrap()
+ })
+}
diff --git a/pgml-extension/src/bindings/mod.rs b/pgml-extension/src/bindings/mod.rs
index b147a8104..f1bdd101b 100644
--- a/pgml-extension/src/bindings/mod.rs
+++ b/pgml-extension/src/bindings/mod.rs
@@ -13,6 +13,8 @@ pub mod sklearn;
pub mod transformers;
#[cfg(feature = "python")]
pub mod venv;
+#[cfg(feature = "python")]
+pub mod chunking;
pub mod xgboost;
pub type Fit = fn(dataset: &Dataset, hyperparams: &Hyperparams) -> Box;
From 422bf6577e9c403b8d832c311d0a3c750b4a2861 Mon Sep 17 00:00:00 2001
From: Lev Kokotov
Date: Thu, 25 May 2023 10:21:20 -0700
Subject: [PATCH 2/5] TableIterator & fmt
---
pgml-extension/src/api.rs | 19 +++++++++++++++----
pgml-extension/src/bindings/chunking.rs | 13 ++-----------
pgml-extension/src/bindings/mod.rs | 4 ++--
pgml-extension/src/bindings/transformers.rs | 6 +-----
4 files changed, 20 insertions(+), 22 deletions(-)
diff --git a/pgml-extension/src/api.rs b/pgml-extension/src/api.rs
index 60339fd69..71d3d3421 100644
--- a/pgml-extension/src/api.rs
+++ b/pgml-extension/src/api.rs
@@ -569,8 +569,19 @@ pub fn embed(transformer: &str, text: &str, kwargs: default!(JsonB, "'{}'")) ->
}
#[pg_extern(immutable, parallel_safe)]
-pub fn chunk(splitter: &str, text: &str, kwargs: default!(JsonB, "'{}'")) -> Vec<String> {
- crate::bindings::chunking::chunk(splitter, text, &kwargs.0)
+pub fn chunk(
+ splitter: &str,
+ text: &str,
+ kwargs: default!(JsonB, "'{}'"),
+) -> TableIterator<'static, (name!(chunk_index, i64), name!(chunk, String))> {
+ let chunks = crate::bindings::chunking::chunk(splitter, text, &kwargs.0);
+ let chunks = chunks
+ .into_iter()
+ .enumerate()
+ .map(|(i, chunk)| (i as i64 + 1, chunk))
+ .collect::<Vec<(i64, String)>>();
+
+ TableIterator::new(chunks.into_iter())
}
#[cfg(feature = "python")]
@@ -580,7 +591,7 @@ pub fn transform_json(
task: JsonB,
args: default!(JsonB, "'{}'"),
inputs: default!(Vec<String>, "ARRAY[]::TEXT[]"),
- cache: default!(bool, false)
+ cache: default!(bool, false),
) -> JsonB {
JsonB(crate::bindings::transformers::transform(
&task.0, &args.0, &inputs,
@@ -594,7 +605,7 @@ pub fn transform_string(
task: String,
args: default!(JsonB, "'{}'"),
inputs: default!(Vec<String>, "ARRAY[]::TEXT[]"),
- cache: default!(bool, false)
+ cache: default!(bool, false),
) -> JsonB {
let mut task_map = HashMap::new();
task_map.insert("task", task);
diff --git a/pgml-extension/src/bindings/chunking.rs b/pgml-extension/src/bindings/chunking.rs
index 58c3bae9f..da83a5b42 100644
--- a/pgml-extension/src/bindings/chunking.rs
+++ b/pgml-extension/src/bindings/chunking.rs
@@ -1,4 +1,3 @@
-
use once_cell::sync::Lazy;
use pgrx::*;
use pyo3::prelude::*;
@@ -15,11 +14,7 @@ static PY_MODULE: Lazy<Py<PyModule>> = Lazy::new(|| {
})
});
-pub fn chunk(
- splitter: &str,
- text: &str,
- kwargs: &serde_json::Value,
-) -> Vec<String> {
+pub fn chunk(splitter: &str, text: &str, kwargs: &serde_json::Value) -> Vec<String> {
crate::bindings::venv::activate();
let kwargs = serde_json::to_string(kwargs).unwrap();
@@ -32,11 +27,7 @@ pub fn chunk(
py,
PyTuple::new(
py,
- &[
- splitter.into_py(py),
- text.into_py(py),
- kwargs.into_py(py),
- ],
+ &[splitter.into_py(py), text.into_py(py), kwargs.into_py(py)],
),
)
.unwrap()
diff --git a/pgml-extension/src/bindings/mod.rs b/pgml-extension/src/bindings/mod.rs
index f1bdd101b..845ea20fa 100644
--- a/pgml-extension/src/bindings/mod.rs
+++ b/pgml-extension/src/bindings/mod.rs
@@ -5,6 +5,8 @@ use pgrx::*;
use crate::orm::*;
+#[cfg(feature = "python")]
+pub mod chunking;
pub mod lightgbm;
pub mod linfa;
#[cfg(feature = "python")]
@@ -13,8 +15,6 @@ pub mod sklearn;
pub mod transformers;
#[cfg(feature = "python")]
pub mod venv;
-#[cfg(feature = "python")]
-pub mod chunking;
pub mod xgboost;
pub type Fit = fn(dataset: &Dataset, hyperparams: &Hyperparams) -> Box;
diff --git a/pgml-extension/src/bindings/transformers.rs b/pgml-extension/src/bindings/transformers.rs
index 9ce3825f4..8f296d812 100644
--- a/pgml-extension/src/bindings/transformers.rs
+++ b/pgml-extension/src/bindings/transformers.rs
@@ -40,11 +40,7 @@ pub fn transform(
py,
PyTuple::new(
py,
- &[
- task.into_py(py),
- args.into_py(py),
- inputs.into_py(py),
- ],
+ &[task.into_py(py), args.into_py(py), inputs.into_py(py)],
),
)
.unwrap()
From e6b2d1e7f79d12eeeb045bf813977e56bd3f33a6 Mon Sep 17 00:00:00 2001
From: Lev Kokotov
Date: Thu, 25 May 2023 10:35:46 -0700
Subject: [PATCH 3/5] req and example
---
pgml-extension/examples/test_chunking.sql | 60 +++++++++++++++++++++++
pgml-extension/requirements.txt | 1 +
2 files changed, 61 insertions(+)
create mode 100644 pgml-extension/examples/test_chunking.sql
diff --git a/pgml-extension/examples/test_chunking.sql b/pgml-extension/examples/test_chunking.sql
new file mode 100644
index 000000000..1b5acc676
--- /dev/null
+++ b/pgml-extension/examples/test_chunking.sql
@@ -0,0 +1,60 @@
+--- Chunk text for LLM embeddings and vectorization.
+
+DROP TABLE documents CASCADE;
+CREATE TABLE documents (
+ id BIGSERIAL PRIMARY KEY,
+ document TEXT NOT NULL,
+ created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+DROP TABLE splitters CASCADE;
+CREATE TABLE splitters (
+ id BIGSERIAL PRIMARY KEY,
+ splitter VARCHAR NOT NULL DEFAULT 'recursive_character'
+);
+
+DROP TABLE document_chunks CASCADE;
+CREATE TABLE document_chunks(
+ id BIGSERIAL PRIMARY KEY,
+ document_id BIGINT NOT NULL REFERENCES documents(id),
+ splitter_id BIGINT NOT NULL REFERENCES splitters(id),
+ chunk_index BIGINT NOT NULL,
+ chunk VARCHAR
+);
+
+INSERT INTO documents VALUES (
+ 1,
+ 'It was the best of times, it was the worst of times, it was the age of wisdom,
+ it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light,
+ it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us,
+ we had nothing before us, we were all going direct to Heaven, we were all going direct the other way—in short, the period was so far like
+ the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only.',
+ NOW()
+);
+
+INSERT INTO splitters VALUES (1, 'recursive_character');
+
+WITH document AS (
+ SELECT id, document
+ FROM documents
+ WHERE id = 1
+),
+
+splitter AS (
+ SELECT id, splitter
+ FROM splitters
+ WHERE id = 1
+)
+
+INSERT INTO document_chunks SELECT
+ nextval('document_chunks_id_seq'::regclass),
+ (SELECT id FROM document),
+ (SELECT id FROM splitter),
+ chunk_index,
+ chunk
+FROM
+ pgml.chunk(
+ (SELECT splitter FROM splitter),
+ (SELECT document FROM document),
+ '{"chunk_size": 2, "chunk_overlap": 2}'
+ );
diff --git a/pgml-extension/requirements.txt b/pgml-extension/requirements.txt
index 1d766a091..405dc0a70 100644
--- a/pgml-extension/requirements.txt
+++ b/pgml-extension/requirements.txt
@@ -18,3 +18,4 @@ torchvision==0.15.2
tqdm==4.65.0
transformers==4.29.2
xgboost==1.7.5
+langchain==0.0.180
From 90098e97a58f2c6b04b58b6ea939720bd9295bf5 Mon Sep 17 00:00:00 2001
From: Lev Kokotov
Date: Thu, 25 May 2023 10:38:49 -0700
Subject: [PATCH 4/5] ok
---
pgml-extension/examples/{test_chunking.sql => chunking.sql} | 2 ++
pgml-extension/tests/test.sql | 1 +
2 files changed, 3 insertions(+)
rename pgml-extension/examples/{test_chunking.sql => chunking.sql} (97%)
diff --git a/pgml-extension/examples/test_chunking.sql b/pgml-extension/examples/chunking.sql
similarity index 97%
rename from pgml-extension/examples/test_chunking.sql
rename to pgml-extension/examples/chunking.sql
index 1b5acc676..f8559ef7c 100644
--- a/pgml-extension/examples/test_chunking.sql
+++ b/pgml-extension/examples/chunking.sql
@@ -58,3 +58,5 @@ FROM
(SELECT document FROM document),
'{"chunk_size": 2, "chunk_overlap": 2}'
);
+
+SELECT * FROM document_chunks LIMIT 5;
diff --git a/pgml-extension/tests/test.sql b/pgml-extension/tests/test.sql
index 1c4dd614b..c1ef81d58 100644
--- a/pgml-extension/tests/test.sql
+++ b/pgml-extension/tests/test.sql
@@ -27,5 +27,6 @@ SELECT pgml.load_dataset('wine');
\i examples/multi_classification.sql
\i examples/regression.sql
\i examples/vectors.sql
+\i examples/chunking.sql
-- transformers are generally too slow to run in the test suite
--\i examples/transformers.sql
From 9439324eee9858d56d249bdc95d5859ccdf0dadc Mon Sep 17 00:00:00 2001
From: Lev Kokotov
Date: Thu, 25 May 2023 10:41:19 -0700
Subject: [PATCH 5/5] rename
---
pgml-extension/src/api.rs | 2 +-
pgml-extension/src/bindings/{chunking.py => langchain.py} | 0
pgml-extension/src/bindings/{chunking.rs => langchain.rs} | 2 +-
pgml-extension/src/bindings/mod.rs | 2 +-
4 files changed, 3 insertions(+), 3 deletions(-)
rename pgml-extension/src/bindings/{chunking.py => langchain.py} (100%)
rename pgml-extension/src/bindings/{chunking.rs => langchain.rs} (95%)
diff --git a/pgml-extension/src/api.rs b/pgml-extension/src/api.rs
index 71d3d3421..914952e91 100644
--- a/pgml-extension/src/api.rs
+++ b/pgml-extension/src/api.rs
@@ -574,7 +574,7 @@ pub fn chunk(
text: &str,
kwargs: default!(JsonB, "'{}'"),
) -> TableIterator<'static, (name!(chunk_index, i64), name!(chunk, String))> {
- let chunks = crate::bindings::chunking::chunk(splitter, text, &kwargs.0);
+ let chunks = crate::bindings::langchain::chunk(splitter, text, &kwargs.0);
let chunks = chunks
.into_iter()
.enumerate()
diff --git a/pgml-extension/src/bindings/chunking.py b/pgml-extension/src/bindings/langchain.py
similarity index 100%
rename from pgml-extension/src/bindings/chunking.py
rename to pgml-extension/src/bindings/langchain.py
diff --git a/pgml-extension/src/bindings/chunking.rs b/pgml-extension/src/bindings/langchain.rs
similarity index 95%
rename from pgml-extension/src/bindings/chunking.rs
rename to pgml-extension/src/bindings/langchain.rs
index da83a5b42..61b3d61ef 100644
--- a/pgml-extension/src/bindings/chunking.rs
+++ b/pgml-extension/src/bindings/langchain.rs
@@ -7,7 +7,7 @@ static PY_MODULE: Lazy<Py<PyModule>> = Lazy::new(|| {
Python::with_gil(|py| -> Py<PyModule> {
let src = include_str!(concat!(
env!("CARGO_MANIFEST_DIR"),
- "/src/bindings/chunking.py"
+ "/src/bindings/langchain.py"
));
PyModule::from_code(py, src, "", "").unwrap().into()
diff --git a/pgml-extension/src/bindings/mod.rs b/pgml-extension/src/bindings/mod.rs
index 845ea20fa..77a46e161 100644
--- a/pgml-extension/src/bindings/mod.rs
+++ b/pgml-extension/src/bindings/mod.rs
@@ -6,7 +6,7 @@ use pgrx::*;
use crate::orm::*;
#[cfg(feature = "python")]
-pub mod chunking;
+pub mod langchain;
pub mod lightgbm;
pub mod linfa;
#[cfg(feature = "python")]
--- a PPN by Garber Painting Akron. With Image Size Reduction included!Fetched URL: http://github.com/postgresml/postgresml/pull/655.patch
Alternative Proxies:
Alternative Proxy
pFad Proxy
pFad v3 Proxy
pFad v4 Proxy