
Add LangChain splitters (#655) · postgresml/postgresml@9949cde · GitHub

Commit 9949cde

Add LangChain splitters (#655)
1 parent 484b2fc commit 9949cde

File tree

8 files changed: +152 -8 lines changed

pgml-extension/examples/chunking.sql
pgml-extension/requirements.txt
pgml-extension/src/api.rs
pgml-extension/src/bindings/langchain.py
pgml-extension/src/bindings/langchain.rs
pgml-extension/src/bindings/mod.rs
pgml-extension/src/bindings/transformers.rs
pgml-extension/tests/test.sql

pgml-extension/examples/chunking.sql

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
--- Chunk text for LLM embeddings and vectorization.

DROP TABLE documents CASCADE;
CREATE TABLE documents (
    id BIGSERIAL PRIMARY KEY,
    document TEXT NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

DROP TABLE splitters CASCADE;
CREATE TABLE splitters (
    id BIGSERIAL PRIMARY KEY,
    splitter VARCHAR NOT NULL DEFAULT 'recursive_character'
);

DROP TABLE document_chunks CASCADE;
CREATE TABLE document_chunks(
    id BIGSERIAL PRIMARY KEY,
    document_id BIGINT NOT NULL REFERENCES documents(id),
    splitter_id BIGINT NOT NULL REFERENCES splitters(id),
    chunk_index BIGINT NOT NULL,
    chunk VARCHAR
);

INSERT INTO documents VALUES (
    1,
    'It was the best of times, it was the worst of times, it was the age of wisdom,
it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light,
it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us,
we had nothing before us, we were all going direct to Heaven, we were all going direct the other way—in short, the period was so far like
the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only.',
    NOW()
);

INSERT INTO splitters VALUES (1, 'recursive_character');

WITH document AS (
    SELECT id, document
    FROM documents
    WHERE id = 1
),

splitter AS (
    SELECT id, splitter
    FROM splitters
    WHERE id = 1
)

INSERT INTO document_chunks SELECT
    nextval('document_chunks_id_seq'::regclass),
    (SELECT id FROM document),
    (SELECT id FROM splitter),
    chunk_index,
    chunk
FROM
    pgml.chunk(
        (SELECT splitter FROM splitter),
        (SELECT document FROM document),
        '{"chunk_size": 2, "chunk_overlap": 2}'
    );

SELECT * FROM document_chunks LIMIT 5;
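
The example above stores chunks in a table, but the new table function can also be called on its own. A minimal sketch, assuming the extension is installed as pgml; the sample text and sizes are illustrative, not part of the commit:

-- Split a short string directly; chunk_index and chunk are the output
-- columns of the pgml.chunk() table function.
SELECT chunk_index, chunk
FROM pgml.chunk(
    'recursive_character',
    'It was the best of times, it was the worst of times.',
    '{"chunk_size": 40, "chunk_overlap": 10}'
);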

pgml-extension/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -18,3 +18,4 @@ torchvision==0.15.2
 tqdm==4.65.0
 transformers==4.29.2
 xgboost==1.7.5
+langchain==0.0.180

pgml-extension/src/api.rs

Lines changed: 19 additions & 3 deletions
@@ -565,7 +565,23 @@ fn load_dataset(
 
 #[pg_extern(immutable, parallel_safe)]
 pub fn embed(transformer: &str, text: &str, kwargs: default!(JsonB, "'{}'")) -> Vec<f32> {
-    crate::bindings::transformers::embed(transformer, &text, &kwargs.0)
+    crate::bindings::transformers::embed(transformer, text, &kwargs.0)
+}
+
+#[pg_extern(immutable, parallel_safe)]
+pub fn chunk(
+    splitter: &str,
+    text: &str,
+    kwargs: default!(JsonB, "'{}'"),
+) -> TableIterator<'static, (name!(chunk_index, i64), name!(chunk, String))> {
+    let chunks = crate::bindings::langchain::chunk(splitter, text, &kwargs.0);
+    let chunks = chunks
+        .into_iter()
+        .enumerate()
+        .map(|(i, chunk)| (i as i64 + 1, chunk))
+        .collect::<Vec<(i64, String)>>();
+
+    TableIterator::new(chunks.into_iter())
 }
 
 #[cfg(feature = "python")]
@@ -575,7 +591,7 @@ pub fn transform_json(
     task: JsonB,
     args: default!(JsonB, "'{}'"),
     inputs: default!(Vec<String>, "ARRAY[]::TEXT[]"),
-    cache: default!(bool, false)
+    cache: default!(bool, false),
 ) -> JsonB {
     JsonB(crate::bindings::transformers::transform(
         &task.0, &args.0, &inputs,
@@ -589,7 +605,7 @@ pub fn transform_string(
     task: String,
     args: default!(JsonB, "'{}'"),
     inputs: default!(Vec<String>, "ARRAY[]::TEXT[]"),
-    cache: default!(bool, false)
+    cache: default!(bool, false),
 ) -> JsonB {
     let mut task_map = HashMap::new();
     task_map.insert("task", task);
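
Because kwargs defaults to '{}', pgml.chunk() can be called with only a splitter name and the text; a hedged sketch (the input string is illustrative, and chunking then falls back to whatever defaults the underlying LangChain splitter uses):

-- Rely on the default kwargs ('{}'), i.e. the LangChain splitter's own defaults.
SELECT chunk_index, chunk
FROM pgml.chunk('recursive_character', 'PostgresML brings machine learning into Postgres.');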

pgml-extension/src/bindings/langchain.py

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
from langchain.text_splitter import (
    CharacterTextSplitter,
    LatexTextSplitter,
    MarkdownTextSplitter,
    NLTKTextSplitter,
    PythonCodeTextSplitter,
    RecursiveCharacterTextSplitter,
    SpacyTextSplitter,
)
import json

SPLITTERS = {
    "character": CharacterTextSplitter,
    "latex": LatexTextSplitter,
    "markdown": MarkdownTextSplitter,
    "nltk": NLTKTextSplitter,
    "python": PythonCodeTextSplitter,
    "recursive_character": RecursiveCharacterTextSplitter,
    "spacy": SpacyTextSplitter,
}


def chunk(splitter, text, args):
    kwargs = json.loads(args)

    if splitter in SPLITTERS:
        return SPLITTERS[splitter](**kwargs).split_text(text)
    else:
        raise ValueError("Unknown splitter: {}".format(splitter))
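
Any key registered in SPLITTERS can be passed as the first argument to pgml.chunk(), and the kwargs JSON is forwarded to the splitter's constructor. A sketch using the markdown splitter (the sample text and sizes are illustrative; the nltk and spacy splitters would additionally need their own Python packages and models installed):

-- Use the markdown splitter registered in SPLITTERS; the kwargs become
-- MarkdownTextSplitter(chunk_size=64, chunk_overlap=0).
SELECT chunk_index, chunk
FROM pgml.chunk(
    'markdown',
    E'# Title\n\nFirst section.\n\n## Subsection\n\nMore text.',
    '{"chunk_size": 64, "chunk_overlap": 0}'
);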

pgml-extension/src/bindings/langchain.rs

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
use once_cell::sync::Lazy;
use pgrx::*;
use pyo3::prelude::*;
use pyo3::types::PyTuple;

static PY_MODULE: Lazy<Py<PyModule>> = Lazy::new(|| {
    Python::with_gil(|py| -> Py<PyModule> {
        let src = include_str!(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/src/bindings/langchain.py"
        ));

        PyModule::from_code(py, src, "", "").unwrap().into()
    })
});

pub fn chunk(splitter: &str, text: &str, kwargs: &serde_json::Value) -> Vec<String> {
    crate::bindings::venv::activate();

    let kwargs = serde_json::to_string(kwargs).unwrap();

    Python::with_gil(|py| -> Vec<String> {
        let chunk: Py<PyAny> = PY_MODULE.getattr(py, "chunk").unwrap().into();

        chunk
            .call1(
                py,
                PyTuple::new(
                    py,
                    &[splitter.into_py(py), text.into_py(py), kwargs.into_py(py)],
                ),
            )
            .unwrap()
            .extract(py)
            .unwrap()
    })
}
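
An unknown splitter name raises ValueError in Python, and the call result is unwrapped on the Rust side, so a query like the following should presumably surface as a Postgres error rather than return rows; a sketch for illustration only:

-- 'not_a_splitter' is not registered in SPLITTERS, so this is expected to error out.
SELECT * FROM pgml.chunk('not_a_splitter', 'some text');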

pgml-extension/src/bindings/mod.rs

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,8 @@ use pgrx::*;
 
 use crate::orm::*;
 
+#[cfg(feature = "python")]
+pub mod langchain;
 pub mod lightgbm;
 pub mod linfa;
 #[cfg(feature = "python")]

pgml-extension/src/bindings/transformers.rs

Lines changed: 1 addition & 5 deletions
@@ -40,11 +40,7 @@ pub fn transform(
             py,
             PyTuple::new(
                 py,
-                &[
-                    task.into_py(py),
-                    args.into_py(py),
-                    inputs.into_py(py),
-                ],
+                &[task.into_py(py), args.into_py(py), inputs.into_py(py)],
             ),
         )
         .unwrap()

pgml-extension/tests/test.sql

Lines changed: 1 addition & 0 deletions
@@ -27,5 +27,6 @@ SELECT pgml.load_dataset('wine');
 \i examples/multi_classification.sql
 \i examples/regression.sql
 \i examples/vectors.sql
+\i examples/chunking.sql
 -- transformers are generally too slow to run in the test suite
 --\i examples/transformers.sql

0 commit comments
