
Add LangChain splitters (#655) · postgresml/postgresml@9949cde · GitHub

Commit 9949cde

Add LangChain splitters (#655)
1 parent 484b2fc commit 9949cde

File tree

8 files changed: +152 -8 lines changed

pgml-extension/examples/chunking.sql
pgml-extension/requirements.txt
pgml-extension/src/api.rs
pgml-extension/src/bindings/langchain.py
pgml-extension/src/bindings/langchain.rs
pgml-extension/src/bindings/mod.rs
pgml-extension/src/bindings/transformers.rs
pgml-extension/tests/test.sql

pgml-extension/examples/chunking.sql

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
--- Chunk text for LLM embeddings and vectorization.

DROP TABLE documents CASCADE;
CREATE TABLE documents (
    id BIGSERIAL PRIMARY KEY,
    document TEXT NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

DROP TABLE splitters CASCADE;
CREATE TABLE splitters (
    id BIGSERIAL PRIMARY KEY,
    splitter VARCHAR NOT NULL DEFAULT 'recursive_character'
);

DROP TABLE document_chunks CASCADE;
CREATE TABLE document_chunks(
    id BIGSERIAL PRIMARY KEY,
    document_id BIGINT NOT NULL REFERENCES documents(id),
    splitter_id BIGINT NOT NULL REFERENCES splitters(id),
    chunk_index BIGINT NOT NULL,
    chunk VARCHAR
);

INSERT INTO documents VALUES (
    1,
    'It was the best of times, it was the worst of times, it was the age of wisdom,
it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light,
it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us,
we had nothing before us, we were all going direct to Heaven, we were all going direct the other way—in short, the period was so far like
the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only.',
    NOW()
);

INSERT INTO splitters VALUES (1, 'recursive_character');

WITH document AS (
    SELECT id, document
    FROM documents
    WHERE id = 1
),

splitter AS (
    SELECT id, splitter
    FROM splitters
    WHERE id = 1
)

INSERT INTO document_chunks SELECT
    nextval('document_chunks_id_seq'::regclass),
    (SELECT id FROM document),
    (SELECT id FROM splitter),
    chunk_index,
    chunk
FROM
    pgml.chunk(
        (SELECT splitter FROM splitter),
        (SELECT document FROM document),
        '{"chunk_size": 2, "chunk_overlap": 2}'
    );

SELECT * FROM document_chunks LIMIT 5;
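
The example above stores chunks in a table, but the new table function can also be called on its own. A minimal sketch, assuming the extension is installed as pgml; the sample text and sizes are illustrative, not part of the commit:

-- Split a short string directly; chunk_index and chunk are the output
-- columns of the pgml.chunk() table function.
SELECT chunk_index, chunk
FROM pgml.chunk(
    'recursive_character',
    'It was the best of times, it was the worst of times.',
    '{"chunk_size": 40, "chunk_overlap": 10}'
);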

pgml-extension/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -18,3 +18,4 @@ torchvision==0.15.2
 tqdm==4.65.0
 transformers==4.29.2
 xgboost==1.7.5
+langchain==0.0.180

pgml-extension/src/api.rs

Lines changed: 19 additions & 3 deletions
@@ -565,7 +565,23 @@ fn load_dataset(
 
 #[pg_extern(immutable, parallel_safe)]
 pub fn embed(transformer: &str, text: &str, kwargs: default!(JsonB, "'{}'")) -> Vec<f32> {
-    crate::bindings::transformers::embed(transformer, &text, &kwargs.0)
+    crate::bindings::transformers::embed(transformer, text, &kwargs.0)
+}
+
+#[pg_extern(immutable, parallel_safe)]
+pub fn chunk(
+    splitter: &str,
+    text: &str,
+    kwargs: default!(JsonB, "'{}'"),
+) -> TableIterator<'static, (name!(chunk_index, i64), name!(chunk, String))> {
+    let chunks = crate::bindings::langchain::chunk(splitter, text, &kwargs.0);
+    let chunks = chunks
+        .into_iter()
+        .enumerate()
+        .map(|(i, chunk)| (i as i64 + 1, chunk))
+        .collect::<Vec<(i64, String)>>();
+
+    TableIterator::new(chunks.into_iter())
 }
 
 #[cfg(feature = "python")]
@@ -575,7 +591,7 @@ pub fn transform_json(
     task: JsonB,
     args: default!(JsonB, "'{}'"),
     inputs: default!(Vec<String>, "ARRAY[]::TEXT[]"),
-    cache: default!(bool, false)
+    cache: default!(bool, false),
 ) -> JsonB {
     JsonB(crate::bindings::transformers::transform(
         &task.0, &args.0, &inputs,
@@ -589,7 +605,7 @@ pub fn transform_string(
     task: String,
     args: default!(JsonB, "'{}'"),
     inputs: default!(Vec<String>, "ARRAY[]::TEXT[]"),
-    cache: default!(bool, false)
+    cache: default!(bool, false),
 ) -> JsonB {
     let mut task_map = HashMap::new();
     task_map.insert("task", task);
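
Because kwargs defaults to '{}', pgml.chunk() can be called with only a splitter name and the text; a hedged sketch (the input string is illustrative, and chunking then falls back to whatever defaults the underlying LangChain splitter uses):

-- Rely on the default kwargs ('{}'), i.e. the LangChain splitter's own defaults.
SELECT chunk_index, chunk
FROM pgml.chunk('recursive_character', 'PostgresML brings machine learning into Postgres.');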

pgml-extension/src/bindings/langchain.py

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
from langchain.text_splitter import (
    CharacterTextSplitter,
    LatexTextSplitter,
    MarkdownTextSplitter,
    NLTKTextSplitter,
    PythonCodeTextSplitter,
    RecursiveCharacterTextSplitter,
    SpacyTextSplitter,
)
import json

SPLITTERS = {
    "character": CharacterTextSplitter,
    "latex": LatexTextSplitter,
    "markdown": MarkdownTextSplitter,
    "nltk": NLTKTextSplitter,
    "python": PythonCodeTextSplitter,
    "recursive_character": RecursiveCharacterTextSplitter,
    "spacy": SpacyTextSplitter,
}


def chunk(splitter, text, args):
    kwargs = json.loads(args)

    if splitter in SPLITTERS:
        return SPLITTERS[splitter](**kwargs).split_text(text)
    else:
        raise ValueError("Unknown splitter: {}".format(splitter))
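
Any key registered in SPLITTERS can be passed as the first argument to pgml.chunk(), and the kwargs JSON is forwarded to the splitter's constructor. A sketch using the markdown splitter (the sample text and sizes are illustrative; the nltk and spacy splitters would additionally need their own Python packages and models installed):

-- Use the markdown splitter registered in SPLITTERS; the kwargs become
-- MarkdownTextSplitter(chunk_size=64, chunk_overlap=0).
SELECT chunk_index, chunk
FROM pgml.chunk(
    'markdown',
    E'# Title\n\nFirst section.\n\n## Subsection\n\nMore text.',
    '{"chunk_size": 64, "chunk_overlap": 0}'
);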

pgml-extension/src/bindings/langchain.rs

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
use once_cell::sync::Lazy;
use pgrx::*;
use pyo3::prelude::*;
use pyo3::types::PyTuple;

static PY_MODULE: Lazy<Py<PyModule>> = Lazy::new(|| {
    Python::with_gil(|py| -> Py<PyModule> {
        let src = include_str!(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/src/bindings/langchain.py"
        ));

        PyModule::from_code(py, src, "", "").unwrap().into()
    })
});

pub fn chunk(splitter: &str, text: &str, kwargs: &serde_json::Value) -> Vec<String> {
    crate::bindings::venv::activate();

    let kwargs = serde_json::to_string(kwargs).unwrap();

    Python::with_gil(|py| -> Vec<String> {
        let chunk: Py<PyAny> = PY_MODULE.getattr(py, "chunk").unwrap().into();

        chunk
            .call1(
                py,
                PyTuple::new(
                    py,
                    &[splitter.into_py(py), text.into_py(py), kwargs.into_py(py)],
                ),
            )
            .unwrap()
            .extract(py)
            .unwrap()
    })
}
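
An unknown splitter name raises ValueError in Python, and the call result is unwrapped on the Rust side, so a query like the following should presumably surface as a Postgres error rather than return rows; a sketch for illustration only:

-- 'not_a_splitter' is not registered in SPLITTERS, so this is expected to error out.
SELECT * FROM pgml.chunk('not_a_splitter', 'some text');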

pgml-extension/src/bindings/mod.rs

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,8 @@ use pgrx::*;
 
 use crate::orm::*;
 
+#[cfg(feature = "python")]
+pub mod langchain;
 pub mod lightgbm;
 pub mod linfa;
 #[cfg(feature = "python")]

pgml-extension/src/bindings/transformers.rs

Lines changed: 1 addition & 5 deletions
@@ -40,11 +40,7 @@ pub fn transform(
             py,
             PyTuple::new(
                 py,
-                &[
-                    task.into_py(py),
-                    args.into_py(py),
-                    inputs.into_py(py),
-                ],
+                &[task.into_py(py), args.into_py(py), inputs.into_py(py)],
             ),
         )
         .unwrap()

pgml-extension/tests/test.sql

Lines changed: 1 addition & 0 deletions
@@ -27,5 +27,6 @@ SELECT pgml.load_dataset('wine');
 \i examples/multi_classification.sql
 \i examples/regression.sql
 \i examples/vectors.sql
+\i examples/chunking.sql
 -- transformers are generally too slow to run in the test suite
 --\i examples/transformers.sql

0 commit comments
