Content-Length: 522164 | pFad | http://github.com/postgresml/postgresml/pull/655/files

Add LangChain splitters by levkk · Pull Request #655 · postgresml/postgresml · GitHub
Skip to content

Add LangChain splitters #655

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
May 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions pgml-extension/examples/chunking.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
-- Chunk text for LLM embeddings and vectorization.

-- Source documents to be split into chunks.
-- IF EXISTS makes the example idempotent: it runs cleanly on a fresh
-- database (e.g. from the test suite) and on re-runs alike.
DROP TABLE IF EXISTS documents CASCADE;
CREATE TABLE documents (
    id BIGSERIAL PRIMARY KEY,
    document TEXT NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Registry of text splitters; the name must match one accepted by pgml.chunk().
DROP TABLE IF EXISTS splitters CASCADE;
CREATE TABLE splitters (
    id BIGSERIAL PRIMARY KEY,
    splitter VARCHAR NOT NULL DEFAULT 'recursive_character'
);

-- One row per chunk produced from (document, splitter).
DROP TABLE IF EXISTS document_chunks CASCADE;
CREATE TABLE document_chunks(
    id BIGSERIAL PRIMARY KEY,
    document_id BIGINT NOT NULL REFERENCES documents(id),
    splitter_id BIGINT NOT NULL REFERENCES splitters(id),
    chunk_index BIGINT NOT NULL,
    chunk VARCHAR
);

INSERT INTO documents VALUES (
    1,
    'It was the best of times, it was the worst of times, it was the age of wisdom,
it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light,
it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us,
we had nothing before us, we were all going direct to Heaven, we were all going direct the other way—in short, the period was so far like
the present period, that some of its noisiest authorities insisted on its being received, for good or for evil, in the superlative degree of comparison only.',
    NOW()
);

INSERT INTO splitters VALUES (1, 'recursive_character');

-- Split document 1 with splitter 1 and persist each chunk with its 1-based index.
WITH document AS (
    SELECT id, document
    FROM documents
    WHERE id = 1
),

splitter AS (
    SELECT id, splitter
    FROM splitters
    WHERE id = 1
)

INSERT INTO document_chunks SELECT
    nextval('document_chunks_id_seq'::regclass),
    (SELECT id FROM document),
    (SELECT id FROM splitter),
    chunk_index,
    chunk
FROM
    pgml.chunk(
        (SELECT splitter FROM splitter),
        (SELECT document FROM document),
        '{"chunk_size": 2, "chunk_overlap": 2}'
    );

SELECT * FROM document_chunks LIMIT 5;
1 change: 1 addition & 0 deletions pgml-extension/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ torchvision==0.15.2
tqdm==4.65.0
transformers==4.29.2
xgboost==1.7.5
langchain==0.0.180
22 changes: 19 additions & 3 deletions pgml-extension/src/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,23 @@ fn load_dataset(

/// Compute an embedding vector for `text` with the given `transformer` model.
///
/// `kwargs` is forwarded as-is to the underlying Python transformers binding.
/// NOTE(review): the scraped diff showed both the old (`&text`) and new (`text`)
/// call lines; only the updated call is kept — `text` is already a `&str`,
/// so taking another reference was redundant.
#[pg_extern(immutable, parallel_safe)]
pub fn embed(transformer: &str, text: &str, kwargs: default!(JsonB, "'{}'")) -> Vec<f32> {
    crate::bindings::transformers::embed(transformer, text, &kwargs.0)
}

/// Split `text` into chunks with the named LangChain `splitter`.
///
/// Returns one row per chunk: a 1-based `chunk_index` and the chunk text.
/// `kwargs` is passed through to the splitter constructor
/// (e.g. `{"chunk_size": 1000, "chunk_overlap": 40}`).
#[pg_extern(immutable, parallel_safe)]
pub fn chunk(
    splitter: &str,
    text: &str,
    kwargs: default!(JsonB, "'{}'"),
) -> TableIterator<'static, (name!(chunk_index, i64), name!(chunk, String))> {
    // The Vec returned by the binding is owned, so its `into_iter` adaptor
    // already satisfies the `'static` bound — no need to collect into an
    // intermediate Vec<(i64, String)> first (clippy: needless_collect).
    let rows = crate::bindings::langchain::chunk(splitter, text, &kwargs.0)
        .into_iter()
        .enumerate()
        .map(|(i, chunk)| (i as i64 + 1, chunk));

    TableIterator::new(rows)
}

#[cfg(feature = "python")]
Expand All @@ -575,7 +591,7 @@ pub fn transform_json(
task: JsonB,
args: default!(JsonB, "'{}'"),
inputs: default!(Vec<String>, "ARRAY[]::TEXT[]"),
cache: default!(bool, false)
cache: default!(bool, false),
) -> JsonB {
JsonB(crate::bindings::transformers::transform(
&task.0, &args.0, &inputs,
Expand All @@ -589,7 +605,7 @@ pub fn transform_string(
task: String,
args: default!(JsonB, "'{}'"),
inputs: default!(Vec<String>, "ARRAY[]::TEXT[]"),
cache: default!(bool, false)
cache: default!(bool, false),
) -> JsonB {
let mut task_map = HashMap::new();
task_map.insert("task", task);
Expand Down
29 changes: 29 additions & 0 deletions pgml-extension/src/bindings/langchain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from langchain.text_splitter import (
CharacterTextSplitter,
LatexTextSplitter,
MarkdownTextSplitter,
NLTKTextSplitter,
PythonCodeTextSplitter,
RecursiveCharacterTextSplitter,
SpacyTextSplitter,
)
import json

# Map from the splitter name accepted by the SQL API (pgml.chunk) to the
# LangChain text-splitter class that implements it.
SPLITTERS = {
"character": CharacterTextSplitter,
"latex": LatexTextSplitter,
"markdown": MarkdownTextSplitter,
"nltk": NLTKTextSplitter,
"python": PythonCodeTextSplitter,
"recursive_character": RecursiveCharacterTextSplitter,
"spacy": SpacyTextSplitter,
}


def chunk(splitter, text, args):
    """Split ``text`` into chunks with the named LangChain splitter.

    Args:
        splitter: Name of the splitter; must be a key of ``SPLITTERS``.
        text: The document text to split.
        args: JSON string of keyword arguments forwarded to the splitter
            constructor (e.g. ``{"chunk_size": 1000}``).

    Returns:
        A list of chunk strings.

    Raises:
        ValueError: If ``splitter`` is not a known splitter name.
    """
    kwargs = json.loads(args)

    # Single dict lookup instead of `in` followed by indexing.
    splitter_class = SPLITTERS.get(splitter)
    if splitter_class is None:
        raise ValueError("Unknown splitter: {}".format(splitter))

    return splitter_class(**kwargs).split_text(text)
37 changes: 37 additions & 0 deletions pgml-extension/src/bindings/langchain.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
use once_cell::sync::Lazy;
use pgrx::*;
use pyo3::prelude::*;
use pyo3::types::PyTuple;

/// Lazily-initialized handle to the bundled `langchain.py` helper module.
/// Compiled once on first use and shared for the lifetime of the process.
static PY_MODULE: Lazy<Py<PyModule>> = Lazy::new(|| {
    Python::with_gil(|py| -> Py<PyModule> {
        // The Python source is embedded into the binary at build time.
        let src = include_str!(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/src/bindings/langchain.py"
        ));

        // A failure here means the bundled source is invalid or langchain
        // is missing from the environment — a broken build, not user error.
        PyModule::from_code(py, src, "", "")
            .expect("failed to compile bundled langchain.py")
            .into()
    })
});

/// Split `text` into chunks with the named LangChain splitter.
///
/// `kwargs` is serialized to JSON and forwarded to the Python side, which
/// passes it to the splitter constructor. Panics (with context) if the
/// embedded Python module misbehaves, mirroring the other bindings.
pub fn chunk(splitter: &str, text: &str, kwargs: &serde_json::Value) -> Vec<String> {
    crate::bindings::venv::activate();

    // The Python helper expects kwargs as a JSON string it re-parses.
    let kwargs =
        serde_json::to_string(kwargs).expect("serde_json::Value always serializes to a string");

    Python::with_gil(|py| -> Vec<String> {
        let chunk: Py<PyAny> = PY_MODULE
            .getattr(py, "chunk")
            .expect("langchain.py must define a chunk() function")
            .into();

        chunk
            .call1(
                py,
                PyTuple::new(
                    py,
                    &[splitter.into_py(py), text.into_py(py), kwargs.into_py(py)],
                ),
            )
            .expect("langchain chunk() raised a Python exception")
            .extract(py)
            .expect("langchain chunk() must return a list of strings")
    })
}
2 changes: 2 additions & 0 deletions pgml-extension/src/bindings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ use pgrx::*;

use crate::orm::*;

#[cfg(feature = "python")]
pub mod langchain;
pub mod lightgbm;
pub mod linfa;
#[cfg(feature = "python")]
Expand Down
6 changes: 1 addition & 5 deletions pgml-extension/src/bindings/transformers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,7 @@ pub fn transform(
py,
PyTuple::new(
py,
&[
task.into_py(py),
args.into_py(py),
inputs.into_py(py),
],
&[task.into_py(py), args.into_py(py), inputs.into_py(py)],
),
)
.unwrap()
Expand Down
1 change: 1 addition & 0 deletions pgml-extension/tests/test.sql
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,6 @@ SELECT pgml.load_dataset('wine');
\i examples/multi_classification.sql
\i examples/regression.sql
\i examples/vectors.sql
\i examples/chunking.sql
-- transformers are generally too slow to run in the test suite
--\i examples/transformers.sql








ApplySandwichStrip

pFad - (p)hone/(F)rame/(a)nonymizer/(d)eclutterfier!      Saves Data!


--- a PPN by Garber Painting Akron. With Image Size Reduction included!

Fetched URL: http://github.com/postgresml/postgresml/pull/655/files

Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy