mindsdb vs postgresml blog post #704

Merged: 12 commits, merged Jun 8, 2023
2 changes: 2 additions & 0 deletions pgml-dashboard/src/api/docs.rs
@@ -80,6 +80,8 @@ async fn blog_handler<'a>(path: PathBuf, cluster: Cluster) -> Result<ResponseOk,
         cluster,
         &path,
         vec![
+            NavLink::new("MindsDB vs PostgresML")
+                .href("/blog/mindsdb-vs-postgresml"),
             NavLink::new("Introducing PostgresML Python SDK: Build End-to-End Vector Search Applications without OpenAI and Pinecone")
                 .href("/blog/introducing-postgresml-python-sdk-build-end-to-end-vector-search-applications-without-openai-and-pinecone"),
             NavLink::new("PostgresML raises $4.7M to launch serverless AI application databases based on Postgres")
313 changes: 313 additions & 0 deletions pgml-dashboard/static/blog/mindsdb-vs-postgresml.md

Large diffs are not rendered by default.

Binary file added pgml-dashboard/static/images/blog/mindsdb.png
2 changes: 1 addition & 1 deletion pgml-dashboard/templates/layout/nav/top.html
@@ -21,7 +21,7 @@
           <a class="nav-link" href="/docs/guides/setup/quick_start_with_docker/">Docs</a>
         </li>
         <li class="nav-item d-flex align-items-center">
-          <a class="nav-link" href="/blog/postgresml-raises-4.7M-to-launch-serverless-ai-application-databases-based-on-postgres">Blog</a>
+          <a class="nav-link" href="/blog/mindsdb-vs-postgresml">Blog</a>
         </li>
         <li class="nav-item d-flex align-items-center">
           <a class="nav-link" href="https://github.com/postgresml/postgresml" target="_blank">Open Source</a>
1 change: 1 addition & 0 deletions pgml-extension/requirements.txt
@@ -4,6 +4,7 @@ deepspeed==0.9.2
 huggingface-hub==0.14.1
 InstructorEmbedding==1.0.0
 lightgbm==3.3.5
+orjson==3.9.0
 pandas==2.0.1
 rich==13.3.5
 rouge==1.0.1
10 changes: 5 additions & 5 deletions pgml-extension/src/api.rs
@@ -577,7 +577,7 @@ pub fn embed_batch(
     inputs: Vec<&str>,
     kwargs: default!(JsonB, "'{}'"),
 ) -> Vec<Vec<f32>> {
-    crate::bindings::transformers::embed(transformer, &inputs, &kwargs.0)
+    crate::bindings::transformers::embed(transformer, inputs, &kwargs.0)
 }
 
 #[pg_extern(immutable, parallel_safe)]
@@ -602,11 +602,11 @@ pub fn chunk(
 pub fn transform_json(
     task: JsonB,
     args: default!(JsonB, "'{}'"),
-    inputs: default!(Vec<String>, "ARRAY[]::TEXT[]"),
+    inputs: default!(Vec<&str>, "ARRAY[]::TEXT[]"),
     cache: default!(bool, false),
 ) -> JsonB {
     JsonB(crate::bindings::transformers::transform(
-        &task.0, &args.0, &inputs,
+        &task.0, &args.0, inputs,
     ))
 }
 
@@ -616,14 +616,14 @@ pub fn transform_json(
 pub fn transform_string(
     task: String,
     args: default!(JsonB, "'{}'"),
-    inputs: default!(Vec<String>, "ARRAY[]::TEXT[]"),
+    inputs: default!(Vec<&str>, "ARRAY[]::TEXT[]"),
     cache: default!(bool, false),
 ) -> JsonB {
     let mut task_map = HashMap::new();
     task_map.insert("task", task);
     let task_json = json!(task_map);
     JsonB(crate::bindings::transformers::transform(
-        &task_json, &args.0, &inputs,
+        &task_json, &args.0, inputs,
     ))
 }
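
Note: the `default!(Vec<&str>, "ARRAY[]::TEXT[]")` signatures above keep the SQL-level parameter a plain `TEXT[]`, so existing callers are unaffected; only the Rust-side ownership changes. A minimal usage sketch from Python, assuming a reachable PostgresML database and the psycopg2 driver (both assumptions, not part of this diff):

```python
# Hypothetical caller: pgml.transform() still takes a JSONB task and a TEXT[] of inputs.
import psycopg2

conn = psycopg2.connect("postgresql://postgres@localhost:5433/pgml")  # assumed DSN
with conn.cursor() as cur:
    cur.execute(
        """
        SELECT pgml.transform(
            task   => '{"task": "text-classification"}'::JSONB,
            inputs => ARRAY['PostgresML is great!']
        )
        """
    )
    print(cur.fetchone()[0])  # JSON result produced by the Python pipeline
conn.close()
```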
44 changes: 20 additions & 24 deletions pgml-extension/src/bindings/transformers.py
@@ -1,12 +1,12 @@
-import os
-import json
 import math
+import os
 import shutil
 import time
-import numpy as np
 
 import datasets
 from InstructorEmbedding import INSTRUCTOR
+import numpy
+import orjson
 from rouge import Rouge
 from sacrebleu.metrics import BLEU
 from sentence_transformers import SentenceTransformer
@@ -42,7 +42,6 @@
 __cache_sentence_transformer_by_name = {}
 __cache_transform_pipeline_by_task = {}
 
-
 DTYPE_MAP = {
     "uint8": torch.uint8,
     "int8": torch.int8,
@@ -58,6 +57,10 @@
     "bool": torch.bool,
 }
 
+def orjson_default(obj):
+    if isinstance(obj, numpy.float32):
+        return float(obj)
+    raise TypeError
 
 def convert_dtype(kwargs):
     if "torch_dtype" in kwargs:
@@ -78,18 +81,10 @@ def ensure_device(kwargs):
     else:
         kwargs["device"] = "cpu"
 
-
-class NumpyJSONEncoder(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, np.float32):
-            return float(obj)
-        return super().default(obj)
-
-
 def transform(task, args, inputs):
-    task = json.loads(task)
-    args = json.loads(args)
-    inputs = json.loads(inputs)
+    task = orjson.loads(task)
+    args = orjson.loads(args)
+    inputs = orjson.loads(inputs)
 
     key = ",".join([f"{key}:{val}" for (key, val) in sorted(task.items())])
     if key not in __cache_transform_pipeline_by_task:
@@ -103,17 +98,18 @@ def transform(task, args, inputs):
     pipe = __cache_transform_pipeline_by_task[key]
 
     if pipe.task == "question-answering":
-        inputs = [json.loads(input) for input in inputs]
+        inputs = [orjson.loads(input) for input in inputs]
 
     convert_eos_token(pipe.tokenizer, args)
 
-    return json.dumps(pipe(inputs, **args), cls=NumpyJSONEncoder)
+    results = pipe(inputs, **args)
+
+    return orjson.dumps(results, default=orjson_default).decode()
 
 
 def embed(transformer, inputs, kwargs):
-
-    inputs = json.loads(inputs)
-    kwargs = json.loads(kwargs)
+    kwargs = orjson.loads(kwargs)
 
     ensure_device(kwargs)
     instructor = transformer.startswith("hkunlp/instructor")
+
@@ -137,7 +133,7 @@ def embed(transformer, inputs, kwargs):
 
 
 def load_dataset(name, subset, limit: None, kwargs: "{}"):
-    kwargs = json.loads(kwargs)
+    kwargs = orjson.loads(kwargs)
 
     if limit:
         dataset = datasets.load_dataset(
@@ -164,7 +160,7 @@ def load_dataset(name, subset, limit: None, kwargs: "{}"):
     else:
         raise PgMLException(f"Unhandled dataset type: {type(dataset)}")
 
-    return json.dumps({"data": data, "types": types})
+    return orjson.dumps({"data": data, "types": types}).decode()
 
 
 def tokenize_text_classification(tokenizer, max_length, x, y):
@@ -421,7 +417,7 @@ def compute_metrics_text_generation(model, tokenizer, hyperparams, y):
 
 
 def tune(task, hyperparams, path, x_train, x_test, y_train, y_test):
-    hyperparams = json.loads(hyperparams)
+    hyperparams = orjson.loads(hyperparams)
     model_name = hyperparams.pop("model_name")
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 
@@ -562,7 +558,7 @@ def generate(model_id, data, config):
     result = get_transformer_by_model_id(model_id)
     tokenizer = result["tokenizer"]
     model = result["model"]
-    config = json.loads(config)
+    config = orjson.loads(config)
     all_preds = []
 
     batch_size = 1  # TODO hyperparams
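
Two properties of orjson that the rewrite above leans on, worth calling out for reviewers: `orjson.dumps()` returns `bytes`, hence the trailing `.decode()`, and it raises `TypeError` for types it does not know, such as `numpy.float32`, which is why `orjson_default` replaces the old `NumpyJSONEncoder`. A standalone sketch with made-up values, not taken from the diff:

```python
# Standalone illustration of the serialization pattern used in transformers.py above.
import numpy
import orjson


def orjson_default(obj):
    # Hugging Face pipelines return numpy scalars (e.g. classification scores);
    # orjson does not serialize them natively, so coerce to a plain float.
    if isinstance(obj, numpy.float32):
        return float(obj)
    raise TypeError


results = [{"label": "POSITIVE", "score": numpy.float32(0.9987)}]

payload = orjson.dumps(results, default=orjson_default)  # bytes, not str
print(payload.decode())  # JSON text handed back to the Rust caller

# orjson.loads() accepts both str and bytes, so the JSON strings produced by
# serde_json on the Rust side parse unchanged.
assert orjson.loads(payload) == orjson.loads(payload.decode())
```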
7 changes: 3 additions & 4 deletions pgml-extension/src/bindings/transformers.rs
@@ -24,13 +24,13 @@ static PY_MODULE: Lazy<Py<PyModule>> = Lazy::new(|| {
 pub fn transform(
     task: &serde_json::Value,
     args: &serde_json::Value,
-    inputs: &Vec<String>,
+    inputs: Vec<&str>,
 ) -> serde_json::Value {
     crate::bindings::venv::activate();
 
     let task = serde_json::to_string(task).unwrap();
     let args = serde_json::to_string(args).unwrap();
-    let inputs = serde_json::to_string(inputs).unwrap();
+    let inputs = serde_json::to_string(&inputs).unwrap();
 
     let results = Python::with_gil(|py| -> String {
         let transform: Py<PyAny> = PY_MODULE.getattr(py, "transform").unwrap().into();
@@ -56,11 +56,10 @@ pub fn transform(
     serde_json::from_str(&results).unwrap()
 }
 
-pub fn embed(transformer: &str, inputs: &[&str], kwargs: &serde_json::Value) -> Vec<Vec<f32>> {
+pub fn embed(transformer: &str, inputs: Vec<&str>, kwargs: &serde_json::Value) -> Vec<Vec<f32>> {
     crate::bindings::venv::activate();
 
     let kwargs = serde_json::to_string(kwargs).unwrap();
-    let inputs = serde_json::to_string(&inputs).unwrap();
     Python::with_gil(|py| -> Vec<Vec<f32>> {
         let embed: Py<PyAny> = PY_MODULE.getattr(py, "embed").unwrap().into();
         embed
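
As I read the two sides together, `embed()` no longer round-trips `inputs` through JSON at all: the Rust side passes the `Vec<&str>` straight through pyo3 as a Python list, and only `kwargs` still arrives as a serde_json string. A small sketch of that contract, with invented test values:

```python
# Sketch of the Python-side embed() contract after this change (test values invented).
import orjson


def embed_contract(transformer, inputs, kwargs):
    assert isinstance(inputs, list)      # arrives as a real list; no json.loads(inputs)
    kwargs = orjson.loads(kwargs)        # still JSON text produced by serde_json
    return transformer, inputs, kwargs


print(embed_contract("intfloat/e5-small", ["hello world"], '{"device": "cpu"}'))
```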