diff --git a/pgml-extension/requirements.txt b/pgml-extension/requirements.txt
index d458507bf..4ade8bf6a 100644
--- a/pgml-extension/requirements.txt
+++ b/pgml-extension/requirements.txt
@@ -1,4 +1,6 @@
 accelerate==0.19.0
+auto-gptq==0.2.2
+ctransformers==0.2.8
 datasets==2.12.0
 deepspeed==0.9.2
 huggingface-hub==0.14.1
diff --git a/pgml-extension/src/bindings/transformers.py b/pgml-extension/src/bindings/transformers.py
index 70eb35aee..b7d0af968 100644
--- a/pgml-extension/src/bindings/transformers.py
+++ b/pgml-extension/src/bindings/transformers.py
@@ -81,6 +81,47 @@ def ensure_device(kwargs):
         else:
             kwargs["device"] = "cpu"
 
+
+class GPTQPipeline(object):
+    def __init__(self, model_name, **task):
+        import auto_gptq
+        from huggingface_hub import snapshot_download
+        model_path = snapshot_download(model_name)
+
+        self.model = auto_gptq.AutoGPTQForCausalLM.from_quantized(model_path, **task)
+        if "use_fast_tokenizer" in task:
+            self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=task.pop("use_fast_tokenizer"))
+        else:
+            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+        self.task = "text-generation"
+
+    def __call__(self, inputs, **kwargs):
+        outputs = []
+        for input in inputs:
+            tokens = self.tokenizer(input, return_tensors="pt").to(self.model.device).input_ids
+            token_ids = self.model.generate(input_ids=tokens, **kwargs)[0]
+            outputs.append(self.tokenizer.decode(token_ids))
+        return outputs
+
+
+class GGMLPipeline(object):
+    def __init__(self, model_name, **task):
+        import ctransformers
+
+        task.pop("model")
+        task.pop("task")
+        task.pop("device")
+        self.model = ctransformers.AutoModelForCausalLM.from_pretrained(model_name, **task)
+        self.tokenizer = None
+        self.task = "text-generation"
+
+    def __call__(self, inputs, **kwargs):
+        outputs = []
+        for input in inputs:
+            outputs.append(self.model(input, **kwargs))
+        return outputs
+
+
 def transform(task, args, inputs):
     task = orjson.loads(task)
     args = orjson.loads(args)
@@ -90,21 +131,25 @@ def transform(task, args, inputs):
     if key not in __cache_transform_pipeline_by_task:
         ensure_device(task)
         convert_dtype(task)
-        pipe = transformers.pipeline(**task)
-        if pipe.tokenizer is None:
-            pipe.tokenizer = AutoTokenizer.from_pretrained(pipe.model.name_or_path)
+        model_name = task.get("model", None)
+        model_name = model_name.lower() if model_name else None
+        if model_name and "-ggml" in model_name:
+            pipe = GGMLPipeline(model_name, **task)
+        elif model_name and "-gptq" in model_name:
+            pipe = GPTQPipeline(model_name, **task)
+        else:
+            pipe = transformers.pipeline(**task)
+            if pipe.tokenizer is None:
+                pipe.tokenizer = AutoTokenizer.from_pretrained(pipe.model.name_or_path)
         __cache_transform_pipeline_by_task[key] = pipe
 
     pipe = __cache_transform_pipeline_by_task[key]
 
     if pipe.task == "question-answering":
         inputs = [orjson.loads(input) for input in inputs]
 
-    convert_eos_token(pipe.tokenizer, args)
-    results = pipe(inputs, **args)
-
-    return orjson.dumps(results, default=orjson_default).decode()
+    return orjson.dumps(pipe(inputs, **args), default=orjson_default).decode()
 
 
 def embed(transformer, inputs, kwargs):
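
For reviewers, a minimal sketch (not part of the patch) of how the new wrappers are exercised: transform() now routes any task whose model name contains "-ggml" or "-gptq" to GGMLPipeline or GPTQPipeline instead of transformers.pipeline(). The model repo ids and extra kwargs below are placeholders I picked for illustration, and the snippet assumes the patched module's class definitions are in scope (e.g. pasted at the bottom of the patched transformers.py for a quick manual test).

# Illustration only; placeholder model names, requires auto-gptq and ctransformers installed.

# GGML route: __init__ pops "model", "task" and "device" and forwards the rest
# to ctransformers.AutoModelForCausalLM.from_pretrained().
ggml = GGMLPipeline(
    "TheBloke/some-7B-GGML",          # placeholder Hugging Face repo id
    model="TheBloke/some-7B-GGML",    # popped before loading
    task="text-generation",           # popped before loading
    device="cpu",                     # popped before loading
    model_type="llama",               # forwarded to ctransformers
)
print(ggml(["Tell me about PostgreSQL."], max_new_tokens=32))

# GPTQ route: the snapshot is downloaded from the Hub, then loaded with
# AutoGPTQForCausalLM.from_quantized(); generation kwargs go to model.generate().
gptq = GPTQPipeline("TheBloke/some-7B-GPTQ", device="cuda:0")
print(gptq(["Tell me about PostgreSQL."], max_new_tokens=32))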