diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0843fc0e4..4c63d53cd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,7 +36,7 @@ jobs: python3-pip \ python3 \ lld - sudo pip3 install -r requirements.linux.txt + sudo pip3 install -r requirements.linux.txt --no-cache-dir - name: Cache dependencies uses: buildjet/cache@v3 if: steps.pgml_extension_changed.outputs.PGML_EXTENSION_CHANGED_FILES != '0' diff --git a/pgml-extension/requirements.linux.txt b/pgml-extension/requirements.linux.txt index 166689ca9..cea57b7d9 100644 --- a/pgml-extension/requirements.linux.txt +++ b/pgml-extension/requirements.linux.txt @@ -1,70 +1,83 @@ -accelerate==0.27.2 -aiohttp==3.9.1 +accelerate==0.30.1 +aiohttp==3.9.5 aiosignal==1.3.1 annotated-types==0.6.0 -anyio==4.2.0 -appdirs==1.4.4 +anyio==4.3.0 async-timeout==4.0.3 -attrs==23.1.0 -auto-gptq==0.6.0 -bitsandbytes==0.41.3.post2 -black==24.1.1 -catboost==1.2.2 -certifi==2023.11.17 +attrs==23.2.0 +auto_gptq==0.7.1 +bitsandbytes==0.43.1 +catboost==1.2.5 +certifi==2024.2.2 charset-normalizer==3.3.2 click==8.1.7 +cloudpickle==3.0.0 +cmake==3.29.2 colorama==0.4.6 coloredlogs==15.0.1 -contourpy==1.2.0 +contourpy==1.2.1 ctransformers==0.2.27 cycler==0.12.1 -dataclasses-json==0.6.3 -datasets==2.15.0 -deepspeed==0.12.5 +dataclasses-json==0.6.6 +datasets==2.16.1 +deepspeed==0.14.2 dill==0.3.7 -docker-pycreds==0.4.0 -docstring-parser==0.15 -einops==0.7.0 -evaluate==0.4.1 -exceptiongroup==1.2.0 -filelock==3.13.1 -fonttools==4.47.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.6.1 +einops==0.8.0 +email_validator==2.1.1 +exceptiongroup==1.2.1 +fastapi==0.111.0 +fastapi-cli==0.0.3 +filelock==3.14.0 +fonttools==4.51.0 frozenlist==1.4.1 fsspec==2023.10.0 -gekko==1.0.6 -gitdb==4.0.11 -GitPython==3.1.41 -graphviz==0.20.1 -greenlet==3.0.2 +gekko==1.1.1 +graphviz==0.20.3 +greenlet==3.0.3 +h11==0.14.0 hjson==3.1.0 -huggingface-hub==0.19.4 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +huggingface-hub==0.23.0 humanfriendly==10.0 -idna==3.6 -InstructorEmbedding==1.0.1 -Jinja2==3.1.2 -joblib==1.3.2 +idna==3.7 +interegular==0.3.3 +Jinja2==3.1.4 +joblib==1.4.2 jsonpatch==1.33 jsonpointer==2.4 +jsonschema==4.22.0 +jsonschema-specifications==2023.12.1 kiwisolver==1.4.5 -langchain==0.0.351 -langchain-community==0.0.4 -langchain-core==0.1.1 -langsmith==0.0.72 -lightgbm==4.1.0 -lxml==4.9.3 +langchain==0.1.19 +langchain-community==0.0.38 +langchain-core==0.1.52 +langchain-text-splitters==0.0.1 +langsmith==0.1.56 +lark==1.1.9 +lightgbm==4.3.0 +llvmlite==0.42.0 +lm-format-enforcer==0.9.8 +lxml==5.2.1 markdown-it-py==3.0.0 -MarkupSafe==2.1.3 -marshmallow==3.20.1 -matplotlib==3.8.2 +MarkupSafe==2.1.5 +marshmallow==3.21.2 +matplotlib==3.8.4 mdurl==0.1.2 mpmath==1.3.0 -multidict==6.0.4 +msgpack==1.0.8 +multidict==6.0.5 multiprocess==0.70.15 mypy-extensions==1.0.0 -networkx==3.2.1 +nest-asyncio==1.6.0 +networkx==3.3 ninja==1.11.1.1 -nltk==3.8.1 -numpy==1.26.2 +numba==0.59.1 +numpy==1.26.4 nvidia-cublas-cu12==12.1.3.1 nvidia-cuda-cupti-cu12==12.1.105 nvidia-cuda-nvrtc-cu12==12.1.105 @@ -74,72 +87,82 @@ nvidia-cufft-cu12==11.0.2.54 nvidia-curand-cu12==10.3.2.106 nvidia-cusolver-cu12==11.4.5.107 nvidia-cusparse-cu12==12.1.0.106 -nvidia-nccl-cu12==2.18.1 -nvidia-nvjitlink-cu12==12.3.101 +nvidia-ml-py==12.550.52 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.4.127 nvidia-nvtx-cu12==12.1.105 -optimum==1.16.1 -orjson==3.9.10 +openai==1.28.0 +optimum==1.19.2 +orjson==3.10.3 +outlines==0.0.34 packaging==23.2 -pandas==2.1.4 -pathspec==0.12.1 -peft==0.7.1 -Pillow==10.1.0 -platformdirs==4.2.0 -plotly==5.18.0 +pandas==2.2.2 +peft==0.10.0 +pillow==10.3.0 +plotly==5.22.0 portalocker==2.8.2 -protobuf==4.25.1 -psutil==5.9.7 +prometheus-fastapi-instrumentator==7.0.0 +prometheus_client==0.20.0 +protobuf==5.26.1 +psutil==5.9.8 py-cpuinfo==9.0.0 pyarrow==11.0.0 pyarrow-hotfix==0.6 -pydantic==2.5.2 -pydantic_core==2.14.5 -Pygments==2.17.2 +pydantic==2.7.1 +pydantic_core==2.18.2 +Pygments==2.18.0 pynvml==11.5.0 -pyparsing==3.1.1 -python-dateutil==2.8.2 -pytz==2023.3.post1 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-multipart==0.0.9 +pytz==2024.1 PyYAML==6.0.1 -regex==2023.10.3 +ray==2.21.0 +referencing==0.35.1 +regex==2024.5.10 requests==2.31.0 -responses==0.18.0 rich==13.7.1 rouge==1.0.1 -sacrebleu==2.4.0 +rpds-py==0.18.1 +sacrebleu==2.4.2 sacremoses==0.1.1 -safetensors==0.4.1 -scikit-learn==1.3.2 -scipy==1.11.4 -sentence-transformers==2.5.1 -sentencepiece==0.1.99 -sentry-sdk==1.40.2 -setproctitle==1.3.3 -shtab==1.6.5 +safetensors==0.4.3 +scikit-learn==1.4.2 +scipy==1.13.0 +sentence-transformers==2.7.0 +sentencepiece==0.2.0 +shellingham==1.5.4 six==1.16.0 -smmap==5.0.1 -sniffio==1.3.0 -SQLAlchemy==2.0.23 +sniffio==1.3.1 +SQLAlchemy==2.0.30 +starlette==0.37.2 sympy==1.12 tabulate==0.9.0 -tenacity==8.2.3 -threadpoolctl==3.2.0 -tokenizers==0.15.0 -tomli==2.0.1 -torch==2.1.2 -torchaudio==2.1.2 -torchvision==0.16.2 -tqdm==4.66.1 -transformers==4.39.3 -transformers-stream-generator==0.0.4 -triton==2.1.0 -trl==0.7.10 +tenacity==8.3.0 +threadpoolctl==3.5.0 +tiktoken==0.6.0 +tokenizers==0.19.1 +torch==2.3.0 +torchaudio==2.3.0 +torchvision==0.18.0 +tqdm==4.66.4 +transformers==4.40.2 +transformers-stream-generator==0.0.5 +triton==2.3.0 +typer==0.12.3 typing-inspect==0.9.0 -typing_extensions==4.9.0 -tyro==0.7.2 -tzdata==2023.3 -urllib3==2.1.0 -xformers==0.0.23.post1 -xgboost==2.0.2 +typing_extensions==4.11.0 +tzdata==2024.1 +ujson==5.9.0 +urllib3==2.2.1 +uvicorn==0.29.0 +uvloop==0.19.0 +vllm==0.4.2 +vllm-nccl-cu12==2.18.1.0.4.0 +watchfiles==0.21.0 +websockets==12.0 +xformers==0.0.26.post1 +xgboost==2.0.3 xxhash==3.4.1 yarl==1.9.4 -vllm==0.4.0.post1 diff --git a/pgml-extension/requirements.macos.txt b/pgml-extension/requirements.macos.txt index 4cd2dc81b..579d1a63d 100644 --- a/pgml-extension/requirements.macos.txt +++ b/pgml-extension/requirements.macos.txt @@ -1,104 +1,101 @@ -accelerate==0.25.0 -aiohttp==3.9.1 +accelerate==0.30.1 +aiohttp==3.9.5 aiosignal==1.3.1 annotated-types==0.6.0 -anyio==4.2.0 -attrs==23.1.0 -bitsandbytes==0.41.3.post2 -catboost==1.2.2 -certifi==2023.11.17 +attrs==23.2.0 +bitsandbytes==0.42.0 +catboost==1.2.5 +certifi==2024.2.2 charset-normalizer==3.3.2 click==8.1.7 colorama==0.4.6 coloredlogs==15.0.1 -contourpy==1.2.0 +contourpy==1.2.1 ctransformers==0.2.27 cycler==0.12.1 -dataclasses-json==0.6.3 -datasets==2.15.0 -deepspeed==0.12.5 +dataclasses-json==0.6.6 +datasets==2.16.1 +deepspeed==0.14.2 dill==0.3.7 -einops==0.7.0 -filelock==3.13.1 -fonttools==4.47.0 +einops==0.8.0 +filelock==3.14.0 +fonttools==4.51.0 frozenlist==1.4.1 fsspec==2023.10.0 -graphviz==0.20.1 +graphviz==0.20.3 hjson==3.1.0 -huggingface-hub==0.19.4 +huggingface-hub==0.23.0 humanfriendly==10.0 -idna==3.6 -InstructorEmbedding==1.0.1 -Jinja2==3.1.2 -joblib==1.3.2 +idna==3.7 +Jinja2==3.1.4 +joblib==1.4.2 jsonpatch==1.33 jsonpointer==2.4 kiwisolver==1.4.5 -langchain==0.0.351 -langchain-community==0.0.4 -langchain-core==0.1.1 -langsmith==0.0.72 -lightgbm==4.1.0 -lxml==4.9.3 -MarkupSafe==2.1.3 -marshmallow==3.20.1 -matplotlib==3.8.2 +langchain==0.1.19 +langchain-community==0.0.38 +langchain-core==0.1.52 +langchain-text-splitters==0.0.1 +langsmith==0.1.56 +lightgbm==4.3.0 +lxml==5.2.1 +MarkupSafe==2.1.5 +marshmallow==3.21.2 +matplotlib==3.8.4 mpmath==1.3.0 -multidict==6.0.4 +multidict==6.0.5 multiprocess==0.70.15 mypy-extensions==1.0.0 -networkx==3.2.1 +networkx==3.3 ninja==1.11.1.1 -nltk==3.8.1 -numpy==1.26.2 -optimum==1.16.1 -orjson==3.9.10 +numpy==1.26.4 +optimum==1.19.2 +orjson==3.10.3 packaging==23.2 -pandas==2.1.4 -peft==0.7.1 -Pillow==10.1.0 -plotly==5.18.0 +pandas==2.2.2 +peft==0.10.0 +pillow==10.3.0 +plotly==5.22.0 portalocker==2.8.2 -protobuf==4.25.1 -psutil==5.9.7 +protobuf==5.26.1 +psutil==5.9.8 py-cpuinfo==9.0.0 pyarrow==11.0.0 pyarrow-hotfix==0.6 -pydantic==2.5.2 -pydantic_core==2.14.5 +pydantic==2.7.1 +pydantic_core==2.18.2 pynvml==11.5.0 -pyparsing==3.1.1 -python-dateutil==2.8.2 -pytz==2023.3.post1 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +pytz==2024.1 PyYAML==6.0.1 -regex==2023.10.3 +regex==2024.5.10 requests==2.31.0 rouge==1.0.1 -sacrebleu==2.4.0 +sacrebleu==2.4.2 sacremoses==0.1.1 -safetensors==0.4.1 -scikit-learn==1.3.2 -scipy==1.11.4 -sentence-transformers==2.5.1 -sentencepiece==0.1.99 +safetensors==0.4.3 +scikit-learn==1.4.2 +scipy==1.13.0 +sentence-transformers==2.7.0 +sentencepiece==0.2.0 six==1.16.0 -sniffio==1.3.0 -SQLAlchemy==2.0.23 +SQLAlchemy==2.0.30 sympy==1.12 tabulate==0.9.0 -tenacity==8.2.3 -threadpoolctl==3.2.0 -tokenizers==0.15.0 -torch==2.1.2 -torchaudio==2.1.2 -torchvision==0.16.2 -tqdm==4.66.1 -transformers==4.39.3 -transformers-stream-generator==0.0.4 +tenacity==8.3.0 +threadpoolctl==3.5.0 +tokenizers==0.19.1 +torch==2.3.0 +torchaudio==2.3.0 +torchvision==0.18.0 +tqdm==4.66.4 +transformers==4.40.2 +transformers-stream-generator==0.0.5 typing-inspect==0.9.0 -typing_extensions==4.9.0 -tzdata==2023.3 -urllib3==2.1.0 -xgboost==2.0.2 +typing_extensions==4.11.0 +tzdata==2024.1 +urllib3==2.2.1 +xgboost==2.0.3 xxhash==3.4.1 yarl==1.9.4 diff --git a/pgml-extension/requirements.txt b/pgml-extension/requirements.txt index 8f37b28b3..625af5e1a 100644 --- a/pgml-extension/requirements.txt +++ b/pgml-extension/requirements.txt @@ -36,9 +36,9 @@ tokenizers transformers transformers-stream-generator xformers; sys_platform == 'linux' # only runs on nvidia hardware +vllm; sys_platform == 'linux' # only runs on linux # Embeddings -InstructorEmbedding sentence-transformers # Ratings diff --git a/pgml-extension/src/bindings/transformers/transformers.py b/pgml-extension/src/bindings/transformers/transformers.py index 939239675..b0d456104 100644 --- a/pgml-extension/src/bindings/transformers/transformers.py +++ b/pgml-extension/src/bindings/transformers/transformers.py @@ -8,7 +8,6 @@ from datetime import datetime import datasets -from InstructorEmbedding import INSTRUCTOR import numpy import orjson from rouge import Rouge @@ -502,9 +501,7 @@ def transform(task, args, inputs, stream=False): def create_embedding(transformer): - instructor = transformer.startswith("hkunlp/instructor") - klass = INSTRUCTOR if instructor else SentenceTransformer - return klass(transformer) + return SentenceTransformer(transformer) def embed_using(model, transformer, inputs, kwargs): @@ -512,13 +509,9 @@ def embed_using(model, transformer, inputs, kwargs): kwargs = orjson.loads(kwargs) instructor = transformer.startswith("hkunlp/instructor") - if instructor: - texts_with_instructions = [] + if instructor and "instruction" in kwargs: instruction = kwargs.pop("instruction") - for text in inputs: - texts_with_instructions.append([instruction, text]) - - inputs = texts_with_instructions + kwargs["prompt"] = instruction return model.encode(inputs, **kwargs) @@ -1029,7 +1022,6 @@ def __init__( path: str, hyperparameters: dict, ) -> None: - # initialize class variables self.project_id = project_id self.model_id = model_id @@ -1100,8 +1092,9 @@ def print_number_of_trainable_model_parameters(self, model): # Calculate and print the number and percentage of trainable parameters r_log("info", f"Trainable model parameters: {trainable_model_params}") r_log("info", f"All model parameters: {all_model_params}") - r_log("info", - f"Percentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%" + r_log( + "info", + f"Percentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%", ) def tokenize_function(self): @@ -1396,9 +1389,10 @@ def __init__( "bias": "none", "task_type": "CAUSAL_LM", } - r_log("info", + r_log( + "info", "LoRA configuration are not set. Using default parameters" - + json.dumps(self.lora_config_params) + + json.dumps(self.lora_config_params), ) self.prompt_template = None @@ -1406,13 +1400,11 @@ def __init__( self.prompt_template = hyperparameters.pop("prompt_template") def train(self): - args = TrainingArguments( output_dir=self.path, logging_dir=self.path, **self.training_args ) def formatting_prompts_func(example): - system_content = example["system"] user_content = example["user"] assistant_content = example["assistant"] @@ -1463,7 +1455,7 @@ def formatting_prompts_func(example): peft_config=LoraConfig(**self.lora_config_params), callbacks=[PGMLCallback(self.project_id, self.model_id)], ) - r_log("info","Creating Supervised Fine Tuning trainer done. Training ... ") + r_log("info", "Creating Supervised Fine Tuning trainer done. Training ... ") # Train self.trainer.train() @@ -1582,7 +1574,6 @@ def finetune_conversation( project_id, model_id, ): - train_dataset = datasets.Dataset.from_dict( { "system": system_train, pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy