From 9ec57f7fd4cca50e056e4162443c9e884ff16f3e Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sat, 24 Feb 2024 04:18:21 -0500
Subject: [PATCH 1/3] Basic support for hf pull on server

---
 llama_cpp/server/model.py    | 15 +++++++++++++--
 llama_cpp/server/settings.py |  4 ++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index 5308dc2a8..816d089f3 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -120,9 +120,20 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
                     kv_overrides[key] = float(value)
                 else:
                     raise ValueError(f"Unknown value type {value_type}")
+
+    import functools
 
-    _model = llama_cpp.Llama(
-        model_path=settings.model,
+    kwargs = {}
+
+    if settings.hf_model_repo_id is not None:
+        create_fn = functools.partial(llama_cpp.Llama.from_pretrained, repo_id=settings.hf_model_repo_id, filename=settings.model)
+    else:
+        create_fn = llama_cpp.Llama
+        kwargs["model_path"] = settings.model
+
+
+    _model = create_fn(
+        **kwargs,
         # Model Params
         n_gpu_layers=settings.n_gpu_layers,
         main_gpu=settings.main_gpu,
diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index 790c6b129..c558beff9 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -143,6 +143,10 @@ class ModelSettings(BaseSettings):
         default=None,
         description="The model name or path to a pretrained HuggingFace tokenizer model. Same as you would pass to AutoTokenizer.from_pretrained().",
     )
+    hf_model_repo_id: Optional[str] = Field(
+        default=None,
+        description="The HuggingFace repo_id to use to load model files from",
+    )
     # Speculative Decoding
     draft_model: Optional[str] = Field(
         default=None,

From 49eedd254d56d2ae5280d82e2a7a1c8d2a7e2de6 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 26 Feb 2024 14:26:23 -0500
Subject: [PATCH 2/3] Add hf_model_repo_id setting

---
 llama_cpp/server/settings.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index 8ac8df881..bd806748c 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -143,6 +143,11 @@ class ModelSettings(BaseSettings):
         default=None,
         description="The model name or path to a pretrained HuggingFace tokenizer model. Same as you would pass to AutoTokenizer.from_pretrained().",
     )
+    # Loading from HuggingFace Model Hub
+    hf_model_repo_id: Optional[str] = Field(
+        default=None,
+        description="The model repo id to use for the HuggingFace tokenizer model.",
+    )
     # Speculative Decoding
     draft_model: Optional[str] = Field(
         default=None,

From aef3c86d444ffca53634ead6b173b00635504ff3 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 26 Feb 2024 14:34:25 -0500
Subject: [PATCH 3/3] Update README

---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index e3d76436a..8d0980e0d 100644
--- a/README.md
+++ b/README.md
@@ -577,6 +577,12 @@ python3 -m llama_cpp.server --model models/7B/llama-model.gguf --chat_format cha
 That will format the prompt according to how model expects it. You can find the prompt format in the model card.
 For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_format.py) and look for lines starting with "@register_chat_format".
 
+If you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub.
+
+```bash
+python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen1.5-0.5B-Chat-GGUF --model '*q8_0.gguf'
+```
+
 ### Web Server Features
 
 - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
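For reference, the server change above routes model creation through `llama_cpp.Llama.from_pretrained` when `hf_model_repo_id` is set. Below is a minimal sketch of the equivalent direct Python usage, reusing the repo id and filename pattern from the README example above; the chat request is purely illustrative.

```python
# Minimal sketch: direct-API equivalent of starting the server with
#   --hf_model_repo_id Qwen/Qwen1.5-0.5B-Chat-GGUF --model '*q8_0.gguf'
# Assumes `huggingface-hub` is installed so the GGUF file can be downloaded.
from llama_cpp import Llama

llama = Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",  # HuggingFace repo to pull from
    filename="*q8_0.gguf",                  # pattern matching the GGUF file in the repo
)

# Illustrative chat request against the downloaded model
response = llama.create_chat_completion(
    messages=[{"role": "user", "content": "Hello!"}]
)
print(response["choices"][0]["message"]["content"])
```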
