From 9ec57f7fd4cca50e056e4162443c9e884ff16f3e Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sat, 24 Feb 2024 04:18:21 -0500
Subject: [PATCH 1/3] Basic support for hf pull on server

---
 llama_cpp/server/model.py    | 15 +++++++++++++--
 llama_cpp/server/settings.py |  4 ++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index 5308dc2a8..816d089f3 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -120,9 +120,20 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
                     kv_overrides[key] = float(value)
                 else:
                     raise ValueError(f"Unknown value type {value_type}")
+
+    import functools
 
-    _model = llama_cpp.Llama(
-        model_path=settings.model,
+    kwargs = {}
+
+    if settings.hf_model_repo_id is not None:
+        create_fn = functools.partial(llama_cpp.Llama.from_pretrained, repo_id=settings.hf_model_repo_id, filename=settings.model)
+    else:
+        create_fn = llama_cpp.Llama
+        kwargs["model_path"] = settings.model
+
+
+    _model = create_fn(
+        **kwargs,
         # Model Params
         n_gpu_layers=settings.n_gpu_layers,
         main_gpu=settings.main_gpu,
diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index 790c6b129..c558beff9 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -143,6 +143,10 @@ class ModelSettings(BaseSettings):
         default=None,
         description="The model name or path to a pretrained HuggingFace tokenizer model. Same as you would pass to AutoTokenizer.from_pretrained().",
     )
+    hf_model_repo_id: Optional[str] = Field(
+        default=None,
+        description="The HuggingFace repo_id to use to load model files from",
+    )
     # Speculative Decoding
     draft_model: Optional[str] = Field(
         default=None,

From 49eedd254d56d2ae5280d82e2a7a1c8d2a7e2de6 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 26 Feb 2024 14:26:23 -0500
Subject: [PATCH 2/3] Add hf_model_repo_id setting

---
 llama_cpp/server/settings.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index 8ac8df881..bd806748c 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -143,6 +143,11 @@ class ModelSettings(BaseSettings):
         default=None,
         description="The model name or path to a pretrained HuggingFace tokenizer model. Same as you would pass to AutoTokenizer.from_pretrained().",
     )
+    # Loading from HuggingFace Model Hub
+    hf_model_repo_id: Optional[str] = Field(
+        default=None,
+        description="The model repo id to use for the HuggingFace tokenizer model.",
+    )
     # Speculative Decoding
     draft_model: Optional[str] = Field(
         default=None,

From aef3c86d444ffca53634ead6b173b00635504ff3 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 26 Feb 2024 14:34:25 -0500
Subject: [PATCH 3/3] Update README

---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index e3d76436a..8d0980e0d 100644
--- a/README.md
+++ b/README.md
@@ -577,6 +577,12 @@ python3 -m llama_cpp.server --model models/7B/llama-model.gguf --chat_format cha
 That will format the prompt according to how model expects it. You can find the prompt format in the model card.
 For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_format.py) and look for lines starting with "@register_chat_format".
 
+If you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub.
+
+```bash
+python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen1.5-0.5B-Chat-GGUF --model '*q8_0.gguf'
+```
+
 ### Web Server Features
 
 - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
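For reference, the server change above routes model creation through `llama_cpp.Llama.from_pretrained` when `hf_model_repo_id` is set. Below is a minimal sketch of the equivalent direct Python usage, reusing the repo id and filename pattern from the README example above; the chat request is purely illustrative.

```python
# Minimal sketch: direct-API equivalent of starting the server with
#   --hf_model_repo_id Qwen/Qwen1.5-0.5B-Chat-GGUF --model '*q8_0.gguf'
# Assumes `huggingface-hub` is installed so the GGUF file can be downloaded.
from llama_cpp import Llama

llama = Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",  # HuggingFace repo to pull from
    filename="*q8_0.gguf",                  # pattern matching the GGUF file in the repo
)

# Illustrative chat request against the downloaded model
response = llama.create_chat_completion(
    messages=[{"role": "user", "content": "Hello!"}]
)
print(response["choices"][0]["message"]["content"])
```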
