
Commit 3434803

Merge branch 'main' into v0.2-wip
2 parents: 77c9f49 + c7c700b

File tree: 5 files changed, +43 −2 lines changed


CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.1.77]
+
+- (llama.cpp) Update llama.cpp add support for LLaMa 2 70B
+- (server) Add temporary n_gqa and rms_norm_eps parameters required for LLaMa 2 70B
+
 ## [0.1.76]
 
 - (llama.cpp) Update llama.cpp add support for LLaMa 2 70B

README.md

Lines changed: 8 additions & 0 deletions
@@ -135,6 +135,14 @@ For instance, if you want to work with larger contexts, you can expand the conte
 llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048)
 ```
 
+### Loading llama-2 70b
+
+Llama2 70b must set the `n_gqa` parameter (grouped-query attention factor) to 8 when loading:
+
+```python
+llm = Llama(model_path="./models/7B/ggml-model.bin", n_gqa=8)
+```
+
 ## Web Server
 
 `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API.
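
Note that the README snippet reuses the 7B path from the earlier context example; in practice the file would be a 70B GGML model. A minimal sketch of loading such a model with the parameters introduced in this commit (the model path and the rms_norm_eps value are illustrative assumptions, not taken from the diff):

```python
from llama_cpp import Llama

# Hypothetical 70B model path; n_gqa=8 is the value this commit documents for LLaMa 2 70B.
llm = Llama(
    model_path="./models/70B/ggml-model-q4_0.bin",
    n_gqa=8,
    rms_norm_eps=1e-5,  # optional override; 1e-5 is an assumed value for illustration only
)
print(llm("Q: Name the planets in the solar system. A:", max_tokens=32)["choices"][0]["text"])
```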

llama_cpp/llama.py

Lines changed: 19 additions & 1 deletion
@@ -230,6 +230,8 @@ def __init__(
         tensor_split: Optional[List[float]] = None,
         rope_freq_base: float = 10000.0,
         rope_freq_scale: float = 1.0,
+        n_gqa: Optional[int] = None,  # (TEMPORARY) must be 8 for llama2 70b
+        rms_norm_eps: Optional[float] = None,  # (TEMPORARY)
         verbose: bool = True,
     ):
         """Load a llama.cpp model from `model_path`.
@@ -291,6 +293,12 @@ def __init__(
         self.params.rope_freq_base = rope_freq_base
         self.params.rope_freq_scale = rope_freq_scale
 
+        if n_gqa is not None:
+            self.params.n_gqa = n_gqa
+
+        if rms_norm_eps is not None:
+            self.params.rms_norm_eps = rms_norm_eps
+
         self.last_n_tokens_size = last_n_tokens_size
         self.n_batch = min(n_ctx, n_batch)
 
@@ -1530,6 +1538,10 @@ def __getstate__(self):
             lora_base=self.lora_base,
             lora_path=self.lora_path,
             tensor_split=self.tensor_split,
+            ### TEMPORARY ###
+            n_gqa=self.params.n_gqa,
+            rms_norm_eps=self.params.rms_norm_eps,
+            ### TEMPORARY ###
             ### DEPRECATED ###
             n_parts=self.n_parts,
             ### DEPRECATED ###
@@ -1539,7 +1551,6 @@ def __setstate__(self, state):
         self.__init__(
             model_path=state["model_path"],
             n_ctx=state["n_ctx"],
-            n_parts=state["n_parts"],
             n_gpu_layers=state["n_gpu_layers"],
             seed=state["seed"],
             f16_kv=state["f16_kv"],
@@ -1556,6 +1567,13 @@ def __setstate__(self, state):
             lora_path=state["lora_path"],
             tensor_split=state["tensor_split"],
             verbose=state["verbose"],
+            ### TEMPORARY ###
+            n_gqa=state["n_gqa"],
+            rms_norm_eps=state["rms_norm_eps"],
+            ### TEMPORARY ###
+            ### DEPRECATED ###
+            n_parts=state["n_parts"],
+            ### DEPRECATED ###
         )
 
     def save_state(self) -> LlamaState:
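
The __getstate__/__setstate__ changes exist so the temporary parameters survive serialization: a pickled or copied Llama instance is rebuilt through __init__, and without these keys a restored 70B model would lose its n_gqa setting (while the deprecated n_parts is only re-emitted, not passed positionally). A hedged sketch of the round trip, heavyweight in practice since unpickling reloads the model; the path is a placeholder:

```python
import pickle
from llama_cpp import Llama

llm = Llama(model_path="./models/70B/ggml-model-q4_0.bin", n_gqa=8)

state = llm.__getstate__()
assert state["n_gqa"] == 8  # carried in the state dict after this commit

# Round-trip: __setstate__ feeds n_gqa / rms_norm_eps back into __init__.
restored = pickle.loads(pickle.dumps(llm))
```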

llama_cpp/server/app.py

Lines changed: 10 additions & 0 deletions
@@ -100,6 +100,14 @@ class Settings(BaseSettings):
         default=True,
         description="Whether to interrupt requests when a new request is received.",
     )
+    n_gqa: Optional[int] = Field(
+        default=None,
+        description="TEMPORARY: Set to 8 for Llama2 70B",
+    )
+    rms_norm_eps: Optional[float] = Field(
+        default=None,
+        description="TEMPORARY",
+    )
 
 
 class ErrorResponse(TypedDict):
@@ -325,6 +333,8 @@ def create_app(settings: Optional[Settings] = None):
         last_n_tokens_size=settings.last_n_tokens_size,
         vocab_only=settings.vocab_only,
         verbose=settings.verbose,
+        n_gqa=settings.n_gqa,
+        rms_norm_eps=settings.rms_norm_eps,
     )
     if settings.cache:
         if settings.cache_type == "disk":
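
Since Settings is a pydantic BaseSettings model, the new fields can be supplied programmatically (or via environment variables) when building the server app, and create_app forwards them straight into the Llama constructor. A minimal sketch, assuming uvicorn is installed and a real model file exists; the path, port, and rms_norm_eps value are placeholders:

```python
import uvicorn
from llama_cpp.server.app import Settings, create_app

settings = Settings(
    model="./models/70B/ggml-model-q4_0.bin",  # placeholder path
    n_gqa=8,            # TEMPORARY field added in this commit
    rms_norm_eps=1e-5,  # assumed value, for illustration only
)
app = create_app(settings=settings)
uvicorn.run(app, host="127.0.0.1", port=8000)
```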

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "llama_cpp_python"
-version = "0.1.76"
+version = "0.1.77"
 description = "Python bindings for the llama.cpp library"
 readme = "README.md"
 license = { text = "MIT" }

0 commit comments
