
Commit 51dce74

misc: Fix support for new parameters, deprecate rpc_servers parameter
1 parent 0d475d7

File tree: 1 file changed (+19, −15 lines)

llama_cpp/llama.py

Lines changed: 19 additions & 15 deletions
@@ -66,7 +66,6 @@ def __init__(
         split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
-        rpc_servers: Optional[str] = None,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,
@@ -93,6 +92,8 @@ def __init__(
         embedding: bool = False,
         offload_kqv: bool = True,
         flash_attn: bool = False,
+        op_offloat: bool | None = None,
+        swa_full: bool | None = None,
         # Sampling Params
         no_perf: bool = False,
         last_n_tokens_size: int = 64,
@@ -150,7 +151,6 @@ def __init__(
             split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
             main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_MODE_LAYER: ignored
             tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
-            rpc_servers: Comma separated list of RPC servers to use for offloading
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
@@ -174,6 +174,8 @@ def __init__(
             embedding: Embedding mode only.
             offload_kqv: Offload K, Q, V to GPU.
             flash_attn: Use flash attention.
+            op_offloat: offload host tensor operations to device
+            swa_full: use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
             no_perf: Measure performance timings.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
@@ -226,11 +228,6 @@ def __init__(
         ) # 0x7FFFFFFF is INT32 max, will be auto set to all layers
         self.model_params.split_mode = split_mode
         self.model_params.main_gpu = main_gpu
-        if rpc_servers is not None:
-            self.model_params.rpc_servers = rpc_servers.encode("utf-8")
-            self._rpc_servers = rpc_servers
-        else:
-            self._rpc_servers = None
         self.tensor_split = tensor_split
         self._c_tensor_split = None
         if self.tensor_split is not None:
@@ -341,12 +338,17 @@ def __init__(
             yarn_beta_slow if yarn_beta_slow != 0.0 else 0
         )
         self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
-        self.context_params.logits_all = (
-            logits_all if draft_model is None else True
-        ) # Must be set to True for speculative decoding
+        self._logits_all = logits_all if draft_model is None else True
         self.context_params.embeddings = embedding # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
         self.context_params.flash_attn = flash_attn
+
+        if op_offloat is not None:
+            self.context_params.op_offloat = op_offloat
+
+        if swa_full is not None:
+            self.context_params.swa_full = swa_full
+
         # KV cache quantization
         if type_k is not None:
             self.context_params.type_k = type_k
@@ -568,7 +570,7 @@ def eval_tokens(self) -> Deque[int]:
     def eval_logits(self) -> Deque[List[float]]:
         return deque(
             self.scores[: self.n_tokens, :].tolist(),
-            maxlen=self._n_ctx if self.context_params.logits_all else 1,
+            maxlen=self._n_ctx if self._logits_all else 1,
         )

     def tokenize(
@@ -641,13 +643,13 @@ def eval(self, tokens: Sequence[int]):
             n_past = self.n_tokens
             n_tokens = len(batch)
             self._batch.set_batch(
-                batch=batch, n_past=n_past, logits_all=self.context_params.logits_all
+                batch=batch, n_past=n_past, logits_all=self._logits_all
             )
             self._ctx.decode(self._batch)
             # Save tokens
             self.input_ids[n_past : n_past + n_tokens] = batch
             # Save logits
-            if self.context_params.logits_all:
+            if self._logits_all:
                 rows = n_tokens
                 cols = self._n_vocab
                 logits = np.ctypeslib.as_array(
@@ -1288,7 +1290,7 @@ def logit_bias_processor(
         else:
             stop_sequences = []

-        if logprobs is not None and self.context_params.logits_all is False:
+        if logprobs is not None and self._logits_all is False:
             raise ValueError(
                 "logprobs is not supported for models created with logits_all=False"
             )
@@ -2091,10 +2093,12 @@ def __getstate__(self):
             yarn_beta_fast=self.context_params.yarn_beta_fast,
             yarn_beta_slow=self.context_params.yarn_beta_slow,
             yarn_orig_ctx=self.context_params.yarn_orig_ctx,
-            logits_all=self.context_params.logits_all,
+            logits_all=self._logits_all,
             embedding=self.context_params.embeddings,
             offload_kqv=self.context_params.offload_kqv,
             flash_attn=self.context_params.flash_attn,
+            op_offloat=self.context_params.op_offloat,
+            swa_full=self.context_params.swa_full,
             # Sampling Params
             no_perf=self.context_params.no_perf,
             last_n_tokens_size=self.last_n_tokens_size,
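
For orientation, here is a minimal usage sketch of the two keyword arguments this commit adds. The model path, prompt, and generation settings are placeholders, not part of the diff; rpc_servers appears only to note its removal from the signature.

from llama_cpp import Llama

# Placeholder model path for illustration; any local GGUF model works here.
llm = Llama(
    model_path="./models/model.gguf",
    flash_attn=True,
    op_offloat=True,   # new in this commit: offload host tensor operations to the device
    swa_full=False,    # new in this commit: opt out of the full-size SWA cache
    # rpc_servers="host:port",  # removed from the signature by this commit
)

print(llm("Q: 2 + 2 = ", max_tokens=4)["choices"][0]["text"])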

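The logits_all bookkeeping now lives in self._logits_all instead of on context_params, but the user-visible behaviour is unchanged: asking for logprobs from a model built without logits_all=True still raises the ValueError shown in the diff. A small sketch, assuming a placeholder model path and logits_all left at its default of False:

from llama_cpp import Llama

# Placeholder model path; logits_all keeps its default (False).
llm = Llama(model_path="./models/model.gguf")

try:
    llm("Hello", max_tokens=4, logprobs=5)
except ValueError as err:
    print(err)  # logprobs is not supported for models created with logits_all=False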