llama_cpp/llama.py: 19 additions & 15 deletions
```diff
@@ -66,7 +66,6 @@ def __init__(
         split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
-        rpc_servers: Optional[str] = None,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,
```
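For context, the parameters in this hunk control how the model is placed across GPUs (the removed `rpc_servers` option is no longer accepted). A minimal usage sketch with a placeholder model path, using only keyword arguments that remain after this change:

```python
import llama_cpp
from llama_cpp import Llama

# Hypothetical model path; split layers across two GPUs in a 60/40 ratio.
llm = Llama(
    model_path="./models/model.gguf",             # placeholder path
    n_gpu_layers=-1,                              # offload all layers
    split_mode=llama_cpp.LLAMA_SPLIT_MODE_LAYER,  # split by layer
    main_gpu=0,                                   # ignored for LLAMA_SPLIT_MODE_LAYER
    tensor_split=[0.6, 0.4],
)
```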
```diff
@@ -93,6 +92,8 @@ def __init__(
         embedding: bool = False,
         offload_kqv: bool = True,
         flash_attn: bool = False,
+        op_offloat: bool | None = None,
+        swa_full: bool | None = None,
         # Sampling Params
         no_perf: bool = False,
         last_n_tokens_size: int = 64,
```
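A hedged sketch of how the two new keyword arguments could be passed from user code; the `op_offloat` spelling follows the diff, and the `None` defaults presumably leave llama.cpp's built-in behavior untouched:

```python
from llama_cpp import Llama

llm = Llama(
    model_path="./models/model.gguf",  # placeholder path
    flash_attn=True,
    op_offloat=True,   # offload host tensor operations to the device
    swa_full=False,    # keep the smaller sliding-window-attention KV cache
)
```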
```diff
@@ -150,7 +151,6 @@ def __init__(
             split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
             main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_MODE_LAYER: ignored
             tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
-            rpc_servers: Comma separated list of RPC servers to use for offloading
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
```
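The docstring entries above also cover `vocab_only`, which loads just the tokenizer. A small sketch of that mode, assuming a vocabulary-only load is sufficient for tokenization:

```python
from llama_cpp import Llama

# Load only the vocabulary (no weights) to tokenize text cheaply.
tok = Llama(model_path="./models/model.gguf", vocab_only=True)  # placeholder path
tokens = tok.tokenize(b"Hello, world!")
print(tokens, tok.detokenize(tokens))
```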
```diff
@@ -174,6 +174,8 @@ def __init__(
             embedding: Embedding mode only.
             offload_kqv: Offload K, Q, V to GPU.
             flash_attn: Use flash attention.
+            op_offloat: offload host tensor operations to device
+            swa_full: use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
             no_perf: Measure performance timings.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
```
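This excerpt does not show how the new arguments reach the C API. A sketch of the presumed wiring, assuming the low-level bindings expose `op_offload` and `swa_full` fields on `llama_context_params` (only override when the caller sets a value):

```python
import llama_cpp

def apply_optional_context_flags(
    context_params: llama_cpp.llama_context_params,
    op_offloat: bool | None = None,
    swa_full: bool | None = None,
) -> None:
    # Leave llama.cpp's own defaults in place unless the caller is explicit.
    if op_offloat is not None:
        context_params.op_offload = op_offloat  # assumed field name
    if swa_full is not None:
        context_params.swa_full = swa_full      # assumed field name
```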
```diff
@@ -226,11 +228,6 @@ def __init__(
         ) # 0x7FFFFFFF is INT32 max, will be auto set to all layers
```
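The single context line visible in this last hunk is the tail of the `n_gpu_layers` assignment; a hedged sketch of the idiom its trailing comment describes, where a user-facing `-1` is mapped to INT32 max so llama.cpp offloads every layer:

```python
# Illustration only: -1 becomes 0x7FFFFFFF (INT32 max), which llama.cpp
# treats as "offload all layers"; other values pass through unchanged.
def resolve_n_gpu_layers(n_gpu_layers: int) -> int:
    return 0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers

assert resolve_n_gpu_layers(-1) == 0x7FFFFFFF
assert resolve_n_gpu_layers(20) == 20
```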