@@ -39,6 +39,7 @@ def __init__(
         n_threads: Optional[int] = None,
         n_batch: int = 8,
         last_n_tokens_size: int = 64,
+        lora_path: Optional[str] = None,
         verbose: bool = True,
     ):
         """Load a llama.cpp model from `model_path`.
@@ -57,6 +58,7 @@ def __init__(
             n_threads: Number of threads to use. If None, the number of threads is automatically determined.
             n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
+            lora_path: Path to a LoRA file to apply to the model.
             verbose: Print verbose output to stderr.
 
         Raises:
@@ -108,6 +110,17 @@ def __init__(
             self.model_path.encode("utf-8"), self.params
         )
 
+        self.lora_path = None
+        if lora_path:
+            self.lora_path = lora_path
+            if llama_cpp.llama_apply_lora_from_file(
+                self.ctx,
+                self.lora_path.encode("utf-8"),
+                self.model_path.encode("utf-8"),
+                llama_cpp.c_int(self.n_threads),
+            ):
+                raise RuntimeError(f"Failed to apply LoRA from path: {self.lora_path}")
+
         if self.verbose:
             print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr)
 
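For context, a minimal usage sketch of the new parameter. Both file paths below are hypothetical placeholders, assuming a GGML base model and a compatible LoRA adapter on disk; the adapter is applied inside __init__, so construction either succeeds fully or raises.

from llama_cpp import Llama

# Minimal sketch of the new lora_path parameter; both paths are hypothetical
# placeholders for a base model and a compatible LoRA adapter file.
llm = Llama(
    model_path="./models/7B/ggml-model.bin",
    lora_path="./loras/my-adapter.bin",  # applied during __init__
)

# A non-zero return from llama_apply_lora_from_file raises RuntimeError in
# __init__, so reaching this point means the adapter was applied successfully.
output = llm("Q: What is the capital of France? A:", max_tokens=16)
print(output["choices"][0]["text"])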
@@ -802,6 +815,7 @@ def __getstate__(self):
             last_n_tokens_size=self.last_n_tokens_size,
             n_batch=self.n_batch,
             n_threads=self.n_threads,
+            lora_path=self.lora_path,
         )
 
     def __setstate__(self, state):
@@ -819,6 +833,7 @@ def __setstate__(self, state):
             n_threads=state["n_threads"],
             n_batch=state["n_batch"],
             last_n_tokens_size=state["last_n_tokens_size"],
+            lora_path=state["lora_path"],
             verbose=state["verbose"],
         )
 
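A quick sanity check of the pickling change, reusing the hypothetical llm instance from the sketch above. Since __setstate__ passes the saved state back through __init__, unpickling reloads the base model and reapplies the adapter.

import pickle

# lora_path is now included in __getstate__ and fed back through __setstate__,
# so the adapter setting survives a pickle round trip.
restored = pickle.loads(pickle.dumps(llm))
assert restored.lora_path == llm.lora_path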