From e522bad948f0816eaba3292ba36bc252fbc5897a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Mar 2025 23:24:27 +0000 Subject: [PATCH 1/9] chore(deps): bump pypa/cibuildwheel from 2.22.0 to 2.23.2 Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.22.0 to 2.23.2. - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.22.0...v2.23.2) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/build-and-release.yaml | 4 ++-- .github/workflows/build-wheels-metal.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 7307c85ab..9efe440cf 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -42,7 +42,7 @@ jobs: shell: cmd - name: Build wheels - uses: pypa/cibuildwheel@v2.22.0 + uses: pypa/cibuildwheel@v2.23.2 env: # disable repair CIBW_REPAIR_WHEEL_COMMAND: "" @@ -69,7 +69,7 @@ jobs: platforms: linux/arm64 - name: Build wheels - uses: pypa/cibuildwheel@v2.22.0 + uses: pypa/cibuildwheel@v2.23.2 env: CIBW_SKIP: "*musllinux* pp*" CIBW_REPAIR_WHEEL_COMMAND: "" diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index 9b97bf2f5..5bc44f2ea 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -43,7 +43,7 @@ jobs: shell: cmd - name: Build wheels - uses: pypa/cibuildwheel@v2.22.0 + uses: pypa/cibuildwheel@v2.23.2 env: # disable repair CIBW_REPAIR_WHEEL_COMMAND: "" From f33dde30a1597b0e9d62bc7f35cb42a2e9910593 Mon Sep 17 00:00:00 2001 From: kossum <127719370+kossum@users.noreply.github.com> Date: Mon, 31 Mar 2025 04:15:39 +0900 Subject: [PATCH 2/9] feat: Add Gemma3 chat handler (#1976) --- llama_cpp/llama_chat_format.py | 89 ++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 17575c700..0d6d39cb8 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3373,6 +3373,95 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler): ) +class Gemma3ChatHandler(Llava15ChatHandler): + # Chat Format: + # 'user\n{system_prompt}\n\n{prompt}\nmodel\n' + + DEFAULT_SYSTEM_MESSAGE = None + + CHAT_FORMAT = ( + "{{ '' }}" + "{%- if messages[0]['role'] == 'system' -%}" + "{%- if messages[0]['content'] is string -%}" + "{%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}" + "{%- else -%}" + "{%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}" + "{%- endif -%}" + "{%- set loop_messages = messages[1:] -%}" + "{%- else -%}" + "{%- set first_user_prefix = \"\" -%}" + "{%- set loop_messages = messages -%}" + "{%- endif -%}" + "{%- for message in loop_messages -%}" + "{%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}" + "{{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}" + "{%- endif -%}" + "{%- if (message['role'] == 'assistant') -%}" + "{%- set role = \"model\" -%}" + "{%- else -%}" + "{%- set role = message['role'] -%}" + "{%- endif -%}" + "{{ '' + role + '\n' + (first_user_prefix if loop.first 
else \"\") }}" + "{%- if message['content'] is string -%}" + "{{ message['content'] | trim }}" + "{%- elif message['content'] is iterable -%}" + "{%- for item in message['content'] -%}" + "{%- if item['type'] == 'image' -%}" + "{{ '' }}" + "{%- elif item['type'] == 'text' -%}" + "{{ item['text'] | trim }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- else -%}" + "{{ raise_exception(\"Invalid content type\") }}" + "{%- endif -%}" + "{{ '\n' }}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "{{ 'model\n' }}" + "{%- endif -%}" + ) + + @staticmethod + def split_text_on_image_urls(text: str, image_urls: List[str]): + split_text: List[Tuple[Literal["text", "image_url"], str]] = [] + copied_urls = image_urls[:] + remaining = text + image_placeholder = "" + + while remaining: + # Find placeholder + pos = remaining.find(image_placeholder) + if pos != -1: + assert len(copied_urls) > 0 + if pos > 0: + split_text += [("text", remaining[:pos])] + split_text += [("text", "\n\n")] + split_text += [("image_url", copied_urls.pop(0))] + split_text += [("text", "\n\n")] + remaining = remaining[pos + len(image_placeholder):] + else: + assert len(copied_urls) == 0 + split_text.append(("text", remaining)) + remaining = "" + return split_text + + @staticmethod + def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): + image_urls: List[str] = [] + for message in messages: + if message["role"] == "user": + if message.get("content") is None: + continue + for content in message["content"]: + if isinstance(content, dict) and content.get("type") == "image": + if isinstance(content.get("image"), dict) and isinstance(content["image"].get("url"), str): + image_urls.append(content["image"]["url"]) + elif isinstance(content.get("url"), str): + image_urls.append(content["url"]) + return image_urls + + @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( llama: llama.Llama, From 25b2f8fe0d92cb27e364d3c9601dde77e50446bf Mon Sep 17 00:00:00 2001 From: kossum <127719370+kossum@users.noreply.github.com> Date: Thu, 3 Apr 2025 06:25:21 +0900 Subject: [PATCH 3/9] resolve the image embedding issue in gemma3 --- llama_cpp/llama_chat_format.py | 101 ++++++++++++++++++++++------- llama_cpp/llava_cpp.py | 112 +++++++++++++++++++++++++++++++++ 2 files changed, 191 insertions(+), 22 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 0d6d39cb8..7ac0f4016 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2835,24 +2835,7 @@ def __call__( ) llama.eval(tokens) else: - image_bytes = self.load_image(value) - embed = self._embed_image_bytes(image_bytes, llama.context_params.n_threads_batch) - if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx(): - raise ValueError( - f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}" - ) - n_past = ctypes.c_int(llama.n_tokens) - n_past_p = ctypes.pointer(n_past) - with suppress_stdout_stderr(disable=self.verbose): - self._llava_cpp.llava_eval_image_embed( - llama.ctx, - embed, - llama.n_batch, - n_past_p, - ) - # Required to avoid issues with hf tokenizer - llama.input_ids[llama.n_tokens : n_past.value] = -1 - llama.n_tokens = n_past.value + self.eval_image(llama, value) # Get prompt tokens to avoid a cache miss prompt = llama.input_ids[: llama.n_tokens].tolist() @@ -2938,6 +2921,26 @@ def __call__( ) return _convert_completion_to_chat(completion_or_chunks, stream=stream) + def eval_image(self, llama: llama.Llama, 
image_url: str): + image_bytes = self.load_image(image_url) + embed = self._embed_image_bytes(image_bytes, llama.context_params.n_threads_batch) + if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx(): + raise ValueError( + f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}" + ) + n_past = ctypes.c_int(llama.n_tokens) + n_past_p = ctypes.pointer(n_past) + with suppress_stdout_stderr(disable=self.verbose): + self._llava_cpp.llava_eval_image_embed( + llama.ctx, + embed, + llama.n_batch, + n_past_p, + ) + # Required to avoid issues with hf tokenizer + llama.input_ids[llama.n_tokens : n_past.value] = -1 + llama.n_tokens = n_past.value + @staticmethod def _load_image(image_url: str) -> bytes: # TODO: Add Pillow support for other image formats beyond (jpg, png) @@ -3435,10 +3438,10 @@ def split_text_on_image_urls(text: str, image_urls: List[str]): if pos != -1: assert len(copied_urls) > 0 if pos > 0: - split_text += [("text", remaining[:pos])] - split_text += [("text", "\n\n")] - split_text += [("image_url", copied_urls.pop(0))] - split_text += [("text", "\n\n")] + split_text.append(("text", remaining[:pos])) + split_text.append(("text", "\n\n")) + split_text.append(("image_url", copied_urls.pop(0))) + split_text.append(("text", "\n\n")) remaining = remaining[pos + len(image_placeholder):] else: assert len(copied_urls) == 0 @@ -3461,6 +3464,60 @@ def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): image_urls.append(content["url"]) return image_urls + def eval_image(self, llama: llama.Llama, image_url: str): + import llama_cpp + + img_bytes = self.load_image(image_url) + img_u8_p = self._llava_cpp.clip_image_u8_init() + if not self._llava_cpp.clip_image_load_from_bytes( + ctypes.create_string_buffer(img_bytes, len(img_bytes)), + ctypes.c_size_t(len(img_bytes)), + img_u8_p, + ): + self._llava_cpp.clip_image_u8_free(img_u8_p) + raise ValueError("Failed to load image.") + + img_f32 = self._llava_cpp.clip_image_f32_batch() + img_f32_p = ctypes.byref(img_f32) + if not self._llava_cpp.clip_image_preprocess(self.clip_ctx, img_u8_p, img_f32_p): + self._llava_cpp.clip_image_f32_batch_free(img_f32_p) + self._llava_cpp.clip_image_u8_free(img_u8_p) + raise ValueError("Failed to preprocess image.") + + n_embd = llama_cpp.llama_model_n_embd(llama._model.model) + n_tokens = 256 + embed = (ctypes.c_float * (n_tokens * n_embd))() + if not self._llava_cpp.clip_image_batch_encode(self.clip_ctx, llama.n_threads, img_f32_p, embed): + self._llava_cpp.clip_image_f32_batch_free(img_f32_p) + self._llava_cpp.clip_image_u8_free(img_u8_p) + raise ValueError("Failed to encode image.") + + self._llava_cpp.clip_image_f32_batch_free(img_f32_p) + self._llava_cpp.clip_image_u8_free(img_u8_p) + llama_cpp.llama_set_causal_attn(llama.ctx, False) + + seq_id_0 = (ctypes.c_int32 * 1)() + seq_ids = (ctypes.POINTER(ctypes.c_int32) * (n_tokens + 1))() + for i in range(n_tokens): + seq_ids[i] = seq_id_0 + + batch = llama_cpp.llama_batch() + batch.n_tokens = n_tokens + batch.token = None + batch.embd = embed + batch.pos = (ctypes.c_int32 * n_tokens)(*[i + llama.n_tokens for i in range(n_tokens)]) + batch.seq_id = seq_ids + batch.n_seq_id = (ctypes.c_int32 * n_tokens)(*([1] * n_tokens)) + batch.logits = (ctypes.c_int8 * n_tokens)() + + if llama_cpp.llama_decode(llama.ctx, batch): + raise ValueError("Failed to decode image.") + + llama_cpp.llama_set_causal_attn(llama.ctx, True) + # Required to avoid issues with hf tokenizer + llama.input_ids[llama.n_tokens : 
llama.n_tokens + n_tokens] = -1 + llama.n_tokens += n_tokens + @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py index d9dfaf5fd..46ac5087f 100644 --- a/llama_cpp/llava_cpp.py +++ b/llama_cpp/llava_cpp.py @@ -7,6 +7,7 @@ c_int, c_uint8, c_float, + c_size_t, c_void_p, POINTER, _Pointer, # type: ignore @@ -141,6 +142,28 @@ def llava_eval_image_embed( ################################################ +# struct clip_image_u8_batch { +# struct clip_image_u8 * data; +# size_t size; +# }; +class clip_image_u8_batch(Structure): + _fields_ = [ + ("data", c_void_p), + ("size", c_size_t), + ] + + +# struct clip_image_f32_batch { +# struct clip_image_f32 * data; +# size_t size; +# }; +class clip_image_f32_batch(Structure): + _fields_ = [ + ("data", c_void_p), + ("size", c_size_t), + ] + + # /** load mmproj model */ # CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); @ctypes_function("clip_model_load", [c_char_p, c_int], clip_ctx_p_ctypes) @@ -156,3 +179,92 @@ def clip_model_load( def clip_free(ctx: clip_ctx_p, /): ... + +# CLIP_API struct clip_image_u8 * clip_image_u8_init (); +@ctypes_function("clip_image_u8_init", [], c_void_p) +def clip_image_u8_init() -> Optional[c_void_p]: + ... + + +# CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); +@ctypes_function("clip_image_u8_free", [c_void_p], None) +def clip_image_u8_free(img: c_void_p, /): + ... + + +# CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); +@ctypes_function("clip_image_f32_free", [c_void_p], None) +def clip_image_f32_free(img: c_void_p, /): + ... + + +# CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch); +@ctypes_function("clip_image_u8_batch_free", [POINTER(clip_image_u8_batch)], None) +def clip_image_u8_batch_free(batch: "_Pointer[clip_image_u8_batch]", /): + ... + + +# CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch); +@ctypes_function("clip_image_f32_batch_free", [POINTER(clip_image_f32_batch)], None) +def clip_image_f32_batch_free(batch: "_Pointer[clip_image_f32_batch]", /): + ... + + +# /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */ +# CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs ); +@ctypes_function( + "clip_image_preprocess", + [ + clip_ctx_p_ctypes, + c_void_p, + POINTER(clip_image_f32_batch), + ], + c_bool, +) +def clip_image_preprocess( + ctx: clip_ctx_p, + img: c_void_p, + res_imgs: "_Pointer[clip_image_f32_batch]", + /, +) -> bool: + ... + + +# CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); +@ctypes_function( + "clip_image_batch_encode", + [ + clip_ctx_p_ctypes, + c_int, + POINTER(clip_image_f32_batch), + POINTER(c_float), + ], + c_bool, +) +def clip_image_batch_encode( + ctx: clip_ctx_p, + n_threads: c_int, + imgs: "_Pointer[clip_image_f32_batch]", + vec: c_void_p +) -> bool: + ... 
+ + +# /** interpret bytes as an image file with length bytes_length, and use the result to populate img */ +# CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); +@ctypes_function( + "clip_image_load_from_bytes", + [ + c_void_p, + c_size_t, + c_void_p, + ], + c_bool, +) +def clip_image_load_from_bytes( + bytes: c_void_p, + bytes_length: c_size_t, + img: c_void_p, + /, +) -> bool: + ... From 1b455888d40aa2f64ace593ddeb7c54a3087d631 Mon Sep 17 00:00:00 2001 From: kossum <127719370+kossum@users.noreply.github.com> Date: Thu, 3 Apr 2025 19:43:58 +0900 Subject: [PATCH 4/9] fix: added n_ctx check for prompt requirements when embedding images in Gemma3ChatHandler --- llama_cpp/llama_chat_format.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 7ac0f4016..cbac975bd 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3467,6 +3467,12 @@ def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): def eval_image(self, llama: llama.Llama, image_url: str): import llama_cpp + n_tokens = 256 + if llama.n_tokens + n_tokens > llama.n_ctx(): + raise ValueError( + f"Prompt exceeds n_ctx: {llama.n_tokens + n_tokens} > {llama.n_ctx()}" + ) + img_bytes = self.load_image(image_url) img_u8_p = self._llava_cpp.clip_image_u8_init() if not self._llava_cpp.clip_image_load_from_bytes( @@ -3485,7 +3491,6 @@ def eval_image(self, llama: llama.Llama, image_url: str): raise ValueError("Failed to preprocess image.") n_embd = llama_cpp.llama_model_n_embd(llama._model.model) - n_tokens = 256 embed = (ctypes.c_float * (n_tokens * n_embd))() if not self._llava_cpp.clip_image_batch_encode(self.clip_ctx, llama.n_threads, img_f32_p, embed): self._llava_cpp.clip_image_f32_batch_free(img_f32_p) From 360b04c4e69f9dfea29cdc30a1bbe7bf88ce84ce Mon Sep 17 00:00:00 2001 From: marme Date: Thu, 3 Apr 2025 16:07:58 +0200 Subject: [PATCH 5/9] update to match llama.cpp e0e912f api --- llama_cpp/llama_cpp.py | 125 ++++++++++++++++++++++++++++++----------- 1 file changed, 92 insertions(+), 33 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index f3985ad2f..170020654 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -165,6 +165,10 @@ # llama_sampler_p = NewType("llama_sampler_p", int) # llama_sampler_p_ctypes = ctypes.c_void_p +# struct llama_kv_cache; +llama_kv_cache_p = NewType("llama_kv_cache_p", int) +llama_kv_cache_p_ctypes = ctypes.c_void_p + # typedef int32_t llama_pos; llama_pos = ctypes.c_int32 # typedef int32_t llama_token; @@ -259,7 +263,9 @@ LLAMA_VOCAB_PRE_TYPE_MINERVA = 27 LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28 LLAMA_VOCAB_PRE_TYPE_GPT4O = 29 - +LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30 +LLAMA_VOCAB_PRE_TYPE_TRILLION = 31 +LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32 # // note: these values should be synchronized with ggml_rope # // TODO: maybe move this enum to ggml.h (ggml_rope_type) @@ -630,10 +636,29 @@ class llama_model_kv_override(ctypes.Structure): value: Union[int, float, bool, bytes] + +# struct llama_model_tensor_buft_override { +# const char * pattern; +# ggml_backend_buffer_type_t buft; +# +# }; +class llama_model_tensor_buft_override(ctypes.Structure): + _fields_ = [ + ("pattern", ctypes.c_char_p), + ("buft", ctypes.c_void_p) + ] + + +llama_model_tensor_buft_override_p = ctypes.POINTER(llama_model_tensor_buft_override) + + # struct llama_model_params { # // NULL-terminated list of 
devices to use for offloading (if NULL, all available devices are used) # ggml_backend_dev_t * devices; +# // NULL-terminated list of buffer types to use for tensors that match a pattern +# const struct llama_model_tensor_buft_override * tensor_buft_overrides; + # int32_t n_gpu_layers; // number of layers to store in VRAM # enum llama_split_mode split_mode; // how to split the model across multiple GPUs @@ -695,6 +720,7 @@ class llama_model_params(ctypes.Structure): _fields_ = [ ("devices", ctypes.c_void_p), # NOTE: unnused + ("llama_model_tensor_buft_override", llama_model_tensor_buft_override_p), ("n_gpu_layers", ctypes.c_int32), ("split_mode", ctypes.c_int), ("main_gpu", ctypes.c_int32), @@ -1316,6 +1342,10 @@ def llama_n_vocab(model: llama_vocab_p, /) -> int: def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: ... +# LLAMA_API struct llama_kv_cache * llama_get_kv_self ( struct llama_context * ctx); +@ctypes_function("llama_get_kv_self", [llama_context_p_ctypes], llama_model_p_ctypes) +def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]: + ... # LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); @ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int) @@ -1810,7 +1840,19 @@ def llama_kv_cache_view_update(ctx: llama_context_p, view: CtypesPointerOrRef[ll # // Returns the number of tokens in the KV cache (slow, use only for debug) # // If a KV cell has multiple sequences assigned to it, it will be counted multiple times -# LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx); +# LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx); +@ctypes_function( + "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32 +) +def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int: + """Returns the number of tokens in the KV cache (slow, use only for debug) + If a KV cell has multiple sequences assigned to it, it will be counted multiple times + """ + ... + +# // Returns the number of tokens in the KV cache (slow, use only for debug) +# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times +# DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx), "use llama_kv_self_n_tokens instead"); @ctypes_function( "llama_get_kv_cache_token_count", [llama_context_p_ctypes], ctypes.c_int32 ) @@ -1832,10 +1874,10 @@ def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int: # // Clear the KV cache - both cell info is erased and KV data is zeroed -# LLAMA_API void llama_kv_cache_clear( +# LLAMA_API void llama_kv_self_clear( # struct llama_context * ctx); -@ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None) -def llama_kv_cache_clear(ctx: llama_context_p, /): +@ctypes_function("llama_kv_self_clear", [llama_context_p_ctypes], None) +def llama_kv_self_clear(ctx: llama_context_p, /): """Clear the KV cache""" ... 
@@ -1845,13 +1887,13 @@ def llama_kv_cache_clear(ctx: llama_context_p, /): # // seq_id < 0 : match any sequence # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API bool llama_kv_cache_seq_rm( +# LLAMA_API bool llama_kv_self_seq_rm( # struct llama_context * ctx, # llama_seq_id seq_id, # llama_pos p0, # llama_pos p1); @ctypes_function( - "llama_kv_cache_seq_rm", + "llama_kv_self_seq_rm", [ llama_context_p_ctypes, llama_seq_id, @@ -1860,7 +1902,7 @@ def llama_kv_cache_clear(ctx: llama_context_p, /): ], ctypes.c_bool, ) -def llama_kv_cache_seq_rm( +def llama_kv_self_seq_rm( ctx: llama_context_p, seq_id: Union[llama_seq_id, int], p0: Union[llama_pos, int], @@ -1881,14 +1923,14 @@ def llama_kv_cache_seq_rm( # // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API void llama_kv_cache_seq_cp( +# LLAMA_API void llama_kv_self_seq_cp( # struct llama_context * ctx, # llama_seq_id seq_id_src, # llama_seq_id seq_id_dst, # llama_pos p0, # llama_pos p1); @ctypes_function( - "llama_kv_cache_seq_cp", + "llama_kv_self_seq_cp", [ llama_context_p_ctypes, llama_seq_id, @@ -1898,7 +1940,7 @@ def llama_kv_cache_seq_rm( ], None, ) -def llama_kv_cache_seq_cp( +def llama_kv_self_seq_cp( ctx: llama_context_p, seq_id_src: Union[llama_seq_id, int], seq_id_dst: Union[llama_seq_id, int], @@ -1914,13 +1956,13 @@ def llama_kv_cache_seq_cp( # // Removes all tokens that do not belong to the specified sequence -# LLAMA_API void llama_kv_cache_seq_keep( +# LLAMA_API void llama_kv_self_seq_keep( # struct llama_context * ctx, # llama_seq_id seq_id); @ctypes_function( - "llama_kv_cache_seq_keep", [llama_context_p_ctypes, llama_seq_id], None + "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None ) -def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): +def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): """Removes all tokens that do not belong to the specified sequence""" ... @@ -1928,17 +1970,17 @@ def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, in # // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) # // If the KV cache is RoPEd, the KV data is updated accordingly: # // - lazily on next llama_decode() -# // - explicitly with llama_kv_cache_update() +# // - explicitly with llama_kv_self_update() # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API void llama_kv_cache_seq_add( +# LLAMA_API void llama_kv_self_seq_add( # struct llama_context * ctx, # llama_seq_id seq_id, # llama_pos p0, # llama_pos p1, # llama_pos delta); @ctypes_function( - "llama_kv_cache_seq_add", + "llama_kv_self_seq_add", [ llama_context_p_ctypes, llama_seq_id, @@ -1948,7 +1990,7 @@ def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, in ], None, ) -def llama_kv_cache_seq_add( +def llama_kv_self_seq_add( ctx: llama_context_p, seq_id: Union[llama_seq_id, int], p0: Union[llama_pos, int], @@ -1959,7 +2001,7 @@ def llama_kv_cache_seq_add( """Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) If the KV cache is RoPEd, the KV data is updated accordingly: - lazily on next llama_decode() - - explicitly with llama_kv_cache_update() + - explicitly with llama_kv_self_update() p0 < 0 : [0, p1] p1 < 0 : [p0, inf)""" ... 
@@ -1969,14 +2011,14 @@ def llama_kv_cache_seq_add( # // If the KV cache is RoPEd, the KV data is updated accordingly # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API void llama_kv_cache_seq_div( +# LLAMA_API void llama_kv_self_seq_div( # struct llama_context * ctx, # llama_seq_id seq_id, # llama_pos p0, # llama_pos p1, # int d); @ctypes_function( - "llama_kv_cache_seq_div", + "llama_kv_self_seq_div", [ llama_context_p_ctypes, llama_seq_id, @@ -1986,7 +2028,7 @@ def llama_kv_cache_seq_add( ], None, ) -def llama_kv_cache_seq_div( +def llama_kv_self_seq_div( ctx: llama_context_p, seq_id: Union[llama_seq_id, int], p0: Union[llama_pos, int], @@ -2004,29 +2046,29 @@ def llama_kv_cache_seq_div( # // Defragment the KV cache # // This will be applied: # // - lazily on next llama_decode() -# // - explicitly with llama_kv_cache_update() -# LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx); -@ctypes_function("llama_kv_cache_defrag", [llama_context_p_ctypes], None) -def llama_kv_cache_defrag(ctx: llama_context_p, /): +# // - explicitly with llama_kv_self_update() +# LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx); +@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None) +def llama_kv_self_defrag(ctx: llama_context_p, /): """Defragment the KV cache This will be applied: - lazily on next llama_decode() - - explicitly with llama_kv_cache_update()""" + - explicitly with llama_kv_self_update()""" ... # // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) -# LLAMA_API void llama_kv_cache_update(struct llama_context * ctx); -@ctypes_function("llama_kv_cache_update", [llama_context_p_ctypes], None) -def llama_kv_cache_update(ctx: llama_context_p, /): +# LLAMA_API void llama_kv_self_update(struct llama_context * ctx); +@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None) +def llama_kv_self_update(ctx: llama_context_p, /): """Apply the KV cache updates (such as K-shifts, defragmentation, etc.)""" ... # // Check if the context supports KV cache shifting -# LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx); -@ctypes_function("llama_kv_cache_can_shift", [llama_context_p_ctypes], ctypes.c_bool) -def llama_kv_cache_can_shift(ctx: llama_context_p, /) -> bool: +# LLAMA_API bool llama_kv_self_can_shift(struct llama_context * ctx); +@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool) +def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool: """Check if the context supports KV cache shifting""" ... @@ -2547,6 +2589,16 @@ def llama_set_causal_attn(ctx: llama_context_p, causal_attn: bool, /): ... +# // Set whether the model is in warmup mode or not +# // If true, all model tensors are activated during llama_decode() to load and cache their weights. +# LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup); +@ctypes_function("llama_set_warmup", [llama_context_p_ctypes, ctypes.c_bool], None) +def llama_set_warmup(ctx: llama_context_p, warmup: bool, /): + """Set whether to use causal attention or not + If set to true, the model will only attend to the past tokens""" + ... + + # // Set abort callback # LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data); @ctypes_function( @@ -3701,6 +3753,13 @@ def llama_sampler_init_mirostat_v2( ... + + + +# /// @details Intializes a GBNF grammar, see grammars/README.md for details. 
+# /// @param vocab The vocabulary that this grammar will be used with. +# /// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails. +# /// @param grammar_root The name of the start symbol for the grammar. # LLAMA_API struct llama_sampler * llama_sampler_init_grammar( # const struct llama_vocab * vocab, # const char * grammar_str, From 924833237f881339124bd47425b4049af01298f7 Mon Sep 17 00:00:00 2001 From: marme Date: Thu, 3 Apr 2025 17:01:27 +0200 Subject: [PATCH 6/9] update _internals and _utils to match API changes, add deprecation warnings --- llama_cpp/_internals.py | 29 +++++++++++++++++++++++++---- llama_cpp/_utils.py | 16 ++++++++++++++++ 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 343581dce..57a9a5ab4 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -13,6 +13,11 @@ from dataclasses import dataclass, field from contextlib import ExitStack +try: + from warnings import deprecated +except ImportError: + from ._utils import deprecated + import numpy as np import numpy.typing as npt @@ -276,21 +281,37 @@ def n_ctx(self) -> int: def pooling_type(self) -> int: return llama_cpp.llama_pooling_type(self.ctx) + @deprecated("Use llama_kv_self_clear") def kv_cache_clear(self): - llama_cpp.llama_kv_cache_clear(self.ctx) + self.llama_kv_self_clear() + @deprecated("Use kv_self_seq_rm") def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int): - llama_cpp.llama_kv_cache_seq_rm(self.ctx, seq_id, p0, p1) + self.kv_self_seq_rm(seq_id, p0, p1) + @deprecated("Use kv_self_seq_cp") def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int): - llama_cpp.llama_kv_cache_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1) + self.kv_self_seq_cp(seq_id_src, seq_id_dst, p0, p1) + @deprecated("Use kv_self_seq_keep") def kv_cache_seq_keep(self, seq_id: int): - llama_cpp.llama_kv_cache_seq_keep(self.ctx, seq_id) + self.kv_self_seq_keep(seq_id) def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int): llama_cpp.llama_kv_cache_seq_add(self.ctx, seq_id, p0, p1, shift) + def llama_kv_self_clear(self): + llama_cpp.llama_llama_kv_self_clear(self.ctx) + + def kv_self_seq_rm(self, seq_id: int, p0: int, p1: int): + llama_cpp.llama_kv_self_seq_rm(self.ctx, seq_id, p0, p1) + + def kv_self_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int): + llama_cpp.llama_kv_self_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1) + + def kv_self_seq_keep(self, seq_id: int): + llama_cpp.llama_kv_self_seq_keep(self.ctx, seq_id) + def get_state_size(self) -> int: return llama_cpp.llama_get_state_size(self.ctx) diff --git a/llama_cpp/_utils.py b/llama_cpp/_utils.py index 29628193b..75b39f694 100644 --- a/llama_cpp/_utils.py +++ b/llama_cpp/_utils.py @@ -1,5 +1,7 @@ import os import sys +import warnings +import functools from typing import Any, Dict @@ -76,3 +78,17 @@ class Singleton(object, metaclass=MetaSingleton): def __init__(self): super(Singleton, self).__init__() + + +def deprecated(reason): + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + warnings.warn( + f"Call to deprecated function {func.__name__} ({reason}).", + category=DeprecationWarning, + stacklevel=2, + ) + return func(*args, **kwargs) + return wrapper + return decorator From b6e3c89826c4138283496ed36ddf2c32ec813d19 Mon Sep 17 00:00:00 2001 From: marme Date: Thu, 3 Apr 2025 17:18:18 +0200 Subject: [PATCH 
7/9] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7841fc723..2004644b7 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7841fc723e059d1fd9640e5c0ef19050fcc7c698 +Subproject commit 2004644b7a5da6fe080e51861ab583480280f1d3 From 025e7fa44bfd071eb36b5641448c4e80a0b29917 Mon Sep 17 00:00:00 2001 From: kossum <127719370+kossum@users.noreply.github.com> Date: Fri, 4 Apr 2025 20:17:26 +0900 Subject: [PATCH 8/9] fix: modify the gemma3 chat template to be compatible with openai api --- llama_cpp/llama_chat_format.py | 17 +---------------- llama_cpp/llava_cpp.py | 3 ++- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index cbac975bd..4e1aad381 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3409,7 +3409,7 @@ class Gemma3ChatHandler(Llava15ChatHandler): "{{ message['content'] | trim }}" "{%- elif message['content'] is iterable -%}" "{%- for item in message['content'] -%}" - "{%- if item['type'] == 'image' -%}" + "{%- if item['type'] == 'image_url' -%}" "{{ '' }}" "{%- elif item['type'] == 'text' -%}" "{{ item['text'] | trim }}" @@ -3449,21 +3449,6 @@ def split_text_on_image_urls(text: str, image_urls: List[str]): remaining = "" return split_text - @staticmethod - def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): - image_urls: List[str] = [] - for message in messages: - if message["role"] == "user": - if message.get("content") is None: - continue - for content in message["content"]: - if isinstance(content, dict) and content.get("type") == "image": - if isinstance(content.get("image"), dict) and isinstance(content["image"].get("url"), str): - image_urls.append(content["image"]["url"]) - elif isinstance(content.get("url"), str): - image_urls.append(content["url"]) - return image_urls - def eval_image(self, llama: llama.Llama, image_url: str): import llama_cpp diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py index 46ac5087f..8a382b4d9 100644 --- a/llama_cpp/llava_cpp.py +++ b/llama_cpp/llava_cpp.py @@ -245,7 +245,8 @@ def clip_image_batch_encode( ctx: clip_ctx_p, n_threads: c_int, imgs: "_Pointer[clip_image_f32_batch]", - vec: c_void_p + vec: c_void_p, + /, ) -> bool: ... From 8f21c9000425b4ebf437c6c95bae5e1c53d53173 Mon Sep 17 00:00:00 2001 From: bot08 <71845954+bot08@users.noreply.github.com> Date: Mon, 7 Apr 2025 22:43:17 +0300 Subject: [PATCH 9/9] Update README.md --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index e00456580..3e5c22617 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,20 @@ +# Example installing from this fork with CUDA support (PowerShell) + +```powershell +$env:CMAKE_ARGS = "-DGGML_CUDA=on" +pip install git+https://github.com/bot08/llama-cpp-python.git@main +``` + +To force a clean rebuild: + +```powershell +$env:CMAKE_ARGS = "-DGGML_CUDA=on" +pip install --upgrade --force-reinstall --no-cache-dir ` + git+https://github.com/bot08/llama-cpp-python.git@main +``` + +--- +