diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index 7307c85ab..9efe440cf 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -42,7 +42,7 @@ jobs:
shell: cmd
- name: Build wheels
- uses: pypa/cibuildwheel@v2.22.0
+ uses: pypa/cibuildwheel@v2.23.2
env:
# disable repair
CIBW_REPAIR_WHEEL_COMMAND: ""
@@ -69,7 +69,7 @@ jobs:
platforms: linux/arm64
- name: Build wheels
- uses: pypa/cibuildwheel@v2.22.0
+ uses: pypa/cibuildwheel@v2.23.2
env:
CIBW_SKIP: "*musllinux* pp*"
CIBW_REPAIR_WHEEL_COMMAND: ""
diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml
index 9b97bf2f5..5bc44f2ea 100644
--- a/.github/workflows/build-wheels-metal.yaml
+++ b/.github/workflows/build-wheels-metal.yaml
@@ -43,7 +43,7 @@ jobs:
shell: cmd
- name: Build wheels
- uses: pypa/cibuildwheel@v2.22.0
+ uses: pypa/cibuildwheel@v2.23.2
env:
# disable repair
CIBW_REPAIR_WHEEL_COMMAND: ""
diff --git a/README.md b/README.md
index e00456580..3e5c22617 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,20 @@
+# Installing this fork with CUDA support (PowerShell)
+
+```powershell
+$env:CMAKE_ARGS = "-DGGML_CUDA=on"
+pip install git+https://github.com/bot08/llama-cpp-python.git@main
+```
+
+To force a clean rebuild:
+
+```powershell
+$env:CMAKE_ARGS = "-DGGML_CUDA=on"
+pip install --upgrade --force-reinstall --no-cache-dir `
+ git+https://github.com/bot08/llama-cpp-python.git@main
+```
+
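+To confirm the CUDA build is actually used (a minimal check; the GGUF path below is a placeholder):
+
+```python
+from llama_cpp import Llama
+
+# n_gpu_layers=-1 offloads all layers; the startup log should list the CUDA device
+llm = Llama(model_path="path/to/model.gguf", n_gpu_layers=-1, verbose=True)
+print(llm.create_completion("Hello", max_tokens=8)["choices"][0]["text"])
+```
+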
+---
+
diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index 343581dce..57a9a5ab4 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -13,6 +13,11 @@
from dataclasses import dataclass, field
from contextlib import ExitStack
+try:
+ from warnings import deprecated
+except ImportError:
+ from ._utils import deprecated
+
import numpy as np
import numpy.typing as npt
@@ -276,21 +281,37 @@ def n_ctx(self) -> int:
def pooling_type(self) -> int:
return llama_cpp.llama_pooling_type(self.ctx)
+ @deprecated("Use llama_kv_self_clear")
def kv_cache_clear(self):
- llama_cpp.llama_kv_cache_clear(self.ctx)
+ self.llama_kv_self_clear()
+ @deprecated("Use kv_self_seq_rm")
def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
- llama_cpp.llama_kv_cache_seq_rm(self.ctx, seq_id, p0, p1)
+ self.kv_self_seq_rm(seq_id, p0, p1)
+ @deprecated("Use kv_self_seq_cp")
def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
- llama_cpp.llama_kv_cache_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1)
+ self.kv_self_seq_cp(seq_id_src, seq_id_dst, p0, p1)
+ @deprecated("Use kv_self_seq_keep")
def kv_cache_seq_keep(self, seq_id: int):
- llama_cpp.llama_kv_cache_seq_keep(self.ctx, seq_id)
+ self.kv_self_seq_keep(seq_id)
def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int):
llama_cpp.llama_kv_cache_seq_add(self.ctx, seq_id, p0, p1, shift)
+ def llama_kv_self_clear(self):
+        llama_cpp.llama_kv_self_clear(self.ctx)
+
+ def kv_self_seq_rm(self, seq_id: int, p0: int, p1: int):
+ llama_cpp.llama_kv_self_seq_rm(self.ctx, seq_id, p0, p1)
+
+ def kv_self_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
+ llama_cpp.llama_kv_self_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1)
+
+ def kv_self_seq_keep(self, seq_id: int):
+ llama_cpp.llama_kv_self_seq_keep(self.ctx, seq_id)
+
def get_state_size(self) -> int:
return llama_cpp.llama_get_state_size(self.ctx)
diff --git a/llama_cpp/_utils.py b/llama_cpp/_utils.py
index 29628193b..75b39f694 100644
--- a/llama_cpp/_utils.py
+++ b/llama_cpp/_utils.py
@@ -1,5 +1,7 @@
import os
import sys
+import warnings
+import functools
from typing import Any, Dict
@@ -76,3 +78,17 @@ class Singleton(object, metaclass=MetaSingleton):
def __init__(self):
super(Singleton, self).__init__()
+
+
+def deprecated(reason):
+ def decorator(func):
+ @functools.wraps(func)
+ def wrapper(*args, **kwargs):
+ warnings.warn(
+ f"Call to deprecated function {func.__name__} ({reason}).",
+ category=DeprecationWarning,
+ stacklevel=2,
+ )
+ return func(*args, **kwargs)
+ return wrapper
+ return decorator
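+
+
+# Illustrative usage (hypothetical names): the wrapped call emits a DeprecationWarning, then runs normally.
+#
+#   @deprecated("Use new_fn instead")
+#   def old_fn():
+#       ...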
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 17575c700..4e1aad381 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -2835,24 +2835,7 @@ def __call__(
)
llama.eval(tokens)
else:
- image_bytes = self.load_image(value)
- embed = self._embed_image_bytes(image_bytes, llama.context_params.n_threads_batch)
- if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx():
- raise ValueError(
- f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}"
- )
- n_past = ctypes.c_int(llama.n_tokens)
- n_past_p = ctypes.pointer(n_past)
- with suppress_stdout_stderr(disable=self.verbose):
- self._llava_cpp.llava_eval_image_embed(
- llama.ctx,
- embed,
- llama.n_batch,
- n_past_p,
- )
- # Required to avoid issues with hf tokenizer
- llama.input_ids[llama.n_tokens : n_past.value] = -1
- llama.n_tokens = n_past.value
+ self.eval_image(llama, value)
# Get prompt tokens to avoid a cache miss
prompt = llama.input_ids[: llama.n_tokens].tolist()
@@ -2938,6 +2921,26 @@ def __call__(
)
return _convert_completion_to_chat(completion_or_chunks, stream=stream)
+ def eval_image(self, llama: llama.Llama, image_url: str):
+ image_bytes = self.load_image(image_url)
+ embed = self._embed_image_bytes(image_bytes, llama.context_params.n_threads_batch)
+ if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx():
+ raise ValueError(
+ f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}"
+ )
+ n_past = ctypes.c_int(llama.n_tokens)
+ n_past_p = ctypes.pointer(n_past)
+ with suppress_stdout_stderr(disable=self.verbose):
+ self._llava_cpp.llava_eval_image_embed(
+ llama.ctx,
+ embed,
+ llama.n_batch,
+ n_past_p,
+ )
+ # Required to avoid issues with hf tokenizer
+ llama.input_ids[llama.n_tokens : n_past.value] = -1
+ llama.n_tokens = n_past.value
+
@staticmethod
def _load_image(image_url: str) -> bytes:
# TODO: Add Pillow support for other image formats beyond (jpg, png)
@@ -3373,6 +3376,139 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler):
)
+class Gemma3ChatHandler(Llava15ChatHandler):
+ # Chat Format:
+    # '<bos><start_of_turn>user\n{system_prompt}\n\n{prompt}<end_of_turn>\n<start_of_turn>model\n'
+
+ DEFAULT_SYSTEM_MESSAGE = None
+
+ CHAT_FORMAT = (
+        "{{ '<bos>' }}"
+ "{%- if messages[0]['role'] == 'system' -%}"
+ "{%- if messages[0]['content'] is string -%}"
+ "{%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}"
+ "{%- else -%}"
+ "{%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}"
+ "{%- endif -%}"
+ "{%- set loop_messages = messages[1:] -%}"
+ "{%- else -%}"
+ "{%- set first_user_prefix = \"\" -%}"
+ "{%- set loop_messages = messages -%}"
+ "{%- endif -%}"
+ "{%- for message in loop_messages -%}"
+ "{%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}"
+ "{{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}"
+ "{%- endif -%}"
+ "{%- if (message['role'] == 'assistant') -%}"
+ "{%- set role = \"model\" -%}"
+ "{%- else -%}"
+ "{%- set role = message['role'] -%}"
+ "{%- endif -%}"
+        "{{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}"
+ "{%- if message['content'] is string -%}"
+ "{{ message['content'] | trim }}"
+ "{%- elif message['content'] is iterable -%}"
+ "{%- for item in message['content'] -%}"
+ "{%- if item['type'] == 'image_url' -%}"
+        "{{ '<start_of_image>' }}"
+ "{%- elif item['type'] == 'text' -%}"
+ "{{ item['text'] | trim }}"
+ "{%- endif -%}"
+ "{%- endfor -%}"
+ "{%- else -%}"
+ "{{ raise_exception(\"Invalid content type\") }}"
+ "{%- endif -%}"
+        "{{ '<end_of_turn>\n' }}"
+ "{%- endfor -%}"
+ "{%- if add_generation_prompt -%}"
+        "{{ '<start_of_turn>model\n' }}"
+ "{%- endif -%}"
+ )
+
+ @staticmethod
+ def split_text_on_image_urls(text: str, image_urls: List[str]):
+ split_text: List[Tuple[Literal["text", "image_url"], str]] = []
+ copied_urls = image_urls[:]
+ remaining = text
+        image_placeholder = "<start_of_image>"
+
+ while remaining:
+ # Find placeholder
+ pos = remaining.find(image_placeholder)
+ if pos != -1:
+ assert len(copied_urls) > 0
+ if pos > 0:
+ split_text.append(("text", remaining[:pos]))
+                split_text.append(("text", "\n\n<start_of_image>"))
+                split_text.append(("image_url", copied_urls.pop(0)))
+                split_text.append(("text", "<end_of_image>\n\n"))
+ remaining = remaining[pos + len(image_placeholder):]
+ else:
+ assert len(copied_urls) == 0
+ split_text.append(("text", remaining))
+ remaining = ""
+ return split_text
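+
+    # Illustrative result (hypothetical URL), assuming one '<start_of_image>' placeholder in the rendered prompt:
+    #   split_text_on_image_urls("Describe <start_of_image>", ["https://example.com/cat.png"])
+    #   -> [("text", "Describe "), ("text", "\n\n<start_of_image>"),
+    #       ("image_url", "https://example.com/cat.png"), ("text", "<end_of_image>\n\n")]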
+
+ def eval_image(self, llama: llama.Llama, image_url: str):
+ import llama_cpp
+
+ n_tokens = 256
+ if llama.n_tokens + n_tokens > llama.n_ctx():
+ raise ValueError(
+ f"Prompt exceeds n_ctx: {llama.n_tokens + n_tokens} > {llama.n_ctx()}"
+ )
+
+ img_bytes = self.load_image(image_url)
+ img_u8_p = self._llava_cpp.clip_image_u8_init()
+ if not self._llava_cpp.clip_image_load_from_bytes(
+ ctypes.create_string_buffer(img_bytes, len(img_bytes)),
+ ctypes.c_size_t(len(img_bytes)),
+ img_u8_p,
+ ):
+ self._llava_cpp.clip_image_u8_free(img_u8_p)
+ raise ValueError("Failed to load image.")
+
+ img_f32 = self._llava_cpp.clip_image_f32_batch()
+ img_f32_p = ctypes.byref(img_f32)
+ if not self._llava_cpp.clip_image_preprocess(self.clip_ctx, img_u8_p, img_f32_p):
+ self._llava_cpp.clip_image_f32_batch_free(img_f32_p)
+ self._llava_cpp.clip_image_u8_free(img_u8_p)
+ raise ValueError("Failed to preprocess image.")
+
+ n_embd = llama_cpp.llama_model_n_embd(llama._model.model)
+ embed = (ctypes.c_float * (n_tokens * n_embd))()
+ if not self._llava_cpp.clip_image_batch_encode(self.clip_ctx, llama.n_threads, img_f32_p, embed):
+ self._llava_cpp.clip_image_f32_batch_free(img_f32_p)
+ self._llava_cpp.clip_image_u8_free(img_u8_p)
+ raise ValueError("Failed to encode image.")
+
+ self._llava_cpp.clip_image_f32_batch_free(img_f32_p)
+ self._llava_cpp.clip_image_u8_free(img_u8_p)
+ llama_cpp.llama_set_causal_attn(llama.ctx, False)
+
+ seq_id_0 = (ctypes.c_int32 * 1)()
+ seq_ids = (ctypes.POINTER(ctypes.c_int32) * (n_tokens + 1))()
+ for i in range(n_tokens):
+ seq_ids[i] = seq_id_0
+
+ batch = llama_cpp.llama_batch()
+ batch.n_tokens = n_tokens
+ batch.token = None
+ batch.embd = embed
+ batch.pos = (ctypes.c_int32 * n_tokens)(*[i + llama.n_tokens for i in range(n_tokens)])
+ batch.seq_id = seq_ids
+ batch.n_seq_id = (ctypes.c_int32 * n_tokens)(*([1] * n_tokens))
+ batch.logits = (ctypes.c_int8 * n_tokens)()
+
+        # restore causal attention even if decoding the image batch fails
+        status = llama_cpp.llama_decode(llama.ctx, batch)
+        llama_cpp.llama_set_causal_attn(llama.ctx, True)
+        if status != 0:
+            raise ValueError("Failed to decode image.")
+
+ # Required to avoid issues with hf tokenizer
+ llama.input_ids[llama.n_tokens : llama.n_tokens + n_tokens] = -1
+ llama.n_tokens += n_tokens
+
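+# Example client-side wiring (illustrative; model and mmproj paths are placeholders):
+#
+#   from llama_cpp import Llama
+#   from llama_cpp.llama_chat_format import Gemma3ChatHandler
+#
+#   chat_handler = Gemma3ChatHandler(clip_model_path="path/to/mmproj.gguf")
+#   llm = Llama(model_path="path/to/gemma-3.gguf", chat_handler=chat_handler, n_ctx=8192)
+#   llm.create_chat_completion(messages=[{"role": "user", "content": [
+#       {"type": "text", "text": "Describe this image"},
+#       {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
+#   ]}])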
+
@register_chat_completion_handler("chatml-function-calling")
def chatml_function_calling(
llama: llama.Llama,
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index f3985ad2f..170020654 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -165,6 +165,10 @@
# llama_sampler_p = NewType("llama_sampler_p", int)
# llama_sampler_p_ctypes = ctypes.c_void_p
+# struct llama_kv_cache;
+llama_kv_cache_p = NewType("llama_kv_cache_p", int)
+llama_kv_cache_p_ctypes = ctypes.c_void_p
+
# typedef int32_t llama_pos;
llama_pos = ctypes.c_int32
# typedef int32_t llama_token;
@@ -259,7 +263,9 @@
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29
-
+LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30
+LLAMA_VOCAB_PRE_TYPE_TRILLION = 31
+LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32
# // note: these values should be synchronized with ggml_rope
# // TODO: maybe move this enum to ggml.h (ggml_rope_type)
@@ -630,10 +636,29 @@ class llama_model_kv_override(ctypes.Structure):
value: Union[int, float, bool, bytes]
+
+# struct llama_model_tensor_buft_override {
+# const char * pattern;
+# ggml_backend_buffer_type_t buft;
+#
+# };
+class llama_model_tensor_buft_override(ctypes.Structure):
+ _fields_ = [
+ ("pattern", ctypes.c_char_p),
+ ("buft", ctypes.c_void_p)
+ ]
+
+
+llama_model_tensor_buft_override_p = ctypes.POINTER(llama_model_tensor_buft_override)
+
+
# struct llama_model_params {
# // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
# ggml_backend_dev_t * devices;
+# // NULL-terminated list of buffer types to use for tensors that match a pattern
+# const struct llama_model_tensor_buft_override * tensor_buft_overrides;
+
# int32_t n_gpu_layers; // number of layers to store in VRAM
# enum llama_split_mode split_mode; // how to split the model across multiple GPUs
@@ -695,6 +720,7 @@ class llama_model_params(ctypes.Structure):
_fields_ = [
("devices", ctypes.c_void_p), # NOTE: unnused
+        ("tensor_buft_overrides", llama_model_tensor_buft_override_p),
("n_gpu_layers", ctypes.c_int32),
("split_mode", ctypes.c_int),
("main_gpu", ctypes.c_int32),
@@ -1316,6 +1342,10 @@ def llama_n_vocab(model: llama_vocab_p, /) -> int:
def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]:
...
+# LLAMA_API struct llama_kv_cache * llama_get_kv_self ( struct llama_context * ctx);
+@ctypes_function("llama_get_kv_self", [llama_context_p_ctypes], llama_model_p_ctypes)
+def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]:
+ ...
# LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
@ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int)
@@ -1810,7 +1840,19 @@ def llama_kv_cache_view_update(ctx: llama_context_p, view: CtypesPointerOrRef[ll
# // Returns the number of tokens in the KV cache (slow, use only for debug)
# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-# LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
+# LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
+@ctypes_function(
+ "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32
+)
+def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int:
+ """Returns the number of tokens in the KV cache (slow, use only for debug)
+ If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+ """
+ ...
+
+# // Returns the number of tokens in the KV cache (slow, use only for debug)
+# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+# DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx), "use llama_kv_self_n_tokens instead");
@ctypes_function(
"llama_get_kv_cache_token_count", [llama_context_p_ctypes], ctypes.c_int32
)
@@ -1832,10 +1874,10 @@ def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int:
# // Clear the KV cache - both cell info is erased and KV data is zeroed
-# LLAMA_API void llama_kv_cache_clear(
+# LLAMA_API void llama_kv_self_clear(
# struct llama_context * ctx);
-@ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None)
-def llama_kv_cache_clear(ctx: llama_context_p, /):
+@ctypes_function("llama_kv_self_clear", [llama_context_p_ctypes], None)
+def llama_kv_self_clear(ctx: llama_context_p, /):
"""Clear the KV cache"""
...
@@ -1845,13 +1887,13 @@ def llama_kv_cache_clear(ctx: llama_context_p, /):
# // seq_id < 0 : match any sequence
# // p0 < 0 : [0, p1]
# // p1 < 0 : [p0, inf)
-# LLAMA_API bool llama_kv_cache_seq_rm(
+# LLAMA_API bool llama_kv_self_seq_rm(
# struct llama_context * ctx,
# llama_seq_id seq_id,
# llama_pos p0,
# llama_pos p1);
@ctypes_function(
- "llama_kv_cache_seq_rm",
+ "llama_kv_self_seq_rm",
[
llama_context_p_ctypes,
llama_seq_id,
@@ -1860,7 +1902,7 @@ def llama_kv_cache_clear(ctx: llama_context_p, /):
],
ctypes.c_bool,
)
-def llama_kv_cache_seq_rm(
+def llama_kv_self_seq_rm(
ctx: llama_context_p,
seq_id: Union[llama_seq_id, int],
p0: Union[llama_pos, int],
@@ -1881,14 +1923,14 @@ def llama_kv_cache_seq_rm(
# // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
# // p0 < 0 : [0, p1]
# // p1 < 0 : [p0, inf)
-# LLAMA_API void llama_kv_cache_seq_cp(
+# LLAMA_API void llama_kv_self_seq_cp(
# struct llama_context * ctx,
# llama_seq_id seq_id_src,
# llama_seq_id seq_id_dst,
# llama_pos p0,
# llama_pos p1);
@ctypes_function(
- "llama_kv_cache_seq_cp",
+ "llama_kv_self_seq_cp",
[
llama_context_p_ctypes,
llama_seq_id,
@@ -1898,7 +1940,7 @@ def llama_kv_cache_seq_rm(
],
None,
)
-def llama_kv_cache_seq_cp(
+def llama_kv_self_seq_cp(
ctx: llama_context_p,
seq_id_src: Union[llama_seq_id, int],
seq_id_dst: Union[llama_seq_id, int],
@@ -1914,13 +1956,13 @@ def llama_kv_cache_seq_cp(
# // Removes all tokens that do not belong to the specified sequence
-# LLAMA_API void llama_kv_cache_seq_keep(
+# LLAMA_API void llama_kv_self_seq_keep(
# struct llama_context * ctx,
# llama_seq_id seq_id);
@ctypes_function(
- "llama_kv_cache_seq_keep", [llama_context_p_ctypes, llama_seq_id], None
+ "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None
)
-def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /):
+def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /):
"""Removes all tokens that do not belong to the specified sequence"""
...
@@ -1928,17 +1970,17 @@ def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, in
# // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
# // If the KV cache is RoPEd, the KV data is updated accordingly:
# // - lazily on next llama_decode()
-# // - explicitly with llama_kv_cache_update()
+# // - explicitly with llama_kv_self_update()
# // p0 < 0 : [0, p1]
# // p1 < 0 : [p0, inf)
-# LLAMA_API void llama_kv_cache_seq_add(
+# LLAMA_API void llama_kv_self_seq_add(
# struct llama_context * ctx,
# llama_seq_id seq_id,
# llama_pos p0,
# llama_pos p1,
# llama_pos delta);
@ctypes_function(
- "llama_kv_cache_seq_add",
+ "llama_kv_self_seq_add",
[
llama_context_p_ctypes,
llama_seq_id,
@@ -1948,7 +1990,7 @@ def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, in
],
None,
)
-def llama_kv_cache_seq_add(
+def llama_kv_self_seq_add(
ctx: llama_context_p,
seq_id: Union[llama_seq_id, int],
p0: Union[llama_pos, int],
@@ -1959,7 +2001,7 @@ def llama_kv_cache_seq_add(
"""Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
If the KV cache is RoPEd, the KV data is updated accordingly:
- lazily on next llama_decode()
- - explicitly with llama_kv_cache_update()
+ - explicitly with llama_kv_self_update()
p0 < 0 : [0, p1]
p1 < 0 : [p0, inf)"""
...
@@ -1969,14 +2011,14 @@ def llama_kv_cache_seq_add(
# // If the KV cache is RoPEd, the KV data is updated accordingly
# // p0 < 0 : [0, p1]
# // p1 < 0 : [p0, inf)
-# LLAMA_API void llama_kv_cache_seq_div(
+# LLAMA_API void llama_kv_self_seq_div(
# struct llama_context * ctx,
# llama_seq_id seq_id,
# llama_pos p0,
# llama_pos p1,
# int d);
@ctypes_function(
- "llama_kv_cache_seq_div",
+ "llama_kv_self_seq_div",
[
llama_context_p_ctypes,
llama_seq_id,
@@ -1986,7 +2028,7 @@ def llama_kv_cache_seq_add(
],
None,
)
-def llama_kv_cache_seq_div(
+def llama_kv_self_seq_div(
ctx: llama_context_p,
seq_id: Union[llama_seq_id, int],
p0: Union[llama_pos, int],
@@ -2004,29 +2046,29 @@ def llama_kv_cache_seq_div(
# // Defragment the KV cache
# // This will be applied:
# // - lazily on next llama_decode()
-# // - explicitly with llama_kv_cache_update()
-# LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
-@ctypes_function("llama_kv_cache_defrag", [llama_context_p_ctypes], None)
-def llama_kv_cache_defrag(ctx: llama_context_p, /):
+# // - explicitly with llama_kv_self_update()
+# LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
+@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None)
+def llama_kv_self_defrag(ctx: llama_context_p, /):
"""Defragment the KV cache
This will be applied:
- lazily on next llama_decode()
- - explicitly with llama_kv_cache_update()"""
+ - explicitly with llama_kv_self_update()"""
...
# // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-# LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
-@ctypes_function("llama_kv_cache_update", [llama_context_p_ctypes], None)
-def llama_kv_cache_update(ctx: llama_context_p, /):
+# LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
+@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None)
+def llama_kv_self_update(ctx: llama_context_p, /):
"""Apply the KV cache updates (such as K-shifts, defragmentation, etc.)"""
...
# // Check if the context supports KV cache shifting
-# LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
-@ctypes_function("llama_kv_cache_can_shift", [llama_context_p_ctypes], ctypes.c_bool)
-def llama_kv_cache_can_shift(ctx: llama_context_p, /) -> bool:
+# LLAMA_API bool llama_kv_self_can_shift(struct llama_context * ctx);
+@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool)
+def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool:
"""Check if the context supports KV cache shifting"""
...
@@ -2547,6 +2589,16 @@ def llama_set_causal_attn(ctx: llama_context_p, causal_attn: bool, /):
...
+# // Set whether the model is in warmup mode or not
+# // If true, all model tensors are activated during llama_decode() to load and cache their weights.
+# LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
+@ctypes_function("llama_set_warmup", [llama_context_p_ctypes, ctypes.c_bool], None)
+def llama_set_warmup(ctx: llama_context_p, warmup: bool, /):
+    """Set whether the model is in warmup mode or not
+    If True, all model tensors are activated during llama_decode() to load and cache their weights"""
+ ...
+
+
# // Set abort callback
# LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
@ctypes_function(
@@ -3701,6 +3753,13 @@ def llama_sampler_init_mirostat_v2(
...
+
+
+
+# /// @details Intializes a GBNF grammar, see grammars/README.md for details.
+# /// @param vocab The vocabulary that this grammar will be used with.
+# /// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails.
+# /// @param grammar_root The name of the start symbol for the grammar.
# LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
# const struct llama_vocab * vocab,
# const char * grammar_str,
diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py
index d9dfaf5fd..8a382b4d9 100644
--- a/llama_cpp/llava_cpp.py
+++ b/llama_cpp/llava_cpp.py
@@ -7,6 +7,7 @@
c_int,
c_uint8,
c_float,
+ c_size_t,
c_void_p,
POINTER,
_Pointer, # type: ignore
@@ -141,6 +142,28 @@ def llava_eval_image_embed(
################################################
+# struct clip_image_u8_batch {
+# struct clip_image_u8 * data;
+# size_t size;
+# };
+class clip_image_u8_batch(Structure):
+ _fields_ = [
+ ("data", c_void_p),
+ ("size", c_size_t),
+ ]
+
+
+# struct clip_image_f32_batch {
+# struct clip_image_f32 * data;
+# size_t size;
+# };
+class clip_image_f32_batch(Structure):
+ _fields_ = [
+ ("data", c_void_p),
+ ("size", c_size_t),
+ ]
+
+
# /** load mmproj model */
# CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
@ctypes_function("clip_model_load", [c_char_p, c_int], clip_ctx_p_ctypes)
@@ -156,3 +179,93 @@ def clip_model_load(
def clip_free(ctx: clip_ctx_p, /):
...
+
+# CLIP_API struct clip_image_u8 * clip_image_u8_init ();
+@ctypes_function("clip_image_u8_init", [], c_void_p)
+def clip_image_u8_init() -> Optional[c_void_p]:
+ ...
+
+
+# CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
+@ctypes_function("clip_image_u8_free", [c_void_p], None)
+def clip_image_u8_free(img: c_void_p, /):
+ ...
+
+
+# CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
+@ctypes_function("clip_image_f32_free", [c_void_p], None)
+def clip_image_f32_free(img: c_void_p, /):
+ ...
+
+
+# CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
+@ctypes_function("clip_image_u8_batch_free", [POINTER(clip_image_u8_batch)], None)
+def clip_image_u8_batch_free(batch: "_Pointer[clip_image_u8_batch]", /):
+ ...
+
+
+# CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
+@ctypes_function("clip_image_f32_batch_free", [POINTER(clip_image_f32_batch)], None)
+def clip_image_f32_batch_free(batch: "_Pointer[clip_image_f32_batch]", /):
+ ...
+
+
+# /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
+# CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
+@ctypes_function(
+ "clip_image_preprocess",
+ [
+ clip_ctx_p_ctypes,
+ c_void_p,
+ POINTER(clip_image_f32_batch),
+ ],
+ c_bool,
+)
+def clip_image_preprocess(
+ ctx: clip_ctx_p,
+ img: c_void_p,
+ res_imgs: "_Pointer[clip_image_f32_batch]",
+ /,
+) -> bool:
+ ...
+
+
+# CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
+@ctypes_function(
+ "clip_image_batch_encode",
+ [
+ clip_ctx_p_ctypes,
+ c_int,
+ POINTER(clip_image_f32_batch),
+ POINTER(c_float),
+ ],
+ c_bool,
+)
+def clip_image_batch_encode(
+ ctx: clip_ctx_p,
+ n_threads: c_int,
+ imgs: "_Pointer[clip_image_f32_batch]",
+ vec: c_void_p,
+ /,
+) -> bool:
+ ...
+
+
+# /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
+# CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
+@ctypes_function(
+ "clip_image_load_from_bytes",
+ [
+ c_void_p,
+ c_size_t,
+ c_void_p,
+ ],
+ c_bool,
+)
+def clip_image_load_from_bytes(
+ bytes: c_void_p,
+ bytes_length: c_size_t,
+ img: c_void_p,
+ /,
+) -> bool:
+ ...
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 7841fc723..2004644b7 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 7841fc723e059d1fd9640e5c0ef19050fcc7c698
+Subproject commit 2004644b7a5da6fe080e51861ab583480280f1d3