Commit 9f14fd2

Merge branch 'main' into remove-unwanted-bos
2 parents: a6e5917 + 951e39c

File tree

18 files changed (+262, -81 lines)

.github/workflows/build-and-release.yaml

Lines changed: 2 additions & 2 deletions
@@ -29,7 +29,7 @@ jobs:
           python -m pip install -e .[all]

       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.17.0
+        uses: pypa/cibuildwheel@v2.18.1
         env:
           # disable repair
           CIBW_REPAIR_WHEEL_COMMAND: ""
@@ -56,7 +56,7 @@ jobs:
           platforms: linux/arm64

       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.17.0
+        uses: pypa/cibuildwheel@v2.18.1
         env:
           CIBW_SKIP: "*musllinux* pp*"
           CIBW_REPAIR_WHEEL_COMMAND: ""

CHANGELOG.md

Lines changed: 37 additions & 0 deletions
@@ -7,6 +7,43 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

+## [0.2.77]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@bde7cd3cd949c1a85d3a199498ac98e78039d46f
+- fix: string value kv_overrides by @abetlen in df45a4b3fe46e72664bda87301b318210c6d4782
+- fix: Fix typo in Llama3VisionAlphaChatHandler by @abetlen in 165b4dc6c188f8fda2fc616154e111f710484eba
+- fix: Use numpy recarray for candidates data, fixes bug with temp < 0 by @abetlen in af3ed503e9ce60fe6b5365031abad4176a3536b3
+- fix: Disable Windows+CUDA workaround when compiling for HIPBLAS by Engininja2 in #1493
+
+## [0.2.76]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@0df0aa8e43c3378975269a51f9b876c8692e70da
+- feat: Improve Llama.eval performance by avoiding list conversion by @thoughtp0lice in #1476
+- example: LLM inference with Ray Serve by @rgerganov in #1465
+
+## [0.2.75]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@13ad16af1231ab2d245d35df3295bcfa23de1305
+- fix: segfault for models without eos / bos tokens by @abetlen in d99a6ba607a4885fb00e63e967964aa41bdbbbcb
+- feat: add MinTokensLogitProcessor and min_tokens argument to server by @twaka in #1333
+- misc: Remove unnecessary metadata lookups by @CISC in #1448
+
+## [0.2.74]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@b228aba91ac2cd9eb90e9d423ba1d0d20e0117e2
+- fix: Enable CUDA backend for llava by @abetlen in 7f59856fa6f3e23f07e12fc15aeb9359dc6c3bb4
+- docs: Fix typo in README.md by @yupbank in #1444
+
+## [0.2.73]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@25c6e82e7a1ad25a42b0894e87d9b5c557409516
+- fix: Clear kv cache at beginning of image chat formats to avoid bug when image is evaluated first by @abetlen in ac55d0a175115d1e719672ce1cb1bec776c738b1
+
+## [0.2.72]
+
+- fix(security): Remote Code Execution by Server-Side Template Injection in Model Metadata by @retr0reg in b454f40a9a1787b2b5659cd2cb00819d983185df
+- fix(security): Update remaining jinja chat templates to use immutable sandbox by @CISC in #1441
+
 ## [0.2.71]

 - feat: Update llama.cpp to ggerganov/llama.cpp@911b3900dded9a1cfe0f0e41b82c7a29baf3a217

CMakeLists.txt

Lines changed: 12 additions & 9 deletions
@@ -41,18 +41,21 @@ if (LLAMA_BUILD)
         RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
     )
     # Workaround for Windows + CUDA https://github.com/abetlen/llama-cpp-python/issues/563
-    install(
-        FILES $<TARGET_RUNTIME_DLLS:llama>
-        DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
-    )
-    install(
-        FILES $<TARGET_RUNTIME_DLLS:llama>
-        DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
-    )
+    if (WIN32 AND (LLAMA_CUDA OR LLAMA_CUBLAS))
+        install(
+            FILES $<TARGET_RUNTIME_DLLS:llama>
+            DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
+        )
+        install(
+            FILES $<TARGET_RUNTIME_DLLS:llama>
+            DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
+        )
+    endif()

     if (LLAVA_BUILD)
-        if (LLAMA_CUBLAS)
+        if (LLAMA_CUBLAS OR LLAMA_CUDA)
             add_compile_definitions(GGML_USE_CUBLAS)
+            add_compile_definitions(GGML_USE_CUDA)
         endif()

         if (LLAMA_METAL)

Makefile

Lines changed: 8 additions & 2 deletions
@@ -13,10 +13,16 @@ build:
 	python3 -m pip install --verbose -e .

 build.debug:
-	CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug" python3 -m pip install --verbose --config-settings=cmake.verbose=true --config-settings=logging.level=INFO --config-settings=install.strip=false --editable .
+	python3 -m pip install \
+		--verbose \
+		--config-settings=cmake.verbose=true \
+		--config-settings=logging.level=INFO \
+		--config-settings=install.strip=false \
+		--config-settings=cmake.args="-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_C_FLAGS='-ggdb -O0';-DCMAKE_CXX_FLAGS='-ggdb -O0'" \
+		--editable .

 build.cuda:
-	CMAKE_ARGS="-DLLAMA_CUBLAS=on" python3 -m pip install --verbose -e .
+	CMAKE_ARGS="-DLLAMA_CUDA=on" python3 -m pip install --verbose -e .

 build.opencl:
 	CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e .

README.md

Lines changed: 10 additions & 7 deletions
@@ -499,13 +499,16 @@ llm = Llama.from_pretrained(

 `llama-cpp-python` supports such as llava1.5 which allow the language model to read information from both text and images.

-You'll first need to download one of the available multi-modal models in GGUF format:
+Below are the supported multi-modal models and their respective chat handlers (Python API) and chat formats (Server API).

-- [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
-- [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
-- [bakllava-1-7b](https://huggingface.co/mys/ggml_bakllava-1)
-- [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf)
-- [moondream2](https://huggingface.co/vikhyatk/moondream2)
+| Model | `LlamaChatHandler` | `chat_format` |
+|:--- |:--- |:--- |
+| [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) | `Llava15ChatHandler` | `llava-1-5` |
+| [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) | `Llava15ChatHandler` | `llava-1-5` |
+| [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf) | `Llava16ChatHandler` | `llava-1-6` |
+| [moondream2](https://huggingface.co/vikhyatk/moondream2) | `MoondreamChatHandler` | `moondream2` |
+| [nanollava](https://huggingface.co/abetlen/nanollava-gguf) | `NanollavaChatHandler` | `nanollava` |
+| [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` |

 Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.

@@ -550,7 +553,7 @@ llm = Llama.from_pretrained(
     n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
 )

-respoonse = llm.create_chat_completion(
+response = llm.create_chat_completion(
     messages = [
         {
             "role": "user",

examples/ray/README.md

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+This is an example of doing LLM inference with [Ray](https://docs.ray.io/en/latest/index.html) and [Ray Serve](https://docs.ray.io/en/latest/serve/index.html).
+
+First, install the requirements:
+
+```bash
+$ pip install -r requirements.txt
+```
+
+Deploy a GGUF model to Ray Serve with the following command:
+
+```bash
+$ serve run llm:llm_builder model_path='../models/mistral-7b-instruct-v0.2.Q4_K_M.gguf'
+```
+
+This will start an API endpoint at `http://localhost:8000/`. You can query the model like this:
+
+```bash
+$ curl -k -d '{"prompt": "tell me a joke", "max_tokens": 128}' -X POST http://localhost:8000
+```

examples/ray/llm.py

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+from starlette.requests import Request
+from typing import Dict
+from ray import serve
+from ray.serve import Application
+from llama_cpp import Llama
+
+@serve.deployment
+class LlamaDeployment:
+    def __init__(self, model_path: str):
+        self._llm = Llama(model_path=model_path)
+
+    async def __call__(self, http_request: Request) -> Dict:
+        input_json = await http_request.json()
+        prompt = input_json["prompt"]
+        max_tokens = input_json.get("max_tokens", 64)
+        return self._llm(prompt, max_tokens=max_tokens)
+
+
+def llm_builder(args: Dict[str, str]) -> Application:
+    return LlamaDeployment.bind(args["model_path"])

examples/ray/requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+ray[serve]
+--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+llama-cpp-python
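
Tying the Ray example files above together: a small Python client that sends the same request as the curl command in examples/ray/README.md, assuming the `requests` package is installed and `serve run llm:llm_builder ...` is already running:

```python
import requests

# Same payload as the curl example: the deployment reads "prompt" and an optional "max_tokens".
resp = requests.post(
    "http://localhost:8000",
    json={"prompt": "tell me a joke", "max_tokens": 128},
)
print(resp.json())
```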

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *

-__version__ = "0.2.71"
+__version__ = "0.2.77"

llama_cpp/_internals.py

Lines changed: 9 additions & 13 deletions
@@ -553,13 +553,12 @@ def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool):
 class _LlamaTokenDataArray:
     def __init__(self, *, n_vocab: int):
         self.n_vocab = n_vocab
-        self.candidates_data = np.array(
-            [],
+        self.candidates_data = np.recarray(
+            (self.n_vocab,),
             dtype=np.dtype(
                 [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True
             ),
         )
-        self.candidates_data.resize(3, self.n_vocab, refcheck=False)
         self.candidates = llama_cpp.llama_token_data_array(
             data=self.candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p),
             size=self.n_vocab,
@@ -569,14 +568,11 @@ def __init__(self, *, n_vocab: int):
         self.default_candidates_data_p = np.zeros(self.n_vocab, dtype=np.single)

     def copy_logits(self, logits: npt.NDArray[np.single]):
-        self.candidates_data["id"][:] = self.default_candidates_data_id
-        self.candidates_data["logit"][:] = logits
-        self.candidates_data["p"][:] = self.default_candidates_data_p
-        self.candidates.data = self.candidates_data.ctypes.data_as(
-            llama_cpp.llama_token_data_p
-        )
-        self.candidates.sorted = ctypes.c_bool(False)
-        self.candidates.size = ctypes.c_size_t(self.n_vocab)
+        self.candidates_data.id[:] = self.default_candidates_data_id
+        self.candidates_data.logit[:] = logits
+        self.candidates_data.p[:] = self.default_candidates_data_p
+        self.candidates.sorted = False
+        self.candidates.size = self.n_vocab


 # Python wrappers over common/common
@@ -767,14 +763,14 @@ def sample(
             self.params.penalty_present,
         )
         if not self.params.penalize_nl:
-            token_data_array.candidates_data["logit"][nl_token] = nl_logit
+            token_data_array.candidates_data.logit[nl_token] = nl_logit

         if self.grammar is not None:
             ctx_main.sample_grammar(token_data_array, self.grammar)

         if self.params.temp < 0:
             ctx_main.sample_softmax(token_data_array)
-            id = token_data_array.candidates_data["id"][0]
+            id = token_data_array.candidates_data.id[0]
         elif self.params.temp == 0:
             id = ctx_main.sample_token_greedy(token_data_array)
         else:
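
For context on the `_LlamaTokenDataArray` change above: a standalone sketch (illustrative only, not code from this commit) of the `np.recarray` layout the class now allocates and the attribute-style field access that replaces string-key indexing into the old structured array:

```python
import numpy as np

# One record per vocab entry, mirroring llama.cpp's llama_token_data struct
# (id, logit, p) with C-style alignment.
n_vocab = 8  # tiny vocabulary, purely for illustration
candidates = np.recarray(
    (n_vocab,),
    dtype=np.dtype(
        [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True
    ),
)

logits = np.linspace(-1.0, 1.0, n_vocab, dtype=np.single)
candidates.id[:] = np.arange(n_vocab, dtype=np.intc)  # attribute access ...
candidates.logit[:] = logits                          # ... instead of candidates["logit"]
candidates.p[:] = 0.0

# The contiguous buffer behind this array is what the real class hands to llama.cpp
# through ctypes; here we just print a slice to show the layout works.
print(candidates.logit[:3], candidates.dtype.itemsize)
```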
