From 47ca05ac79fec8ed0ac3856dc4261d2be249eb50 Mon Sep 17 00:00:00 2001 From: Antoine Lizee Date: Thu, 2 Nov 2023 01:29:06 +0000 Subject: [PATCH 1/5] fix: tokenization of special characters: (#850) It should behave like llama.cpp, where most out of the box usages treat special characters accordingly --- llama_cpp/llama.py | 6 +++--- llama_cpp/server/app.py | 2 +- test.py | 0 tests/test_llama.py | 9 +++++++++ 4 files changed, 13 insertions(+), 4 deletions(-) create mode 100644 test.py diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index bc747cffb..c9ea90fb4 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -856,7 +856,7 @@ def create_embedding( data: List[EmbeddingData] = [] total_tokens = 0 for index, input in enumerate(inputs): - tokens = self.tokenize(input.encode("utf-8")) + tokens = self.tokenize(input.encode("utf-8"), special=True) self.reset() self.eval(tokens) n_tokens = len(tokens) @@ -928,7 +928,7 @@ def _create_completion( completion_tokens: List[int] = [] # Add blank space to start of prompt to match OG llama tokenizer prompt_tokens: List[int] = ( - self.tokenize(prompt.encode("utf-8")) + self.tokenize(prompt.encode("utf-8"), special=True) if prompt != "" else [self.token_bos()] ) @@ -1826,7 +1826,7 @@ def __init__(self, llama: Llama): def encode(self, text: str, add_bos: bool = True) -> List[int]: return self.llama.tokenize( - text.encode("utf-8", errors="ignore"), add_bos=add_bos + text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=True ) def decode(self, tokens: List[int]) -> str: diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 930ad5df8..f8d8c7658 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -594,7 +594,7 @@ def make_logit_bias_processor( elif logit_bias_type == "tokens": for token, score in logit_bias.items(): token = token.encode("utf-8") - for input_id in llama.tokenize(token, add_bos=False): + for input_id in llama.tokenize(token, add_bos=False, special=True): 
to_bias[input_id] = score def logit_bias_processor( diff --git a/test.py b/test.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_llama.py b/tests/test_llama.py index 76291fbca..330b69b9c 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -25,6 +25,15 @@ def test_llama_cpp_tokenization(): detokenized = llama.detokenize(tokens) assert detokenized != text + text = b"Hello World</s>" + tokens = llama.tokenize(text) + assert tokens[-1] != llama.token_eos() + assert tokens == [1, 15043, 2787, 829, 29879, 29958] + + tokens = llama.tokenize(text, special=True) + assert tokens[-1] == llama.token_eos() + assert tokens == [1, 10994, 2787, 2] + def test_llama_patch(monkeypatch): llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) From addc2f6077c12da39db5a4d01cbde5982d51c553 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 1 Nov 2023 21:31:54 -0400 Subject: [PATCH 2/5] Update CHANGELOG --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f587225e6..b1591b1d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- Fix tokenization of special characters by @antoine-lizee in #850 + ## [0.2.12] - Update llama.cpp to ggerganov/llama.cpp@50337961a678fce4081554b24e56e86b67660163 From 3e180d77f1320b07bd279c9e2beffb1ec7722587 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 1 Nov 2023 21:37:53 -0400 Subject: [PATCH 3/5] Cleanup --- test.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 test.py diff --git a/test.py b/test.py deleted file mode 100644 index e69de29bb..000000000 From f0d1a1b255e926f8ed488531a487dfab85b3760e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 1 Nov 2023 22:17:53 -0400 Subject: [PATCH 4/5] Fix runner label --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 269016b2e..ed548eb54 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -80,7 +80,7 @@ jobs: build-linux-opencl: - runs-on: linux-latest + runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 From d7ae8b5817aa8594d792d62f2c18dc87208f4306 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 2 Nov 2023 01:17:46 -0400 Subject: [PATCH 5/5] Fix clblast test --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index ed548eb54..24448ec9f 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -99,7 +99,7 @@ jobs: - name: Install dependencies run: | python3 -m pip install --upgrade pip - CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install .[all] --verbose + CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install .[all] --verbose - name: Test with pytest run: | python3 -m pytest
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.

Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy