diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 269016b2e..24448ec9f 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -80,7 +80,7 @@ jobs: build-linux-opencl: - runs-on: linux-latest + runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 @@ -99,7 +99,7 @@ jobs: - name: Install dependencies run: | python3 -m pip install --upgrade pip - CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install .[all] --verbose + CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install .[all] --verbose - name: Test with pytest run: | python3 -m pytest diff --git a/CHANGELOG.md b/CHANGELOG.md index f587225e6..b1591b1d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- Fix tokenization of special characters by @antoine-lizee in #850 + ## [0.2.12] - Update llama.cpp to ggerganov/llama.cpp@50337961a678fce4081554b24e56e86b67660163 diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index bc747cffb..c9ea90fb4 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -856,7 +856,7 @@ def create_embedding( data: List[EmbeddingData] = [] total_tokens = 0 for index, input in enumerate(inputs): - tokens = self.tokenize(input.encode("utf-8")) + tokens = self.tokenize(input.encode("utf-8"), special=True) self.reset() self.eval(tokens) n_tokens = len(tokens) @@ -928,7 +928,7 @@ def _create_completion( completion_tokens: List[int] = [] # Add blank space to start of prompt to match OG llama tokenizer prompt_tokens: List[int] = ( - self.tokenize(prompt.encode("utf-8")) + self.tokenize(prompt.encode("utf-8"), special=True) if prompt != "" else [self.token_bos()] ) @@ -1826,7 +1826,7 @@ def __init__(self, llama: Llama): def encode(self, text: str, add_bos: bool = True) -> List[int]: return self.llama.tokenize( - text.encode("utf-8", errors="ignore"), add_bos=add_bos + text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=True 
) def decode(self, tokens: List[int]) -> str: diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 930ad5df8..f8d8c7658 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -594,7 +594,7 @@ def make_logit_bias_processor( elif logit_bias_type == "tokens": for token, score in logit_bias.items(): token = token.encode("utf-8") - for input_id in llama.tokenize(token, add_bos=False): + for input_id in llama.tokenize(token, add_bos=False, special=True): to_bias[input_id] = score def logit_bias_processor( diff --git a/tests/test_llama.py b/tests/test_llama.py index 76291fbca..330b69b9c 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -25,6 +25,15 @@ def test_llama_cpp_tokenization(): detokenized = llama.detokenize(tokens) assert detokenized != text + text = b"Hello World</s>" + tokens = llama.tokenize(text) + assert tokens[-1] != llama.token_eos() + assert tokens == [1, 15043, 2787, 829, 29879, 29958] + + tokens = llama.tokenize(text, special=True) + assert tokens[-1] == llama.token_eos() + assert tokens == [1, 10994, 2787, 2] + def test_llama_patch(monkeypatch): llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.

Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy