From 6469cac7eb928d92b070c1b7408750672aac2a13 Mon Sep 17 00:00:00 2001 From: Felipe Lorenz Date: Sat, 27 Jan 2024 12:17:58 -0500 Subject: [PATCH 1/7] Add endpoint to count tokens --- llama_cpp/server/app.py | 16 ++++++++++++++++ llama_cpp/server/types.py | 19 +++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 368022c45..ff3a319b9 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -41,6 +41,8 @@ CreateEmbeddingRequest, CreateChatCompletionRequest, ModelList, + QueryTokensUsageRequest, + QueryCountResponse, ) from llama_cpp.server.errors import RouteErrorHandler @@ -308,6 +310,20 @@ async def create_embedding( ) +@router.post( + "/query-count", summary="Query Count", dependencies=[Depends(authenticate)] +) +async def count_query_tokens( + body: QueryTokensUsageRequest, + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +) -> QueryCountResponse: + tokens = llama_proxy(body.model).tokenize(body.query.encode("utf-8"), special=True) + + return { + "tokens": len(tokens) + } + + @router.post( "/v1/chat/completions", summary="Chat", dependencies=[Depends(authenticate)], response_model= Union[ diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index f0827d762..50e676876 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -176,6 +176,21 @@ class CreateEmbeddingRequest(BaseModel): } +class QueryTokensUsageRequest(BaseModel): + model: Optional[str] = model_field + query: Optional[str] = Field(description="The query to count tokens") + + model_config = { + "json_schema_extra": { + "examples": [ + { + "query": "How many tokens in this query?" + } + ] + } + } + + class ChatCompletionRequestMessage(BaseModel): role: Literal["system", "user", "assistant", "function"] = Field( default="user", description="The role of the message." 
@@ -264,3 +279,7 @@ class ModelData(TypedDict): class ModelList(TypedDict): object: Literal["list"] data: List[ModelData] + + +class QueryCountResponse(TypedDict): + tokens: int From 7057bcb0123cfe1681a6d49b93439e557e234514 Mon Sep 17 00:00:00 2001 From: Felipe Lorenz Date: Sat, 3 Feb 2024 17:42:45 -0500 Subject: [PATCH 2/7] Add tokenize and detokenize endpoints --- llama_cpp/server/app.py | 40 +++++++++++++++++++++++++++++++++------ llama_cpp/server/types.py | 23 ++++++++++++++++++---- 2 files changed, 53 insertions(+), 10 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ff3a319b9..1096d51e1 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -41,8 +41,8 @@ CreateEmbeddingRequest, CreateChatCompletionRequest, ModelList, - QueryTokensUsageRequest, - QueryCountResponse, + TokenizeInputRequest, + QueryCountResponse, DetokenizeInputRequest, ) from llama_cpp.server.errors import RouteErrorHandler @@ -311,16 +311,44 @@ async def create_embedding( @router.post( - "/query-count", summary="Query Count", dependencies=[Depends(authenticate)] + "/tokenize", summary="Tokenize", dependencies=[Depends(authenticate())] +) +async def tokenize( + body: TokenizeInputRequest, + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +): + tokens = llama_proxy(body.model).tokenize(body.inputx.encode("utf-8"), special=True) + + return { + "data": tokens + } + + +@router.post( + "/detokenize", summary="Detokenize", dependencies=[Depends(authenticate)] +) +async def detokenize( + body: DetokenizeInputRequest, + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +): + text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8") + + return { + "text": text + } + + +@router.post( + "/tokenize/count", summary="Tokenize Count", dependencies=[Depends(authenticate)] ) async def count_query_tokens( - body: QueryTokensUsageRequest, + body: TokenizeInputRequest, llama_proxy: LlamaProxy = Depends(get_llama_proxy), -) -> QueryCountResponse: +): tokens = llama_proxy(body.model).tokenize(body.query.encode("utf-8"), special=True) return { - "tokens": len(tokens) + "count": len(tokens) } diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index 50e676876..a89dfca2f 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -176,15 +176,30 @@ class CreateEmbeddingRequest(BaseModel): } -class QueryTokensUsageRequest(BaseModel): +class DetokenizeInputRequest(BaseModel): model: Optional[str] = model_field - query: Optional[str] = Field(description="The query to count tokens") + tokens: List[int] = Field(description="A list of toekns to detokenize.") + + model_config = { + "json_schema_extra": { + "example": [ + { + "tokens": [123,321,222] + } + ] + } + } + + +class TokenizeInputRequest(BaseModel): + model: Optional[str] = model_field + input: Optional[str] = Field(description="The input to tokenize.") model_config = { "json_schema_extra": { "examples": [ { - "query": "How many tokens in this query?" + "input": "How many tokens in this query?" 
} ] } @@ -282,4 +297,4 @@ class ModelList(TypedDict): class QueryCountResponse(TypedDict): - tokens: int + data: List[int] From a83fbeb1f0d6c54655e7e31684275fda5f559d3c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 2 Mar 2024 22:26:34 -0500 Subject: [PATCH 3/7] Change response key to tokens for tokenize endpoint --- llama_cpp/server/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index c651eed46..0d29f14df 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -319,7 +319,7 @@ async def tokenize( ): tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) - return {"data": tokens} + return {"tokens": tokens} @router.post("/detokenize", summary="Detokenize", dependencies=[Depends(authenticate)]) From 013daea858d40e0feff6202677e0c4ecd80a91e3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 2 Mar 2024 22:27:46 -0500 Subject: [PATCH 4/7] Fix dependency bug --- llama_cpp/server/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 0d29f14df..3a1003607 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -312,7 +312,7 @@ async def create_embedding( ) -@router.post("/tokenize", summary="Tokenize", dependencies=[Depends(authenticate())]) +@router.post("/tokenize", summary="Tokenize", dependencies=[Depends(authenticate)]) async def tokenize( body: TokenizeInputRequest, llama_proxy: LlamaProxy = Depends(get_llama_proxy), From fb2672d798166444e1213b0cbcb31bec570df103 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 2 Mar 2024 22:39:06 -0500 Subject: [PATCH 5/7] Cleanup --- examples/batch-processing/main.py | 30 ++++++++++++++ llama_cpp/server/app.py | 68 ++++++++++++++++--------------- llama_cpp/server/types.py | 66 +++++++++++++++--------------- 3 files changed, 99 insertions(+), 65 deletions(-) create mode 100644 examples/batch-processing/main.py diff --git a/examples/batch-processing/main.py b/examples/batch-processing/main.py new file mode 100644 index 000000000..2d8b25539 --- /dev/null +++ b/examples/batch-processing/main.py @@ -0,0 +1,30 @@ +import llama_cpp + +from contextlib import ExitStack + +from huggingface_hub import hf_hub_download # type: ignore + +repo_id = "Qwen/Qwen1.5-0.5B-Chat-GGUF" +filename = "qwen1_5-0_5b-chat-q8_0.gguf" + +hf_hub_download(repo_id=repo_id, filename=filename) +model_path = hf_hub_download(repo_id=repo_id, filename=filename, local_files_only=True) + +with ExitStack() as es: + llama_cpp.llama_backend_init() + es.callback(llama_cpp.llama_backend_free) + + model_params = llama_cpp.llama_model_default_params() + + model_params.use_lock = llama_cpp.llama_supports_mlock() + model_params.use_mmap = llama_cpp.llama_supports_mmap() + + model = llama_cpp.llama_load_model_from_file( + model_path.encode('utf-8'), + model_params + ) + assert model is not None + es.callback(lambda: llama_cpp.llama_free_model(model)) + + context_params = llama_cpp.llama_context_default_params() + context = llama_cpp.llama_new_context_with_model(model, context_params) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 3a1003607..3d8195ee3 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -42,8 +42,10 @@ CreateChatCompletionRequest, ModelList, TokenizeInputRequest, - QueryCountResponse, + TokenizeInputResponse, + TokenizeInputCountResponse, DetokenizeInputRequest, + DetokenizeInputResponse, ) from llama_cpp.server.errors import 
RouteErrorHandler @@ -312,38 +314,6 @@ async def create_embedding( ) -@router.post("/tokenize", summary="Tokenize", dependencies=[Depends(authenticate)]) -async def tokenize( - body: TokenizeInputRequest, - llama_proxy: LlamaProxy = Depends(get_llama_proxy), -): - tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) - - return {"tokens": tokens} - - -@router.post("/detokenize", summary="Detokenize", dependencies=[Depends(authenticate)]) -async def detokenize( - body: DetokenizeInputRequest, - llama_proxy: LlamaProxy = Depends(get_llama_proxy), -): - text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8") - - return {"text": text} - - -@router.post( - "/tokenize/count", summary="Tokenize Count", dependencies=[Depends(authenticate)] -) -async def count_query_tokens( - body: TokenizeInputRequest, - llama_proxy: LlamaProxy = Depends(get_llama_proxy), -): - tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) - - return {"count": len(tokens)} - - @router.post( "/v1/chat/completions", summary="Chat", @@ -442,3 +412,35 @@ async def get_models( for model_alias in llama_proxy ], } + + +@router.post("/tokenize", summary="Tokenize", dependencies=[Depends(authenticate)]) +async def tokenize( + body: TokenizeInputRequest, + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +) -> TokenizeInputResponse: + tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) + + return {"tokens": tokens} + + +@router.post( + "/tokenize/count", summary="Tokenize Count", dependencies=[Depends(authenticate)] +) +async def count_query_tokens( + body: TokenizeInputRequest, + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +) -> TokenizeInputCountResponse: + tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) + + return {"count": len(tokens)} + + +@router.post("/detokenize", summary="Detokenize", dependencies=[Depends(authenticate)]) +async def detokenize( + body: DetokenizeInputRequest, + llama_proxy: LlamaProxy = Depends(get_llama_proxy), +) -> DetokenizeInputResponse: + text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8") + + return {"text": text} diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py index d1f9353d5..c8b2ebc6c 100644 --- a/llama_cpp/server/types.py +++ b/llama_cpp/server/types.py @@ -176,36 +176,6 @@ class CreateEmbeddingRequest(BaseModel): } -class DetokenizeInputRequest(BaseModel): - model: Optional[str] = model_field - tokens: List[int] = Field(description="A list of toekns to detokenize.") - - model_config = { - "json_schema_extra": { - "example": [ - { - "tokens": [123,321,222] - } - ] - } - } - - -class TokenizeInputRequest(BaseModel): - model: Optional[str] = model_field - input: Optional[str] = Field(description="The input to tokenize.") - - model_config = { - "json_schema_extra": { - "examples": [ - { - "input": "How many tokens in this query?" - } - ] - } - } - - class ChatCompletionRequestMessage(BaseModel): role: Literal["system", "user", "assistant", "function"] = Field( default="user", description="The role of the message." 
@@ -296,5 +266,37 @@ class ModelList(TypedDict): data: List[ModelData] -class QueryCountResponse(TypedDict): - data: List[int] +class TokenizeInputRequest(BaseModel): + model: Optional[str] = model_field + input: Optional[str] = Field(description="The input to tokenize.") + + model_config = { + "json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]} + } + + +class TokenizeInputResponse(BaseModel): + tokens: List[int] = Field(description="A list of tokens.") + + model_config = {"json_schema_extra": {"example": {"tokens": [123, 321, 222]}}} + + +class TokenizeInputCountResponse(BaseModel): + count: int = Field(description="The number of tokens in the input.") + + model_config = {"json_schema_extra": {"example": {"count": 5}}} + + +class DetokenizeInputRequest(BaseModel): + model: Optional[str] = model_field + tokens: List[int] = Field(description="A list of toekns to detokenize.") + + model_config = {"json_schema_extra": {"example": [{"tokens": [123, 321, 222]}]}} + + +class DetokenizeInputResponse(BaseModel): + text: str = Field(description="The detokenized text.") + + model_config = { + "json_schema_extra": {"example": {"text": "How many tokens in this query?"}} + } From e5cb9dbef720457765d0a0abbbb4a02fa5b7e214 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 8 Mar 2024 21:02:31 -0500 Subject: [PATCH 6/7] Remove example added by mistake --- examples/batch-processing/main.py | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 examples/batch-processing/main.py diff --git a/examples/batch-processing/main.py b/examples/batch-processing/main.py deleted file mode 100644 index 2d8b25539..000000000 --- a/examples/batch-processing/main.py +++ /dev/null @@ -1,30 +0,0 @@ -import llama_cpp - -from contextlib import ExitStack - -from huggingface_hub import hf_hub_download # type: ignore - -repo_id = "Qwen/Qwen1.5-0.5B-Chat-GGUF" -filename = "qwen1_5-0_5b-chat-q8_0.gguf" - -hf_hub_download(repo_id=repo_id, filename=filename) -model_path = hf_hub_download(repo_id=repo_id, filename=filename, local_files_only=True) - -with ExitStack() as es: - llama_cpp.llama_backend_init() - es.callback(llama_cpp.llama_backend_free) - - model_params = llama_cpp.llama_model_default_params() - - model_params.use_lock = llama_cpp.llama_supports_mlock() - model_params.use_mmap = llama_cpp.llama_supports_mmap() - - model = llama_cpp.llama_load_model_from_file( - model_path.encode('utf-8'), - model_params - ) - assert model is not None - es.callback(lambda: llama_cpp.llama_free_model(model)) - - context_params = llama_cpp.llama_context_default_params() - context = llama_cpp.llama_new_context_with_model(model, context_params) From 2da8050b5b014c3d0a7b4fdbf59492cb097fc47f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 8 Mar 2024 21:07:23 -0500 Subject: [PATCH 7/7] Move tokenize, detokenize, and count to Extras namespace. 
Tag existing endpoints --- llama_cpp/server/app.py | 40 +++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 3d8195ee3..aa6afc112 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -201,6 +201,9 @@ async def authenticate( ) +openai_v1_tag = "OpenAI V1" + + @router.post( "/v1/completions", summary="Completion", @@ -232,11 +235,13 @@ async def authenticate( }, } }, + tags=[openai_v1_tag], ) @router.post( "/v1/engines/copilot-codex/completions", include_in_schema=False, dependencies=[Depends(authenticate)], + tags=[openai_v1_tag], ) async def create_completion( request: Request, @@ -302,7 +307,10 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: @router.post( - "/v1/embeddings", summary="Embedding", dependencies=[Depends(authenticate)] + "/v1/embeddings", + summary="Embedding", + dependencies=[Depends(authenticate)], + tags=[openai_v1_tag], ) async def create_embedding( request: CreateEmbeddingRequest, @@ -344,6 +352,7 @@ async def create_embedding( }, } }, + tags=[openai_v1_tag], ) async def create_chat_completion( request: Request, @@ -396,7 +405,12 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]: return iterator_or_completion -@router.get("/v1/models", summary="Models", dependencies=[Depends(authenticate)]) +@router.get( + "/v1/models", + summary="Models", + dependencies=[Depends(authenticate)], + tags=[openai_v1_tag], +) async def get_models( llama_proxy: LlamaProxy = Depends(get_llama_proxy), ) -> ModelList: @@ -414,7 +428,15 @@ async def get_models( } -@router.post("/tokenize", summary="Tokenize", dependencies=[Depends(authenticate)]) +extras_tag = "Extras" + + +@router.post( + "/extras/tokenize", + summary="Tokenize", + dependencies=[Depends(authenticate)], + tags=[extras_tag], +) async def tokenize( body: TokenizeInputRequest, llama_proxy: LlamaProxy = Depends(get_llama_proxy), ) -> TokenizeInputResponse: tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) return {"tokens": tokens} @router.post( - "/tokenize/count", summary="Tokenize Count", dependencies=[Depends(authenticate)] + "/extras/tokenize/count", + summary="Tokenize Count", + dependencies=[Depends(authenticate)], + tags=[extras_tag], ) async def count_query_tokens( body: TokenizeInputRequest, llama_proxy: LlamaProxy = Depends(get_llama_proxy), ) -> TokenizeInputCountResponse: tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True) return {"count": len(tokens)} -@router.post("/detokenize", summary="Detokenize", dependencies=[Depends(authenticate)]) +@router.post( + "/extras/detokenize", + summary="Detokenize", + dependencies=[Depends(authenticate)], + tags=[extras_tag], +) async def detokenize( body: DetokenizeInputRequest, llama_proxy: LlamaProxy = Depends(get_llama_proxy),
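
For reference, a minimal client-side sketch of the three Extras endpoints added by this series, assuming a llama-cpp-python server reachable locally without an API key (the base URL, port, and lack of auth header are assumptions for illustration, not part of the patches):

import requests

BASE_URL = "http://localhost:8000"  # assumed local server; adjust to your deployment

# POST /extras/tokenize returns {"tokens": [...]}
tokens = requests.post(
    f"{BASE_URL}/extras/tokenize",
    json={"input": "How many tokens in this query?"},
).json()["tokens"]

# POST /extras/tokenize/count returns {"count": <int>}
count = requests.post(
    f"{BASE_URL}/extras/tokenize/count",
    json={"input": "How many tokens in this query?"},
).json()["count"]

# POST /extras/detokenize returns {"text": "..."}
text = requests.post(
    f"{BASE_URL}/extras/detokenize",
    json={"tokens": tokens},
).json()["text"]

print(count, tokens, text)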
