Skip to content

Add endpoint to count tokens #1136

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Mar 9, 2024
71 changes: 69 additions & 2 deletions llama_cpp/server/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@
CreateEmbeddingRequest,
CreateChatCompletionRequest,
ModelList,
TokenizeInputRequest,
TokenizeInputResponse,
TokenizeInputCountResponse,
DetokenizeInputRequest,
DetokenizeInputResponse,
)
from llama_cpp.server.errors import RouteErrorHandler

Expand Down Expand Up @@ -196,6 +201,9 @@ async def authenticate(
)


openai_v1_tag = "OpenAI V1"


@router.post(
"/v1/completions",
summary="Completion",
Expand Down Expand Up @@ -227,11 +235,13 @@ async def authenticate(
},
}
},
tags=[openai_v1_tag],
)
@router.post(
"/v1/engines/copilot-codex/completions",
include_in_schema=False,
dependencies=[Depends(authenticate)],
tags=[openai_v1_tag],
)
async def create_completion(
request: Request,
Expand Down Expand Up @@ -297,7 +307,10 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]:


@router.post(
"/v1/embeddings", summary="Embedding", dependencies=[Depends(authenticate)]
"/v1/embeddings",
summary="Embedding",
dependencies=[Depends(authenticate)],
tags=[openai_v1_tag],
)
async def create_embedding(
request: CreateEmbeddingRequest,
Expand Down Expand Up @@ -339,6 +352,7 @@ async def create_embedding(
},
}
},
tags=[openai_v1_tag],
)
async def create_chat_completion(
request: Request,
Expand Down Expand Up @@ -391,7 +405,12 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]:
return iterator_or_completion


@router.get("/v1/models", summary="Models", dependencies=[Depends(authenticate)])
@router.get(
"/v1/models",
summary="Models",
dependencies=[Depends(authenticate)],
tags=[openai_v1_tag],
)
async def get_models(
llama_proxy: LlamaProxy = Depends(get_llama_proxy),
) -> ModelList:
Expand All @@ -407,3 +426,51 @@ async def get_models(
for model_alias in llama_proxy
],
}


extras_tag = "Extras"


@router.post(
    "/extras/tokenize",
    summary="Tokenize",
    dependencies=[Depends(authenticate)],
    tags=[extras_tag],
)
async def tokenize(
    body: TokenizeInputRequest,
    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
) -> TokenizeInputResponse:
    """Tokenize the input text using the tokenizer of the requested model.

    Returns the list of token ids produced for ``body.input``.
    """
    llama = llama_proxy(body.model)
    # special=True: control tokens (e.g. BOS/EOS markers) are tokenized too.
    token_ids = llama.tokenize(body.input.encode("utf-8"), special=True)

    return {"tokens": token_ids}


@router.post(
    "/extras/tokenize/count",
    summary="Tokenize Count",
    dependencies=[Depends(authenticate)],
    tags=[extras_tag],
)
async def count_query_tokens(
    body: TokenizeInputRequest,
    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
) -> TokenizeInputCountResponse:
    """Return only the number of tokens the input text tokenizes to.

    Same tokenization as ``/extras/tokenize`` (special tokens included),
    but responds with the count instead of the token ids themselves.
    """
    llama = llama_proxy(body.model)
    encoded = body.input.encode("utf-8")
    token_count = len(llama.tokenize(encoded, special=True))

    return {"count": token_count}


@router.post(
    "/extras/detokenize",
    summary="Detokenize",
    dependencies=[Depends(authenticate)],
    tags=[extras_tag],
)
async def detokenize(
    body: DetokenizeInputRequest,
    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
) -> DetokenizeInputResponse:
    """Convert a list of token ids back into text with the requested model."""
    llama = llama_proxy(body.model)
    raw_bytes = llama.detokenize(body.tokens)

    return {"text": raw_bytes.decode("utf-8")}
36 changes: 36 additions & 0 deletions llama_cpp/server/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,3 +264,39 @@ class ModelData(TypedDict):
class ModelList(TypedDict):
object: Literal["list"]
data: List[ModelData]


class TokenizeInputRequest(BaseModel):
    """Request body for the ``/extras/tokenize`` and ``/extras/tokenize/count``
    endpoints: the text to tokenize plus an optional model selector."""

    # Optional model alias; resolved by the server's LlamaProxy.
    model: Optional[str] = model_field
    input: Optional[str] = Field(description="The input to tokenize.")

    model_config = {
        "json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]}
    }


class TokenizeInputResponse(BaseModel):
    """Response body for ``/extras/tokenize``: the token ids of the input."""

    tokens: List[int] = Field(description="A list of tokens.")

    model_config = {"json_schema_extra": {"example": {"tokens": [123, 321, 222]}}}


class TokenizeInputCountResponse(BaseModel):
    """Response body for ``/extras/tokenize/count``: just the token count."""

    count: int = Field(description="The number of tokens in the input.")

    model_config = {"json_schema_extra": {"example": {"count": 5}}}


class DetokenizeInputRequest(BaseModel):
    """Request body for ``/extras/detokenize``: token ids to turn back into
    text, plus an optional model selector."""

    # Optional model alias; resolved by the server's LlamaProxy.
    model: Optional[str] = model_field
    # Fixed typo in the description ("toekns" -> "tokens").
    tokens: List[int] = Field(description="A list of tokens to detokenize.")

    # The OpenAPI "example" key holds a single example value; the previous
    # list-wrapped form ([{...}]) belonged under "examples" and rendered a
    # malformed example. Use a bare object, matching the sibling models.
    model_config = {"json_schema_extra": {"example": {"tokens": [123, 321, 222]}}}


class DetokenizeInputResponse(BaseModel):
    """Response body for ``/extras/detokenize``: the reconstructed text."""

    text: str = Field(description="The detokenized text.")

    model_config = {
        "json_schema_extra": {"example": {"text": "How many tokens in this query?"}}
    }
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy