
Commit c139f8b

felipelo and abetlen authored
feat: Add endpoints for tokenize, detokenize and count tokens (abetlen#1136)
* Add endpoint to count tokens
* Add tokenize and detokenize endpoints
* Change response key to tokens for tokenize endpoint
* Fix dependency bug
* Cleanup
* Remove example added by mistake
* Move tokenize, detokenize, and count to Extras namespace. Tag existing endpoints

---------

Co-authored-by: Andrei Betlen <abetlen@gmail.com>
1 parent 1f3156d commit c139f8b

2 files changed: +105 −2 lines

llama_cpp/server/app.py

Lines changed: 69 additions & 2 deletions
```diff
@@ -41,6 +41,11 @@
     CreateEmbeddingRequest,
     CreateChatCompletionRequest,
     ModelList,
+    TokenizeInputRequest,
+    TokenizeInputResponse,
+    TokenizeInputCountResponse,
+    DetokenizeInputRequest,
+    DetokenizeInputResponse,
 )
 from llama_cpp.server.errors import RouteErrorHandler
 
@@ -196,6 +201,9 @@ async def authenticate(
     )
 
 
+openai_v1_tag = "OpenAI V1"
+
+
 @router.post(
     "/v1/completions",
     summary="Completion",
@@ -227,11 +235,13 @@ async def authenticate(
             },
         }
     },
+    tags=[openai_v1_tag],
 )
 @router.post(
     "/v1/engines/copilot-codex/completions",
     include_in_schema=False,
     dependencies=[Depends(authenticate)],
+    tags=[openai_v1_tag],
 )
 async def create_completion(
     request: Request,
@@ -297,7 +307,10 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]:
 
 
 @router.post(
-    "/v1/embeddings", summary="Embedding", dependencies=[Depends(authenticate)]
+    "/v1/embeddings",
+    summary="Embedding",
+    dependencies=[Depends(authenticate)],
+    tags=[openai_v1_tag],
 )
 async def create_embedding(
     request: CreateEmbeddingRequest,
@@ -339,6 +352,7 @@ async def create_embedding(
             },
         }
     },
+    tags=[openai_v1_tag],
 )
 async def create_chat_completion(
     request: Request,
@@ -391,7 +405,12 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]:
     return iterator_or_completion
 
 
-@router.get("/v1/models", summary="Models", dependencies=[Depends(authenticate)])
+@router.get(
+    "/v1/models",
+    summary="Models",
+    dependencies=[Depends(authenticate)],
+    tags=[openai_v1_tag],
+)
 async def get_models(
     llama_proxy: LlamaProxy = Depends(get_llama_proxy),
 ) -> ModelList:
@@ -407,3 +426,51 @@ async def get_models(
             for model_alias in llama_proxy
         ],
     }
+
+
+extras_tag = "Extras"
+
+
+@router.post(
+    "/extras/tokenize",
+    summary="Tokenize",
+    dependencies=[Depends(authenticate)],
+    tags=[extras_tag],
+)
+async def tokenize(
+    body: TokenizeInputRequest,
+    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
+) -> TokenizeInputResponse:
+    tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)
+
+    return {"tokens": tokens}
+
+
+@router.post(
+    "/extras/tokenize/count",
+    summary="Tokenize Count",
+    dependencies=[Depends(authenticate)],
+    tags=[extras_tag],
+)
+async def count_query_tokens(
+    body: TokenizeInputRequest,
+    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
+) -> TokenizeInputCountResponse:
+    tokens = llama_proxy(body.model).tokenize(body.input.encode("utf-8"), special=True)
+
+    return {"count": len(tokens)}
+
+
+@router.post(
+    "/extras/detokenize",
+    summary="Detokenize",
+    dependencies=[Depends(authenticate)],
+    tags=[extras_tag],
+)
+async def detokenize(
+    body: DetokenizeInputRequest,
+    llama_proxy: LlamaProxy = Depends(get_llama_proxy),
+) -> DetokenizeInputResponse:
+    text = llama_proxy(body.model).detokenize(body.tokens).decode("utf-8")
+
+    return {"text": text}
```

llama_cpp/server/types.py

Lines changed: 36 additions & 0 deletions
```diff
@@ -264,3 +264,39 @@ class ModelData(TypedDict):
 class ModelList(TypedDict):
     object: Literal["list"]
     data: List[ModelData]
+
+
+class TokenizeInputRequest(BaseModel):
+    model: Optional[str] = model_field
+    input: Optional[str] = Field(description="The input to tokenize.")
+
+    model_config = {
+        "json_schema_extra": {"examples": [{"input": "How many tokens in this query?"}]}
+    }
+
+
+class TokenizeInputResponse(BaseModel):
+    tokens: List[int] = Field(description="A list of tokens.")
+
+    model_config = {"json_schema_extra": {"example": {"tokens": [123, 321, 222]}}}
+
+
+class TokenizeInputCountResponse(BaseModel):
+    count: int = Field(description="The number of tokens in the input.")
+
+    model_config = {"json_schema_extra": {"example": {"count": 5}}}
+
+
+class DetokenizeInputRequest(BaseModel):
+    model: Optional[str] = model_field
+    tokens: List[int] = Field(description="A list of tokens to detokenize.")
+
+    model_config = {"json_schema_extra": {"example": [{"tokens": [123, 321, 222]}]}}
+
+
+class DetokenizeInputResponse(BaseModel):
+    text: str = Field(description="The detokenized text.")
+
+    model_config = {
+        "json_schema_extra": {"example": {"text": "How many tokens in this query?"}}
+    }
```
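The new types are ordinary Pydantic models, so the validation FastAPI applies to these request bodies can be exercised directly. A small sketch follows; the classes are re-declared in simplified form because the real ones pull their `model` default from the shared `model_field` in types.py (the `None` default below is an assumption made for self-containment):

```python
from typing import List, Optional
from pydantic import BaseModel, Field

# Simplified stand-ins for the models added in this commit; the real
# classes default `model` to the shared `model_field` in types.py.
class TokenizeInputRequest(BaseModel):
    model: Optional[str] = None  # assumption: None in place of model_field
    input: Optional[str] = Field(default=None, description="The input to tokenize.")

class TokenizeInputResponse(BaseModel):
    tokens: List[int] = Field(description="A list of tokens.")

# FastAPI runs this same validation on incoming JSON request bodies.
req = TokenizeInputRequest.model_validate({"input": "How many tokens in this query?"})
resp = TokenizeInputResponse(tokens=[123, 321, 222])
print(req.input)          # How many tokens in this query?
print(resp.model_dump())  # {'tokens': [123, 321, 222]}
```

The `json_schema_extra` entries on the real models only seed the examples shown in the generated OpenAPI docs; they play no role in validation.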

0 commit comments