diff --git a/continue.dev/config.json b/continue.dev/config.json
new file mode 100644
index 000000000..cd35da598
--- /dev/null
+++ b/continue.dev/config.json
@@ -0,0 +1,86 @@
+{
+  "models": [
+    {
+      "title": "Codestral",
+      "model": "codestral-latest",
+      "apiBase": "http://127.0.0.1:4321/v1/",
+      "provider": "openai",
+      "apiKey": "key",
+
+      "completionOptions": {
+        "maxTokens": 8000
+      }
+    }
+  ],
+  "tabAutocompleteModel": {
+    "title": "Codestral",
+    "model": "codestral-latest",
+    "apiBase": "http://127.0.0.1:4321/v1/",
+    "provider": "openai",
+    "apiKey": "key",
+
+    "completionOptions": {
+      "maxTokens": 200
+    }
+  },
+  "tabAutocompleteOptions": {
+    "useCache": true,
+    "disable": false
+  },
+  "slashCommands": [
+    {
+      "name": "edit",
+      "description": "Edit selected code"
+    },
+    {
+      "name": "comment",
+      "description": "Write comments for the selected code"
+    },
+    {
+      "name": "share",
+      "description": "Export this session as markdown"
+    },
+    {
+      "name": "cmd",
+      "description": "Generate a shell command"
+    }
+  ],
+  "customCommands": [
+    {
+      "name": "test",
+      "prompt": "Write a comprehensive set of unit tests for the selected code. It should set up, run tests that check for correctness including important edge cases, and tear down. Ensure that the tests are complete and sophisticated. Give the tests just as chat output, don't edit any file.",
+      "description": "Write unit tests for highlighted code"
+    }
+  ],
+  "contextProviders": [
+    {
+      "name": "diff",
+      "params": {}
+    },
+    {
+      "name": "open",
+      "params": {}
+    },
+    {
+      "name": "terminal",
+      "params": {}
+    },
+    {
+      "name": "problems",
+      "params": {}
+    },
+    {
+      "name": "codebase",
+      "params": {}
+    },
+    {
+      "name": "code",
+      "params": {}
+    },
+    {
+      "name": "docs",
+      "params": {}
+    }
+  ],
+  "allowAnonymousTelemetry": false
+}
\ No newline at end of file
diff --git a/continue.dev/fim.py b/continue.dev/fim.py
new file mode 100644
index 000000000..18d012295
--- /dev/null
+++ b/continue.dev/fim.py
@@ -0,0 +1,2 @@
+def add
+    return a+b
\ No newline at end of file
diff --git a/continue.dev/readme.md b/continue.dev/readme.md
new file mode 100644
index 000000000..fb35c9720
--- /dev/null
+++ b/continue.dev/readme.md
@@ -0,0 +1,14 @@
+```
+pip install uvicorn fastapi pydantic-settings sse-starlette starlette-context PyYAML
+pip install scikit-build mistral-common
+
+cd vendor/
+rm -rf llama.cpp/
+git clone https://github.com/ggerganov/llama.cpp.git
+
+cd ..
+
+export CMAKE_ARGS="-DLLAMA_CUBLAS=on -DCMAKE_BUILD_TYPE=Debug"
+pip install -e .
+
+```
\ No newline at end of file
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 4cda4af7a..62cbec4f2 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -41,6 +41,8 @@
     DetokenizeInputResponse,
 )
 from llama_cpp.server.errors import RouteErrorHandler
+from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
+from mistral_common.tokens.instruct.request import FIMRequest
 
 router = APIRouter(route_class=RouteErrorHandler)
 
@@ -264,6 +266,38 @@ async def create_completion(
         assert len(body.prompt) <= 1
         body.prompt = body.prompt[0] if len(body.prompt) > 0 else ""
 
+    prompt = body.prompt
+    prompt = prompt.replace('[SUFFIX]', '')
+
+    print(f'##### PROMPT#:\n{json.dumps(body.prompt)}\n\n')
+    print(f'##### SUFFIX#:\n{json.dumps(body.suffix)}\n\n')
+    print(f'##### STOP REPR#:\n')
+    if body.stop is not None:
+        for s in body.stop: print(repr(s))
+        print(f'##### STOP#:\n')
+        for s in body.stop: print(s)
+    print(f'\n\n')
+
+    body.stop = ['[PREFIX]', '[/PREFIX]', '', '[SUFFIX]', '[MIDDLE]']
+
+    tokenizer = MistralTokenizer.v3()
+
+    if '[PREFIX]' in prompt:
+        prompt_parts = prompt.split('[PREFIX]')
+        prefix = prompt_parts[1] if len(prompt_parts) > 1 else ''
+        postfix = prompt_parts[0]
+    else:
+        prefix = prompt
+        postfix = ''
+
+    fim_request = FIMRequest(prompt=prefix, suffix=postfix)
+    fim_tokens = tokenizer.encode_fim(fim_request)
+    body.prompt = fim_tokens.text
+
+    print(f'##### prefix#:\n{repr(prefix)}\n\n')
+    print(f'##### postfix#:\n{repr(postfix)}\n\n')
+    print(f'##### fim_tokens.text#:\n{repr(fim_tokens.text)}\n\n')
+
     llama = llama_proxy(
         body.model
         if request.url.path != "/v1/engines/copilot-codex/completions"
diff --git a/run-codestral.py b/run-codestral.py
new file mode 100644
index 000000000..22b30a0da
--- /dev/null
+++ b/run-codestral.py
@@ -0,0 +1,79 @@
+from fastapi import Request
+from fastapi.responses import JSONResponse
+from llama_cpp.server.settings import ServerSettings, ModelSettings
+from llama_cpp.server.app import create_app
+import uvicorn
+import llama_cpp
+import asyncio
+
+def dprint(*args, **kwargs):
+    print(*args, **kwargs)
+
+model_path = '/home/zbuntu/projects/Codestral-22B-v0.1-Q5_K_M.gguf'
+
+server_settings = ServerSettings()
+server_settings.host = "127.0.0.1"
+server_settings.port = 4321
+
+model_setting: dict = {
+    "model": model_path,
+    "model_alias": "codegemma-7b8bit",
+    "n_gpu_layers": -1,
+    "rope_scaling_type": llama_cpp.LLAMA_ROPE_SCALING_TYPE_LINEAR,
+    "rope_freq_base": 1000000.0,
+    "rope_freq_scale": 1,
+    "n_ctx": 16000,
+    "n_threads": 1
+}
+
+app = create_app(
+    server_settings=server_settings,
+    model_settings=[ModelSettings(**model_setting)],
+)
+
+lock = asyncio.Lock()
+queue = asyncio.Queue(maxsize=1)  # Set the queue size to 1
+
+async def wait_for_disconnect(request: Request, queue):
+    dprint('starting to wait for disconnect')
+    while True:
+        if await request.is_disconnected():
+            break
+        await asyncio.sleep(0.5)
+    dprint('client disconnected')
+    await queue.get()
+    dprint('request removed from queue')
+
+
+@app.middleware("http")
+async def synchronize_requests(request: Request, call_next):
+    try:
+        dprint('adding request to queue')
+        await queue.put(request)  # Add the request to the queue
+
+        dprint('request added to queue')
+        async with lock:
+            dprint('acquired lock serving request')
+            try:
+                # Process the request
+                response = await call_next(request)
+                dprint('response received')
+                asyncio.create_task(wait_for_disconnect(request, queue))
+                dprint('wait for disconnect thread created')
+                # print(response)
+                return response
+            finally:
+                dprint('request processed')
+    except asyncio.QueueFull:
+        dprint('exception encountered')
+        # If the queue is full, return a 503 error response
+        return JSONResponse(status_code=503, content={"error": "Service unavailable, try again later"})
+
+
+uvicorn.run(
+    app,
+    host=server_settings.host,
+    port=server_settings.port,
+    ssl_keyfile=server_settings.ssl_keyfile,
+    ssl_certfile=server_settings.ssl_certfile,
+)
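For a quick end-to-end check of the patched FIM path, something like the following should work once `python run-codestral.py` is listening. This is a minimal sketch under assumptions, not part of the patch: it assumes the `requests` package is installed, reuses the host, port, API key, and model name from config.json, and hand-builds a prompt in the `[SUFFIX]...[PREFIX]...` layout that the new code in `create_completion` splits apart and re-encodes with `MistralTokenizer.v3()`.

```python
# Minimal sketch (not part of the patch): exercise the FIM rewrite added to
# create_completion by sending a Continue-style prompt to the local server.
# Host, port, "apiKey", and model name mirror config.json / run-codestral.py;
# `requests` is an assumed extra dependency.
import requests

# Suffix goes before the [PREFIX] marker, prefix after it: the patched
# app.py strips '[SUFFIX]', splits on '[PREFIX]', and builds the FIM prompt.
prompt = "[SUFFIX]\n    return a+b[PREFIX]def add"

resp = requests.post(
    "http://127.0.0.1:4321/v1/completions",
    headers={"Authorization": "Bearer key"},
    json={"model": "codestral-latest", "prompt": prompt, "max_tokens": 64},
    timeout=300,
)
resp.raise_for_status()
# The middle portion Codestral fills in, e.g. something like "(a, b):"
print(resp.json()["choices"][0]["text"])
```

The debug prints added to app.py should show the prefix/suffix split and the rewritten prompt for each request, which makes it easy to confirm the markers are being parsed as intended.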
