diff --git a/docker/openblas_simple/Dockerfile b/docker/openblas_simple/Dockerfile
index 020c34df7..d852dc830 100644
--- a/docker/openblas_simple/Dockerfile
+++ b/docker/openblas_simple/Dockerfile
@@ -12,4 +12,4 @@ RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fas
RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama_cpp_python --verbose
# Run the server
-CMD python3 -m llama_cpp.server
+CMD python3 -m llama_cpp.server --host 0.0.0.0 --port 8000 --n_ctx 4096
\ No newline at end of file
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 67ee2dbc6..c5cef5baf 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -887,6 +887,7 @@ def _create_completion(
completion_tokens: List[int] = []
# Add blank space to start of prompt to match OG llama tokenizer
prompt_tokens: List[int] = self.tokenize(prompt.encode("utf-8")) if prompt != "" else [self.token_bos()]
+ print(prompt)
text: bytes = b""
returned_tokens: int = 0
stop = (
@@ -1560,12 +1561,31 @@ def create_chat_completion(
stop = (
stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else []
)
- chat_history = "".join(
- f'### {"Human" if message["role"] == "user" else "Assistant"}:{message["content"]}'
- for message in messages
- )
- PROMPT = chat_history + "### Assistant:"
- PROMPT_STOP = ["### Assistant:", "### Human:"]
+
+ # assistant, user, assistant
+ # system, user, assistant
+ # chat_history = "".join(
+ # f'### {"Human" if message["role"] == "user" else "Assistant"}:{message["content"]}'
+ # for message in messages
+ # )
+ # PROMPT = chat_history + "### Assistant:"
+
+
+            system_prompt_template = "[INST] <<SYS>>\n{}\n<</SYS>>\n\n"
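
Context for the new hunk: the "### Human / ### Assistant" prompt is being replaced with the Llama 2 chat template, which wraps each user turn in [INST] ... [/INST] and folds an optional system message into a <<SYS>> ... <</SYS>> block at the start of the first turn. Below is a minimal, illustrative sketch of how such a prompt is typically assembled from OpenAI-style messages; the function name and constants are assumptions for illustration and are not part of this patch.

# Illustrative sketch only -- not part of the patch above.
from typing import Dict, List

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

def build_llama2_prompt(messages: List[Dict[str, str]]) -> str:
    """Fold the system message into the first user turn, then alternate
    [INST] user [/INST] assistant </s><s> turns (Llama 2 chat convention)."""
    system = ""
    prompt = ""
    for message in messages:
        if message["role"] == "system":
            system = B_SYS + message["content"] + E_SYS
        elif message["role"] == "user":
            prompt += f"{B_INST} {system}{message['content']} {E_INST}"
            system = ""  # the system block is only prepended to the first user turn
        else:  # assistant
            prompt += f" {message['content']} </s><s>"
    return prompt

# Example:
#   build_llama2_prompt([
#       {"role": "system", "content": "Be brief."},
#       {"role": "user", "content": "Hi"},
#   ])
#   -> '[INST] <<SYS>>\nBe brief.\n<</SYS>>\n\nHi [/INST]'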