From 8c83804f62b1cc571eeade3fd63e9a77be5de693 Mon Sep 17 00:00:00 2001
From: Anil Pathak
Date: Sun, 14 Jan 2024 00:59:23 -0600
Subject: [PATCH 1/4] Fix low_level_api_chat_cpp to match current API

---
 .../low_level_api/low_level_api_chat_cpp.py   | 51 ++++++++++++++-----
 1 file changed, 37 insertions(+), 14 deletions(-)

diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py
index 44b6d4a35..1d7646179 100644
--- a/examples/low_level_api/low_level_api_chat_cpp.py
+++ b/examples/low_level_api/low_level_api_chat_cpp.py
@@ -18,6 +18,7 @@
 import llama_cpp
 from common import GptParams, gpt_params_parse, gpt_random_prompt
 import util
+import os
 
 # A LLaMA interactive session
 class LLaMAInteract:
@@ -62,7 +63,7 @@ def __init__(self, params: GptParams) -> None:
         self.multibyte_fix = []
 
         # model load
-        self.lparams = llama_cpp.llama_context_default_params()
+        self.lparams = llama_cpp.llama_model_default_params()
         self.lparams.n_ctx = self.params.n_ctx
         self.lparams.n_parts = self.params.n_parts
         self.lparams.seed = self.params.seed
@@ -72,7 +73,11 @@ def __init__(self, params: GptParams) -> None:
 
         self.model = llama_cpp.llama_load_model_from_file(
             self.params.model.encode("utf8"), self.lparams)
-        self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.lparams)
+
+        # Context Params.
+        self.cparams = llama_cpp.llama_context_default_params()
+
+        self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.cparams)
         if (not self.ctx):
             raise RuntimeError(f"error: failed to load model '{self.params.model}'")
 
@@ -244,7 +249,7 @@ def __init__(self, params: GptParams) -> None:
     # tokenize a prompt
     def _tokenize(self, prompt, bos=True):
         _arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))()
-        _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8", errors="ignore"), _arr, len(_arr), bos)
+        _n = llama_cpp.llama_tokenize(self.model, prompt.encode("utf8", errors="ignore"), len(prompt), _arr, len(_arr), bos, False)
         return _arr[:_n]
 
     def set_color(self, c):
@@ -304,7 +309,7 @@ def generate(self):
             self.n_past += n_eval"""
 
             if (llama_cpp.llama_eval(
-                self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads
+                self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past
             ) != 0):
                 raise Exception("Failed to llama_eval!")
 
@@ -332,7 +337,7 @@ def generate(self):
                 id = 0
 
                 logits = llama_cpp.llama_get_logits(self.ctx)
-                n_vocab = llama_cpp.llama_n_vocab(self.ctx)
+                n_vocab = llama_cpp.llama_n_vocab(self.model)
 
                 # Apply params.logit_bias map
                 for key, value in self.params.logit_bias.items():
@@ -349,12 +354,20 @@ def generate(self):
                 last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx)
 
                 _arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:])
-                llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p,
-                    _arr,
-                    last_n_repeat, llama_cpp.c_float(self.params.repeat_penalty))
-                llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p,
-                    _arr,
-                    last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))
+                llama_cpp.llama_sample_repetition_penalties(
+                    ctx=self.ctx,
+                    candidates=candidates_p,
+                    last_tokens_data = _arr,
+                    penalty_last_n = last_n_repeat,
+                    penalty_repeat = llama_cpp.c_float(self.params.repeat_penalty),
+                    penalty_freq = llama_cpp.c_float(self.params.frequency_penalty),
+                    penalty_present = llama_cpp.c_float(self.params.presence_penalty),
+                )
+
+                # NOT PRESENT IN CURRENT VERSION ?
+                # llama_cpp.llama_sample_frequency_and_presence_penalti(self.ctx, candidates_p,
+                #     _arr,
+                #     last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))
 
                 if not self.params.penalize_nl:
                     logits[llama_cpp.llama_token_nl()] = nl_logit
@@ -473,7 +486,7 @@ def exit(self):
     def token_to_str(self, token_id: int) -> bytes:
         size = 32
         buffer = (ctypes.c_char * size)()
-        n = llama_cpp.llama_token_to_piece_with_model(
+        n = llama_cpp.llama_token_to_piece(
             self.model, llama_cpp.llama_token(token_id), buffer, size)
         assert n <= size
         return bytes(buffer[:n])
@@ -532,6 +545,9 @@ def interact(self):
                 print(i,end="",flush=True)
         self.params.input_echo = False
 
+        # Using string instead of tokens to check for antiprompt,
+        # It is more reliable than tokens for interactive mode.
+        generated_str = ""
         while self.params.interactive:
             self.set_color(util.CONSOLE_COLOR_USER_INPUT)
             if (self.params.instruct):
@@ -546,6 +562,10 @@ def interact(self):
             try:
                 for i in self.output():
                     print(i,end="",flush=True)
+                    generated_str += i
+                    for ap in self.params.antiprompt:
+                        if generated_str.endswith(ap):
+                            raise KeyboardInterrupt
             except KeyboardInterrupt:
                 self.set_color(util.CONSOLE_COLOR_DEFAULT)
                 if not self.params.instruct:
@@ -561,7 +581,7 @@ def interact(self):
     time_now = datetime.now()
     prompt = f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}.
 {AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}’s requests immediately and with details and precision.
-There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
+Transcript below contains only the recorded dialog between two, without any annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
 The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
 The transcript only includes text, it does not include markup like HTML and Markdown.
 
@@ -575,8 +595,11 @@ def interact(self):
 {AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
 {USER_NAME}: Name a color.
 {AI_NAME}: Blue
-{USER_NAME}:"""
+{USER_NAME}: """
+
     params = gpt_params_parse()
+    if params.prompt is None and params.file is None:
+        params.prompt = prompt
 
     with LLaMAInteract(params) as m:
         m.interact()

From 6775080807b44968f9b127f8a9775dbc3aa21985 Mon Sep 17 00:00:00 2001
From: Anil Pathak
Date: Sun, 14 Jan 2024 01:05:39 -0600
Subject: [PATCH 2/4] Fix low_level_api_chat_cpp to match current API

---
 examples/low_level_api/low_level_api_chat_cpp.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py
index 1d7646179..02c09afb0 100644
--- a/examples/low_level_api/low_level_api_chat_cpp.py
+++ b/examples/low_level_api/low_level_api_chat_cpp.py
@@ -18,7 +18,6 @@
 import llama_cpp
 from common import GptParams, gpt_params_parse, gpt_random_prompt
 import util
-import os
 
 # A LLaMA interactive session
 class LLaMAInteract:

From 139257bafd9bd7ac42d4704ce581f1db28da56a5 Mon Sep 17 00:00:00 2001
From: Anil Pathak
Date: Sun, 14 Jan 2024 01:09:59 -0600
Subject: [PATCH 3/4] Using None instead of empty string so that the default prompt template can be used if no prompt is provided

---
 examples/low_level_api/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py
index 55d08db5f..1a5152530 100644
--- a/examples/low_level_api/common.py
+++ b/examples/low_level_api/common.py
@@ -106,7 +106,7 @@ def gpt_params_parse(argv = None):
     parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta")
 
     parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model")
-    parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt")
+    parser.add_argument("-p", "--prompt", type=str, default=None, help="initial prompt",dest="prompt")
     parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file")
     parser.add_argument("--session", type=str, default=None, help="file to cache model state in (may be large!)",dest="path_session")
     parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix")

From e200577326f3b392c40cdfbcfe688c85273d1bfc Mon Sep 17 00:00:00 2001
From: Anil Pathak
Date: Mon, 15 Jan 2024 12:33:16 -0600
Subject: [PATCH 4/4] Support Accept text/event-stream in chat and completion endpoints, resolves #1083

---
 llama_cpp/server/app.py | 59 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 57 insertions(+), 2 deletions(-)

diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index c54e4eb5c..fed0a6deb 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -197,7 +197,36 @@ async def authenticate(
 
 
 @router.post(
-    "/v1/completions", summary="Completion", dependencies=[Depends(authenticate)]
+    "/v1/completions",
+    summary="Completion",
+    dependencies=[Depends(authenticate)],
+    response_model= Union[
+        llama_cpp.CreateCompletionResponse,
+        str,
+    ],
+    responses={
+        "200": {
+            "description": "Successful Response",
+            "content": {
+                "application/json": {
+                    "schema": {
+                        "anyOf": [
+                            {"$ref": "#/components/schemas/CreateCompletionResponse"}
+                        ],
+                        "title": "Completion response, when stream=False",
+                    }
+                },
+                "text/event-stream":{
+                    "schema": {
+                        "type": "string",
+                        "title": "Server Side Streaming response, when stream=True. "
+                        + "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format",  # noqa: E501
+                        "example": """data: {... see CreateCompletionResponse ...} \\n\\n data: ... \\n\\n ... data: [DONE]"""
+                    }
+                }
+            },
+        }
+    },
 )
 @router.post(
     "/v1/engines/copilot-codex/completions",
@@ -280,7 +309,33 @@ async def create_embedding(
 
 
 @router.post(
-    "/v1/chat/completions", summary="Chat", dependencies=[Depends(authenticate)]
+    "/v1/chat/completions", summary="Chat", dependencies=[Depends(authenticate)],
+    response_model= Union[
+        llama_cpp.ChatCompletion, str
+    ],
+    responses={
+        "200": {
+            "description": "Successful Response",
+            "content": {
+                "application/json": {
+                    "schema": {
+                        "anyOf": [
+                            {"$ref": "#/components/schemas/CreateChatCompletionResponse"}
+                        ],
+                        "title": "Completion response, when stream=False",
+                    }
+                },
+                "text/event-stream":{
+                    "schema": {
+                        "type": "string",
+                        "title": "Server Side Streaming response, when stream=True. "
+                        + "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format",  # noqa: E501
+                        "example": """data: {... see CreateChatCompletionResponse ...} \\n\\n data: ... \\n\\n ... data: [DONE]"""
+                    }
+                }
+            },
+        }
+    },
 )
 async def create_chat_completion(
     request: Request,
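
For reference, below is a minimal client-side sketch of how the text/event-stream response added in PATCH 4/4 could be consumed. It is illustrative only and not part of the patches: the server URL, prompt, and max_tokens values are placeholders, and it assumes a llama-cpp-python server is already running locally with stream=True requested and the Accept: text/event-stream header selecting the SSE variant documented in the OpenAPI responses above. Only the Python standard library is used.

# Illustrative sketch only (not part of the patches above). The URL, prompt,
# and token limit are placeholder assumptions.
import json
import urllib.request

request = urllib.request.Request(
    "http://localhost:8000/v1/completions",
    data=json.dumps({"prompt": "Hello", "max_tokens": 32, "stream": True}).encode("utf8"),
    headers={
        "Content-Type": "application/json",
        # Ask for the text/event-stream variant documented in PATCH 4/4.
        "Accept": "text/event-stream",
    },
)

with urllib.request.urlopen(request) as response:
    for raw_line in response:
        line = raw_line.decode("utf8").strip()
        if not line.startswith("data: "):
            continue  # skip the blank separator lines between SSE events
        payload = line[len("data: "):]
        if payload == "[DONE]":
            break  # end-of-stream marker, as shown in the OpenAPI example above
        chunk = json.loads(payload)
        # Each chunk follows the CreateCompletionResponse shape referenced above.
        print(chunk["choices"][0]["text"], end="", flush=True)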