diff --git a/pgml-apps/rag-retrieval-timing-tests/.env.development b/pgml-apps/rag-retrieval-timing-tests/.env.development
new file mode 100644
index 000000000..4d6f78d50
--- /dev/null
+++ b/pgml-apps/rag-retrieval-timing-tests/.env.development
@@ -0,0 +1,6 @@
+PINECONE_API_KEY=
+QDRANT_API_KEY=
+ZILLIZ_API_KEY=
+WCS_API_KEY=
+OPENAI_API_KEY=
+HF_TOKEN=
diff --git a/pgml-apps/rag-retrieval-timing-tests/README.md b/pgml-apps/rag-retrieval-timing-tests/README.md
new file mode 100644
index 000000000..2a1dd39b6
--- /dev/null
+++ b/pgml-apps/rag-retrieval-timing-tests/README.md
@@ -0,0 +1,7 @@
+# RAG Timing Tests
+
+This script runs timing tests for common RAG systems.
+
+To run it, copy `.env.development` to `.env`, set the appropriate variables in the `.env` file, install the dependencies in `requirements.txt`, and run `python3 __main__.py`.
+
+Note that this script assumes certain setup actions, such as creating databases or setting up "collections", have already been performed for each cloud provider. See the script for more details.
diff --git a/pgml-apps/rag-retrieval-timing-tests/__main__.py b/pgml-apps/rag-retrieval-timing-tests/__main__.py
new file mode 100644
index 000000000..7fd6b92aa
--- /dev/null
+++ b/pgml-apps/rag-retrieval-timing-tests/__main__.py
@@ -0,0 +1,161 @@
+import time
+import asyncio
+
+import postgresml as pgl
+import zilliz_local as zl
+import pinecone_local as pl
+import qdrant_local as ql
+import openai_local as al
+import huggingface as hf
+import weaviate_local as wl
+
+TRIAL_COUNT = 2
+
+# The pairs we are testing with
+tests = [
+    {
+        "name": "PostgresML",
+        "vector_store": pgl,
+        "rag+": True,
+        "chatbot_service": al,
+        "async": True,
+    },
+    {"name": "Weaviate", "vector_store": wl, "chatbot_service": al, "rag++": True},
+    {
+        "name": "Zilliz",
+        "vector_store": zl,
+        "embedding_service": hf,
+        "chatbot_service": al,
+    },
+    {
+        "name": "Pinecone",
+        "vector_store": pl,
+        "embedding_service": hf,
+        "chatbot_service": al,
+    },
+    {
+        "name": "Qdrant",
+        "vector_store": ql,
+        "embedding_service": hf,
+        "chatbot_service": al,
+    },
+]
+
+
+# Our documents
+# We only really need to test on 2. When we search we are trying to get the first document back
+documents = [
+    {"id": "0", "metadata": {"text": "The hidden value is 1000"}},
+    {
+        "id": "1",
+        "metadata": {"text": "This is just some random text"},
+    },
+]
+
+
+def maybe_do_async(func, check_dict, *args):
+    if "async" in check_dict and check_dict["async"]:
+        return asyncio.run(func(*args))
+    else:
+        return func(*args)
+
+
+def do_data_upsert(name, vector_store, **kwargs):
+    print(f"Doing Data Upsert For: {name}")
+    if "rag++" in kwargs or "rag+" in kwargs:
+        maybe_do_async(vector_store.upsert_data, kwargs, documents)
+    else:
+        texts = [d["metadata"]["text"] for d in documents]
+        (embeddings, time_to_embed) = kwargs["embedding_service"].get_embeddings(texts)
+        maybe_do_async(vector_store.upsert_data, kwargs, documents, embeddings)
+    print(f"Done Doing Data Upsert For: {name}\n")
+
+
+def do_normal_rag_test(name, vector_store, **kwargs):
+    print(f"Doing RAG Test For: {name}")
+    query = "What is the hidden value?"
+ if "rag++" in kwargs: + (result, time_to_complete) = maybe_do_async( + vector_store.get_llm_response, kwargs, query + ) + time_to_embed = 0 + time_to_search = 0 + elif "rag+" in kwargs: + time_to_embed = 0 + (context, time_to_search) = maybe_do_async( + vector_store.do_search, kwargs, query + ) + (result, time_to_complete) = kwargs["chatbot_service"].get_llm_response( + query, context + ) + else: + (embeddings, time_to_embed) = kwargs["embedding_service"].get_embeddings( + [query] + ) + (context, time_to_search) = vector_store.do_search(embeddings[0]) + (result, time_to_complete) = kwargs["chatbot_service"].get_llm_response( + query, context + ) + print(f"\tThe LLM Said: {result}") + time_for_retrieval = time_to_embed + time_to_search + total_time = time_to_embed + time_to_search + time_to_complete + print(f"Done Doing RAG Test For: {name}") + print(f"- Time to Embed: {time_to_embed}") + print(f"- Time to Search: {time_to_search}") + print(f"- Total Time for Retrieval: {time_for_retrieval}") + print(f"- Time for Chatbot Completion: {time_to_complete}") + print(f"- Total Time Taken: {total_time}\n") + return { + "time_to_embed": time_to_embed, + "time_to_search": time_to_search, + "time_for_retrieval": time_for_retrieval, + "time_to_complete": time_to_complete, + "total_time": total_time, + } + + +if __name__ == "__main__": + print("----------Doing Data Setup-------------------------\n") + for test in tests: + do_data_upsert(**test) + print("\n----------Done Doing Data Setup------------------\n\n") + + print("----------Doing Rag Tests-------------------------\n") + stats = {} + for i in range(TRIAL_COUNT): + for test in tests: + times = do_normal_rag_test(**test) + if not test["name"] in stats: + stats[test["name"]] = [] + stats[test["name"]].append(times) + print("\n----------Done Doing Rag Tests---------------------\n") + + print("------------Final Results---------------------------\n") + for test in tests: + trials = stats[test["name"]] + ( + time_to_embed, + time_to_search, + time_for_retrieval, + time_to_complete, + total_time, + ) = [ + sum(trial[key] for trial in trials) + for key in [ + "time_to_embed", + "time_to_search", + "time_for_retrieval", + "time_to_complete", + "total_time", + ] + ] + print(f'Done Doing RAG Test For: {test["name"]}') + print(f"- Average Time to Embed: {(time_to_embed / TRIAL_COUNT):0.4f}") + print(f"- Average Time to Search: {(time_to_search / TRIAL_COUNT):0.4f}") + print( + f"- Average Total Time for Retrieval: {(time_for_retrieval / TRIAL_COUNT):0.4f}" + ) + print( + f"- Average Time for Chatbot Completion: {(time_to_complete / TRIAL_COUNT):0.4f}" + ) + print(f"- Average Total Time Taken: {(total_time / TRIAL_COUNT):0.4f}\n") diff --git a/pgml-apps/rag-retrieval-timing-tests/huggingface.py b/pgml-apps/rag-retrieval-timing-tests/huggingface.py new file mode 100644 index 000000000..5d1f03666 --- /dev/null +++ b/pgml-apps/rag-retrieval-timing-tests/huggingface.py @@ -0,0 +1,29 @@ +import requests +import time +import os +import sys +from dotenv import load_dotenv + +# Load our environment variables +load_dotenv() +HF_TOKEN = os.getenv("HF_TOKEN") + + +# Get the embedding from HuggingFace +def get_embeddings(inputs): + print("\tGetting embeddings from HuggingFace") + tic = time.perf_counter() + headers = {"Authorization": f"Bearer {HF_TOKEN}"} + payload = {"inputs": inputs} + response = requests.post( + "https://api-inference.huggingface.co/pipeline/feature-extraction/intfloat/e5-small", + headers=headers, + json=payload, + ) + toc = time.perf_counter() + 
+    time_taken = toc - tic
+    print(f"\tDone getting embeddings: {time_taken:0.4f}\n")
+    response = response.json()
+    if "error" in response:
+        sys.exit(response)
+    return (response, time_taken)
diff --git a/pgml-apps/rag-retrieval-timing-tests/openai_local.py b/pgml-apps/rag-retrieval-timing-tests/openai_local.py
new file mode 100644
index 000000000..6d9a39823
--- /dev/null
+++ b/pgml-apps/rag-retrieval-timing-tests/openai_local.py
@@ -0,0 +1,26 @@
+from openai import OpenAI
+import time
+
+# Create our OpenAI client
+client = OpenAI()
+
+
+# Get LLM response from OpenAI
+def get_llm_response(query, context):
+    print("\tGetting LLM response from OpenAI")
+    tic = time.perf_counter()
+    completion = client.chat.completions.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {
+                "role": "system",
+                "content": f"You are a helpful assistant. Given the context, provide an answer to the user: \n{context}",
+            },
+            {"role": "user", "content": query},
+        ],
+    )
+    toc = time.perf_counter()
+    time_taken = toc - tic
+    print(f"\tDone getting the LLM response: {time_taken:0.4f}")
+    response = completion.choices[0].message.content
+    return (response, time_taken)
diff --git a/pgml-apps/rag-retrieval-timing-tests/pinecone_local.py b/pgml-apps/rag-retrieval-timing-tests/pinecone_local.py
new file mode 100644
index 000000000..283e83aa4
--- /dev/null
+++ b/pgml-apps/rag-retrieval-timing-tests/pinecone_local.py
@@ -0,0 +1,43 @@
+from pinecone import Pinecone, ServerlessSpec
+from dotenv import load_dotenv
+import time
+import os
+
+# Load our environment variables
+load_dotenv()
+PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
+
+# Create our Pinecone client
+# Note: we created their default index using the gcp-starter environment and the us-central1 region
+pc = Pinecone(api_key=PINECONE_API_KEY)
+index = pc.Index("test")
+
+
+# Store some initial documents to retrieve
+def upsert_data(documents, embeddings):
+    for document, embedding in zip(documents, embeddings):
+        document["values"] = embedding
+    print("\tStarting Pinecone upsert")
+    tic = time.perf_counter()
+    index.upsert(documents, namespace="ns1")
+    toc = time.perf_counter()
+    time_taken_to_upsert = toc - tic
+    print(f"\tDone Pinecone upsert: {time_taken_to_upsert:0.4f}")
+    return time_taken_to_upsert
+
+
+# Do cosine similarity search over our Pinecone index
+def do_search(vector):
+    print("\tDoing cosine similarity search with Pinecone")
+    tic = time.perf_counter()
+    results = index.query(
+        namespace="ns1",
+        vector=vector,
+        top_k=1,
+        include_metadata=True,
+    )
+    toc = time.perf_counter()
+    time_done = toc - tic
+    print(f"\tDone doing cosine similarity search: {time_done:0.4f}\n")
+    result = results["matches"][0]["metadata"]["text"]
+    return (result, time_done)
diff --git a/pgml-apps/rag-retrieval-timing-tests/postgresml.py b/pgml-apps/rag-retrieval-timing-tests/postgresml.py
new file mode 100644
index 000000000..83f7fc3de
--- /dev/null
+++ b/pgml-apps/rag-retrieval-timing-tests/postgresml.py
@@ -0,0 +1,62 @@
+from pgml import Collection, Pipeline
+from dotenv import load_dotenv
+import time
+
+# Load our environment variables
+load_dotenv()
+
+# Initialize our Collection and Pipeline
+collection = Collection("test_collection")
+pipeline = Pipeline(
+    "test_pipeline",
+    {
+        "text": {
+            "semantic_search": {
+                "model": "intfloat/e5-small",
+            },
+        }
+    },
+)
+
+
+# Add the Pipeline to our collection
+# We only need to do this once
+async def setup_pipeline():
+    await collection.add_pipeline(pipeline)
+
+
+async def upsert_data(documents):
+    documents = [
document["id"], "text": document["metadata"]["text"]} + for document in documents + ] + print("Starting PostgresML upsert") + tic = time.perf_counter() + await collection.upsert_documents(documents) + toc = time.perf_counter() + time_taken = toc - tic + print(f"Done PostgresML upsert: {time_taken:0.4f}\n") + + +async def do_search(query): + print( + "\tDoing embedding and cosine similarity search over our PostgresML Collection" + ) + tic = time.perf_counter() + results = await collection.vector_search( + { + "query": { + "fields": { + "text": { + "query": query, + }, + } + }, + "limit": 1, + }, + pipeline, + ) + toc = time.perf_counter() + time_taken = toc - tic + print(f"\tDone doing embedding and cosine similarity search: {time_taken:0.4f}\n") + return (results[0]["chunk"], time_taken) diff --git a/pgml-apps/rag-retrieval-timing-tests/qdrant_local.py b/pgml-apps/rag-retrieval-timing-tests/qdrant_local.py new file mode 100644 index 000000000..46be70b05 --- /dev/null +++ b/pgml-apps/rag-retrieval-timing-tests/qdrant_local.py @@ -0,0 +1,49 @@ +from qdrant_client import QdrantClient +from qdrant_client.models import Distance, VectorParams, PointStruct +from dotenv import load_dotenv +import time +import os + +# Load our environment variables +load_dotenv() +QDRANT_API_KEY = os.getenv("QDRANT_API_KEY") + +# Create our Qdrant client +qdrant = QdrantClient( + url="https://059364f6-62c5-4f80-9f19-cf6d6394caae.us-east4-0.gcp.cloud.qdrant.io:6333", + api_key=QDRANT_API_KEY, +) + +# Create our Qdrant collection +qdrant.recreate_collection( + collection_name="test", + vectors_config=VectorParams(size=384, distance=Distance.COSINE), +) + + +# Store some initial documents to retrieve +def upsert_data(documents, embeddings): + points = [ + PointStruct( + id=int(document["id"]), vector=embedding, payload=document["metadata"] + ) + for document, embedding in zip(documents, embeddings) + ] + print("\tStarting Qdrant upsert") + tic = time.perf_counter() + qdrant.upsert(collection_name="test", points=points) + toc = time.perf_counter() + time_taken_to_upsert = toc - tic + print(f"\tDone Qdrant upsert: {time_taken_to_upsert:0.4f}") + return time_taken_to_upsert + + +# Do cosine similarity search over our Qdrant collection +def do_search(vector): + print("\tDoing cosine similarity search with Qdrant") + tic = time.perf_counter() + results = qdrant.search(collection_name="test", query_vector=vector, limit=1) + toc = time.perf_counter() + time_done = toc - tic + print(f"\tDone doing cosine similarity search: {time_done:0.4f}\n") + return (results, time_done) diff --git a/pgml-apps/rag-retrieval-timing-tests/requirements.txt b/pgml-apps/rag-retrieval-timing-tests/requirements.txt new file mode 100644 index 000000000..bea10b7d6 --- /dev/null +++ b/pgml-apps/rag-retrieval-timing-tests/requirements.txt @@ -0,0 +1,36 @@ +annotated-types==0.6.0 +anyio==4.3.0 +Authlib==1.3.0 +certifi==2024.2.2 +cffi==1.16.0 +charset-normalizer==3.3.2 +cryptography==42.0.5 +distro==1.9.0 +grpcio==1.62.0 +grpcio-health-checking==1.62.0 +grpcio-tools==1.62.0 +h11==0.14.0 +h2==4.1.0 +hpack==4.0.0 +httpcore==1.0.4 +httpx==0.27.0 +hyperframe==6.0.1 +idna==3.6 +numpy==1.26.4 +openai==1.13.3 +pgml==1.0.0 +pinecone-client==3.1.0 +portalocker==2.8.2 +protobuf==4.25.3 +pycparser==2.21 +pydantic==2.6.3 +pydantic_core==2.16.3 +python-dotenv==1.0.1 +qdrant-client==1.7.3 +requests==2.31.0 +sniffio==1.3.1 +tqdm==4.66.2 +typing_extensions==4.10.0 +urllib3==2.2.1 +validators==0.22.0 +weaviate-client==4.5.1 diff --git 
diff --git a/pgml-apps/rag-retrieval-timing-tests/weaviate_local.py b/pgml-apps/rag-retrieval-timing-tests/weaviate_local.py
new file mode 100644
index 000000000..025a3a5e5
--- /dev/null
+++ b/pgml-apps/rag-retrieval-timing-tests/weaviate_local.py
@@ -0,0 +1,71 @@
+import weaviate
+import weaviate.classes as wvc
+import os
+import time
+from dotenv import load_dotenv
+
+# Load our environment variables
+load_dotenv()
+WEAVIATE_API_KEY = os.getenv("WCS_API_KEY")
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+HF_TOKEN = os.getenv("HF_TOKEN")
+
+# Connect to a WCS instance
+client = weaviate.connect_to_wcs(
+    cluster_url="https://test-n0wsyrvs.weaviate.network",
+    auth_credentials=weaviate.auth.AuthApiKey(WEAVIATE_API_KEY),
+    headers={
+        "X-OpenAI-Api-Key": OPENAI_API_KEY,
+        "X-HuggingFace-Api-Key": HF_TOKEN,
+    },
+)
+
+# NOTE: We can only create this once or it seems to error out
+# Create Weaviate Collection
+# test_collection = client.collections.create(
+#     name="test",
+#     vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_huggingface(
+#         "intfloat/e5-small"
+#     ),
+#     generative_config=wvc.config.Configure.Generative.openai(),
+#     properties=[
+#         wvc.config.Property(
+#             name="text",
+#             data_type=wvc.config.DataType.TEXT,
+#             vectorize_property_name=True,
+#             tokenization=wvc.config.Tokenization.LOWERCASE,
+#         ),
+#     ],
+#     vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
+#         distance_metric=wvc.config.VectorDistances.COSINE,
+#         quantizer=wvc.config.Configure.VectorIndex.Quantizer.pq(),
+#     ),
+# )
+test_collection = client.collections.get("test")
+
+
+def upsert_data(documents):
+    documents = [
+        wvc.data.DataObject(properties={"text": document["metadata"]["text"]})
+        for document in documents
+    ]
+    print("Starting Weaviate upsert")
+    tic = time.perf_counter()
+    test_collection.data.insert_many(documents)
+    toc = time.perf_counter()
+    time_taken = toc - tic
+    print(f"Done Weaviate upsert: {time_taken:0.4f}\n")
+
+
+def get_llm_response(query):
+    print("\tDoing Embedding, Search, and Getting LLM Response from Weaviate")
+    tic = time.perf_counter()
+    response = test_collection.generate.near_text(
+        query=query, limit=1, grouped_task=query
+    )
+    toc = time.perf_counter()
+    time_taken = toc - tic
+    print(
+        f"\tDone Doing Embedding, Search, and Getting LLM Response from Weaviate: {time_taken:0.4f}"
+    )
+    return (response.generated, time_taken)
diff --git a/pgml-apps/rag-retrieval-timing-tests/zilliz_local.py b/pgml-apps/rag-retrieval-timing-tests/zilliz_local.py
new file mode 100644
index 000000000..fa462e7f2
--- /dev/null
+++ b/pgml-apps/rag-retrieval-timing-tests/zilliz_local.py
@@ -0,0 +1,65 @@
+import requests
+from dotenv import load_dotenv
+import time
+import json
+import os
+
+# Load our environment variables
+load_dotenv()
+ZILLIZ_API_KEY = os.getenv("ZILLIZ_API_KEY")
+
+
+# We created a simple Zilliz collection called test with fields:
+# primary_key - an int primary key
+# vector - a vector of size 384
+# text - a varchar
+
+
+def upsert_data(documents, embeddings):
+    documents = [
+        {
+            "vector": embedding,
+            "primary_key": document["id"],
+            "text": document["metadata"]["text"],
+        }
+        for document, embedding in zip(documents, embeddings)
+    ]
+    headers = {
+        "Authorization": f"Bearer {ZILLIZ_API_KEY}",
+        "Accept": "application/json",
+        "Content-Type": "application/json",
+    }
+    url = (
+        "https://in03-23659a0ce4651d6.api.gcp-us-west1.zillizcloud.com/v1/vector/insert"
+    )
+    payload = {
+        "collectionName": "test",
+        "data": documents,
+    }
+    print("\tStarting Zilliz upsert")
+    tic = time.perf_counter()
+    requests.post(url, data=json.dumps(payload), headers=headers)
+    toc = time.perf_counter()
+    time_taken_to_upsert = toc - tic
+    print(f"\tDone Zilliz upsert: {time_taken_to_upsert:0.4f}")
+    return time_taken_to_upsert
+
+
+def do_search(vector):
+    print("\tDoing cosine similarity search with Zilliz")
+    url = (
+        "https://in03-23659a0ce4651d6.api.gcp-us-west1.zillizcloud.com/v1/vector/search"
+    )
+    payload = {"collectionName": "test", "vector": vector, "outputFields": ["text"]}
+    headers = {
+        "Authorization": f"Bearer {ZILLIZ_API_KEY}",
+        "Accept": "application/json",
+        "Content-Type": "application/json",
+    }
+    tic = time.perf_counter()
+    results = requests.post(url, data=json.dumps(payload), headers=headers).json()
+    toc = time.perf_counter()
+    time_done = toc - tic
+    print(f"\tDone doing cosine similarity search: {time_done:0.4f}\n")
+    result = results["data"][0]["text"]
+    return (result, time_done)
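
For anyone extending these tests, `__main__.py` only assumes a small interface from each vector store module: an `upsert_data` function, plus either `do_search` returning `(context, seconds)` or, for the "rag++" path, `get_llm_response(query)` returning `(answer, seconds)`. The sketch below is a hypothetical additional backend; the module name `example_local.py` and the commented-out client calls are placeholders and not part of this diff.

```python
# example_local.py - a minimal sketch of the interface __main__.py drives.
# Hypothetical example only; the client calls below stand in for a real vector store SDK.
import time


def upsert_data(documents, embeddings):
    # Store the pre-computed embeddings alongside each document's metadata
    # and report how long the insert took, mirroring the other *_local.py modules.
    tic = time.perf_counter()
    # client.insert(documents, embeddings)  # replace with the real SDK call
    toc = time.perf_counter()
    return toc - tic


def do_search(vector):
    # Run a top-1 similarity search for the query embedding and return
    # (matched document text, seconds taken), like pinecone_local.do_search.
    tic = time.perf_counter()
    # matches = client.query(vector, top_k=1)  # replace with the real SDK call
    result = "..."  # the matched document's "text" field
    toc = time.perf_counter()
    return (result, toc - tic)
```

A new entry in the `tests` list such as `{"name": "Example", "vector_store": example_local, "embedding_service": hf, "chatbot_service": al}` would then be timed alongside the other stores.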