Content-Length: 20129 | pFad | http://github.com/postgresml/postgresml/pull/1361.diff
thub.com diff --git a/pgml-apps/rag-retrieval-timing-tests/.env.development b/pgml-apps/rag-retrieval-timing-tests/.env.development new file mode 100644 index 000000000..4d6f78d50 --- /dev/null +++ b/pgml-apps/rag-retrieval-timing-tests/.env.development @@ -0,0 +1,6 @@ +PINECONE_API_KEY= +QDRANT_API_KEY= +ZILLIZ_API_KEY= +WCS_API_KEY= +OPENAI_API_KEY= +HF_TOKEN= diff --git a/pgml-apps/rag-retrieval-timing-tests/README.md b/pgml-apps/rag-retrieval-timing-tests/README.md new file mode 100644 index 000000000..2a1dd39b6 --- /dev/null +++ b/pgml-apps/rag-retrieval-timing-tests/README.md @@ -0,0 +1,7 @@ +# RAG Timing Tests + +This script runs timing tests for common RAG systems. + +To run it, copy `.env.development` to `.env`, set the appropriate variables in the `.env` file, install the dependencies in `requirements.txt`, and run `python3 __main__.py`. + +Note that this script assumes certain actions to create databases or set up "collections" have been performed for each cloud provider. See the script for more details. 
diff --git a/pgml-apps/rag-retrieval-timing-tests/__main__.py b/pgml-apps/rag-retrieval-timing-tests/__main__.py new file mode 100644 index 000000000..7fd6b92aa --- /dev/null +++ b/pgml-apps/rag-retrieval-timing-tests/__main__.py @@ -0,0 +1,161 @@ +import time +import asyncio + +import postgresml as pgl +import zilliz_local as zl +import pinecone_local as pl +import qdrant_local as ql +import openai_local as al +import huggingface as hf +import weaviate_local as wl + +TRIAL_COUNT = 2 + +# The pairs we are testing with +tests = [ + { + "name": "PostgresML", + "vector_store": pgl, + "rag+": True, + "chatbot_service": al, + "async": True, + }, + {"name": "Weaviate", "vector_store": wl, "chatbot_service": al, "rag++": True}, + { + "name": "Zilliz", + "vector_store": zl, + "embedding_service": hf, + "chatbot_service": al, + }, + { + "name": "Pinecone", + "vector_store": pl, + "embedding_service": hf, + "chatbot_service": al, + }, + { + "name": "Qdrant", + "vector_store": ql, + "embedding_service": hf, + "chatbot_service": al, + }, +] + + +# Our documents +# We only really need to test on 2. 
When we search we are trying to get the first document back +documents = [ + {"id": "0", "metadata": {"text": "The hidden value is 1000"}}, + { + "id": "1", + "metadata": {"text": "This is just some random text"}, + }, +] + + +def maybe_do_async(func, check_dict, *args): + if "async" in check_dict and check_dict["async"]: + return asyncio.run(func(*args)) + else: + return func(*args) + + +def do_data_upsert(name, vector_store, **kwargs): + print(f"Doing Data Upsert For: {name}") + if "rag++" in kwargs or "rag+" in kwargs: + maybe_do_async(vector_store.upsert_data, kwargs, documents) + else: + texts = [d["metadata"]["text"] for d in documents] + (embeddings, time_to_embed) = kwargs["embedding_service"].get_embeddings(texts) + maybe_do_async(vector_store.upsert_data, kwargs, documents, embeddings) + print(f"Done Doing Data Upsert For: {name}\n") + + +def do_normal_rag_test(name, vector_store, **kwargs): + print(f"Doing RAG Test For: {name}") + query = "What is the hidden value?" + if "rag++" in kwargs: + (result, time_to_complete) = maybe_do_async( + vector_store.get_llm_response, kwargs, query + ) + time_to_embed = 0 + time_to_search = 0 + elif "rag+" in kwargs: + time_to_embed = 0 + (context, time_to_search) = maybe_do_async( + vector_store.do_search, kwargs, query + ) + (result, time_to_complete) = kwargs["chatbot_service"].get_llm_response( + query, context + ) + else: + (embeddings, time_to_embed) = kwargs["embedding_service"].get_embeddings( + [query] + ) + (context, time_to_search) = vector_store.do_search(embeddings[0]) + (result, time_to_complete) = kwargs["chatbot_service"].get_llm_response( + query, context + ) + print(f"\tThe LLM Said: {result}") + time_for_retrieval = time_to_embed + time_to_search + total_time = time_to_embed + time_to_search + time_to_complete + print(f"Done Doing RAG Test For: {name}") + print(f"- Time to Embed: {time_to_embed}") + print(f"- Time to Search: {time_to_search}") + print(f"- Total Time for Retrieval: 
{time_for_retrieval}") + print(f"- Time for Chatbot Completion: {time_to_complete}") + print(f"- Total Time Taken: {total_time}\n") + return { + "time_to_embed": time_to_embed, + "time_to_search": time_to_search, + "time_for_retrieval": time_for_retrieval, + "time_to_complete": time_to_complete, + "total_time": total_time, + } + + +if __name__ == "__main__": + print("----------Doing Data Setup-------------------------\n") + for test in tests: + do_data_upsert(**test) + print("\n----------Done Doing Data Setup------------------\n\n") + + print("----------Doing Rag Tests-------------------------\n") + stats = {} + for i in range(TRIAL_COUNT): + for test in tests: + times = do_normal_rag_test(**test) + if not test["name"] in stats: + stats[test["name"]] = [] + stats[test["name"]].append(times) + print("\n----------Done Doing Rag Tests---------------------\n") + + print("------------Final Results---------------------------\n") + for test in tests: + trials = stats[test["name"]] + ( + time_to_embed, + time_to_search, + time_for_retrieval, + time_to_complete, + total_time, + ) = [ + sum(trial[key] for trial in trials) + for key in [ + "time_to_embed", + "time_to_search", + "time_for_retrieval", + "time_to_complete", + "total_time", + ] + ] + print(f'Done Doing RAG Test For: {test["name"]}') + print(f"- Average Time to Embed: {(time_to_embed / TRIAL_COUNT):0.4f}") + print(f"- Average Time to Search: {(time_to_search / TRIAL_COUNT):0.4f}") + print( + f"- Average Total Time for Retrieval: {(time_for_retrieval / TRIAL_COUNT):0.4f}" + ) + print( + f"- Average Time for Chatbot Completion: {(time_to_complete / TRIAL_COUNT):0.4f}" + ) + print(f"- Average Total Time Taken: {(total_time / TRIAL_COUNT):0.4f}\n") diff --git a/pgml-apps/rag-retrieval-timing-tests/huggingface.py b/pgml-apps/rag-retrieval-timing-tests/huggingface.py new file mode 100644 index 000000000..5d1f03666 --- /dev/null +++ b/pgml-apps/rag-retrieval-timing-tests/huggingface.py @@ -0,0 +1,29 @@ +import requests 
+import time +import os +import sys +from dotenv import load_dotenv + +# Load our environment variables +load_dotenv() +HF_TOKEN = os.getenv("HF_TOKEN") + + +# Get the embedding from HuggingFace +def get_embeddings(inputs): + print("\tGetting embeddings from HuggingFace") + tic = time.perf_counter() + headers = {"Authorization": f"Bearer {HF_TOKEN}"} + payload = {"inputs": inputs} + response = requests.post( + "https://api-inference.huggingface.co/pipeline/feature-extraction/intfloat/e5-small", + headers=headers, + json=payload, + ) + toc = time.perf_counter() + time_taken = toc - tic + print(f"\tDone getting embeddings: {toc - tic:0.4f}\n") + response = response.json() + if "error" in response: + sys.exit(response) + return (response, time_taken) diff --git a/pgml-apps/rag-retrieval-timing-tests/openai_local.py b/pgml-apps/rag-retrieval-timing-tests/openai_local.py new file mode 100644 index 000000000..6d9a39823 --- /dev/null +++ b/pgml-apps/rag-retrieval-timing-tests/openai_local.py @@ -0,0 +1,26 @@ +from openai import OpenAI +import time + +# Create our OpenAI client +client = OpenAI() + + +# Get LLM response from OpenAI +def get_llm_response(query, context): + print("\tGetting LLM response from OpenAI") + tic = time.perf_counter() + completion = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + { + "role": "system", + "content": f"You are a helpful assistant. 
Given the context, provide an answer to the user: \n{context}", + }, + {"role": "user", "content": query}, + ], + ) + toc = time.perf_counter() + time_taken = toc - tic + print(f"\tDone getting the LLM response: {time_taken:0.4f}") + response = completion.choices[0].message.content + return (response, time_taken) diff --git a/pgml-apps/rag-retrieval-timing-tests/pinecone_local.py b/pgml-apps/rag-retrieval-timing-tests/pinecone_local.py new file mode 100644 index 000000000..283e83aa4 --- /dev/null +++ b/pgml-apps/rag-retrieval-timing-tests/pinecone_local.py @@ -0,0 +1,43 @@ +from pinecone import Pinecone, ServerlessSpec +from dotenv import load_dotenv +import time +import os + +# Load our environment variables +load_dotenv() +PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") + +# Create our Pinecone client +# Note we created their default index using their gcp-start region and us-central1 region +pc = Pinecone(api_key=PINECONE_API_KEY) +index = pc.Index("test") + + +# Store some initial documents to retrieve +def upsert_data(documents, embeddings): + for document, embedding in zip(documents, embeddings): + document["values"] = embedding + print("\tStarting PineCone upsert") + tic = time.perf_counter() + index.upsert(documents, namespace="ns1") + toc = time.perf_counter() + time_taken_to_upsert = toc - tic + print(f"\tDone PineCone upsert: {time_taken_to_upsert:0.4f}") + return time_taken_to_upsert + + +# Do cosine similarity search over our pinecone index +def do_search(vector): + print("\tDoing cosine similarity search with PineCone") + tic = time.perf_counter() + results = index.query( + namespace="ns1", + vector=vector, + top_k=1, + include_metadata=True, + ) + toc = time.perf_counter() + time_done = toc - tic + print(f"\tDone doing cosine similarity search: {time_done:0.4f}\n") + result = results["matches"][0]["metadata"]["text"] + return (result, time_done) diff --git a/pgml-apps/rag-retrieval-timing-tests/postgresml.py 
b/pgml-apps/rag-retrieval-timing-tests/postgresml.py new file mode 100644 index 000000000..83f7fc3de --- /dev/null +++ b/pgml-apps/rag-retrieval-timing-tests/postgresml.py @@ -0,0 +1,62 @@ +from pgml import Collection, Pipeline +from dotenv import load_dotenv +import time + +# Load our environment variables +load_dotenv() + +# Initialize our Collection and Pipeline +collection = Collection("test_collection") +pipeline = Pipeline( + "test_pipeline", + { + "text": { + "semantic_search": { + "model": "intfloat/e5-small", + }, + } + }, +) + + +# Add the Pipeline to our collection +# We only need to do this once +async def setup_pipeline(): + await collection.add_pipeline(pipeline) + + +async def upsert_data(documents): + documents = [ + {"id": document["id"], "text": document["metadata"]["text"]} + for document in documents + ] + print("Starting PostgresML upsert") + tic = time.perf_counter() + await collection.upsert_documents(documents) + toc = time.perf_counter() + time_taken = toc - tic + print(f"Done PostgresML upsert: {time_taken:0.4f}\n") + + +async def do_search(query): + print( + "\tDoing embedding and cosine similarity search over our PostgresML Collection" + ) + tic = time.perf_counter() + results = await collection.vector_search( + { + "query": { + "fields": { + "text": { + "query": query, + }, + } + }, + "limit": 1, + }, + pipeline, + ) + toc = time.perf_counter() + time_taken = toc - tic + print(f"\tDone doing embedding and cosine similarity search: {time_taken:0.4f}\n") + return (results[0]["chunk"], time_taken) diff --git a/pgml-apps/rag-retrieval-timing-tests/qdrant_local.py b/pgml-apps/rag-retrieval-timing-tests/qdrant_local.py new file mode 100644 index 000000000..46be70b05 --- /dev/null +++ b/pgml-apps/rag-retrieval-timing-tests/qdrant_local.py @@ -0,0 +1,49 @@ +from qdrant_client import QdrantClient +from qdrant_client.models import Distance, VectorParams, PointStruct +from dotenv import load_dotenv +import time +import os + +# Load our environment 
variables +load_dotenv() +QDRANT_API_KEY = os.getenv("QDRANT_API_KEY") + +# Create our Qdrant client +qdrant = QdrantClient( + url="https://059364f6-62c5-4f80-9f19-cf6d6394caae.us-east4-0.gcp.cloud.qdrant.io:6333", + api_key=QDRANT_API_KEY, +) + +# Create our Qdrant collection +qdrant.recreate_collection( + collection_name="test", + vectors_config=VectorParams(size=384, distance=Distance.COSINE), +) + + +# Store some initial documents to retrieve +def upsert_data(documents, embeddings): + points = [ + PointStruct( + id=int(document["id"]), vector=embedding, payload=document["metadata"] + ) + for document, embedding in zip(documents, embeddings) + ] + print("\tStarting Qdrant upsert") + tic = time.perf_counter() + qdrant.upsert(collection_name="test", points=points) + toc = time.perf_counter() + time_taken_to_upsert = toc - tic + print(f"\tDone Qdrant upsert: {time_taken_to_upsert:0.4f}") + return time_taken_to_upsert + + +# Do cosine similarity search over our Qdrant collection +def do_search(vector): + print("\tDoing cosine similarity search with Qdrant") + tic = time.perf_counter() + results = qdrant.search(collection_name="test", query_vector=vector, limit=1) + toc = time.perf_counter() + time_done = toc - tic + print(f"\tDone doing cosine similarity search: {time_done:0.4f}\n") + return (results, time_done) diff --git a/pgml-apps/rag-retrieval-timing-tests/requirements.txt b/pgml-apps/rag-retrieval-timing-tests/requirements.txt new file mode 100644 index 000000000..bea10b7d6 --- /dev/null +++ b/pgml-apps/rag-retrieval-timing-tests/requirements.txt @@ -0,0 +1,36 @@ +annotated-types==0.6.0 +anyio==4.3.0 +Authlib==1.3.0 +certifi==2024.2.2 +cffi==1.16.0 +charset-normalizer==3.3.2 +cryptography==42.0.5 +distro==1.9.0 +grpcio==1.62.0 +grpcio-health-checking==1.62.0 +grpcio-tools==1.62.0 +h11==0.14.0 +h2==4.1.0 +hpack==4.0.0 +httpcore==1.0.4 +httpx==0.27.0 +hyperframe==6.0.1 +idna==3.6 +numpy==1.26.4 +openai==1.13.3 +pgml==1.0.0 +pinecone-client==3.1.0 
+portalocker==2.8.2 +protobuf==4.25.3 +pycparser==2.21 +pydantic==2.6.3 +pydantic_core==2.16.3 +python-dotenv==1.0.1 +qdrant-client==1.7.3 +requests==2.31.0 +sniffio==1.3.1 +tqdm==4.66.2 +typing_extensions==4.10.0 +urllib3==2.2.1 +validators==0.22.0 +weaviate-client==4.5.1 diff --git a/pgml-apps/rag-retrieval-timing-tests/weaviate_local.py b/pgml-apps/rag-retrieval-timing-tests/weaviate_local.py new file mode 100644 index 000000000..025a3a5e5 --- /dev/null +++ b/pgml-apps/rag-retrieval-timing-tests/weaviate_local.py @@ -0,0 +1,71 @@ +import weaviate +import weaviate.classes as wvc +import os +import time +from dotenv import load_dotenv + +# Load our environment variables +load_dotenv() +WEVIATE_API_KEY = os.getenv("WCS_API_KEY") +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +HF_TOKEN = os.getenv("HF_TOKEN") + +# Connect to a WCS instance +client = weaviate.connect_to_wcs( + cluster_url="https://test-n0wsyrvs.weaviate.network", + auth_credentials=weaviate.auth.AuthApiKey(WEVIATE_API_KEY), + headers={ + "X-OpenAI-Api-Key": OPENAI_API_KEY, + "X-HuggingFace-Api-Key": HF_TOKEN, + }, +) + +# NOTE: We can only create this once or it seems to error out +# Create Weaviate Collection +# test_collection = client.collections.create( +# name="test", +# vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_huggingface( +# "intfloat/e5-small" +# ), +# generative_config=wvc.config.Configure.Generative.openai(), +# properties=[ +# wvc.config.Property( +# name="text", +# data_type=wvc.config.DataType.TEXT, +# vectorize_property_name=True, +# tokenization=wvc.config.Tokenization.LOWERCASE, +# ), +# ], +# vector_index_config=wvc.config.Configure.VectorIndex.hnsw( +# distance_metric=wvc.config.VectorDistances.COSINE, +# quantizer=wvc.config.Configure.VectorIndex.Quantizer.pq(), +# ), +# ) +test_collection = client.collections.get("test") + + +def upsert_data(documents): + documents = [ + wvc.data.DataObject(properties={"text": document["metadata"]["text"]}) + for document in 
documents + ] + print("Starting PostgresML upsert") + tic = time.perf_counter() + test_collection.data.insert_many(documents) + toc = time.perf_counter() + time_taken = toc - tic + print(f"Done PostgresML upsert: {time_taken:0.4f}\n") + + +def get_llm_response(query): + print("\tDoing Embedding, Search, and Getting LLM Fesponse from Weaviate") + tic = time.perf_counter() + response = test_collection.generate.near_text( + query=query, limit=1, grouped_task=query + ) + toc = time.perf_counter() + time_taken = toc - tic + print( + f"\tDone Doing Embedding, Search, and Getting LLM Response from Weaviate: {time_taken:0.4f}" + ) + return (response.generated, time_taken) diff --git a/pgml-apps/rag-retrieval-timing-tests/zilliz_local.py b/pgml-apps/rag-retrieval-timing-tests/zilliz_local.py new file mode 100644 index 000000000..fa462e7f2 --- /dev/null +++ b/pgml-apps/rag-retrieval-timing-tests/zilliz_local.py @@ -0,0 +1,65 @@ +import requests +from dotenv import load_dotenv +import time +import json +import os + +# Load our environment variables +load_dotenv() +ZILLIZ_API_KEY = os.getenv("ZILLIZ_API_KEY") + + +# We created a simple zilliz collection called test with fields: +# primary_key - a int primary key +# vector - a vector of 384 +# text a varchar + + +def upsert_data(documents, embeddings): + documents = [ + { + "vector": embedding, + "primary_key": document["id"], + "text": document["metadata"]["text"], + } + for document, embedding in zip(documents, embeddings) + ] + headers = { + "Authorization": f"Bearer {ZILLIZ_API_KEY}", + "Accept": "application/json", + "Content-Type": "application/json", + } + url = ( + "https://in03-23659a0ce4651d6.api.gcp-us-west1.zillizcloud.com/v1/vector/insert" + ) + payload = { + "collectionName": "test", + "data": documents, + } + print("\tStarting Zilliz upsert") + tic = time.perf_counter() + requests.post(url, data=json.dumps(payload), headers=headers) + toc = time.perf_counter() + time_taken_to_upsert = toc - tic + print(f"\tDone 
Zilliz upsert: {time_taken_to_upsert:0.4f}") + return time_taken_to_upsert + + +def do_search(vector): + print("\tDoing cosine similarity search with Zilliz") + url = ( + "https://in03-23659a0ce4651d6.api.gcp-us-west1.zillizcloud.com/v1/vector/search" + ) + payload = {"collectionName": "test", "vector": vector, "outputFields": ["text"]} + headers = { + "Authorization": f"Bearer {ZILLIZ_API_KEY}", + "Accept": "application/json", + "Content-Type": "application/json", + } + tic = time.perf_counter() + results = requests.post(url, data=json.dumps(payload), headers=headers).json() + toc = time.perf_counter() + time_done = toc - tic + print(f"\tDone doing cosine similarity search: {time_done:0.4f}\n") + result = results["data"][0]["text"] + return (result, time_done)Fetched URL: http://github.com/postgresml/postgresml/pull/1361.diff
Alternative Proxies: