Commit e6e62cc

committed: fix up active nav

1 parent 6510734 · commit e6e62cc

File tree

28 files changed: +776 -43 lines changed

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
import random

# Generate 10,000 random 128-dimensional vectors in pure Python.
embeddings = [[random.random() for _ in range(128)] for _ in range(10_000)]
print(embeddings)
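For a rough sense of cost, the same generation can be wrapped in a timer (a minimal sketch using only the standard library; variable names are illustrative) so the pure-Python run can be compared against the numpy and SQL variants that follow:

import random
from time import time

start = time()
embeddings = [[random.random() for _ in range(128)] for _ in range(10_000)]
print("Generated %d vectors in %0.3f seconds" % (len(embeddings), time() - start))
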
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
-- SELECT ARRAY_AGG(random()) AS vector
-- FROM generate_series(1, 1280000) i
-- GROUP BY i % 10000;

-- Build 10,000 arrays of 128 random values each; the outer SELECT ... LIMIT 0
-- discards the rows so only the generation work itself is measured.
SELECT 1 FROM (
    SELECT ARRAY_AGG(random()) AS vector
    FROM generate_series(1, 1280000) i
    GROUP BY i % 10000
) f LIMIT 0;

-- CREATE TABLE embeddings AS
-- SELECT ARRAY_AGG(random()) AS vector
-- FROM generate_series(1, 1280000) i
-- GROUP BY i % 10000;

-- COPY embeddings TO '/tmp/embeddings.csv' DELIMITER ',' CSV HEADER;
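To time this file end to end, it can be run through psql with client-side timing turned on (a sketch; the file name generate_embeddings.sql is a placeholder for whatever name this script is saved under):

\timing on
\i generate_embeddings.sql
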
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
import sys
import numpy

# Print full arrays instead of numpy's truncated summary.
numpy.set_printoptions(threshold=sys.maxsize)

# Generate 10,000 random 128-dimensional vectors with numpy.
embeddings = numpy.random.rand(10_000, 128)
print(embeddings)
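If the goal is to compare the two generators directly, timeit gives a quick head-to-head (a sketch using only the standard library and numpy; single-run timings, so expect some noise):

import random
import timeit

import numpy

py_time = timeit.timeit(
    lambda: [[random.random() for _ in range(128)] for _ in range(10_000)],
    number=1,
)
np_time = timeit.timeit(lambda: numpy.random.rand(10_000, 128), number=1)
print("pure Python: %0.3fs, numpy: %0.3fs" % (py_time, np_time))
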
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
pgml.sql
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
import os
import requests
from time import time
from rich import print
from datasets import Dataset, load_dataset
from tqdm.auto import tqdm
from dotenv import load_dotenv

load_dotenv(".env")

api_org = os.environ["HF_API_KEY"]
endpoint = os.environ["HF_ENDPOINT"]
# add the API token to the request headers
headers = {
    'Authorization': f'Bearer {api_org}'
}

# squad = load_dataset("squad", split='train')
squad = Dataset.from_file("squad-train.arrow")
data = squad.to_pandas()
data = data.drop_duplicates(subset=["context"])
passages = list(data['context'])

total_documents = 10000
batch_size = 1
passages = passages[:total_documents]

start = time()
for i in tqdm(range(0, len(passages), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(passages))
    # extract batch
    batch = passages[i:i_end]
    # generate embeddings for the batch via the HF endpoint
    res = requests.post(
        endpoint,
        headers=headers,
        json={"inputs": batch}
    )

print("Time taken for HF for %d documents = %0.3f" % (len(passages), time() - start))
Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
import os
import requests
from time import time
from rich import print
import pinecone
from tqdm.auto import tqdm
from datasets import Dataset, load_dataset
from dotenv import load_dotenv

# load credentials from .env, as in the other scripts in this benchmark
load_dotenv(".env")

api_org = os.environ["HF_API_KEY"]
endpoint = os.environ["HF_ENDPOINT"]
# add the API token to the request headers
headers = {
    'Authorization': f'Bearer {api_org}'
}

# squad = load_dataset("squad", split='train')
squad = Dataset.from_file("squad-train.arrow")
data = squad.to_pandas()
data = data.drop_duplicates(subset=["context"])
passages = list(data['context'])

total_documents = 10000
batch_size = 64
passages = passages[:total_documents]

# connect to pinecone environment
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"]
)

index_name = 'hf-endpoints'
# embedding dimension of the index; must match the model served by the
# HF endpoint (assumed here to be 1024, e.g. intfloat/e5-large)
dim = 1024

# check if the hf-endpoints index exists
if index_name not in pinecone.list_indexes():
    # create the index if it does not exist
    pinecone.create_index(
        index_name,
        dimension=dim,
        metric="cosine"
    )

# connect to the hf-endpoints index we created
index = pinecone.Index(index_name)

start = time()
# we will use batches of 64
for i in tqdm(range(0, len(passages), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(passages))
    # extract batch
    batch = passages[i:i_end]
    # generate embeddings for the batch via the HF endpoint
    res = requests.post(
        endpoint,
        headers=headers,
        json={"inputs": batch}
    )
    emb = res.json()['embeddings']
    # get metadata (just the original text)
    meta = [{'text': text} for text in batch]
    # create IDs
    ids = [str(x) for x in range(i, i_end)]
    # add all to upsert list
    to_upsert = list(zip(ids, emb, meta))
    # upsert/insert these records to pinecone
    _ = index.upsert(vectors=to_upsert)

print("Time taken for HF for %d documents = %0.3f" % (len(passages), time() - start))
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
import os
import requests
from time import time
from rich import print
import pinecone
from tqdm.auto import tqdm
from datasets import Dataset
from dotenv import load_dotenv
from statistics import mean

load_dotenv(".env")
api_org = os.environ["HF_API_KEY"]
endpoint = os.environ["HF_ENDPOINT"]
# add the API token to the request headers
headers = {
    'Authorization': f'Bearer {api_org}'
}

# squad = load_dataset("squad", split='train')
squad = Dataset.from_file("squad-train.arrow")
data = squad.to_pandas()
data = data.drop_duplicates(subset=["context"])
passages = list(data['context'])

# connect to pinecone environment
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"]
)

index_name = 'hf-endpoints'
# embedding dimension of the index; must match the model served by the
# HF endpoint (assumed here to be 1024, e.g. intfloat/e5-large)
dim = 1024

# check if the hf-endpoints index exists
if index_name not in pinecone.list_indexes():
    # create the index if it does not exist
    pinecone.create_index(
        index_name,
        dimension=dim,
        metric="cosine"
    )

# connect to the hf-endpoints index we created
index = pinecone.Index(index_name)

run_times = []
for query in data["context"][0:100]:
    start = time()
    # encode the query with the HF endpoint
    res = requests.post(endpoint, headers=headers, json={"inputs": query})
    xq = res.json()['embeddings']
    # query and return top 5 matches
    xc = index.query(xq, top_k=5, include_metadata=True)
    _end = time()
    run_times.append(_end - start)

print("HF + Pinecone Average query time: %0.3f" % (mean(run_times)))
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
from pgml import Database
import os
from datasets import load_dataset
from time import time
from dotenv import load_dotenv
from rich import print
import asyncio
from tqdm.auto import tqdm

async def main():
    load_dotenv()
    conninfo = os.environ.get("DATABASE_URL")
    db = Database(conninfo)

    collection_name = "squad_collection_benchmark"
    collection = await db.create_or_get_collection(collection_name)
    model_id = await collection.register_model(model_name="intfloat/e5-large")
    await collection.generate_embeddings(model_id=model_id)

if __name__ == "__main__":
    asyncio.run(main())
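Like the other pgml scripts here, this one expects DATABASE_URL in a local .env file read by load_dotenv(); a placeholder entry (the connection string is illustrative) would be:

DATABASE_URL=postgres://user:password@localhost:5432/pgml_database
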
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
DO $$
DECLARE
    curr_id integer := 0;
    batch_size integer := 2;
    total_records integer := 10000;
    curr_val text[];      -- use "text[]" instead of "varchar[]"
    embed_result json;    -- result of the pgml.embed function (unused here)
BEGIN
    LOOP
        -- BEGIN RAISE NOTICE 'updating % to %', curr_id, curr_id + batch_size; END;
        SELECT ARRAY(SELECT chunk::text
                     FROM squad_collection_benchmark.chunks
                     WHERE id BETWEEN curr_id + 1 AND curr_id + batch_size)
        INTO curr_val;

        -- embed the batch of chunks; PERFORM discards the result
        PERFORM embed FROM pgml.embed('intfloat/e5-large', curr_val);

        curr_id := curr_id + batch_size;
        EXIT WHEN curr_id >= total_records;
    END LOOP;

    -- embed any remaining chunks after the last full batch
    SELECT ARRAY(SELECT chunk::text
                 FROM squad_collection_benchmark.chunks
                 WHERE id BETWEEN curr_id - batch_size AND total_records)
    INTO curr_val;

    PERFORM embed FROM pgml.embed('intfloat/e5-large', curr_val);
END;
$$;
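For a per-call baseline outside the batching loop, pgml.embed can also be invoked on a single piece of text from psql with timing enabled (a sketch; assumes the pgml extension is installed and uses the same model name as above):

\timing on
SELECT pgml.embed('intfloat/e5-large', 'What is the capital of France?');
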
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
from pgml import Database
import os
from datasets import load_dataset
from time import time
from dotenv import load_dotenv
from rich import print
import asyncio
from tqdm.auto import tqdm

async def main():
    load_dotenv()
    conninfo = os.environ.get("DATABASE_URL")
    db = Database(conninfo)

    collection_name = "squad_collection_benchmark"
    collection = await db.create_or_get_collection(collection_name)

    data = load_dataset("squad", split="train")
    data = data.to_pandas()
    data = data.drop_duplicates(subset=["context"])

    documents = [
        {"id": r["id"], "text": r["context"], "title": r["title"]}
        for r in data.to_dict(orient="records")
    ]

    print("Ingesting and chunking documents ..")
    total_documents = 10000
    batch_size = 64
    embedding_times = []
    total_time = 0
    documents = documents[:total_documents]
    for i in tqdm(range(0, len(documents), batch_size)):
        i_end = min(i + batch_size, len(documents))
        batch = documents[i:i_end]
        await collection.upsert_documents(batch)
    await collection.generate_chunks()
    print("Ingesting and chunking completed")

if __name__ == "__main__":
    asyncio.run(main())
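The unused time import and total_time variable suggest a wall-clock number was intended here; one way to surface it (a sketch, replacing the batch loop inside main() above) is:

    start = time()
    for i in tqdm(range(0, len(documents), batch_size)):
        i_end = min(i + batch_size, len(documents))
        batch = documents[i:i_end]
        await collection.upsert_documents(batch)
    await collection.generate_chunks()
    print("Time taken for PGML ingest and chunking = %0.3f" % (time() - start))
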

0 commit comments
