postgresml · montanalow · Jun 12, 2023 · Jun 12, 2023
diff --git a/...content/blog/generating-llm-embeddings-with-open-source-models-in-postgresml.md b/...content/blog/generating-llm-embeddings-with-open-source-models-in-postgresml.md
@@ -127,8 +127,10 @@ Since our corpus of documents (movie reviews) are all relatively short and simil
 
 It takes a couple of minutes to download and cache the `intfloat/e5-small` model to generate the first embedding. After that, it's pretty fast.
 
+Note how we prefix the text we want to embed with either `passage: ` or `query: `. The e5 model requires us to prefix our data with `passage: ` if we're generating embeddings for our corpus and `query: ` if we want to find semantically similar content.
+
 ```postgresql
-SELECT pgml.embed('intfloat/e5-small', 'hi mom');
+SELECT pgml.embed('intfloat/e5-small', 'passage: hi mom');
 ```
 
 This is a pretty powerful function, because we can pass any arbitrary text to any open source model, and it will generate an embedding for us. We can benchmark how long it takes to generate an embedding for a single review, using client-side timings in Postgres:
@@ -147,7 +149,7 @@ Aside from using this function with strings passed from a client, we can use it
 ```postgresql
 SELECT
     review_body,
-    pgml.embed('intfloat/e5-small', review_body)
+    pgml.embed('intfloat/e5-small', 'passage: ' || review_body)
 FROM pgml.amazon_us_reviews
 LIMIT 1;
 ```
@@ -171,7 +173,7 @@ Time to generate an embedding increases with the length of the input text, and v
 ```postgresql
 SELECT
     review_body,
-    pgml.embed('intfloat/e5-small', review_body) AS embedding
+    pgml.embed('intfloat/e5-small', 'passage: ' || review_body) AS embedding
 FROM pgml.amazon_us_reviews
 LIMIT 1000;
 ```
@@ -191,7 +193,7 @@ SELECT
     review_body,
     pgml.embed(
         'intfloat/e5-small',
-        review_body,
+        'passage: ' || review_body,
         '{"device": "cpu"}'
     ) AS embedding
 FROM pgml.amazon_us_reviews
@@ -328,7 +330,7 @@ BEGIN
         UPDATE pgml.amazon_us_reviews
         SET review_embedding_e5_large = pgml.embed(
                 'intfloat/e5-large',
-                review_body
+                'passage: ' || review_body
             )
         WHERE id BETWEEN i AND i + 10
             AND review_embedding_e5_large IS NULL;

diff --git a/...og/personalize-embedding-vector-search-results-with-huggingface-and-pgvector.md b/...og/personalize-embedding-vector-search-results-with-huggingface-and-pgvector.md
@@ -137,7 +137,7 @@ We can find a customer that our embeddings model feels is close to the sentiment
 WITH request AS (
   SELECT pgml.embed(
     'intfloat/e5-large',
-    'I love all Star Wars, but Empire Strikes Back is particularly amazing'
+    'query: I love all Star Wars, but Empire Strikes Back is particularly amazing'
   )::vector(1024) AS embedding
 )
 
@@ -147,17 +147,17 @@ SELECT
   star_rating_avg,
   1 - (
     movie_embedding_e5_large <=> (SELECT embedding FROM request)
-  ) AS cosine_similiarity
+  ) AS cosine_similarity
 FROM customers
-ORDER BY cosine_similiarity DESC
+ORDER BY cosine_similarity DESC
 LIMIT 1;
 ```
 
 !!!
 
 !!! results
 
-| id       | total_reviews | star_rating_avg    | cosine_similiarity |
+| id       | total_reviews | star_rating_avg    | cosine_similarity |
 |----------|---------------|--------------------|--------------------|
 | 44366773 | 1             | 2.0000000000000000 | 0.8831349398621555 |
 
@@ -215,7 +215,7 @@ Now we can write our personalized SQL query. It's nearly the same as our query f
 WITH request AS (
   SELECT pgml.embed(
     'intfloat/e5-large',
-    'Best 1980''s scifi movie'
+    'query: Best 1980''s scifi movie'
   )::vector(1024) AS embedding
 ),
 
@@ -226,18 +226,18 @@ customer AS (
   WHERE id = '44366773'
 ),
 
--- vector similarity search for movies and calculate a customer_cosine_similiarity at the same time
+-- vector similarity search for movies and calculate a customer_cosine_similarity at the same time
 first_pass AS (
   SELECT
     title,
     total_reviews,
     star_rating_avg,
     1 - (
       review_embedding_e5_large <=> (SELECT embedding FROM request)
-    ) AS request_cosine_similiarity,
+    ) AS request_cosine_similarity,
     (1 - (
       review_embedding_e5_large <=> (SELECT embedding FROM customer)
-    ) - 0.9) * 10 AS  customer_cosine_similiarity,
+    ) - 0.9) * 10 AS  customer_cosine_similarity,
     star_rating_avg / 5 AS star_rating_score
   FROM movies
   WHERE total_reviews > 10
@@ -251,9 +251,9 @@ SELECT
   total_reviews,
   round(star_rating_avg, 2) as star_rating_avg,
   star_rating_score,
-  request_cosine_similiarity,
-  customer_cosine_similiarity,
-  request_cosine_similiarity + customer_cosine_similiarity + star_rating_score AS final_score
+  request_cosine_similarity,
+  customer_cosine_similarity,
+  request_cosine_similarity + customer_cosine_similarity + star_rating_score AS final_score
 FROM first_pass
 ORDER BY final_score DESC
 LIMIT 10;
@@ -263,7 +263,7 @@ LIMIT 10;
 
 !!! results
 
-| title                                                                | total_reviews | star_rating_avg | star_rating_score      | request_cosine_similiarity | customer_cosine_similiarity | final_score        |
+| title                                                                | total_reviews | star_rating_avg | star_rating_score      | request_cosine_similarity | customer_cosine_similarity | final_score        |
 |----------------------------------------------------------------------|---------------|-----------------|------------------------|----------------------------|-----------------------------|--------------------|
 | Star Wars, Episode V: The Empire Strikes Back (Widescreen Edition)   | 78            | 4.44            | 0.88717948717948718000 | 0.8295302273865711         | 0.9999999999999998          | 2.716709714566058  |
 | Star Wars, Episode IV: A New Hope (Widescreen Edition)               | 80            | 4.36            | 0.87250000000000000000 | 0.8339361274771777         | 0.9336656923446551          | 2.640101819821833  |
@@ -280,15 +280,15 @@ LIMIT 10;
 
 !!!
 
-Bingo. Now we're boosting movies by `(customer_cosine_similiarity - 0.9) * 10`, and we've kept our previous boost for movies with a high average star rating. Not only does Episode V top the list as expected, Episode IV is a close second. This query has gotten fairly complex! But the results are perfect for me, I mean our hypothetical customer who is searching for "Best 1980's scifi movie" but has already revealed to us with their one movie review that they think like the comment "I love all Star Wars, but Empire Strikes Back is particularly amazing". I promise I'm not just doing all of this to find a new movie to watch tonight.
+Bingo. Now we're boosting movies by `(customer_cosine_similarity - 0.9) * 10`, and we've kept our previous boost for movies with a high average star rating. Not only does Episode V top the list as expected, Episode IV is a close second. This query has gotten fairly complex! But the results are perfect for me, I mean our hypothetical customer who is searching for "Best 1980's scifi movie" but has already revealed to us with their one movie review that they think like the comment "I love all Star Wars, but Empire Strikes Back is particularly amazing". I promise I'm not just doing all of this to find a new movie to watch tonight.
 
 You can compare this to our non-personalized results from the previous article for reference. Forbidden Planet used to be the top result, but now it's #3.
 
 !!! code_block time="124.119 ms"
 
 !!! results
 
-| title                                                | total_reviews | star_rating_avg |        final_score |      star_rating_score | cosine_similiarity |
+| title                                                | total_reviews | star_rating_avg |        final_score |      star_rating_score | cosine_similarity |
 |:-----------------------------------------------------|--------------:|----------------:|-------------------:|-----------------------:|-------------------:|
 | Forbidden Planet (Two-Disc 50th Anniversary Edition) |           255 |            4.82 | 1.8216832158805154 | 0.96392156862745098000 | 0.8577616472530644 |
 | Back to the Future                                   |            31 |            4.94 |   1.82090702765472 | 0.98709677419354838000 | 0.8338102534611714 |

diff --git a/.../blog/tuning-vector-recall-while-generating-query-embeddings-in-the-database.md b/.../blog/tuning-vector-recall-while-generating-query-embeddings-in-the-database.md
@@ -129,7 +129,7 @@ We'll start with semantic search. Given a user query, e.g. "Best 1980's scifi mo
 WITH request AS (
   SELECT pgml.embed(
     'intfloat/e5-large',
-    'Best 1980''s scifi movie'
+    'query: Best 1980''s scifi movie'
   )::vector(1024) AS embedding
 )
 
@@ -142,17 +142,17 @@ SELECT
     review_embedding_e5_large <=> (
       SELECT embedding FROM request
     )
-  ) AS cosine_similiarity
+  ) AS cosine_similarity
 FROM pgml.amazon_us_reviews
-ORDER BY review_embedding_e5_large <=> (SELECT embedding FROM request)
+ORDER BY cosine_similarity DESC
 LIMIT 5;
 ```
 
 !!!
 
 !!! results
 
-| review_body                                         | product_title                                                 | star_rating | total_votes | cosine_similiarity |
+| review_body                                         | product_title                                                 | star_rating | total_votes | cosine_similarity |
 |-----------------------------------------------------|---------------------------------------------------------------|-------------|-------------|--------------------|
 | best 80s SciFi movie ever                           | The Adventures of Buckaroo Banzai Across the Eighth Dimension | 5           | 1           | 0.956207707312679  |
 | One of the best 80's sci-fi movies, beyond a doubt! | Close Encounters of the Third Kind [Blu-ray]                  | 5           | 1           | 0.9298004258989776 |
@@ -270,7 +270,7 @@ SELECT
   title,
   1 - (
     review_embedding_e5_large <=> (SELECT embedding FROM request)
-  ) AS cosine_similiarity
+  ) AS cosine_similarity
 FROM movies
 ORDER BY review_embedding_e5_large <=> (SELECT embedding FROM request)
 LIMIT 10;
@@ -280,7 +280,7 @@ LIMIT 10;
 
 !!! results
 
-| title                                                              | cosine_similiarity |
+| title                                                              | cosine_similarity |
 |--------------------------------------------------------------------|--------------------|
 | THX 1138 (The George Lucas Director's Cut Special Edition/ 2-Disc) | 0.8652007733744973 |
 | 2010: The Year We Make Contact                                     | 0.8621574666546908 |
@@ -328,7 +328,7 @@ SELECT
   title,
   1 - (
     review_embedding_e5_large <=> (SELECT embedding FROM request)
-  ) AS cosine_similiarity
+  ) AS cosine_similarity
 FROM movies
 ORDER BY review_embedding_e5_large <=> (SELECT embedding FROM request)
 LIMIT 10;
@@ -338,7 +338,7 @@ LIMIT 10;
 
 !!! results
 
-| title                                                              | cosine_similiarity |
+| title                                                              | cosine_similarity |
 |--------------------------------------------------------------------|--------------------|
 | THX 1138 (The George Lucas Director's Cut Special Edition/ 2-Disc) | 0.8652007733744973 |
 | Big Trouble in Little China [UMD for PSP]                          | 0.8649691870870362 |
@@ -411,7 +411,7 @@ SET ivfflat.probes = 1;
 WITH request AS (
   SELECT pgml.embed(
     'intfloat/e5-large',
-    'Best 1980''s scifi movie'
+    'query: Best 1980''s scifi movie'
   )::vector(1024) AS embedding
 )
 
@@ -420,7 +420,7 @@ SELECT
   total_reviews,
   1 - (
     review_embedding_e5_large <=> (SELECT embedding FROM request)
-  ) AS cosine_similiarity
+  ) AS cosine_similarity
 FROM movies
 WHERE <strong>total_reviews > 10</strong>
 ORDER BY review_embedding_e5_large <=> (SELECT embedding FROM request)
@@ -431,7 +431,7 @@ LIMIT 10;
 
 !!! results
 
-| title                                                | total_reviews | cosine_similiarity |
+| title                                                | total_reviews | cosine_similarity |
 |------------------------------------------------------|---------------|--------------------|
 | 2010: The Year We Make Contact                       | 29            | 0.8621574666546908 |
 | Forbidden Planet                                     | 202           | 0.861032948199611  |
@@ -467,7 +467,7 @@ SQL is a very expressive language that can handle a lot of complexity. To keep t
 WITH request AS (
   SELECT pgml.embed(
     'intfloat/e5-large',
-    'Best 1980''s scifi movie'
+    'query: Best 1980''s scifi movie'
   )::vector(1024) AS embedding
 ),
 
@@ -479,7 +479,7 @@ first_pass AS (
       star_rating_avg,
       1 - (
         review_embedding_e5_large <=> (SELECT embedding FROM request)
-      ) AS cosine_similiarity,
+      ) AS cosine_similarity,
       star_rating_avg / 5 AS star_rating_score
     FROM movies
     WHERE total_reviews > 10
@@ -493,8 +493,8 @@ SELECT
   total_reviews,
   round(star_rating_avg, 2) as star_rating_avg,
   star_rating_score,
-  cosine_similiarity,
-  cosine_similiarity + star_rating_score AS final_score
+  cosine_similarity,
+  cosine_similarity + star_rating_score AS final_score
 FROM first_pass
 ORDER BY final_score DESC
 LIMIT 10;
@@ -504,7 +504,7 @@ LIMIT 10;
 
 !!! results
 
-| title                                                | total_reviews | star_rating_avg |        final_score |      star_rating_score | cosine_similiarity |
+| title                                                | total_reviews | star_rating_avg |        final_score |      star_rating_score | cosine_similarity |
 |:-----------------------------------------------------|--------------:|----------------:|-------------------:|-----------------------:|-------------------:|
 | Forbidden Planet (Two-Disc 50th Anniversary Edition) |           255 |            4.82 | 1.8216832158805154 | 0.96392156862745098000 | 0.8577616472530644 |
 | Back to the Future                                   |            31 |            4.94 |   1.82090702765472 | 0.98709677419354838000 | 0.8338102534611714 |