From f99e79022b51c73f7bfaf6da8dd804732e1acaf7 Mon Sep 17 00:00:00 2001 From: thomaskluiters Date: Mon, 12 Jun 2023 10:34:03 +0200 Subject: [PATCH] Minor improvements in blog posts Rename 'similiarity' to 'similarity' Prefix e5 embedding requests with either 'query: ' or 'passage: ' --- ...s-with-open-source-models-in-postgresml.md | 12 ++++--- ...h-results-with-huggingface-and-pgvector.md | 28 ++++++++-------- ...rating-query-embeddings-in-the-database.md | 32 +++++++++---------- 3 files changed, 37 insertions(+), 35 deletions(-) diff --git a/pgml-dashboard/content/blog/generating-llm-embeddings-with-open-source-models-in-postgresml.md b/pgml-dashboard/content/blog/generating-llm-embeddings-with-open-source-models-in-postgresml.md index 55d017f85..a0b544519 100644 --- a/pgml-dashboard/content/blog/generating-llm-embeddings-with-open-source-models-in-postgresml.md +++ b/pgml-dashboard/content/blog/generating-llm-embeddings-with-open-source-models-in-postgresml.md @@ -127,8 +127,10 @@ Since our corpus of documents (movie reviews) are all relatively short and simil It takes a couple of minutes to download and cache the `intfloat/e5-small` model to generate the first embedding. After that, it's pretty fast. +Note how we prefix the text we want to embed with either `passage: ` or `query: `. The e5 model requires us to prefix our data with `passage: ` when we're generating embeddings for our corpus, and with `query: ` when we're embedding a search request to find semantically similar content. + ```postgresql -SELECT pgml.embed('intfloat/e5-small', 'hi mom'); +SELECT pgml.embed('intfloat/e5-small', 'passage: hi mom'); ``` This is a pretty powerful function, because we can pass any arbitrary text to any open source model, and it will generate an embedding for us. 
We can benchmark how long it takes to generate an embedding for a single review, using client-side timings in Postgres: @@ -147,7 +149,7 @@ Aside from using this function with strings passed from a client, we can use it ```postgresql SELECT review_body, - pgml.embed('intfloat/e5-small', review_body) + pgml.embed('intfloat/e5-small', 'passage: ' || review_body) FROM pgml.amazon_us_reviews LIMIT 1; ``` @@ -171,7 +173,7 @@ Time to generate an embedding increases with the length of the input text, and v ```postgresql SELECT review_body, - pgml.embed('intfloat/e5-small', review_body) AS embedding + pgml.embed('intfloat/e5-small', 'passage: ' || review_body) AS embedding FROM pgml.amazon_us_reviews LIMIT 1000; ``` @@ -191,7 +193,7 @@ SELECT reviqew_body, pgml.embed( 'intfloat/e5-small', - review_body, + 'passage: ' || review_body, '{"device": "cpu"}' ) AS embedding FROM pgml.amazon_us_reviews @@ -328,7 +330,7 @@ BEGIN UPDATE pgml.amazon_us_reviews SET review_embedding_e5_large = pgml.embed( 'intfloat/e5-large', - review_body + 'passage: ' || review_body ) WHERE id BETWEEN i AND i + 10 AND review_embedding_e5_large IS NULL; diff --git a/pgml-dashboard/content/blog/personalize-embedding-vector-search-results-with-huggingface-and-pgvector.md b/pgml-dashboard/content/blog/personalize-embedding-vector-search-results-with-huggingface-and-pgvector.md index 11678cd76..fa3f0ac9d 100644 --- a/pgml-dashboard/content/blog/personalize-embedding-vector-search-results-with-huggingface-and-pgvector.md +++ b/pgml-dashboard/content/blog/personalize-embedding-vector-search-results-with-huggingface-and-pgvector.md @@ -137,7 +137,7 @@ We can find a customer that our embeddings model feels is close to the sentiment WITH request AS ( SELECT pgml.embed( 'intfloat/e5-large', - 'I love all Star Wars, but Empire Strikes Back is particularly amazing' + 'query: I love all Star Wars, but Empire Strikes Back is particularly amazing' )::vector(1024) AS embedding ) @@ -147,9 +147,9 @@ SELECT 
star_rating_avg, 1 - ( movie_embedding_e5_large <=> (SELECT embedding FROM request) - ) AS cosine_similiarity + ) AS cosine_similarity FROM customers -ORDER BY cosine_similiarity DESC +ORDER BY cosine_similarity DESC LIMIT 1; ``` @@ -157,7 +157,7 @@ LIMIT 1; !!! results -| id | total_reviews | star_rating_avg | cosine_similiarity | +| id | total_reviews | star_rating_avg | cosine_similarity | |----------|---------------|--------------------|--------------------| | 44366773 | 1 | 2.0000000000000000 | 0.8831349398621555 | @@ -215,7 +215,7 @@ Now we can write our personalized SQL query. It's nearly the same as our query f WITH request AS ( SELECT pgml.embed( 'intfloat/e5-large', - 'Best 1980''s scifi movie' + 'query: Best 1980''s scifi movie' )::vector(1024) AS embedding ), @@ -226,7 +226,7 @@ customer AS ( WHERE id = '44366773' ), --- vector similarity search for movies and calculate a customer_cosine_similiarity at the same time +-- vector similarity search for movies and calculate a customer_cosine_similarity at the same time first_pass AS ( SELECT title, @@ -234,10 +234,10 @@ first_pass AS ( star_rating_avg, 1 - ( review_embedding_e5_large <=> (SELECT embedding FROM request) - ) AS request_cosine_similiarity, + ) AS request_cosine_similarity, (1 - ( review_embedding_e5_large <=> (SELECT embedding FROM customer) - ) - 0.9) * 10 AS customer_cosine_similiarity, + ) - 0.9) * 10 AS customer_cosine_similarity, star_rating_avg / 5 AS star_rating_score FROM movies WHERE total_reviews > 10 @@ -251,9 +251,9 @@ SELECT total_reviews, round(star_rating_avg, 2) as star_rating_avg, star_rating_score, - request_cosine_similiarity, - customer_cosine_similiarity, - request_cosine_similiarity + customer_cosine_similiarity + star_rating_score AS final_score + request_cosine_similarity, + customer_cosine_similarity, + request_cosine_similarity + customer_cosine_similarity + star_rating_score AS final_score FROM first_pass ORDER BY final_score DESC LIMIT 10; @@ -263,7 +263,7 @@ LIMIT 
10; !!! results -| title | total_reviews | star_rating_avg | star_rating_score | request_cosine_similiarity | customer_cosine_similiarity | final_score | +| title | total_reviews | star_rating_avg | star_rating_score | request_cosine_similarity | customer_cosine_similarity | final_score | |----------------------------------------------------------------------|---------------|-----------------|------------------------|----------------------------|-----------------------------|--------------------| | Star Wars, Episode V: The Empire Strikes Back (Widescreen Edition) | 78 | 4.44 | 0.88717948717948718000 | 0.8295302273865711 | 0.9999999999999998 | 2.716709714566058 | | Star Wars, Episode IV: A New Hope (Widescreen Edition) | 80 | 4.36 | 0.87250000000000000000 | 0.8339361274771777 | 0.9336656923446551 | 2.640101819821833 | @@ -280,7 +280,7 @@ LIMIT 10; !!! -Bingo. Now we're boosting movies by `(customer_cosine_similiarity - 0.9) * 10`, and we've kept our previous boost for movies with a high average star rating. Not only does Episode V top the list as expected, Episode IV is a close second. This query has gotten fairly complex! But the results are perfect for me, I mean our hypothetical customer who is searching for "Best 1980's scifi movie" but has already revealed to us with their one movie review that they think like the comment "I love all Star Wars, but Empire Strikes Back is particularly amazing". I promise I'm not just doing all of this to find a new movie to watch tonight. +Bingo. Now we're boosting movies by `(customer_cosine_similarity - 0.9) * 10`, and we've kept our previous boost for movies with a high average star rating. Not only does Episode V top the list as expected, Episode IV is a close second. This query has gotten fairly complex! 
But the results are perfect for me, I mean our hypothetical customer who is searching for "Best 1980's scifi movie" but has already revealed to us with their one movie review that they think like the comment "I love all Star Wars, but Empire Strikes Back is particularly amazing". I promise I'm not just doing all of this to find a new movie to watch tonight. You can compare this to our non-personalized results from the previous article for reference Forbidden Planet used to be the top result, but now it's #3. @@ -288,7 +288,7 @@ You can compare this to our non-personalized results from the previous article f !!! results -| title | total_reviews | star_rating_avg | final_score | star_rating_score | cosine_similiarity | +| title | total_reviews | star_rating_avg | final_score | star_rating_score | cosine_similarity | |:-----------------------------------------------------|--------------:|----------------:|-------------------:|-----------------------:|-------------------:| | Forbidden Planet (Two-Disc 50th Anniversary Edition) | 255 | 4.82 | 1.8216832158805154 | 0.96392156862745098000 | 0.8577616472530644 | | Back to the Future | 31 | 4.94 | 1.82090702765472 | 0.98709677419354838000 | 0.8338102534611714 | diff --git a/pgml-dashboard/content/blog/tuning-vector-recall-while-generating-query-embeddings-in-the-database.md b/pgml-dashboard/content/blog/tuning-vector-recall-while-generating-query-embeddings-in-the-database.md index fd4a38e4b..f70054f8f 100644 --- a/pgml-dashboard/content/blog/tuning-vector-recall-while-generating-query-embeddings-in-the-database.md +++ b/pgml-dashboard/content/blog/tuning-vector-recall-while-generating-query-embeddings-in-the-database.md @@ -129,7 +129,7 @@ We'll start with semantic search. Given a user query, e.g. 
"Best 1980's scifi mo WITH request AS ( SELECT pgml.embed( 'intfloat/e5-large', - 'Best 1980''s scifi movie' + 'query: Best 1980''s scifi movie' )::vector(1024) AS embedding ) @@ -142,9 +142,9 @@ SELECT review_embedding_e5_large <=> ( SELECT embedding FROM request ) - ) AS cosine_similiarity + ) AS cosine_similarity FROM pgml.amazon_us_reviews -ORDER BY review_embedding_e5_large <=> (SELECT embedding FROM request) +ORDER BY cosine_similarity DESC LIMIT 5; ``` @@ -152,7 +152,7 @@ LIMIT 5; !!! results -| review_body | product_title | star_rating | total_votes | cosine_similiarity | +| review_body | product_title | star_rating | total_votes | cosine_similarity | |-----------------------------------------------------|---------------------------------------------------------------|-------------|-------------|--------------------| | best 80s SciFi movie ever | The Adventures of Buckaroo Banzai Across the Eighth Dimension | 5 | 1 | 0.956207707312679 | | One of the best 80's sci-fi movies, beyond a doubt! | Close Encounters of the Third Kind [Blu-ray] | 5 | 1 | 0.9298004258989776 | @@ -270,7 +270,7 @@ SELECT title, 1 - ( review_embedding_e5_large <=> (SELECT embedding FROM request) - ) AS cosine_similiarity + ) AS cosine_similarity FROM movies ORDER BY review_embedding_e5_large <=> (SELECT embedding FROM request) LIMIT 10; @@ -280,7 +280,7 @@ LIMIT 10; !!! results -| title | cosine_similiarity | +| title | cosine_similarity | |--------------------------------------------------------------------|--------------------| | THX 1138 (The George Lucas Director's Cut Special Edition/ 2-Disc) | 0.8652007733744973 | | 2010: The Year We Make Contact | 0.8621574666546908 | @@ -328,7 +328,7 @@ SELECT title, 1 - ( review_embedding_e5_large <=> (SELECT embedding FROM request) - ) AS cosine_similiarity + ) AS cosine_similarity FROM movies ORDER BY review_embedding_e5_large <=> (SELECT embedding FROM request) LIMIT 10; @@ -338,7 +338,7 @@ LIMIT 10; !!! 
results -| title | cosine_similiarity | +| title | cosine_similarity | |--------------------------------------------------------------------|--------------------| | THX 1138 (The George Lucas Director's Cut Special Edition/ 2-Disc) | 0.8652007733744973 | | Big Trouble in Little China [UMD for PSP] | 0.8649691870870362 | @@ -411,7 +411,7 @@ SET ivfflat.probes = 1; WITH request AS ( SELECT pgml.embed( 'intfloat/e5-large', - 'Best 1980''s scifi movie' + 'query: Best 1980''s scifi movie' )::vector(1024) AS embedding ) @@ -420,7 +420,7 @@ SELECT total_reviews, 1 - ( review_embedding_e5_large <=> (SELECT embedding FROM request) - ) AS cosine_similiarity + ) AS cosine_similarity FROM movies WHERE total_reviews > 10 ORDER BY review_embedding_e5_large <=> (SELECT embedding FROM request) @@ -431,7 +431,7 @@ LIMIT 10; !!! results -| title | total_reviews | cosine_similiarity | +| title | total_reviews | cosine_similarity | |------------------------------------------------------|---------------|--------------------| | 2010: The Year We Make Contact | 29 | 0.8621574666546908 | | Forbidden Planet | 202 | 0.861032948199611 | @@ -467,7 +467,7 @@ SQL is a very expressive language that can handle a lot of complexity. To keep t WITH request AS ( SELECT pgml.embed( 'intfloat/e5-large', - 'Best 1980''s scifi movie' + 'query: Best 1980''s scifi movie' )::vector(1024) AS embedding ), @@ -479,7 +479,7 @@ first_pass AS ( star_rating_avg, 1 - ( review_embedding_e5_large <=> (SELECT embedding FROM request) - ) AS cosine_similiarity, + ) AS cosine_similarity, star_rating_avg / 5 AS star_rating_score FROM movies WHERE total_reviews > 10 @@ -493,8 +493,8 @@ SELECT total_reviews, round(star_rating_avg, 2) as star_rating_avg, star_rating_score, - cosine_similiarity, - cosine_similiarity + star_rating_score AS final_score + cosine_similarity, + cosine_similarity + star_rating_score AS final_score FROM first_pass ORDER BY final_score DESC LIMIT 10; @@ -504,7 +504,7 @@ LIMIT 10; !!! 
results -| title | total_reviews | star_rating_avg | final_score | star_rating_score | cosine_similiarity | +| title | total_reviews | star_rating_avg | final_score | star_rating_score | cosine_similarity | |:-----------------------------------------------------|--------------:|----------------:|-------------------:|-----------------------:|-------------------:| | Forbidden Planet (Two-Disc 50th Anniversary Edition) | 255 | 4.82 | 1.8216832158805154 | 0.96392156862745098000 | 0.8577616472530644 | | Back to the Future | 31 | 4.94 | 1.82090702765472 | 0.98709677419354838000 | 0.8338102534611714 | pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy