From 33e8ebf99faede41cd04a07713361e8a7f3fb1d8 Mon Sep 17 00:00:00 2001 From: Montana Low Date: Tue, 31 Jan 2023 18:14:14 -0800 Subject: [PATCH 1/2] namespace the migrations under the pgml schema --- .../20221125201109_notebook.down.sql | 4 +- .../migrations/20221125201109_notebook.up.sql | 6 +- .../20221129170843_notebooks_data.down.sql | 4 +- .../20221129170843_notebooks_data.up.sql | 366 +++++++++--------- .../20221130170423_uploaded_files.down.sql | 2 +- .../20221130170423_uploaded_files.up.sql | 2 +- 6 files changed, 192 insertions(+), 192 deletions(-) diff --git a/pgml-dashboard/migrations/20221125201109_notebook.down.sql b/pgml-dashboard/migrations/20221125201109_notebook.down.sql index 446acd59b..4d636fc32 100644 --- a/pgml-dashboard/migrations/20221125201109_notebook.down.sql +++ b/pgml-dashboard/migrations/20221125201109_notebook.down.sql @@ -1,3 +1,3 @@ -- Add down migration script here -DROP TABLE notebook_cells; -DROP TABLE notebooks; +DROP TABLE pgml.notebook_cells; +DROP TABLE pgml.notebooks; diff --git a/pgml-dashboard/migrations/20221125201109_notebook.up.sql b/pgml-dashboard/migrations/20221125201109_notebook.up.sql index a6d67dfdb..1444dc10a 100644 --- a/pgml-dashboard/migrations/20221125201109_notebook.up.sql +++ b/pgml-dashboard/migrations/20221125201109_notebook.up.sql @@ -1,14 +1,14 @@ -- Add up migration script here -CREATE TABLE notebooks ( +CREATE TABLE pgml.notebooks ( id BIGSERIAL PRIMARY KEY, name VARCHAR NOT NULL, created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT NOW(), updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT NOW() ); -CREATE TABLE notebook_cells ( +CREATE TABLE pgml.notebook_cells ( id BIGSERIAL PRIMARY KEY, - notebook_id BIGINT NOT NULL REFERENCES notebooks(id), + notebook_id BIGINT NOT NULL REFERENCES pgml.notebooks(id), cell_type INT NOT NULL, cell_number INT NOT NULL, version INT NOT NULL, diff --git a/pgml-dashboard/migrations/20221129170843_notebooks_data.down.sql 
b/pgml-dashboard/migrations/20221129170843_notebooks_data.down.sql index 4e157056d..5a373cb47 100644 --- a/pgml-dashboard/migrations/20221129170843_notebooks_data.down.sql +++ b/pgml-dashboard/migrations/20221129170843_notebooks_data.down.sql @@ -1,3 +1,3 @@ -- Add down migration script here -TRUNCATE notebook_cells CASCADE; -TRUNCATE notebooks CASCADE; +TRUNCATE pgml.notebook_cells CASCADE; +TRUNCATE pgml.notebooks CASCADE; diff --git a/pgml-dashboard/migrations/20221129170843_notebooks_data.up.sql b/pgml-dashboard/migrations/20221129170843_notebooks_data.up.sql index d6b9ff32c..948075038 100644 --- a/pgml-dashboard/migrations/20221129170843_notebooks_data.up.sql +++ b/pgml-dashboard/migrations/20221129170843_notebooks_data.up.sql @@ -1,16 +1,16 @@ -INSERT INTO notebooks VALUES (0, 'Tutorial 0: 🎉 Welcome to PostgresML!', '2022-08-19 18:47:08.93719', '2022-08-19 18:47:08.93719'); -INSERT INTO notebooks VALUES (1, 'Tutorial 1: ⏱️ Real Time Fraud Detection', '2022-08-15 15:26:18.428227', '2022-08-15 15:26:18.428241'); -INSERT INTO notebooks VALUES (2, 'Tutorial 2: ⚕️ Tumor Detection w/ Binary Classification', '2022-08-19 16:10:23.120983', '2022-08-19 16:10:23.120996'); -INSERT INTO notebooks VALUES (3, 'Tutorial 3: ✍️ Handwritten Digit Image Classification', '2022-08-20 09:46:40.856497', '2022-08-20 09:46:40.856511'); -INSERT INTO notebooks VALUES (4, 'Tutorial 4: 🍭 Diabetes Progression w/ Regression', '2022-08-19 19:18:14.608456', '2022-08-19 19:18:14.608474'); -INSERT INTO notebooks VALUES (5, 'Tutorial 5: 🤗 Deep Learning w/ Transformers', '2022-08-20 09:47:47.830932', '2022-08-20 09:47:47.830946'); -INSERT INTO notebooks VALUES (6, 'Tutorial 6: ↗️ Working w/ Embeddings', '2022-08-20 09:48:16.252016', '2022-08-20 09:48:16.252029'); -INSERT INTO notebooks VALUES (7, 'Tutorial 7: 📒 Managing Model Deployments', '2022-08-20 09:48:40.044312', '2022-08-20 09:48:40.044325'); -INSERT INTO notebooks VALUES (8, 'Tutorial 8: 💻 Working w/ the Internal Schema of PostgresML', 
'2022-08-20 09:49:41.363292', '2022-08-20 09:49:41.363306'); -INSERT INTO notebooks VALUES (9, 'Tutorial 9: 🏁 Launch PostgresML w/ Your Production Stack', '2022-08-23 19:36:49.286982', '2022-08-23 19:36:49.286998'); - -SELECT pg_catalog.setval('notebooks_id_seq', (SELECT MAX(id) + 1 FROM notebooks), true); +INSERT INTO pgml.notebooks VALUES (0, 'Tutorial 0: 🎉 Welcome to PostgresML!', '2022-08-19 18:47:08.93719', '2022-08-19 18:47:08.93719'); +INSERT INTO pgml.notebooks VALUES (1, 'Tutorial 1: ⏱️ Real Time Fraud Detection', '2022-08-15 15:26:18.428227', '2022-08-15 15:26:18.428241'); +INSERT INTO pgml.notebooks VALUES (2, 'Tutorial 2: ⚕️ Tumor Detection w/ Binary Classification', '2022-08-19 16:10:23.120983', '2022-08-19 16:10:23.120996'); +INSERT INTO pgml.notebooks VALUES (3, 'Tutorial 3: ✍️ Handwritten Digit Image Classification', '2022-08-20 09:46:40.856497', '2022-08-20 09:46:40.856511'); +INSERT INTO pgml.notebooks VALUES (4, 'Tutorial 4: 🍭 Diabetes Progression w/ Regression', '2022-08-19 19:18:14.608456', '2022-08-19 19:18:14.608474'); +INSERT INTO pgml.notebooks VALUES (5, 'Tutorial 5: 🤗 Deep Learning w/ Transformers', '2022-08-20 09:47:47.830932', '2022-08-20 09:47:47.830946'); +INSERT INTO pgml.notebooks VALUES (6, 'Tutorial 6: ↗️ Working w/ Embeddings', '2022-08-20 09:48:16.252016', '2022-08-20 09:48:16.252029'); +INSERT INTO pgml.notebooks VALUES (7, 'Tutorial 7: 📒 Managing Model Deployments', '2022-08-20 09:48:40.044312', '2022-08-20 09:48:40.044325'); +INSERT INTO pgml.notebooks VALUES (8, 'Tutorial 8: 💻 Working w/ the Internal Schema of PostgresML', '2022-08-20 09:49:41.363292', '2022-08-20 09:49:41.363306'); +INSERT INTO pgml.notebooks VALUES (9, 'Tutorial 9: 🏁 Launch PostgresML w/ Your Production Stack', '2022-08-23 19:36:49.286982', '2022-08-23 19:36:49.286998'); + +SELECT pg_catalog.setval('pgml.notebooks_id_seq', (SELECT MAX(id) + 1 FROM pgml.notebooks), true); -- -- PostgreSQL database dump @@ -20,7 +20,7 @@ SELECT 
pg_catalog.setval('notebooks_id_seq', (SELECT MAX(id) + 1 FROM notebooks) -- Data for Name: notebook_cells; Type: TABLE DATA; Schema: Owner: lev -- -INSERT INTO notebook_cells VALUES (1, 0, 1, 1, 1, '## Welcome! +INSERT INTO pgml.notebook_cells VALUES (1, 0, 1, 1, 1, '## Welcome! You''re set up and running on PostgresML! This is an end-to-end system for training and deploying real time machine learning models. It handles data versioning, model training and validation, and safe production release. This dashboard web app will give you an overview of what''s happening in the system and also helps build and deploy projects. You can use notebooks like this one to interact with your database in real time and organize your SQL while documenting your code. @@ -38,8 +38,8 @@ Let me give you an example. The next cell (cell #2) will be a SQL cell which wil

These notebooks are similar to Jupyter Notebooks, which you might be familiar with already. On the bottom of the page, you will find a text editor which is used to create new cells. Each cell can contain either Markdown which is just text really, and SQL which will be executed directly by your Postgres database server.

Each cell has a little menu in the top right corner, allowing you to (re)run it (if it''s SQL), edit it, and delete it.

Let me give you an example. The next cell (cell #2) will be a SQL cell which will execute a simple query. Go ahead and click the next "Play" button now.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (2, 0, 3, 2, 1, 'SELECT random();', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (4, 0, 1, 4, 1, 'We just asked Postgres to return a random number. Pretty simple query, but it demonstrates the notebook functionality pretty well. You can see that the result of `random()` is a float between 0 and 1. On the bottom right corner, you can see that it took `0:00:00.000654` or 0 hours, 0 minutes, 0 seconds and only 654ns, or 0.6ms. This run time is good to keep an eye on. It will help build an intuition for how fast Postgres really is, and how certain operations scale as the data grows. +INSERT INTO pgml.notebook_cells VALUES (2, 0, 3, 2, 1, 'SELECT random();', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (4, 0, 1, 4, 1, 'We just asked Postgres to return a random number. Pretty simple query, but it demonstrates the notebook functionality pretty well. You can see that the result of `random()` is a float between 0 and 1. On the bottom right corner, you can see that it took `0:00:00.000654` or 0 hours, 0 minutes, 0 seconds and only 654ns, or 0.6ms. This run time is good to keep an eye on. It will help build an intuition for how fast Postgres really is, and how certain operations scale as the data grows. Try rerunning the cell again by clicking the "Play" button in the top right corner. You''ll see that the random number will change. Rerunning is a real time operation and Postgres will give you a different random number every time (otherwise it wouldn''t be random). @@ -98,8 +98,8 @@ You may want to check out the rest of [the tutorials](../) or dive straight in w

Thank you

Thank you for trying out PostgresML! We hope you enjoy your time here and have fun learning about machine learning, in the comfort of your favorite database.

You may want to check out the rest of the tutorials or dive straight in with a notebook to test Tutorial 1: ⏱️ Real Time Fraud Detection

', NULL, NULL); -INSERT INTO notebook_cells VALUES (5, 0, 3, 5, 1, 'SELECT ''Have a nice day!'' AS greeting;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (6, 1, 1, 1, 1, 'Introduction +INSERT INTO pgml.notebook_cells VALUES (5, 0, 3, 5, 1, 'SELECT ''Have a nice day!'' AS greeting;', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (6, 1, 1, 1, 1, 'Introduction ------------ Most organizations have some risks that may be minimized using machine learning, by predicting the likelihood of negative outcomes before they happen. As long as you''re able to track the information leading up to the unfortunate events, there are many different machine learning algorithms that can tease out the correlations across multiple variables. @@ -132,23 +132,23 @@ We''ll build out a simple ecommerce schema, and populate it with some example da

Part 1: Ecommerce Application Data Model

We''ll build out a simple ecommerce schema, and populate it with some example data. First, our store needs some products to sell. Products have a name, their price, and other metadata, like whether or not they are perishable goods.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (7, 1, 3, 2, 1, 'CREATE TABLE products ( +INSERT INTO pgml.notebook_cells VALUES (7, 1, 3, 2, 1, 'CREATE TABLE products ( emoji TEXT PRIMARY KEY, name TEXT, price MONEY, perishable BOOLEAN );', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (8, 1, 3, 3, 1, 'INSERT INTO PRODUCTS (emoji, name, price, perishable) +INSERT INTO pgml.notebook_cells VALUES (8, 1, 3, 3, 1, 'INSERT INTO PRODUCTS (emoji, name, price, perishable) VALUES (''💰'', ''1oz gold bar'', ''$1999.99'', false), (''📕'', ''a tale of 2 cities'', ''$19.99'', false), (''🥬'', ''head of lettuce'', ''$1.99'', true) RETURNING *;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (9, 1, 1, 4, 1, 'Now that we''re in business, our first customer has shown up, named Alice. She is a chef that owns a salad shop, so she is going to create an order for 1,000 🥬 `head of lettuce`. +INSERT INTO pgml.notebook_cells VALUES (9, 1, 1, 4, 1, 'Now that we''re in business, our first customer has shown up, named Alice. She is a chef that owns a salad shop, so she is going to create an order for 1,000 🥬 `head of lettuce`. Our ecommerce site will record `orders` and their `line_items` in our database with the following schema.', '

Now that we''re in business, our first customer has shown up, named Alice. She is a chef that owns a salad shop, so she is going to create an order for 1,000 🥬 head of lettuce.

Our ecommerce site will record orders and their line_items in our database with the following schema.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (10, 1, 3, 5, 1, 'CREATE TABLE orders ( +INSERT INTO pgml.notebook_cells VALUES (10, 1, 3, 5, 1, 'CREATE TABLE orders ( id BIGSERIAL PRIMARY KEY, customer_name TEXT ); @@ -159,8 +159,8 @@ CREATE TABLE line_items ( product_emoji TEXT, count INTEGER );', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (11, 1, 1, 6, 1, 'Now that we have created the schema, we can record Alice''s order', '

Now that we have created the schema, we can record Alice''s order

', NULL, NULL); -INSERT INTO notebook_cells VALUES (12, 1, 3, 7, 1, 'INSERT INTO orders (customer_name) VALUES (''Alice''); +INSERT INTO pgml.notebook_cells VALUES (11, 1, 1, 6, 1, 'Now that we have created the schema, we can record Alice''s order', '

Now that we have created the schema, we can record Alice''s order

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (12, 1, 3, 7, 1, 'INSERT INTO orders (customer_name) VALUES (''Alice''); INSERT INTO line_items (order_id, product_emoji, count) VALUES ( @@ -170,7 +170,7 @@ VALUES ( 1000 ) RETURNING *;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (13, 1, 1, 8, 1, '🔎 That inline subquery in #7 is a little weird. +INSERT INTO pgml.notebook_cells VALUES (13, 1, 1, 8, 1, '🔎 That inline subquery in #7 is a little weird. ```sql -- a query to find Alice''s most recent order @@ -185,17 +185,17 @@ Next, we''ll record her payment in full via credit card in our `payments` table.

Typically this ID would be passed in from the application layer, instead of being retrieved during the INSERT statement itself. But anyway...

Next, we''ll record her payment in full via credit card in our payments table.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (14, 1, 3, 9, 1, 'CREATE TABLE payments ( +INSERT INTO pgml.notebook_cells VALUES (14, 1, 3, 9, 1, 'CREATE TABLE payments ( id BIGSERIAL PRIMARY KEY, order_id BIGINT, amount MONEY );', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (15, 1, 1, 10, 1, 'We''ll be doing a little bit of heavy lifting in the next query to calculate her payment total on the fly.', '

We''ll be doing a little bit of heavy lifting in the next query to calculate her payment total on the fly.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (100, 8, 1, 4, 1, '## Projects +INSERT INTO pgml.notebook_cells VALUES (15, 1, 1, 10, 1, 'We''ll be doing a little bit of heavy lifting in the next query to calculate her payment total on the fly.', '

We''ll be doing a little bit of heavy lifting in the next query to calculate her payment total on the fly.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (100, 8, 1, 4, 1, '## Projects Projects are an artifact of calls to `pgml.train`.', '

Projects

Projects are an artifact of calls to pgml.train.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (16, 1, 3, 11, 1, 'INSERT INTO payments (order_id, amount) +INSERT INTO pgml.notebook_cells VALUES (16, 1, 3, 11, 1, 'INSERT INTO payments (order_id, amount) -- a query to compute the full amount of Alice''s most recent order SELECT order_id, SUM(count * price) AS amount FROM orders @@ -204,20 +204,20 @@ JOIN products ON products.emoji = line_items.product_emoji WHERE orders.id = (SELECT max(id) AS order_id FROM orders WHERE customer_name = ''Alice'') GROUP BY 1 RETURNING *;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (17, 1, 1, 12, 1, '🎉 Time to celebrate! Alice has paid in full for our first order, and business is good. +INSERT INTO pgml.notebook_cells VALUES (17, 1, 1, 12, 1, '🎉 Time to celebrate! Alice has paid in full for our first order, and business is good. Now, along comes Bob "the bad guy" who places an order for a 💰 1oz gold bar.', '

🎉 Time to celebrate! Alice has paid in full for our first order, and business is good.

Now, along comes Bob "the bad guy" who places an order for a 💰 1oz gold bar.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (18, 1, 3, 13, 1, 'INSERT INTO orders (customer_name) VALUES (''Bob''); +INSERT INTO pgml.notebook_cells VALUES (18, 1, 3, 13, 1, 'INSERT INTO orders (customer_name) VALUES (''Bob''); INSERT INTO line_items (order_id, product_emoji, count) VALUES ( (SELECT max(id) FROM orders WHERE customer_name = ''Bob''), ''💰'', 1 ) RETURNING *;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (19, 1, 1, 14, 1, 'Unfortunately, Bob makes his payment with a stolen credit card, but we don''t know that yet.', '

Unfortunately, Bob makes his payment with a stolen credit card, but we don''t know that yet.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (20, 1, 3, 15, 1, 'INSERT INTO payments (order_id, amount) +INSERT INTO pgml.notebook_cells VALUES (19, 1, 1, 14, 1, 'Unfortunately, Bob makes his payment with a stolen credit card, but we don''t know that yet.', '

Unfortunately, Bob makes his payment with a stolen credit card, but we don''t know that yet.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (20, 1, 3, 15, 1, 'INSERT INTO payments (order_id, amount) -- a query to compute the full amount of Bob''s most recent order SELECT order_id, SUM(count * price) AS amount FROM orders @@ -226,19 +226,19 @@ JOIN products ON products.emoji = line_items.product_emoji WHERE orders.id = (SELECT max(id) AS order_id FROM orders WHERE customer_name = ''Bob'') GROUP BY 1 RETURNING *;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (21, 1, 1, 16, 1, 'At the end of the month, the credit card company lets us know about the chargeback from the real card owner. We''ll need to create another table to keep track of this.', '

At the end of the month, the credit card company lets us know about the chargeback from the real card owner. We''ll need to create another table to keep track of this.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (22, 1, 3, 17, 1, 'CREATE TABLE chargebacks ( +INSERT INTO pgml.notebook_cells VALUES (21, 1, 1, 16, 1, 'At the end of the month, the credit card company lets us know about the chargeback from the real card owner. We''ll need to create another table to keep track of this.', '

At the end of the month, the credit card company lets us know about the chargeback from the real card owner. We''ll need to create another table to keep track of this.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (22, 1, 3, 17, 1, 'CREATE TABLE chargebacks ( id BIGSERIAL PRIMARY KEY, payment_id BIGINT )', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (23, 1, 1, 18, 1, 'And now we can record the example of fraud', '

And now we can record the example of fraud

', NULL, NULL); -INSERT INTO notebook_cells VALUES (24, 1, 3, 19, 1, 'INSERT INTO chargebacks (payment_id) +INSERT INTO pgml.notebook_cells VALUES (23, 1, 1, 18, 1, 'And now we can record the example of fraud', '

And now we can record the example of fraud

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (24, 1, 3, 19, 1, 'INSERT INTO chargebacks (payment_id) SELECT max(payments.id) AS payment_id FROM payments JOIN orders ON payments.order_id = orders.id WHERE customer_name = ''Bob'' RETURNING *;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (25, 1, 1, 20, 1, '🏁 Congratulations! 🏁 +INSERT INTO pgml.notebook_cells VALUES (25, 1, 1, 20, 1, '🏁 Congratulations! 🏁 ---------------- If you''ve made it this far, you''ve won half the machine learning battle. We have created 2 training data examples that are perfect for "supervised" machine learning. The chargebacks act as the ground truth to inform the machine learning algorithm of whether or not an order is fraudulent. These records are what we refer to as "labels", a.k.a "targets" or "Y-values" for the data. @@ -248,7 +248,7 @@ We can construct a query that provides a summary view of our orders, including t

If you''ve made it this far, you''ve won half the machine learning battle. We have created 2 training data examples that are perfect for "supervised" machine learning. The chargebacks act as the ground truth to inform the machine learning algorithm of whether or not an order is fraudulent. These records are what we refer to as "labels", a.k.a "targets" or "Y-values" for the data.

Part 2: Structuring the Training Data

We can construct a query that provides a summary view of our orders, including the fraudulent label.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (26, 1, 3, 21, 1, 'CREATE VIEW orders_summaries AS +INSERT INTO pgml.notebook_cells VALUES (26, 1, 3, 21, 1, 'CREATE VIEW orders_summaries AS SELECT orders.id AS order_id, orders.customer_name, @@ -265,10 +265,10 @@ LEFT JOIN line_items ON line_items.order_id = orders.id LEFT JOIN products ON products.emoji = line_items.product_emoji GROUP BY 1, 2, 3, 5 ORDER BY orders.id;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (27, 1, 1, 22, 1, 'Now, let''s have a look at the summary', '

Now, let''s have a look at the summary

', NULL, NULL); -INSERT INTO notebook_cells VALUES (28, 1, 3, 23, 1, 'SELECT * FROM orders_summaries;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (29, 1, 1, 24, 1, 'It''s intuitive that thieves will be more attracted to gold bars than a head of lettuce because the resell value is better. Perishable goods are more difficult to move on the black market. A good piece of information for our model would be the percentage of the order that is perishable. We call this a "feature" of the data model. We can construct a query to return this feature for each order, along with the chargeback label.', '

It''s intuitive that thieves will be more attracted to gold bars, than a head of lettuce because the resell value is better. Perishable goods are more difficult to move on the black market. A good piece of information for our model would be the percentage of the order that is perishable. We call this a "feature" of the data model. We can construct a query to return this feature for each order, along with the chargeback label.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (30, 1, 3, 25, 1, 'CREATE VIEW fraud_samples AS +INSERT INTO pgml.notebook_cells VALUES (27, 1, 1, 22, 1, 'Now, let''s have a look at the summary', '

Now, let''s have a look at the summary

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (28, 1, 3, 23, 1, 'SELECT * FROM orders_summaries;', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (29, 1, 1, 24, 1, 'It''s intuitive that thieves will be more attracted to gold bars than a head of lettuce because the resell value is better. Perishable goods are more difficult to move on the black market. A good piece of information for our model would be the percentage of the order that is perishable. We call this a "feature" of the data model. We can construct a query to return this feature for each order, along with the chargeback label.', '

It''s intuitive that thieves will be more attracted to gold bars, than a head of lettuce because the resell value is better. Perishable goods are more difficult to move on the black market. A good piece of information for our model would be the percentage of the order that is perishable. We call this a "feature" of the data model. We can construct a query to return this feature for each order, along with the chargeback label.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (30, 1, 3, 25, 1, 'CREATE VIEW fraud_samples AS SELECT SUM(CASE WHEN products.perishable THEN (count * price) ELSE ''$0.0'' END) / SUM(payments.amount) AS perishable_percentage, CASE WHEN chargebacks.id IS NOT NULL @@ -282,23 +282,23 @@ LEFT JOIN line_items ON line_items.order_id = orders.id LEFT JOIN products ON products.emoji = line_items.product_emoji GROUP BY orders.id, chargebacks.id ORDER BY orders.id;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (31, 1, 3, 26, 1, 'SELECT * FROM fraud_samples;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (48, 1, 1, 43, 1, 'Uh oh, the model was trained on a perfectly small dataset. It learned that unless the order is perishable goods, it''s going to predict fraud 100% of the time, but our test data shows that''s not 100% true. Let''s generate some samples to further explore our model.', '

Uh oh, the model was trained on a perfectly small dataset. It learned that unless the order is perishable goods, it''s going to predict fraud 100% of the time, but our test data shows that''s not 100% true. Let''s generate some samples to further explore our model.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (49, 1, 3, 44, 1, 'WITH exploration_samples AS ( +INSERT INTO pgml.notebook_cells VALUES (31, 1, 3, 26, 1, 'SELECT * FROM fraud_samples;', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (48, 1, 1, 43, 1, 'Uh oh, the model was trained on a perfectly small dataset. It learned that unless the order is perishable goods, it''s going to predict fraud 100% of the time, but our test data shows that''s not 100% true. Let''s generate some samples to further explore our model.', '

Uh oh, the model was trained on a perfectly small dataset. It learned that unless the order is perishable goods, it''s going to predict fraud 100% of the time, but our test data shows that''s not 100% true. Let''s generate some samples to further explore our model.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (49, 1, 3, 44, 1, 'WITH exploration_samples AS ( SELECT generate_series(0, 1, 0.1) AS perishable_percentage ) SELECT perishable_percentage, pgml.predict(''Our Fraud Classification'', ARRAY[perishable_percentage::real]) AS predict_fraud FROM exploration_samples;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (101, 8, 3, 5, 1, 'SELECT id, name, task::TEXT FROM pgml.projects LIMIT 10;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (158, 6, 3, 23, 1, 'SELECT pgml.distance_l2(ARRAY[1.0::real, 2.0, 3.0], ARRAY[4.0, 5.0, 6.0]);', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (32, 1, 1, 27, 1, 'Training a model +INSERT INTO pgml.notebook_cells VALUES (101, 8, 3, 5, 1, 'SELECT id, name, task::TEXT FROM pgml.projects LIMIT 10;', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (158, 6, 3, 23, 1, 'SELECT pgml.distance_l2(ARRAY[1.0::real, 2.0, 3.0], ARRAY[4.0, 5.0, 6.0]);', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (32, 1, 1, 27, 1, 'Training a model ---------------- This is a great training set for a machine learning model. We''ve found a feature `perishable_percentage` that perfectly correlates with the label `fraudulent`. Perishable orders are less likely to result in a chargeback. A good model will be able to generalize from the example data we have to new examples that we may never have seen before, like an order that is only 33% perishable goods. Now that we have a `VIEW` of this data, we can train a "classification" model to classify the features as `fraudulent` or not.', '

Training a model

This is a great training set for a machine learning model. We''ve found a feature perishable_percentage that perfectly correlates with the label fraudulent. Perishable orders are less likely to result in a chargeback. A good model will be able to generalize from the example data we have to new examples that we may never have seen before, like an order that is only 33% perishable goods. Now that we have a VIEW of this data, we can train a "classification" model to classify the features as fraudulent or not.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (33, 1, 3, 28, 1, 'SELECT * FROM pgml.train( +INSERT INTO pgml.notebook_cells VALUES (33, 1, 3, 28, 1, 'SELECT * FROM pgml.train( project_name => ''Our Fraud Model'', -- a friendly name we''ll use to identify this machine learning project task => ''classification'', -- we want to classify into true or false relation_name => ''fraud_samples'', -- our view of the data @@ -306,7 +306,7 @@ INSERT INTO notebook_cells VALUES (33, 1, 3, 28, 1, 'SELECT * FROM pgml.train( test_sampling => ''last'', -- the part of the data to use for testing our model test_size => 0.5 -- use half the data for tests );', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (34, 1, 1, 29, 1, 'Oops. We''re going to get an error: +INSERT INTO pgml.notebook_cells VALUES (34, 1, 1, 29, 1, 'Oops. We''re going to get an error: ``` ERROR: ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: False @@ -316,7 +316,7 @@ Wait a second, we know there is both a True and a False label, because we have a
ERROR:  ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: False
 

Wait a second, we know there is both a True and a False label, because we have an example of both a fraudulent and legit order. What gives? This is a glimpse into how PostgresML works inside the black box. It splits the sample data into 2 sets. One is used for training the model as we expected, and the other is used to test the model''s predictions against the remaining known labels. This way we can see how well the model generalizes. In this case, since there are only 2 data samples, 1 is used for training (the False label) and 1 is used for testing (the True label). Now we can understand there isn''t enough data to actually train and test. We need to generate a couple more examples so we have enough to train and test.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (35, 1, 3, 30, 1, 'INSERT INTO orders (customer_name) VALUES (''Carol''); +INSERT INTO pgml.notebook_cells VALUES (35, 1, 3, 30, 1, 'INSERT INTO orders (customer_name) VALUES (''Carol''); INSERT INTO line_items ( order_id, product_emoji, @@ -327,8 +327,8 @@ INSERT INTO line_items ( 10 ) RETURNING *;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (36, 1, 1, 31, 1, 'Carol has bought a book, and now will legitimately pay in full.', '

Carol has bought a book, and now will legitimately pay in full.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (37, 1, 3, 32, 1, 'INSERT INTO payments (order_id, amount) +INSERT INTO pgml.notebook_cells VALUES (36, 1, 1, 31, 1, 'Carol has bought a book, and now will legitimately pay in full.', '

Carol has bought a book, and now will legitimately pay in full.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (37, 1, 3, 32, 1, 'INSERT INTO payments (order_id, amount) -- a query to compute the full amount of Carol''s most recent order SELECT order_id, SUM(count * price) AS amount FROM orders @@ -337,8 +337,8 @@ JOIN products ON products.emoji = line_items.product_emoji WHERE orders.id = (SELECT max(id) AS order_id FROM orders WHERE customer_name = ''Carol'') GROUP BY 1 RETURNING *;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (38, 1, 1, 33, 1, 'And now Dan (another fraudster) shows up to steal more books:', '

And now Dan (another fraudster) shows up to steal more books:

', NULL, NULL); -INSERT INTO notebook_cells VALUES (39, 1, 3, 34, 1, 'INSERT INTO orders (customer_name) VALUES (''Dan''); +INSERT INTO pgml.notebook_cells VALUES (38, 1, 1, 33, 1, 'And now Dan (another fraudster) shows up to steal more books:', '

And now Dan (another fraudster) shows up to steal more books:

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (39, 1, 3, 34, 1, 'INSERT INTO orders (customer_name) VALUES (''Dan''); INSERT INTO line_items ( order_id, product_emoji, @@ -349,8 +349,8 @@ INSERT INTO line_items ( 50 ) RETURNING *;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (40, 1, 1, 35, 1, 'Here comes the fraudulent payment.', '

Here comes the fraudulent payment.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (41, 1, 3, 36, 1, 'INSERT INTO payments (order_id, amount) +INSERT INTO pgml.notebook_cells VALUES (40, 1, 1, 35, 1, 'Here comes the fraudulent payment.', '

Here comes the fraudulent payment.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (41, 1, 3, 36, 1, 'INSERT INTO payments (order_id, amount) -- a query to compute the full amount of Dan''s most recent order SELECT order_id, SUM(count * price) AS amount FROM orders @@ -359,15 +359,15 @@ JOIN products ON products.emoji = line_items.product_emoji WHERE orders.id = (SELECT max(id) AS order_id FROM orders WHERE customer_name = ''Dan'') GROUP BY 1 RETURNING *;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (42, 1, 1, 37, 1, 'And when the credit card company let''s us know about the issue, we''ll record it.', '

And when the credit card company let''s us know about the issue, we''ll record it.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (43, 1, 3, 38, 1, 'INSERT INTO chargebacks (payment_id) +INSERT INTO pgml.notebook_cells VALUES (42, 1, 1, 37, 1, 'And when the credit card company let''s us know about the issue, we''ll record it.', '

And when the credit card company let''s us know about the issue, we''ll record it.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (43, 1, 3, 38, 1, 'INSERT INTO chargebacks (payment_id) SELECT max(payments.id) AS payment_id FROM payments JOIN orders ON payments.order_id = orders.id WHERE customer_name = ''Dan'' RETURNING *;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (44, 1, 1, 39, 1, 'And now we can try to train the model again.', '

And now we can try to train the model again.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (45, 1, 3, 40, 1, 'SELECT * FROM pgml.train( +INSERT INTO pgml.notebook_cells VALUES (44, 1, 1, 39, 1, 'And now we can try to train the model again.', '

And now we can try to train the model again.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (45, 1, 3, 40, 1, 'SELECT * FROM pgml.train( project_name => ''Our Fraud Classification'', -- a friendly name we''ll use to identify this machine learning project task => ''classification'', -- we want to classify into true or false relation_name => ''fraud_samples'', -- our view of the data @@ -375,17 +375,17 @@ INSERT INTO notebook_cells VALUES (45, 1, 3, 40, 1, 'SELECT * FROM pgml.train( test_sampling => ''last'', test_size => 0.5 -- use half the data for testing rather than the default test size of 25% );', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (46, 1, 1, 41, 1, '🏁 Success! 🏁 +INSERT INTO pgml.notebook_cells VALUES (46, 1, 1, 41, 1, '🏁 Success! 🏁 -------------- We can demonstrate basic usage of the model with another SQL call', '

🏁 Success! 🏁

We can demonstrate basic usage of the model with another SQL call

', NULL, NULL); -INSERT INTO notebook_cells VALUES (47, 1, 3, 42, 1, 'SELECT +INSERT INTO pgml.notebook_cells VALUES (47, 1, 3, 42, 1, 'SELECT perishable_percentage, fraudulent, pgml.predict(''Our Fraud Classification'', ARRAY[perishable_percentage::real]) AS predict_fraud FROM fraud_samples;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (130, 4, 3, 13, 1, 'SELECT * FROM pgml.train( +INSERT INTO pgml.notebook_cells VALUES (130, 4, 3, 13, 1, 'SELECT * FROM pgml.train( ''Diabetes Progression'', algorithm => ''xgboost'', search => ''grid'', @@ -395,15 +395,15 @@ INSERT INTO notebook_cells VALUES (130, 4, 3, 13, 1, 'SELECT * FROM pgml.train( "learning_rate": [0.1, 0.2] }'' );', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (159, 6, 3, 24, 1, 'SELECT pgml.dot_product(ARRAY[1.0::real, 2.0, 3.0], ARRAY[4.0, 5.0, 6.0]);', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (50, 1, 1, 45, 1, 'The default model is a linear regression, so it has learned from the training half of the data that high amounts of perishable goods make for safe orders. +INSERT INTO pgml.notebook_cells VALUES (159, 6, 3, 24, 1, 'SELECT pgml.dot_product(ARRAY[1.0::real, 2.0, 3.0], ARRAY[4.0, 5.0, 6.0]);', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (50, 1, 1, 45, 1, 'The default model is a linear regression, so it has learned from the training half of the data that high amounts of perishable goods make for safe orders. Part 4: Adding more features ---------------------------- We need to add some more features to create a better model. Instead of just using the perishable percentage, we can use dollar values as our features, since we know criminals want to steal large amounts more than small amounts.', '

The default model is a linear regression, so it has learned from the training half of the data that high amounts of perishable goods make for safe orders.

Part 4: Adding more features

We need to add some more features to create a better model. Instead of just using the perishable percentage, we can use dollar values as our features, since we know criminals want to steal large amounts more than small amounts.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (51, 1, 3, 46, 1, 'DROP VIEW fraud_samples; +INSERT INTO pgml.notebook_cells VALUES (51, 1, 3, 46, 1, 'DROP VIEW fraud_samples; CREATE VIEW fraud_samples AS SELECT SUM(CASE WHEN products.perishable THEN (count * price)::NUMERIC::FLOAT ELSE 0.0 END) AS perishable_amount, @@ -419,8 +419,8 @@ LEFT JOIN line_items ON line_items.order_id = orders.id LEFT JOIN products ON products.emoji = line_items.product_emoji GROUP BY orders.id, chargebacks.id ORDER BY orders.id;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (52, 1, 1, 47, 1, 'And now we retrain a new version of the model, by calling train with the same parameters again.', '

And now we retrain a new version of the model, by calling train with the same parameters again.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (53, 1, 3, 48, 1, 'SELECT * FROM pgml.train( +INSERT INTO pgml.notebook_cells VALUES (52, 1, 1, 47, 1, 'And now we retrain a new version of the model, by calling train with the same parameters again.', '

And now we retrain a new version of the model, by calling train with the same parameters again.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (53, 1, 3, 48, 1, 'SELECT * FROM pgml.train( project_name => ''Our Fraud Classification'', -- a friendly name we''ll use to identify this machine learning project task => ''classification'', -- we want to classify into true or false relation_name => ''fraud_samples'', -- our view of the data @@ -428,10 +428,10 @@ INSERT INTO notebook_cells VALUES (53, 1, 3, 48, 1, 'SELECT * FROM pgml.train( test_sampling => ''last'', test_size => 0.5 -- use half the data for testing rather than the default test size of 25% );', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (54, 1, 1, 49, 1, 'And then we can deploy this most recent version', '

And then we can deploy this most recent version

', NULL, NULL); -INSERT INTO notebook_cells VALUES (55, 1, 3, 50, 1, 'SELECT * FROM pgml.deploy(''Our Fraud Classification'', ''most_recent'');', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (56, 1, 1, 51, 1, 'And view the input/outputs of this model based on our data:', '

And view the input/outputs of this model based on our data:

', NULL, NULL); -INSERT INTO notebook_cells VALUES (57, 1, 3, 52, 1, 'SELECT +INSERT INTO pgml.notebook_cells VALUES (54, 1, 1, 49, 1, 'And then we can deploy this most recent version', '

And then we can deploy this most recent version

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (55, 1, 3, 50, 1, 'SELECT * FROM pgml.deploy(''Our Fraud Classification'', ''most_recent'');', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (56, 1, 1, 51, 1, 'And view the input/outputs of this model based on our data:', '

And view the input/outputs of this model based on our data:

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (57, 1, 3, 52, 1, 'SELECT perishable_amount, non_perishable_amount, fraudulent, @@ -440,7 +440,7 @@ INSERT INTO notebook_cells VALUES (57, 1, 3, 52, 1, 'SELECT ARRAY[perishable_amount::real, non_perishable_amount::real] ) AS predict_fraud FROM fraud_samples;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (58, 1, 1, 53, 1, 'This is the basic development cycle for a model. +INSERT INTO pgml.notebook_cells VALUES (58, 1, 1, 53, 1, 'This is the basic development cycle for a model. 1. Add new features. 2. Retrain the new model. @@ -526,12 +526,12 @@ WITH customer_stats AS ( ... LEFT JOIN customer_stats ON customer_stats.customer_name = orders.customer_name ', NULL, NULL); -INSERT INTO notebook_cells VALUES (59, 1, 1, 54, 1, 'Part 5: Upgrading the Machine Learning Algorithm +INSERT INTO pgml.notebook_cells VALUES (59, 1, 1, 54, 1, 'Part 5: Upgrading the Machine Learning Algorithm ------------------------------------------ When you''re out of ideas for features that might help the model distinguish orders that are likely to result in chargebacks, you may want to start testing different algorithms to see how the performance changes. PostgresML makes algorithm selection as easy as passing an additional parameter to `pgml.train`. You may want to test them all just to see, but `xgboost` typically gives excellent performance in terms of both accuracy and latency.', '

Part 5: Upgrading the Machine Learning Algorithm

When you''re out of ideas for features that might help the model distinguish orders that are likely to result in chargebacks, you may want to start testing different algorithms to see how the performance changes. PostgresML makes algorithm selection as easy as passing an additional parameter to pgml.train. You may want to test them all just to see, but xgboost typically gives excellent performance in terms of both accuracy and latency.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (60, 1, 3, 55, 1, 'SELECT * FROM pgml.train( +INSERT INTO pgml.notebook_cells VALUES (60, 1, 3, 55, 1, 'SELECT * FROM pgml.train( project_name => ''Our Fraud Classification'', -- a friendly name we''ll use to identify this machine learning project task => ''classification'', -- we want to classify into true or false relation_name => ''fraud_samples'', -- our view of the data @@ -540,8 +540,8 @@ INSERT INTO notebook_cells VALUES (60, 1, 3, 55, 1, 'SELECT * FROM pgml.train( test_size => 0.5, -- use half the data for testing rather than the default test size of 25% test_sampling => ''last'' );', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (61, 1, 1, 56, 1, 'So far we''ve been training a classifier that gives us a binary 0 or 1 output to indicate fraud or not. If we''d like to refine our application response to the models predictions in a more nuanced way, say high/medium/low risk instead of binary, we can use "regression" instead of "classification" to predict a likelihood between 0 and 1, instead of binary.', '

So far we''ve been training a classifier that gives us a binary 0 or 1 output to indicate fraud or not. If we''d like to refine our application response to the models predictions in a more nuanced way, say high/medium/low risk instead of binary, we can use "regression" instead of "classification" to predict a likelihood between 0 and 1, instead of binary.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (62, 1, 3, 57, 1, 'SELECT * FROM pgml.train( +INSERT INTO pgml.notebook_cells VALUES (61, 1, 1, 56, 1, 'So far we''ve been training a classifier that gives us a binary 0 or 1 output to indicate fraud or not. If we''d like to refine our application response to the models predictions in a more nuanced way, say high/medium/low risk instead of binary, we can use "regression" instead of "classification" to predict a likelihood between 0 and 1, instead of binary.', '

So far we''ve been training a classifier that gives us a binary 0 or 1 output to indicate fraud or not. If we''d like to refine our application response to the models predictions in a more nuanced way, say high/medium/low risk instead of binary, we can use "regression" instead of "classification" to predict a likelihood between 0 and 1, instead of binary.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (62, 1, 3, 57, 1, 'SELECT * FROM pgml.train( project_name => ''Our Fraud Regression'', -- a friendly name we''ll use to identify this machine learning project task => ''regression'', -- predict the likelihood relation_name => ''fraud_samples'', -- our view of the data @@ -550,17 +550,17 @@ INSERT INTO notebook_cells VALUES (62, 1, 3, 57, 1, 'SELECT * FROM pgml.train( test_size => 0.5, -- use half the data for testing rather than the default test size of 25% test_sampling => ''last'' );', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (152, 6, 1, 17, 1, '### Normalization', '

Normalization

', NULL, NULL); -INSERT INTO notebook_cells VALUES (153, 6, 3, 18, 1, 'SELECT pgml.normalize_l1(ARRAY[1.0::real, 2.0, 3.0]);', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (63, 1, 1, 58, 1, 'At this point, the primary limitation of our model is the amount of data, the number of examples we have to train it on. Luckily, as time marches on, and data accumulates in the database, we can simply retrain this model with additional calls to `pgml.train` and watch it adjust as new information becomes available.', '

At this point, the primary limitation of our model is the amount of data, the number of examples we have to train it on. Luckily, as time marches on, and data accumulates in the database, we can simply retrain this model with additional calls to pgml.train and watch it adjust as new information becomes available.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (64, 1, 3, 59, 1, '-- If you''d like to start this tutorial over, you can clear out the tables we created. +INSERT INTO pgml.notebook_cells VALUES (152, 6, 1, 17, 1, '### Normalization', '

Normalization

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (153, 6, 3, 18, 1, 'SELECT pgml.normalize_l1(ARRAY[1.0::real, 2.0, 3.0]);', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (63, 1, 1, 58, 1, 'At this point, the primary limitation of our model is the amount of data, the number of examples we have to train it on. Luckily, as time marches on, and data accumulates in the database, we can simply retrain this model with additional calls to `pgml.train` and watch it adjust as new information becomes available.', '

At this point, the primary limitation of our model is the amount of data, the number of examples we have to train it on. Luckily, as time marches on, and data accumulates in the database, we can simply retrain this model with additional calls to pgml.train and watch it adjust as new information becomes available.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (64, 1, 3, 59, 1, '-- If you''d like to start this tutorial over, you can clear out the tables we created. -- use Ctrl-/ to comment/uncomment blocks in this editor. DROP TABLE IF EXISTS products CASCADE; DROP TABLE IF EXISTS orders CASCADE; DROP TABLE IF EXISTS line_items CASCADE; DROP TABLE IF EXISTS chargebacks CASCADE; DROP TABLE IF EXISTS payments CASCADE;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (65, 2, 1, 1, 1, 'Binary classification means categorizing data into 2 categories. Usually these are categories like: +INSERT INTO pgml.notebook_cells VALUES (65, 2, 1, 1, 1, 'Binary classification means categorizing data into 2 categories. Usually these are categories like: - `True` or `False` - `0` or `1` @@ -582,23 +582,23 @@ You can load this dataset into your Postgres database with the following SQL.',

In this tutorial, we''ll train models using various "supervised learning" algorithms to classify medical samples as benign or malignant. Supervised learning techniques require us to label the sample data for the algorithm to learn how the inputs correlate with the labels. After the algorithm has been trained on the labeled data set we created, we can present it with new unlabeled data to classify based on the most likely outcome.

As we saw in Tutorial 1: Real Time Fraud Model understanding the structure of the data and the labels is a complex and critical step for real world machine learning projects. In this example we''ll focus more on the different algorithms, and use an academic benchmark dataset that already includes binary labels from UCI ML Breast Cancer Wisconsin. Features were computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image. The labels are either True for a malignant sample of False for a benign sample.

You can load this dataset into your Postgres database with the following SQL.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (66, 2, 3, 2, 1, 'SELECT * FROM pgml.load_dataset(''breast_cancer'');', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (67, 2, 1, 3, 1, 'This function has created a new table in your database named `pgml.breast_cancer`. Let''s look at a random sample of the data with some more SQL.', '

This function has created a new table in your database named pgml.breast_cancer. Let''s look at a random sample of the data with some more SQL.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (68, 2, 3, 4, 1, 'SELECT * +INSERT INTO pgml.notebook_cells VALUES (66, 2, 3, 2, 1, 'SELECT * FROM pgml.load_dataset(''breast_cancer'');', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (67, 2, 1, 3, 1, 'This function has created a new table in your database named `pgml.breast_cancer`. Let''s look at a random sample of the data with some more SQL.', '

This function has created a new table in your database named pgml.breast_cancer. Let''s look at a random sample of the data with some more SQL.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (68, 2, 3, 4, 1, 'SELECT * FROM pgml.breast_cancer ORDER BY random() LIMIT 10;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (69, 2, 1, 5, 1, 'That''s a lot of numeric feature data describing various attributes of the cells, but if you scroll all the way to the right above, after running the query, you''ll see that each sample set of feature data is labeled `malignant` [`True` or `False`]. It would be extremely difficult for a human to study all these numbers, and see how they correlate with malignant or not, and then be able to make a prediction for new samples, but mathematicians have been working on algorithms to do exactly this using computers which happen to be exceptionally good at this by now. This is statistical machine learning. +INSERT INTO pgml.notebook_cells VALUES (69, 2, 1, 5, 1, 'That''s a lot of numeric feature data describing various attributes of the cells, but if you scroll all the way to the right above, after running the query, you''ll see that each sample set of feature data is labeled `malignant` [`True` or `False`]. It would be extremely difficult for a human to study all these numbers, and see how they correlate with malignant or not, and then be able to make a prediction for new samples, but mathematicians have been working on algorithms to do exactly this using computers which happen to be exceptionally good at this by now. This is statistical machine learning. PostgresML makes it easy to use this data to create a model. It only takes a single function call with a few parameters.', '

That''s a lot of numeric feature data describing various attributes of the cells, but if you scroll all the way to the right above, after running the query, you''ll see that each sample set of feature data is labeled malignant [True or False]. It would be extremely difficult for a human to study all these numbers, and see how they correlate with malignant or not, and then be able to make a prediction for new samples, but mathematicians have been working on algorithms to do exactly this using computers which happen to be exceptionally good at this by now. This is statistical machine learning.

PostgresML makes it easy to use this data to create a model. It only takes a single function call with a few parameters.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (70, 2, 3, 6, 1, 'SELECT * FROM pgml.train( +INSERT INTO pgml.notebook_cells VALUES (70, 2, 3, 6, 1, 'SELECT * FROM pgml.train( project_name => ''Breast Cancer Detection'', task => ''classification'', relation_name => ''pgml.breast_cancer'', y_column_name => ''malignant'' );', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (71, 2, 1, 7, 1, '🏁 Congratulations 🏁 +INSERT INTO pgml.notebook_cells VALUES (71, 2, 1, 7, 1, '🏁 Congratulations 🏁 --------------------- You''ve just created a machine learning model, tested it''s accuracy, and deployed it to production. PostgresML orchestrated a bunch of the traditional ML drudgery in that couple of seconds to make it as simple as possible for you to get value. We''ll organize our work on this task under the project name "Breast Cancer Detection", which you can now see it in your [list of projects](../../projects/). You can see that the first model uses the default linear algorithm, and that it achieves an [F1 score](https://en.wikipedia.org/wiki/F-score) in the mid 90''s, which is pretty good. A score of 1.0 is perfect, and 0.5 would be as good as random guessing. The better the F1 score, the better the algorithm can perform on this dataset. @@ -606,7 +606,7 @@ You''ve just created a machine learning model, tested it''s accuracy, and deploy We can now use this model to make some predictions in real time, using the training data as input to the `pgml.predict` function.', '

🏁 Congratulations 🏁

You''ve just created a machine learning model, tested it''s accuracy, and deployed it to production. PostgresML orchestrated a bunch of the traditional ML drudgery in that couple of seconds to make it as simple as possible for you to get value. We''ll organize our work on this task under the project name "Breast Cancer Detection", which you can now see it in your list of projects. You can see that the first model uses the default linear algorithm, and that it achieves an F1 score in the mid 90''s, which is pretty good. A score of 1.0 is perfect, and 0.5 would be as good as random guessing. The better the F1 score, the better the algorithm can perform on this dataset.

We can now use this model to make some predictions in real time, using the training data as input to the pgml.predict function.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (72, 2, 3, 8, 1, 'SELECT malignant, pgml.predict( +INSERT INTO pgml.notebook_cells VALUES (72, 2, 3, 8, 1, 'SELECT malignant, pgml.predict( ''Breast Cancer Detection'', ARRAY[ "mean radius", @@ -644,12 +644,12 @@ INSERT INTO notebook_cells VALUES (72, 2, 3, 8, 1, 'SELECT malignant, pgml.predi FROM pgml.breast_cancer ORDER BY random() LIMIT 10;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (154, 6, 3, 19, 1, 'SELECT pgml.normalize_l2(ARRAY[1.0::real, 2.0, 3.0]);', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (73, 2, 1, 9, 1, 'You can see the model is pretty good at predicting `0` for non malignant samples, and `1` for malignant samples. This isn''t a great test though, because we''re using the same data we trained with. We could have just looked up the data in the database table if this is all we wanted to do. The point of training a machine learning model is to generalize these statistics to data we''ve never seen before. What do you think this model would predict if all the input features happened to be 0 or 1? How does that compare to what it''s seen before? +INSERT INTO pgml.notebook_cells VALUES (154, 6, 3, 19, 1, 'SELECT pgml.normalize_l2(ARRAY[1.0::real, 2.0, 3.0]);', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (73, 2, 1, 9, 1, 'You can see the model is pretty good at predicting `0` for non malignant samples, and `1` for malignant samples. This isn''t a great test though, because we''re using the same data we trained with. We could have just looked up the data in the database table if this is all we wanted to do. The point of training a machine learning model is to generalize these statistics to data we''ve never seen before. What do you think this model would predict if all the input features happened to be 0 or 1? How does that compare to what it''s seen before? It''s easy to test the model and see by providing new sample data in real time. 
There are lots of ways we could feed new data to a model in Postgres. We could write new samples to a table just like our training data, or we could pass parameters directly into a query without recording anything in the database at all. Postgres gives us a lot of ways to get data in and out at run time. We''ll demonstrate with a `VALUES` example for batch prediction.', '

You can see the model is pretty good at predicting 0 for non malignant samples, and 1 for malignant samples. This isn''t a great test though, because we''re using the same data we trained with. We could have just looked up the data in the database table if this is all we wanted to do. The point of training a machine learning model is to generalize these statistics to data we''ve never seen before. What do you think this model would predict if all the input features happened to be 0 or 1? How does that compare to what it''s seen before?

It''s easy to test the model and see by providing new sample data in real time. There are lots of ways we could feed new data to a model in Postgres. We could write new samples to a table just like our training data, or we could pass parameters directly into a query without recording anything in the database at all. Postgres gives us a lot of ways to get data in and out at run time. We''ll demonstrate with a VALUES example for batch prediction.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (74, 2, 3, 10, 1, 'SELECT sample_name, pgml.predict( +INSERT INTO pgml.notebook_cells VALUES (74, 2, 3, 10, 1, 'SELECT sample_name, pgml.predict( ''Breast Cancer Detection'', ARRAY[ "mean radius", @@ -722,19 +722,19 @@ FROM ( "worst symmetry", "worst fractal dimension" );', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (75, 2, 1, 11, 1, 'Even though the inputs are not data we''ve ever seen before, the model is telling us both of these new samples are likely to be benign based on their statistical correlations to the training samples we had labeled. As we collect new data samples, we could potentially use this model for multiple purposes, like screening the samples before doing further more expensive or invasive analysis. +INSERT INTO pgml.notebook_cells VALUES (75, 2, 1, 11, 1, 'Even though the inputs are not data we''ve ever seen before, the model is telling us both of these new samples are likely to be benign based on their statistical correlations to the training samples we had labeled. As we collect new data samples, we could potentially use this model for multiple purposes, like screening the samples before doing further more expensive or invasive analysis. To demonstrate a more concise call that omits all the feature names (careful to get the order right):', '

Even though the inputs are not data we''ve ever seen before, the model is telling us both of these new samples are likely to be benign based on their statistical correlations to the training samples we had labeled. As we collect new data samples, we could potentially use this model for multiple purposes, like screening the samples before doing further more expensive or invasive analysis.

To demonstrate a more concise call that omits all the feature names (careful to get the order right):

', NULL, NULL); -INSERT INTO notebook_cells VALUES (76, 2, 3, 12, 1, 'SELECT pgml.predict( +INSERT INTO pgml.notebook_cells VALUES (76, 2, 3, 12, 1, 'SELECT pgml.predict( ''Breast Cancer Detection'', ARRAY[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100000] )', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (77, 2, 1, 13, 1, 'Ah hah! We put a really big number into the last feature (worst fractal dimension), and got the model to give us a `True` prediction, indicating that large values there correlate with a malignant sample all else being equal using our default linear algorithm. There are lots of ways we can probe the model with test data, but before we spend too much time on this one, it might be informative to try other algorithms. +INSERT INTO pgml.notebook_cells VALUES (77, 2, 1, 13, 1, 'Ah hah! We put a really big number into the last feature (worst fractal dimension), and got the model to give us a `True` prediction, indicating that large values there correlate with a malignant sample all else being equal using our default linear algorithm. There are lots of ways we can probe the model with test data, but before we spend too much time on this one, it might be informative to try other algorithms. PostgresML makes it easy to reuse your training data with many of the best algorithms available. Why not try them all?', '

Ah hah! We put a really big number into the last feature (worst fractal dimension), and got the model to give us a True prediction, indicating that large values there correlate with a malignant sample all else being equal using our default linear algorithm. There are lots of ways we can probe the model with test data, but before we spend too much time on this one, it might be informative to try other algorithms.

PostgresML makes it easy to reuse your training data with many of the best algorithms available. Why not try them all?

', NULL, NULL); -INSERT INTO notebook_cells VALUES (78, 2, 3, 14, 1, '-- +INSERT INTO pgml.notebook_cells VALUES (78, 2, 3, 14, 1, '-- -- After a project has been trained, omitted parameters will be reused from previous training runs -- In these examples we''ll reuse the training data snapshots from the initial call. -- @@ -761,11 +761,11 @@ SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''random_fore SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''xgboost'', hyperparams => ''{"n_estimators": 10}''); SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''xgboost_random_forest'', hyperparams => ''{"n_estimators": 10}''); SELECT * FROM pgml.train(''Breast Cancer Detection'', algorithm => ''lightgbm'', hyperparams => ''{"n_estimators": 1}'');', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (79, 2, 1, 15, 1, 'Turns out, computers are pretty fast these days, even with state of the art algorithms running on a free tier computation resources. 😊 +INSERT INTO pgml.notebook_cells VALUES (79, 2, 1, 15, 1, 'Turns out, computers are pretty fast these days, even with state of the art algorithms running on a free tier computation resources. 😊 You can pop over to the [projects](../../projects/) tab for a visualization of the performance of all these algorithms on this dataset, or you can check out the artifacts directly in the database.', '

Turns out, computers are pretty fast these days, even with state of the art algorithms running on a free tier computation resources. 😊

You can pop over to the projects tab for a visualization of the performance of all these algorithms on this dataset, or you can check out the artifacts directly in the database.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (80, 2, 3, 16, 1, 'SELECT +INSERT INTO pgml.notebook_cells VALUES (80, 2, 3, 16, 1, 'SELECT projects.name, models.algorithm, round((models.metrics->>''f1'')::numeric, 4) AS f1_score, @@ -775,20 +775,20 @@ FROM pgml.models JOIN pgml.projects on projects.id = models.project_id AND projects.name = ''Breast Cancer Detection'' ORDER BY models.metrics->>''f1'' DESC LIMIT 5;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (81, 2, 1, 17, 1, 'Tree based algorithms like `random_forest`, `xgboost` and `lightgbm` do well on tabular datasets and frequently lead the pack with A+ level performance as measured by the `f1_score`. They are generally sensitive to small changes in the inputs, but also robust to outliers. They are also relatively fast algorithms that can perform predictions in sub millisecond times, meaning most of the cost of inference is in fetching the data they require as inputs. When your inputs are already in the database with the model, that time is as fast as possible!', '

Tree based algorithms like random_forest, xgboost and lightgbm do well on tabular datasets and frequently lead the pack with A+ level performance as measured by the f1_score. They are generally sensitive to small changes in the inputs, but also robust to outliers. They are also relatively fast algorithms that can perform predictions in sub millisecond times, meaning most of the cost of inference is in fetching the data they require as inputs. When your inputs are already in the database with the model, that time is as fast as possible!

', NULL, NULL); -INSERT INTO notebook_cells VALUES (82, 4, 1, 1, 1, 'So far we''ve focussed on Classification tasks which divide the world into discrete groups. Sometimes we need to take a more nuanced view when issues are not black and white. Sometimes there are no hard boundaries between options, or sometimes one sort of classification error might be much more painful than another. There are many algorithms that can produce a raw score rather than a discrete class for us. These are "Regression" tasks instead of "Classification". +INSERT INTO pgml.notebook_cells VALUES (81, 2, 1, 17, 1, 'Tree based algorithms like `random_forest`, `xgboost` and `lightgbm` do well on tabular datasets and frequently lead the pack with A+ level performance as measured by the `f1_score`. They are generally sensitive to small changes in the inputs, but also robust to outliers. They are also relatively fast algorithms that can perform predictions in sub millisecond times, meaning most of the cost of inference is in fetching the data they require as inputs. When your inputs are already in the database with the model, that time is as fast as possible!', '

Tree based algorithms like random_forest, xgboost and lightgbm do well on tabular datasets and frequently lead the pack with A+ level performance as measured by the f1_score. They are generally sensitive to small changes in the inputs, but also robust to outliers. They are also relatively fast algorithms that can perform predictions in sub millisecond times, meaning most of the cost of inference is in fetching the data they require as inputs. When your inputs are already in the database with the model, that time is as fast as possible!

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (82, 4, 1, 1, 1, 'So far we''ve focussed on Classification tasks which divide the world into discrete groups. Sometimes we need to take a more nuanced view when issues are not black and white. Sometimes there are no hard boundaries between options, or sometimes one sort of classification error might be much more painful than another. There are many algorithms that can produce a raw score rather than a discrete class for us. These are "Regression" tasks instead of "Classification". For this example, we''ll look at several medical indicators that correlate with the progression of diabetes one year later. Let''s load up the data and take a look', '

So far we''ve focussed on Classification tasks which divide the world into discrete groups. Sometimes we need to take a more nuanced view when issues are not black and white. Sometimes there are no hard boundaries between options, or sometimes one sort of classification error might be much more painful than another. There are many algorithms that can produce a raw score rather than a discrete class for us. These are "Regression" tasks instead of "Classification".

For this example, we''ll look at several medical indicators that correlate with the progression of diabetes one year later. Let''s load up the data and take a look

', NULL, NULL); -INSERT INTO notebook_cells VALUES (83, 4, 3, 2, 1, 'SELECT * FROM pgml.load_dataset(''diabetes'');', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (84, 3, 1, 1, 1, 'Image classification is a great application of machine learning. In this tutorial we''ll examine a classic version of this problem, recognizing hand written digits to automatically parse zip codes out of addresses. For machine learning purposes, we decompose images into their uncompressed pixel values as 2D arrays for gray scale images, or 3D arrays for color images. +INSERT INTO pgml.notebook_cells VALUES (83, 4, 3, 2, 1, 'SELECT * FROM pgml.load_dataset(''diabetes'');', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (84, 3, 1, 1, 1, 'Image classification is a great application of machine learning. In this tutorial we''ll examine a classic version of this problem, recognizing hand written digits to automatically parse zip codes out of addresses. For machine learning purposes, we decompose images into their uncompressed pixel values as 2D arrays for gray scale images, or 3D arrays for color images. Convolutional Neural Nets and other forms of deep learning, leverage the 2D and 3D adjacency of the pixels to get breakthrough state of the art results on difficult image classification tasks over thousands of categories, and also for image labeling. Postgres has native support for multidimensional `ARRAY` data types, that PostgresML can treat accordingly. Let''s load the dataset to start:', '

Image classification is a great application of machine learning. In this tutorial we''ll examine a classic version of this problem, recognizing hand written digits to automatically parse zip codes out of addresses. For machine learning purposes, we decompose images into their uncompressed pixel values as 2D arrays for gray scale images, or 3D arrays for color images.

Convolutional Neural Nets and other forms of deep learning, leverage the 2D and 3D adjacency of the pixels to get breakthrough state of the art results on difficult image classification tasks over thousands of categories, and also for image labeling. Postgres has native support for multidimensional ARRAY data types, that PostgresML can treat accordingly.

Let''s load the dataset to start:

', NULL, NULL); -INSERT INTO notebook_cells VALUES (85, 5, 1, 1, 1, 'PostgresML integrates [🤗 Hugging Face Transformers](https://huggingface.co/transformers) to bring state-of-the-art models into the data layer. There are tens of thousands of pre-trained models with pipelines to turn raw inputs into useful results. Many state of the art deep learning architectures have been published and made available for download. You will want to browse all the [models](https://huggingface.co/models) available to find the perfect solution for your [dataset](https://huggingface.co/dataset) and [task](https://huggingface.co/tasks). +INSERT INTO pgml.notebook_cells VALUES (85, 5, 1, 1, 1, 'PostgresML integrates [🤗 Hugging Face Transformers](https://huggingface.co/transformers) to bring state-of-the-art models into the data layer. There are tens of thousands of pre-trained models with pipelines to turn raw inputs into useful results. Many state of the art deep learning architectures have been published and made available for download. You will want to browse all the [models](https://huggingface.co/models) available to find the perfect solution for your [dataset](https://huggingface.co/dataset) and [task](https://huggingface.co/tasks). We''ll demonstrate some of the tasks that are immediately available to users of your database upon installation. @@ -834,20 +834,20 @@ def transform(task, call, inputs): def transform(task, call, inputs): return transformers.pipeline(**task)(inputs, **call) ', NULL, NULL); -INSERT INTO notebook_cells VALUES (93, 5, 1, 4, 1, '### Sentiment Analysis +INSERT INTO pgml.notebook_cells VALUES (93, 5, 1, 4, 1, '### Sentiment Analysis Sentiment analysis is one use of `text-classification`, but there are [many others](https://huggingface.co/tasks/text-classification). This model returns both a label classification `["POSITIVE", "NEUTRAL", "NEGATIVE"]`, as well as the score where 0.0 is perfectly negative, and 1.0 is perfectly positive. 
This example demonstrates specifying the `model` to be used rather than the task. The [`roberta-large-mnli`](https://huggingface.co/roberta-large-mnli) model specifies the task of `sentiment-analysis` in it''s default configuration, so we may omit it from the parameters. Because this is a batch call with 2 inputs, we''ll get 2 outputs in the JSONB. See [text classification documentation](https://huggingface.co/tasks/text-classification) for more options and potential use cases beyond sentiment analysis. You''ll notice the outputs are not great in this example. RoBERTa is a breakthrough model that demonstrated just how important each particular hyperparameter is for the task and particular dataset regardless of how large your model is. We''ll show how to [fine tune](/user_guides/transformers/fine_tuning/) models on your data in the next step.', '

Sentiment Analysis

Sentiment analysis is one use of text-classification, but there are many others. This model returns both a label classification ["POSITIVE", "NEUTRAL", "NEGATIVE"], as well as the score where 0.0 is perfectly negative, and 1.0 is perfectly positive. This example demonstrates specifying the model to be used rather than the task. The roberta-large-mnli model specifies the task of sentiment-analysis in it''s default configuration, so we may omit it from the parameters. Because this is a batch call with 2 inputs, we''ll get 2 outputs in the JSONB.

See text classification documentation for more options and potential use cases beyond sentiment analysis. You''ll notice the outputs are not great in this example. RoBERTa is a breakthrough model that demonstrated just how important each particular hyperparameter is for the task and particular dataset regardless of how large your model is. We''ll show how to fine tune models on your data in the next step.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (86, 6, 1, 1, 1, 'PostgresML adds [native vector operations](https://github.com/postgresml/postgresml/tree/master/pgml-extension/sql/install/vectors.sql) that can be used in SQL queries. Vector operations are particularly useful for dealing with embeddings that have been generated from other machine learning algorithms and can provide functions like nearest neighbor calculations using the distance functions. +INSERT INTO pgml.notebook_cells VALUES (86, 6, 1, 1, 1, 'PostgresML adds [native vector operations](https://github.com/postgresml/postgresml/tree/master/pgml-extension/sql/install/vectors.sql) that can be used in SQL queries. Vector operations are particularly useful for dealing with embeddings that have been generated from other machine learning algorithms and can provide functions like nearest neighbor calculations using the distance functions. Emeddings can be a relatively efficient mechanism to leverage the power of deep learning, without the runtime inference costs. These functions are relatively fast and the more expensive distance functions can compute ~100k per second for a memory resident dataset on modern hardware. The PostgreSQL planner will also [automatically parallelize](https://www.postgresql.org/docs/current/parallel-query.html) evaluation on larger datasets, as configured to take advantage of multiple CPU cores when available.', '

PostgresML adds native vector operations that can be used in SQL queries. Vector operations are particularly useful for dealing with embeddings that have been generated from other machine learning algorithms and can provide functions like nearest neighbor calculations using the distance functions.

Emeddings can be a relatively efficient mechanism to leverage the power of deep learning, without the runtime inference costs. These functions are relatively fast and the more expensive distance functions can compute ~100k per second for a memory resident dataset on modern hardware.

The PostgreSQL planner will also automatically parallelize evaluation on larger datasets, as configured to take advantage of multiple CPU cores when available.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (87, 7, 1, 1, 1, 'Models are automatically deployed if their key metric (R2 for regression, F1 for classification) is improved over the currently deployed version during training. If you want to manage deploys manually, you can always change which model is currently responsible for making predictions. +INSERT INTO pgml.notebook_cells VALUES (87, 7, 1, 1, 1, 'Models are automatically deployed if their key metric (R2 for regression, F1 for classification) is improved over the currently deployed version during training. If you want to manage deploys manually, you can always change which model is currently responsible for making predictions. ``` pgml.deploy( @@ -898,10 +898,10 @@ rollback | The model that was previously deployed for this project', '
', NULL, NULL); -INSERT INTO notebook_cells VALUES (88, 7, 1, 2, 1, '', '
', NULL, '2022-08-22 15:09:15.475779'); -INSERT INTO notebook_cells VALUES (89, 8, 1, 1, 1, 'PostgresML stores all artifacts from training in the database under the `pgml` schema. You can manually inspect these tables to further understand the inner workings, or to generate additional reporting and analytics across your models.', '

PostgresML stores all artifacts from training in the database under the pgml schema. You can manually inspect these tables to further understand the inner workings, or to generate additional reporting and analytics across your models.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (90, 3, 3, 2, 1, 'SELECT * FROM pgml.load_dataset(''digits'');', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (91, 5, 1, 2, 1, '### Translation +INSERT INTO pgml.notebook_cells VALUES (88, 7, 1, 2, 1, '', '
', NULL, '2022-08-22 15:09:15.475779'); +INSERT INTO pgml.notebook_cells VALUES (89, 8, 1, 1, 1, 'PostgresML stores all artifacts from training in the database under the `pgml` schema. You can manually inspect these tables to further understand the inner workings, or to generate additional reporting and analytics across your models.', '

PostgresML stores all artifacts from training in the database under the pgml schema. You can manually inspect these tables to further understand the inner workings, or to generate additional reporting and analytics across your models.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (90, 3, 3, 2, 1, 'SELECT * FROM pgml.load_dataset(''digits'');', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (91, 5, 1, 2, 1, '### Translation There are thousands of different pre-trained translation models between language pairs. They generally take a single input string in the "from" language, and translate it into the "to" language as a result of the call. PostgresML transformations provide a batch interface where you can pass an array of `TEXT` to process in a single call for efficiency. Not all language pairs have a default task name like this example of English to French. In those cases, you''ll need to specify [the desired model](https://huggingface.co/models?pipeline_tag=translation) by name. Because this is a batch call with 2 inputs, we''ll get 2 outputs in the JSONB. See [translation documentation](https://huggingface.co/docs/transformers/tasks/translation) for more options. @@ -910,49 +910,49 @@ For a translation from English to French with the default pre-trained model:', '

There are thousands of different pre-trained translation models between language pairs. They generally take a single input string in the "from" language, and translate it into the "to" language as a result of the call. PostgresML transformations provide a batch interface where you can pass an array of TEXT to process in a single call for efficiency. Not all language pairs have a default task name like this example of English to French. In those cases, you''ll need to specify the desired model by name. Because this is a batch call with 2 inputs, we''ll get 2 outputs in the JSONB.

See translation documentation for more options.

For a translation from English to French with the default pre-trained model:

', NULL, NULL); -INSERT INTO notebook_cells VALUES (92, 5, 3, 3, 1, 'SELECT pgml.transform( +INSERT INTO pgml.notebook_cells VALUES (92, 5, 3, 3, 1, 'SELECT pgml.transform( ''translation_en_to_fr'', inputs => ARRAY[ ''Welcome to the future!'', ''Where have you been all this time?'' ] ) AS french;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (94, 5, 3, 5, 1, 'SELECT pgml.transform( +INSERT INTO pgml.notebook_cells VALUES (94, 5, 3, 5, 1, 'SELECT pgml.transform( ''{"model": "roberta-large-mnli"}''::JSONB, inputs => ARRAY[ ''I love how amazingly simple ML has become!'', ''I hate doing mundane and thankless tasks. ☹️'' ] ) AS positivity;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (95, 6, 1, 2, 1, '### Elementwise arithmetic w/ constants', '

Elementwise arithmetic w/ constants

', NULL, NULL); -INSERT INTO notebook_cells VALUES (96, 6, 3, 3, 1, '', NULL, NULL, '2022-08-22 15:14:31.875531'); -INSERT INTO notebook_cells VALUES (97, 7, 3, 2, 1, '-- deploy the "best" model for prediction use +INSERT INTO pgml.notebook_cells VALUES (95, 6, 1, 2, 1, '### Elementwise arithmetic w/ constants', '

Elementwise arithmetic w/ constants

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (96, 6, 3, 3, 1, '', NULL, NULL, '2022-08-22 15:14:31.875531'); +INSERT INTO pgml.notebook_cells VALUES (97, 7, 3, 2, 1, '-- deploy the "best" model for prediction use SELECT * FROM pgml.deploy(''Handwritten Digits'', ''best_score'');', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (98, 8, 1, 2, 1, '## Models +INSERT INTO pgml.notebook_cells VALUES (98, 8, 1, 2, 1, '## Models Models are an artifact of calls to `pgml.train`.', '

Models

Models are an artifact of calls to pgml.train.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (99, 8, 3, 3, 1, 'SELECT id, algorithm::TEXT, runtime::TEXT FROM pgml.models LIMIT 10;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (155, 6, 3, 20, 1, 'SELECT pgml.normalize_max(ARRAY[1.0::real, 2.0, 3.0]);', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (102, 8, 1, 6, 1, '## Snapshots +INSERT INTO pgml.notebook_cells VALUES (99, 8, 3, 3, 1, 'SELECT id, algorithm::TEXT, runtime::TEXT FROM pgml.models LIMIT 10;', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (155, 6, 3, 20, 1, 'SELECT pgml.normalize_max(ARRAY[1.0::real, 2.0, 3.0]);', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (102, 8, 1, 6, 1, '## Snapshots Snapshots are an artifact of calls to `pgml.train` that include a specific `relation_name` parameter. A full copy of all data in the relation at training time will be saved in a new table named `pgml.snapshot_{id}`. You can retrieve the original training data set by inspecting tables like `pgml.snapshot_1`.', '

Snapshots

Snapshots are an artifact of calls to pgml.train that include a specific relation_name parameter. A full copy of all data in the relation at training time will be saved in a new table named pgml.snapshot_{id}. You can retrieve the original training data set by inspecting tables like pgml.snapshot_1.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (103, 3, 1, 3, 1, 'We can view a sample of the data with a simple `SELECT`', '

We can view a sample of the data with a simple SELECT

', NULL, NULL); -INSERT INTO notebook_cells VALUES (104, 3, 3, 4, 1, 'SELECT target, array_to_json(image) FROM pgml.digits LIMIT 10;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (105, 3, 1, 5, 1, 'The images are 8x8 gray scale arrays with gray values from 0 (white) to 16 (black) pixels. These images have been fairly heavily processed to center and crop each one, and the represented digit is labeled in the `target` column. By now you should start to have an idea what comes next in this tutorial. We''ve got data, so we train a model with a simple call to PostgresML.', '

The images are 8x8 gray scale arrays with gray values from 0 (white) to 16 (black) pixels. These images have been fairly heavily processed to center and crop each one, and the represented digit is labeled in the target column. By now you should start to have an idea what comes next in this tutorial. We''ve got data, so we train a model with a simple call to PostgresML.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (106, 3, 3, 6, 1, 'SELECT * FROM pgml.train( +INSERT INTO pgml.notebook_cells VALUES (103, 3, 1, 3, 1, 'We can view a sample of the data with a simple `SELECT`', '

We can view a sample of the data with a simple SELECT

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (104, 3, 3, 4, 1, 'SELECT target, array_to_json(image) FROM pgml.digits LIMIT 10;', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (105, 3, 1, 5, 1, 'The images are 8x8 gray scale arrays with gray values from 0 (white) to 16 (black) pixels. These images have been fairly heavily processed to center and crop each one, and the represented digit is labeled in the `target` column. By now you should start to have an idea what comes next in this tutorial. We''ve got data, so we train a model with a simple call to PostgresML.', '

The images are 8x8 gray scale arrays with gray values from 0 (white) to 16 (black) pixels. These images have been fairly heavily processed to center and crop each one, and the represented digit is labeled in the target column. By now you should start to have an idea what comes next in this tutorial. We''ve got data, so we train a model with a simple call to PostgresML.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (106, 3, 3, 6, 1, 'SELECT * FROM pgml.train( project_name => ''Handwritten Digits'', task => ''classification'', relation_name => ''pgml.digits'', y_column_name => ''target'' );', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (107, 3, 1, 7, 1, 'We can view some of the predictions of the model on the training data.', '

We can view some of the predictions of the model on the training data.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (108, 3, 3, 8, 1, 'SELECT target, pgml.predict(''Handwritten Digits'', image) AS prediction +INSERT INTO pgml.notebook_cells VALUES (107, 3, 1, 7, 1, 'We can view some of the predictions of the model on the training data.', '

We can view some of the predictions of the model on the training data.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (108, 3, 3, 8, 1, 'SELECT target, pgml.predict(''Handwritten Digits'', image) AS prediction FROM pgml.digits LIMIT 10;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (109, 3, 1, 9, 1, 'Hah! Even the default linear classification algorithm performs extremely well on such carefully engineered, but real world data. It''s a demonstration of how effective feature engineering and clean data can be even with relatively simple algorithms. Let''s take a look at that models metrics.', '

Hah! Even the default linear classification algorithm performs extremely well on such carefully engineered, but real world data. It''s a demonstration of how effective feature engineering and clean data can be even with relatively simple algorithms. Let''s take a look at that models metrics.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (110, 3, 3, 10, 1, 'SELECT +INSERT INTO pgml.notebook_cells VALUES (109, 3, 1, 9, 1, 'Hah! Even the default linear classification algorithm performs extremely well on such carefully engineered, but real world data. It''s a demonstration of how effective feature engineering and clean data can be even with relatively simple algorithms. Let''s take a look at that models metrics.', '

Hah! Even the default linear classification algorithm performs extremely well on such carefully engineered, but real world data. It''s a demonstration of how effective feature engineering and clean data can be even with relatively simple algorithms. Let''s take a look at that models metrics.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (110, 3, 3, 10, 1, 'SELECT projects.name, models.algorithm, round((models.metrics->>''f1'')::numeric, 4) AS f1_score, @@ -962,11 +962,11 @@ FROM pgml.models JOIN pgml.projects on projects.id = models.project_id AND projects.name = ''Handwritten Digits'' ORDER BY models.created_at DESC LIMIT 5;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (111, 3, 1, 11, 1, 'An F1 score in the mid nineties are grade A results, although there is room for improvement. We need to keep in mind the cost/benefit tradeoffs in the real world. If this algorithm is wrong about a digit 1 out of 20 times, it''ll give us the wrong ZIP code on every 3rd piece of mail. It might be a lot more expensive to re-route 1/3rd of all mail to fix these mistakes than it is to hire human''s read and input every zip code manually, so even though the results are pretty good, they are not good enough to create real value. +INSERT INTO pgml.notebook_cells VALUES (111, 3, 1, 11, 1, 'An F1 score in the mid nineties are grade A results, although there is room for improvement. We need to keep in mind the cost/benefit tradeoffs in the real world. If this algorithm is wrong about a digit 1 out of 20 times, it''ll give us the wrong ZIP code on every 3rd piece of mail. It might be a lot more expensive to re-route 1/3rd of all mail to fix these mistakes than it is to hire human''s read and input every zip code manually, so even though the results are pretty good, they are not good enough to create real value. Luckily, we have the benefit of the last 40 years of some very smart people developing a bunch of different algorithms for learning that all have different tradeoffs strengths and weaknesses. You could go spend a few years getting a degree trying to understand how they all work, or we can just try them all since computers are cheaper and more plentiful than engineers.', '

An F1 score in the mid nineties are grade A results, although there is room for improvement. We need to keep in mind the cost/benefit tradeoffs in the real world. If this algorithm is wrong about a digit 1 out of 20 times, it''ll give us the wrong ZIP code on every 3rd piece of mail. It might be a lot more expensive to re-route 1/3rd of all mail to fix these mistakes than it is to hire human''s read and input every zip code manually, so even though the results are pretty good, they are not good enough to create real value.

Luckily, we have the benefit of the last 40 years of some very smart people developing a bunch of different algorithms for learning that all have different tradeoffs strengths and weaknesses. You could go spend a few years getting a degree trying to understand how they all work, or we can just try them all since computers are cheaper and more plentiful than engineers.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (112, 3, 3, 12, 1, '-- +INSERT INTO pgml.notebook_cells VALUES (112, 3, 3, 12, 1, '-- -- After a project has been trained, omitted parameters will be reused from previous training runs -- In these examples we''ll reuse the training data snapshots from the initial call. -- @@ -993,9 +993,9 @@ SELECT * FROM pgml.train(''Handwritten Digits'', algorithm => ''random_forest'', SELECT * FROM pgml.train(''Handwritten Digits'', algorithm => ''xgboost'', hyperparams => ''{"n_estimators": 10}''); SELECT * FROM pgml.train(''Handwritten Digits'', algorithm => ''xgboost_random_forest'', hyperparams => ''{"n_estimators": 10}''); SELECT * FROM pgml.train(''Handwritten Digits'', algorithm => ''lightgbm'', hyperparams => ''{"n_estimators": 1}'');', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (113, 3, 1, 13, 1, 'In less than 10 seconds, we''ve thrown a barrage of algorithms at the problem and measured how they perform. Now let''s take a look at the best one''s metrics.', '

In less than 10 seconds, we''ve thrown a barrage of algorithms at the problem and measured how they perform. Now let''s take a look at the best one''s metrics.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (156, 6, 1, 21, 1, '### Comparisons', '

Comparisons

', NULL, NULL); -INSERT INTO notebook_cells VALUES (114, 3, 3, 14, 1, 'SELECT +INSERT INTO pgml.notebook_cells VALUES (113, 3, 1, 13, 1, 'In less than 10 seconds, we''ve thrown a barrage of algorithms at the problem and measured how they perform. Now let''s take a look at the best one''s metrics.', '

In less than 10 seconds, we''ve thrown a barrage of algorithms at the problem and measured how they perform. Now let''s take a look at the best one''s metrics.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (156, 6, 1, 21, 1, '### Comparisons', '

Comparisons

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (114, 3, 3, 14, 1, 'SELECT projects.name, models.algorithm, round((models.metrics->>''f1'')::numeric, 4) AS f1_score, @@ -1005,11 +1005,11 @@ FROM pgml.models JOIN pgml.projects on projects.id = models.project_id AND projects.name = ''Handwritten Digits'' ORDER BY models.metrics->>''f1'' DESC LIMIT 5;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (115, 3, 1, 15, 1, '`svm` stands for Support Vector Machines. They do well on this particular problem, and can reach A+ F1 scores. Back in our real world performance evaluation where they are only wrong 1 out of 100 digits, or 1/14 zip codes, instead of our original 1/3rd wrong baseline model. In the real world this means that about 7% of our mail would end up getting auto-routed to the wrong zip code. Is that good enough to start automating? Let''s ask the Postmaster general... If he says not quite, there is one more thing to try before we break out deep learning. +INSERT INTO pgml.notebook_cells VALUES (115, 3, 1, 15, 1, '`svm` stands for Support Vector Machines. They do well on this particular problem, and can reach A+ F1 scores. Back in our real world performance evaluation where they are only wrong 1 out of 100 digits, or 1/14 zip codes, instead of our original 1/3rd wrong baseline model. In the real world this means that about 7% of our mail would end up getting auto-routed to the wrong zip code. Is that good enough to start automating? Let''s ask the Postmaster general... If he says not quite, there is one more thing to try before we break out deep learning. Many algorithm''s have a few options we can tweak. These options are called hyperparameters. You can find the available ones for SVMs in the [docs](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html). Then we can automatically search all the combinations of the hyperparams to see how to tweak the knobs. We don''t actually have to have that degree just yet...', '

svm stands for Support Vector Machines. They do well on this particular problem, and can reach A+ F1 scores. Back in our real world performance evaluation where they are only wrong 1 out of 100 digits, or 1/14 zip codes, instead of our original 1/3rd wrong baseline model. In the real world this means that about 7% of our mail would end up getting auto-routed to the wrong zip code. Is that good enough to start automating? Let''s ask the Postmaster general... If he says not quite, there is one more thing to try before we break out deep learning.

Many algorithm''s have a few options we can tweak. These options are called hyperparameters. You can find the available ones for SVMs in the docs. Then we can automatically search all the combinations of the hyperparams to see how to tweak the knobs. We don''t actually have to have that degree just yet...

', NULL, NULL); -INSERT INTO notebook_cells VALUES (116, 3, 3, 16, 1, 'SELECT * FROM pgml.train( +INSERT INTO pgml.notebook_cells VALUES (116, 3, 3, 16, 1, 'SELECT * FROM pgml.train( ''Handwritten Digits'', algorithm => ''svm'', hyperparams => ''{"random_state": 0}'', @@ -1019,28 +1019,28 @@ INSERT INTO notebook_cells VALUES (116, 3, 3, 16, 1, 'SELECT * FROM pgml.train( "shrinking": [true, false] }'' );', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (117, 3, 1, 17, 1, 'And then we can peak at the metrics directly with a bit more SQL.', '

And then we can peak at the metrics directly with a bit more SQL.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (118, 3, 3, 18, 1, 'SELECT metrics +INSERT INTO pgml.notebook_cells VALUES (117, 3, 1, 17, 1, 'And then we can peak at the metrics directly with a bit more SQL.', '

And then we can peak at the metrics directly with a bit more SQL.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (118, 3, 3, 18, 1, 'SELECT metrics FROM pgml.models ORDER BY created_at DESC LIMIT 1;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (119, 3, 1, 19, 1, 'It''s a bit tough to parse the results of the search in pure SQL, so you can hop over to the [Projects](../../projects/) list to see a visualization.', '

It''s a bit tough to parse the results of the search in pure SQL, so you can hop over to the Projects list to see a visualization.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (120, 4, 3, 3, 1, 'SELECT * +INSERT INTO pgml.notebook_cells VALUES (119, 3, 1, 19, 1, 'It''s a bit tough to parse the results of the search in pure SQL, so you can hop over to the [Projects](../../projects/) list to see a visualization.', '

It''s a bit tough to parse the results of the search in pure SQL, so you can hop over to the Projects list to see a visualization.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (120, 4, 3, 3, 1, 'SELECT * FROM pgml.diabetes LIMIT 10;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (121, 4, 1, 4, 1, 'In this case, the `target` is a number that represents the severity of the disease progression one year later, with larger values indicating worse outcomes. Building a Regression model uses the same PostgresML API as Classification, just with a different task. You''re going to start breezing through these tutorials faster and faster.', '

In this case, the target is a number that represents the severity of the disease progression one year later, with larger values indicating worse outcomes. Building a Regression model uses the same PostgresML API as Classification, just with a different task. You''re going to start breezing through these tutorials faster and faster.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (122, 4, 3, 5, 1, 'SELECT * FROM pgml.train( +INSERT INTO pgml.notebook_cells VALUES (121, 4, 1, 4, 1, 'In this case, the `target` is a number that represents the severity of the disease progression one year later, with larger values indicating worse outcomes. Building a Regression model uses the same PostgresML API as Classification, just with a different task. You''re going to start breezing through these tutorials faster and faster.', '

In this case, the target is a number that represents the severity of the disease progression one year later, with larger values indicating worse outcomes. Building a Regression model uses the same PostgresML API as Classification, just with a different task. You''re going to start breezing through these tutorials faster and faster.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (122, 4, 3, 5, 1, 'SELECT * FROM pgml.train( project_name => ''Diabetes Progression'', task => ''regression'', relation_name => ''pgml.diabetes'', y_column_name => ''target'' );', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (123, 4, 1, 6, 1, 'With our baseline model automatically deployed, we can sample some of the predictions', '

With our baseline model automatically deployed, we can sample some of the predictions

', NULL, NULL); -INSERT INTO notebook_cells VALUES (124, 4, 3, 7, 1, 'SELECT target, pgml.predict(''Diabetes Progression'', ARRAY[age, sex, bmi, bp, s1, s2, s3, s4, s5, s6]) AS prediction +INSERT INTO pgml.notebook_cells VALUES (123, 4, 1, 6, 1, 'With our baseline model automatically deployed, we can sample some of the predictions', '

With our baseline model automatically deployed, we can sample some of the predictions

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (124, 4, 3, 7, 1, 'SELECT target, pgml.predict(''Diabetes Progression'', ARRAY[age, sex, bmi, bp, s1, s2, s3, s4, s5, s6]) AS prediction FROM pgml.diabetes LIMIT 10;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (125, 4, 1, 8, 1, 'To get an objective measure of just how far off every single prediction is from the target, we can look at the key metrics recorded during training.', '

To get an objective measure of just how far off every single prediction is from the target, we can look at the key metrics recorded during training.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (126, 4, 3, 9, 1, 'SELECT +INSERT INTO pgml.notebook_cells VALUES (125, 4, 1, 8, 1, 'To get an objective measure of just how far off every single prediction is from the target, we can look at the key metrics recorded during training.', '

To get an objective measure of just how far off every single prediction is from the target, we can look at the key metrics recorded during training.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (126, 4, 3, 9, 1, 'SELECT projects.name, models.algorithm, round((models.metrics->>''r2'')::numeric, 4) AS r2_score @@ -1048,8 +1048,8 @@ FROM pgml.models JOIN pgml.projects on projects.id = models.project_id AND projects.name = ''Diabetes Progression'' ORDER BY models.created_at DESC LIMIT 5;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (127, 4, 1, 10, 1, 'I like to look at the R2 score, since it is fixed between 0 and 1 it can help us compare the performance of different algorithms on our data. Let''s throw our bag of tricks at the problem and see what sticks.', '

I like to look at the R2 score, since it is fixed between 0 and 1 it can help us compare the performance of different algorithms on our data. Let''s throw our bag of tricks at the problem and see what sticks.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (128, 4, 3, 11, 1, '-- linear models +INSERT INTO pgml.notebook_cells VALUES (127, 4, 1, 10, 1, 'I like to look at the R2 score, since it is fixed between 0 and 1 it can help us compare the performance of different algorithms on our data. Let''s throw our bag of tricks at the problem and see what sticks.', '

I like to look at the R2 score, since it is fixed between 0 and 1 it can help us compare the performance of different algorithms on our data. Let''s throw our bag of tricks at the problem and see what sticks.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (128, 4, 3, 11, 1, '-- linear models SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''ridge''); SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''lasso''); SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''elastic_net''); @@ -1080,15 +1080,15 @@ SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''random_forest' SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''xgboost'', hyperparams => ''{"n_estimators": 10}''); SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''xgboost_random_forest'', hyperparams => ''{"n_estimators": 10}''); SELECT * FROM pgml.train(''Diabetes Progression'', algorithm => ''lightgbm'', hyperparams => ''{"n_estimators": 1}'');', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (129, 4, 1, 12, 1, 'It''s that easy, and that fast, to test all the algorithm''s in our toolkit to see what fares the best, and the best one has automatically been deployed. Once we''ve honed in on a few good candidate algorithms, we can check the docs for their hyperparams, and then do another brute force search across all combinations to find the best set.', '

It''s that easy, and that fast, to test all the algorithm''s in our toolkit to see what fares the best, and the best one has automatically been deployed. Once we''ve honed in on a few good candidate algorithms, we can check the docs for their hyperparams, and then do another brute force search across all combinations to find the best set.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (157, 6, 3, 22, 1, 'SELECT pgml.distance_l1(ARRAY[1.0::real, 2.0, 3.0], ARRAY[4.0, 5.0, 6.0]);', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (131, 5, 1, 6, 1, '### Summarization +INSERT INTO pgml.notebook_cells VALUES (129, 4, 1, 12, 1, 'It''s that easy, and that fast, to test all the algorithm''s in our toolkit to see what fares the best, and the best one has automatically been deployed. Once we''ve honed in on a few good candidate algorithms, we can check the docs for their hyperparams, and then do another brute force search across all combinations to find the best set.', '

It''s that easy, and that fast, to test all the algorithm''s in our toolkit to see what fares the best, and the best one has automatically been deployed. Once we''ve honed in on a few good candidate algorithms, we can check the docs for their hyperparams, and then do another brute force search across all combinations to find the best set.

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (157, 6, 3, 22, 1, 'SELECT pgml.distance_l1(ARRAY[1.0::real, 2.0, 3.0], ARRAY[4.0, 5.0, 6.0]);', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (131, 5, 1, 6, 1, '### Summarization Sometimes we need all the nuanced detail, but sometimes it''s nice to get to the point. Summarization can reduce a very long and complex document to a few sentences. One studied application is reducing legal bills passed by Congress into a plain english summary. Hollywood may also need some intelligence to reduce a full synopsis down to a pithy blurb for movies like Inception. See [summarization documentation](https://huggingface.co/tasks/summarization) for more options.', '

Summarization

Sometimes we need all the nuanced detail, but sometimes it''s nice to get to the point. Summarization can reduce a very long and complex document to a few sentences. One studied application is reducing legal bills passed by Congress into a plain english summary. Hollywood may also need some intelligence to reduce a full synopsis down to a pithy blurb for movies like Inception.

See summarization documentation for more options.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (132, 5, 3, 7, 1, 'SELECT pgml.transform( +INSERT INTO pgml.notebook_cells VALUES (132, 5, 3, 7, 1, 'SELECT pgml.transform( ''summarization'', inputs => ARRAY['' Dominic Cobb is the foremost practitioner of the artistic science @@ -1121,13 +1121,13 @@ INSERT INTO notebook_cells VALUES (132, 5, 3, 7, 1, 'SELECT pgml.transform( what happens in the dreams. ''] ) AS result;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (133, 5, 1, 8, 1, '### Question Answering +INSERT INTO pgml.notebook_cells VALUES (133, 5, 1, 8, 1, '### Question Answering Question Answering extracts an answer from a given context. Recent progress has enabled models to also specify if the answer is present in the context at all. If you were trying to build a general question answering system, you could first turn the question into a keyword search against Wikipedia articles, and then use a model to retrieve the correct answer from the top hit. Another application would provide automated support from a knowledge base, based on the customers question. See [question answering documentation](https://huggingface.co/tasks/question-answering) for more options.', '

Question Answering

Question Answering extracts an answer from a given context. Recent progress has enabled models to also specify if the answer is present in the context at all. If you were trying to build a general question answering system, you could first turn the question into a keyword search against Wikipedia articles, and then use a model to retrieve the correct answer from the top hit. Another application would provide automated support from a knowledge base, based on the customers question.

See question answering documentation for more options.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (134, 5, 3, 9, 1, 'SELECT pgml.transform( +INSERT INTO pgml.notebook_cells VALUES (134, 5, 3, 9, 1, 'SELECT pgml.transform( ''question-answering'', inputs => ARRAY[ ''{ @@ -1136,37 +1136,37 @@ INSERT INTO notebook_cells VALUES (134, 5, 3, 9, 1, 'SELECT pgml.transform( }'' ] ) AS answer;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (135, 5, 1, 10, 1, '### Text Generation +INSERT INTO pgml.notebook_cells VALUES (135, 5, 1, 10, 1, '### Text Generation If you need to expand on some thoughts, you can have AI complete your sentences for you:', '

Text Generation

If you need to expand on some thoughts, you can have AI complete your sentences for you:

', NULL, NULL); -INSERT INTO notebook_cells VALUES (136, 5, 3, 11, 1, 'SELECT pgml.transform( +INSERT INTO pgml.notebook_cells VALUES (136, 5, 3, 11, 1, 'SELECT pgml.transform( ''text-generation'', ''{"num_return_sequences": 2}'', ARRAY[''Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone''] ) AS result;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (137, 5, 1, 12, 1, '### More +INSERT INTO pgml.notebook_cells VALUES (137, 5, 1, 12, 1, '### More There are many different [tasks](https://huggingface.co/tasks) and tens of thousands of state-of-the-art [models](https://huggingface.co/models) available for you to explore. The possibilities are expanding every day. There can be amazing performance improvements in domain specific versions of these general tasks by fine tuning published models on your dataset. See the next section for [fine tuning](/user_guides/transformers/fine_tuning/) demonstrations.', '

More

There are many different tasks and tens of thousands of state-of-the-art models available for you to explore. The possibilities are expanding every day. There can be amazing performance improvements in domain specific versions of these general tasks by fine tuning published models on your dataset. See the next section for fine tuning demonstrations.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (138, 6, 3, 3, 1, 'SELECT pgml.add(ARRAY[1.0::real, 2.0, 3.0], 3);', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (139, 6, 3, 4, 1, 'SELECT pgml.subtract(ARRAY[1.0::real, 2.0, 3.0], 3);', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (140, 6, 3, 5, 1, 'SELECT pgml.multiply(ARRAY[1.0::real, 2.0, 3.0], 3);', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (141, 6, 3, 6, 1, 'SELECT pgml.divide(ARRAY[1.0::real, 2.0, 3.0], 100);', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (142, 6, 1, 7, 1, '### Pairwise arithmetic', '

Pairwise arithmetic

', NULL, NULL); -INSERT INTO notebook_cells VALUES (143, 6, 3, 8, 1, 'SELECT pgml.add(ARRAY[1.0::real, 2.0, 3.0], ARRAY[4.0, 5.0, 6.0]);', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (144, 6, 3, 9, 1, 'SELECT pgml.subtract(ARRAY[1.0::real, 2.0, 3.0], ARRAY[4.0, 5.0, 6.0]);', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (145, 6, 3, 10, 1, 'SELECT pgml.multiply(ARRAY[1.0::real, 2.0, 3.0], ARRAY[4.0, 5.0, 6.0]);', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (146, 6, 3, 11, 1, 'SELECT pgml.divide(ARRAY[1.0::real, 2.0, 3.0], ARRAY[4.0, 5.0, 6.0]);', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (147, 6, 1, 12, 1, '### Norms', '

Norms

', NULL, NULL); -INSERT INTO notebook_cells VALUES (148, 6, 3, 13, 1, 'SELECT pgml.norm_l0(ARRAY[1.0::real, 2.0, 3.0]);', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (149, 6, 3, 14, 1, 'SELECT pgml.norm_l1(ARRAY[1.0::real, 2.0, 3.0]);', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (150, 6, 3, 15, 1, 'SELECT pgml.norm_l2(ARRAY[1.0::real, 2.0, 3.0]);', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (151, 6, 3, 16, 1, 'SELECT pgml.norm_max(ARRAY[1.0::real, 2.0, 3.0]);', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (160, 6, 3, 25, 1, 'SELECT pgml.cosine_similarity(ARRAY[1.0::real, 2.0, 3.0], ARRAY[1.0::real, 2.0, 3.0]);', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (161, 6, 1, 26, 1, '### Generating Random Embeddings +INSERT INTO pgml.notebook_cells VALUES (138, 6, 3, 3, 1, 'SELECT pgml.add(ARRAY[1.0::real, 2.0, 3.0], 3);', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (139, 6, 3, 4, 1, 'SELECT pgml.subtract(ARRAY[1.0::real, 2.0, 3.0], 3);', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (140, 6, 3, 5, 1, 'SELECT pgml.multiply(ARRAY[1.0::real, 2.0, 3.0], 3);', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (141, 6, 3, 6, 1, 'SELECT pgml.divide(ARRAY[1.0::real, 2.0, 3.0], 100);', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (142, 6, 1, 7, 1, '### Pairwise arithmetic', '

Pairwise arithmetic

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (143, 6, 3, 8, 1, 'SELECT pgml.add(ARRAY[1.0::real, 2.0, 3.0], ARRAY[4.0, 5.0, 6.0]);', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (144, 6, 3, 9, 1, 'SELECT pgml.subtract(ARRAY[1.0::real, 2.0, 3.0], ARRAY[4.0, 5.0, 6.0]);', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (145, 6, 3, 10, 1, 'SELECT pgml.multiply(ARRAY[1.0::real, 2.0, 3.0], ARRAY[4.0, 5.0, 6.0]);', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (146, 6, 3, 11, 1, 'SELECT pgml.divide(ARRAY[1.0::real, 2.0, 3.0], ARRAY[4.0, 5.0, 6.0]);', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (147, 6, 1, 12, 1, '### Norms', '

Norms

', NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (148, 6, 3, 13, 1, 'SELECT pgml.norm_l0(ARRAY[1.0::real, 2.0, 3.0]);', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (149, 6, 3, 14, 1, 'SELECT pgml.norm_l1(ARRAY[1.0::real, 2.0, 3.0]);', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (150, 6, 3, 15, 1, 'SELECT pgml.norm_l2(ARRAY[1.0::real, 2.0, 3.0]);', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (151, 6, 3, 16, 1, 'SELECT pgml.norm_max(ARRAY[1.0::real, 2.0, 3.0]);', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (160, 6, 3, 25, 1, 'SELECT pgml.cosine_similarity(ARRAY[1.0::real, 2.0, 3.0], ARRAY[1.0::real, 2.0, 3.0]);', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (161, 6, 1, 26, 1, '### Generating Random Embeddings We can populate a table of embeddings with 10,000 rows that have a 128 dimension embedding to demonstrate some vector functionality like nearest neighbor search.', '

Generating Random Embeddings

We can populate a table of embeddings with 10,000 rows that have a 128 dimension embedding to demonstrate some vector functionality like nearest neighbor search.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (162, 6, 3, 27, 1, 'CREATE TABLE embeddings AS +INSERT INTO pgml.notebook_cells VALUES (162, 6, 3, 27, 1, 'CREATE TABLE embeddings AS SELECT id, ARRAY_AGG(rand) AS vector FROM ( SELECT row_number() over () % 10000 + 1 AS id, random()::REAL AS rand @@ -1174,7 +1174,7 @@ FROM ( ) series GROUP BY id ORDER BY id;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (163, 6, 3, 28, 1, '-- Nearest neighbors to e1 using cosine similarity +INSERT INTO pgml.notebook_cells VALUES (163, 6, 3, 28, 1, '-- Nearest neighbors to e1 using cosine similarity SELECT e1.id, e2.id, @@ -1184,18 +1184,18 @@ JOIN embeddings e2 ON 1=1 WHERE e1.id = 1 ORDER BY distance DESC LIMIT 10;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (164, 7, 1, 3, 1, '## Rolling back to a specific algorithm +INSERT INTO pgml.notebook_cells VALUES (164, 7, 1, 3, 1, '## Rolling back to a specific algorithm Rolling back creates a new deployment for the model that was deployed before the current one. Multiple rollbacks in a row will effectively oscillate between the two most recently deployed models, making rollbacks a relatively safe operation.', '

Rolling back to a specific algorithm

Rolling back creates a new deployment for the model that was deployed before the current one. Multiple rollbacks in a row will effectively oscillate between the two most recently deployed models, making rollbacks a relatively safe operation.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (165, 7, 3, 4, 1, 'SELECT * FROM pgml.deploy(''Handwritten Digits'', ''rollback'', ''svm'');', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (166, 8, 3, 7, 1, 'SELECT id, relation_name, test_sampling::TEXT FROM pgml.snapshots LIMIT 10;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (167, 8, 3, 8, 1, 'SELECT * FROM pgml.snapshot_1 LIMIT 10;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (168, 8, 1, 9, 1, '## Deployments +INSERT INTO pgml.notebook_cells VALUES (165, 7, 3, 4, 1, 'SELECT * FROM pgml.deploy(''Handwritten Digits'', ''rollback'', ''svm'');', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (166, 8, 3, 7, 1, 'SELECT id, relation_name, test_sampling::TEXT FROM pgml.snapshots LIMIT 10;', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (167, 8, 3, 8, 1, 'SELECT * FROM pgml.snapshot_1 LIMIT 10;', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (168, 8, 1, 9, 1, '## Deployments Deployments happen automatically if a new project has a better key metric after training, or when triggered manually. You can view all deployments.', '

Deployments

Deployments happen automatically if a new project has a better key metric after training, or when triggered manually. You can view all deployments.

', NULL, NULL); -INSERT INTO notebook_cells VALUES (169, 8, 3, 10, 1, 'SELECT id, model_id, strategy::TEXT FROM pgml.deployments LIMIT 10;', NULL, NULL, NULL); -INSERT INTO notebook_cells VALUES (170, 9, 1, 1, 1, '## Native Installation +INSERT INTO pgml.notebook_cells VALUES (169, 8, 3, 10, 1, 'SELECT id, model_id, strategy::TEXT FROM pgml.deployments LIMIT 10;', NULL, NULL, NULL); +INSERT INTO pgml.notebook_cells VALUES (170, 9, 1, 1, 1, '## Native Installation A PostgresML deployment consists of two different runtimes. The foundational runtime is a Python extension for Postgres ([pgml-extension](https://github.com/postgresml/postgresml/tree/master/pgml-extension/)) that facilitates the machine learning lifecycle inside the database. Additionally, we provide a dashboard ([pgml-dashboard](https://github.com/postgresml/postgresml/tree/master/pgml-dashboard/)) that can connect to your Postgres server and provide additional management functionality. It will also provide visibility into the models you build and data they use. @@ -1215,10 +1215,10 @@ We''d also love to hear your feedback. 
-- --- Name: notebook_cells_id_seq; Type: SEQUENCE SET; Schema: Owner: lev +-- Name: pgml.notebook_cells_id_seq; Type: SEQUENCE SET; Schema: Owner: lev -- -SELECT pg_catalog.setval('notebook_cells_id_seq', (SELECT MAX(id) + 1 FROM notebook_cells), true); +SELECT pg_catalog.setval('pgml.notebook_cells_id_seq', (SELECT MAX(id) + 1 FROM pgml.notebook_cells), true); -- diff --git a/pgml-dashboard/migrations/20221130170423_uploaded_files.down.sql b/pgml-dashboard/migrations/20221130170423_uploaded_files.down.sql index b865545b8..9f30aedcc 100644 --- a/pgml-dashboard/migrations/20221130170423_uploaded_files.down.sql +++ b/pgml-dashboard/migrations/20221130170423_uploaded_files.down.sql @@ -1,2 +1,2 @@ -- Add down migration script here -DROP TABLE uploaded_files; +DROP TABLE pgml.uploaded_files; diff --git a/pgml-dashboard/migrations/20221130170423_uploaded_files.up.sql b/pgml-dashboard/migrations/20221130170423_uploaded_files.up.sql index 467e355d3..744b3e1f8 100644 --- a/pgml-dashboard/migrations/20221130170423_uploaded_files.up.sql +++ b/pgml-dashboard/migrations/20221130170423_uploaded_files.up.sql @@ -1,6 +1,6 @@ -- Add up migration script here -- Add up migration script here -CREATE TABLE uploaded_files ( +CREATE TABLE pgml.uploaded_files ( id BIGSERIAL PRIMARY KEY, created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT NOW() ); From 3398097757b13c148584831daa788440a0dfd526 Mon Sep 17 00:00:00 2001 From: Montana Low Date: Tue, 7 Feb 2023 17:29:51 -0800 Subject: [PATCH 2/2] consolidate development docs --- README.md | 2 + pgml-dashboard/README.md | 33 +--- pgml-dashboard/src/guards.rs | 2 +- pgml-dashboard/src/lib.rs | 5 + pgml-dashboard/src/models.rs | 26 +-- pgml-docs/docs/developer_guide/overview.md | 187 +++++++++++++++--- .../docs/user_guides/setup/v2/installation.md | 22 +-- pgml-extension/Cargo.toml | 3 +- pgml-extension/README.md | 79 +------- pgml-extension/docker/Cargo.toml.cuda | 3 +- pgml-extension/docker/Cargo.toml.no-python | 3 +- 11 files changed, 202 
insertions(+), 163 deletions(-) diff --git a/README.md b/README.md index d81d42930..f2e3edb05 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,8 @@ The dashboard makes it easy to compare different algorithms or hyperparameters a See it in action — cloud.postgresml.org +Please see the [quick start instructions](https://postgresml.org/user_guides/setup/quick_start_with_docker/) for general information on installing or deploying PostgresML. A [developer guide](https://postgresml.org/developer_guide/overview/) is also available for those who would like to contribute. + ## What's in the box See the documentation for a complete **[list of functionality](https://postgresml.org/)**. diff --git a/pgml-dashboard/README.md b/pgml-dashboard/README.md index f11f9ad39..a960ad77a 100644 --- a/pgml-dashboard/README.md +++ b/pgml-dashboard/README.md @@ -2,35 +2,4 @@ PostgresML provides a dashboard with analytical views of the training data and model performance, as well as integrated notebooks for rapid iteration. It is primarily written in Rust using [Rocket](https://rocket.rs/) as a lightweight web framework and [SQLx](https://github.com/launchbadge/sqlx) to interact with the database. -Please see the [online documentation](https://postgresml.org/user_guides/setup/quick_start_with_docker/) for general information on installing or deploying PostgresML. This document is intended to help developers set up a local copy of the dashboard. - -## Requirements - -The dashboard requires a Postgres database with the [pgml-extension](https://github.com/postgresml/postgresml/tree/master/pgml-extension) to generate the core schema. See that subproject for developer setup. - -We develop and test this web application on Linux, OS X, and Windows using WSL2. 
- -## Build process - -You'll need to specify a database url for the extension to interact with via an environment variable: - -```commandline -export DATABASE_URL=postgres://user_name:password@localhost:5432/database_name -``` - -Build and run: - -```commandline -cargo run -``` - -Incremental and automatic compilation for development cycles is supported with: - -```commandline -cargo watch --exec run -``` - -Run tests: -```commandline -cargo test -``` +Please see the [quick start instructions](https://postgresml.org/user_guides/setup/quick_start_with_docker/) for general information on installing or deploying PostgresML. A [developer guide](https://postgresml.org/developer_guide/overview/) is also available for those who would like to contribute. diff --git a/pgml-dashboard/src/guards.rs b/pgml-dashboard/src/guards.rs index 65342a109..6c672405a 100644 --- a/pgml-dashboard/src/guards.rs +++ b/pgml-dashboard/src/guards.rs @@ -56,6 +56,6 @@ impl<'r> FromRequest<'r> for Cluster { pub fn default_database_url() -> String { match var("DATABASE_URL") { Ok(val) => val, - Err(_) => "postgres:///dashboard".to_string(), + Err(_) => "postgres:///pgml".to_string(), } } diff --git a/pgml-dashboard/src/lib.rs b/pgml-dashboard/src/lib.rs index 2434911ba..dd58ce92c 100644 --- a/pgml-dashboard/src/lib.rs +++ b/pgml-dashboard/src/lib.rs @@ -22,6 +22,7 @@ mod templates; use guards::Cluster; use responses::{BadRequest, ResponseOk}; +use sqlx::Executor; /// This struct contains information specific to the cluster being displayed in the dashboard. 
/// @@ -50,6 +51,10 @@ impl Clusters { .max_connections(5) .idle_timeout(std::time::Duration::from_millis(15_000)) .min_connections(0) + .after_connect(|conn, _meta| Box::pin(async move { + conn.execute("SET application_name = 'pgml_dashboard';").await?; + Ok(()) + })) .connect_lazy(database_url)?; pools.insert(cluster_id, pool.clone()); diff --git a/pgml-dashboard/src/models.rs b/pgml-dashboard/src/models.rs index e54f24e43..45c78f9b4 100644 --- a/pgml-dashboard/src/models.rs +++ b/pgml-dashboard/src/models.rs @@ -86,7 +86,7 @@ pub struct Notebook { impl Notebook { pub async fn get_by_id(pool: &PgPool, id: i64) -> anyhow::Result { Ok( - sqlx::query_as!(Notebook, "SELECT * FROM notebooks WHERE id = $1", id,) + sqlx::query_as!(Notebook, "SELECT * FROM pgml.notebooks WHERE id = $1", id,) .fetch_one(pool) .await?, ) @@ -95,7 +95,7 @@ impl Notebook { pub async fn create(pool: &PgPool, name: &str) -> anyhow::Result { Ok(sqlx::query_as!( Notebook, - "INSERT INTO notebooks (name) VALUES ($1) RETURNING *", + "INSERT INTO pgml.notebooks (name) VALUES ($1) RETURNING *", name, ) .fetch_one(pool) @@ -103,7 +103,7 @@ impl Notebook { } pub async fn all(pool: &PgPool) -> anyhow::Result> { - Ok(sqlx::query_as!(Notebook, "SELECT * FROM notebooks") + Ok(sqlx::query_as!(Notebook, "SELECT * FROM pgml.notebooks") .fetch_all(pool) .await?) 
} @@ -111,7 +111,7 @@ impl Notebook { pub async fn cells(&self, pool: &PgPool) -> anyhow::Result> { Ok(sqlx::query_as!( Cell, - "SELECT * FROM notebook_cells + "SELECT * FROM pgml.notebook_cells WHERE notebook_id = $1 AND deleted_at IS NULL ORDER BY cell_number", @@ -123,7 +123,7 @@ impl Notebook { pub async fn reset(&self, pool: &PgPool) -> anyhow::Result<()> { let _ = sqlx::query!( - "UPDATE notebook_cells + "UPDATE pgml.notebook_cells SET execution_time = NULL, rendering = NULL @@ -189,15 +189,15 @@ impl Cell { " WITH lock AS ( - SELECT * FROM notebooks WHERE id = $1 FOR UPDATE + SELECT * FROM pgml.notebooks WHERE id = $1 FOR UPDATE ), max_cell AS ( SELECT COALESCE(MAX(cell_number), 0) AS cell_number - FROM notebook_cells + FROM pgml.notebook_cells WHERE notebook_id = $1 AND deleted_at IS NULL ) - INSERT INTO notebook_cells + INSERT INTO pgml.notebook_cells (notebook_id, cell_type, contents, cell_number, version) VALUES ($1, $2, $3, (SELECT cell_number + 1 FROM max_cell), 1) @@ -231,7 +231,7 @@ impl Cell { cell_number, version, deleted_at - FROM notebook_cells + FROM pgml.notebook_cells WHERE id = $1 ", id, @@ -250,7 +250,7 @@ impl Cell { self.contents = contents.to_string(); let _ = sqlx::query!( - "UPDATE notebook_cells + "UPDATE pgml.notebook_cells SET cell_type = $1, contents = $2, @@ -269,7 +269,7 @@ impl Cell { pub async fn delete(&self, pool: &PgPool) -> anyhow::Result { Ok(sqlx::query_as!( Cell, - "UPDATE notebook_cells + "UPDATE pgml.notebook_cells SET deleted_at = NOW() WHERE id = $1 RETURNING id, @@ -337,7 +337,7 @@ impl Cell { }; sqlx::query!( - "UPDATE notebook_cells SET rendering = $1 WHERE id = $2", + "UPDATE pgml.notebook_cells SET rendering = $1 WHERE id = $2", rendering, self.id ) @@ -797,7 +797,7 @@ impl UploadedFile { pub async fn create(pool: &PgPool) -> anyhow::Result { Ok(sqlx::query_as!( UploadedFile, - "INSERT INTO uploaded_files (id, created_at) VALUES (DEFAULT, DEFAULT) + "INSERT INTO pgml.uploaded_files (id, created_at) VALUES 
(DEFAULT, DEFAULT) RETURNING id, created_at" ) .fetch_one(pool) diff --git a/pgml-docs/docs/developer_guide/overview.md b/pgml-docs/docs/developer_guide/overview.md index 21a97e1fc..f2df939ed 100644 --- a/pgml-docs/docs/developer_guide/overview.md +++ b/pgml-docs/docs/developer_guide/overview.md @@ -10,6 +10,55 @@ Our project consists of three (3) applications: The development environment for each differs slightly, but overall we use Python, Rust, and PostgreSQL, so as long as you have all of those installed, the setup should be straight forward. +## Build Dependencies + +1. Install the latest Rust compiler from [rust-lang.org](https://www.rust-lang.org/learn/get-started). + +2. Install a [modern version](https://apt.kitware.com/) of CMake. + +3. Install PostgreSQL development headers and other dependencies: + + ```commandline + export POSTGRES_VERSION=15 + sudo apt-get update && \ + sudo apt-get install -y \ + postgresql-server-dev-${POSTGRES_VERSION} \ + bison \ + build-essential \ + clang \ + cmake \ + flex \ + libclang-dev \ + libopenblas-dev \ + libpython3-dev \ + libreadline-dev \ + libssl-dev \ + pkg-config \ + python3-dev + ``` + +4. Install the Python dependencies + + If your system comes with Python 3.6 or lower, you'll need to install `libpython3.7-dev` or higher. You can get it from [`ppa:deadsnakes/ppa`](https://launchpad.net/~deadsnakes/+archive/ubuntu/ppa): + + ```commandline + sudo add-apt-repository ppa:deadsnakes/ppa && \ + sudo apt update && sudo apt install -y libpython3.7-dev + ``` + + With Python 3.7+ installed, install the package dependencies + + ```commandline + sudo pip3 install xgboost lightgbm scikit-learn + ``` + +5. Clone our git repository: + + ```commandline + git clone https://github.com/postgresml/postgresml && \ + cd postgresml && \ + git submodule update --init --recursive + ``` ## Postgres extension @@ -17,63 +66,151 @@ PostgresML is a Rust extension written with `tcdi/pgx` crate.
Local development The extension code is located in: -```bash +```commandline cd pgml-extension/ ``` +You'll need to install basic dependencies + Once there, you can initialize `pgx` and get going: -```bash -cargo install cargo-pgx --version "0.4.5" +#### Pgx command line and environments +```commandline +cargo install cargo-pgx --version "0.7.1" && \ cargo pgx init # This will take a few minutes ``` -`pgx` uses Postgres 13 by default. Since `pgml` is using shared memory, you need to add it to `shared_preload_libraries` in `postgresql.conf` which, for `pgx`, is located in `~/.pgx/data-13/postgresql.conf`. +#### Update postgresql.conf + +`pgx` uses Postgres 15 by default. Since `pgml` is using shared memory, you need to add it to `shared_preload_libraries` in `postgresql.conf` which, for `pgx`, is located in `~/.pgx/data-15/postgresql.conf`. ``` -shared_preload_libraries = 'pgml' +shared_preload_libraries = 'pgml' # (change requires restart) ``` -Then you're ready to go: +Run the unit tests -```bash +```commandline +cargo pgx test +``` + +Run the integration tests: +```commandline +cargo pgx run --release +psql -h localhost -p 28815 -d pgml -f tests/test.sql -P pager +``` + +Run an interactive psql session + +```commandline cargo pgx run ``` +Create the extension in your database: + +```commandline +CREATE EXTENSION pgml; +``` + +That's it, PostgresML is ready.
You can validate the installation by running: + +=== "SQL" + ```sql + SELECT pgml.version(); + ``` + +=== "Output" + + ``` + postgres=# select pgml.version(); + version + ------------------- + 2.2.0 + (1 row) + ``` + +Basic extension usage: + +```sql +SELECT * FROM pgml.load_dataset('diabetes'); +SELECT * FROM pgml.train('Project name', 'regression', 'pgml.diabetes', 'target', 'xgboost'); +SELECT target, pgml.predict('Project name', ARRAY[age, sex, bmi, bp, s1, s2, s3, s4, s5, s6]) FROM pgml.diabetes LIMIT 10; +``` + +If you're going to run the dashboard against this database to develop both the extension and the dashboard, keep this `cargo pgx run` session running and point the dashboard's `DATABASE_URL` at it: + +```commandline +export DATABASE_URL=postgres://localhost:28815/pgml +``` + By default, the extension is built without CUDA support for XGBoost and LightGBM. You'll need to install CUDA locally to build and enable the `cuda` feature for cargo. CUDA can be downloaded [here](https://developer.nvidia.com/cuda-downloads?target_os=Linux). -```bash -CUDACXX=/usr/local/cuda/bin/nvcc cargo pgx run --release --features pg13,python,cuda + +```commandline +CUDACXX=/usr/local/cuda/bin/nvcc cargo pgx run --release --features pg15,python,cuda ``` If you ever want to reset the environment, simply spin up the database with `cargo pgx run` and drop the extension and metadata tables: ```postgresql -DROP EXTENSION pgml CASCADE; -DROP SCHEMA pgml CASCADE; +DROP EXTENSION IF EXISTS pgml CASCADE; +DROP SCHEMA IF EXISTS pgml CASCADE; CREATE EXTENSION pgml; ``` -## Dashboard app -The Dashboard is a Django application, and requires no special setup apart for what's required for a normal Django project. +#### Packaging -``` -cd pgml-dashboard/ +This requires Docker. Once Docker is installed, you can run: + +```bash +bash build_extension.sh ``` -Once there, you can setup a virtual environment and get going: +which will produce a `.deb` file in the current directory (this will take about 20 minutes).
The deb file can be installed with `apt-get`, for example: ```bash -python3 -m venv venv -source venv/bin/activate -pip install -r requirements.txt -cp .env.TEMPLATE .env -python manage.py migrate -python manage.py runserver +apt-get install ./postgresql-pgml-12_0.0.4-ubuntu20.04-amd64.deb ``` -The dashboard expects to have a PostgreSQL database with the `pgml` extension installed into the `pgml_development` database. You can install it by following our [Installation](/user_guides/setup/v2/installation/) instructions or by pointing the Django app to the database started by `cargo pgx run`. +which will take care of installing its dependencies as well. Make sure to run this as root and not with sudo. + +## Run the dashboard + +The dashboard is a web app that can be run against any Postgres database with the extension installed. There is a Dockerfile included with the source code if you wish to run it as a container. + +The dashboard requires a Postgres database with the [pgml-extension](https://github.com/postgresml/postgresml/tree/master/pgml-extension) to generate the core schema. See that subproject for developer setup. + +We develop and test this web application on Linux, OS X, and Windows using WSL2. + +Basic installation can be achieved with: + +1. Clone the repo (if you haven't already for the extension): +```commandline + cd postgresml/pgml-dashboard +``` + +2. Set the `DATABASE_URL` environment variable, for example to a running interactive `cargo pgx run` session started previously: +```commandline +export DATABASE_URL=postgres://localhost:28815/pgml +``` + +3. Run migrations +```commandline +sqlx migrate run +``` + +4. Run tests: +```commandline +cargo test +``` + +5. Incremental and automatic compilation for development cycles is supported with: +```commandline +cargo watch --exec run +``` + +The dashboard can be packaged for distribution. You'll need to copy the static files along with the `target/release` directory to your server. 
## Documentation app @@ -83,9 +220,9 @@ The documentation app (you're using it right now) is using MkDocs. cd pgml-docs/ ``` -Once there, you can setup a virtual environment and get going: +Once there, you can set up a virtual environment and get going: -```bash +```commandline python3 -m venv venv source venv/bin/activate pip install -r requirements.txt diff --git a/pgml-docs/docs/user_guides/setup/v2/installation.md b/pgml-docs/docs/user_guides/setup/v2/installation.md index eb0d3923b..9bd45da6a 100644 --- a/pgml-docs/docs/user_guides/setup/v2/installation.md +++ b/pgml-docs/docs/user_guides/setup/v2/installation.md @@ -25,7 +25,7 @@ If your system Python is older, consider installing a newer version from [`ppa:d #### PostgreSQL -PostgresML is a Postgres extension and requires PostgreSQL to be installed. We support PostgreSQL 10 through 14. You can use the PostgreSQL version that comes with your system or get it from the [PostgreSQL PPA](https://wiki.postgresql.org/wiki/Apt). +PostgresML is a Postgres extension and requires PostgreSQL to be installed. We support PostgreSQL 11 through 15. You can use the PostgreSQL version that comes with your system or get it from the [PostgreSQL PPA](https://wiki.postgresql.org/wiki/Apt). ```bash sudo apt-get update && \ @@ -45,7 +45,7 @@ sudo apt-get install postgresql 2. Install the extension: ``` - export POSTGRES_VERSION=14 + export POSTGRES_VERSION=15 sudo apt-get update && sudo apt-get install -y postgresql-pgml-${POSTGRES_VERSION} ``` @@ -54,9 +54,9 @@ sudo apt-get install postgresql === ":material-linux: :material-microsoft: From Source (Linux & WSL)" - These instructions assume a Debian flavor Linux and PostgreSQL 14. Adjust the PostgreSQL + These instructions assume a Debian flavor Linux and PostgreSQL 15. Adjust the PostgreSQL version accordingly if yours is different. Other flavors of Linux should work, but have not been tested. - PostgreSQL 10 through 14 are supported. + PostgreSQL 11 through 15 are supported. 1. 
Install the latest Rust compiler from [rust-lang.org](https://www.rust-lang.org/learn/get-started). @@ -65,7 +65,7 @@ sudo apt-get install postgresql 3. Install PostgreSQL development headers and other dependencies: ```bash - export POSTGRES_VERSION=14 + export POSTGRES_VERSION=15 sudo apt-get update && \ sudo apt-get install -y \ postgresql-server-dev-${POSTGRES_VERSION} \ @@ -97,7 +97,7 @@ sudo apt-get install postgresql **With Python support:** ```bash - export POSTGRES_VERSION=14 + export POSTGRES_VERSION=15 cargo install cargo-pgx --version "0.7.1" && \ cargo pgx init --pg${POSTGRES_VERSION} /usr/bin/pg_config && \ cargo pgx package @@ -106,7 +106,7 @@ sudo apt-get install postgresql **Without Python support:** ```bash - export POSTGRES_VERSION=14 + export POSTGRES_VERSION=15 cp docker/Cargo.toml.no-python Cargo.toml && \ cargo install cargo-pgx --version "0.7.1" && \ cargo pgx init --pg${POSTGRES_VERSION} /usr/bin/pg_config && \ @@ -116,7 +116,7 @@ sudo apt-get install postgresql 6. Copy the extension binaries into Postgres system folders: ```bash - export POSTGRES_VERSION=14 + export POSTGRES_VERSION=15 # Copy the extension .so sudo cp target/release/pgml-pg${POSTGRES_VERSION}/usr/lib/postgresql/${POSTGRES_VERSION}/lib/* \ @@ -156,7 +156,7 @@ sudo apt-get install postgresql ``` cargo install cargo-pgx && \ - cargo pgx init --pg14 /usr/bin/pg_config && \ + cargo pgx init --pg15 /usr/bin/pg_config && \ cargo pgx install ``` @@ -197,7 +197,7 @@ Now that the extension is installed on your system, add it into the database whe ``` postgres=# CREATE EXTENSION pgml; INFO: Python version: 3.10.4 (main, Jun 29 2022, 12:14:53) [GCC 11.2.0] - INFO: Scikit-learn 1.1.1, XGBoost 1.62, LightGBM 3.3.2 + INFO: Scikit-learn 1.1.3, XGBoost 1.7.1, LightGBM 3.3.3, NumPy 1.23.5 CREATE EXTENSION ``` @@ -215,7 +215,7 @@ That's it, PostgresML is ready. 
You can validate the installation by running: postgres=# select pgml.version(); version ------------------- - 2.0.0 + 2.2.0 (1 row) ``` diff --git a/pgml-extension/Cargo.toml b/pgml-extension/Cargo.toml index 6a725116c..0b434cb34 100644 --- a/pgml-extension/Cargo.toml +++ b/pgml-extension/Cargo.toml @@ -7,11 +7,12 @@ edition = "2021" crate-type = ["cdylib"] [features] -default = ["pg13", "python"] +default = ["pg15", "python"] pg11 = ["pgx/pg11", "pgx-tests/pg11" ] pg12 = ["pgx/pg12", "pgx-tests/pg12" ] pg13 = ["pgx/pg13", "pgx-tests/pg13" ] pg14 = ["pgx/pg14", "pgx-tests/pg14" ] +pg15 = ["pgx/pg15", "pgx-tests/pg15" ] pg_test = [] python = ["pyo3"] cuda = ["xgboost/cuda", "lightgbm/cuda"] diff --git a/pgml-extension/README.md b/pgml-extension/README.md index 3a2b7a312..dd1efef5a 100644 --- a/pgml-extension/README.md +++ b/pgml-extension/README.md @@ -2,83 +2,6 @@ PostgresML is a PostgreSQL extension providing end-to-end machine learning inside your database. The extension is primarily written in Rust using [pgx](https://github.com/tcdi/pgx) and provides a SQL interface to various machine learning algorithm implementations such as [XGBoost](https://github.com/dmlc/xgboost), [LightGBM](https://github.com/microsoft/LightGBM), and [other classical methods](https://github.com/rust-ml/linfa). -Python seems to be the de facto ML industry standard, so we also include "reference" implementations of classical algorithms from Scikit-learn as well for comparison to the Rust implementations, but runtime performance and correctness. The Python integration is written using `pyo3`. - See [our blog](https://postgresml.org/blog/postgresml-is-moving-to-rust-for-our-2.0-release/) for a performance comparison and further motivations. -## Requirements - -PostgresML requires Python 3.7 or above and the Rust compiler and toolchain. You can download the Rust compiler [here](https://rust-lang.org). - -We develop and test this extension on Linux, OS X, and Windows using WSL2. 
- -## Dependencies - -If you haven't already, install: - -- `cmake` -- `libclang-dev` -- `libopenblas-dev` -- `build-essential` -- `libssl-dev` -- `openssl-sys` -- `pkg-config` -- `libreadline-dev` -- `libpython3-dev` (Python 3.7 or higher) - -## Python - -If your system comes with Python 3.6 or lower, you'll need to install `libpython3.7-dev` or higher. You can get it from [`ppa:deadsnakes/ppa`](https://launchpad.net/~deadsnakes/+archive/ubuntu/ppa): - -1. `sudo add-apt-repository ppa:deadsnakes/ppa` -2. `sudo apt update && sudo apt install libpython3.7-dev` - - -## Update postgresql.conf - -PostgresML requires to be loaded as a shared library. For local development, this is in `~/.pgx/data-13/postgresql.conf`: - -``` -shared_preload_libraries = 'pgml' # (change requires restart) -``` - -## Local development - -0. `git submodule update --init --recursive` -1. `cargo install cargo-pgx --version=0.7.1` version needs to match Cargo.toml -2. `cargo pgx init` (this will take a while, go get a coffee) -3. `cargo pgx run` -4. `DROP EXTENSION IF EXISTS pgml; DROP SCHEMA IF EXISTS pgml CASCADE;` -5. `CREATE EXTENSION pgml;` -6. `SELECT * FROM pgml.load_dataset('diabetes');` -7. `SELECT * FROM pgml.train('Project name', 'regression', 'pgml.diabetes', 'target', 'xgboost');` -8. `SELECT target, pgml.predict('Project name', ARRAY[age, sex, bmi, bp, s1, s2, s3, s4, s5, s6]) FROM pgml.diabetes LIMIT 10;` - -## Testing - -Run unit tests: -```commandline -cargo test -``` - -Run integration tests: -```commandline -cargo pgx run --release -psql -h localhost -p 28813 -d pgml -f tests/test.sql -P pager -``` - -## Packaging - -This requires Docker. Once Docker is installed, you can run: - -```bash -bash build_extension.sh -``` - -which will produce a `.deb` file in the current directory (this will take about 20 minutes). 
The deb file can be installed with `apt-get`, for example: - -```bash -apt-get install ./postgresql-pgml-12_0.0.4-ubuntu20.04-amd64.deb -``` - -which will take care of installing its dependencies as well. Make sure to run this as root and not with sudo. +Please see the [quick start instructions](https://postgresml.org/user_guides/setup/quick_start_with_docker/) for general information on installing or deploying PostgresML. A [developer guide](https://postgresml.org/developer_guide/overview/) is also available for those who would like to contribute. diff --git a/pgml-extension/docker/Cargo.toml.cuda b/pgml-extension/docker/Cargo.toml.cuda index 65c62ddd0..97102f767 100644 --- a/pgml-extension/docker/Cargo.toml.cuda +++ b/pgml-extension/docker/Cargo.toml.cuda @@ -7,12 +7,13 @@ edition = "2021" crate-type = ["cdylib"] [features] -default = ["pg13", "python", "cuda"] +default = ["pg15", "python", "cuda"] pg10 = ["pgx/pg10", "pgx-tests/pg10" ] pg11 = ["pgx/pg11", "pgx-tests/pg11" ] pg12 = ["pgx/pg12", "pgx-tests/pg12" ] pg13 = ["pgx/pg13", "pgx-tests/pg13" ] pg14 = ["pgx/pg14", "pgx-tests/pg14" ] +pg15 = ["pgx/pg15", "pgx-tests/pg15" ] pg_test = [] python = ["pyo3"] cuda = ["xgboost/cuda", "lightgbm/cuda"] diff --git a/pgml-extension/docker/Cargo.toml.no-python b/pgml-extension/docker/Cargo.toml.no-python index 185a4a309..e4dbf82cd 100644 --- a/pgml-extension/docker/Cargo.toml.no-python +++ b/pgml-extension/docker/Cargo.toml.no-python @@ -7,12 +7,13 @@ edition = "2021" crate-type = ["cdylib"] [features] -default = ["pg13"] +default = ["pg15"] pg10 = ["pgx/pg10", "pgx-tests/pg10" ] pg11 = ["pgx/pg11", "pgx-tests/pg11" ] pg12 = ["pgx/pg12", "pgx-tests/pg12" ] pg13 = ["pgx/pg13", "pgx-tests/pg13" ] pg14 = ["pgx/pg14", "pgx-tests/pg14" ] +pg15 = ["pgx/pg15", "pgx-tests/pg15" ] pg_test = [] python = ["pyo3"] cuda = ["xgboost/cuda", "lightgbm/cuda"] pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy