Content-Length: 6984 | pFad | http://github.com/postgresml/postgresml/pull/16.patch
thub.com
From ff78f3e1be4d92036deeb960fa802ea7102c9e2c Mon Sep 17 00:00:00 2001
From: Montana Low
Date: Sun, 17 Apr 2022 16:12:18 -0700
Subject: [PATCH] add the california housing example
---
README.md | 5 ++++
examples/california_housing/run.sql | 41 +++++++++++++++++++++++++++++
examples/digits/run.sql | 3 +++
pgml/pgml/datasets.py | 28 ++++++++++++++++++--
sql/install.sql | 2 +-
5 files changed, 76 insertions(+), 3 deletions(-)
create mode 100644 examples/california_housing/run.sql
diff --git a/README.md b/README.md
index 6d0caf828..384c2f68c 100644
--- a/README.md
+++ b/README.md
@@ -211,4 +211,9 @@ Run the test:
psql -f sql/test.sql
```
+One liner:
+```
+cd pgml; sudo python3 setup.py install; cd ../; sudo -u postgres psql -f sql/test.sql
+```
+
Make sure to run it exactly like this, from the root directory of the repo.
diff --git a/examples/california_housing/run.sql b/examples/california_housing/run.sql
new file mode 100644
index 000000000..487abde2e
--- /dev/null
+++ b/examples/california_housing/run.sql
@@ -0,0 +1,41 @@
+-- This example trains models on the sklearn california_housing dataset
+-- which is a copy of the test set from the StatLib repository
+-- https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html
+--
+-- This demonstrates using a table with individual columns as features
+-- for regression.
+SELECT pgml.load_dataset('california_housing');
+
+-- view the dataset
+SELECT * from pgml.california_housing;
+
+-- train a simple regression model on the data
+SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target');
+
+-- check out the predictions
+SELECT target, pgml.predict('California Housing Prediction', ARRAY[median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude]) AS prediction
+FROM pgml.california_housing
+LIMIT 10;
+
+-- train some more models with different algorithms
+SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'svm');
+SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'random_forest');
+SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'gradient_boosting_trees');
+-- TODO SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'dense_neural_network');
+-- check out all that hard work
+SELECT * FROM pgml.trained_models;
+
+-- deploy the random_forest model for prediction use
+SELECT pgml.deploy('California Housing Prediction', 'random_forest');
+-- check out that throughput
+SELECT * FROM pgml.deployed_models;
+
+-- do some hyper param tuning
+-- TODO SELECT pgml.hypertune(100, 'California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'gradient_boosting_trees');
+-- deploy the "best" model for prediction use
+SELECT pgml.deploy('California Housing Prediction', 'best_fit');
+
+-- check out the improved predictions
+SELECT target, pgml.predict('California Housing Prediction', ARRAY[median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude]) AS prediction
+FROM pgml.california_housing
+LIMIT 10;
diff --git a/examples/digits/run.sql b/examples/digits/run.sql
index 539c03053..0491a7a09 100644
--- a/examples/digits/run.sql
+++ b/examples/digits/run.sql
@@ -2,6 +2,9 @@
-- which is a copy of the test set of the UCI ML hand-written digits datasets
-- https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits
--
+-- This demonstrates using a table with a single array feature column
+-- for classification.
+--
-- The final result after a few seconds of training is not terrible. Maybe not perfect
-- enough for mission critical applications, but it's telling how quickly "off the shelf"
-- solutions can solve problems these days.
diff --git a/pgml/pgml/datasets.py b/pgml/pgml/datasets.py
index d997254cf..1f2948eae 100644
--- a/pgml/pgml/datasets.py
+++ b/pgml/pgml/datasets.py
@@ -1,5 +1,5 @@
import plpy
-from sklearn.datasets import load_digits as d
+import sklearn.datasets
from pgml.sql import q
from pgml.exceptions import PgMLException
@@ -7,15 +7,39 @@
def load(source: str):
if source == "digits":
load_digits()
+ elif source == "california_housing":
+ load_california_housing()
else:
raise PgMLException(f"Invalid dataset name: {source}. Valid values are ['digits'].")
return "OK"
def load_digits():
- dataset = d()
+ dataset = sklearn.datasets.load_digits()
a = plpy.execute("DROP TABLE IF EXISTS pgml.digits")
a = plpy.execute("CREATE TABLE pgml.digits (image SMALLINT[], target INTEGER)")
a = plpy.execute(f"""COMMENT ON TABLE pgml.digits IS {q(dataset["DESCR"])}""")
for X, y in zip(dataset["data"], dataset["target"]):
X = ",".join("%i" % x for x in list(X))
plpy.execute(f"""INSERT INTO pgml.digits (image, target) VALUES ('{{{X}}}', {y})""")
+
+def load_california_housing():
+ dataset = sklearn.datasets.fetch_california_housing()
+ a = plpy.execute("DROP TABLE IF EXISTS pgml.california_housing")
+ a = plpy.execute("""
+ CREATE TABLE pgml.california_housing (
+ median_income FLOAT4, -- median income in block group
+ house_age FLOAT4, -- median house age in block group
+ avg_rooms FLOAT4, -- average number of rooms per household
+ avg_bedrooms FLOAT4, -- average number of bedrooms per household
+ population FLOAT4, -- block group population
+ avg_occupants FLOAT4, -- average number of household members
+ latitude FLOAT4, -- block group latitude
+ longitude FLOAT4, -- block group longitudetarget INTEGER
+ target FLOAT4
+ )""")
+ a = plpy.execute(f"""COMMENT ON TABLE pgml.california_housing IS {q(dataset["DESCR"])}""")
+ for X, y in zip(dataset["data"], dataset["target"]):
+ plpy.execute(f"""
+ INSERT INTO pgml.california_housing (median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude, target)
+ VALUES ({q(X[0])}, {q(X[1])}, {q(X[2])}, {q(X[3])}, {q(X[4])}, {q(X[5])}, {q(X[6])}, {q(X[7])}, {q(y)})""")
+
\ No newline at end of file
diff --git a/sql/install.sql b/sql/install.sql
index 5eadea438..c846dcd47 100644
--- a/sql/install.sql
+++ b/sql/install.sql
@@ -136,7 +136,7 @@ $$ LANGUAGE plpython3u;
---
--- Predict
---
-CREATE OR REPLACE FUNCTION pgml.predict(project_name TEXT, features NUMERIC[])
+CREATE OR REPLACE FUNCTION pgml.predict(project_name TEXT, features DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION
AS $$
from pgml.model import Project
--- a PPN by Garber Painting Akron. With Image Size Reduction included!Fetched URL: http://github.com/postgresml/postgresml/pull/16.patch
Alternative Proxies:
Alternative Proxy
pFad Proxy
pFad v3 Proxy
pFad v4 Proxy