diff --git a/README.md b/README.md
index 6d0caf828..384c2f68c 100644
--- a/README.md
+++ b/README.md
@@ -211,4 +211,9 @@ Run the test:
 psql -f sql/test.sql
 ```
 
+One-liner:
+```
+cd pgml; sudo python3 setup.py install; cd ../; sudo -u postgres psql -f sql/test.sql
+```
+
 Make sure to run it exactly like this, from the root directory of the repo.
diff --git a/examples/california_housing/run.sql b/examples/california_housing/run.sql
new file mode 100644
index 000000000..487abde2e
--- /dev/null
+++ b/examples/california_housing/run.sql
@@ -0,0 +1,41 @@
+-- This example trains models on the sklearn california_housing dataset,
+-- which is derived from the StatLib repository:
+-- https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html
+--
+-- This demonstrates using a table with individual columns as features
+-- for regression.
+SELECT pgml.load_dataset('california_housing');
+
+-- view the dataset
+SELECT * FROM pgml.california_housing;
+
+-- train a simple regression model on the data
+SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target');
+
+-- check out the predictions
+SELECT target, pgml.predict('California Housing Prediction', ARRAY[median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude]) AS prediction
+FROM pgml.california_housing
+LIMIT 10;
+
+-- train some more models with different algorithms
+SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'svm');
+SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'random_forest');
+SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'gradient_boosting_trees');
+-- TODO SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'dense_neural_network');
+-- check out all that hard work
+SELECT * FROM pgml.trained_models;
+
+-- deploy the random_forest model for prediction use
+SELECT pgml.deploy('California Housing Prediction', 'random_forest');
+-- check out that throughput
+SELECT * FROM pgml.deployed_models;
+
+-- do some hyperparameter tuning
+-- TODO SELECT pgml.hypertune(100, 'California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'gradient_boosted_trees');
+-- deploy the "best" model for prediction use
+SELECT pgml.deploy('California Housing Prediction', 'best_fit');
+
+-- check out the improved predictions
+SELECT target, pgml.predict('California Housing Prediction', ARRAY[median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude]) AS prediction
+FROM pgml.california_housing
+LIMIT 10;
diff --git a/examples/digits/run.sql b/examples/digits/run.sql
index 539c03053..0491a7a09 100644
--- a/examples/digits/run.sql
+++ b/examples/digits/run.sql
@@ -2,6 +2,9 @@
 -- which is a copy of the test set of the UCI ML hand-written digits datasets
 -- https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits
 --
+-- This demonstrates using a table with a single array feature column
+-- for classification.
+--
 -- The final result after a few seconds of training is not terrible. Maybe not perfect
 -- enough for mission critical applications, but it's telling how quickly "off the shelf"
 -- solutions can solve problems these days.
diff --git a/pgml/pgml/datasets.py b/pgml/pgml/datasets.py
index d997254cf..1f2948eae 100644
--- a/pgml/pgml/datasets.py
+++ b/pgml/pgml/datasets.py
@@ -1,5 +1,5 @@
 import plpy
-from sklearn.datasets import load_digits as d
+import sklearn.datasets
 from pgml.sql import q
 from pgml.exceptions import PgMLException
 
@@ -7,15 +7,39 @@ def load(source: str):
     if source == "digits":
         load_digits()
+    elif source == "california_housing":
+        load_california_housing()
     else:
         raise PgMLException(f"Invalid dataset name: {source}. Valid values are ['digits'].")
     return "OK"
 
 def load_digits():
-    dataset = d()
+    dataset = sklearn.datasets.load_digits()
     a = plpy.execute("DROP TABLE IF EXISTS pgml.digits")
     a = plpy.execute("CREATE TABLE pgml.digits (image SMALLINT[], target INTEGER)")
     a = plpy.execute(f"""COMMENT ON TABLE pgml.digits IS {q(dataset["DESCR"])}""")
     for X, y in zip(dataset["data"], dataset["target"]):
         X = ",".join("%i" % x for x in list(X))
         plpy.execute(f"""INSERT INTO pgml.digits (image, target) VALUES ('{{{X}}}', {y})""")
+
+def load_california_housing():
+    dataset = sklearn.datasets.fetch_california_housing()
+    a = plpy.execute("DROP TABLE IF EXISTS pgml.california_housing")
+    a = plpy.execute("""
+        CREATE TABLE pgml.california_housing (
+            median_income FLOAT4, -- median income in block group
+            house_age FLOAT4, -- median house age in block group
+            avg_rooms FLOAT4, -- average number of rooms per household
+            avg_bedrooms FLOAT4, -- average number of bedrooms per household
+            population FLOAT4, -- block group population
+            avg_occupants FLOAT4, -- average number of household members
+            latitude FLOAT4, -- block group latitude
+            longitude FLOAT4, -- block group longitude
+            target FLOAT4
+        )""")
+    a = plpy.execute(f"""COMMENT ON TABLE pgml.california_housing IS {q(dataset["DESCR"])}""")
+    for X, y in zip(dataset["data"], dataset["target"]):
+        plpy.execute(f"""
+            INSERT INTO pgml.california_housing (median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude, target)
+            VALUES ({q(X[0])}, {q(X[1])}, {q(X[2])}, {q(X[3])}, {q(X[4])}, {q(X[5])}, {q(X[6])}, {q(X[7])}, {q(y)})""")
+
\ No newline at end of file
diff --git a/sql/install.sql b/sql/install.sql
index 5eadea438..c846dcd47 100644
--- a/sql/install.sql
+++ b/sql/install.sql
@@ -136,7 +136,7 @@ $$ LANGUAGE plpython3u;
 ---
 --- Predict
 ---
-CREATE OR REPLACE FUNCTION pgml.predict(project_name TEXT, features NUMERIC[])
+CREATE OR REPLACE FUNCTION pgml.predict(project_name TEXT, features DOUBLE PRECISION[])
 RETURNS DOUBLE PRECISION
 AS $$
 from pgml.model import Project
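A note on the pgml.predict() signature change in sql/install.sql above: the feature argument is now a DOUBLE PRECISION[] array rather than NUMERIC[]. A minimal sketch of a direct call against the new signature follows; the project name matches the example above, but the feature values are illustrative placeholders and the explicit cast is shown only for clarity:

-- predict against the currently deployed model for one hand-written feature row
SELECT pgml.predict(
    'California Housing Prediction',
    ARRAY[8.3252, 41.0, 6.9841, 1.0238, 322.0, 2.5556, 37.88, -122.23]::DOUBLE PRECISION[]
) AS prediction;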