Content-Length: 6396 | pFad | http://github.com/postgresml/postgresml/pull/16.diff

thub.com diff --git a/README.md b/README.md index 6d0caf828..384c2f68c 100644 --- a/README.md +++ b/README.md @@ -211,4 +211,9 @@ Run the test: psql -f sql/test.sql ``` +One liner: +``` +cd pgml; sudo python3 setup.py install; cd ../; sudo -u postgres psql -f sql/test.sql +``` + Make sure to run it exactly like this, from the root directory of the repo. diff --git a/examples/california_housing/run.sql b/examples/california_housing/run.sql new file mode 100644 index 000000000..487abde2e --- /dev/null +++ b/examples/california_housing/run.sql @@ -0,0 +1,41 @@ +-- This example trains models on the sklearn california_housing dataset +-- which is a copy of the test set from the StatLib repository +-- https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html +-- +-- This demonstrates using a table with individual columns as features +-- for regression. +SELECT pgml.load_dataset('california_housing'); + +-- view the dataset +SELECT * from pgml.california_housing; + +-- train a simple regression model on the data +SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target'); + +-- check out the predictions +SELECT target, pgml.predict('California Housing Prediction', ARRAY[median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude]) AS prediction +FROM pgml.california_housing +LIMIT 10; + +-- -- train some more models with different algorithms +SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'svm'); +SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'random_forest'); +SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'gradient_boosting_trees'); +-- TODO SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'dense_neural_network'); +-- -- check out all that hard work +SELECT * FROM 
pgml.trained_models; + +-- deploy the random_forest model for prediction use +SELECT pgml.deploy('California Housing Prediction', 'random_forest'); +-- check out that throughput +SELECT * FROM pgml.deployed_models; + +-- do some hyper param tuning +-- TODO SELECT pgml.hypertune(100, 'California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'gradient_boosted_trees'); +-- deploy the "best" model for prediction use +SELECT pgml.deploy('California Housing Prediction', 'best_fit'); + +-- check out the improved predictions +SELECT target, pgml.predict('California Housing Prediction', ARRAY[median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude]) AS prediction +FROM pgml.california_housing +LIMIT 10; diff --git a/examples/digits/run.sql b/examples/digits/run.sql index 539c03053..0491a7a09 100644 --- a/examples/digits/run.sql +++ b/examples/digits/run.sql @@ -2,6 +2,9 @@ -- which is a copy of the test set of the UCI ML hand-written digits datasets -- https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits -- +-- This demonstrates using a table with a single array feature column +-- for classification. +-- -- The final result after a few seconds of training is not terrible. Maybe not perfect -- enough for mission critical applications, but it's telling how quickly "off the shelf" -- solutions can solve problems these days. diff --git a/pgml/pgml/datasets.py b/pgml/pgml/datasets.py index d997254cf..1f2948eae 100644 --- a/pgml/pgml/datasets.py +++ b/pgml/pgml/datasets.py @@ -1,5 +1,5 @@ import plpy -from sklearn.datasets import load_digits as d +import sklearn.datasets from pgml.sql import q from pgml.exceptions import PgMLException @@ -7,15 +7,39 @@ def load(source: str): if source == "digits": load_digits() + elif source == "california_housing": + load_california_housing() else: raise PgMLException(f"Invalid dataset name: {source}. 
Valid values are ['digits'].") return "OK" def load_digits(): - dataset = d() + dataset = sklearn.datasets.load_digits() a = plpy.execute("DROP TABLE IF EXISTS pgml.digits") a = plpy.execute("CREATE TABLE pgml.digits (image SMALLINT[], target INTEGER)") a = plpy.execute(f"""COMMENT ON TABLE pgml.digits IS {q(dataset["DESCR"])}""") for X, y in zip(dataset["data"], dataset["target"]): X = ",".join("%i" % x for x in list(X)) plpy.execute(f"""INSERT INTO pgml.digits (image, target) VALUES ('{{{X}}}', {y})""") + +def load_california_housing(): + dataset = sklearn.datasets.fetch_california_housing() + a = plpy.execute("DROP TABLE IF EXISTS pgml.california_housing") + a = plpy.execute(""" + CREATE TABLE pgml.california_housing ( + median_income FLOAT4, -- median income in block group + house_age FLOAT4, -- median house age in block group + avg_rooms FLOAT4, -- average number of rooms per household + avg_bedrooms FLOAT4, -- average number of bedrooms per household + population FLOAT4, -- block group population + avg_occupants FLOAT4, -- average number of household members + latitude FLOAT4, -- block group latitude + longitude FLOAT4, -- block group longitudetarget INTEGER + target FLOAT4 + )""") + a = plpy.execute(f"""COMMENT ON TABLE pgml.california_housing IS {q(dataset["DESCR"])}""") + for X, y in zip(dataset["data"], dataset["target"]): + plpy.execute(f""" + INSERT INTO pgml.california_housing (median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude, target) + VALUES ({q(X[0])}, {q(X[1])}, {q(X[2])}, {q(X[3])}, {q(X[4])}, {q(X[5])}, {q(X[6])}, {q(X[7])}, {q(y)})""") + \ No newline at end of file diff --git a/sql/install.sql b/sql/install.sql index 5eadea438..c846dcd47 100644 --- a/sql/install.sql +++ b/sql/install.sql @@ -136,7 +136,7 @@ $$ LANGUAGE plpython3u; --- --- Predict --- -CREATE OR REPLACE FUNCTION pgml.predict(project_name TEXT, features NUMERIC[]) +CREATE OR REPLACE FUNCTION pgml.predict(project_name TEXT, features 
DOUBLE PRECISION[]) RETURNS DOUBLE PRECISION AS $$ from pgml.model import Project

pFad - (p)hone/(F)rame/(a)nonymizer/(d)eclutterfier! Saves Data!