sketch out the regression model training cycle

Montana Low · Montana Low · commit 9907aaab9ab3 · 2022-04-12T20:07:17.000-07:00
diff --git a/pgml/pgml/model.py b/pgml/pgml/model.py
@@ -1,95 +1,139 @@
+from cmath import e
 import plpy
 
+from sklearn.linear_model import LinearRegression
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error, r2_score
+
+import pickle
+
+from pgml.exceptions import PgMLException
+
 class Regression:
     """Provides continuous real number predictions learned from the training data.
     """    
     def __init__(
-        model_name: str, 
+        self,
+        project_name: str, 
         relation_name: str, 
         y_column_name: str, 
-        implementation: str = "sklearn.linear_model"
+        algorithm: str = "sklearn.linear_model",
+        test_size: float or int = 0.1,
+        test_sampling: str = "random"
     ) -> None:
         """Create a regression model from a table or view filled with training data.
 
         Args:
-            model_name (str): a human friendly identifier
+            project_name (str): a human friendly identifier
             relation_name (str): the table or view that stores the training data
             y_column_name (str): the column in the training data that acts as the label
-            implementation (str, optional): the algorithm used to implement the regression. Defaults to "sklearn.linear_model".
+            algorithm (str, optional): the algorithm used to implement the regression. Defaults to "sklearn.linear_model".
+            test_size (float or int, optional): If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. Defaults to 0.1.
+            test_sampling (str, optional): How to sample to create the test data. Defaults to "random". Valid values are ["first", "last", "random"]; "first" and "last" hold out the leading or trailing rows of the snapshot.
         """
 
-        data_source = f"SELECT * FROM {table_name}"
-
-        # Start training.
-        start = plpy.execute(f"""
-            INSERT INTO pgml.model_versions
-                (name, data_source, y_column)
-            VALUES
-                ('{table_name}', '{data_source}', '{y}')
-            RETURNING *""", 1)
-
-        id_ = start[0]["id"]
-        name = f"{table_name}_{id_}"
-
-        destination = models_directory(plpy)
+        plpy.warning("snapshot")
+        # Create a snapshot of the relation (caller-supplied names are quoted, never interpolated raw)
+        snapshot = plpy.execute(f"INSERT INTO pgml.snapshots (relation, y, test_size, test_sampling, status) VALUES ({plpy.quote_literal(relation_name)}, {plpy.quote_literal(y_column_name)}, {test_size}, {plpy.quote_literal(test_sampling)}, 'new') RETURNING *", 1)[0]
+        plpy.execute(f"""CREATE TABLE pgml.snapshot_{snapshot['id']} AS SELECT * FROM {plpy.quote_ident(relation_name)};""")
+        plpy.execute(f"UPDATE pgml.snapshots SET status = 'created' WHERE id = {snapshot['id']}")
+
+        plpy.warning("project")
+        # Find or create the project
+        project = plpy.execute(f"SELECT * FROM pgml.projects WHERE name = {plpy.quote_literal(project_name)}", 1)
+        plpy.warning(f"project {project}")
+        if (project.nrows() == 1):
+            plpy.warning("project found")
+            project = project[0]
+        else:
+            try: 
+                project = plpy.execute(f"INSERT INTO pgml.projects (name) VALUES ({plpy.quote_literal(project_name)}) RETURNING *", 1)
+                plpy.warning(f"project inserted {project}")
+                if (project.nrows() == 1):
+                    project = project[0]
+
+            except Exception as e: # handle race condition to insert
+                plpy.warning(f"project retry: #{e}")
+                project = plpy.execute(f"SELECT * FROM pgml.projects WHERE name = {plpy.quote_literal(project_name)}", 1)[0]
+
+        plpy.warning("model")
+        # Create the model
+        model = plpy.execute(f"INSERT INTO pgml.models (project_id, snapshot_id, algorithm, status) VALUES ({project['id']}, {snapshot['id']}, {plpy.quote_literal(algorithm)}, 'training') RETURNING *")[0]
+
+        plpy.warning("data")
+        # Prepare the data
+        data = plpy.execute(f"SELECT * FROM pgml.snapshot_{snapshot['id']}")
+
+        # Sanity check the data
+        if data.nrows() == 0:
+            raise PgMLException(
+                f"Relation `{relation_name}` contains no rows. Did you pass the correct `relation_name`?"
+            )
+        if y_column_name not in data[0]:
+            raise PgMLException(
+                f"Column `{y_column_name}` not found. Did you pass the correct `y_column_name`?"
+            )
+
+        # Always pull the columns in the same order from the row.
+        # Python dict iteration is not always in the same order (hash table).
+        columns = []
+        for col in data[0]:
+            if col != y_column_name:
+                columns.append(col)
 
-        # Train!
-        pickle, msq, r2 = train(plpy.cursor(data_source), y_column=y, name=name, destination=destination)
+        # Split the label from the features
         X = []
         y = []
-        columns = []
-
-        for row in all_rows(cursor):
-            row = row.copy()
-
-            if y_column not in row:
-                PgMLException(
-                    f"Column `{y}` not found. Did you name your `y_column` correctly?"
-                )
-
-            y_ = row.pop(y_column)
+        for row in data:
+            plpy.warning(f"row: {row}")
+            y_ = row.pop(y_column_name)
             x_ = []
 
-            # Always pull the columns in the same order from the row.
-            # Python dict iteration is not always in the same order (hash table).
-            if not columns:
-                for col in row:
-                    columns.append(col)
-
             for column in columns:
                 x_.append(row[column])
+
             X.append(x_)
             y.append(y_)
 
-        X_train, X_test, y_train, y_test = train_test_split(X, y)
-
-        # Just linear regression for now, but can add many more later.
-        lr = LinearRegression()
-        lr.fit(X_train, y_train)
-
+        # Split into training and test sets
+        plpy.warning("split")
+        if (test_sampling == 'random'):
+            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0)
+        else:
+            if (test_sampling == 'first'):
+                # Reverse so that the first rows end up in the tail, which becomes the test set
+                X.reverse()
+                y.reverse()
+            # test_size is a proportion when float, otherwise an absolute number of rows
+            test_rows = int(test_size * len(X)) if isinstance(test_size, float) else test_size
+            # The test set is always the last `test_rows` rows; everything before is training data
+            split = max(len(X) - test_rows, 0)
+            X_train, X_test, y_train, y_test = X[:split], X[split:], y[:split], y[split:]
+
+        # TODO normalize and clean data
+
+        plpy.warning("train")
+        # Train the model
+        algo = LinearRegression()
+        algo.fit(X_train, y_train)
+
+        plpy.warning("test")
         # Test
-        y_pred = lr.predict(X_test)
+        y_pred = algo.predict(X_test)
         msq = mean_squared_error(y_test, y_pred)
         r2 = r2_score(y_test, y_pred)
 
-        path = os.path.join(destination, name)
-
-        if save:
-            with open(path, "wb") as f:
-                pickle.dump(lr, f)
-
-        return path, msq, r2
-
+        plpy.warning("save")
+        # Save the model (pickled estimator is stored as a hex-encoded BYTEA literal)
+        weights = pickle.dumps(algo)
 
         plpy.execute(f"""
-            UPDATE pgml.model_versions
-            SET pickle = '{pickle}',
-                successful = true,
+            UPDATE pgml.models
+            SET pickle = '\\x{weights.hex()}',
+                status = 'successful',
                 mean_squared_error = '{msq}',
-                r2_score = '{r2}',
-                ended_at = clock_timestamp()
-            WHERE id = {id_}""")
-
-        return name
+                r2_score = '{r2}'
+            WHERE id = {model['id']}
+        """)
 
-            model
+        # TODO: promote the model?
diff --git a/sql/install.sql b/sql/install.sql
@@ -47,28 +47,35 @@ CREATE TABLE pgml.projects(
 	updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
 );
 SELECT pgml.auto_updated_at('pgml.projects');
+CREATE UNIQUE INDEX projects_name_idx ON pgml.projects(name);
 
 CREATE TABLE pgml.snapshots(
 	id BIGSERIAL PRIMARY KEY,
 	relation TEXT NOT NULL,
 	y TEXT NOT NULL,
-	validation_ratio FLOAT4 NOT NULL,
-	validation_strategy TEXT NOT NULL,
+	test_size FLOAT4 NOT NULL,
+	test_sampling TEXT NOT NULL,
+	status TEXT NOT NULL,
 	created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
 	updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP
 );
 SELECT pgml.auto_updated_at('pgml.snapshots');
 
 CREATE TABLE pgml.models(
 	id BIGSERIAL PRIMARY KEY,
-	project_id BIGINT,
-	snapshot_id BIGINT,
+	project_id BIGINT NOT NULL,
+	snapshot_id BIGINT NOT NULL,
+	algorithm TEXT NOT NULL,
+	status TEXT NOT NULL,
 	created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
 	updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
+	mean_squared_error DOUBLE PRECISION,
+	r2_score DOUBLE PRECISION,
 	pickle BYTEA,
 	CONSTRAINT project_id_fk FOREIGN KEY(project_id) REFERENCES pgml.projects(id),
 	CONSTRAINT snapshot_id_fk FOREIGN KEY(snapshot_id) REFERENCES pgml.snapshots(id)
 );
+CREATE INDEX models_project_id_created_at_idx ON pgml.models(project_id, created_at);
 SELECT pgml.auto_updated_at('pgml.models');
 
 CREATE TABLE pgml.promotions(
@@ -92,11 +99,12 @@ AS $$
 	return pgml.version()
 $$ LANGUAGE plpython3u;
 
-CREATE OR REPLACE FUNCTION pgml.model_regression(model_name TEXT, relation_name TEXT, y_column_name TEXT, algorithm TEXT)
+CREATE OR REPLACE FUNCTION pgml.model_regression(project_name TEXT, relation_name TEXT, y_column_name TEXT)
 RETURNS VOID
 AS $$
 	import pgml
-	pgml.model.regression(model_name, relation_name, y_column_name, algorithm)
+	from pgml.model import Regression
+	Regression(project_name, relation_name, y_column_name)
 $$ LANGUAGE plpython3u;
 
 
diff --git a/sql/test.sql b/sql/test.sql
@@ -7,15 +7,19 @@
 SELECT pgml.version();
 
 -- Train twice
-SELECT pgml.train('wine_quality_red', 'quality');
+-- SELECT pgml.train('wine_quality_red', 'quality');
 
-SELECT * FROM pgml.model_versions;
+-- SELECT * FROM pgml.model_versions;
+
+-- \timing
+-- WITH latest_model AS (
+-- 	SELECT name || '_' || id AS model_name FROM pgml.model_versions ORDER BY id DESC LIMIT 1
+-- )
+-- SELECT pgml.score(
+-- 	(SELECT model_name FROM latest_model), -- last model we just trained
+-- 	7.4, 0.7, 0, 1.9, 0.076, 11, 34, 0.99, 2, 0.5, 9.4 -- features as variadic arguments
+-- ) AS score;
 
 \timing
-WITH latest_model AS (
-	SELECT name || '_' || id AS model_name FROM pgml.model_versions ORDER BY id DESC LIMIT 1
-)
-SELECT pgml.score(
-	(SELECT model_name FROM latest_model), -- last model we just trained
-	7.4, 0.7, 0, 1.9, 0.076, 11, 34, 0.99, 2, 0.5, 9.4 -- features as variadic arguments
-) AS score;
+
+SELECT pgml.model_regression('Red Wine', 'wine_quality_red', 'quality');