From 82b1f806c1b0cf8e076976398a86731a230437d5 Mon Sep 17 00:00:00 2001
From: Lev
Date: Thu, 14 Apr 2022 13:12:00 -0700
Subject: [PATCH] lint

---
 pgml/pgml/model.py       | 151 +++++++++++++++++++++++++--------------
 pgml/pgml/sql.py         |   1 +
 pgml/tests/plpy.py       |   7 +-
 pgml/tests/test_model.py | 136 +++++++++++++++++++++++++++++++----
 4 files changed, 226 insertions(+), 69 deletions(-)

diff --git a/pgml/pgml/model.py b/pgml/pgml/model.py
index b34145aca..753df0141 100644
--- a/pgml/pgml/model.py
+++ b/pgml/pgml/model.py
@@ -9,10 +9,11 @@
 from pgml.exceptions import PgMLException
 from pgml.sql import q
 
+
 class Project(object):
     """
     Use projects to refine multiple models of a particular dataset on a specific objective.
-    
+
     Attributes:
         id (int): a unique identifier
         name (str): a human friendly unique identifier
@@ -20,7 +21,7 @@ class Project(object):
         created_at (Timestamp): when this project was created
         updated_at (Timestamp): when this project was last updated
     """
-    
+
     _cache = {}
 
     def __init__(self):
@@ -36,11 +37,14 @@ def find(cls, id: int):
         Returns:
             Project or None: instantiated from the database if found
         """
-        result = plpy.execute(f"""
+        result = plpy.execute(
+            f"""
             SELECT *
             FROM pgml.projects
             WHERE id = {q(id)}
-        """, 1)
+            """,
+            1,
+        )
         if len(result) == 0:
             return None
 
@@ -53,11 +57,11 @@ def find(cls, id: int):
     @classmethod
     def find_by_name(cls, name: str):
         """
-        Get a Project from the database by name. 
-        
+        Get a Project from the database by name.
+
         This is the prefered API to retrieve projects, and they are cached by name
         to avoid needing to go to he database on every usage.
-        
+
         Args:
             name (str): the project name
         Returns:
@@ -65,13 +69,16 @@ def find_by_name(cls, name: str):
         """
         if name in cls._cache:
             return cls._cache[name]
-        
-        result = plpy.execute(f"""
+
+        result = plpy.execute(
+            f"""
             SELECT *
             FROM pgml.projects
             WHERE name = {q(name)}
-        """, 1)
-        if len(result)== 0:
+            """,
+            1,
+        )
+        if len(result) == 0:
             return None
 
         project = Project()
@@ -84,7 +91,7 @@ def find_by_name(cls, name: str):
     def create(cls, name: str, objective: str):
         """
         Create a Project and save it to the database.
-        
+
         Args:
             name (str): a human friendly identifier
             objective (str): valid values are ["regression", "classification"].
@@ -93,11 +100,16 @@ def create(cls, name: str, objective: str):
         """
         project = Project()
 
-        project.__dict__ = dict(plpy.execute(f"""
+        project.__dict__ = dict(
+            plpy.execute(
+                f"""
             INSERT INTO pgml.projects (name, objective)
             VALUES ({q(name)}, {q(objective)})
             RETURNING *
-        """, 1)[0])
+            """,
+                1,
+            )[0]
+        )
         project.__init__()
         cls._cache[name] = project
         return project
@@ -112,10 +124,11 @@ def deployed_model(self):
             self._deployed_model = Model.find_deployed(self.id)
         return self._deployed_model
 
+
 class Snapshot(object):
     """
     Snapshots capture a set of training & test data for repeatability.
-    
+
     Attributes:
         id (int): a unique identifier
         relation_name (str): the name of the table or view to snapshot
@@ -126,11 +139,18 @@ class Snapshot(object):
         created_at (Timestamp): when this snapshot was created
         updated_at (Timestamp): when this snapshot was last updated
     """
+
     @classmethod
-    def create(cls, relation_name: str, y_column_name: str, test_size: float or int, test_sampling: str):
+    def create(
+        cls,
+        relation_name: str,
+        y_column_name: str,
+        test_size: float or int,
+        test_sampling: str,
+    ):
         """
-        Create a Snapshot and save it to the database. 
-        
+        Create a Snapshot and save it to the database.
+
         This creates both a metadata record in the snapshots table, as well as creating a new table
         that holds a snapshot of all the data currently present in the relation so that training
         runs may be repeated, or further analysis may be conducted against the input.
@@ -145,21 +165,33 @@ def create(cls, relation_name: str, y_column_name: str, test_size: float or int,
         """
         snapshot = Snapshot()
 
-        snapshot.__dict__ = dict(plpy.execute(f"""
+        snapshot.__dict__ = dict(
+            plpy.execute(
+                f"""
             INSERT INTO pgml.snapshots (relation_name, y_column_name, test_size, test_sampling, status)
             VALUES ({q(relation_name)}, {q(y_column_name)}, {q(test_size)}, {q(test_sampling)}, 'new')
             RETURNING *
-        """, 1)[0])
-        plpy.execute(f"""
+            """,
+                1,
+            )[0]
+        )
+        plpy.execute(
+            f"""
             CREATE TABLE pgml."snapshot_{snapshot.id}" AS
             SELECT * FROM "{snapshot.relation_name}";
-        """)
-        snapshot.__dict__ = dict(plpy.execute(f"""
+            """
+        )
+        snapshot.__dict__ = dict(
+            plpy.execute(
+                f"""
             UPDATE pgml.snapshots
             SET status = 'created'
            WHERE id = {q(snapshot.id)}
             RETURNING *
-        """, 1)[0])
+            """,
+                1,
+            )[0]
+        )
         return snapshot
 
     def data(self):
@@ -167,10 +199,12 @@ def data(self):
         Returns:
             list, list, list, list: All rows from the snapshot split into X_train, X_test, y_train, y_test sets.
         """
-        data = plpy.execute(f"""
+        data = plpy.execute(
+            f"""
             SELECT *
             FROM pgml."snapshot_{self.id}"
-        """)
+            """
+        )
         print(data)
 
         # Sanity check the data
@@ -203,10 +237,10 @@ def data(self):
             y.append(y_)
 
         # Split into training and test sets
-        if self.test_sampling == 'random':
+        if self.test_sampling == "random":
             return train_test_split(X, y, test_size=self.test_size, random_state=0)
         else:
-            if self.test_sampling == 'first':
+            if self.test_sampling == "first":
                 X.reverse()
                 y.reverse()
             if isinstance(split, float):
@@ -216,9 +250,9 @@ def data(self):
                 split = int(self.test_size * X.len())
             return X[:split], X[split:], y[:split], y[split:]
 
-    
     # TODO normalize and clean data
 
+
 class Model(object):
     """Models use an algorithm on a snapshot of data to record the parameters learned.
 
@@ -234,23 +268,26 @@ class Model(object):
         pickle (bytes): the serialized version of the model parameters
         algorithm: the in memory version of the model parameters that can make predictions
     """
+
     @classmethod
     def create(cls, project: Project, snapshot: Snapshot, algorithm_name: str):
         """
-        Create a Model and save it to the database. 
-        
+        Create a Model and save it to the database.
+
         Args:
-            project (str): 
-            snapshot (str): 
+            project (str):
+            snapshot (str):
             algorithm_name (str):
         Returns:
             Model: instantiated from the database
         """
-        result = plpy.execute(f"""
+        result = plpy.execute(
+            f"""
             INSERT INTO pgml.models (project_id, snapshot_id, algorithm_name, status)
             VALUES ({q(project.id)}, {q(snapshot.id)}, {q(algorithm_name)}, 'new')
             RETURNING *
-        """)
+            """
+        )
         model = Model()
         model.__dict__ = dict(result[0])
         model.__init__()
@@ -265,7 +302,8 @@ def find_deployed(cls, project_id: int):
         Returns:
             Model: that should currently be used for predictions of the project
         """
-        result = plpy.execute(f"""
+        result = plpy.execute(
+            f"""
             SELECT models.*
             FROM pgml.models
             JOIN pgml.deployments
@@ -273,7 +311,8 @@ def find_deployed(cls, project_id: int):
               ON deployments.model_id = models.id
              AND deployments.project_id = {q(project_id)}
             ORDER by deployments.created_at DESC
             LIMIT 1
-        """)
+            """
+        )
         if len(result) == 0:
             return None
 
@@ -303,19 +342,19 @@ def algorithm(self):
             self._algorithm = pickle.loads(self.pickle)
         else:
             self._algorithm = {
-                'linear_regression': LinearRegression,
-                'random_forest_regression': RandomForestRegressor,
-                'random_forest_classification': RandomForestClassifier
-            }[self.algorithm_name + '_' + self.project.objective]()
-        
+                "linear_regression": LinearRegression,
+                "random_forest_regression": RandomForestRegressor,
+                "random_forest_classification": RandomForestClassifier,
+            }[self.algorithm_name + "_" + self.project.objective]()
+
         return self._algorithm
 
     def fit(self, snapshot: Snapshot):
         """
-        Learns the parameters of this model and records them in the database. 
+        Learns the parameters of this model and records them in the database.
 
-        Args: 
-            snapshot (Snapshot): dataset used to train this model 
+        Args:
+            snapshot (Snapshot): dataset used to train this model
         """
         X_train, X_test, y_train, y_test = snapshot.data()
 
@@ -328,7 +367,9 @@ def fit(self, snapshot: Snapshot):
         r2 = r2_score(y_test, y_pred)
 
         # Save the model
-        self.__dict__ = dict(plpy.execute(f"""
+        self.__dict__ = dict(
+            plpy.execute(
+                f"""
             UPDATE pgml.models
             SET pickle = '\\x{pickle.dumps(self.algorithm).hex()}',
                 status = 'successful',
@@ -336,14 +377,18 @@ def fit(self, snapshot: Snapshot):
                 r2_score = {q(r2)}
             WHERE id = {q(self.id)}
             RETURNING *
-        """)[0])
+            """
+            )[0]
+        )
 
     def deploy(self):
         """Promote this model to the active version for the project that will be used for predictions"""
-        plpy.execute(f"""
+        plpy.execute(
+            f"""
             INSERT INTO pgml.deployments (project_id, model_id)
             VALUES ({q(self.project_id)}, {q(self.id)})
-        """)
+            """
+        )
 
     def predict(self, data: list):
         """Use the model for a set of features.
@@ -358,12 +403,12 @@ def predict(self, data: list):
 
 
 def train(
-    project_name: str, 
+    project_name: str,
     objective: str,
-    relation_name: str, 
-    y_column_name: str, 
+    relation_name: str,
+    y_column_name: str,
     test_size: float or int = 0.1,
-    test_sampling: str = "random"
+    test_sampling: str = "random",
 ):
     """Create a regression model from a table or view filled with training data.
 
@@ -390,5 +435,5 @@ def train(
         model.fit(snapshot)
         if best_error is None or model.mean_squared_error < best_error:
             best_error = model.mean_squared_error
-            best_model = model 
+            best_model = model
     best_model.deploy()
diff --git a/pgml/pgml/sql.py b/pgml/pgml/sql.py
index 79ab69bdc..d8866d6c1 100644
--- a/pgml/pgml/sql.py
+++ b/pgml/pgml/sql.py
@@ -1,5 +1,6 @@
 from plpy import quote_literal
 
+
 def q(obj):
     if type(obj) == str:
         return quote_literal(obj)
diff --git a/pgml/tests/plpy.py b/pgml/tests/plpy.py
index 4bbbbc6fd..122092550 100644
--- a/pgml/tests/plpy.py
+++ b/pgml/tests/plpy.py
@@ -2,15 +2,18 @@
 
 execute_results = deque()
 
+
 def quote_literal(literal):
     return "'" + literal + "'"
 
-def execute(sql, lines = 0):
+
+def execute(sql, lines=0):
     if len(execute_results) > 0:
         result = execute_results.popleft()
         return result
-    else: 
+    else:
         return []
 
+
 def add_mock_result(result):
     execute_results.append(result)
diff --git a/pgml/tests/test_model.py b/pgml/tests/test_model.py
index 02605982d..cd7d26867 100644
--- a/pgml/tests/test_model.py
+++ b/pgml/tests/test_model.py
@@ -1,28 +1,71 @@
 # stub out plpy
 from . import plpy
 import sys
-sys.modules['plpy'] = plpy
+
+sys.modules["plpy"] = plpy
 
 import time
 import unittest
 from pgml import model
 
+
 class TestModel(unittest.TestCase):
     def test_the_world(self):
         plpy.add_mock_result(
-            [{"id": 1, "name": "Test", "objective": "regression", "created_at": time.time(), "updated_at": time.time()}]
-        )
-        plpy.add_mock_result(
-            [{"id": 1, "relation_name": "test", "y_column_name": "test_y", "test_size": 0.1, "test_sampling": "random", "status": "new", "created_at": time.time(), "updated_at": time.time()}]
+            [
+                {
+                    "id": 1,
+                    "name": "Test",
+                    "objective": "regression",
+                    "created_at": time.time(),
+                    "updated_at": time.time(),
+                }
+            ]
         )
         plpy.add_mock_result(
-            "OK"
+            [
+                {
+                    "id": 1,
+                    "relation_name": "test",
+                    "y_column_name": "test_y",
+                    "test_size": 0.1,
+                    "test_sampling": "random",
+                    "status": "new",
+                    "created_at": time.time(),
+                    "updated_at": time.time(),
+                }
+            ]
         )
+        plpy.add_mock_result("OK")
         plpy.add_mock_result(
-            [{"id": 1, "relation_name": "test", "y_column_name": "test_y", "test_size": 0.1, "test_sampling": "random", "status": "created", "created_at": time.time(), "updated_at": time.time()}]
+            [
+                {
+                    "id": 1,
+                    "relation_name": "test",
+                    "y_column_name": "test_y",
+                    "test_size": 0.1,
+                    "test_sampling": "random",
+                    "status": "created",
+                    "created_at": time.time(),
+                    "updated_at": time.time(),
+                }
+            ]
         )
         plpy.add_mock_result(
-            [{"id": 1, "project_id": 1, "snapshot_id": 1, "algorithm_name": "linear", "status": "new", "r2_score": None, "mean_squared_error": None, "pickle": None, "created_at": time.time(), "updated_at": time.time()}]
+            [
+                {
+                    "id": 1,
+                    "project_id": 1,
+                    "snapshot_id": 1,
+                    "algorithm_name": "linear",
+                    "status": "new",
+                    "r2_score": None,
+                    "mean_squared_error": None,
+                    "pickle": None,
+                    "created_at": time.time(),
+                    "updated_at": time.time(),
+                }
+            ]
         )
         plpy.add_mock_result(
             [
@@ -32,11 +75,37 @@ def test_the_world(self):
             ]
         )
         plpy.add_mock_result(
-            [{"id": 1, "project_id": 1, "snapshot_id": 1, "algorithm_name": "linear", "status": "new", "r2_score": None, "mean_squared_error": None, "pickle": None, "created_at": time.time(), "updated_at": time.time()}]
+            [
+                {
+                    "id": 1,
+                    "project_id": 1,
+                    "snapshot_id": 1,
+                    "algorithm_name": "linear",
+                    "status": "new",
+                    "r2_score": None,
+                    "mean_squared_error": None,
+                    "pickle": None,
+                    "created_at": time.time(),
+                    "updated_at": time.time(),
+                }
+            ]
         )
 
         plpy.add_mock_result(
-            [{"id": 1, "project_id": 1, "snapshot_id": 1, "algorithm_name": "linear", "status": "new", "r2_score": None, "mean_squared_error": None, "pickle": None, "created_at": time.time(), "updated_at": time.time()}]
+            [
+                {
+                    "id": 1,
+                    "project_id": 1,
+                    "snapshot_id": 1,
+                    "algorithm_name": "linear",
+                    "status": "new",
+                    "r2_score": None,
+                    "mean_squared_error": None,
+                    "pickle": None,
+                    "created_at": time.time(),
+                    "updated_at": time.time(),
+                }
+            ]
         )
         plpy.add_mock_result(
             [
@@ -46,11 +115,37 @@ def test_the_world(self):
             ]
         )
         plpy.add_mock_result(
-            [{"id": 1, "project_id": 1, "snapshot_id": 1, "algorithm_name": "linear", "status": "new", "r2_score": None, "mean_squared_error": None, "pickle": None, "created_at": time.time(), "updated_at": time.time()}]
+            [
+                {
+                    "id": 1,
+                    "project_id": 1,
+                    "snapshot_id": 1,
+                    "algorithm_name": "linear",
+                    "status": "new",
+                    "r2_score": None,
+                    "mean_squared_error": None,
+                    "pickle": None,
+                    "created_at": time.time(),
+                    "updated_at": time.time(),
+                }
+            ]
         )
-        
+
         plpy.add_mock_result(
-            [{"id": 1, "project_id": 1, "snapshot_id": 1, "algorithm_name": "linear", "status": "new", "r2_score": None, "mean_squared_error": None, "pickle": None, "created_at": time.time(), "updated_at": time.time()}]
+            [
+                {
+                    "id": 1,
+                    "project_id": 1,
+                    "snapshot_id": 1,
+                    "algorithm_name": "linear",
+                    "status": "new",
+                    "r2_score": None,
+                    "mean_squared_error": None,
+                    "pickle": None,
+                    "created_at": time.time(),
+                    "updated_at": time.time(),
+                }
+            ]
         )
         plpy.add_mock_result(
             [
@@ -60,6 +155,19 @@ def test_the_world(self):
             ]
         )
         plpy.add_mock_result(
-            [{"id": 1, "project_id": 1, "snapshot_id": 1, "algorithm_name": "linear", "status": "new", "r2_score": None, "mean_squared_error": None, "pickle": None, "created_at": time.time(), "updated_at": time.time()}]
+            [
+                {
+                    "id": 1,
+                    "project_id": 1,
+                    "snapshot_id": 1,
+                    "algorithm_name": "linear",
+                    "status": "new",
+                    "r2_score": None,
+                    "mean_squared_error": None,
+                    "pickle": None,
+                    "created_at": time.time(),
+                    "updated_at": time.time(),
+                }
+            ]
         )
         model.train("Test", "regression", "test", "test_y")
