1
+ from re import M
1
2
import plpy
2
- from sklearn .linear_model import LinearRegression
3
- from sklearn .ensemble import RandomForestRegressor , RandomForestClassifier
3
+ from sklearn .linear_model import LinearRegression , LogisticRegression
4
+ from sklearn .svm import SVR , SVC
5
+ from sklearn .ensemble import RandomForestRegressor , RandomForestClassifier , GradientBoostingRegressor , GradientBoostingClassifier
4
6
from sklearn .model_selection import train_test_split
5
- from sklearn .metrics import mean_squared_error , r2_score
7
+ from sklearn .metrics import mean_squared_error , r2_score , f1_score , precision_score , recall_score
6
8
7
9
import pickle
10
+ import json
8
11
9
12
from pgml .exceptions import PgMLException
10
13
from pgml .sql import q
11
14
15
def flatten(S):
    """Flatten an arbitrarily nested list into a single flat list.

    Only ``list`` instances are treated as nested containers; tuples,
    strings and other iterables are kept as atomic elements, matching
    the original behavior.

    Args:
        S (list): a possibly nested list, e.g. ``[[1, 2], [3, [4]]]``
    Returns:
        list: a new flat list with elements in left-to-right order

    Note: the previous implementation recursed once per *element*
    (``flatten(S[1:])``), which hit Python's recursion limit on lists
    longer than ~1000 items and was quadratic from repeated slicing.
    This version iterates over elements and recurses only per nesting
    level, so depth — not length — is the only recursion bound.
    """
    result = []
    for item in S:
        if isinstance(item, list):
            result.extend(flatten(item))
        else:
            result.append(item)
    return result
12
21
13
22
class Project (object ):
14
23
"""
@@ -124,6 +133,14 @@ def deployed_model(self):
124
133
self ._deployed_model = Model .find_deployed (self .id )
125
134
return self ._deployed_model
126
135
136
+ def deploy (self , algorithm_name ):
137
+ model = None
138
+ if algorithm_name == "best_fit" :
139
+ model = Model .find_by_project_and_best_fit (self )
140
+ else :
141
+ model = Model .find_by_project_id_and_algorithm_name (self .id , algorithm_name )
142
+ model .deploy ()
143
+ return model
127
144
128
145
class Snapshot (object ):
129
146
"""
@@ -178,7 +195,7 @@ def create(
178
195
plpy .execute (
179
196
f"""
180
197
CREATE TABLE pgml."snapshot_{ snapshot .id } " AS
181
- SELECT * FROM " { snapshot .relation_name } " ;
198
+ SELECT * FROM { snapshot .relation_name } ;
182
199
"""
183
200
)
184
201
snapshot .__dict__ = dict (
@@ -232,6 +249,7 @@ def data(self):
232
249
for column in columns :
233
250
x_ .append (row [column ])
234
251
252
+ x_ = flatten (x_ ) # TODO be smart about flattening X depending on algorithm
235
253
X .append (x_ )
236
254
y .append (y_ )
237
255
@@ -262,8 +280,7 @@ class Model(object):
262
280
status (str): The current status of the model, e.g. 'new', 'training' or 'successful'
263
281
created_at (Timestamp): when this model was created
264
282
updated_at (Timestamp): when this model was last updated
265
- mean_squared_error (float):
266
- r2_score (float):
283
+ metrics (dict): key performance indicators for the model
267
284
pickle (bytes): the serialized version of the model parameters
268
285
algorithm: the in memory version of the model parameters that can make predictions
269
286
"""
@@ -320,6 +337,63 @@ def find_deployed(cls, project_id: int):
320
337
model .__init__ ()
321
338
return model
322
339
340
+ @classmethod
341
+ def find_by_project_id_and_algorithm_name (cls , project_id : int , algorithm_name : str ):
342
+ """
343
+ Args:
344
+ project_id (int): The project id
345
+ algorithm_name (str): The algorithm
346
+ Returns:
347
+ Model: most recently created model that fits the criteria
348
+ """
349
+ result = plpy .execute (
350
+ f"""
351
+ SELECT models.*
352
+ FROM pgml.models
353
+ WHERE algorithm_name = { q (algorithm_name )}
354
+ AND project_id = { q (project_id )}
355
+ ORDER by models.created_at DESC
356
+ LIMIT 1
357
+ """
358
+ )
359
+ if len (result ) == 0 :
360
+ return None
361
+
362
+ model = Model ()
363
+ model .__dict__ = dict (result [0 ])
364
+ model .__init__ ()
365
+ return model
366
+
367
+ @classmethod
368
+ def find_by_project_and_best_fit (cls , project : Project ):
369
+ """
370
+ Args:
371
+ project (Project): The project
372
+ Returns:
373
+ Model: the model with the best metrics for the project
374
+ """
375
+ if project .objective == "regression" :
376
+ metric = "mean_squared_error"
377
+ elif project .objective == "classification" :
378
+ metric = "f1"
379
+
380
+ result = plpy .execute (
381
+ f"""
382
+ SELECT models.*
383
+ FROM pgml.models
384
+ WHERE project_id = { q (project .id )}
385
+ ORDER by models.metrics->>{ q (metric )} DESC
386
+ LIMIT 1
387
+ """
388
+ )
389
+ if len (result ) == 0 :
390
+ return None
391
+
392
+ model = Model ()
393
+ model .__dict__ = dict (result [0 ])
394
+ model .__init__ ()
395
+ return model
396
+
323
397
def __init__ (self ):
324
398
self ._algorithm = None
325
399
self ._project = None
@@ -342,8 +416,13 @@ def algorithm(self):
342
416
else :
343
417
self ._algorithm = {
344
418
"linear_regression" : LinearRegression ,
419
+ "linear_classification" : LogisticRegression ,
420
+ "svm_regression" : SVR ,
421
+ "svm_classification" : SVC ,
345
422
"random_forest_regression" : RandomForestRegressor ,
346
423
"random_forest_classification" : RandomForestClassifier ,
424
+ "gradient_boosting_trees_regression" : GradientBoostingRegressor ,
425
+ "gradient_boosting_trees_classification" : GradientBoostingClassifier ,
347
426
}[self .algorithm_name + "_" + self .project .objective ]()
348
427
349
428
return self ._algorithm
@@ -362,8 +441,14 @@ def fit(self, snapshot: Snapshot):
362
441
363
442
# Test
364
443
y_pred = self .algorithm .predict (X_test )
365
- msq = mean_squared_error (y_test , y_pred )
366
- r2 = r2_score (y_test , y_pred )
444
+ metrics = {}
445
+ if self .project .objective == "regression" :
446
+ metrics ["mean_squared_error" ] = mean_squared_error (y_test , y_pred )
447
+ metrics ["r2" ] = r2_score (y_test , y_pred )
448
+ elif self .project .objective == "classification" :
449
+ metrics ["f1" ] = f1_score (y_test , y_pred , average = "weighted" )
450
+ metrics ["precision" ] = precision_score (y_test , y_pred , average = "weighted" )
451
+ metrics ["recall" ] = recall_score (y_test , y_pred , average = "weighted" )
367
452
368
453
# Save the model
369
454
self .__dict__ = dict (
@@ -372,8 +457,7 @@ def fit(self, snapshot: Snapshot):
372
457
UPDATE pgml.models
373
458
SET pickle = '\\ x{ pickle .dumps (self .algorithm ).hex ()} ',
374
459
status = 'successful',
375
- mean_squared_error = { q (msq )} ,
376
- r2_score = { q (r2 )}
460
+ metrics = { q (json .dumps (metrics ))}
377
461
WHERE id = { q (self .id )}
378
462
RETURNING *
379
463
"""
@@ -398,6 +482,7 @@ def predict(self, data: list):
398
482
Returns:
399
483
float or int: scores for regressions or ints for classifications
400
484
"""
485
+ # TODO: add metrics for tracking prediction volume/accuracy by model
401
486
return self .algorithm .predict (data )
402
487
403
488
@@ -406,6 +491,7 @@ def train(
406
491
objective : str ,
407
492
relation_name : str ,
408
493
y_column_name : str ,
494
+ algorithm_name : str = "linear" ,
409
495
test_size : float or int = 0.1 ,
410
496
test_sampling : str = "random" ,
411
497
):
@@ -416,15 +502,14 @@ def train(
416
502
objective (str): Defaults to "regression". Valid values are ["regression", "classification"].
417
503
relation_name (str): the table or view that stores the training data
418
504
y_column_name (str): the column in the training data that acts as the label
419
- algorithm (str, optional): the algorithm used to implement the objective. Defaults to "linear". Valid values are ["linear", "random_forest"].
505
+ algorithm_name (str, optional): the algorithm used to implement the objective. Defaults to "linear". Valid values are ["linear", "svm", "random_forest", "gradient_boosting"].
420
506
test_size (float or int, optional): If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. If None, the value is set to the complement of the train size. If train_size is also None, it will be set to 0.25.
421
507
test_sampling: (str, optional): How to sample to create the test data. Defaults to "random". Valid values are ["first", "last", "random"].
422
508
"""
423
- if objective == "regression" :
424
- algorithms = ["linear" , "random_forest" ]
425
- elif objective == "classification" :
426
- algorithms = ["random_forest" ]
427
- else :
509
+ if algorithm_name is None :
510
+ algorithm_name = "linear"
511
+
512
+ if objective not in ["regression" , "classification" ]:
428
513
raise PgMLException (
429
514
f"Unknown objective `{ objective } `, available options are: regression, classification."
430
515
)
@@ -440,23 +525,11 @@ def train(
440
525
)
441
526
442
527
snapshot = Snapshot .create (relation_name , y_column_name , test_size , test_sampling )
443
- deployed = Model .find_deployed (project .id )
444
-
445
- # Let's assume that the deployed model is better for now.
446
- best_model = deployed
447
- best_error = best_model .mean_squared_error if best_model else None
448
-
449
- for algorithm_name in algorithms :
450
- model = Model .create (project , snapshot , algorithm_name )
451
- model .fit (snapshot )
528
+ model = Model .create (project , snapshot , algorithm_name )
529
+ model .fit (snapshot )
452
530
453
- # Find the better model and deploy that.
454
- if best_error is None or model .mean_squared_error < best_error :
455
- best_error = model .mean_squared_error
456
- best_model = model
457
-
458
- if deployed and deployed .id == best_model .id :
459
- return "rolled back"
460
- else :
461
- best_model .deploy ()
531
+ if project .deployed_model is None :
532
+ model .deploy ()
462
533
return "deployed"
534
+ else :
535
+ return "not deployed"
0 commit comments