@@ -42,12 +42,14 @@
 __cache_sentence_transformer_by_name = {}
 __cache_transform_pipeline_by_task = {}

+
 class NumpyJSONEncoder(json.JSONEncoder):
     def default(self, obj):
         if isinstance(obj, np.float32):
             return float(obj)
         return super().default(obj)

+
 def transform(task, args, inputs, cache):
     task = json.loads(task)
     args = json.loads(args)
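A minimal sketch of why the encoder override above matters (background, not part of this commit): the stdlib encoder raises on numpy scalars, which Hugging Face pipelines routinely emit.

    import json
    import numpy as np

    class NumpyJSONEncoder(json.JSONEncoder):
        def default(self, obj):
            # np.float32 values are not JSON-serializable by default
            if isinstance(obj, np.float32):
                return float(obj)
            return super().default(obj)

    json.dumps({"score": np.float32(0.98)}, cls=NumpyJSONEncoder)  # '{"score": 0.98...}'
    # json.dumps({"score": np.float32(0.98)}) raises TypeError instead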
@@ -65,7 +67,8 @@ def transform(task, args, inputs, cache):
     if pipe.task == "question-answering":
         inputs = [json.loads(input) for input in inputs]

-    return json.dumps(pipe(inputs, **args), cls=NumpyJSONEncoder)
+    return json.dumps(pipe(inputs, **args), cls=NumpyJSONEncoder)
+

 def embed(transformer, text, kwargs):
     kwargs = json.loads(kwargs)
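For context on the question-answering branch: each input row arrives as a JSON string (this file's calling convention), while the pipeline call itself is standard transformers usage. A sketch, reusing the NumpyJSONEncoder defined above (the pipeline downloads a default checkpoint):

    import json
    from transformers import pipeline

    pipe = pipeline("question-answering")
    inputs = [json.loads('{"question": "Who wrote it?", "context": "It was written by Ada."}')]
    # scores can come back as np.float32, hence the custom encoder
    print(json.dumps(pipe(inputs), cls=NumpyJSONEncoder))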
@@ -92,7 +95,9 @@ def load_dataset(name, subset, limit: None, kwargs: "{}"):
     kwargs = json.loads(kwargs)

     if limit:
-        dataset = datasets.load_dataset(name, subset, split=f"train[:{limit}]", **kwargs)
+        dataset = datasets.load_dataset(
+            name, subset, split=f"train[:{limit}]", **kwargs
+        )
     else:
         dataset = datasets.load_dataset(name, subset, **kwargs)

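Background on the split string the f-string builds (standard Hugging Face datasets slicing; the dataset name is illustrative):

    import datasets

    # "train[:100]" loads only the first 100 rows of the train split
    sample = datasets.load_dataset("imdb", split="train[:100]")
    assert len(sample) == 100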
@@ -116,26 +121,34 @@ def load_dataset(name, subset, limit: None, kwargs: "{}"):

     return json.dumps({"data": data, "types": types})

+
 def tokenize_text_classification(tokenizer, max_length, x, y):
     encoding = tokenizer(x, padding=True, truncation=True)
     encoding["label"] = y
     return datasets.Dataset.from_dict(encoding.data)

+
 def tokenize_translation(tokenizer, max_length, x, y):
     encoding = tokenizer(x, max_length=max_length, truncation=True, text_target=y)
     return datasets.Dataset.from_dict(encoding.data)

+
 def tokenize_summarization(tokenizer, max_length, x, y):
     encoding = tokenizer(x, max_length=max_length, truncation=True, text_target=y)
     return datasets.Dataset.from_dict(encoding.data)

+
 def tokenize_text_generation(tokenizer, max_length, y):
-    encoding = tokenizer(y, max_length=max_length, truncation=True, padding="max_length")
+    encoding = tokenizer(
+        y, max_length=max_length, truncation=True, padding="max_length"
+    )
     return datasets.Dataset.from_dict(encoding.data)

+
 def tokenize_question_answering(tokenizer, max_length, x, y):
     pass

+
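The tokenize_* helpers above share one pattern: run the tokenizer over the raw lists, then wrap the resulting columns in a datasets.Dataset. A self-contained sketch (the checkpoint name is illustrative):

    import datasets
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    encoding = tokenizer(["good movie", "bad movie"], padding=True, truncation=True)
    encoding["label"] = [1, 0]
    train = datasets.Dataset.from_dict(encoding.data)
    # Dataset({features: ['input_ids', 'attention_mask', 'label'], num_rows: 2})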
 def compute_metrics_summarization(model, tokenizer, hyperparams, x, y):
     all_preds = []
     all_labels = y
@@ -153,7 +166,9 @@ def compute_metrics_summarization(model, tokenizer, hyperparams, x, y):
                 return_token_type_ids=False,
             ).to(model.device)
             predictions = model.generate(**tokens)
-            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+            decoded_preds = tokenizer.batch_decode(
+                predictions, skip_special_tokens=True
+            )
             all_preds.extend(decoded_preds)
     bleu = BLEU().corpus_score(all_preds, [[l] for l in all_labels])
     rouge = Rouge().get_scores(all_preds, all_labels, avg=True)
@@ -167,6 +182,7 @@ def compute_metrics_summarization(model, tokenizer, hyperparams, x, y):
         "rouge_bigram_recall": rouge["rouge-2"]["r"],
     }

+
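For reference, assuming the rouge PyPI package (which matches the get_scores signature used here), avg=True returns one averaged dict per metric, which is where lookups like rouge["rouge-2"]["r"] come from:

    from rouge import Rouge

    scores = Rouge().get_scores(
        ["the cat sat on the mat"], ["a cat was sitting on the mat"], avg=True
    )
    # {'rouge-1': {'r': ..., 'p': ..., 'f': ...}, 'rouge-2': {...}, 'rouge-l': {...}}
    bigram_recall = scores["rouge-2"]["r"]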
 def compute_metrics_text_classification(self, dataset):
     feature = label = None
     for name, type in dataset.features.items():
@@ -183,8 +199,12 @@ def compute_metrics_text_classification(self, dataset):

     with torch.no_grad():
         for i in range(batches):
-            slice = dataset.select(range(i * batch_size, min((i + 1) * batch_size, len(dataset))))
-            tokens = self.tokenizer(slice[feature], padding=True, truncation=True, return_tensors="pt")
+            slice = dataset.select(
+                range(i * batch_size, min((i + 1) * batch_size, len(dataset)))
+            )
+            tokens = self.tokenizer(
+                slice[feature], padding=True, truncation=True, return_tensors="pt"
+            )
             tokens.to(self.model.device)
             result = self.model(**tokens).logits.to("cpu")
             logits = torch.cat((logits, result), 0)
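The slicing arithmetic that keeps reappearing in these loops is plain ceil-division batching; a tiny sketch with illustrative sizes:

    import math

    dataset_len, batch_size = 10, 4
    batches = int(math.ceil(dataset_len / batch_size))  # 3
    for i in range(batches):
        lo, hi = i * batch_size, min((i + 1) * batch_size, dataset_len)
        # slices: [0, 4), [4, 8), [8, 10), the last batch is simply shorter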
@@ -203,12 +223,17 @@ def compute_metrics_text_classification(self, dataset):
     metrics["accuracy"] = accuracy_score(y_test, y_pred)
     metrics["log_loss"] = log_loss(y_test, y_prob)
     roc_auc_y_prob = y_prob
-    if y_prob.shape[1] == 2: # binary classification requires only the greater label be passed to roc_auc_score
+    if (
+        y_prob.shape[1] == 2
+    ):  # binary classification requires only the greater label be passed to roc_auc_score
         roc_auc_y_prob = y_prob[:, 1]
-    metrics["roc_auc"] = roc_auc_score(y_test, roc_auc_y_prob, average="weighted", multi_class="ovo")
+    metrics["roc_auc"] = roc_auc_score(
+        y_test, roc_auc_y_prob, average="weighted", multi_class="ovo"
+    )

     return metrics

+
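Background on the y_prob[:, 1] special case (standard scikit-learn behavior, not specific to this commit): for binary targets roc_auc_score wants the positive-class probabilities as a 1-D array, while multiclass scoring takes the full matrix:

    import numpy as np
    from sklearn.metrics import roc_auc_score

    y_test = np.array([0, 1, 1, 0])
    y_prob = np.array([[0.9, 0.1], [0.2, 0.8], [0.3, 0.7], [0.6, 0.4]])

    roc_auc_score(y_test, y_prob[:, 1])  # ok: shape (n,)
    # roc_auc_score(y_test, y_prob) raises ValueError for binary targets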
 def compute_metrics_translation(model, tokenizer, hyperparams, x, y):
     all_preds = []
     all_labels = y
@@ -226,7 +251,9 @@ def compute_metrics_translation(model, tokenizer, hyperparams, x, y):
                 return_token_type_ids=False,
             ).to(model.device)
             predictions = model.generate(**tokens)
-            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+            decoded_preds = tokenizer.batch_decode(
+                predictions, skip_special_tokens=True
+            )
             all_preds.extend(decoded_preds)
     bleu = BLEU().corpus_score(all_preds, [[l] for l in all_labels])
     rouge = Rouge().get_scores(all_preds, all_labels, avg=True)
@@ -240,13 +267,16 @@ def compute_metrics_translation(model, tokenizer, hyperparams, x, y):
         "rouge_bigram_recall": rouge["rouge-2"]["r"],
     }

+
 def compute_metrics_question_answering(model, tokenizer, hyperparams, x, y):
     batch_size = self.hyperparams["per_device_eval_batch_size"]
     batches = int(math.ceil(len(dataset) / batch_size))

     with torch.no_grad():
         for i in range(batches):
-            slice = dataset.select(range(i * batch_size, min((i + 1) * batch_size, len(dataset))))
+            slice = dataset.select(
+                range(i * batch_size, min((i + 1) * batch_size, len(dataset)))
+            )
             tokens = self.algorithm["tokenizer"].encode_plus(
                 slice["question"], slice["context"], return_tensors="pt"
             )
@@ -255,7 +285,9 @@ def compute_metrics_question_answering(model, tokenizer, hyperparams, x, y):
             answer_start = torch.argmax(outputs[0])
             answer_end = torch.argmax(outputs[1]) + 1
             answer = self.algorithm["tokenizer"].convert_tokens_to_string(
-                self.algorithm["tokenizer"].convert_ids_to_tokens(tokens["input_ids"][0][answer_start:answer_end])
+                self.algorithm["tokenizer"].convert_ids_to_tokens(
+                    tokens["input_ids"][0][answer_start:answer_end]
+                )
             )

     def compute_exact_match(prediction, truth):
@@ -297,6 +329,7 @@ def get_gold_answers(example):

     return metrics

+
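For background on the extractive QA decoding above (the generic transformers pattern; the checkpoint name is illustrative): the model returns start and end logits, and the answer is the token span between their argmaxes:

    import torch
    from transformers import AutoModelForQuestionAnswering, AutoTokenizer

    name = "distilbert-base-cased-distilled-squad"
    tok = AutoTokenizer.from_pretrained(name)
    model = AutoModelForQuestionAnswering.from_pretrained(name)

    tokens = tok("Who wrote it?", "It was written by Ada.", return_tensors="pt")
    with torch.no_grad():
        outputs = model(**tokens)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1
    answer = tok.convert_tokens_to_string(
        tok.convert_ids_to_tokens(tokens["input_ids"][0][answer_start:answer_end])
    )  # e.g. "Ada"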
 def compute_metrics_text_generation(model, tokenizer, hyperparams, y):
     full_text = ""
     for entry in y:
@@ -339,9 +372,8 @@ def compute_metrics_text_generation(model, tokenizer, hyperparams, y):

     perplexity = torch.exp(torch.stack(nlls).sum() / end_loc)

-    return {
-        "perplexity": perplexity
-    }
+    return {"perplexity": perplexity}
+
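As a reminder of the definition behind that line (the standard sliding-window perplexity formula, not specific to this commit): perplexity is the exponentiated average negative log-likelihood per token, with each nlls entry already summed over its window:

    import torch

    nlls = [torch.tensor(2.0), torch.tensor(4.0)]  # illustrative window NLLs
    end_loc = 2                                    # tokens covered
    perplexity = torch.exp(torch.stack(nlls).sum() / end_loc)  # exp(3.0) ≈ 20.1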

 def tune(task, hyperparams, path, x_train, x_test, y_train, y_test):
     hyperparams = json.loads(hyperparams)
@@ -351,7 +383,9 @@ def tune(task, hyperparams, path, x_train, x_test, y_train, y_test):
     algorithm = {}

     if task == "text-classification":
-        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
+        model = AutoModelForSequenceClassification.from_pretrained(
+            model_name, num_labels=2
+        )
         train = tokenize_text_classification(tokenizer, max_length, x_train, y_train)
         test = tokenize_text_classification(tokenizer, max_length, x_test, y_test)
         data_collator = DefaultDataCollator()
@@ -373,15 +407,19 @@ def tune(task, hyperparams, path, x_train, x_test, y_train, y_test):
         model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
         train = tokenize_translation(tokenizer, max_length, x_train, y_train)
         test = tokenize_translation(tokenizer, max_length, x_test, y_test)
-        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")
+        data_collator = DataCollatorForSeq2Seq(
+            tokenizer, model=model, return_tensors="pt"
+        )
     elif task == "text-generation":
         max_length = hyperparams.pop("max_length", None)
         tokenizer.pad_token = tokenizer.eos_token
         model = AutoModelForCausalLM.from_pretrained(model_name)
         model.resize_token_embeddings(len(tokenizer))
         train = tokenize_text_generation(tokenizer, max_length, y_train)
         test = tokenize_text_generation(tokenizer, max_length, y_test)
-        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="pt")
+        data_collator = DataCollatorForLanguageModeling(
+            tokenizer, mlm=False, return_tensors="pt"
+        )
     else:
         raise PgMLException(f"unhandled task type: {task}")
     trainer = Trainer(
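A note on the collator pairings above (standard transformers behavior, stated as background): DataCollatorForSeq2Seq pads labels along with inputs for encoder-decoder models, while DataCollatorForLanguageModeling with mlm=False turns batches into causal-LM examples by copying input_ids into labels. A minimal sketch:

    from transformers import AutoTokenizer, DataCollatorForLanguageModeling

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token  # gpt2 ships without a pad token

    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="pt")
    batch = collator([{"input_ids": tokenizer("hello world")["input_ids"]}])
    # batch["labels"] mirrors batch["input_ids"], with padding positions set to -100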
@@ -402,13 +440,21 @@ def tune(task, hyperparams, path, x_train, x_test, y_train, y_test):
     # Test
     start = time.perf_counter()
     if task == "summarization":
-        metrics = compute_metrics_summarization(model, tokenizer, hyperparams, x_test, y_test)
+        metrics = compute_metrics_summarization(
+            model, tokenizer, hyperparams, x_test, y_test
+        )
     elif task == "text-classification":
-        metrics = compute_metrics_text_classification(model, tokenizer, hyperparams, x_test, y_test)
+        metrics = compute_metrics_text_classification(
+            model, tokenizer, hyperparams, x_test, y_test
+        )
     elif task == "question-answering":
-        metrics = compute_metrics_question_answering(model, tokenizer, hyperparams, x_test, y_test)
+        metrics = compute_metrics_question_answering(
+            model, tokenizer, hyperparams, x_test, y_test
+        )
     elif task == "translation":
-        metrics = compute_metrics_translation(model, tokenizer, hyperparams, x_test, y_test)
+        metrics = compute_metrics_translation(
+            model, tokenizer, hyperparams, x_test, y_test
+        )
     elif task == "text-generation":
         metrics = compute_metrics_text_generation(model, tokenizer, hyperparams, y_test)
     else:
@@ -423,16 +469,19 @@ def tune(task, hyperparams, path, x_train, x_test, y_train, y_test):

     return metrics

+
 class MissingModelError(Exception):
     pass

+
 def get_transformer_by_model_id(model_id):
     global __cache_transformer_by_model_id
     if model_id in __cache_transformer_by_model_id:
         return __cache_transformer_by_model_id[model_id]
     else:
         raise MissingModelError

+
 def load_model(model_id, task, dir):
     if task == "summarization":
         __cache_transformer_by_model_id[model_id] = {
@@ -463,14 +512,15 @@ def load_model(model_id, task, dir):
     else:
         raise Exception(f"unhandled task type: {task}")

+
 def generate(model_id, data, config):
     result = get_transformer_by_model_id(model_id)
     tokenizer = result["tokenizer"]
     model = result["model"]
     config = json.loads(config)
     all_preds = []

-    batch_size = 1 # TODO hyperparams
+    batch_size = 1  # TODO hyperparams
     batches = int(math.ceil(len(data) / batch_size))

     with torch.no_grad():
@@ -485,7 +535,9 @@ def generate(model_id, data, config):
                 return_token_type_ids=False,
             ).to(model.device)
             predictions = model.generate(**tokens, **config)
-            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+            decoded_preds = tokenizer.batch_decode(
+                predictions, skip_special_tokens=True
+            )
             all_preds.extend(decoded_preds)
     return all_preds

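Since the JSON-decoded config is splatted straight into model.generate, callers can pass any standard generation kwargs. A self-contained sketch (checkpoint and options are illustrative):

    import json
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    config = json.loads('{"max_new_tokens": 20, "do_sample": false}')
    tokens = tokenizer("Once upon a time", return_tensors="pt")
    predictions = model.generate(**tokens, **config)
    print(tokenizer.batch_decode(predictions, skip_special_tokens=True))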
@@ -494,9 +546,12 @@ def assign_device(device=None):
     if device is not None:
         if device == "cpu" or "cuda:" in device:
             return device
+        if "cuda" in device and not torch.cuda.is_available():
+            raise Exception("CUDA is not available")

-    device = "cpu"
     if torch.cuda.is_available():
         device = "cuda:" + str(os.getpid() % torch.cuda.device_count())
+    else:
+        device = "cpu"

     return device
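A behavioral sketch of the new assign_device logic (illustrative, not text from the commit): explicit "cpu" or "cuda:N" requests pass through unchanged, a bare "cuda" request now fails fast when no GPU is present, and auto-assignment round-robins worker processes across GPUs by pid:

    assign_device()          # "cuda:{pid % gpu_count}" on a GPU host, else "cpu"
    assign_device("cpu")     # returned unchanged
    assign_device("cuda:1")  # pinned to GPU 1, returned unchanged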