 from datetime import datetime
 
 import datasets
-from InstructorEmbedding import INSTRUCTOR
 import numpy
 import orjson
 from rouge import Rouge
@@ -502,23 +501,17 @@ def transform(task, args, inputs, stream=False):
 
 
 def create_embedding(transformer):
-    instructor = transformer.startswith("hkunlp/instructor")
-    klass = INSTRUCTOR if instructor else SentenceTransformer
-    return klass(transformer)
+    return SentenceTransformer(transformer)
 
 
 def embed_using(model, transformer, inputs, kwargs):
     if isinstance(kwargs, str):
         kwargs = orjson.loads(kwargs)
 
     instructor = transformer.startswith("hkunlp/instructor")
-    if instructor:
-        texts_with_instructions = []
+    if instructor and "instruction" in kwargs:
         instruction = kwargs.pop("instruction")
-        for text in inputs:
-            texts_with_instructions.append([instruction, text])
-
-        inputs = texts_with_instructions
+        kwargs["prompt"] = instruction
 
     return model.encode(inputs, **kwargs)
 
@@ -1029,7 +1022,6 @@ def __init__(
         path: str,
         hyperparameters: dict,
     ) -> None:
-
         # initialize class variables
         self.project_id = project_id
         self.model_id = model_id
@@ -1100,8 +1092,9 @@ def print_number_of_trainable_model_parameters(self, model):
         # Calculate and print the number and percentage of trainable parameters
         r_log("info", f"Trainable model parameters: {trainable_model_params}")
         r_log("info", f"All model parameters: {all_model_params}")
-        r_log("info",
-            f"Percentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"
+        r_log(
+            "info",
+            f"Percentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%",
         )
 
     def tokenize_function(self):
@@ -1396,23 +1389,22 @@ def __init__(
                 "bias": "none",
                 "task_type": "CAUSAL_LM",
             }
-            r_log("info",
+            r_log(
+                "info",
                 "LoRA configuration are not set. Using default parameters"
-                + json.dumps(self.lora_config_params)
+                + json.dumps(self.lora_config_params),
             )
 
         self.prompt_template = None
         if "prompt_template" in hyperparameters.keys():
             self.prompt_template = hyperparameters.pop("prompt_template")
 
     def train(self):
-
         args = TrainingArguments(
             output_dir=self.path, logging_dir=self.path, **self.training_args
         )
 
         def formatting_prompts_func(example):
-
             system_content = example["system"]
             user_content = example["user"]
             assistant_content = example["assistant"]
@@ -1463,7 +1455,7 @@ def formatting_prompts_func(example):
             peft_config=LoraConfig(**self.lora_config_params),
             callbacks=[PGMLCallback(self.project_id, self.model_id)],
         )
-        r_log("info","Creating Supervised Fine Tuning trainer done. Training ... ")
+        r_log("info", "Creating Supervised Fine Tuning trainer done. Training ... ")
 
         # Train
         self.trainer.train()
@@ -1582,7 +1574,6 @@ def finetune_conversation(
     project_id,
     model_id,
 ):
-
     train_dataset = datasets.Dataset.from_dict(
         {
             "system": system_train,
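
A minimal sketch (not part of the commit) of what the rewritten embed_using branch does, assuming sentence-transformers >= 2.4 where encode() accepts a prompt keyword; the checkpoint name and instruction string below are illustrative only.

from sentence_transformers import SentenceTransformer

# Hypothetical example values; any instructor-style checkpoint and instruction work.
model = SentenceTransformer("hkunlp/instructor-base")
inputs = ["PostgresML brings machine learning into Postgres."]
kwargs = {"instruction": "Represent the document for retrieval: "}

# Mirrors the new branch: pop the instruction and hand it to encode() as `prompt`,
# instead of manually building [instruction, text] pairs for the removed INSTRUCTOR class.
if "instruction" in kwargs:
    kwargs["prompt"] = kwargs.pop("instruction")

embeddings = model.encode(inputs, **kwargs)
print(embeddings.shape)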