postgresml
diff --git a/‎.gitignore
Lines changed: 1 addition & 0 deletions b/‎.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/‎pgml-dashboard/src/models.rs
Lines changed: 8 additions & 2 deletions b/‎pgml-dashboard/src/models.rs
Lines changed: 8 additions & 2 deletions
diff --git a/‎pgml-docs/docs/user_guides/transformers/fine_tuning.md
Lines changed: 77 additions & 7 deletions b/‎pgml-docs/docs/user_guides/transformers/fine_tuning.md
Lines changed: 77 additions & 7 deletions
@@ -161,3 +161,4 @@ cython_debug/
 
 # local scratch pad
 scratch.sql
+scratch.py
@@ -60,16 +60,22 @@ impl Project {
 
     pub fn key_metric_name(&self) -> anyhow::Result<&'static str> {
         match self.task.as_ref().unwrap().as_str() {
-            "classification" | "text-classification" => Ok("f1"),
+            "classification" | "text_classification" | "question_answering" => Ok("f1"),
             "regression" => Ok("r2"),
+            "summarization" => Ok("rouge_ngram_f1"),
+            "translation" => Ok("bleu"),
+            "text_generation" | "text2text" => Ok("perplexity"),
             task => Err(anyhow::anyhow!("Unhandled task: {}", task)),
         }
     }
 
     pub fn key_metric_display_name(&self) -> anyhow::Result<&'static str> {
         match self.task.as_ref().unwrap().as_str() {
-            "classification" | "text-classification" => Ok("F<sup>1</sup>"),
+            "classification" | "text_classification" | "question_answering" => Ok("F<sup>1</sup>"),
             "regression" => Ok("R<sup>2</sup>"),
+            "summarization" => Ok("Rouge Ngram F<sup>1</sup>"),
+            "translation" => Ok("Bleu"),
+            "text_generation" | "text2text" => Ok("Perplexity"),
             task => Err(anyhow::anyhow!("Unhandled task: {}", task)),
         }
     }
 
@@ -34,18 +34,63 @@ You can view the newly loaded data in your Postgres database:
     103 | {"en": "ROLES_OF_TRANSLATORS", "es": "Rafael Osuna rosuna@wol. es Traductor"}
     (5 rows)
     ```
+This Hugging Face dataset stores each translation as language key/value pairs in a JSON document. To use it with PostgresML, we'll need to provide a `VIEW` that restructures the data into columns with primitive types.
+
+=== "SQL"
+
+    ```sql linenums="1"
+    CREATE OR REPLACE VIEW kde4_en_to_es AS
+    SELECT translation->>'en' AS "en", translation->>'es' AS "es"
+    FROM pgml.kde4
+    LIMIT 10;
+    ```
+
+=== "Result"
+
+    ```sql  linenums="1"
+    CREATE VIEW
+    ```
+
+Now we can see the data in a more normalized form. The exact column names don't matter for now; we'll specify which one is the target during the training call, and the other one will be used as the input.
+
+=== "SQL"
+
+    ```sql linenums="1"
+    SELECT * FROM kde4_en_to_es LIMIT 10;
+    ```
+
+=== "Result"
+
+    ```sql  linenums="1"
+                                                en                                            |                                                   es
+    
+    --------------------------------------------------------------------------------------------+--------------------------------------------------------------------------
+    ------------------------------
+     Lauri Watts                                                                                | Lauri Watts
+     & Lauri. Watts. mail;                                                                      | & Lauri. Watts. mail;
+     ROLES_OF_TRANSLATORS                                                                       | Rafael Osuna rosuna@wol. es Traductor Miguel Revilla Rodríguez yo@miguelr
+    evilla. com Traductor
+     2006-02-26 3.5.1                                                                           | 2006-02-26 3.5.1
+     The Babel & konqueror; plugin gives you quick access to the Babelfish translation service. | La extensión Babel de & konqueror; le permite un acceso rápido al servici
+    o de traducción de Babelfish.
+     KDE                                                                                        | KDE
+     kdeaddons                                                                                  | kdeaddons
+     konqueror                                                                                  | konqueror
+     plugins                                                                                    | extensiones
+     babelfish                                                                                  | babelfish
+    (10 rows)
+    ```
 
-When you're constructing your own datasets for translation, it's important to mirror the same table structure. You'll need a `JSONB` column named `translation`, that has first has a "from" language name/value pair, and then a "to" language name/value pair. In this English to Spanish example we use from "en" to "es". You'll pass a `y_column_name` of `translation` to tune the model.
 
 ### Tune the model
 Tuning is very similar to training with PostgresML, although we specify a `model_name` to download from Hugging Face instead of the base `algorithm`.
 
 ```sql linenums="1" title="tune.sql"
 SELECT pgml.tune(
     'Translate English to Spanish',
-    task => 'translation_en_to_es',
-    relation_name => 'pgml.kde4',
-    y_column_name => 'translation',
+    task => 'translation',
+    relation_name => 'kde4_en_to_es',
+    y_column_name => 'es', -- translate into spanish
     model_name => 'Helsinki-NLP/opus-mt-en-es',
     hyperparams => '{
         "learning_rate": 2e-5,
@@ -289,7 +334,8 @@ Or, it might be interesting to concat the title to the text field to see how rel
 
 ```sql linenums="1" title="concat_title.sql"
 CREATE OR REPLACE VIEW billsum_training_data
-AS SELECT title || '\n' || "text" AS "text", summary FROM pgml.billsum;
+AS SELECT title || '\n' || "text" AS "text", summary FROM pgml.billsum 
+LIMIT 10;
 ```
 
 
@@ -310,14 +356,14 @@ SELECT pgml.tune(
         "per_device_eval_batch_size": 2,
         "num_train_epochs": 1,
         "weight_decay": 0.01,
-        "max_input_length": 1024,
-        "max_summary_length": 128
+        "max_length": 1024
     }',
     test_size => 0.2,
     test_sampling => 'last'
 );
 ```
 
+
 ### Make predictions
 
 === "SQL"
@@ -355,3 +401,27 @@ The default for predict in a classification problem classifies the statement as
 This shows that there is a 6.26% chance for category 0 (negative sentiment), and a 93.73% chance it's category 1 (positive sentiment).
 
 See the [task documentation](https://huggingface.co/tasks/text-classification) for more examples, use cases, models and datasets.
+
+
+
+## Text Generation
+
+```postgresql linenums="1"
+    SELECT pgml.load_dataset('bookcorpus', "limit" => 100);
+    
+    SELECT pgml.tune(
+        'GPT Generator',
+        task => 'text-generation',
+        relation_name => 'pgml.bookcorpus',
+        y_column_name => 'text',
+        model_name => 'gpt2',
+        hyperparams => '{
+            "learning_rate": 2e-5,
+            "num_train_epochs": 1
+        }',
+        test_size => 0.2,
+        test_sampling => 'last'
+    );  
+    
+    SELECT pgml.generate('GPT Generator', 'While I wandered weak and weary');
+```
Original file line number	Diff line number	Diff line change
`@@ -161,3 +161,4 @@ cython_debug/`
`161`	`161`
`162`	`162`	`# local scratch pad`
`163`	`163`	`scratch.sql`
	`164`	`+scratch.py`
Original file line number	Diff line number	Diff line change
`@@ -60,16 +60,22 @@ impl Project {`
`60`	`60`
`61`	`61`	`pub fn key_metric_name(&self) -> anyhow::Result<&'static str> {`
`62`	`62`	`match self.task.as_ref().unwrap().as_str() {`
`63`		`- "classification" \| "text-classification" => Ok("f1"),`
	`63`	`+ "classification" \| "text_classification" \| "question_answering" => Ok("f1"),`
`64`	`64`	`"regression" => Ok("r2"),`
	`65`	`+ "summarization" => Ok("rouge_ngram_f1"),`
	`66`	`+ "translation" => Ok("bleu"),`
	`67`	`+ "text_generation" \| "text2text" => Ok("perplexity"),`
`65`	`68`	`task => Err(anyhow::anyhow!("Unhandled task: {}", task)),`
`66`	`69`	`}`
`67`	`70`	`}`
`68`	`71`
`69`	`72`	`pub fn key_metric_display_name(&self) -> anyhow::Result<&'static str> {`
`70`	`73`	`match self.task.as_ref().unwrap().as_str() {`
`71`		`- "classification" \| "text-classification" => Ok("F<sup>1</sup>"),`
	`74`	`+ "classification" \| "text_classification" \| "question_answering" => Ok("F<sup>1</sup>"),`
`72`	`75`	`"regression" => Ok("R<sup>2</sup>"),`
	`76`	`+ "summarization" => Ok("Rouge Ngram F<sup>1</sup>"),`
	`77`	`+ "translation" => Ok("Bleu"),`
	`78`	`+ "text_generation" \| "text2text" => Ok("Perplexity"),`
`73`	`79`	`task => Err(anyhow::anyhow!("Unhandled task: {}", task)),`
`74`	`80`	`}`
`75`	`81`	`}`