Skip to content

Commit a8d8218

Browse files
authored
fix and test preprocessing examples (#1520)
1 parent c3a8514 commit a8d8218

File tree

14 files changed

+70
-26
lines changed

14 files changed

+70
-26
lines changed

.github/workflows/ubuntu-packages-and-docker-image.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ on:
44
workflow_dispatch:
55
inputs:
66
packageVersion:
7-
default: "2.8.2"
7+
default: "2.9.1"
88
jobs:
99
#
1010
# PostgresML extension.

pgml-cms/docs/resources/developer-docs/contributing.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ SELECT pgml.version();
127127
postgres=# select pgml.version();
128128
version
129129
-------------------
130-
2.7.4
130+
2.9.1
131131
(1 row)
132132
```
133133
{% endtab %}

pgml-cms/docs/resources/developer-docs/installation.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ CREATE EXTENSION
132132
pgml_test=# SELECT pgml.version();
133133
version
134134
---------
135-
2.7.4
135+
2.9.1
136136
(1 row)
137137
```
138138

pgml-cms/docs/resources/developer-docs/quick-start-with-docker.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ Time: 41.520 ms
8080
postgresml=# SELECT pgml.version();
8181
version
8282
---------
83-
2.7.13
83+
2.9.1
8484
(1 row)
8585
```
8686

pgml-cms/docs/resources/developer-docs/self-hosting/pooler.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,6 @@ Type "help" for help.
115115
postgresml=> SELECT pgml.version();
116116
version
117117
---------
118-
2.7.9
118+
2.9.1
119119
(1 row)
120120
```

pgml-extension/Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pgml-extension/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "pgml"
3-
version = "2.9.0"
3+
version = "2.9.1"
44
edition = "2021"
55

66
[lib]
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
-- load the diamonds dataset, that contains text categorical variables
2+
SELECT pgml.load_dataset('jdxcosta/diamonds');
3+
4+
-- view the data
5+
SELECT * FROM pgml."jdxcosta/diamonds" LIMIT 10;
6+
7+
-- drop the "Unnamed: 0" column, since it's not useful for training (you could create a view instead)
8+
ALTER TABLE pgml."jdxcosta/diamonds" DROP COLUMN "Unnamed: 0";
9+
10+
-- train a model using preprocessors to scale the numeric variables, and target encode the categoricals
11+
SELECT pgml.train(
12+
project_name => 'Diamond prices',
13+
task => 'regression',
14+
relation_name => 'pgml.jdxcosta/diamonds',
15+
y_column_name => 'price',
16+
algorithm => 'lightgbm',
17+
preprocess => '{
18+
"carat": {"scale": "standard"},
19+
"depth": {"scale": "standard"},
20+
"table": {"scale": "standard"},
21+
"cut": {"encode": "target", "scale": "standard"},
22+
"color": {"encode": "target", "scale": "standard"},
23+
"clarity": {"encode": "target", "scale": "standard"}
24+
}'
25+
);
26+
27+
-- run some predictions; notice we're passing a heterogeneous row (tuple) as input, rather than a homogeneous ARRAY[].
28+
SELECT price, pgml.predict('Diamond prices', (carat, cut, color, clarity, depth, "table", x, y, z)) AS prediction
29+
FROM pgml."jdxcosta/diamonds"
30+
LIMIT 10;
31+
32+
-- This is a difficult dataset for most algorithms, which makes it a good challenge for preprocessing and additional
33+
-- feature engineering. What's next?

pgml-extension/sql/pgml--2.9.0--2.9.1.sql

Whitespace-only changes.

pgml-extension/src/bindings/transformers/mod.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,7 @@ pub fn load_dataset(
380380
.ok_or(anyhow!("dataset `data` key is not an object"))?;
381381
let column_names = types
382382
.iter()
383-
.map(|(name, _type)| name.clone())
383+
.map(|(name, _type)| format!("\"{}\"", name))
384384
.collect::<Vec<String>>()
385385
.join(", ");
386386
let column_types = types
@@ -393,13 +393,14 @@ pub fn load_dataset(
393393
"int64" => "INT8",
394394
"int32" => "INT4",
395395
"int16" => "INT2",
396+
"int8" => "INT2",
396397
"float64" => "FLOAT8",
397398
"float32" => "FLOAT4",
398399
"float16" => "FLOAT4",
399400
"bool" => "BOOLEAN",
400401
_ => bail!("unhandled dataset feature while reading dataset: {type_}"),
401402
};
402-
Ok(format!("{name} {type_}"))
403+
Ok(format!("\"{name}\" {type_}"))
403404
})
404405
.collect::<Result<Vec<String>>>()?
405406
.join(", ");
@@ -455,7 +456,7 @@ pub fn load_dataset(
455456
.into_datum(),
456457
)),
457458
"dict" | "list" => row.push((PgBuiltInOids::JSONBOID.oid(), JsonB(value.clone()).into_datum())),
458-
"int64" | "int32" | "int16" => row.push((
459+
"int64" | "int32" | "int16" | "int8" => row.push((
459460
PgBuiltInOids::INT8OID.oid(),
460461
value
461462
.as_i64()

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy