Skip to content

Refactor the initialization of GUC parameters. #1360

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pgml-extension/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pgml-extension/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ serde = { version = "1.0" }
serde_json = { version = "1.0", features = ["preserve_order"] }
typetag = "0.2"
xgboost = { git = "https://github.com/postgresml/rust-xgboost", branch = "master" }
lazy_static = "1.4.0"

[dev-dependencies]
pgrx-tests = "=0.11.3"
Expand Down
8 changes: 3 additions & 5 deletions pgml-extension/src/bindings/python/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,9 @@ use pgrx::*;
use pyo3::prelude::*;
use pyo3::types::PyTuple;

use crate::config::get_config;
use crate::config::PGML_VENV;
use crate::create_pymodule;

static CONFIG_NAME: &str = "pgml.venv";

create_pymodule!("/src/bindings/python/python.py");

pub fn activate_venv(venv: &str) -> Result<bool> {
Expand All @@ -23,8 +21,8 @@ pub fn activate_venv(venv: &str) -> Result<bool> {
}

pub fn activate() -> Result<bool> {
match get_config(CONFIG_NAME) {
Some(venv) => activate_venv(&venv),
match PGML_VENV.1.get() {
Some(venv) => activate_venv(&venv.to_string_lossy()),
None => Ok(false),
}
}
Expand Down
55 changes: 31 additions & 24 deletions pgml-extension/src/bindings/transformers/whitelist.rs
Original file line number Diff line number Diff line change
@@ -1,47 +1,54 @@
use anyhow::{bail, Error};
use pgrx::GucSetting;
#[cfg(any(test, feature = "pg_test"))]
use pgrx::{pg_schema, pg_test};
use serde_json::Value;
use std::ffi::CStr;

use crate::config::get_config;

static CONFIG_HF_WHITELIST: &str = "pgml.huggingface_whitelist";
static CONFIG_HF_TRUST_REMOTE_CODE_BOOL: &str = "pgml.huggingface_trust_remote_code";
static CONFIG_HF_TRUST_WHITELIST: &str = "pgml.huggingface_trust_remote_code_whitelist";
use crate::config::{PGML_HF_TRUST_REMOTE_CODE, PGML_HF_TRUST_WHITELIST, PGML_HF_WHITELIST};

/// Verify that the model in the task JSON is allowed based on the huggingface whitelists.
pub fn verify_task(task: &Value) -> Result<(), Error> {
let task_model = match get_model_name(task) {
Some(model) => model.to_string(),
None => return Ok(()),
};
let whitelisted_models = config_csv_list(CONFIG_HF_WHITELIST);
let whitelisted_models = config_csv_list(&PGML_HF_WHITELIST.1);

let model_is_allowed = whitelisted_models.is_empty() || whitelisted_models.contains(&task_model);
if !model_is_allowed {
bail!("model {task_model} is not whitelisted. Consider adding to {CONFIG_HF_WHITELIST} in postgresql.conf");
bail!(
"model {} is not whitelisted. Consider adding to {} in postgresql.conf",
task_model,
PGML_HF_WHITELIST.0
);
}

let task_trust = get_trust_remote_code(task);
let trust_remote_code = get_config(CONFIG_HF_TRUST_REMOTE_CODE_BOOL)
.map(|v| v == "true")
.unwrap_or(true);
let trust_remote_code = PGML_HF_TRUST_REMOTE_CODE.1.get();

let trusted_models = config_csv_list(CONFIG_HF_TRUST_WHITELIST);
let trusted_models = config_csv_list(&PGML_HF_TRUST_WHITELIST.1);

let model_is_trusted = trusted_models.is_empty() || trusted_models.contains(&task_model);

let remote_code_allowed = trust_remote_code && model_is_trusted;
if !remote_code_allowed && task_trust == Some(true) {
bail!("model {task_model} is not trusted to run remote code. Consider setting {CONFIG_HF_TRUST_REMOTE_CODE_BOOL} = 'true' or adding {task_model} to {CONFIG_HF_TRUST_WHITELIST}");
bail!(
"model {} is not trusted to run remote code. Consider setting {} = 'true' or adding {} to {}",
task_model,
PGML_HF_TRUST_REMOTE_CODE.0,
task_model,
PGML_HF_TRUST_WHITELIST.0
);
}

Ok(())
}

fn config_csv_list(name: &str) -> Vec<String> {
match get_config(name) {
fn config_csv_list(csv_list: &GucSetting<Option<&'static CStr>>) -> Vec<String> {
match csv_list.get() {
Some(value) => value
.to_string_lossy()
.trim_matches('"')
.split(',')
.filter_map(|s| if s.is_empty() { None } else { Some(s.to_string()) })
Expand Down Expand Up @@ -122,7 +129,7 @@ mod tests {
#[pg_test]
fn test_empty_whitelist() {
let model = "Salesforce/xgen-7b-8k-inst";
set_config(CONFIG_HF_WHITELIST, "").unwrap();
set_config(PGML_HF_WHITELIST.0, "").unwrap();
let task_json = format!(json_template!(), model, false);
let task: Value = serde_json::from_str(&task_json).unwrap();
assert!(verify_task(&task).is_ok());
Expand All @@ -131,12 +138,12 @@ mod tests {
#[pg_test]
fn test_nonempty_whitelist() {
let model = "Salesforce/xgen-7b-8k-inst";
set_config(CONFIG_HF_WHITELIST, model).unwrap();
set_config(PGML_HF_WHITELIST.0, model).unwrap();
let task_json = format!(json_template!(), model, false);
let task: Value = serde_json::from_str(&task_json).unwrap();
assert!(verify_task(&task).is_ok());

set_config(CONFIG_HF_WHITELIST, "other_model").unwrap();
set_config(PGML_HF_WHITELIST.0, "other_model").unwrap();
let task_json = format!(json_template!(), model, false);
let task: Value = serde_json::from_str(&task_json).unwrap();
assert!(verify_task(&task).is_err());
Expand All @@ -145,18 +152,18 @@ mod tests {
#[pg_test]
fn test_trusted_model() {
let model = "Salesforce/xgen-7b-8k-inst";
set_config(CONFIG_HF_WHITELIST, model).unwrap();
set_config(CONFIG_HF_TRUST_WHITELIST, model).unwrap();
set_config(PGML_HF_WHITELIST.0, model).unwrap();
set_config(PGML_HF_TRUST_WHITELIST.0, model).unwrap();

let task_json = format!(json_template!(), model, false);
let task: Value = serde_json::from_str(&task_json).unwrap();
assert!(verify_task(&task).is_ok());

let task_json = format!(json_template!(), model, true);
let task: Value = serde_json::from_str(&task_json).unwrap();
assert!(verify_task(&task).is_ok());
assert!(verify_task(&task).is_err());
Copy link
Contributor Author

@higuoxing higuoxing Mar 6, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the commit that first introduced this test, this step was expected to fail with `RemoteCodeNotTrusted`:

assert_eq!(
verify_task_against_whitelist(&task),
Err(WhitelistError::RemoteCodeNotTrusted)
);


set_config(CONFIG_HF_TRUST_REMOTE_CODE_BOOL, "true").unwrap();
set_config(PGML_HF_TRUST_REMOTE_CODE.0, "true").unwrap();
let task_json = format!(json_template!(), model, false);
let task: Value = serde_json::from_str(&task_json).unwrap();
assert!(verify_task(&task).is_ok());
Expand All @@ -169,8 +176,8 @@ mod tests {
#[pg_test]
fn test_untrusted_model() {
let model = "Salesforce/xgen-7b-8k-inst";
set_config(CONFIG_HF_WHITELIST, model).unwrap();
set_config(CONFIG_HF_TRUST_WHITELIST, "other_model").unwrap();
set_config(PGML_HF_WHITELIST.0, model).unwrap();
set_config(PGML_HF_TRUST_WHITELIST.0, "other_model").unwrap();

let task_json = format!(json_template!(), model, false);
let task: Value = serde_json::from_str(&task_json).unwrap();
Expand All @@ -180,7 +187,7 @@ mod tests {
let task: Value = serde_json::from_str(&task_json).unwrap();
assert!(verify_task(&task).is_err());

set_config(CONFIG_HF_TRUST_REMOTE_CODE_BOOL, "true").unwrap();
set_config(PGML_HF_TRUST_REMOTE_CODE.0, "true").unwrap();
let task_json = format!(json_template!(), model, false);
let task: Value = serde_json::from_str(&task_json).unwrap();
assert!(verify_task(&task).is_ok());
Expand Down
66 changes: 51 additions & 15 deletions pgml-extension/src/config.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,58 @@
use lazy_static::lazy_static;
use pgrx::{GucContext, GucFlags, GucRegistry, GucSetting};
use std::ffi::CStr;

#[cfg(any(test, feature = "pg_test"))]
use pgrx::{pg_schema, pg_test};
use pgrx_pg_sys::AsPgCStr;

pub fn get_config(name: &str) -> Option<String> {
// SAFETY: name is not null because it is a Rust reference.
let ptr = unsafe { pgrx_pg_sys::GetConfigOption(name.as_pg_cstr(), true, false) };
(!ptr.is_null()).then(move || {
// SAFETY: assuming pgrx_pg_sys is providing a valid, null terminated pointer.
unsafe { CStr::from_ptr(ptr) }.to_string_lossy().to_string()
})
lazy_static! {
pub static ref PGML_VENV: (&'static str, GucSetting<Option<&'static CStr>>) =
("pgml.venv", GucSetting::<Option<&'static CStr>>::new(None));
pub static ref PGML_HF_WHITELIST: (&'static str, GucSetting<Option<&'static CStr>>) = (
"pgml.huggingface_whitelist",
GucSetting::<Option<&'static CStr>>::new(None),
);
pub static ref PGML_HF_TRUST_REMOTE_CODE: (&'static str, GucSetting<bool>) =
("pgml.huggingface_trust_remote_code", GucSetting::<bool>::new(false));
pub static ref PGML_HF_TRUST_WHITELIST: (&'static str, GucSetting<Option<&'static CStr>>) = (
"pgml.huggingface_trust_remote_code_whitelist",
GucSetting::<Option<&'static CStr>>::new(None),
);
}

pub fn initialize_server_params() {
GucRegistry::define_string_guc(
PGML_VENV.0,
"Python's virtual environment path",
"",
&PGML_VENV.1,
GucContext::Userset,
GucFlags::default(),
);
GucRegistry::define_string_guc(
PGML_HF_WHITELIST.0,
"Models allowed to be downloaded from huggingface",
"",
&PGML_HF_WHITELIST.1,
GucContext::Userset,
GucFlags::default(),
);
GucRegistry::define_bool_guc(
PGML_HF_TRUST_REMOTE_CODE.0,
"Whether model can execute remote codes",
"",
&PGML_HF_TRUST_REMOTE_CODE.1,
GucContext::Userset,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we make these GUCs SUSET (settable only by superusers) instead of USERSET?

SUSET options can be set at postmaster startup, with the SIGHUP mechanism, or from the startup packet or SQL if you're a superuser.

https://github.com/postgres/postgres/blob/d93627bcbe5001750e7611f0e637200e2d81dcff/src/include/utils/guc.h#L74

GucFlags::default(),
);
// Register the string GUC named by `PGML_HF_TRUST_WHITELIST.0`, backed by the
// setting in `PGML_HF_TRUST_WHITELIST.1`.
// The description must refer to the boolean GUC by its registered name,
// "pgml.huggingface_trust_remote_code" (no underscore inside "huggingface"),
// so that users can find the setting the help text points at.
GucRegistry::define_string_guc(
    PGML_HF_TRUST_WHITELIST.0,
    "Models allowed to execute remote codes when pgml.huggingface_trust_remote_code = 'on'",
    "",
    &PGML_HF_TRUST_WHITELIST.1,
    GucContext::Userset,
    GucFlags::default(),
);
}

#[cfg(any(test, feature = "pg_test"))]
Expand All @@ -26,17 +68,11 @@ pub fn set_config(name: &str, value: &str) -> Result<(), pgrx::spi::Error> {
mod tests {
use super::*;

#[pg_test]
fn read_config_max_connections() {
let name = "max_connections";
assert_eq!(get_config(name), Some("100".into()));
}

#[pg_test]
fn read_pgml_huggingface_whitelist() {
let name = "pgml.huggingface_whitelist";
let value = "meta-llama/Llama-2-7b";
set_config(name, value).unwrap();
assert_eq!(get_config(name), Some(value.into()));
assert_eq!(PGML_HF_WHITELIST.1.get().unwrap().to_string_lossy(), value);
}
}
1 change: 1 addition & 0 deletions pgml-extension/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ extension_sql_file!("../sql/schema.sql", name = "schema");
#[cfg(not(feature = "use_as_lib"))]
#[pg_guard]
pub extern "C" fn _PG_init() {
config::initialize_server_params();
bindings::python::activate().expect("Error setting python venv");
orm::project::init();
}
Expand Down
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy