diff --git a/build/build_all.py b/build/build_all.py index 10b7bc55..7004dd74 100644 --- a/build/build_all.py +++ b/build/build_all.py @@ -40,7 +40,7 @@ def main(): parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.") parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands") parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.") - parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo,bladderpdo,sarcpdo,liverpdo',help='Datasets to process. Defaults to all available.') + parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,pancpdo,bladderpdo,sarcpdo,liverpdo,mpnst',help='Datasets to process. Defaults to all available.') parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. This must be a higher version than previously published versions.') parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.') parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.') @@ -119,7 +119,6 @@ def process_docker(datasets): 'hcmi': ['hcmi'], 'beataml': ['beataml'], 'mpnst': ['mpnst'], - 'mpnstpdx': ['mpnstpdx'], 'pancpdo': ['pancpdo'], 'bladderpdo': ['bladderpdo'], 'sarcpdo': ['sarcpdo'], @@ -410,7 +409,7 @@ def get_latest_commit_hash(owner, repo, branch='main'): # if args.figshare or args.validate: # FigShare File Prefixes: - prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'mpnstpdx', 'pancpdo', 'bladderpdo','sarcpdo', 'genes', 'drugs', 'liverpdo'] + prefixes = ['beataml', 'hcmi', 'cptac', 'pancpdo', 'bladderpdo','sarcpdo', 'genes', 'drugs', 'liverpdo','mpnst'] broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"] if "broad_sanger" in datasets: prefixes.extend(broad_sanger_datasets) diff --git a/build/build_dataset.py b/build/build_dataset.py index 7904a43e..780b583b 100644 --- a/build/build_dataset.py +++ b/build/build_dataset.py @@ -41,7 +41,6 @@ def process_docker(dataset,validate): 'hcmi': ['hcmi'], 'beataml': ['beataml'], 'mpnst': ['mpnst'], - 'mpnstpdx': ['mpnstpdx'], 'pancpdo': ['pancpdo'], 'cptac': ['cptac'], 'sarcpdo': ['sarcpdo'], @@ -128,7 +127,6 @@ def process_omics(executor, dataset, should_continue): 'broad_sanger': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'], 'cptac': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'], 'hcmi': ['mutations', 'transcriptomics'], - 'mpnstpdx':['copy_number', 'mutations', 'proteomics', 'transcriptomics'], 'sarcpdo': ['mutations', 'transcriptomics'], 'pancpdo': ['transcriptomics'], 'bladderpdo': ['copy_number', 'mutations', 'transcriptomics'], diff --git a/build/mpnst/00_sample_gen.R b/build/mpnst/00_sample_gen.R old mode 100755 new mode 100644 index 0ec5704b..db1f238e --- a/build/mpnst/00_sample_gen.R +++ b/build/mpnst/00_sample_gen.R @@ -1,5 +1,4 @@ -# This script generate a new sample table based on pervious beatAML improved sample ID -# It will take the 
maximum value of beatAML improved sample ID and continue from ID count from there +# This script generate a new sample table based on previous dataset's sample file (taking the max improve_sample_id) # Load required libraries library(data.table) library(synapser) @@ -11,14 +10,12 @@ if(length(args) > 1 ){ stop("Up to one argument is allowed. This is the filepath to the previously run samples file.") } - if (length(args) == 0 || is.na(args[1]) || args[1] == "" || !file.exists(args[1])) { orig_samples <- "" } else { orig_samples <- fread(args[1]) } - # Check if Synapse token is available from the environment synapse_token <- Sys.getenv("SYNAPSE_AUTH_TOKEN") if (synapse_token == "") { @@ -29,6 +26,10 @@ synapser::synLogin(authToken=synapse_token) manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> as.data.frame() +#Drop contaminated sample JH-2-009 +manifest <- manifest %>% + filter(Sample != "JH-2-009") + ###sample file has a strict schema ## - improve_sample_id @@ -62,9 +63,6 @@ main<-rbind(sampTable,pdxmt)|> dplyr::select(-MicroTissueDrugFolder)|> rbind(tumorTable) -#main <- fread("mpnst/NF_MPNST_samples.csv") -#previous_aml <- fread(args[1])#"beatAML/beataml_samples.csv") - # If there is no previous samples file - start at 1, else, continue where the previous one left off. if (identical(orig_samples, "")) { max_id <- 1 @@ -72,21 +70,6 @@ if (identical(orig_samples, "")) { max_id <- max(orig_samples$improve_sample_id, na.rm = TRUE) } - main$improve_sample_id <- seq(from = max_id + 1, length.out = nrow(main)) -#synapse_main <- fread("mpnst/synapse_NF-MPNST_samples.csv") -# Step 1: Create a dictionary from 'main' -#id_dict <- setNames(main$improve_sample_id, main$other_id) - -# Step 2: Update 'ID' in 'synapse_main' -#synapse_main$ID <- id_dict[synapse_main$Sample] - -# Handling NA values if any mismatch occurs (Optional based on your data integrity) -# If there are NAs generated, you might need to check for unmatched keys -# synapse_main$ID[is.na(synapse_main$ID)] <- -1 # Assign a placeholder like -1 for unmatched rows - -# Step 3: Save the updated 'synapse_main' -#fwrite(synapse_main, "mpnst/synapse_NF-MPNST_samples.csv") -#fwrite(main, "mpnst/NF_MPNST_samples.csv") # updated sample file fwrite(main,'/tmp/mpnst_samples.csv') diff --git a/build/mpnst/01_combined_omics.R b/build/mpnst/01_combined_omics.R new file mode 100644 index 00000000..dcbdfbae --- /dev/null +++ b/build/mpnst/01_combined_omics.R @@ -0,0 +1,246 @@ +#!/usr/bin/env Rscript + +# Combined MPNST & MPNST-PDX Data Extraction Script +# This script unifies data extraction for PDX, Tumor, and Xenograft-Derived Organoid samples. + +# Load required libraries +library(data.table) +library(synapser) +library(dplyr) +library(tidyr) + +# Retrieve command line arguments +args <- commandArgs(trailingOnly = TRUE) +if (length(args) < 3) { + stop("Usage: Rscript 01_combined_omics.R ", call. = FALSE) +} +PAT <- args[1] +samples <- args[2] +genes <- args[3] + +# Log in to Synapse +token <- PAT +synLogin(authToken = token) + +# Read sample mapping and gene mapping +samples_df <- fread(samples) %>% + select(improve_sample_id, common_name, model_type) %>% + distinct() +genes_df <- fread(genes) + +# Subset by model type +pdx_samps <- filter(samples_df, model_type == "patient derived xenograft") +tumor_samps<- filter(samples_df, model_type == "tumor") +mt_samps <- filter(samples_df, model_type == "xenograft derived organoid") # These end up being the same as pdx_samps in the manifest. 
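# --- Illustrative aside (editor's sketch, not part of the patch) ----------------
# The pdx/tumor/mt blocks below all follow the same pattern: left_join() the
# manifest to one model-type subset by common_name, then drop manifest rows with
# no matching sample (improve_sample_id comes back NA). Toy example using
# hypothetical sample names and IDs:
toy_manifest <- data.frame(common_name = c("sample_A", "sample_B"))
toy_subset   <- data.frame(common_name = "sample_A", improve_sample_id = 101)
dplyr::left_join(toy_manifest, toy_subset, by = "common_name") |>
  dplyr::filter(!is.na(improve_sample_id))   # keeps only sample_A
# ---------------------------------------------------------------------------------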
+ +# Retrieve manifest table from Synapse +manifest <- synTableQuery("select * from syn53503360")$asDataFrame() %>% + rename(common_name = Sample) + +# Build sample tables +pdx_data <- manifest %>% + select(common_name, starts_with("PDX")) %>% + left_join(pdx_samps, by = "common_name") %>% + select(improve_sample_id, common_name, model_type, + RNASeq = PDX_RNASeq, + Mutations = PDX_Somatic_Mutations, + CopyNumber = PDX_CNV, + Proteomics = PDX_Proteomics) %>% + filter(!is.na(improve_sample_id)) + +tumor_data <- manifest %>% + select(common_name, starts_with("Tumor")) %>% + left_join(tumor_samps, by = "common_name") %>% + select(improve_sample_id, common_name, model_type, + RNASeq = Tumor_RNASeq, + Mutations = Tumor_Somatic_Mutations, + CopyNumber = Tumor_CNV) %>% + mutate(Proteomics = "") %>% + filter(!is.na(improve_sample_id)) + +mt_data <- manifest %>% #Note, this is the same as pdx_data but I think we default to "xenograft derived organoid" if present (based on original files) + select(common_name, starts_with("PDX")) %>% + left_join(mt_samps, by = "common_name") %>% + select(improve_sample_id, common_name, model_type, + RNASeq = PDX_RNASeq, + Mutations = PDX_Somatic_Mutations, + CopyNumber = PDX_CNV, + Proteomics = PDX_Proteomics) %>% + filter(!is.na(improve_sample_id)) + +# Combine all sample tables +dcombined <- bind_rows(pdx_data, tumor_data, mt_data) %>% distinct() +print("dcombined:") +print(dcombined) + +# Helper to assign study label based on model_type +study_label <- function(type) { + case_when( + type == "patient derived xenograft" ~ "MPNST PDX", + type == "tumor" ~ "MPNST Tumor", + type == "xenograft derived organoid" ~ "MPNST PDX MT", + TRUE ~ "MPNST" + ) +} + +# Helper to pick metadata based on sample ID and column +pick_meta <- function(id, column) { + # columns are {"Proteomics","RNASeq","Mutations","CopyNumber"} + if (any(tumor_data[[column]] == id, na.rm = TRUE)) { + sdf <- tumor_data %>% filter(.data[[column]] == id) %>% slice(1) + } else if (any(mt_data[[column]] == id, na.rm = TRUE)) { + sdf <- mt_data %>% filter(.data[[column]] == id) %>% slice(1) + } else if (any(pdx_data[[column]] == id, na.rm = TRUE)) { + sdf <- pdx_data %>% filter(.data[[column]] == id) %>% slice(1) + } else { + return(NULL) + } + list( + sample_id = sdf$improve_sample_id, + model_type = sdf$model_type + ) +} + +# Safe extraction: only return non-empty data frames +i_safe_extract <- function(df, sample_id, source_val, study_val) { + if (is.null(df) || nrow(df) == 0) return(NULL) + df$improve_sample_id <- sample_id + df$source <- source_val + df$study <- study_val + df +} + +# 1) Proteomics +proteomics_list <- lapply( + setdiff(dcombined$Proteomics, c("", NA, "NA")), + function(id) { + meta <- pick_meta(id, "Proteomics") + if (is.null(meta)) return(NULL) + + df <- tryCatch( + fread(synGet(id)$path) %>% + rename(gene_symbol = Gene) %>% + left_join(genes_df, by = "gene_symbol") %>% + select(entrez_id, proteomics = logRatio) %>% + filter(!is.na(entrez_id), proteomics != 0) %>% + distinct(), + error = function(e) NULL + ) + i_safe_extract( + df, + meta$sample_id, + "NF Data Portal", + study_label(meta$model_type) + ) + } +) +proteomics <- bind_rows(proteomics_list) +fwrite(proteomics, file.path("/tmp", "mpnst_proteomics.csv")) +message("Wrote combined proteomics") + + +# 2) Transcriptomics (PDX, Tumor, and Organoid / MT which comes from PDX..) 
+transcriptomics_list <- lapply( + setdiff(dcombined$RNASeq, c("", NA, "NA")), + function(id) { + meta <- pick_meta(id, "RNASeq") + if (is.null(meta)) return(NULL) + + df <- tryCatch({ + fread(synGet(id)$path) %>% + separate(Name, into = c("other_id","vers"), sep = "\\.") %>% + select(-vers) %>% + left_join(genes_df) %>% + select(entrez_id, transcriptomics = TPM) %>% + filter(!is.na(entrez_id), transcriptomics != 0) %>% + distinct() + }, error = function(e) NULL) + + i_safe_extract( + df, + meta$sample_id, + "NF Data Portal", + study_label(meta$model_type) + ) + } +) +transcriptomics <- bind_rows(transcriptomics_list) +fwrite(transcriptomics, file.path("/tmp", "mpnst_transcriptomics.csv")) +message("Wrote combined transcriptomics") + + +# 3) Mutations (WES) +wes_list <- lapply( + setdiff(dcombined$Mutations, c("", NA, "NA")), + function(id) { + meta <- pick_meta(id, "Mutations") + if (is.null(meta)) return(NULL) + + clean_id <- gsub('[\"\\[\\]]', '', id) + df <- tryCatch( + fread(synGet(clean_id)$path) %>% + select(entrez_id = Entrez_Gene_Id, + mutation = HGVSc, + variant_classification = Variant_Classification) %>% + filter(entrez_id %in% genes_df$entrez_id) %>% + distinct(), + error = function(e) NULL + ) + + i_safe_extract( + df, + meta$sample_id, + "NF Data Portal", + study_label(meta$model_type) + ) + } +) +wes <- bind_rows(wes_list) +fwrite(wes, file.path("/tmp", "mpnst_mutations.csv")) +message("Wrote combined mutations") + + +# 4) Copy Number Variation (CNV) +cnv_list <- lapply( + setdiff(dcombined$CopyNumber, c("", NA, "NA")), + function(id) { + meta <- pick_meta(id, "CopyNumber") + if (is.null(meta)) return(NULL) + + clean_id <- gsub('[\"\\[\\]]', '', id) + raw <- tryCatch(fread(synGet(clean_id)$path), error = function(e) NULL) + if (is.null(raw)) return(NULL) + + df_long <- raw %>% + separate_rows(gene, sep = ",") %>% + rename(gene_symbol = gene) %>% + left_join(genes_df, by = "gene_symbol") %>% + filter(!is.na(entrez_id)) %>% + select(entrez_id, log2) %>% + distinct() %>% + mutate(copy_number = 2^log2) %>% + select(-log2) + + df <- df_long %>% + mutate(copy_call = case_when( + copy_number < 0.5210507 ~ "deep del", + copy_number < 0.7311832 ~ "het loss", + copy_number < 1.214125 ~ "diploid", + copy_number < 1.422233 ~ "gain", + TRUE ~ "amp" + )) + + i_safe_extract( + df, + meta$sample_id, + "NF Data Portal", + study_label(meta$model_type) + ) + } +) +cnv <- bind_rows(cnv_list) +fwrite(cnv, file.path("/tmp", "mpnst_copy_number.csv")) +message("Wrote combined copy number") + + +message("All combined data files created.") diff --git a/build/mpnst/01_mpnst_get_omics.R b/build/mpnst/01_mpnst_get_omics.R deleted file mode 100755 index 9097465a..00000000 --- a/build/mpnst/01_mpnst_get_omics.R +++ /dev/null @@ -1,205 +0,0 @@ -# Load required libraries -library(data.table) -# library(biomaRt)# biomart issues still exist -library(synapser) -library(dplyr) - -# Retrieve command line arguments -args <- commandArgs(trailingOnly = TRUE) - -# Check if a token was provided -if (length(args) == 0) { - stop("No token or sample file provided. Usage: Rscript my_script.R [samples] [genes]", call. = FALSE) -} - -# Set your personal access token -PAT <- args[1] -patients <- args[2] -genefile <- args[3] - -# Log in to Synapse -synLogin(authToken = PAT) - -# Define the Ensembl mart # biomart issues still exist -# ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl") # biomart issues still exist; fix later... 
- -# Path to the directory to save .sf files -#path <- "./tmp" -#dir.create(path, showWarnings = FALSE) - -# Read the sample mapping CSV and genes.csv -samples_df <- fread(patients)|> - dplyr::select(improve_sample_id,common_name,model_type)|> - distinct()#"mpnst/synapse_NF-MPNST_samples.csv") - -pdx_samps<-subset(samples_df,model_type=='patient derived xenograft') -tumor_samps<-subset(samples_df,model_type=='tumor') -mt_samps<-subset(samples_df,model_type=='xenograft derived organoid') - -##now get the manifest from synapse -manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> - as.data.frame()|> - dplyr::rename(common_name='Sample') - - -##for now we only have tumor and PDX data -##they each get their own sample identifier -pdx_data<-manifest|>dplyr::select(common_name,starts_with("PDX"))|> - left_join(pdx_samps)|> - dplyr::select(improve_sample_id,common_name,model_type,RNASeq='PDX_RNASeq',Mutations='PDX_Somatic_Mutations',CopyNumber='PDX_CNV',Proteomics='PDX_Proteomics')|> - subset(!is.na(improve_sample_id)) - -tumor_data<- manifest|>dplyr::select(common_name,starts_with("Tumor"))|> - left_join(tumor_samps)|> - dplyr::select(improve_sample_id,common_name,model_type,RNASeq='Tumor_RNASeq',Mutations='Tumor_Somatic_Mutations',CopyNumber='Tumor_CNV')|> - mutate(Proteomics='')|> - subset(!is.na(improve_sample_id)) - ##we dont have tumor proteomics from these samples -#print(tumor_data) - -mt_data<- manifest|>dplyr::select(common_name,starts_with("PDX"))|> - left_join(mt_samps)|> - dplyr::select(improve_sample_id,common_name,model_type, RNASeq='PDX_RNASeq',Mutations='PDX_Somatic_Mutations',CopyNumber='PDX_CNV',Proteomics='PDX_Proteomics')|>##we dont have mt data yet, so collecting PDX instead - subset(!is.na(improve_sample_id)) -#print(tumor_data) - - -combined<-rbind(pdx_data,tumor_data,mt_data)|>distinct() - -# gene mapping table -genes_df <- fread(genefile) - - -##added proteomics first -proteomics<-do.call('rbind',lapply(setdiff(mt_data$Proteomics,c('',NA,"NA")),function(x){ - # if(x!=""){ - #print(x) - sample<-subset(mt_data,Proteomics==x) - #print(sample) - res<-fread(synGet(x)$path)|> - #tidyr::separate(Name,into=c('other_id','vers'),sep='\\.')|> - #dplyr::select(-vers)|> - dplyr::rename(gene_symbol='Gene')|> - left_join(genes_df)|> - dplyr::select(entrez_id,proteomics='logRatio')|> - distinct()|> - subset(!is.na(entrez_id))|> - subset(proteomics!=0) - - res$improve_sample_id=rep(sample$improve_sample_id[1],nrow(res)) - res$source=rep('NF Data Portal',nrow(res)) - res$study=rep('MPNST PDX MT',nrow(res)) - return(distinct(res)) - # } -})) - -fwrite(proteomics,'/tmp/mpnst_proteomics.csv.gz') - - -#### FIRST WE GET RNASeq Data - -rnaseq<-do.call('rbind',lapply(setdiff(mt_data$RNASeq,c(NA,"NA")),function(x){ - # if(x!=""){ - #print(x) - sample<-subset(mt_data,RNASeq==x) - #print(sample) - res<-fread(synGet(x)$path)|> - tidyr::separate(Name,into=c('other_id','vers'),sep='\\.')|> - dplyr::select(-vers)|> - left_join(genes_df)|> - dplyr::select(entrez_id,transcriptomics='TPM')|> - subset(!is.na(entrez_id))|> - subset(transcriptomics!=0) - - res$improve_sample_id=rep(sample$improve_sample_id[1],nrow(res)) - res$source=rep('NF Data Portal',nrow(res)) - res$study=rep('MPNST PDX MT',nrow(res)) - return(distinct(res)) - # } -})) - -fwrite(rnaseq,'/tmp/mpnst_transcriptomics.csv.gz') - - - -#####NEXT WE DO WES DATA -print("Getting WES") -wes<-do.call(rbind,lapply(setdiff(mt_data$`Mutations`,c(NA,"NA")),function(x){ - - 
x2=x#gsub('"','',gsub("[",'',gsub("]",'',x,fixed=T),fixed=T),fixed=T) - print(x) - sample<-subset(mt_data,Mutations==x) - print(sample$improve_sample_id) - res<-NULL - try(res<-fread(synGet(x2)$path)|> - dplyr::select(entrez_id='Entrez_Gene_Id',mutation='HGVSc',variant_classification='Variant_Classification')|> - subset(entrez_id%in%genes_df$entrez_id)|> - distinct()) - if(is.null(res)) - return(NULL) - - res$improve_sample_id=rep(sample$improve_sample_id[1],nrow(res)) - res$source=rep('NF Data Portal',nrow(res)) - res$study=rep('MPNST PDX MT',nrow(res)) - - return(distinct(res)) - # } -})) - -fwrite(wes,'/tmp/mpnst_mutations.csv.gz') - - -print(paste("getting CNV")) -##next let's do CNVs! -cnv<-do.call(rbind,lapply(setdiff(mt_data$CopyNumber,c(NA,"NA")),function(x){ - - x2=x#gsub('"','',gsub("[",'',gsub("]",'',x,fixed=T),fixed=T),fixed=T) - print(x) - sample<-subset(mt_data,CopyNumber==x) - print(sample$improve_sample_id) - res<-fread(synGet(x2)$path) - - long_df<- res|> - tidyr::separate_rows(gene,sep=',')|> - dplyr::rename(gene_symbol='gene')|> - dplyr::left_join(genes_df)|> - subset(!is.na(entrez_id))|> - dplyr::select(entrez_id,log2)|> - dplyr::distinct()|> - dplyr::mutate(copy_number=2^log2)|> - dplyr::select(-log2) - - res<-long_df|> ##deep del < 0.5210507 < het loss < 0.7311832 < diploid < 1.214125 < gain < 1.422233 < amp - dplyr::mutate(copy_call=ifelse(copy_number<0.5210507,'deep del', - ifelse(copy_number<0.7311832,'het loss', - ifelse(copy_number<1.214125,'diploid', - ifelse(copy_number<1.422233,'gain','amp')))))|> - mutate(study='MPNST PDX MT',source='NF Data Portal',improve_sample_id=sample$improve_sample_id[1])|> - dplyr::distinct() - - # long_df <- res[, strsplit(as.character(gene), ","), by = .(chromosome, start, end, depth, log2)] - # filtered_df <- long_df |> - # subset(is.finite(log2))|> - # filter(V1 %in% genes_df$gene) # get only protein coding genes and remove empty gene symbols - # filtered_df <- filtered_df[, .(gene_symbol = V1, - # improve_sample_id = sample$improve_sample_id[1], - # copy_number = 2^log2, - # source = "NF Data Portal", - # study = "MPNST PDX MT")] - # res<-filtered_df|> ##deep del < 0.5210507 < het loss < 0.7311832 < diploid < 1.214125 < gain < 1.422233 < amp - # dplyr::mutate(copy_call=ifelse(copy_number<0.5210507,'deep del', - # ifelse(copy_number<0.7311832,'het loss', - # ifelse(copy_number<1.214125,'diploid', - # ifelse(copy_number<1.422233,'gain','amp')))))|> - # left_join(genes_df)|> - # dplyr::select(entrez_id,improve_sample_id,copy_number,copy_call,study,source)|> - # subset(!is.na(entrez_id))|> - # distinct() - # res|>group_by(copy_call)|>summarize(n_distinct(entrez_id)) - return(res) - # } -})) - -fwrite(cnv,'/tmp/mpnst_copy_number.csv.gz') - -##TODO: get proteomics!!! diff --git a/build/mpnst/02_get_drug_data.R b/build/mpnst/02_get_drug_data.R index e90a31fb..f88f0f99 100644 --- a/build/mpnst/02_get_drug_data.R +++ b/build/mpnst/02_get_drug_data.R @@ -1,172 +1,128 @@ -# Load required libraries +#!/usr/bin/env Rscript + +# Combined Drug List Extraction for MPNST & MPNST‑PDX + library(data.table) -# library(biomaRt)# biomart issues still exist library(dplyr) library(stringr) library(synapser) +library(reticulate) - -# Retrieve command line arguments +# 0) Args & login args <- commandArgs(trailingOnly = TRUE) - - -# Check the number of arguments provided if (length(args) < 1) { - stop("At least one argument is required. Usage: Rscript 02_get_drug_data.R [olddrugfile]", call. 
= FALSE) + stop("Usage: Rscript combined_drug_list.R [old_drugs.tsv,...]", call.=FALSE) } - - -# Assign arguments -newdrugfile <- args[1] # Path to the new drug file -olddrugfiles <- ifelse(length(args) >= 2 && args[2] != "", args[2], NA) - -# Read SYNAPSE_AUTH_TOKEN from the environment -synapse_token <- Sys.getenv("SYNAPSE_AUTH_TOKEN") -if (synapse_token == "") { - stop("Error: SYNAPSE_AUTH_TOKEN environment variable is not set.") +newdrugfile <- args[1] +newdrugfile <- file.path(newdrugfile) +olddrugfiles <- if (length(args)>=2 && nzchar(args[2])) args[2] else NA + +token <- Sys.getenv("SYNAPSE_AUTH_TOKEN") +if (token == "") stop("Please set SYNAPSE_AUTH_TOKEN in your environment", call.=FALSE) +synLogin(authToken = token) + +# 1) Fetch manifest +manifest <- synTableQuery("select * from syn53503360")$asDataFrame() %>% + rename(common_name = Sample) + +# 2) PDX‑sourced drugs via annotations +pdx_df <- manifest %>% + select(common_name, PDX_Drug_Data) %>% + distinct() %>% + filter(!is.na(PDX_Drug_Data)) + +pdx_ids <- unique(unlist(strsplit(pdx_df$PDX_Drug_Data, ","))) +pdx_ids <- pdx_ids[ pdx_ids != "" & !is.na(pdx_ids) & pdx_ids != "NA" ] + +get_pdx_drugs <- function(synid) { + # Query the metadata table for this file's experimentalCondition + q <- sprintf( + "select experimentalCondition from syn21993642 where id='%s'", + synid + ) + df <- synTableQuery(q)$asDataFrame() + if (nrow(df)==0) return(character(0)) + # Split on semicolon, lowercase and drop empties + conds <- unlist(strsplit(df$experimentalCondition, ";")) + tolower(conds[conds!=""]) } -synLogin(authToken = synapse_token) - -##now get the manifest from synapse -manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> - as.data.frame()|> - dplyr::rename(common_name='Sample') +pdx_drugs <- unique(unlist(lapply(pdx_ids, get_pdx_drugs))) +pdx_drugs <- setdiff(pdx_drugs, "control") -##PDX contain list of files -pdx<-manifest|> - dplyr::select(common_name,PDX_Drug_Data)|> - distinct()|> - subset(!is.na(PDX_Drug_Data)) +# 3) MicroTissue‑sourced drugs via table "children" +mts_df <- manifest %>% + select(common_name, MicroTissueDrugFolder) %>% + filter(!is.na(MicroTissueDrugFolder)) +mts_ids <- unique(unlist(strsplit(mts_df$MicroTissueDrugFolder, ","))) +mts_ids <- mts_ids[mts_ids != "" & !is.na(mts_ids) & mts_ids != "NA"] -##MTS contain lists of directories -mts<-manifest|> - dplyr::select(common_name,MicroTissueDrugFolder)|> - subset(!is.na(MicroTissueDrugFolder)) - - - -##define functions - -##first function to get children from parentId -getDrugsByParent<-function(parid){ - qtab<-synTableQuery(paste('select id,name,experimentalCondition,parentId from syn21993642 where parentId=\'',parid,'\''))$asDataFrame()|> - subset(!is.na(experimentalCondition))|>dplyr::select(id,name,experimentalCondition) - ##now we need to parse the metadatda table get the info - - return(unique(qtab$experimentalCondition)) - +get_mts_drugs <- function(parentId) { + q <- sprintf("select experimentalCondition from syn21993642 where parentId='%s'", parentId) + synTableQuery(q)$asDataFrame() %>% + pull(experimentalCondition) %>% + unique() %>% + tolower() } -##now loop through manifest to get all the files -mts_fold <- data.table(mts)[,strsplit(as.character(MicroTissueDrugFolder),","), by = .(common_name)] - -alldrugs<-unique(unlist(lapply(mts_fold$V1,function(x){ - samp<-subset(mts_fold,V1==x) - res<-getDrugsByParent(x) - return(res) -}))) - - -alldrugs[which(alldrugs=='PD901')]<-'PD-0325901' - -print(paste(alldrugs,collapse=',')) - 
+mts_drugs <- unique(unlist(lapply(mts_ids, get_mts_drugs))) +# 4) Combine and fix bad names +all_drugs <- unique(c(pdx_drugs, mts_drugs)) +all_drugs[all_drugs == "pd901"] <- "pd-0325901" +message("Combined drug list: ", paste(all_drugs, collapse=", ")) -## new code: - - -# Handle old drugs +# 5) Read old‑drug files or initialize empty if (!is.na(olddrugfiles)) { - # Read and combine old drug files - olddrug_list <- lapply(unique(unlist(strsplit(olddrugfiles, split = ','))), function(x) { - if (file.exists(x)) { - return(fread(x, header = TRUE, sep = '\t', quote = '')) - } else { - warning(paste("Old drug file does not exist:", x)) - return(NULL) + paths <- strsplit(olddrugfiles, ",")[[1]] %>% trimws() + old_list <- lapply(paths, function(f) { + if (file.exists(f)) fread(f, sep="\t", header=TRUE) else { + warning("Missing old‑drug file: ", f) + NULL } }) - - # Remove NULL entries and ensure uniqueness - olddrug_list <- Filter(Negate(is.null), olddrug_list) - - if (length(olddrug_list) > 0) { - olddrugs <- unique(rbindlist(olddrug_list, use.names = TRUE, fill = TRUE)) - print(paste('Read in', nrow(olddrugs), 'old drugs')) + old_list <- Filter(Negate(is.null), old_list) + if (length(old_list) > 0) { + olddrugs <- unique(rbindlist(old_list, use.names=TRUE, fill=TRUE)) + message("Read ", nrow(olddrugs), " old drug records") } else { - olddrugs <- data.frame( - improve_drug_id = integer(), - chem_name = character(), - pubchem_id = character(), - canSMILES = character(), - # isoSMILES = character(), - InChIKey = character(), - formula = character(), - weight = numeric(), - stringsAsFactors = FALSE + olddrugs <- data.table( + improve_drug_id=integer(), chem_name=character(), + pubchem_id=character(), canSMILES=character(), + InChIKey=character(), formula=character(), weight=numeric() ) - print("Old drug files not valid. Created empty olddrugs dataframe.") + message("No valid old data; using empty template") } } else { - # Create an empty dataframe with specified columns - olddrugs <- data.frame( - improve_drug_id = integer(), - chem_name = character(), - pubchem_id = character(), - canSMILES = character(), - # isoSMILES = character(), - InChIKey = character(), - formula = character(), - weight = numeric(), - stringsAsFactors = FALSE + olddrugs <- data.table( + improve_drug_id=integer(), chem_name=character(), + pubchem_id=character(), canSMILES=character(), + InChIKey=character(), formula=character(), weight=numeric() ) - print("No old drug file provided. 
Created empty olddrugs dataframe.") + message("No old‑drug files provided; starting fresh") } -# Write the initial drug file (old drugs) -write.table(olddrugs, file = newdrugfile, sep = '\t', row.names = FALSE, quote = FALSE,col.names=T) - - -# Define the ignore file path -ignore_file_path <- '/tmp/mpnst_ignore_chems.txt' - - -# ##copy old drug to new drug -# olddrugs<-do.call(rbind,lapply(unique(unlist(strsplit(olddrugfiles,split=','))),function(x) read.table(x,header=T,sep='\t',quote='',comment.char=''))) -# olddrugs<-unique(olddrugs) +# 6) Write placeholder +fwrite(olddrugs, newdrugfile, sep="\t", quote=FALSE) +message("Wrote placeholder to ", newdrugfile) -# print(paste('Read in ',nrow(olddrugs),'old drugs')) -# #file.copy(olddrugfile,newdrugfile) -# write.table(olddrugs,file=newdrugfile,sep='\t',row.names=F,quote=FALSE,col.names=T) +# 7) Augment via Python +ignore_file <- "/tmp/combined_drugs_ignore_chems.txt" +use_python("/opt/venv/bin/python3", required=TRUE) +# use_python("/Users/jaco059/miniconda3/bin/python3", required=TRUE) - -##now load reticulate down here - -library(reticulate) - -use_python("/opt/venv/bin/python3", required = TRUE) +# source_python("build/utils/pubchem_retrieval.py") source_python("pubchem_retrieval.py") - -update_dataframe_and_write_tsv(unique_names=alldrugs,output_filename=newdrugfile,ignore_chems=ignore_file_path) - - -tab<-read.table(newdrugfile,sep='\t',header=T,quote="",fill=TRUE) - -newdrugs<-tab|> - subset(chem_name%in%tolower(alldrugs)) - -tab<-tab|> - subset(improve_drug_id%in%newdrugs$improve_drug_id) - -write.table(tab,file=newdrugfile,sep='\t',row.names=FALSE,quote=FALSE) - -print(paste("Final drug table written to", newdrugfile)) - - -##now call the python drug script - - +update_dataframe_and_write_tsv( + unique_names = all_drugs, + output_filename = newdrugfile, + ignore_chems = ignore_file +) + +# 8) Final filter & save +tab <- fread(newdrugfile, sep="\t", header=TRUE) +final_tab <- unique(tab) +fwrite(final_tab, newdrugfile, sep="\t", quote=FALSE) +message("Wrote full synonyms list to ", newdrugfile) \ No newline at end of file diff --git a/build/mpnst/03_get_drug_response_data.R b/build/mpnst/03_get_drug_response_data.R deleted file mode 100644 index 9bbb6f00..00000000 --- a/build/mpnst/03_get_drug_response_data.R +++ /dev/null @@ -1,152 +0,0 @@ -# Load required libraries -library(data.table) -# library(biomaRt)# biomart issues still exist -library(synapser) -library(dplyr) -library(stringr) -# Retrieve command line arguments -args <- commandArgs(trailingOnly = TRUE) - -# Check if a token was provided -if (length(args) == 0) { - stop("No token or sample file provided. Usage: Rscript my_script.R [samples] [drugs]", call. 
= FALSE) -} - -# Set your personal access token -PAT <- args[1] -patients <- args[2] -drugfile <- args[3] - -# Log in to Synapse -synLogin(authToken = PAT) - - -# Read the sample mapping CSV and genes.csv -samples_df <- fread(patients)|> - dplyr::select(improve_sample_id,common_name,model_type)|> - distinct()#"mpnst/synapse_NF-MPNST_samples.csv") -print(head(samples_df)) - -pdx_samps<-subset(samples_df,model_type=='patient derived xenograft') -org_samps<-subset(samples_df,model_type=='organoid') - -##now get the manifest from synapse -manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> - as.data.table()|> - dplyr::rename(common_name='Sample') - - -##PDX contain list of files -pdx<-manifest|> - dplyr::select(common_name,PDX_Drug_Data)|> - left_join(pdx_samps)|> - distinct()|> - subset(!is.na(PDX_Drug_Data)) - - -##MTS contain lists of directories -mts<-manifest|> - dplyr::select(common_name,MicroTissueDrugFolder)|> - left_join(org_samps)|> - distinct()|> - subset(!is.na(MicroTissueDrugFolder)) - - -# Modify the extract_date_hour function to return a named vector -extract_date_hour <- function(experiment_id) { - pattern <- "(\\d{6})_?(\\d{2,3})?" - matches <- str_match(experiment_id, pattern) - date <- matches[, 2] - hour <- matches[, 3] - date[is.na(date)] <- NA # Replace with NA instead of blank - hour[is.na(hour)] <- 48 # Replace with 48 instead of blank (default) - return(list(date = date, hour = hour)) -} - - - -##define functions - -##first function to get children from parentId -getDrugDataByParent<-function(parid,sampleId){ - qtab<-synTableQuery(paste('select id,name,experimentalCondition,parentId from syn21993642 where parentId=\'',parid,'\''))$asDataFrame()|> - as.data.frame()|> - subset(!is.na(experimentalCondition))|> - dplyr::select(id,name,experimentalCondition)|> - subset(name!='synapse_storage_manifest.csv') - ##now we need to parse the metadatda table get the info - - res<-do.call(rbind,lapply(qtab$id,function(x){ - sname <- subset(qtab,id==x) - #print(sname) - sname <-extract_date_hour(sname$name) - #print(x) - #print(sname) - data <- fread(synGet(x)$path)|> - filter(response_type=='percent viability')|> - mutate(improve_sample_id=sampleId, - DOSE=(10^dosage)*1000000, ##dosage is log(M), need to move to micromolar - GROWTH=response, #/100, - source = "NF Data Portal", - #CELL = improve_sample_id, - chem_name = compound_name, - study = paste0('MT ',sname$date,' exp'), - time = sname$hour) %>% - select(improve_sample_id,DOSE,GROWTH,source,chem_name,study,time) - - return(data) - })) - return(res) -} - -##now loop through manifest to get all the files -mts_fold <- data.table(mts)[,strsplit(as.character(MicroTissueDrugFolder),","), by = .(improve_sample_id,common_name)] - -mts_fold <- mts_fold[which(!mts_fold$V1%in%c("NA",NA)),] - -print(mts_fold) - -alldrugs<-do.call(rbind,lapply(mts_fold$V1,function(x){ - samp<-subset(mts_fold,V1==x) - print(samp$common_name) - res<-getDrugDataByParent(x,samp$improve_sample_id) - return(res) -})) - -##do the drug matching -drug_df<-fread(drugfile) - -##update drug name PD901 since it's mussing - -alldrugs$chem_name[which(alldrugs$chem_name=='PD901')]<-'PD-0325901' - - - #drug_df$chem_name=tolower(drug_df$chem_name) -alldrugs$chem_name<-tolower(alldrugs$chem_name) - -#print(drug_df) -drug_map<-subset(drug_df,chem_name%in%alldrugs$chem_name) - -findrugs<-alldrugs|> - left_join(drug_map)|> - mutate(time_unit='hours')|> - dplyr::select(DOSE,GROWTH,source,study,Drug=improve_drug_id,time,time_unit,improve_sample_id)|> - 
distinct()|> - subset(!is.na(Drug)) - -missing<-setdiff(alldrugs$chem_name,drug_map$chem_name) -print(paste('missing',length(missing),'drugs:')) -print(paste(missing,collapse=',')) - -#TODO: add in new drug lookup -print(head(findrugs)) -fwrite(findrugs,'/tmp/curve_data.tsv',sep='\t') - -pycmd = '/opt/venv/bin/python fit_curve.py --input /tmp/curve_data.tsv --output /tmp/experiments' -print('running curve fitting') -system(pycmd) - -##mmve file name -file.rename('/tmp/experiments.0','/tmp/mpnst_experiments.tsv') - - diff --git a/build/mpnst/03_get_experiments.R b/build/mpnst/03_get_experiments.R new file mode 100644 index 00000000..a430cae8 --- /dev/null +++ b/build/mpnst/03_get_experiments.R @@ -0,0 +1,282 @@ +library(data.table) +library(synapser) +library(dplyr) +library(stringr) +library(readr) +library(readxl) +library(tidyr) + +# Check that correct number of arguments are present +args <- commandArgs(trailingOnly = TRUE) +if (length(args) != 4) { + stop("Usage: Rscript 03_get_experiments.R ", call. = FALSE) +} +PAT <- args[1] +samples <- args[2] +drugfile <- args[3] +out_prefix <- args[4] + +synLogin(authToken = PAT) + +# Read in sampes file +samples_df <- fread(samples) %>% + select(improve_sample_id, common_name, model_type) %>% + distinct() + +pdx_samps <- filter(samples_df, model_type == "patient derived xenograft") +mt_samps <- filter(samples_df, model_type == "xenograft derived organoid") + +# Get manifest table from Synapse +manifest <- synTableQuery("select * from syn53503360")$asDataFrame() %>% + rename(common_name = Sample) %>% + as.data.table() + +# Helper Function to extract date and hour from experiment ID +extract_date_hour <- function(experiment_id) { + pattern <- "(\\d{6})_?(\\d{2,3})?" + m <- str_match(experiment_id, pattern) + date <- m[,2]; hour <- m[,3] + date[is.na(date)] <- NA + hour[is.na(hour)] <- 48 + list(date = date, hour = hour) +} + +# ──────────────────────────────────────────────── +# MicroTissue Experiments +# ──────────────────────────────────────────────── + +getDrugDataByParent <- function(parid, sampleId) { + q <- sprintf( + "select id,name,experimentalCondition,parentId from syn21993642 where parentId='%s'", + parid + ) + qtab <- synTableQuery(q)$asDataFrame() %>% + filter(!is.na(experimentalCondition), name != "synapse_storage_manifest.csv") %>% + select(id, name, experimentalCondition) + do.call(rbind, lapply(qtab$id, function(x) { + info <- filter(qtab, id == x) + d <- extract_date_hour(info$name) + fread(synGet(x)$path) %>% + filter(response_type == "percent viability") %>% + transmute( + improve_sample_id = sampleId, + DOSE = (10^dosage) * 1e6, + GROWTH = response, + source = "NF Data Portal", + chem_name = compound_name, + study = paste0("MT ", d$date, " exp"), + time = d$hour + ) + })) +} + +# Create map of MicroTissue Drug Folders +mts_map <- manifest %>% + select(common_name, MicroTissueDrugFolder) %>% + inner_join(mt_samps, by = "common_name") %>% + separate_rows(MicroTissueDrugFolder, sep = ",") %>% + # keep exactly what old script did: drop only "NA" and actual NA + filter( + !is.na(MicroTissueDrugFolder), + MicroTissueDrugFolder != "NA" + ) %>% + select( + improve_sample_id, + folder = MicroTissueDrugFolder + ) + +# Fetch all MicroTissue drug response data +mt_data <- do.call(rbind, lapply(seq_len(nrow(mts_map)), function(i) { + sample_id <- mts_map$improve_sample_id[i] + folder <- mts_map$folder[i] + getDrugDataByParent(folder, sample_id) +})) + +drug_map <- fread(drugfile) %>% + select(improve_drug_id, chem_name) %>% + distinct() + 
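# --- Illustrative aside (editor's sketch, not part of the patch) ----------------
# getDrugDataByParent() above converts MicroTissue dosing from log10(molar) to
# micromolar via DOSE = (10^dosage) * 1e6 (the deleted script documented dosage
# as log(M)). A quick check with hypothetical values:
log_molar  <- c(-9, -6, -4)              # 1 nM, 1 uM, 100 uM
micromolar <- (10^log_molar) * 1e6       # 0.001, 1, 100
stopifnot(isTRUE(all.equal(micromolar, c(1e-3, 1, 100))))
# ---------------------------------------------------------------------------------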
+# Clean up drug names and join with drug_map +mt_curve <- mt_data %>% + mutate( + chem_name = tolower(chem_name), + chem_name = ifelse(chem_name == "pd901", "pd-0325901", chem_name) + ) %>% + left_join(drug_map, by = "chem_name") %>% + filter(!is.na(improve_drug_id)) %>% + transmute( + source = source, + improve_sample_id = improve_sample_id, + Drug = improve_drug_id, + study = study, + time = time, + time_unit = "hours", + DOSE = DOSE, + GROWTH = GROWTH + ) + +# Run curve fitting, Write MicroTissue curve data +fwrite(mt_curve, file.path("/tmp", paste0(out_prefix, "_mt_curve_data.tsv")), sep = "\t") + +message("Wrote MT curve data") + +# Write MT experiments file +system(sprintf( + "/opt/venv/bin/python fit_curve.py --input %s --output %s", + paste0("/tmp/", out_prefix, "_mt_curve_data.tsv"), + paste0("/tmp/", out_prefix, "_mt_experiments") +)) +file.rename( + paste0("/tmp/", out_prefix, "_mt_experiments.0"), + paste0("/tmp/", out_prefix, "_mt_experiments.tsv") +) +message("Wrote MT experiments") + +# ──────────────────────────────────────────────── +# PDX Experiments +# ──────────────────────────────────────────────── + +# Create a map of PDX Drug Data +# This will be used to fetch the drug data for each PDX sample +pdx_map <- do.call(rbind, lapply(seq_len(nrow(manifest)), function(i) { + row <- manifest[i, ] + samp <- pdx_samps[pdx_samps$common_name == row$common_name, ] + if (nrow(samp)==0 || is.na(row$PDX_Drug_Data) || row$PDX_Drug_Data %in% c("", "NA")) + return(NULL) + ids <- strsplit(row$PDX_Drug_Data, ",")[[1]] + ids <- trimws(ids[ids!=""]) + data.frame( + improve_sample_id = samp$improve_sample_id, + child_id = ids, + stringsAsFactors = FALSE + ) +})) + +# Create a dataframe of PDX metadata +pdx_meta <- do.call(rbind, lapply(seq_len(nrow(pdx_map)), function(i) { + sid <- pdx_map$improve_sample_id[i] + cid <- pdx_map$child_id[i] + pid <- synGet(cid)$parentId + if (is.null(pid) || pid=="") stop("no parentId for ", cid) + data.frame( + improve_sample_id = sid, + child_id = cid, + parentId = pid, + stringsAsFactors = FALSE + ) +})) + +all_pdx <- do.call(rbind, lapply(seq_len(nrow(pdx_meta)), function(i) { + m <- pdx_meta[i, ] + pth <- synGet(m$child_id)$path + raw <- if (grepl("\\.xlsx?$", pth)) read_xlsx(pth) else read_csv(pth) + + # detect second‐drug column + sec_opts <- c("compound 2_name", "compound_2_name") + drug2_col <- intersect(sec_opts, names(raw))[1] + compound2 <- if (!is.na(drug2_col)) raw[[drug2_col]] else NA_character_ + + df <- data.frame( + child_id = m$child_id, + specimen_id = raw$specimen_id, + compound_name = raw$compound_name, + compound_2_name = compound2, + experimental_time_point = raw$experimental_time_point, + experimental_time_point_unit = raw$experimental_time_point_unit, + assay_value = raw$assay_value, + stringsAsFactors = FALSE + ) + + df <- within(df, { + drug1 <- tolower(trimws(compound_name)) + drug2 <- tolower(trimws(compound_2_name)) + treatment <- ifelse( + is.na(drug1) | drug1 %in% c("", "na", "n/a", "nan"), + "control", + ifelse(!is.na(drug2) & drug2 != "", + paste(drug1, drug2, sep = "+"), + drug1 + ) + ) + time <- experimental_time_point + time_unit <- experimental_time_point_unit + volume <- assay_value + }) + + df[ , c("child_id", "specimen_id", "treatment", "time", "time_unit", "volume")] +})) + +# join on parentId and sample +pdx_data <- merge(all_pdx, pdx_meta, by="child_id") + +pdx_data <- subset(pdx_data, duplicated(child_id) | TRUE) +pdx_data <- within(pdx_data, { + experiment <- parentId + model_id <- improve_sample_id +}) + +# 
Filter out experiments missing a control +has_ctl <- tapply(pdx_data$treatment == "control", pdx_data$experiment, any) +no_ctl_exps <- names(has_ctl)[!has_ctl] +pdx_data <- pdx_data[pdx_data$experiment %in% names(has_ctl)[has_ctl], ] + +# Reorder final columns +pdx_data <- pdx_data[ , c("experiment","specimen_id","treatment", + "time","time_unit","volume","model_id")] + +# Correct doxorubinsin typo across all data +pdx_data$treatment <- gsub("doxorubinsin", + "doxorubicin", + pdx_data$treatment, + ignore.case = TRUE) + +# Drop any remaining NA rows +pdx_data <- na.omit(pdx_data) + +# write & fit +fwrite(pdx_data, file.path("/tmp", paste0(out_prefix, "_pdx_curve_data.tsv")), sep = "\t") + + +message("Wrote PDX curve data") + +system(sprintf( + "/opt/venv/bin/python calc_pdx_metrics.py %s --drugfile %s --outprefix %s --source 'NF Data Portal' --study 'MPNST PDX'", + paste0("/tmp/", out_prefix, "_pdx_curve_data.tsv"), + drugfile, + paste0("/tmp/", out_prefix, "_pdx") +)) + + + +message("Wrote PDX experiments to ", "/tmp/", out_prefix, "_pdx_experiments.tsv and combinations") + + +# ──────────────────────────────────────────────── +# Combine all Experiments +# ──────────────────────────────────────────────── + +# Read MicroTissue experiments +mt_exp <- fread(paste0("/tmp/", out_prefix, "_mt_experiments.tsv")) %>% + mutate( + dose_response_value = as.character(dose_response_value) + ) + +# Read PDX experiments +pdx_exp <- fread(paste0("/tmp/", out_prefix, "_pdx_experiments.tsv")) %>% + mutate( + dose_response_value = as.character(dose_response_value) + ) + +# Join experiments into one. +all_exp <- bind_rows(mt_exp, pdx_exp) + +# Write out Experiments +fwrite(all_exp, paste0("/tmp/", out_prefix, "_experiments.tsv"), sep = "\t") +message("Wrote combined experiments: /tmp/", out_prefix, "_experiments.tsv") + + +# Rename the Drug Combination data file to fit schema naming +file.rename( + paste0("/tmp/", out_prefix, "_pdx_combinations.tsv"), + paste0("/tmp/", out_prefix, "_combinations.tsv") +) \ No newline at end of file diff --git a/build/mpnst/build_exp.sh b/build/mpnst/build_exp.sh index a9a2b763..14506cfe 100644 --- a/build/mpnst/build_exp.sh +++ b/build/mpnst/build_exp.sh @@ -3,5 +3,7 @@ set -euo pipefail trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR -echo "Running 03_get_drug_response_data.R with $SYNAPSE_AUTH_TOKEN, $1, and $2." -Rscript 03_get_drug_response_data.R $SYNAPSE_AUTH_TOKEN $1 $2 +echo "Running 03_get_experiments.R with $SYNAPSE_AUTH_TOKEN, $1, and $2." +Rscript 03_get_experiments.R $SYNAPSE_AUTH_TOKEN $1 $2 mpnst +rm /tmp/mpnst_pdx_experiments.tsv /tmp/mpnst_mt_experiments.tsv /tmp/mpnst_mt_curve_data.tsv /tmp/mpnst_pdx_curve_data.tsv + diff --git a/build/mpnst/build_omics.sh b/build/mpnst/build_omics.sh index b08ac63d..d6d2cec7 100644 --- a/build/mpnst/build_omics.sh +++ b/build/mpnst/build_omics.sh @@ -3,5 +3,5 @@ set -euo pipefail trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR -echo "Running 01_mpnst_get_omics.R with $SYNAPSE_AUTH_TOKEN, $2, and $1." -Rscript 01_mpnst_get_omics.R $SYNAPSE_AUTH_TOKEN $2 $1 +echo "Running 01_combined_omics.R with $SYNAPSE_AUTH_TOKEN, $2, and $1." 
+Rscript 01_combined_omics.R $SYNAPSE_AUTH_TOKEN $2 $1 diff --git a/build/mpnst/requirements.r b/build/mpnst/requirements.r index 7796236d..e8bfac35 100755 --- a/build/mpnst/requirements.r +++ b/build/mpnst/requirements.r @@ -9,3 +9,5 @@ install.packages("data.table") install.packages("R.utils") install.packages("stringr") install.packages("tidyr") +install.packages("readr") +install.packages("readxl") diff --git a/build/mpnst/requirements.txt b/build/mpnst/requirements.txt index 27c4dc2a..8f07cbd2 100755 --- a/build/mpnst/requirements.txt +++ b/build/mpnst/requirements.txt @@ -8,4 +8,5 @@ scikit-learn scipy requests mordredcommunity -rdkit \ No newline at end of file +rdkit +statsmodels \ No newline at end of file diff --git a/build/mpnst/sample_gen.R b/build/mpnst/sample_gen.R deleted file mode 100644 index 3d19fa85..00000000 --- a/build/mpnst/sample_gen.R +++ /dev/null @@ -1,25 +0,0 @@ -# This script generate a new sample table based on pervious beatAML improved sample ID -# It will take the maximum value of beatAML improved sample ID and continue from ID count from there -# Load required libraries -library(data.table) - -main <- fread("mpnst/NF_MPNST_samples.csv") -previous_aml <- fread("beatAML/beataml_samples.csv") -max_id <- max(previous_aml$improve_sample_id) -main$improve_sample_id <- seq(from = max_id + 1, length.out = nrow(main)) - -synapse_main <- fread("mpnst/synapse_NF-MPNST_samples.csv") -# Step 1: Create a dictionary from 'main' -id_dict <- setNames(main$improve_sample_id, main$other_id) - -# Step 2: Update 'ID' in 'synapse_main' -synapse_main$ID <- id_dict[synapse_main$Sample] - -# Handling NA values if any mismatch occurs (Optional based on your data integrity) -# If there are NAs generated, you might need to check for unmatched keys -# synapse_main$ID[is.na(synapse_main$ID)] <- -1 # Assign a placeholder like -1 for unmatched rows - -# Step 3: Save the updated 'synapse_main' -fwrite(synapse_main, "mpnst/synapse_NF-MPNST_samples.csv") -fwrite(main, "mpnst/NF_MPNST_samples.csv") # updated sample file - diff --git a/build/mpnstpdx/01_mpnstpdx_get_omics.R b/build/mpnstpdx/01_mpnstpdx_get_omics.R deleted file mode 100755 index 86e3cbb8..00000000 --- a/build/mpnstpdx/01_mpnstpdx_get_omics.R +++ /dev/null @@ -1,195 +0,0 @@ -# Load required libraries -library(data.table) -# library(biomaRt)# biomart issues still exist -library(synapser) -library(dplyr) - -# Retrieve command line arguments -args <- commandArgs(trailingOnly = TRUE) - -# Check if a token was provided -if (length(args) == 0) { - stop("No token or sample file provided. Usage: Rscript my_script.R [samples] [genes]", call. = FALSE) -} - -# Set your personal access token -PAT <- args[1] -patients <- args[2] -genefile <- args[3] - -# Log in to Synapse -synLogin(authToken = PAT) - -# Define the Ensembl mart # biomart issues still exist -# ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl") # biomart issues still exist; fix later... 
- -# Path to the directory to save .sf files -#path <- "./tmp" -#dir.create(path, showWarnings = FALSE) - -# Read the sample mapping CSV and genes.csv -samples_df <- fread(patients)|> - dplyr::select(improve_sample_id,common_name,model_type)|> - distinct()#"mpnst/synapse_NF-MPNSTpdx_samples.csv") - -pdx_samps<-subset(samples_df,model_type=='patient derived xenograft') -tumor_samps<-subset(samples_df,model_type=='tumor') - -##now get the manifest from synapse -manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> - as.data.frame()|> - dplyr::rename(common_name='Sample') - - -##for now we only have tumor and pdx data -##they each get their own sample identifier -pdx_data<-manifest|>dplyr::select(common_name,starts_with("PDX"))|> - left_join(pdx_samps)|> - dplyr::select(improve_sample_id,RNASeq='PDX_RNASeq',Mutations='PDX_Somatic_Mutations',CopyNumber='PDX_CNV',Proteomics='PDX_Proteomics') - -tumor_data<- manifest|>dplyr::select(common_name,starts_with("Tumor"))|> - left_join(tumor_samps)|> - dplyr::select(improve_sample_id,RNASeq='Tumor_RNASeq',Mutations='Tumor_Somatic_Mutations',CopyNumber='Tumor_CNV')|> - mutate(Proteomics='') ##we dont have tumor proteomics from these samples -#print(tumor_data) - - -pdx_data<-rbind(pdx_data,tumor_data)|>distinct() - -# gene mapping table -genes_df <- fread(genefile) - - -##added proteomics first -proteomics<-do.call('rbind',lapply(setdiff(pdx_data$Proteomics,c('',NA,"NA")),function(x){ - # if(x!=""){ - #print(x) - sample<-subset(pdx_data,Proteomics==x) - #print(sample) - res<-fread(synGet(x)$path)|> - #tidyr::separate(Name,into=c('other_id','vers'),sep='\\.')|> - #dplyr::select(-vers)|> - dplyr::rename(gene_symbol='Gene')|> - left_join(genes_df)|> - dplyr::select(entrez_id,proteomics='logRatio')|> - distinct()|> - subset(!is.na(entrez_id))|> - subset(proteomics!=0) - - res$improve_sample_id=rep(sample$improve_sample_id[1],nrow(res)) - res$source=rep('NF Data Portal',nrow(res)) - res$study=rep('MPNST PDX',nrow(res)) - return(distinct(res)) - # } -})) - -fwrite(proteomics,'/tmp/mpnstpdx_proteomics.csv.gz') - - -#### FIRST WE GET RNASeq Data - -rnaseq<-do.call('rbind',lapply(setdiff(pdx_data$RNASeq,c(NA,"NA")),function(x){ - # if(x!=""){ - #print(x) - sample<-subset(pdx_data,RNASeq==x) - #print(sample) - res<-fread(synGet(x)$path)|> - tidyr::separate(Name,into=c('other_id','vers'),sep='\\.')|> - dplyr::select(-vers)|> - left_join(genes_df)|> - dplyr::select(entrez_id,transcriptomics='TPM')|> - subset(!is.na(entrez_id))|> - subset(transcriptomics!=0) - - res$improve_sample_id=rep(sample$improve_sample_id[1],nrow(res)) - res$source=rep('NF Data Portal',nrow(res)) - res$study=rep('MPNST PDX',nrow(res)) - return(distinct(res)) - # } -})) - -fwrite(rnaseq,'/tmp/mpnstpdx_transcriptomics.csv.gz') - - - -#####NEXT WE DO WES DATA -print("Getting WES") -wes<-do.call(rbind,lapply(setdiff(pdx_data$`Mutations`,c(NA,"NA")),function(x){ - - x2=x#gsub('"','',gsub("[",'',gsub("]",'',x,fixed=T),fixed=T),fixed=T) - print(x) - sample<-subset(pdx_data,Mutations==x) - print(sample$improve_sample_id) - res<-NULL - try(res<-fread(synGet(x2)$path)|> - dplyr::select(entrez_id='Entrez_Gene_Id',mutation='HGVSc',variant_classification='Variant_Classification')|> - subset(entrez_id%in%genes_df$entrez_id)|> - distinct()) - if(is.null(res)) - return(NULL) - - res$improve_sample_id=rep(sample$improve_sample_id[1],nrow(res)) - res$source=rep('NF Data Portal',nrow(res)) - res$study=rep('MPNST PDX',nrow(res)) - - return(distinct(res)) - # } -})) - 
-fwrite(wes,'/tmp/mpnstpdx_mutations.csv.gz') - - -print(paste("getting CNV")) -##next let's do CNVs! -cnv<-do.call(rbind,lapply(setdiff(pdx_data$CopyNumber,c(NA,"NA")),function(x){ - - x2=x#gsub('"','',gsub("[",'',gsub("]",'',x,fixed=T),fixed=T),fixed=T) - print(x) - sample<-subset(pdx_data,CopyNumber==x) - print(sample$improve_sample_id) - res<-fread(synGet(x2)$path) - - long_df<- res|> - tidyr::separate_rows(gene,sep=',')|> - dplyr::rename(gene_symbol='gene')|> - dplyr::left_join(genes_df)|> - subset(!is.na(entrez_id))|> - dplyr::select(entrez_id,log2)|> - dplyr::distinct()|> - dplyr::mutate(copy_number=2^log2)|> - dplyr::select(-log2) - - res<-long_df|> ##deep del < 0.5210507 < het loss < 0.7311832 < diploid < 1.214125 < gain < 1.422233 < amp - dplyr::mutate(copy_call=ifelse(copy_number<0.5210507,'deep del', - ifelse(copy_number<0.7311832,'het loss', - ifelse(copy_number<1.214125,'diploid', - ifelse(copy_number<1.422233,'gain','amp')))))|> - mutate(study='MPNST PDX',source='NF Data Portal',improve_sample_id=sample$improve_sample_id[1])|> - dplyr::distinct() - - # long_df <- res[, strsplit(as.character(gene), ","), by = .(chromosome, start, end, depth, log2)] - # filtered_df <- long_df |> - # subset(is.finite(log2))|> - # filter(V1 %in% genes_df$gene) # get only protein coding genes and remove empty gene symbols - # filtered_df <- filtered_df[, .(gene_symbol = V1, - # improve_sample_id = sample$improve_sample_id[1], - # copy_number = 2^log2, - # source = "NF Data Portal", - # study = "MPNST PDX")] - # res<-filtered_df|> ##deep del < 0.5210507 < het loss < 0.7311832 < diploid < 1.214125 < gain < 1.422233 < amp - # dplyr::mutate(copy_call=ifelse(copy_number<0.5210507,'deep del', - # ifelse(copy_number<0.7311832,'het loss', - # ifelse(copy_number<1.214125,'diploid', - # ifelse(copy_number<1.422233,'gain','amp')))))|> - # left_join(genes_df)|> - # dplyr::select(entrez_id,improve_sample_id,copy_number,copy_call,study,source)|> - # subset(!is.na(entrez_id))|> - # distinct() - # res|>group_by(copy_call)|>summarize(n_distinct(entrez_id)) - return(res) - # } -})) - -fwrite(cnv,'/tmp/mpnstpdx_copy_number.csv.gz') - -##TODO: get proteomics!!! diff --git a/build/mpnstpdx/02_get_drug_data.R b/build/mpnstpdx/02_get_drug_data.R deleted file mode 100644 index 1f6ad47e..00000000 --- a/build/mpnstpdx/02_get_drug_data.R +++ /dev/null @@ -1,120 +0,0 @@ -# Load required libraries -library(data.table) -# library(biomaRt)# biomart issues still exist -library(dplyr) -library(stringr) -library(reticulate) -library(synapser) -library(tidyr) - - -# Retrieve command line arguments -args <- commandArgs(trailingOnly = TRUE) - -# Check if a token was provided -if (length(args) == 0) { - stop("No token or sample file provided. Usage: Rscript my_script.R [olddrugfile] [newdrugfile]", call. 
= FALSE) -} - -# Set your personal access token -PAT <- args[1] -olddrugfiles <- args[2] -newdrugfile <- args[3] -# Log in to Synapse -synLogin(authToken = PAT) - - -##now get the manifest from synapse -manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> - as.data.frame()|> - dplyr::rename(common_name='Sample') - - -##PDX contain list of files -pdx<-manifest|> - dplyr::select(common_name,PDX_Drug_Data)|> - distinct()|> - subset(!is.na(PDX_Drug_Data)) - - - - - -##define functions - -#print(pdx) -##now loop through manifest to get all the files -pdx_fold <- data.table(pdx)[,strsplit(as.character(PDX_Drug_Data),","), by = .(common_name)]|> - subset(!is.na(V1))|> - subset(V1!='NA')|> - dplyr::rename(id='V1') - -#print(pdx_fold) -###this is not all of themju -pdx_meta<-do.call(rbind,lapply(pdx_fold$id, function(x) synapser::synGetAnnotations(x)|> - as.data.frame()|> - dplyr::select('experimentalCondition')|> - dplyr::mutate(id=x)))|> - left_join(pdx_fold)|> - tidyr::separate_rows(experimentalCondition,sep=';')|> - mutate(chem_name=tolower(experimentalCondition)) - -#pdx_drug <- data.table(pdx_meta)[,strsplit(as.character(experimentalCondition),';'),by= .(common_name,id)]|> -# mutate(drug=tolower(experimentalCondition)) -#drugs<-sapply(pdx_meta$experimentalCondition,function(x) tolower(unlist(strsplit(x,split=';'))))|> -# unlist()|> -# unique() - -drugs<-setdiff(pdx_meta$chem_name,'control') - - -print(paste(drugs,collapse=',')) - - -##copy old drug to new drug -olddrugs<-do.call(rbind,lapply(unique(unlist(strsplit(olddrugfiles,split=','))),function(x) read.table(x,header=T,sep='\t',quote='',comment.char=''))) -olddrugs<-unique(olddrugs) - -print(paste('Read in ',nrow(olddrugs),'old drug files')) - -fdrugs<-subset(olddrugs,chem_name%in%drugs) -if(nrow(fdrugs)>0){ - dids<-fdrugs$improve_drug_id -}else{ - dids<-c() -} -newdrugs<-subset(olddrugs,improve_drug_id%in%dids) - -print(paste('Found',length(dids),'improved drug ids that exist, saving those')) - - - #file.copy(olddrugfile,newdrugfile) -write.table(newdrugs,file=newdrugfile,sep='\t',row.names=F,quote=FALSE,col.names=T) -output_file_path <- newdrugfile -ignore_file_path <- '/tmp/mpnstpdx_ignore_chems.txt' - - -##now load reticulate down here - - - -use_python("/opt/venv/bin/python3", required = TRUE) -source_python("pubchem_retrieval.py") - -update_dataframe_and_write_tsv(unique_names=drugs,output_filename=output_file_path,ignore_chems=ignore_file_path) - - -tab<-read.table(newdrugfile,sep='\t',header=T,quote="",comment.char="") - -newdrugs<-tab|> - subset(chem_name%in%tolower(alldrugs)) - -tab<-tab|> - subset(improve_drug_id%in%newdrugs$improve_drug_id) - -write.table(tab,file=newdrugfile,sep='\t',row.names=FALSE,quote=FALSE) - - -##now call the python drug script - - diff --git a/build/mpnstpdx/03_get_drug_response_data.R b/build/mpnstpdx/03_get_drug_response_data.R deleted file mode 100644 index 095dba34..00000000 --- a/build/mpnstpdx/03_get_drug_response_data.R +++ /dev/null @@ -1,174 +0,0 @@ -# Load required libraries -library(data.table) -# library(biomaRt)# biomart issues still exist -library(synapser) -library(dplyr) -library(stringr) -# Retrieve command line arguments -args <- commandArgs(trailingOnly = TRUE) - -# Check if a token was provided -if (length(args) == 0) { - stop("No token or sample file provided. Usage: Rscript my_script.R [samples] [drugs]", call. 
= FALSE) -} - -# Set your personal access token -PAT <- args[1] -patients <- args[2] -drugfile <- args[3] - -# Log in to Synapse -synLogin(authToken = PAT) - - -# Read the sample mapping CSV and genes.csv -samples_df <- fread(patients)|> - dplyr::select(improve_sample_id,common_name,model_type)|> - distinct()#"mpnst/synapse_NF-MPNST_samples.csv") -print(head(samples_df)) - -pdx_samps<-subset(samples_df,model_type=='patient derived xenograft') -org_samps<-subset(samples_df,model_type=='organoid') - -##now get the manifest from synapse -manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> - as.data.frame()|> - dplyr::rename(common_name='Sample') - - -##PDX contain list of files -pdx<-manifest|> - dplyr::select(common_name,PDX_Drug_Data)|> - subset(!PDX_Drug_Data%in%c("NA",NA))|> - left_join(pdx_samps)|> - distinct() - -print(pdx) - - -# Modify the extract_date_hour function to return a named vector -extract_date_hour <- function(experiment_id) { - pattern <- "(\\d{6})_?(\\d{2,3})?" - matches <- str_match(experiment_id, pattern) - date <- matches[, 2] - hour <- matches[, 3] - date[is.na(date)] <- NA # Replace with NA instead of blank - hour[is.na(hour)] <- 48 # Replace with 48 instead of blank (default) - return(list(date = date, hour = hour)) -} - - - -##define functions - -##first function to get children from parentId - -##now loop through manifest to get all the files -#mts_fold <- data.table(mts)[,strsplit(as.character(MicroTissueDrugFolder),","), by = .(improve_sample_id,common_name)] - - - -##do the drug matching -drug_df<-fread(drugfile)|> - dplyr::select('improve_drug_id','chem_name')|> - distinct() - -##update drug name PD901 since it's mussing -##now loop through manifest to get all the files -pdx_fold <- data.table(pdx)[,strsplit(as.character(PDX_Drug_Data),","), by = .(common_name)]|> - dplyr::rename(id='V1')|> - subset(!is.na(id)) - -pdx_meta<-do.call(rbind,lapply(pdx_fold$id, function(x) synapser::synGetAnnotations(x)|> - as.data.frame()|> - dplyr::select('experimentalCondition')|> - dplyr::mutate(id=x)))|>left_join(pdx_fold)|> - # tidyr::separate_rows(experimentalCondition,sep=';')|> - # mutate(chem_name=tolower(experimentalCondition))|> - # left_join(drug_df)|> - left_join(pdx_samps)|> - dplyr::select(improve_sample_id,id)|> - distinct()|> - subset(!is.na(id)) -pdx_meta$parentId=unlist(lapply(pdx_meta$id,function(x) synGet(x)$parentId)) - -##the older pdx data is in separate files. the newer is not. 
-#we need to reformat the older to look like the newer
-oldfolders=c('syn22018363','syn22024460','syn22024428','syn22024429','syn22024437','syn22024438')
-
-old_meta<-subset(pdx_meta,parentId%in%oldfolders)
-
-old_data<-do.call(rbind,lapply(unique(old_meta$parentId),function(x){
-  ids<-subset(old_meta,parentId==x)|>
-    subset(!is.na(id))
-
-  do.call(rbind,lapply(ids$id,function(y){
-    tab<-readr::read_csv(synapser::synGet(y)$path)
-    print(head(tab))
-    tab<-dplyr::select(tab,c('specimen_id','compound_name','dose','dose_unit',
-                             'experimental_time_point','experimental_time_point_unit',
-                             'assay_type','assay_value','assay_units'))|>
-      mutate(id=x)|>
-      mutate(chem_name=tolower(compound_name))
-
-    # tab$single_or_combo=sapply(tab$chem_name,function(z) ifelse(length(grep('\\+',z))>0,'combo','single'))
-    tab$chem_name=gsub('n/a','control',tab$chem_name)|>
-      tidyr::replace_na('control')
-
-    tab$chem_name=sapply(tab$chem_name,function(z) ifelse(z=='doxorubinsin','doxorubicin',z))
-    # tab<-tab|>left_join(drug_df)
-    #print(head(tab))
-    return(tab)
-  }))
-}))|>
-  left_join(unique(select(old_meta,id=parentId,improve_sample_id)))|>
-  dplyr::select(experiment=id,model_id=improve_sample_id,specimen_id,treatment=chem_name,time=experimental_time_point,time_unit=experimental_time_point_unit,volume=assay_value)|>distinct()
-
-
-
-new_meta<-subset(pdx_meta,!parentId%in%oldfolders)
-
-##now combine each of the old pdx files into single files
-#each file has all experiments in it
-new_data<-do.call(rbind,lapply(unique(new_meta$id), function(x){
-  fpath=synapser::synGet(x)$path
-  if(length(grep('xls',fpath))>0){
-    tab<-readxl::read_xlsx(fpath)
-  }else{
-    tab<-readr::read_csv(fpath)
-  }
-  print(head(tab))
-  tab<-dplyr::select(tab,c('specimen_id','compound_name','dose','dose_unit',
-                           'experimental_time_point','experimental_time_point_unit',
-                           'assay_type','assay_value','assay_units'))|>
-    mutate(id=x)
-
-  # tab$single_or_combo=sapply(tab$compound_name,function(x) ifelse(length(grep('\\+',x))>0,'combo','single'))
-  tab$compound_name=gsub('N/A','control',tab$compound_name)|>tidyr::replace_na('control')
-  tab<-tab|>
-    mutate(chem_name=tolower(compound_name))#|>
-    # left_join(drug_df)
-  #print(head(tab))
-  return(tab)}))|>
-  left_join(pdx_meta)|>
-  dplyr::select(experiment=id,model_id=improve_sample_id,specimen_id,treatment=chem_name,time=experimental_time_point,time_unit=experimental_time_point_unit,volume=assay_value)|>distinct()
-
-##maybe tweak the data frame a bit depending on curve fitting script
-pdx_data<-rbind(old_data,new_data)
-
-#single_pdx<-subset(pdx_data,single_or_combo=='single')
-#combo_pdx<-subset(pdx_data,single_or_combo=='combo')
-#print(head(pdx_data))
-fwrite(pdx_data,'/tmp/curve_data.tsv',sep='\t')
-
-##TODO: create new curve fitting script in python
-pycmd = '/opt/venv/bin/python calc_pdx_metrics.py --input /tmp/curve_data.tsv --outprefix /tmp/mpnstpdx'
-print('running curve fitting')
-#system(pycmd)
-
-##now read in data again, separate out by single/combo, then map to drug id
-
-##mmve file name
-#file.rename('/tmp/experiments.0','/tmp/mpnstpdx_experiments.tsv')
-
-
diff --git a/build/mpnstpdx/README.md b/build/mpnstpdx/README.md
deleted file mode 100755
index b0059283..00000000
--- a/build/mpnstpdx/README.md
+++ /dev/null
@@ -1,47 +0,0 @@
-## Build Instructions for MPNST PDX Dataset
-
-To build the MPNST PDX dataset, follow these steps from the coderdata root
-directory. Currently using the test files as input.
-
-1. Build the Docker image:
-   ```
-   docker build -f build/docker/Dockerfile.mpnstpdx -t mpnstpdx . --build-arg HTTPS_PROXY=$HTTPS_PROXY
-   ```
-
-2. Generate new identifiers for these samples to create a
-   `mpnstpdx_samples.csv` file. This pulls from the latest synapse
-   project metadata table.
-   ```
-   docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnstpdx sh build_samples.sh /tmp/build/build_test/test_samples.csv
-   ```
-
-3. Pull the data and map it to the samples. This uses the metadata
-   table pulled above.
-   ```
-   docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnstpdx sh build_omics.sh /tmp/build/build_test/test_genes.csv /tmp/mpnstpdx_samples.csv
-   ```
-
-4. Process drug data
-   ```
-   docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnstpdx sh build_drugs.sh /tmp/build/build_test/test_drugs.tsv
-   ```
-
-5. Process experiment data. This uses the metadata from above as well as the file directory on synapse:
-   ```
-   docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnstpdx sh build_exp.sh /tmp/mpnstpdx_samples.csv /tmp/mpnstpdx_drugs.tsv.gz
-   ```
-
-Please ensure that each step is followed in order for correct dataset compilation.
-
-## MPNST PDX Dataset Structure
-The MPNST dataset includes the following output files:
-```
-├── mpnstpdx_samples.csv
-├── mpnstpdx_transcriptomics.csv
-├── mpnstpdx_mutations.csv
-├── mpnstpdx_copy_number.csv
-├── mpnstpdx_drugs.tsv
-├── mpnstpdx_drug_descriptors.tsv.gz
-├── mpnstpdx_experiments.tsv.gz
-```
-
diff --git a/build/mpnstpdx/build_drugs.sh b/build/mpnstpdx/build_drugs.sh
deleted file mode 100644
index 78502bc7..00000000
--- a/build/mpnstpdx/build_drugs.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-##get drug data
-Rscript 02_get_drug_data.R $SYNAPSE_AUTH_TOKEN $1 /tmp/mpnstpdx_drugs.tsv
-##get drug descriptors
-/opt/venv/bin/python3 build_drug_desc.py --drugtable /tmp/mpnstpdx_drugs.tsv --desctable /tmp/mpnstpdx_drug_descriptors.tsv.gz
\ No newline at end of file
diff --git a/build/mpnstpdx/build_exp.sh b/build/mpnstpdx/build_exp.sh
deleted file mode 100644
index 4e34f6b3..00000000
--- a/build/mpnstpdx/build_exp.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-Rscript 03_get_drug_response_data.R $SYNAPSE_AUTH_TOKEN $1 $2
-/opt/venv/bin/python3 calc_pdx_metrics.py /tmp/curve_data.tsv --drugfile=/tmp/mpnstpdx_drugs.tsv --outprefix=/tmp/mpnstpdx
diff --git a/build/mpnstpdx/build_omics.sh b/build/mpnstpdx/build_omics.sh
deleted file mode 100644
index 969b4fba..00000000
--- a/build/mpnstpdx/build_omics.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-set -euo pipefail
-
-trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
-
-echo "Running 01_mpnstpdx_get_omics.R with $SYNAPSE_AUTH_TOKEN, $2, and $1."
-Rscript 01_mpnstpdx_get_omics.R $SYNAPSE_AUTH_TOKEN $2 $1
diff --git a/build/mpnstpdx/build_samples.sh b/build/mpnstpdx/build_samples.sh
deleted file mode 100644
index aa88aa02..00000000
--- a/build/mpnstpdx/build_samples.sh
+++ /dev/null
@@ -1 +0,0 @@
-cp /tmp/mpnst_samples.csv /tmp/mpnstpdx_samples.csv
diff --git a/build/mpnstpdx/requirements.r b/build/mpnstpdx/requirements.r
deleted file mode 100755
index e6139cd4..00000000
--- a/build/mpnstpdx/requirements.r
+++ /dev/null
@@ -1,13 +0,0 @@
-install.packages('reticulate', repos='https://cloud.r-project.org')
-reticulate::use_virtualenv('/opt/venv', required = TRUE)
-install.packages('remotes')
-remotes::install_version('rjson', version = '0.2.21', repos = 'https://cloud.r-project.org')
-install.packages('synapser', repos = c('http://ran.synapse.org', 'https://cloud.r-project.org'))
-install.packages("dplyr")
-install.packages("data.table")
-install.packages("synapser", repos = c("http://ran.synapse.org", "https://cloud.r-project.org"))
-install.packages("R.utils")
-install.packages("stringr")
-install.packages("tidyr")
-install.packages('readr')
-install.packages("readxl")
diff --git a/build/mpnstpdx/requirements.txt b/build/mpnstpdx/requirements.txt
deleted file mode 100755
index b0944928..00000000
--- a/build/mpnstpdx/requirements.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-pyarrow
-pandas
-matplotlib
-numpy==1.26.4
-argparse
-tqdm
-scikit-learn
-scipy
-requests
-mordredcommunity
-rdkit
-statsmodels
diff --git a/build/utils/calc_pdx_metrics.py b/build/utils/calc_pdx_metrics.py
index e0f4c05a..83876dd3 100755
--- a/build/utils/calc_pdx_metrics.py
+++ b/build/utils/calc_pdx_metrics.py
@@ -184,7 +184,12 @@ def AUC(time, volume, time_normalize=True):
         dict: Dictionary containing the AUC value.
     """
     auc = trapz_auc(time, volume)
-    #print(time)
+    #print('at line 187')
+    #print(time.shape)
+    #print(time.dtype)
+    #print(np.max(time.astype(int)))
+    #print('auc is : ')
+    #print(auc)
     if time_normalize:
         auc = auc/np.max(time)
     return {"metric": "auc", "value": auc, 'time':np.max(time)}
@@ -270,10 +275,15 @@ def lmm(time, volume, treatment, drug_name):
         raise ValueError("These columns must be present: 'model_id', 'volume', 'time', 'exp_type'")

     data['log_volume'] = np.log(data['volume'])
-
+    #print('drug name is ' + drug_name)
+    data['exp_type'] = data['exp_type'].astype('category')
+    data['exp_type']=pd.Categorical(data['exp_type'],categories = ['control',drug_name], ordered=True)
+    #print(data)
+    #print(data['exp_type'].cat.categories)
     # Define the formula for mixed linear model
     formula = 'log_volume ~ time*exp_type'

+    #print(data['exp_type'].cat.categories)
     # Fit the model
     model = mixedlm(formula, data, groups=data['model_id'])
     fit = model.fit()
@@ -284,6 +294,7 @@ def lmm(time, volume, treatment, drug_name):
     # time_coef_value = fit.params['time']
     #print(fit.params)
     i_coef_value = fit.params['time:exp_type[T.'+drug_name+']']
+    #i_coef_value = fit.params['time:exp_type['+drug_name+']']
     # else:
     #     coef_value = None  # Handle the case when the interaction term is not present

@@ -301,6 +312,8 @@ def main():
     parser.add_argument('curvefile')
     parser.add_argument('--drugfile')
     parser.add_argument('--outprefix',default='/tmp/')
+    parser.add_argument('--study')
+    parser.add_argument('--source')

     args = parser.parse_args()

@@ -314,20 +327,21 @@ def main():

     expsing = expsing.dropna()
     # source improve_sample_id improve_drug_id study time time_unit dose_response_metric dose_response_value
-
-    combos[['drug1','drug2']]=combos.drug.str.split('+',expand=True)
-    combos = combos.rename({'metric':'drug_combination_metric','value':'drug_combination_value','sample':'improve_sample_id'},axis=1).dropna()
-
-    expcomb = combos.rename({'drug1':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_1'},axis=1)[['improve_drug_1','drug2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']]
-    expcomb = expcomb.rename({'drug2':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_2'},axis=1)[['improve_drug_1','improve_drug_2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']]
-
-    expcomb[['source']]='Synapse'
-    expcomb[['study']]='MPNST PDX in vivo'
-
-    expsing[['source']]='Synapse'
-    expsing[['study']]='MPNST PDX in vivo'
+    if combos.shape[0]> 0:
+        combos[['drug1','drug2']]=combos['drug'].str.split('+',expand=True)
+
+        combos = combos.rename({'metric':'drug_combination_metric','value':'drug_combination_value','sample':'improve_sample_id'},axis=1).dropna()
+
+        expcomb = combos.rename({'drug1':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_1'},axis=1)[['improve_drug_1','drug2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']]
+        expcomb = expcomb.rename({'drug2':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_2'},axis=1)[['improve_drug_1','improve_drug_2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']]
+        expcomb[['source']]=args.source
+        expcomb[['study']]=args.study
+        expcomb.to_csv(args.outprefix+'_combinations.tsv',index=False, sep="\t")
+
+    expsing[['source']]=args.source
+    expsing[['study']]=args.study
     expsing.to_csv(args.outprefix+'_experiments.tsv',index=False, sep="\t")
-    expcomb.to_csv(args.outprefix+'_combinations.tsv',index=False, sep="\t")
+    #expcomb.to_csv(args.outprefix+'_combinations.tsv',index=False, sep="\t")



@@ -341,21 +355,25 @@ def get_drug_stats(df, control='control'):
     for name, group in tqdm(groups):
         # Each group contains multiple treatments and a control
         drugs = set(group.treatment) - set([control])
-        print(name[0])
-        print(drugs)
+        #print('line 355')
+        #print(name[0])
+        #print(drugs)
         mod = list(set(group.model_id))[0]
         ctl_data = group[group.treatment == control]
         ctl_time = np.array(ctl_data.time)
         ctl_volume = np.array(ctl_data.volume)
-
+        if (ctl_volume.shape[0] < 2):
+            continue
         ctl_auc = AUC(ctl_time, ctl_volume)

         for d in drugs:
-            print(d)
-            d_data = group[group.treatment == d]
+            #print('is our drug a string or dict?')
+            #print(str(d))
+            d_data = group[group.treatment == str(d)]
             treat_time = np.array(d_data.time)
             treat_volume = np.array(d_data.volume)
-
+            if (treat_volume.shape[0] < 2):
+                continue
             # Get ABC for group
             treat_auc = AUC(treat_time, treat_volume)
             treat_abc = ABC(ctl_time, ctl_volume, treat_time, treat_volume)
@@ -368,6 +386,7 @@ def get_drug_stats(df, control='control'):

             #llm
             comb = pd.concat([ctl_data, d_data])
+            #print(comb)
             lmm_res = lmm(comb.time, comb.volume, comb.treatment, d)
             lmm_res.update({'sample': mod, 'drug': d, 'time': np.max(treat_time), 'time_unit': 'days'})
             if '+' in d:
@@ -396,4 +415,4 @@ def get_drug_stats(df, control='control'):
     return sing, comb

 if __name__=='__main__':
-    main()
+    main()
\ No newline at end of file
diff --git a/schema/expected_files.yaml b/schema/expected_files.yaml
index 4cce4283..8035ff99 100644
--- a/schema/expected_files.yaml
+++ b/schema/expected_files.yaml
@@ -43,24 +43,6 @@ datasets:
     - target_class: Drug Descriptor
       file: /tmp/mpnst_drug_descriptors.tsv

-  mpnstpdx:
-    - target_class: Sample
-      file: /tmp/mpnstpdx_samples.csv
-    - target_class: Transcriptomics
-      file: /tmp/mpnstpdx_transcriptomics.csv
-    - target_class: Proteomics
-      file: /tmp/mpnstpdx_proteomics.csv
-    - target_class: Mutations
-      file: /tmp/mpnstpdx_mutations.csv
-    - target_class: Copy Number
-      file: /tmp/mpnstpdx_copy_number.csv
-    - target_class: Experiments
-      file: /tmp/mpnstpdx_experiments.tsv
-    - target_class: Drug
-      file: /tmp/mpnstpdx_drugs.tsv
-    - target_class: Drug Descriptor
-      file: /tmp/mpnstpdx_drug_descriptors.tsv
-
   cptac:
     - target_class: Sample
       file: /tmp/cptac_samples.csv