Skip to content

Combine mpnst and mpnstpdx Datasets #425

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from
Jul 25, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
6d1bf3e
added samples and omics files
alexandriai168 Apr 24, 2025
d74ac3c
Merge remote-tracking branch 'origin' into novartisPDX
alexandriai168 May 1, 2025
c35ec76
created get omics data function
alexandriai168 May 1, 2025
3798bfb
add novartispdx sample file
May 8, 2025
5bd7262
added get_copy_call.py to utils
alexandriai168 May 9, 2025
8cd287c
Merge branch 'novartisPDX-samples' of https://github.com/PNNL-CompBio…
alexandriai168 May 9, 2025
9b664bf
added copy number function
alexandriai168 May 13, 2025
7c37952
added transcriptomics function
alexandriai168 May 13, 2025
08c2f2f
Addition of drugs and experiments, alterations to calc_pdx_metrics
Jul 1, 2025
9d36112
progress on experiments data
Jul 14, 2025
a9164ca
final script for novartispdx and some changes to calc_pdx_metrics.py
Jul 16, 2025
481385e
Datasets Merged, Build works, Validate works
jjacobson95 Jul 21, 2025
3cf23f8
Merge remote-tracking branch 'refs/remotes/origin/main'
jjacobson95 Jul 21, 2025
b3824fb
Drop JH-2-009 Sample due to contamination
jjacobson95 Jul 22, 2025
65e2f01
Should be ready
jjacobson95 Jul 22, 2025
19adfc5
Made final changes so this is ready for build process
jjacobson95 Jul 22, 2025
1cecc29
Merge branch 'novartisPDX-drugs-experiments' into mpnst_dataset_join
jjacobson95 Jul 23, 2025
e811969
Made update in response to merge from novartis branch
jjacobson95 Jul 24, 2025
91c9079
Import calc_pdx_metrics.py from novartisPDX-drugs-experiments
jjacobson95 Jul 24, 2025
729db17
Merge remote-tracking branch 'refs/remotes/origin/mpnst_dataset_join'…
jjacobson95 Jul 24, 2025
e5f7f3a
Revert "Merge branch 'novartisPDX-drugs-experiments' into mpnst_datas…
jjacobson95 Jul 24, 2025
b958e83
Revert "Made update in response to merge from novartis branch"
jjacobson95 Jul 24, 2025
fca76bd
Fixed git issue. Adding pdx update from novartis and local change
jjacobson95 Jul 24, 2025
9533703
All working now
jjacobson95 Jul 24, 2025
8c2f4c0
removed cptac
jjacobson95 Jul 25, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions build/build_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def main():
parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.")
parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands")
parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.")
parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo,bladderpdo,sarcpdo,liverpdo',help='Datasets to process. Defaults to all available.')
parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,pancpdo,bladderpdo,sarcpdo,liverpdo,mpnst',help='Datasets to process. Defaults to all available.')
parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. This must be a higher version than previously published versions.')
parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.')
parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.')
Expand Down Expand Up @@ -119,7 +119,6 @@ def process_docker(datasets):
'hcmi': ['hcmi'],
'beataml': ['beataml'],
'mpnst': ['mpnst'],
'mpnstpdx': ['mpnstpdx'],
'pancpdo': ['pancpdo'],
'bladderpdo': ['bladderpdo'],
'sarcpdo': ['sarcpdo'],
Expand Down Expand Up @@ -410,7 +409,7 @@ def get_latest_commit_hash(owner, repo, branch='main'):
# if args.figshare or args.validate:
# FigShare File Prefixes:

prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'mpnstpdx', 'pancpdo', 'bladderpdo','sarcpdo', 'genes', 'drugs', 'liverpdo']
prefixes = ['beataml', 'hcmi', 'cptac', 'pancpdo', 'bladderpdo','sarcpdo', 'genes', 'drugs', 'liverpdo','mpnst']
broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
if "broad_sanger" in datasets:
prefixes.extend(broad_sanger_datasets)
Expand Down
2 changes: 0 additions & 2 deletions build/build_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ def process_docker(dataset,validate):
'hcmi': ['hcmi'],
'beataml': ['beataml'],
'mpnst': ['mpnst'],
'mpnstpdx': ['mpnstpdx'],
'pancpdo': ['pancpdo'],
'cptac': ['cptac'],
'sarcpdo': ['sarcpdo'],
Expand Down Expand Up @@ -128,7 +127,6 @@ def process_omics(executor, dataset, should_continue):
'broad_sanger': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
'cptac': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
'hcmi': ['mutations', 'transcriptomics'],
'mpnstpdx':['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
'sarcpdo': ['mutations', 'transcriptomics'],
'pancpdo': ['transcriptomics'],
'bladderpdo': ['copy_number', 'mutations', 'transcriptomics'],
Expand Down
27 changes: 5 additions & 22 deletions build/mpnst/00_sample_gen.R
100755 → 100644
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
# This script generate a new sample table based on pervious beatAML improved sample ID
# It will take the maximum value of beatAML improved sample ID and continue from ID count from there
# This script generate a new sample table based on previous dataset's sample file (taking the max improve_sample_id)
# Load required libraries
library(data.table)
library(synapser)
Expand All @@ -11,14 +10,12 @@ if(length(args) > 1 ){
stop("Up to one argument is allowed. This is the filepath to the previously run samples file.")
}


if (length(args) == 0 || is.na(args[1]) || args[1] == "" || !file.exists(args[1])) {
orig_samples <- ""
} else {
orig_samples <- fread(args[1])
}


# Check if Synapse token is available from the environment
synapse_token <- Sys.getenv("SYNAPSE_AUTH_TOKEN")
if (synapse_token == "") {
Expand All @@ -29,6 +26,10 @@ synapser::synLogin(authToken=synapse_token)
manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|>
as.data.frame()

#Drop contaminated sample JH-2-009
manifest <- manifest %>%
filter(Sample != "JH-2-009")


###sample file has a strict schema
## - improve_sample_id
Expand Down Expand Up @@ -62,31 +63,13 @@ main<-rbind(sampTable,pdxmt)|>
dplyr::select(-MicroTissueDrugFolder)|>
rbind(tumorTable)

#main <- fread("mpnst/NF_MPNST_samples.csv")
#previous_aml <- fread(args[1])#"beatAML/beataml_samples.csv")

# If there is no previous samples file - start at 1, else, continue where the previous one left off.
if (identical(orig_samples, "")) {
max_id <- 1
} else {
max_id <- max(orig_samples$improve_sample_id, na.rm = TRUE)
}


main$improve_sample_id <- seq(from = max_id + 1, length.out = nrow(main))

#synapse_main <- fread("mpnst/synapse_NF-MPNST_samples.csv")
# Step 1: Create a dictionary from 'main'
#id_dict <- setNames(main$improve_sample_id, main$other_id)

# Step 2: Update 'ID' in 'synapse_main'
#synapse_main$ID <- id_dict[synapse_main$Sample]

# Handling NA values if any mismatch occurs (Optional based on your data integrity)
# If there are NAs generated, you might need to check for unmatched keys
# synapse_main$ID[is.na(synapse_main$ID)] <- -1 # Assign a placeholder like -1 for unmatched rows

# Step 3: Save the updated 'synapse_main'
#fwrite(synapse_main, "mpnst/synapse_NF-MPNST_samples.csv")
#fwrite(main, "mpnst/NF_MPNST_samples.csv") # updated sample file
fwrite(main,'/tmp/mpnst_samples.csv')
246 changes: 246 additions & 0 deletions build/mpnst/01_combined_omics.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
#!/usr/bin/env Rscript

# Combined MPNST & MPNST-PDX Data Extraction Script
# This script unifies data extraction for PDX, Tumor, and Xenograft-Derived Organoid samples.
#
# Usage: Rscript 01_combined_omics.R <PAT> <samples.csv> <genes.csv>
#   PAT         - Synapse personal access token used for authentication
#   samples.csv - previously generated samples file (improve_sample_id mapping)
#   genes.csv   - gene mapping file (gene_symbol / other_id -> entrez_id)
# Outputs (written to /tmp by the sections below): mpnst_proteomics.csv,
# mpnst_transcriptomics.csv, mpnst_mutations.csv, mpnst_copy_number.csv.

# Load required libraries
library(data.table)
library(synapser)
library(dplyr)
library(tidyr)

# Retrieve command line arguments
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 3) {
stop("Usage: Rscript 01_combined_omics.R <PAT> <samples.csv> <genes.csv>", call. = FALSE)
}
PAT <- args[1]
samples <- args[2]
genes <- args[3]

# Log in to Synapse
token <- PAT
synLogin(authToken = token)

# Read sample mapping and gene mapping
# Only the three columns needed to join manifest rows to improve sample IDs.
samples_df <- fread(samples) %>%
select(improve_sample_id, common_name, model_type) %>%
distinct()
genes_df <- fread(genes)

# Subset by model type
pdx_samps <- filter(samples_df, model_type == "patient derived xenograft")
tumor_samps<- filter(samples_df, model_type == "tumor")
mt_samps <- filter(samples_df, model_type == "xenograft derived organoid") # These end up being the same as pdx_samps in the manifest.

# Retrieve manifest table from Synapse
# syn53503360 is the MPNST data manifest; its "Sample" column is renamed to
# match the samples file's common_name so the joins below line up.
manifest <- synTableQuery("select * from syn53503360")$asDataFrame() %>%
rename(common_name = Sample)

# Build sample tables
# Each table maps improve_sample_id/common_name/model_type to the Synapse
# file IDs of its omics data; rows with no matching sample ID are dropped.
pdx_data <- manifest %>%
select(common_name, starts_with("PDX")) %>%
left_join(pdx_samps, by = "common_name") %>%
select(improve_sample_id, common_name, model_type,
RNASeq = PDX_RNASeq,
Mutations = PDX_Somatic_Mutations,
CopyNumber = PDX_CNV,
Proteomics = PDX_Proteomics) %>%
filter(!is.na(improve_sample_id))

# Tumor rows have no proteomics files in the manifest; an empty string
# placeholder keeps the column layout identical across the three tables.
tumor_data <- manifest %>%
select(common_name, starts_with("Tumor")) %>%
left_join(tumor_samps, by = "common_name") %>%
select(improve_sample_id, common_name, model_type,
RNASeq = Tumor_RNASeq,
Mutations = Tumor_Somatic_Mutations,
CopyNumber = Tumor_CNV) %>%
mutate(Proteomics = "") %>%
filter(!is.na(improve_sample_id))

mt_data <- manifest %>% #Note, this is the same as pdx_data but I think we default to "xenograft derived organoid" if present (based on original files)
select(common_name, starts_with("PDX")) %>%
left_join(mt_samps, by = "common_name") %>%
select(improve_sample_id, common_name, model_type,
RNASeq = PDX_RNASeq,
Mutations = PDX_Somatic_Mutations,
CopyNumber = PDX_CNV,
Proteomics = PDX_Proteomics) %>%
filter(!is.na(improve_sample_id))

# Combine all sample tables
# distinct() removes duplicate rows arising from mt_data sharing PDX file IDs.
dcombined <- bind_rows(pdx_data, tumor_data, mt_data) %>% distinct()
print("dcombined:")
print(dcombined)

# Helper to assign study label based on model_type.
# Known model types map to their study names; anything unrecognized
# (including NA) falls back to the generic "MPNST" label.
study_label <- function(type) {
  label_map <- c(
    "patient derived xenograft"  = "MPNST PDX",
    "tumor"                      = "MPNST Tumor",
    "xenograft derived organoid" = "MPNST PDX MT"
  )
  mapped <- unname(label_map[as.character(type)])
  ifelse(is.na(mapped), "MPNST", mapped)
}

# Helper to pick metadata based on sample ID and column.
# columns are {"Proteomics","RNASeq","Mutations","CopyNumber"}
# Searches tumor_data first, then mt_data, then pdx_data; the first table
# whose `column` contains `id` wins, and its first matching row supplies
# the improve_sample_id and model_type. Returns NULL when no table matches.
pick_meta <- function(id, column) {
  for (tbl in list(tumor_data, mt_data, pdx_data)) {
    hits <- which(tbl[[column]] == id)  # which() ignores NAs, like na.rm = TRUE
    if (length(hits) > 0) {
      first_row <- tbl[hits[1], ]
      return(list(
        sample_id  = first_row$improve_sample_id,
        model_type = first_row$model_type
      ))
    }
  }
  NULL
}

# Safe extraction: only return non-empty data frames.
# NULL (failed download/parse) and zero-row tables are skipped; otherwise
# every row is annotated with its sample ID, data source, and study label.
i_safe_extract <- function(df, sample_id, source_val, study_val) {
  if (is.null(df)) return(NULL)
  if (nrow(df) == 0) return(NULL)
  df[["improve_sample_id"]] <- sample_id
  df[["source"]] <- source_val
  df[["study"]] <- study_val
  df
}

# 1) Proteomics
# For each unique proteomics file ID (blank/NA placeholders excluded),
# download the file, map gene symbols to entrez IDs, and keep nonzero
# logRatio values. tryCatch returns NULL on any download/parse failure so
# a single bad file does not abort the run.
proteomics_list <- lapply(
setdiff(dcombined$Proteomics, c("", NA, "NA")),
function(id) {
meta <- pick_meta(id, "Proteomics")
if (is.null(meta)) return(NULL)

df <- tryCatch(
fread(synGet(id)$path) %>%
rename(gene_symbol = Gene) %>%
left_join(genes_df, by = "gene_symbol") %>%
select(entrez_id, proteomics = logRatio) %>%
filter(!is.na(entrez_id), proteomics != 0) %>%
distinct(),
error = function(e) NULL
)
i_safe_extract(
df,
meta$sample_id,
"NF Data Portal",
study_label(meta$model_type)
)
}
)
proteomics <- bind_rows(proteomics_list)
fwrite(proteomics, file.path("/tmp", "mpnst_proteomics.csv"))
message("Wrote combined proteomics")


# 2) Transcriptomics (PDX, Tumor, and Organoid / MT which comes from PDX..)
# Ensembl IDs in the Name column carry a version suffix ("ENSG....N");
# the version is split off and dropped before joining to genes_df.
# NOTE(review): left_join(genes_df) relies on implicit common-column join
# (presumably other_id) — confirm genes_df's column names.
transcriptomics_list <- lapply(
setdiff(dcombined$RNASeq, c("", NA, "NA")),
function(id) {
meta <- pick_meta(id, "RNASeq")
if (is.null(meta)) return(NULL)

df <- tryCatch({
fread(synGet(id)$path) %>%
separate(Name, into = c("other_id","vers"), sep = "\\.") %>%
select(-vers) %>%
left_join(genes_df) %>%
select(entrez_id, transcriptomics = TPM) %>%
filter(!is.na(entrez_id), transcriptomics != 0) %>%
distinct()
}, error = function(e) NULL)

i_safe_extract(
df,
meta$sample_id,
"NF Data Portal",
study_label(meta$model_type)
)
}
)
transcriptomics <- bind_rows(transcriptomics_list)
fwrite(transcriptomics, file.path("/tmp", "mpnst_transcriptomics.csv"))
message("Wrote combined transcriptomics")


# 3) Mutations (WES)
# Some manifest cells store the file ID wrapped in quotes/brackets
# (e.g. ["syn123"]); gsub strips those characters before synGet.
wes_list <- lapply(
setdiff(dcombined$Mutations, c("", NA, "NA")),
function(id) {
meta <- pick_meta(id, "Mutations")
if (is.null(meta)) return(NULL)

clean_id <- gsub('[\"\\[\\]]', '', id)
df <- tryCatch(
fread(synGet(clean_id)$path) %>%
select(entrez_id = Entrez_Gene_Id,
mutation = HGVSc,
variant_classification = Variant_Classification) %>%
filter(entrez_id %in% genes_df$entrez_id) %>%
distinct(),
error = function(e) NULL
)

i_safe_extract(
df,
meta$sample_id,
"NF Data Portal",
study_label(meta$model_type)
)
}
)
wes <- bind_rows(wes_list)
fwrite(wes, file.path("/tmp", "mpnst_mutations.csv"))
message("Wrote combined mutations")


# 4) Copy Number Variation (CNV)
# Segment rows list comma-separated gene symbols; separate_rows expands
# them to one gene per row before mapping to entrez IDs. copy_number is
# derived from the log2 ratio as 2^log2 (so diploid ~ 1).
cnv_list <- lapply(
setdiff(dcombined$CopyNumber, c("", NA, "NA")),
function(id) {
meta <- pick_meta(id, "CopyNumber")
if (is.null(meta)) return(NULL)

clean_id <- gsub('[\"\\[\\]]', '', id)
raw <- tryCatch(fread(synGet(clean_id)$path), error = function(e) NULL)
if (is.null(raw)) return(NULL)

df_long <- raw %>%
separate_rows(gene, sep = ",") %>%
rename(gene_symbol = gene) %>%
left_join(genes_df, by = "gene_symbol") %>%
filter(!is.na(entrez_id)) %>%
select(entrez_id, log2) %>%
distinct() %>%
mutate(copy_number = 2^log2) %>%
select(-log2)

# Discretize copy_number into standard copy-call categories.
# NOTE(review): these cutoffs look empirically derived for this dataset —
# confirm their provenance before reusing elsewhere.
df <- df_long %>%
mutate(copy_call = case_when(
copy_number < 0.5210507 ~ "deep del",
copy_number < 0.7311832 ~ "het loss",
copy_number < 1.214125 ~ "diploid",
copy_number < 1.422233 ~ "gain",
TRUE ~ "amp"
))

i_safe_extract(
df,
meta$sample_id,
"NF Data Portal",
study_label(meta$model_type)
)
}
)
cnv <- bind_rows(cnv_list)
fwrite(cnv, file.path("/tmp", "mpnst_copy_number.csv"))
message("Wrote combined copy number")


message("All combined data files created.")
Loading
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy