Skip to content

Combine mpnst and mpnstpdx Datasets #425

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from
Jul 25, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
6d1bf3e
added samples and omics files
alexandriai168 Apr 24, 2025
d74ac3c
Merge remote-tracking branch 'origin' into novartisPDX
alexandriai168 May 1, 2025
c35ec76
created get omics data function
alexandriai168 May 1, 2025
3798bfb
add novartispdx sample file
May 8, 2025
5bd7262
added get_copy_call.py to utils
alexandriai168 May 9, 2025
8cd287c
Merge branch 'novartisPDX-samples' of https://github.com/PNNL-CompBio…
alexandriai168 May 9, 2025
9b664bf
added copy number function
alexandriai168 May 13, 2025
7c37952
added transcriptomics function
alexandriai168 May 13, 2025
08c2f2f
Addition of drugs and experiments, alterations to calc_pdx_metrics
Jul 1, 2025
9d36112
progress on experiments data
Jul 14, 2025
a9164ca
final script for novartispdx and some changes to calc_pdx_metrics.py
Jul 16, 2025
481385e
Datasets Merged, Build works, Validate works
jjacobson95 Jul 21, 2025
3cf23f8
Merge remote-tracking branch 'refs/remotes/origin/main'
jjacobson95 Jul 21, 2025
b3824fb
Drop JH-2-009 Sample due to contamination
jjacobson95 Jul 22, 2025
65e2f01
Should be ready
jjacobson95 Jul 22, 2025
19adfc5
Made final changes so this is ready for build process
jjacobson95 Jul 22, 2025
1cecc29
Merge branch 'novartisPDX-drugs-experiments' into mpnst_dataset_join
jjacobson95 Jul 23, 2025
e811969
Made update in response to merge from novartis branch
jjacobson95 Jul 24, 2025
91c9079
Import calc_pdx_metrics.py from novartisPDX-drugs-experiments
jjacobson95 Jul 24, 2025
729db17
Merge remote-tracking branch 'refs/remotes/origin/mpnst_dataset_join'…
jjacobson95 Jul 24, 2025
e5f7f3a
Revert "Merge branch 'novartisPDX-drugs-experiments' into mpnst_datas…
jjacobson95 Jul 24, 2025
b958e83
Revert "Made update in response to merge from novartis branch"
jjacobson95 Jul 24, 2025
fca76bd
Fixed git issue. Adding pdx update from novartis and local change
jjacobson95 Jul 24, 2025
9533703
All working now
jjacobson95 Jul 24, 2025
8c2f4c0
removed cptac
jjacobson95 Jul 25, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions build/build_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def main():
parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.")
parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands")
parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.")
parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo,bladderpdo,sarcpdo,liverpdo',help='Datasets to process. Defaults to all available.')
parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,pancpdo,bladderpdo,sarcpdo,liverpdo,mpnst',help='Datasets to process. Defaults to all available.')
parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. This must be a higher version than previously published versions.')
parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.')
parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.')
Expand Down Expand Up @@ -119,7 +119,6 @@ def process_docker(datasets):
'hcmi': ['hcmi'],
'beataml': ['beataml'],
'mpnst': ['mpnst'],
'mpnstpdx': ['mpnstpdx'],
'pancpdo': ['pancpdo'],
'bladderpdo': ['bladderpdo'],
'sarcpdo': ['sarcpdo'],
Expand Down Expand Up @@ -410,7 +409,7 @@ def get_latest_commit_hash(owner, repo, branch='main'):
# if args.figshare or args.validate:
# FigShare File Prefixes:

prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'mpnstpdx', 'pancpdo', 'bladderpdo','sarcpdo', 'genes', 'drugs', 'liverpdo']
prefixes = ['beataml', 'hcmi', 'cptac', 'pancpdo', 'bladderpdo','sarcpdo', 'genes', 'drugs', 'liverpdo','mpnst']
broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
if "broad_sanger" in datasets:
prefixes.extend(broad_sanger_datasets)
Expand Down
2 changes: 0 additions & 2 deletions build/build_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ def process_docker(dataset,validate):
'hcmi': ['hcmi'],
'beataml': ['beataml'],
'mpnst': ['mpnst'],
'mpnstpdx': ['mpnstpdx'],
'pancpdo': ['pancpdo'],
'cptac': ['cptac'],
'sarcpdo': ['sarcpdo'],
Expand Down Expand Up @@ -128,7 +127,6 @@ def process_omics(executor, dataset, should_continue):
'broad_sanger': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
'cptac': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
'hcmi': ['mutations', 'transcriptomics'],
'mpnstpdx':['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
'sarcpdo': ['mutations', 'transcriptomics'],
'pancpdo': ['transcriptomics'],
'bladderpdo': ['copy_number', 'mutations', 'transcriptomics'],
Expand Down
27 changes: 5 additions & 22 deletions build/mpnst/00_sample_gen.R
100755 → 100644
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
# This script generate a new sample table based on pervious beatAML improved sample ID
# It will take the maximum value of beatAML improved sample ID and continue from ID count from there
# This script generate a new sample table based on previous dataset's sample file (taking the max improve_sample_id)
# Load required libraries
library(data.table)
library(synapser)
Expand All @@ -11,14 +10,12 @@ if(length(args) > 1 ){
stop("Up to one argument is allowed. This is the filepath to the previously run samples file.")
}


if (length(args) == 0 || is.na(args[1]) || args[1] == "" || !file.exists(args[1])) {
orig_samples <- ""
} else {
orig_samples <- fread(args[1])
}


# Check if Synapse token is available from the environment
synapse_token <- Sys.getenv("SYNAPSE_AUTH_TOKEN")
if (synapse_token == "") {
Expand All @@ -29,6 +26,10 @@ synapser::synLogin(authToken=synapse_token)
manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|>
as.data.frame()

#Drop contaminated sample JH-2-009
manifest <- manifest %>%
filter(Sample != "JH-2-009")


###sample file has a strict schema
## - improve_sample_id
Expand Down Expand Up @@ -62,31 +63,13 @@ main<-rbind(sampTable,pdxmt)|>
dplyr::select(-MicroTissueDrugFolder)|>
rbind(tumorTable)

#main <- fread("mpnst/NF_MPNST_samples.csv")
#previous_aml <- fread(args[1])#"beatAML/beataml_samples.csv")

# If there is no previous samples file - start at 1, else, continue where the previous one left off.
if (identical(orig_samples, "")) {
max_id <- 1
} else {
max_id <- max(orig_samples$improve_sample_id, na.rm = TRUE)
}


main$improve_sample_id <- seq(from = max_id + 1, length.out = nrow(main))

#synapse_main <- fread("mpnst/synapse_NF-MPNST_samples.csv")
# Step 1: Create a dictionary from 'main'
#id_dict <- setNames(main$improve_sample_id, main$other_id)

# Step 2: Update 'ID' in 'synapse_main'
#synapse_main$ID <- id_dict[synapse_main$Sample]

# Handling NA values if any mismatch occurs (Optional based on your data integrity)
# If there are NAs generated, you might need to check for unmatched keys
# synapse_main$ID[is.na(synapse_main$ID)] <- -1 # Assign a placeholder like -1 for unmatched rows

# Step 3: Save the updated 'synapse_main'
#fwrite(synapse_main, "mpnst/synapse_NF-MPNST_samples.csv")
#fwrite(main, "mpnst/NF_MPNST_samples.csv") # updated sample file
fwrite(main,'/tmp/mpnst_samples.csv')
246 changes: 246 additions & 0 deletions build/mpnst/01_combined_omics.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
#!/usr/bin/env Rscript

# Combined MPNST & MPNST-PDX Data Extraction Script
# This script unifies data extraction for PDX, Tumor, and Xenograft-Derived Organoid samples.
#
# Usage: Rscript 01_combined_omics.R <PAT> <samples.csv> <genes.csv>
#   PAT         - Synapse personal access token used for authentication
#   samples.csv - previously generated samples file (improve_sample_id mapping)
#   genes.csv   - gene mapping file (gene_symbol / other_id -> entrez_id)
# Outputs (written to /tmp by the sections below): mpnst_proteomics.csv,
# mpnst_transcriptomics.csv, mpnst_mutations.csv, mpnst_copy_number.csv.

# Load required libraries
library(data.table)
library(synapser)
library(dplyr)
library(tidyr)

# Retrieve command line arguments
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 3) {
stop("Usage: Rscript 01_combined_omics.R <PAT> <samples.csv> <genes.csv>", call. = FALSE)
}
PAT <- args[1]
samples <- args[2]
genes <- args[3]

# Log in to Synapse
token <- PAT
synLogin(authToken = token)

# Read sample mapping and gene mapping
# Only the three columns needed to join manifest rows to improve sample IDs.
samples_df <- fread(samples) %>%
select(improve_sample_id, common_name, model_type) %>%
distinct()
genes_df <- fread(genes)

# Subset by model type
pdx_samps <- filter(samples_df, model_type == "patient derived xenograft")
tumor_samps<- filter(samples_df, model_type == "tumor")
mt_samps <- filter(samples_df, model_type == "xenograft derived organoid") # These end up being the same as pdx_samps in the manifest.

# Retrieve manifest table from Synapse
# syn53503360 is the MPNST data manifest; its "Sample" column is renamed to
# match the samples file's common_name so the joins below line up.
manifest <- synTableQuery("select * from syn53503360")$asDataFrame() %>%
rename(common_name = Sample)

# Build sample tables
# Each table maps improve_sample_id/common_name/model_type to the Synapse
# file IDs of its omics data; rows with no matching sample ID are dropped.
pdx_data <- manifest %>%
select(common_name, starts_with("PDX")) %>%
left_join(pdx_samps, by = "common_name") %>%
select(improve_sample_id, common_name, model_type,
RNASeq = PDX_RNASeq,
Mutations = PDX_Somatic_Mutations,
CopyNumber = PDX_CNV,
Proteomics = PDX_Proteomics) %>%
filter(!is.na(improve_sample_id))

# Tumor rows have no proteomics files in the manifest; an empty string
# placeholder keeps the column layout identical across the three tables.
tumor_data <- manifest %>%
select(common_name, starts_with("Tumor")) %>%
left_join(tumor_samps, by = "common_name") %>%
select(improve_sample_id, common_name, model_type,
RNASeq = Tumor_RNASeq,
Mutations = Tumor_Somatic_Mutations,
CopyNumber = Tumor_CNV) %>%
mutate(Proteomics = "") %>%
filter(!is.na(improve_sample_id))

mt_data <- manifest %>% #Note, this is the same as pdx_data but I think we default to "xenograft derived organoid" if present (based on original files)
select(common_name, starts_with("PDX")) %>%
left_join(mt_samps, by = "common_name") %>%
select(improve_sample_id, common_name, model_type,
RNASeq = PDX_RNASeq,
Mutations = PDX_Somatic_Mutations,
CopyNumber = PDX_CNV,
Proteomics = PDX_Proteomics) %>%
filter(!is.na(improve_sample_id))

# Combine all sample tables
# distinct() removes duplicate rows arising from mt_data sharing PDX file IDs.
dcombined <- bind_rows(pdx_data, tumor_data, mt_data) %>% distinct()
print("dcombined:")
print(dcombined)

# Helper to assign study label based on model_type.
# Known model types map to their study names; anything unrecognized
# (including NA) falls back to the generic "MPNST" label.
study_label <- function(type) {
  label_map <- c(
    "patient derived xenograft"  = "MPNST PDX",
    "tumor"                      = "MPNST Tumor",
    "xenograft derived organoid" = "MPNST PDX MT"
  )
  mapped <- unname(label_map[as.character(type)])
  ifelse(is.na(mapped), "MPNST", mapped)
}

# Helper to pick metadata based on sample ID and column.
# columns are {"Proteomics","RNASeq","Mutations","CopyNumber"}
# Searches tumor_data first, then mt_data, then pdx_data; the first table
# whose `column` contains `id` wins, and its first matching row supplies
# the improve_sample_id and model_type. Returns NULL when no table matches.
pick_meta <- function(id, column) {
  for (tbl in list(tumor_data, mt_data, pdx_data)) {
    hits <- which(tbl[[column]] == id)  # which() ignores NAs, like na.rm = TRUE
    if (length(hits) > 0) {
      first_row <- tbl[hits[1], ]
      return(list(
        sample_id  = first_row$improve_sample_id,
        model_type = first_row$model_type
      ))
    }
  }
  NULL
}

# Safe extraction: only return non-empty data frames.
# NULL (failed download/parse) and zero-row tables are skipped; otherwise
# every row is annotated with its sample ID, data source, and study label.
i_safe_extract <- function(df, sample_id, source_val, study_val) {
  if (is.null(df)) return(NULL)
  if (nrow(df) == 0) return(NULL)
  df[["improve_sample_id"]] <- sample_id
  df[["source"]] <- source_val
  df[["study"]] <- study_val
  df
}

# 1) Proteomics
# For each unique proteomics file ID (blank/NA placeholders excluded),
# download the file, map gene symbols to entrez IDs, and keep nonzero
# logRatio values. tryCatch returns NULL on any download/parse failure so
# a single bad file does not abort the run.
proteomics_list <- lapply(
setdiff(dcombined$Proteomics, c("", NA, "NA")),
function(id) {
meta <- pick_meta(id, "Proteomics")
if (is.null(meta)) return(NULL)

df <- tryCatch(
fread(synGet(id)$path) %>%
rename(gene_symbol = Gene) %>%
left_join(genes_df, by = "gene_symbol") %>%
select(entrez_id, proteomics = logRatio) %>%
filter(!is.na(entrez_id), proteomics != 0) %>%
distinct(),
error = function(e) NULL
)
i_safe_extract(
df,
meta$sample_id,
"NF Data Portal",
study_label(meta$model_type)
)
}
)
proteomics <- bind_rows(proteomics_list)
fwrite(proteomics, file.path("/tmp", "mpnst_proteomics.csv"))
message("Wrote combined proteomics")


# 2) Transcriptomics (PDX, Tumor, and Organoid / MT which comes from PDX..)
# Ensembl IDs in the Name column carry a version suffix ("ENSG....N");
# the version is split off and dropped before joining to genes_df.
# NOTE(review): left_join(genes_df) relies on implicit common-column join
# (presumably other_id) — confirm genes_df's column names.
transcriptomics_list <- lapply(
setdiff(dcombined$RNASeq, c("", NA, "NA")),
function(id) {
meta <- pick_meta(id, "RNASeq")
if (is.null(meta)) return(NULL)

df <- tryCatch({
fread(synGet(id)$path) %>%
separate(Name, into = c("other_id","vers"), sep = "\\.") %>%
select(-vers) %>%
left_join(genes_df) %>%
select(entrez_id, transcriptomics = TPM) %>%
filter(!is.na(entrez_id), transcriptomics != 0) %>%
distinct()
}, error = function(e) NULL)

i_safe_extract(
df,
meta$sample_id,
"NF Data Portal",
study_label(meta$model_type)
)
}
)
transcriptomics <- bind_rows(transcriptomics_list)
fwrite(transcriptomics, file.path("/tmp", "mpnst_transcriptomics.csv"))
message("Wrote combined transcriptomics")


# 3) Mutations (WES)
# Some manifest cells store the file ID wrapped in quotes/brackets
# (e.g. ["syn123"]); gsub strips those characters before synGet.
wes_list <- lapply(
setdiff(dcombined$Mutations, c("", NA, "NA")),
function(id) {
meta <- pick_meta(id, "Mutations")
if (is.null(meta)) return(NULL)

clean_id <- gsub('[\"\\[\\]]', '', id)
df <- tryCatch(
fread(synGet(clean_id)$path) %>%
select(entrez_id = Entrez_Gene_Id,
mutation = HGVSc,
variant_classification = Variant_Classification) %>%
filter(entrez_id %in% genes_df$entrez_id) %>%
distinct(),
error = function(e) NULL
)

i_safe_extract(
df,
meta$sample_id,
"NF Data Portal",
study_label(meta$model_type)
)
}
)
wes <- bind_rows(wes_list)
fwrite(wes, file.path("/tmp", "mpnst_mutations.csv"))
message("Wrote combined mutations")


# 4) Copy Number Variation (CNV)
# Segment rows list comma-separated gene symbols; separate_rows expands
# them to one gene per row before mapping to entrez IDs. copy_number is
# derived from the log2 ratio as 2^log2 (so diploid ~ 1).
cnv_list <- lapply(
setdiff(dcombined$CopyNumber, c("", NA, "NA")),
function(id) {
meta <- pick_meta(id, "CopyNumber")
if (is.null(meta)) return(NULL)

clean_id <- gsub('[\"\\[\\]]', '', id)
raw <- tryCatch(fread(synGet(clean_id)$path), error = function(e) NULL)
if (is.null(raw)) return(NULL)

df_long <- raw %>%
separate_rows(gene, sep = ",") %>%
rename(gene_symbol = gene) %>%
left_join(genes_df, by = "gene_symbol") %>%
filter(!is.na(entrez_id)) %>%
select(entrez_id, log2) %>%
distinct() %>%
mutate(copy_number = 2^log2) %>%
select(-log2)

# Discretize copy_number into standard copy-call categories.
# NOTE(review): these cutoffs look empirically derived for this dataset —
# confirm their provenance before reusing elsewhere.
df <- df_long %>%
mutate(copy_call = case_when(
copy_number < 0.5210507 ~ "deep del",
copy_number < 0.7311832 ~ "het loss",
copy_number < 1.214125 ~ "diploid",
copy_number < 1.422233 ~ "gain",
TRUE ~ "amp"
))

i_safe_extract(
df,
meta$sample_id,
"NF Data Portal",
study_label(meta$model_type)
)
}
)
cnv <- bind_rows(cnv_list)
fwrite(cnv, file.path("/tmp", "mpnst_copy_number.csv"))
message("Wrote combined copy number")


message("All combined data files created.")
Loading
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy