From 6d1bf3e388d51ca1f5af184e99c046ef2ac8f453 Mon Sep 17 00:00:00 2001 From: alexandriai168 Date: Thu, 24 Apr 2025 10:38:52 -0700 Subject: [PATCH 01/20] added samples and omics files --- build/novartispdx/01-samples-novartispdx.py | 0 build/novartispdx/02-omics-novartispdx.py | 6 ++++++ build/novartispdx/build_omics.sh | 0 build/novartispdx/build_samples.sh | 0 4 files changed, 6 insertions(+) create mode 100644 build/novartispdx/01-samples-novartispdx.py create mode 100644 build/novartispdx/02-omics-novartispdx.py create mode 100644 build/novartispdx/build_omics.sh create mode 100644 build/novartispdx/build_samples.sh diff --git a/build/novartispdx/01-samples-novartispdx.py b/build/novartispdx/01-samples-novartispdx.py new file mode 100644 index 00000000..e69de29b diff --git a/build/novartispdx/02-omics-novartispdx.py b/build/novartispdx/02-omics-novartispdx.py new file mode 100644 index 00000000..074775cb --- /dev/null +++ b/build/novartispdx/02-omics-novartispdx.py @@ -0,0 +1,6 @@ +import pandas as pd +import numpy as np +import os +import math +import argparse + diff --git a/build/novartispdx/build_omics.sh b/build/novartispdx/build_omics.sh new file mode 100644 index 00000000..e69de29b diff --git a/build/novartispdx/build_samples.sh b/build/novartispdx/build_samples.sh new file mode 100644 index 00000000..e69de29b From c35ec76178e8bf2e5eb50dc1a1ab0064ff040f0b Mon Sep 17 00:00:00 2001 From: alexandriai168 Date: Thu, 1 May 2025 15:56:03 -0700 Subject: [PATCH 02/20] created get omics data function --- build/novartispdx/02-omics-novartispdx.py | 44 +++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/build/novartispdx/02-omics-novartispdx.py b/build/novartispdx/02-omics-novartispdx.py index 074775cb..f6ab0fec 100644 --- a/build/novartispdx/02-omics-novartispdx.py +++ b/build/novartispdx/02-omics-novartispdx.py @@ -4,3 +4,47 @@ import math import argparse +def download_parse_omics_novPDX(synID:str , save_path:str = None, synToken:str = None): + """ 
+ Download omics data from Synapse at synapseID syn66364488. Requires a synapse token, which requires you to make a Synapse account + and create a Personal Access Token. More information here: https://help.synapse.org/docs/Managing-Your-Account.2055405596.html#ManagingYourAccount-PersonalAccessTokens + Omics data is an excel file. The excel file is then parsed for the RNAseq, copy number, and mutations data. + + Parameters + ---------- + synID : string + SynapseID of dataset to download. Default is synapseID of the sequencing dataset. + + save_path : string + Local path where the downloaded file will be saved. + + synToken : string + Synapse Personal Access Token of user. Requires a Synapse account. More information at: https://help.synapse.org/docs/Managing-Your-Account.2055405596.html#ManagingYourAccount-PersonalAccessTokens + + Returns + ------- + mutations_data : pd.DataFrame + A DataFrame containing mutations data. + + copy_number_data : pd.DataFrame + A DataFrame containing copy number data. + + rnaseq_data : pd.DataFrame + A DataFrame containing RNAseq data. 
+ """ + + syn = synapseclient.Synapse() + syn.login(authToken=synToken) + + # Obtain a pointer and download the data + syn66364488 = syn.get(entity=synID, downloadLocation = save_path) + + # Get the path to the local copy of the data file + sequencing_filepath = syn66364488.path + all_omics_excel = pd.ExcelFile(open(sequencing_filepath, 'rb')) + mutations_data = pd.read_excel(all_omics_excel, 'pdxe_mut_and_cn2') # table with somatic mutation information + copy_number_data = pd.read_excel(all_omics_excel, 'copy number') # table with copy number information + rnaseq_data = pd.read_excel(all_omics_excel, 'RNAseq_fpkm') + + + return(rnaseq_data, copy_number_data, mutations_data) \ No newline at end of file From 3798bfb4fb4caa8256a3cf6ced96074f986e9e8d Mon Sep 17 00:00:00 2001 From: RubyFore Date: Thu, 8 May 2025 14:57:09 -0700 Subject: [PATCH 03/20] add novartispdx sample file Sample file for 386 solid tumor PDX from the Novartis project. --- build/novartispdx/01-samples-novartispdx.py | 59 +++++++++++++++++++++ build/novartispdx/build_samples.sh | 7 +++ 2 files changed, 66 insertions(+) mode change 100644 => 100755 build/novartispdx/build_samples.sh diff --git a/build/novartispdx/01-samples-novartispdx.py b/build/novartispdx/01-samples-novartispdx.py index e69de29b..e5d431b1 100644 --- a/build/novartispdx/01-samples-novartispdx.py +++ b/build/novartispdx/01-samples-novartispdx.py @@ -0,0 +1,59 @@ +import pandas as pd +import synapseclient +import numpy as np +import argparse +import os + +def get_complete_novartispdx_sample_sheet(synObject): + + files = list(synObject.getChildren(parent='syn66275995', includeTypes=['file'])) + + synIDs = [item['id'] for item in files] + # leave off synIDs for drug info + synIDs.remove('syn66276102') + synIDs.remove('syn66276098') + synIDs.remove("syn66477971") + # create empty dataframe + allsamplesheet = pd.DataFrame() + # iterate through IDs and concatenate + for id in synIDs: + curr = synObject.get(id) + currdf = 
pd.read_csv(curr.path) + allsamplesheet = pd.concat([allsamplesheet, currdf], ignore_index=True) + # rename columns and reformat cancer type from CANCER_HISTOLOGY column + allsamplesheet['other_id'] = allsamplesheet['Sample ID'] + allsamplesheet['common_name'] = allsamplesheet['MODEL_ORIGINATOR_ID'] + allsamplesheet['cancer_type'] = allsamplesheet['CANCER_HISTOLOGY'].str.lower().str.split(pat="^[^\s]*\s", expand=True)[1] + allsamplesheet['species'] = "Homo Sapiens(human)" + allsamplesheet['model_type'] = 'patient derived xenograft' + allsamplesheet['other_id_source'] = 'Synapse' + allsamplesheet['other_names'] = '' + finalsamplesheet = allsamplesheet[['other_id', 'common_name', 'other_id_source', 'other_names', 'cancer_type', 'species', 'model_type']] + return finalsamplesheet + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of sample files for the Novartis PDX data into a single samplesheet") + + parser.add_argument('-t', '--token', type=str, help='Synapse Token') + + parser.add_argument("-p", '--prevSamples', nargs="?", type=str, default ="", const = "", help = "Use this to provide previous sample file, will run sample file generation") + + args = parser.parse_args() + + print("Logging into Synapse") + PAT = args.token + synObject = synapseclient.login(authToken=PAT) + + samplesheet = get_complete_novartispdx_sample_sheet(synObject) + + if (args.prevSamples): + prev_max_improve_id = max(pd.read_csv(args.prevSamples).improve_sample_id) + else: + prev_max_improve_id = 0 + + samplesheet['improve_sample_id'] = range(prev_max_improve_id+1, prev_max_improve_id+samplesheet.shape[0]+1) + + samplesheet.to_csv('/tmp/novartispdx_samples.csv', index=False) + + \ No newline at end of file diff --git a/build/novartispdx/build_samples.sh b/build/novartispdx/build_samples.sh old mode 100644 new mode 100755 index e69de29b..562f74a8 --- a/build/novartispdx/build_samples.sh +++ 
b/build/novartispdx/build_samples.sh @@ -0,0 +1,7 @@ +#!/bin/bash +set -euo pipefail + +trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR + +echo "Running 01-samples-novartispdx.py with token and previous sample file $1" +python3 01-samples-novartispdx.py --token $SYNAPSE_AUTH_TOKEN -p $1 \ No newline at end of file From 5bd72626bffd838be3acc72890ed2eeb2a8610bf Mon Sep 17 00:00:00 2001 From: alexandriai168 Date: Fri, 9 May 2025 14:31:42 -0700 Subject: [PATCH 04/20] added get_copy_call.py to utils --- build/utils/get_copy_call.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 build/utils/get_copy_call.py diff --git a/build/utils/get_copy_call.py b/build/utils/get_copy_call.py new file mode 100644 index 00000000..f2b3864c --- /dev/null +++ b/build/utils/get_copy_call.py @@ -0,0 +1,22 @@ +def get_copy_call(a): + """ + Heler Function - Determine copy call for a value. + """ + + if a is None: + return float('nan') + + if math.isnan(a): + return float('nan') + + a_val = math.log2(float(a)+0.000001) + if a_val < 0.5210507: + return 'deep del' + elif a_val < 0.7311832: + return 'het loss' + elif a_val < 1.214125: + return 'diploid' + elif a_val < 1.422233: + return 'gain' + else: + return 'amp' From 9b664bf8fc839bc910e7e08ddf5c034f2a2610e7 Mon Sep 17 00:00:00 2001 From: alexandriai168 Date: Tue, 13 May 2025 15:11:51 -0700 Subject: [PATCH 05/20] added copy number function --- build/novartispdx/02-omics-novartispdx.py | 57 ++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/build/novartispdx/02-omics-novartispdx.py b/build/novartispdx/02-omics-novartispdx.py index f6ab0fec..2052e0e6 100644 --- a/build/novartispdx/02-omics-novartispdx.py +++ b/build/novartispdx/02-omics-novartispdx.py @@ -47,4 +47,59 @@ def download_parse_omics_novPDX(synID:str , save_path:str = None, synToken:str = rnaseq_data = pd.read_excel(all_omics_excel, 'RNAseq_fpkm') - return(rnaseq_data, 
copy_number_data, mutations_data) \ No newline at end of file + return(rnaseq_data, copy_number_data, mutations_data) + + +def map_copy_number_novPDX(copy_number_data, improve_id_data, entrez_data): + """ + Maps copy number data to improved sample id's and entrez gene data. Also does some data formatting. + + Parameters + ---------- + copy_number_data : pd.Dataframe OR string + Pandas dataframe object with copy number data OR path to csv with copy number data + + improve_id_data : pd.Dataframe OR string + Pandas dataframe object with improve id data OR path to csv with improve id data. This is one of the outputs of parse_mmc2() + + entrez_data : pd.Dataframe OR string + Pandas dataframe object with entrez gene data OR path to csv with entrez gene data. Use this code to get this file: https://github.com/PNNL-CompBio/coderdata/tree/e65634b99d060136190ec5fba0b7798f8d140dfb/build/genes + + Returns + ------- + sample_entrez_cn_df : pd.DataFrame + A DataFrame containing the mapped copy number data with columns: entrez_id, copy_number, copy_call, study, source ,improve_sample_id + + """ + # read in data + if isinstance(copy_number_data, pd.DataFrame) == False: + copy_number_data = pd.read_csv(copy_number_data) + + if isinstance(improve_id_data, pd.DataFrame) == False: + improve_id_data = pd.read_csv(improve_id_data) + + if isinstance(entrez_data, pd.DataFrame) == False: + entrez_data = pd.read_csv(entrez_data) + + # melt dataframe so that there is gene name and improve_sample_id per row + long_cn_df = pd.melt(copy_number_data, id_vars=['Sample'], value_vars=copy_number_data.columns[copy_number_data.columns != 'Sample']) + + # get entrez id's from Sample + entrez_cn_df = pd.merge(long_cn_df, entrez_data[['other_id','entrez_id']].drop_duplicates(), how = 'left', left_on= "Sample", right_on= "other_id") + + # get copy call from value column (aka copy number) + entrez_cn_df['copy_call'] = [get_copy_call(a) for a in entrez_cn_df['value']] + + # get improve sample id + 
improve_id_data['to_merge'] = improve_id_data['common_name'].str.replace("NIBR","") + sample_entrez_cn_df = pd.merge(entrez_cn_df.drop_duplicates(), improve_id_data[['to_merge','improve_sample_id']].drop_duplicates(), how = 'left', left_on= "variable", right_on= "to_merge") + + # clean up columns and data types + sample_entrez_cn_df = sample_entrez_cn_df.drop(columns=['Sample','variable','other_id','to_merge']) + sample_entrez_cn_df['source'] = "CPDM" + sample_entrez_cn_df['study'] = "novartispdx" + sample_entrez_cn_df = sample_entrez_cn_df.rename(columns={'value':'copy_number'}) + sample_entrez_cn_df = sample_entrez_cn_df.astype({'entrez_id':'int','improve_sample_id':'int'}) + sample_entrez_cn_df = sample_entrez_cn_df[['entrez_id','copy_number','copy_call','study','source','improve_sample_id']] + + return(sample_entrez_cn_df) \ No newline at end of file From 7c3795210bb3cc5f7e7587aab9c6def8bc1b848c Mon Sep 17 00:00:00 2001 From: alexandriai168 Date: Tue, 13 May 2025 16:18:36 -0700 Subject: [PATCH 06/20] added transcriptomics function --- build/novartispdx/02-omics-novartispdx.py | 67 ++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/build/novartispdx/02-omics-novartispdx.py b/build/novartispdx/02-omics-novartispdx.py index 2052e0e6..69dfb134 100644 --- a/build/novartispdx/02-omics-novartispdx.py +++ b/build/novartispdx/02-omics-novartispdx.py @@ -101,5 +101,70 @@ def map_copy_number_novPDX(copy_number_data, improve_id_data, entrez_data): sample_entrez_cn_df = sample_entrez_cn_df.rename(columns={'value':'copy_number'}) sample_entrez_cn_df = sample_entrez_cn_df.astype({'entrez_id':'int','improve_sample_id':'int'}) sample_entrez_cn_df = sample_entrez_cn_df[['entrez_id','copy_number','copy_call','study','source','improve_sample_id']] + sample_entrez_cn_df = sample_entrez_cn_df.drop_duplicates() + + + return(sample_entrez_cn_df) + + +def map_transcriptomics_novPDX(transcriptomics_data, improve_id_data, entrez_data): + """ + Maps 
transcriptomics data to improved sample id's and entrez gene data. Also does some data formatting. - return(sample_entrez_cn_df) \ No newline at end of file + Parameters + ---------- + copy_number_data : pd.Dataframe OR string + Pandas dataframe object with transcriptomics data OR path to csv with transcriptomics data + + improve_id_data : pd.Dataframe OR string + Pandas dataframe object with improve id data OR path to csv with improve id data. This is one of the outputs of parse_mmc2() + + entrez_data : pd.Dataframe OR string + Pandas dataframe object with entrez gene data OR path to csv with entrez gene data. Use this code to get this file: https://github.com/PNNL-CompBio/coderdata/tree/e65634b99d060136190ec5fba0b7798f8d140dfb/build/genes + + Returns + ------- + sample_entrez_cn_df : pd.DataFrame + A DataFrame containing the mapped transcriptomics data with columns: entrez_id, copy_number, copy_call, study, source ,improve_sample_id + + """ + # read in data + if isinstance(transcriptomics_data, pd.DataFrame) == False: + transcriptomics_data = pd.read_csv(transcriptomics_data) + + if isinstance(improve_id_data, pd.DataFrame) == False: + improve_id_data = pd.read_csv(improve_id_data) + + if isinstance(entrez_data, pd.DataFrame) == False: + entrez_data = pd.read_csv(entrez_data) + + # melt dataframe so that there is gene name and improve_sample_id per row + rnaseq_df = rnaseq_df.rename(columns={'Sample':'stable_id'}) + rnaseq_df.to_csv("/tmp/counts_for_tpm_conversion.tsv", sep='\t') + + # run tpmFromCounts.py to convert counts to tpm + os.system("python3 tpmFromCounts.py --counts /tmp/counts_for_tpm_conversion.tsv --genome_build https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.13_GRCh37/GCF_000001405.13_GRCh37_genomic.gtf.gz --gene_col stable_id --exclude_col stable_id --out_file /tmp/transcriptomics_tpm.tsv") + + # read in amd melt dataframe so that there is an entrez and sample id per row + tpm_transciptomics_data = 
pd.read_csv("/tmp/transcriptomics_tpm.tsv", sep="\t") + long_rnaseq = pd.melt(tpm_transciptomics_data, id_vars=['stable_id'], value_vars=tpm_transciptomics_data.columns[tpm_transciptomics_data.columns != 'stable_id']) + + # merge entrez id's + entrez_transcriptomics_df = pd.merge(long_rnaseq.drop_duplicates(), entrez_data[['other_id','entrez_id']].drop_duplicates(), how = 'inner', left_on= "stable_id", right_on= "other_id") + + # get improve sample id + improve_id_data['to_merge'] = improve_id_data['common_name'].str.replace("NIBR","") + sample_entrez_transcriptomics_df = pd.merge(entrez_transcriptomics_df.drop_duplicates(), improve_id_data[['to_merge','improve_sample_id']].drop_duplicates(), how = 'inner', left_on= "variable", right_on= "to_merge") + + # clean up columns and data types + sample_entrez_transcriptomics_df = sample_entrez_transcriptomics_df.drop(columns=['stable_id','variable','other_id','to_merge']) + sample_entrez_transcriptomics_df['source'] = "CPDM" + sample_entrez_transcriptomics_df['study'] = "novartispdx" + sample_entrez_transcriptomics_df = sample_entrez_transcriptomics_df.rename(columns={'value':'transcriptomics'}) + sample_entrez_transcriptomics_df = sample_entrez_transcriptomics_df.astype({'entrez_id':'int','improve_sample_id':'int'}) + sample_entrez_transcriptomics_df = sample_entrez_transcriptomics_df[['entrez_id','transcriptomics','improve_sample_id','source','study']] + + return(sample_entrez_transcriptomics_df) + + + From 08c2f2f373c5f983098f71d505d486abcf0b8271 Mon Sep 17 00:00:00 2001 From: RubyFore Date: Tue, 1 Jul 2025 16:19:02 -0700 Subject: [PATCH 07/20] Addition of drugs and experiments, alterations to calc_pdx_metrics Addition of drug data (~12 drugs were not matchable) and experiments from `calc_pdx_metrics.py`. `calc_pdx_metrics.py` was altered to skip auc calculations with fewer than 2 points (the linear model would not converge). 
--- build/novartispdx/02-omics-novartispdx.py | 44 ++++++++++++ build/novartispdx/03-drugs-novartispdx.py | 70 +++++++++++++++++++ .../novartispdx/04-experiments-novartispdx.py | 59 ++++++++++++++++ build/novartispdx/build_drugs.sh | 15 ++++ build/novartispdx/build_experiments.sh | 0 build/novartispdx/build_omics.sh | 12 ++++ build/utils/calc_pdx_metrics.py | 30 ++++++-- 7 files changed, 223 insertions(+), 7 deletions(-) create mode 100644 build/novartispdx/03-drugs-novartispdx.py create mode 100644 build/novartispdx/04-experiments-novartispdx.py create mode 100755 build/novartispdx/build_drugs.sh create mode 100644 build/novartispdx/build_experiments.sh diff --git a/build/novartispdx/02-omics-novartispdx.py b/build/novartispdx/02-omics-novartispdx.py index 69dfb134..edb3a451 100644 --- a/build/novartispdx/02-omics-novartispdx.py +++ b/build/novartispdx/02-omics-novartispdx.py @@ -4,6 +4,33 @@ import math import argparse + +def get_copy_call(a): + """ + Heler Function - Determine copy call for a value. + """ + + if a is None: + return float('nan') + + if math.isnan(a): + return float('nan') + + a_val = math.log2(float(a)+0.000001) + if a_val < 0.5210507: + return 'deep del' + elif a_val < 0.7311832: + return 'het loss' + elif a_val < 1.214125: + return 'diploid' + elif a_val < 1.422233: + return 'gain' + else: + return 'amp' + + return pd.Series([get_copy_call(a) for a in arr]) + + def download_parse_omics_novPDX(synID:str , save_path:str = None, synToken:str = None): """ Download omics data from Synapse at synapseID syn66364488. 
Requires a synapse token, which requires you to make a Synapse account @@ -167,4 +194,21 @@ def map_transcriptomics_novPDX(transcriptomics_data, improve_id_data, entrez_dat return(sample_entrez_transcriptomics_df) +if __name__ == "__main__": + print('in main') + parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of omics data files for the Bladder PDO project") + parser.add_argument('-s', '--samples', help='Path to sample file',default=None) + parser.add_argument('-g', '--genes', help='Path to genes file', default = None) + parser.add_argument('-c', '--copy', help='Flag to capture copy number data', action='store_true', default=False) + parser.add_argument('-m', '--mutation', help='Flag to capture mutation data', action='store_true', default=False) + parser.add_argument('-e', '--expression', help='Flag to capture transcriptomic data', action='store_true', default=False) + parser.add_argument('-t', '--token', help='Synapse token') + + args = parser.parse_args() + print("Logging into Synapse") + PAT = args.token + + genes=pd.read_csv(args.genes) + samples = pd.read_csv(args.samples) + data =download_parse_omics_novPDX(syn id,savestring, PAT) \ No newline at end of file diff --git a/build/novartispdx/03-drugs-novartispdx.py b/build/novartispdx/03-drugs-novartispdx.py new file mode 100644 index 00000000..4f746a9a --- /dev/null +++ b/build/novartispdx/03-drugs-novartispdx.py @@ -0,0 +1,70 @@ +import synapseclient +import pandas as pd +import numpy as np +import argparse +import os +# for testing locally +from utils.pubchem_retrieval import update_dataframe_and_write_tsv +# for building in docker +#from pubchem_retrieval import update_dataframe_and_write_tsv + + +def create_novartis_pdx_drugs_file(synObject, prevDrugFilepath, outputPath): + file = synObject.get('syn66276102') + # read raw drug data from synapse + rawDrugData = pd.read_csv(file.path) + # split on + operator - there are 2- and one 3- way drug combos in 
this dataset + sepDrugNames = pd.Series(rawDrugData['Treatment'].unique()).str.split("+", expand=True) + ### NEED TO ALSO remove drug names with different dose info + + + # taking the drug names from the first and second column from the split - there is only one + # drug name in the 3rd column (onen 3-way combo) that is replicated in other treatments as well + alldrugnames = pd.Series(pd.concat([sepDrugNames[0], sepDrugNames[1]]).dropna()).str.split('"', expand=True)[0].str.split("-", expand=True)[0] + #nodoseinfo = pd.Series(alldrugnames.str.split("-", expand =True)[0]) + #combineddrugames = pd.concat([alldrugnames, nodoseinfo]) + finalDrugNames = pd.Series(alldrugnames.unique()).str.strip().unique() + # get unique drugs + newdrugnames = finalDrugNames[finalDrugNames != 'untreated'] + + #print(finalDrugNames.tolist) + #newdrugnames = finalDrugNames.remove('untreated') + print(2) + print(newdrugnames) + + + # use helper functions in pubchem_retrieval.py + alldrugs = [] + if prevDrugFilepath is not None and prevDrugFilepath is not "": + prevdrugs = [pd.read_csv(t,sep='\t') for t in prevDrugFilepath.split(',')] + alldrugs = pd.concat(prevdrugs).drop_duplicates() + + imps = alldrugs[alldrugs.chem_name.isin(newdrugnames)] + newdrugs = alldrugs[alldrugs.improve_drug_id.isin(imps.improve_drug_id)] + + ##write drugs + newdrugs.to_csv(outputPath, sep='\t', index=False) + + if len(alldrugs)==0 or len(newdrugnames)>len(set(newdrugs.improve_drug_id)): #we have more names we didn't match + print('Missing drugs in existing file, querying pubchem') + update_dataframe_and_write_tsv(newdrugnames,outputPath) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of drug data files for the Novartis PDX data") + parser.add_argument('-d', '--prevDrugFilePath', help='Path to a previous drug file for bladderpdo', nargs="?", default = None) + parser.add_argument('-o', '--outputPath', help='Output path 
for updated novartispdx drug file', default = "/tmp/novartispdx_drugs.tsv") + parser.add_argument('-t', '--token', help='Synapse token') + + args = parser.parse_args() + print("Logging into Synapse") + PAT = args.token + print("after PAT assignment") + synObject = synapseclient.login(authToken=PAT) + print('after creating synObject') + if args.prevDrugFilePath: + previousDrugs = args.prevDrugFilePath + else: + previousDrugs = None + create_novartis_pdx_drugs_file(synObject, previousDrugs, args.outputPath) \ No newline at end of file diff --git a/build/novartispdx/04-experiments-novartispdx.py b/build/novartispdx/04-experiments-novartispdx.py new file mode 100644 index 00000000..bd1bd2d2 --- /dev/null +++ b/build/novartispdx/04-experiments-novartispdx.py @@ -0,0 +1,59 @@ +import synapseclient +import pandas as pd +import numpy as np +import argparse +import os + +def get_novartis_pdx_experiments_file: + # input for the calc_pdx_metrics script + + file1 = synObject.get('syn66276102') + rawDrugData = pd.read_csv(file1.path) + # STILL NEED TO : link to improve ids. 
+ # update a few drug ids for greater inclusion + novartispdx_curvefile = rawDrugData[['Model', 'Days Post T0', 'Volume (mm3)', 'Treatment']] + novartispdx_curvefile=novartispdx_curvefile.rename({'Model': 'model_id', 'Days Post T0' : 'time', 'Volume (mm3)': 'volume', 'Treatment':'treatment'}, axis=1) + novartispdx_curvefile['treatment'] = novartispdx_curvefile['treatment'].str.lower() + novartispdx_curvefile['treatment'] = novartispdx_curvefile['treatment'].str.replace('"', '') + novartispdx_curvefile['treatment']=novartispdx_curvefile['treatment'].str.replace('untreated', 'control') + novartispdx_curvefile['experiment'] = novartispdx_curvefile.groupby(['model_id']).ngroup()+1 + # remove triple combination(s) + novartispdx_curvefile = novartispdx_curvefile[~novartispdx_curvefile['treatment'].str.contains(r'\+.*\+')] + # remove dose information appended to some drugs in the treatment column and include in dose colum + druganddose = novartispdx_curvefile['treatment'].str.split('-', expand=True) + druganddose = druganddose.rename({0: 'treatment', 1:'dose'}, axis=1) + novartispdx_curvefile['treatment']=druganddose['treatment'] + novartispdx_curvefile['dose'] = druganddose['dose'] + # remove pdxs with only one drug treatment (no control) + unique_vals_tally = novartispdx_curvefile.groupby('experiment').nunique() + todiscard = unique_vals_tally[unique_vals_tally['treatment']==1].index + novartispdx_curvefile = novartispdx_curvefile[~novartispdx_curvefile['experiment'].isin(todiscard)] + # remove groups with no 'control' treatment + groupeddf = test.groupby('experiment') + no_control = groupeddf['treatment'].apply(lambda x: x.str.contains('control').any()) + + missingcontrols = no_control.reset_index()[no_control.reset_index()['treatment'] ==False]['experiment'] + finaldf=test[~test['experiment'].isin(missingcontrols)] + + finalcurvefile = finaldf + return finalcurvefile + #finalcurvefile.to_csv('/tmp/novartispdx_doserep.tsv', sep="\t") + + +if __name__ == "__main__": + 
parser = argparse.ArgumentParser() + parser.add_argument('-t', '--token', help='Synapse authentication token') + parser.add_argument('-s', '--curSampleFile', help='Sample mapping file for bladder pdo samples') + parser.add_argument('-d', '--drugfile', help='Drug mapping file for bladder pdo samples') + parser.add_argument('-o', '--output', default = '/tmp/novartispdx_doserep.tsv',help='Output file to be read into curve fitting code') + + args = parser.parse_args() + print("Logging into Synapse") + PAT = args.token + synObject = synapseclient.login(authToken=PAT) + drug_df = pd.read_csv(args.drugfile, sep='\t') + samples_df = pd.read_csv(args.curSampleFile) + + doseresponse_data = get_novartis_pdx_experiments_file(synObject, samples_df, drug_df) + doseresponse_data.to_csv(args.output, sep='\t') + diff --git a/build/novartispdx/build_drugs.sh b/build/novartispdx/build_drugs.sh new file mode 100755 index 00000000..53abb4ec --- /dev/null +++ b/build/novartispdx/build_drugs.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -euo pipefail + +trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR + +echo "Running script with token and drugFile $1" +# for running locally (from build directory): +python3 -m novartispdx.03-drugs-novartispdx --token $SYNAPSE_AUTH_TOKEN -d $1 -o /tmp/novartispdx_drugs.tsv +#python3 novar +#python3 03-drugs-novartispdx.py --token $SYNAPSE_AUTH_TOKEN -d $1 -o /tmp/novartispdx_drugs.tsv + +echo "Running build_drug_desc.py..." 
+#for running locally: +python3 utils/build_drug_desc.py --drugtable /tmp/novartispdx_drugs.tsv --desctable /tmp/novartispdx_drug_descriptors.tsv.gz +#python3 build_drug_desc.py --drugtable /tmp/novartispdx_drugs.tsv --desctable /tmp/novartispdx_drug_descriptors.tsv.gz \ No newline at end of file diff --git a/build/novartispdx/build_experiments.sh b/build/novartispdx/build_experiments.sh new file mode 100644 index 00000000..e69de29b diff --git a/build/novartispdx/build_omics.sh b/build/novartispdx/build_omics.sh index e69de29b..39585d8c 100644 --- a/build/novartispdx/build_omics.sh +++ b/build/novartispdx/build_omics.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -euo pipefail + +trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR + +echo "Running script with token, curSamples $2, and genes $1." +# for mutation data (-m) +python3 02-omics-novartispdx.py --token $SYNAPSE_AUTH_TOKEN -s $2 -g $1 -m +# for expressiondata (-e) +python3 02-omics-novartispdx.py --token $SYNAPSE_AUTH_TOKEN -s $2 -g $1 -e +# for copynumber +python3 02-omics-novartispdx.py --token $SYNAPSE_AUTH_TOKEN -s $2 -g $1 -c \ No newline at end of file diff --git a/build/utils/calc_pdx_metrics.py b/build/utils/calc_pdx_metrics.py index e0f4c05a..6e28008a 100755 --- a/build/utils/calc_pdx_metrics.py +++ b/build/utils/calc_pdx_metrics.py @@ -184,7 +184,12 @@ def AUC(time, volume, time_normalize=True): dict: Dictionary containing the AUC value. 
""" auc = trapz_auc(time, volume) - #print(time) + print('at line 187') + print(time.shape) + print(time.dtype) + print(np.max(time.astype(int))) + print('auc is : ') + print(auc) if time_normalize: auc = auc/np.max(time) return {"metric": "auc", "value": auc, 'time':np.max(time)} @@ -270,10 +275,15 @@ def lmm(time, volume, treatment, drug_name): raise ValueError("These columns must be present: 'model_id', 'volume', 'time', 'exp_type'") data['log_volume'] = np.log(data['volume']) - + print('drug name is ' + drug_name) + data['exp_type'] = data['exp_type'].astype('category') + data['exp_type']=pd.Categorical(data['exp_type'],categories = ['control',drug_name], ordered=True) + print(data) + print(data['exp_type'].cat.categories) # Define the formula for mixed linear model formula = 'log_volume ~ time*exp_type' + #print(data['exp_type'].cat.categories) # Fit the model model = mixedlm(formula, data, groups=data['model_id']) fit = model.fit() @@ -282,8 +292,9 @@ def lmm(time, volume, treatment, drug_name): #interaction_term = 'time:exp_type' # if interaction_term in fit.params: # time_coef_value = fit.params['time'] - #print(fit.params) + print(fit.params) i_coef_value = fit.params['time:exp_type[T.'+drug_name+']'] + #i_coef_value = fit.params['time:exp_type['+drug_name+']'] # else: # coef_value = None # Handle the case when the interaction term is not present @@ -341,6 +352,7 @@ def get_drug_stats(df, control='control'): for name, group in tqdm(groups): # Each group contains multiple treatments and a control drugs = set(group.treatment) - set([control]) + print('line 355') print(name[0]) print(drugs) mod = list(set(group.model_id))[0] @@ -348,14 +360,17 @@ def get_drug_stats(df, control='control'): ctl_data = group[group.treatment == control] ctl_time = np.array(ctl_data.time) ctl_volume = np.array(ctl_data.volume) - + if (ctl_volume.shape[0] < 2): + continue ctl_auc = AUC(ctl_time, ctl_volume) for d in drugs: - print(d) - d_data = group[group.treatment == d] + 
print('is our drug a string or dict?') + print(str(d)) + d_data = group[group.treatment == str(d)] treat_time = np.array(d_data.time) treat_volume = np.array(d_data.volume) - + if (treat_volume.shape[0] < 2): + continue # Get ABC for group treat_auc = AUC(treat_time, treat_volume) treat_abc = ABC(ctl_time, ctl_volume, treat_time, treat_volume) @@ -368,6 +383,7 @@ def get_drug_stats(df, control='control'): #llm comb = pd.concat([ctl_data, d_data]) + #print(comb) lmm_res = lmm(comb.time, comb.volume, comb.treatment, d) lmm_res.update({'sample': mod, 'drug': d, 'time': np.max(treat_time), 'time_unit': 'days'}) if '+' in d: From 9d361120a0ef1e18c5c0917a197942e2aa0e881e Mon Sep 17 00:00:00 2001 From: RubyFore Date: Mon, 14 Jul 2025 14:08:34 -0700 Subject: [PATCH 08/20] progress on experiments data adding in improve sample and drug ids, added build_experiments.sh, testing single experiments with linkml. More work to do with combo experiments. --- build/novartispdx/03-drugs-novartispdx.py | 2 +- .../novartispdx/04-experiments-novartispdx.py | 35 ++++++++++++------- build/novartispdx/build_experiments.sh | 5 +++ build/utils/calc_pdx_metrics.py | 20 ++++++----- 4 files changed, 39 insertions(+), 23 deletions(-) mode change 100644 => 100755 build/novartispdx/build_experiments.sh diff --git a/build/novartispdx/03-drugs-novartispdx.py b/build/novartispdx/03-drugs-novartispdx.py index 4f746a9a..a015bba9 100644 --- a/build/novartispdx/03-drugs-novartispdx.py +++ b/build/novartispdx/03-drugs-novartispdx.py @@ -15,7 +15,7 @@ def create_novartis_pdx_drugs_file(synObject, prevDrugFilepath, outputPath): rawDrugData = pd.read_csv(file.path) # split on + operator - there are 2- and one 3- way drug combos in this dataset sepDrugNames = pd.Series(rawDrugData['Treatment'].unique()).str.split("+", expand=True) - ### NEED TO ALSO remove drug names with different dose info + # taking the drug names from the first and second column from the split - there is only one diff --git 
a/build/novartispdx/04-experiments-novartispdx.py b/build/novartispdx/04-experiments-novartispdx.py index bd1bd2d2..dd7e84cd 100644 --- a/build/novartispdx/04-experiments-novartispdx.py +++ b/build/novartispdx/04-experiments-novartispdx.py @@ -4,7 +4,9 @@ import argparse import os -def get_novartis_pdx_experiments_file: + +# add improve IDs - for sample and drug +def get_novartis_pdx_experiments_file(synObject, samples_df): # input for the calc_pdx_metrics script file1 = synObject.get('syn66276102') @@ -29,31 +31,38 @@ def get_novartis_pdx_experiments_file: todiscard = unique_vals_tally[unique_vals_tally['treatment']==1].index novartispdx_curvefile = novartispdx_curvefile[~novartispdx_curvefile['experiment'].isin(todiscard)] # remove groups with no 'control' treatment - groupeddf = test.groupby('experiment') + groupeddf = novartispdx_curvefile.groupby('experiment') no_control = groupeddf['treatment'].apply(lambda x: x.str.contains('control').any()) missingcontrols = no_control.reset_index()[no_control.reset_index()['treatment'] ==False]['experiment'] - finaldf=test[~test['experiment'].isin(missingcontrols)] - - finalcurvefile = finaldf + nomissingcontrols=novartispdx_curvefile[~novartispdx_curvefile['experiment'].isin(missingcontrols)] + #merge on drug names done in calc_pdx_metrics.py + #final_w_drugIDs = finaldf.merge(drug_df, how='left',right_on='chem_name', left_on="treatment") + final_allIDs = nomissingcontrols.merge(samples_df, how='left', right_on='common_name', left_on='model_id') + print(final_allIDs.head) + final_allIDs = final_allIDs.drop('model_id', axis=1) + finalDF = final_allIDs.rename({'improve_sample_id':'model_id'}, axis=1) + print(finalDF.head) + finalcurvefile = finalDF[['model_id', 'time', 'volume', 'treatment', 'experiment', 'dose']] + print(finalcurvefile.head) return finalcurvefile - #finalcurvefile.to_csv('/tmp/novartispdx_doserep.tsv', sep="\t") if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-t', 
'--token', help='Synapse authentication token') - parser.add_argument('-s', '--curSampleFile', help='Sample mapping file for bladder pdo samples') - parser.add_argument('-d', '--drugfile', help='Drug mapping file for bladder pdo samples') - parser.add_argument('-o', '--output', default = '/tmp/novartispdx_doserep.tsv',help='Output file to be read into curve fitting code') + parser.add_argument('-s', '--curSampleFile', default='/tmp/novartispdx_samples.csv', help='Sample mapping file for bladder pdo samples') + parser.add_argument('-d', '--drugfile', default='/tmp/novartispdx_drugs.tsv', help='Drug mapping file for bladder pdo samples') + parser.add_argument('-o', '--output', default = '/tmp/novartispdx_experiments.tsv',help='Output experiments file') args = parser.parse_args() print("Logging into Synapse") PAT = args.token synObject = synapseclient.login(authToken=PAT) - drug_df = pd.read_csv(args.drugfile, sep='\t') + #drug_df = pd.read_csv(args.drugfile, sep='\t') samples_df = pd.read_csv(args.curSampleFile) - - doseresponse_data = get_novartis_pdx_experiments_file(synObject, samples_df, drug_df) - doseresponse_data.to_csv(args.output, sep='\t') + + doseresponse_data = get_novartis_pdx_experiments_file(synObject, samples_df) + print(doseresponse_data.head) + doseresponse_data.to_csv('/tmp/novartispdx_curvedata.tsv', columns=list({'model_id', 'time', 'volume', 'treatment','experiment', 'dose'}), sep='\t') diff --git a/build/novartispdx/build_experiments.sh b/build/novartispdx/build_experiments.sh old mode 100644 new mode 100755 index e69de29b..287292a7 --- a/build/novartispdx/build_experiments.sh +++ b/build/novartispdx/build_experiments.sh @@ -0,0 +1,5 @@ + +#python3 04-experiments-novartispdx.py --token $SYNAPSE_AUTH_TOKEN + +python3 -m novartispdx.04-experiments-novartispdx --token $SYNAPSE_AUTH_TOKEN -o ~/Projects/CoderData/dev-environment/novartispdx/novartispdx_curvedata.tsv +python3 utils/calc_pdx_metrics.py /tmp/novartispdx_curvedata.tsv 
--drugfile=/tmp/novartispdx_drugs.tsv --outprefix=/tmp/novartispdx \ No newline at end of file diff --git a/build/utils/calc_pdx_metrics.py b/build/utils/calc_pdx_metrics.py index 6e28008a..ef3c3b41 100755 --- a/build/utils/calc_pdx_metrics.py +++ b/build/utils/calc_pdx_metrics.py @@ -275,11 +275,11 @@ def lmm(time, volume, treatment, drug_name): raise ValueError("These columns must be present: 'model_id', 'volume', 'time', 'exp_type'") data['log_volume'] = np.log(data['volume']) - print('drug name is ' + drug_name) + #print('drug name is ' + drug_name) data['exp_type'] = data['exp_type'].astype('category') data['exp_type']=pd.Categorical(data['exp_type'],categories = ['control',drug_name], ordered=True) - print(data) - print(data['exp_type'].cat.categories) + #print(data) + #print(data['exp_type'].cat.categories) # Define the formula for mixed linear model formula = 'log_volume ~ time*exp_type' @@ -327,11 +327,13 @@ def main(): # source improve_sample_id improve_drug_id study time time_unit dose_response_metric dose_response_value combos[['drug1','drug2']]=combos.drug.str.split('+',expand=True) + print('COMBOS ARE: ') + print(combos[['drug1', 'drug2']]) combos = combos.rename({'metric':'drug_combination_metric','value':'drug_combination_value','sample':'improve_sample_id'},axis=1).dropna() expcomb = combos.rename({'drug1':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_1'},axis=1)[['improve_drug_1','drug2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']] expcomb = expcomb.rename({'drug2':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_2'},axis=1)[['improve_drug_1','improve_drug_2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']] - + print(expcomb[['improve_drug_1', 'improve_drug_2']]) expcomb[['source']]='Synapse' expcomb[['study']]='MPNST PDX in vivo' @@ -352,9 +354,9 @@ def 
get_drug_stats(df, control='control'): for name, group in tqdm(groups): # Each group contains multiple treatments and a control drugs = set(group.treatment) - set([control]) - print('line 355') - print(name[0]) - print(drugs) + #print('line 355') + #print(name[0]) + #print(drugs) mod = list(set(group.model_id))[0] ctl_data = group[group.treatment == control] @@ -364,8 +366,8 @@ def get_drug_stats(df, control='control'): continue ctl_auc = AUC(ctl_time, ctl_volume) for d in drugs: - print('is our drug a string or dict?') - print(str(d)) + #print('is our drug a string or dict?') + #print(str(d)) d_data = group[group.treatment == str(d)] treat_time = np.array(d_data.time) treat_volume = np.array(d_data.volume) From a9164ca5397a39cc6a3f31e21dd649fdc2057819 Mon Sep 17 00:00:00 2001 From: RubyFore Date: Wed, 16 Jul 2025 15:17:57 -0700 Subject: [PATCH 09/20] final script for novartispdx and some changes to calc_pdx_metrics.py Changes to calc_pdx_metrics.py include adding flexibility for `study` and `source` instead of being hard coded to MPNST. Added an if statement around the final touches to combos dataset, because if there is no drug data for the combo drugs, errors arise. Moved writing combos to csv to be inside this if statement. 
--- build/novartispdx/01-samples-novartispdx.py | 1 - .../novartispdx/04-experiments-novartispdx.py | 6 +-- build/novartispdx/build_experiments.sh | 2 +- build/utils/calc_pdx_metrics.py | 45 ++++++++++--------- 4 files changed, 25 insertions(+), 29 deletions(-) diff --git a/build/novartispdx/01-samples-novartispdx.py b/build/novartispdx/01-samples-novartispdx.py index e5d431b1..b46797b9 100644 --- a/build/novartispdx/01-samples-novartispdx.py +++ b/build/novartispdx/01-samples-novartispdx.py @@ -44,7 +44,6 @@ def get_complete_novartispdx_sample_sheet(synObject): print("Logging into Synapse") PAT = args.token synObject = synapseclient.login(authToken=PAT) - samplesheet = get_complete_novartispdx_sample_sheet(synObject) if (args.prevSamples): diff --git a/build/novartispdx/04-experiments-novartispdx.py b/build/novartispdx/04-experiments-novartispdx.py index dd7e84cd..31cebbdc 100644 --- a/build/novartispdx/04-experiments-novartispdx.py +++ b/build/novartispdx/04-experiments-novartispdx.py @@ -20,7 +20,7 @@ def get_novartis_pdx_experiments_file(synObject, samples_df): novartispdx_curvefile['treatment']=novartispdx_curvefile['treatment'].str.replace('untreated', 'control') novartispdx_curvefile['experiment'] = novartispdx_curvefile.groupby(['model_id']).ngroup()+1 # remove triple combination(s) - novartispdx_curvefile = novartispdx_curvefile[~novartispdx_curvefile['treatment'].str.contains(r'\+.*\+')] + novartispdx_curvefile = novartispdx_curvefile[~novartispdx_curvefile['treatment'].str.contains(r'\+')] # remove dose information appended to some drugs in the treatment column and include in dose colum druganddose = novartispdx_curvefile['treatment'].str.split('-', expand=True) druganddose = druganddose.rename({0: 'treatment', 1:'dose'}, axis=1) @@ -39,12 +39,9 @@ def get_novartis_pdx_experiments_file(synObject, samples_df): #merge on drug names done in calc_pdx_metrics.py #final_w_drugIDs = finaldf.merge(drug_df, how='left',right_on='chem_name', left_on="treatment") 
final_allIDs = nomissingcontrols.merge(samples_df, how='left', right_on='common_name', left_on='model_id') - print(final_allIDs.head) final_allIDs = final_allIDs.drop('model_id', axis=1) finalDF = final_allIDs.rename({'improve_sample_id':'model_id'}, axis=1) - print(finalDF.head) finalcurvefile = finalDF[['model_id', 'time', 'volume', 'treatment', 'experiment', 'dose']] - print(finalcurvefile.head) return finalcurvefile @@ -59,7 +56,6 @@ def get_novartis_pdx_experiments_file(synObject, samples_df): print("Logging into Synapse") PAT = args.token synObject = synapseclient.login(authToken=PAT) - #drug_df = pd.read_csv(args.drugfile, sep='\t') samples_df = pd.read_csv(args.curSampleFile) doseresponse_data = get_novartis_pdx_experiments_file(synObject, samples_df) diff --git a/build/novartispdx/build_experiments.sh b/build/novartispdx/build_experiments.sh index 287292a7..a333402d 100755 --- a/build/novartispdx/build_experiments.sh +++ b/build/novartispdx/build_experiments.sh @@ -2,4 +2,4 @@ #python3 04-experiments-novartispdx.py --token $SYNAPSE_AUTH_TOKEN python3 -m novartispdx.04-experiments-novartispdx --token $SYNAPSE_AUTH_TOKEN -o ~/Projects/CoderData/dev-environment/novartispdx/novartispdx_curvedata.tsv -python3 utils/calc_pdx_metrics.py /tmp/novartispdx_curvedata.tsv --drugfile=/tmp/novartispdx_drugs.tsv --outprefix=/tmp/novartispdx \ No newline at end of file +python3 utils/calc_pdx_metrics.py /tmp/novartispdx_curvedata.tsv --drugfile=/tmp/novartispdx_drugs.tsv --outprefix=/tmp/novartispdx --study='Novartis PDX Gao etal 2015' --source='Synapse' \ No newline at end of file diff --git a/build/utils/calc_pdx_metrics.py b/build/utils/calc_pdx_metrics.py index ef3c3b41..b25e4f1e 100755 --- a/build/utils/calc_pdx_metrics.py +++ b/build/utils/calc_pdx_metrics.py @@ -184,12 +184,12 @@ def AUC(time, volume, time_normalize=True): dict: Dictionary containing the AUC value. 
""" auc = trapz_auc(time, volume) - print('at line 187') - print(time.shape) - print(time.dtype) - print(np.max(time.astype(int))) - print('auc is : ') - print(auc) + #print('at line 187') + #print(time.shape) + #print(time.dtype) + #print(np.max(time.astype(int))) + #print('auc is : ') + #print(auc) if time_normalize: auc = auc/np.max(time) return {"metric": "auc", "value": auc, 'time':np.max(time)} @@ -292,7 +292,7 @@ def lmm(time, volume, treatment, drug_name): #interaction_term = 'time:exp_type' # if interaction_term in fit.params: # time_coef_value = fit.params['time'] - print(fit.params) + #print(fit.params) i_coef_value = fit.params['time:exp_type[T.'+drug_name+']'] #i_coef_value = fit.params['time:exp_type['+drug_name+']'] # else: @@ -312,6 +312,8 @@ def main(): parser.add_argument('curvefile') parser.add_argument('--drugfile') parser.add_argument('--outprefix',default='/tmp/') + parser.add_argument('--study') + parser.add_argument('--source') args = parser.parse_args() @@ -325,22 +327,21 @@ def main(): expsing = expsing.dropna() # source improve_sample_id improve_drug_id study time time_unit dose_response_metric dose_response_value - - combos[['drug1','drug2']]=combos.drug.str.split('+',expand=True) - print('COMBOS ARE: ') - print(combos[['drug1', 'drug2']]) - combos = combos.rename({'metric':'drug_combination_metric','value':'drug_combination_value','sample':'improve_sample_id'},axis=1).dropna() - - expcomb = combos.rename({'drug1':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_1'},axis=1)[['improve_drug_1','drug2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']] - expcomb = expcomb.rename({'drug2':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_2'},axis=1)[['improve_drug_1','improve_drug_2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']] - 
print(expcomb[['improve_drug_1', 'improve_drug_2']]) - expcomb[['source']]='Synapse' - expcomb[['study']]='MPNST PDX in vivo' - - expsing[['source']]='Synapse' - expsing[['study']]='MPNST PDX in vivo' + if combos.shape[0]> 0: + combos[['drug1','drug2']]=combos['drug'].str.split('+',expand=True) + + combos = combos.rename({'metric':'drug_combination_metric','value':'drug_combination_value','sample':'improve_sample_id'},axis=1).dropna() + + expcomb = combos.rename({'drug1':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_1'},axis=1)[['improve_drug_1','drug2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']] + expcomb = expcomb.rename({'drug2':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_2'},axis=1)[['improve_drug_1','improve_drug_2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']] + expcomb[['source']]=args.source + expcomb[['study']]=args.study + expcomb.to_csv(args.outprefix+'_combinations.tsv',index=False, sep="\t") + + expsing[['source']]=args.source + expsing[['study']]=args.study expsing.to_csv(args.outprefix+'_experiments.tsv',index=False, sep="\t") - expcomb.to_csv(args.outprefix+'_combinations.tsv',index=False, sep="\t") + #expcomb.to_csv(args.outprefix+'_combinations.tsv',index=False, sep="\t") From 481385ec5f99331d4cbb72c419e506dfd2fa2d86 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Mon, 21 Jul 2025 16:52:36 -0700 Subject: [PATCH 10/20] Datasets Merged, Build works, Validate works --- build/mpnst/00_sample_gen.R | 3 +- build/mpnst/01_combined_omics.R | 250 +++++++++++++++++++++ build/mpnst/01_mpnst_get_omics.R | 205 ----------------- build/mpnst/02_get_drug_data.R | 238 ++++++++------------ build/mpnst/03_get_drug_response_data.R | 152 ------------- build/mpnst/03_get_experiments.R | 280 ++++++++++++++++++++++++ build/mpnst/build_exp.sh | 5 +- build/mpnst/build_omics.sh 
| 4 +- build/mpnst/requirements.r | 2 + build/mpnst/requirements.txt | 3 +- build/mpnst/sample_gen.R | 25 --- 11 files changed, 637 insertions(+), 530 deletions(-) mode change 100755 => 100644 build/mpnst/00_sample_gen.R create mode 100644 build/mpnst/01_combined_omics.R delete mode 100755 build/mpnst/01_mpnst_get_omics.R delete mode 100644 build/mpnst/03_get_drug_response_data.R create mode 100644 build/mpnst/03_get_experiments.R delete mode 100644 build/mpnst/sample_gen.R diff --git a/build/mpnst/00_sample_gen.R b/build/mpnst/00_sample_gen.R old mode 100755 new mode 100644 index 0ec5704b..0cae6d45 --- a/build/mpnst/00_sample_gen.R +++ b/build/mpnst/00_sample_gen.R @@ -1,5 +1,4 @@ -# This script generate a new sample table based on pervious beatAML improved sample ID -# It will take the maximum value of beatAML improved sample ID and continue from ID count from there +# This script generate a new sample table based on previous dataset's sample file (taking the max improve_sample_id) # Load required libraries library(data.table) library(synapser) diff --git a/build/mpnst/01_combined_omics.R b/build/mpnst/01_combined_omics.R new file mode 100644 index 00000000..a51bafbb --- /dev/null +++ b/build/mpnst/01_combined_omics.R @@ -0,0 +1,250 @@ +#!/usr/bin/env Rscript + +# Combined MPNST & MPNST-PDX Data Extraction Script +# This script unifies data extraction for PDX, Tumor, and Xenograft-Derived Organoid samples. + +# Load required libraries +library(data.table) +library(synapser) +library(dplyr) +library(tidyr) + +# Retrieve command line arguments +args <- commandArgs(trailingOnly = TRUE) +if (length(args) < 3) { + stop("Usage: Rscript 01_combined_omics.R ", call. 
= FALSE) +} +PAT <- args[1] +samples <- args[2] +genes <- args[3] + +# Log in to Synapse +token <- PAT +synLogin(authToken = token) + +# Read sample mapping and gene mapping +samples_df <- fread(samples) %>% + select(improve_sample_id, common_name, model_type) %>% + distinct() +genes_df <- fread(genes) + +# Subset by model type +pdx_samps <- filter(samples_df, model_type == "patient derived xenograft") +tumor_samps<- filter(samples_df, model_type == "tumor") +mt_samps <- filter(samples_df, model_type == "xenograft derived organoid") # These end up being the same as pdx_samps in the manifest. + +# Retrieve manifest table from Synapse +manifest <- synTableQuery("select * from syn53503360")$asDataFrame() %>% + rename(common_name = Sample) + +print("manifest") +print(manifest) + +# Build sample tables +pdx_data <- manifest %>% + select(common_name, starts_with("PDX")) %>% + left_join(pdx_samps, by = "common_name") %>% + select(improve_sample_id, common_name, model_type, + RNASeq = PDX_RNASeq, + Mutations = PDX_Somatic_Mutations, + CopyNumber = PDX_CNV, + Proteomics = PDX_Proteomics) %>% + filter(!is.na(improve_sample_id)) + + +tumor_data <- manifest %>% + select(common_name, starts_with("Tumor")) %>% + left_join(tumor_samps, by = "common_name") %>% + select(improve_sample_id, common_name, model_type, + RNASeq = Tumor_RNASeq, + Mutations = Tumor_Somatic_Mutations, + CopyNumber = Tumor_CNV) %>% + mutate(Proteomics = "") %>% + filter(!is.na(improve_sample_id)) + +mt_data <- manifest %>% #Note, this is the same as pdx_data but I think we default to "xenograft derived organoid" if present. 
+ select(common_name, starts_with("PDX")) %>% + left_join(mt_samps, by = "common_name") %>% + select(improve_sample_id, common_name, model_type, + RNASeq = PDX_RNASeq, + Mutations = PDX_Somatic_Mutations, + CopyNumber = PDX_CNV, + Proteomics = PDX_Proteomics) %>% + filter(!is.na(improve_sample_id)) + +# Combine all sample tables +dcombined <- bind_rows(pdx_data, tumor_data, mt_data) %>% distinct() +print("dcombined:") +print(dcombined) + +# Helper to assign study label based on model_type +study_label <- function(type) { + case_when( + type == "patient derived xenograft" ~ "MPNST PDX", + type == "tumor" ~ "MPNST Tumor", + type == "xenograft derived organoid" ~ "MPNST PDX MT", + TRUE ~ "MPNST" + ) +} + +# Helper to pick metadata based on sample ID and column +pick_meta <- function(id, column) { + # column {"Proteomics","RNASeq","Mutations","CopyNumber"} + if (any(tumor_data[[column]] == id, na.rm = TRUE)) { + sdf <- tumor_data %>% filter(.data[[column]] == id) %>% slice(1) + } else if (any(mt_data[[column]] == id, na.rm = TRUE)) { + sdf <- mt_data %>% filter(.data[[column]] == id) %>% slice(1) + } else if (any(pdx_data[[column]] == id, na.rm = TRUE)) { + sdf <- pdx_data %>% filter(.data[[column]] == id) %>% slice(1) + } else { + return(NULL) + } + list( + sample_id = sdf$improve_sample_id, + model_type = sdf$model_type + ) +} + +# Safe extraction: only return non-empty data frames +i_safe_extract <- function(df, sample_id, source_val, study_val) { + if (is.null(df) || nrow(df) == 0) return(NULL) + df$improve_sample_id <- sample_id + df$source <- source_val + df$study <- study_val + df +} + +# 1) Proteomics +proteomics_list <- lapply( + setdiff(dcombined$Proteomics, c("", NA, "NA")), + function(id) { + meta <- pick_meta(id, "Proteomics") + if (is.null(meta)) return(NULL) + + df <- tryCatch( + fread(synGet(id)$path) %>% + rename(gene_symbol = Gene) %>% + left_join(genes_df, by = "gene_symbol") %>% + select(entrez_id, proteomics = logRatio) %>% + 
filter(!is.na(entrez_id), proteomics != 0) %>% + distinct(), + error = function(e) NULL + ) + i_safe_extract( + df, + meta$sample_id, + "NF Data Portal", + study_label(meta$model_type) + ) + } +) +proteomics <- bind_rows(proteomics_list) +fwrite(proteomics, file.path("/tmp", "mpnst_proteomics.csv")) +message("Wrote combined proteomics") + + +# 2) Transcriptomics (PDX, Tumor, and Organoid / MT which comes from PDX..) +transcriptomics_list <- lapply( + setdiff(dcombined$RNASeq, c("", NA, "NA")), + function(id) { + meta <- pick_meta(id, "RNASeq") + if (is.null(meta)) return(NULL) + + df <- tryCatch({ + fread(synGet(id)$path) %>% + separate(Name, into = c("other_id","vers"), sep = "\\.") %>% + select(-vers) %>% + left_join(genes_df) %>% + select(entrez_id, transcriptomics = TPM) %>% + filter(!is.na(entrez_id), transcriptomics != 0) %>% + distinct() + }, error = function(e) NULL) + + i_safe_extract( + df, + meta$sample_id, + "NF Data Portal", + study_label(meta$model_type) + ) + } +) +transcriptomics <- bind_rows(transcriptomics_list) +fwrite(transcriptomics, file.path("/tmp", "mpnst_transcriptomics.csv")) +message("Wrote combined transcriptomics") + + +# 3) Mutations (WES) +wes_list <- lapply( + setdiff(dcombined$Mutations, c("", NA, "NA")), + function(id) { + meta <- pick_meta(id, "Mutations") + if (is.null(meta)) return(NULL) + + clean_id <- gsub('[\"\\[\\]]', '', id) + df <- tryCatch( + fread(synGet(clean_id)$path) %>% + select(entrez_id = Entrez_Gene_Id, + mutation = HGVSc, + variant_classification = Variant_Classification) %>% + filter(entrez_id %in% genes_df$entrez_id) %>% + distinct(), + error = function(e) NULL + ) + + i_safe_extract( + df, + meta$sample_id, + "NF Data Portal", + study_label(meta$model_type) + ) + } +) +wes <- bind_rows(wes_list) +fwrite(wes, file.path("/tmp", "mpnst_mutations.csv")) +message("Wrote combined mutations") + + +# 4) Copy Number Variation (CNV) +cnv_list <- lapply( + setdiff(dcombined$CopyNumber, c("", NA, "NA")), + function(id) { 
+ meta <- pick_meta(id, "CopyNumber") + if (is.null(meta)) return(NULL) + + clean_id <- gsub('[\"\\[\\]]', '', id) + raw <- tryCatch(fread(synGet(clean_id)$path), error = function(e) NULL) + if (is.null(raw)) return(NULL) + + df_long <- raw %>% + separate_rows(gene, sep = ",") %>% + rename(gene_symbol = gene) %>% + left_join(genes_df, by = "gene_symbol") %>% + filter(!is.na(entrez_id)) %>% + select(entrez_id, log2) %>% + distinct() %>% + mutate(copy_number = 2^log2) %>% + select(-log2) + + df <- df_long %>% + mutate(copy_call = case_when( + copy_number < 0.5210507 ~ "deep del", + copy_number < 0.7311832 ~ "het loss", + copy_number < 1.214125 ~ "diploid", + copy_number < 1.422233 ~ "gain", + TRUE ~ "amp" + )) + + i_safe_extract( + df, + meta$sample_id, + "NF Data Portal", + study_label(meta$model_type) + ) + } +) +cnv <- bind_rows(cnv_list) +fwrite(cnv, file.path("/tmp", "mpnst_copy_number.csv")) +message("Wrote combined copy number") + + +message("All combined data files created.") diff --git a/build/mpnst/01_mpnst_get_omics.R b/build/mpnst/01_mpnst_get_omics.R deleted file mode 100755 index 9097465a..00000000 --- a/build/mpnst/01_mpnst_get_omics.R +++ /dev/null @@ -1,205 +0,0 @@ -# Load required libraries -library(data.table) -# library(biomaRt)# biomart issues still exist -library(synapser) -library(dplyr) - -# Retrieve command line arguments -args <- commandArgs(trailingOnly = TRUE) - -# Check if a token was provided -if (length(args) == 0) { - stop("No token or sample file provided. Usage: Rscript my_script.R [samples] [genes]", call. = FALSE) -} - -# Set your personal access token -PAT <- args[1] -patients <- args[2] -genefile <- args[3] - -# Log in to Synapse -synLogin(authToken = PAT) - -# Define the Ensembl mart # biomart issues still exist -# ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl") # biomart issues still exist; fix later... 
- -# Path to the directory to save .sf files -#path <- "./tmp" -#dir.create(path, showWarnings = FALSE) - -# Read the sample mapping CSV and genes.csv -samples_df <- fread(patients)|> - dplyr::select(improve_sample_id,common_name,model_type)|> - distinct()#"mpnst/synapse_NF-MPNST_samples.csv") - -pdx_samps<-subset(samples_df,model_type=='patient derived xenograft') -tumor_samps<-subset(samples_df,model_type=='tumor') -mt_samps<-subset(samples_df,model_type=='xenograft derived organoid') - -##now get the manifest from synapse -manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> - as.data.frame()|> - dplyr::rename(common_name='Sample') - - -##for now we only have tumor and PDX data -##they each get their own sample identifier -pdx_data<-manifest|>dplyr::select(common_name,starts_with("PDX"))|> - left_join(pdx_samps)|> - dplyr::select(improve_sample_id,common_name,model_type,RNASeq='PDX_RNASeq',Mutations='PDX_Somatic_Mutations',CopyNumber='PDX_CNV',Proteomics='PDX_Proteomics')|> - subset(!is.na(improve_sample_id)) - -tumor_data<- manifest|>dplyr::select(common_name,starts_with("Tumor"))|> - left_join(tumor_samps)|> - dplyr::select(improve_sample_id,common_name,model_type,RNASeq='Tumor_RNASeq',Mutations='Tumor_Somatic_Mutations',CopyNumber='Tumor_CNV')|> - mutate(Proteomics='')|> - subset(!is.na(improve_sample_id)) - ##we dont have tumor proteomics from these samples -#print(tumor_data) - -mt_data<- manifest|>dplyr::select(common_name,starts_with("PDX"))|> - left_join(mt_samps)|> - dplyr::select(improve_sample_id,common_name,model_type, RNASeq='PDX_RNASeq',Mutations='PDX_Somatic_Mutations',CopyNumber='PDX_CNV',Proteomics='PDX_Proteomics')|>##we dont have mt data yet, so collecting PDX instead - subset(!is.na(improve_sample_id)) -#print(tumor_data) - - -combined<-rbind(pdx_data,tumor_data,mt_data)|>distinct() - -# gene mapping table -genes_df <- fread(genefile) - - -##added proteomics first 
-proteomics<-do.call('rbind',lapply(setdiff(mt_data$Proteomics,c('',NA,"NA")),function(x){ - # if(x!=""){ - #print(x) - sample<-subset(mt_data,Proteomics==x) - #print(sample) - res<-fread(synGet(x)$path)|> - #tidyr::separate(Name,into=c('other_id','vers'),sep='\\.')|> - #dplyr::select(-vers)|> - dplyr::rename(gene_symbol='Gene')|> - left_join(genes_df)|> - dplyr::select(entrez_id,proteomics='logRatio')|> - distinct()|> - subset(!is.na(entrez_id))|> - subset(proteomics!=0) - - res$improve_sample_id=rep(sample$improve_sample_id[1],nrow(res)) - res$source=rep('NF Data Portal',nrow(res)) - res$study=rep('MPNST PDX MT',nrow(res)) - return(distinct(res)) - # } -})) - -fwrite(proteomics,'/tmp/mpnst_proteomics.csv.gz') - - -#### FIRST WE GET RNASeq Data - -rnaseq<-do.call('rbind',lapply(setdiff(mt_data$RNASeq,c(NA,"NA")),function(x){ - # if(x!=""){ - #print(x) - sample<-subset(mt_data,RNASeq==x) - #print(sample) - res<-fread(synGet(x)$path)|> - tidyr::separate(Name,into=c('other_id','vers'),sep='\\.')|> - dplyr::select(-vers)|> - left_join(genes_df)|> - dplyr::select(entrez_id,transcriptomics='TPM')|> - subset(!is.na(entrez_id))|> - subset(transcriptomics!=0) - - res$improve_sample_id=rep(sample$improve_sample_id[1],nrow(res)) - res$source=rep('NF Data Portal',nrow(res)) - res$study=rep('MPNST PDX MT',nrow(res)) - return(distinct(res)) - # } -})) - -fwrite(rnaseq,'/tmp/mpnst_transcriptomics.csv.gz') - - - -#####NEXT WE DO WES DATA -print("Getting WES") -wes<-do.call(rbind,lapply(setdiff(mt_data$`Mutations`,c(NA,"NA")),function(x){ - - x2=x#gsub('"','',gsub("[",'',gsub("]",'',x,fixed=T),fixed=T),fixed=T) - print(x) - sample<-subset(mt_data,Mutations==x) - print(sample$improve_sample_id) - res<-NULL - try(res<-fread(synGet(x2)$path)|> - dplyr::select(entrez_id='Entrez_Gene_Id',mutation='HGVSc',variant_classification='Variant_Classification')|> - subset(entrez_id%in%genes_df$entrez_id)|> - distinct()) - if(is.null(res)) - return(NULL) - - 
res$improve_sample_id=rep(sample$improve_sample_id[1],nrow(res)) - res$source=rep('NF Data Portal',nrow(res)) - res$study=rep('MPNST PDX MT',nrow(res)) - - return(distinct(res)) - # } -})) - -fwrite(wes,'/tmp/mpnst_mutations.csv.gz') - - -print(paste("getting CNV")) -##next let's do CNVs! -cnv<-do.call(rbind,lapply(setdiff(mt_data$CopyNumber,c(NA,"NA")),function(x){ - - x2=x#gsub('"','',gsub("[",'',gsub("]",'',x,fixed=T),fixed=T),fixed=T) - print(x) - sample<-subset(mt_data,CopyNumber==x) - print(sample$improve_sample_id) - res<-fread(synGet(x2)$path) - - long_df<- res|> - tidyr::separate_rows(gene,sep=',')|> - dplyr::rename(gene_symbol='gene')|> - dplyr::left_join(genes_df)|> - subset(!is.na(entrez_id))|> - dplyr::select(entrez_id,log2)|> - dplyr::distinct()|> - dplyr::mutate(copy_number=2^log2)|> - dplyr::select(-log2) - - res<-long_df|> ##deep del < 0.5210507 < het loss < 0.7311832 < diploid < 1.214125 < gain < 1.422233 < amp - dplyr::mutate(copy_call=ifelse(copy_number<0.5210507,'deep del', - ifelse(copy_number<0.7311832,'het loss', - ifelse(copy_number<1.214125,'diploid', - ifelse(copy_number<1.422233,'gain','amp')))))|> - mutate(study='MPNST PDX MT',source='NF Data Portal',improve_sample_id=sample$improve_sample_id[1])|> - dplyr::distinct() - - # long_df <- res[, strsplit(as.character(gene), ","), by = .(chromosome, start, end, depth, log2)] - # filtered_df <- long_df |> - # subset(is.finite(log2))|> - # filter(V1 %in% genes_df$gene) # get only protein coding genes and remove empty gene symbols - # filtered_df <- filtered_df[, .(gene_symbol = V1, - # improve_sample_id = sample$improve_sample_id[1], - # copy_number = 2^log2, - # source = "NF Data Portal", - # study = "MPNST PDX MT")] - # res<-filtered_df|> ##deep del < 0.5210507 < het loss < 0.7311832 < diploid < 1.214125 < gain < 1.422233 < amp - # dplyr::mutate(copy_call=ifelse(copy_number<0.5210507,'deep del', - # ifelse(copy_number<0.7311832,'het loss', - # ifelse(copy_number<1.214125,'diploid', - # 
ifelse(copy_number<1.422233,'gain','amp')))))|> - # left_join(genes_df)|> - # dplyr::select(entrez_id,improve_sample_id,copy_number,copy_call,study,source)|> - # subset(!is.na(entrez_id))|> - # distinct() - # res|>group_by(copy_call)|>summarize(n_distinct(entrez_id)) - return(res) - # } -})) - -fwrite(cnv,'/tmp/mpnst_copy_number.csv.gz') - -##TODO: get proteomics!!! diff --git a/build/mpnst/02_get_drug_data.R b/build/mpnst/02_get_drug_data.R index e90a31fb..f88f0f99 100644 --- a/build/mpnst/02_get_drug_data.R +++ b/build/mpnst/02_get_drug_data.R @@ -1,172 +1,128 @@ -# Load required libraries +#!/usr/bin/env Rscript + +# Combined Drug List Extraction for MPNST & MPNST‑PDX + library(data.table) -# library(biomaRt)# biomart issues still exist library(dplyr) library(stringr) library(synapser) +library(reticulate) - -# Retrieve command line arguments +# 0) Args & login args <- commandArgs(trailingOnly = TRUE) - - -# Check the number of arguments provided if (length(args) < 1) { - stop("At least one argument is required. Usage: Rscript 02_get_drug_data.R [olddrugfile]", call. 
= FALSE) + stop("Usage: Rscript combined_drug_list.R [old_drugs.tsv,...]", call.=FALSE) } - - -# Assign arguments -newdrugfile <- args[1] # Path to the new drug file -olddrugfiles <- ifelse(length(args) >= 2 && args[2] != "", args[2], NA) - -# Read SYNAPSE_AUTH_TOKEN from the environment -synapse_token <- Sys.getenv("SYNAPSE_AUTH_TOKEN") -if (synapse_token == "") { - stop("Error: SYNAPSE_AUTH_TOKEN environment variable is not set.") +newdrugfile <- args[1] +newdrugfile <- file.path(newdrugfile) +olddrugfiles <- if (length(args)>=2 && nzchar(args[2])) args[2] else NA + +token <- Sys.getenv("SYNAPSE_AUTH_TOKEN") +if (token == "") stop("Please set SYNAPSE_AUTH_TOKEN in your environment", call.=FALSE) +synLogin(authToken = token) + +# 1) Fetch manifest +manifest <- synTableQuery("select * from syn53503360")$asDataFrame() %>% + rename(common_name = Sample) + +# 2) PDX‑sourced drugs via annotations +pdx_df <- manifest %>% + select(common_name, PDX_Drug_Data) %>% + distinct() %>% + filter(!is.na(PDX_Drug_Data)) + +pdx_ids <- unique(unlist(strsplit(pdx_df$PDX_Drug_Data, ","))) +pdx_ids <- pdx_ids[ pdx_ids != "" & !is.na(pdx_ids) & pdx_ids != "NA" ] + +get_pdx_drugs <- function(synid) { + # Query the metadata table for this file's experimentalCondition + q <- sprintf( + "select experimentalCondition from syn21993642 where id='%s'", + synid + ) + df <- synTableQuery(q)$asDataFrame() + if (nrow(df)==0) return(character(0)) + # Split on semicolon, lowercase and drop empties + conds <- unlist(strsplit(df$experimentalCondition, ";")) + tolower(conds[conds!=""]) } -synLogin(authToken = synapse_token) - -##now get the manifest from synapse -manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> - as.data.frame()|> - dplyr::rename(common_name='Sample') +pdx_drugs <- unique(unlist(lapply(pdx_ids, get_pdx_drugs))) +pdx_drugs <- setdiff(pdx_drugs, "control") -##PDX contain list of files -pdx<-manifest|> - dplyr::select(common_name,PDX_Drug_Data)|> - 
distinct()|> - subset(!is.na(PDX_Drug_Data)) +# 3) MicroTissue‑sourced drugs via table "children" +mts_df <- manifest %>% + select(common_name, MicroTissueDrugFolder) %>% + filter(!is.na(MicroTissueDrugFolder)) +mts_ids <- unique(unlist(strsplit(mts_df$MicroTissueDrugFolder, ","))) +mts_ids <- mts_ids[mts_ids != "" & !is.na(mts_ids) & mts_ids != "NA"] -##MTS contain lists of directories -mts<-manifest|> - dplyr::select(common_name,MicroTissueDrugFolder)|> - subset(!is.na(MicroTissueDrugFolder)) - - - -##define functions - -##first function to get children from parentId -getDrugsByParent<-function(parid){ - qtab<-synTableQuery(paste('select id,name,experimentalCondition,parentId from syn21993642 where parentId=\'',parid,'\''))$asDataFrame()|> - subset(!is.na(experimentalCondition))|>dplyr::select(id,name,experimentalCondition) - ##now we need to parse the metadatda table get the info - - return(unique(qtab$experimentalCondition)) - +get_mts_drugs <- function(parentId) { + q <- sprintf("select experimentalCondition from syn21993642 where parentId='%s'", parentId) + synTableQuery(q)$asDataFrame() %>% + pull(experimentalCondition) %>% + unique() %>% + tolower() } -##now loop through manifest to get all the files -mts_fold <- data.table(mts)[,strsplit(as.character(MicroTissueDrugFolder),","), by = .(common_name)] - -alldrugs<-unique(unlist(lapply(mts_fold$V1,function(x){ - samp<-subset(mts_fold,V1==x) - res<-getDrugsByParent(x) - return(res) -}))) - - -alldrugs[which(alldrugs=='PD901')]<-'PD-0325901' - -print(paste(alldrugs,collapse=',')) - +mts_drugs <- unique(unlist(lapply(mts_ids, get_mts_drugs))) +# 4) Combine and fix bad names +all_drugs <- unique(c(pdx_drugs, mts_drugs)) +all_drugs[all_drugs == "pd901"] <- "pd-0325901" +message("Combined drug list: ", paste(all_drugs, collapse=", ")) -## new code: - - -# Handle old drugs +# 5) Read old‑drug files or initialize empty if (!is.na(olddrugfiles)) { - # Read and combine old drug files - olddrug_list <- 
lapply(unique(unlist(strsplit(olddrugfiles, split = ','))), function(x) { - if (file.exists(x)) { - return(fread(x, header = TRUE, sep = '\t', quote = '')) - } else { - warning(paste("Old drug file does not exist:", x)) - return(NULL) + paths <- strsplit(olddrugfiles, ",")[[1]] %>% trimws() + old_list <- lapply(paths, function(f) { + if (file.exists(f)) fread(f, sep="\t", header=TRUE) else { + warning("Missing old‑drug file: ", f) + NULL } }) - - # Remove NULL entries and ensure uniqueness - olddrug_list <- Filter(Negate(is.null), olddrug_list) - - if (length(olddrug_list) > 0) { - olddrugs <- unique(rbindlist(olddrug_list, use.names = TRUE, fill = TRUE)) - print(paste('Read in', nrow(olddrugs), 'old drugs')) + old_list <- Filter(Negate(is.null), old_list) + if (length(old_list) > 0) { + olddrugs <- unique(rbindlist(old_list, use.names=TRUE, fill=TRUE)) + message("Read ", nrow(olddrugs), " old drug records") } else { - olddrugs <- data.frame( - improve_drug_id = integer(), - chem_name = character(), - pubchem_id = character(), - canSMILES = character(), - # isoSMILES = character(), - InChIKey = character(), - formula = character(), - weight = numeric(), - stringsAsFactors = FALSE + olddrugs <- data.table( + improve_drug_id=integer(), chem_name=character(), + pubchem_id=character(), canSMILES=character(), + InChIKey=character(), formula=character(), weight=numeric() ) - print("Old drug files not valid. 
Created empty olddrugs dataframe.") + message("No valid old data; using empty template") } } else { - # Create an empty dataframe with specified columns - olddrugs <- data.frame( - improve_drug_id = integer(), - chem_name = character(), - pubchem_id = character(), - canSMILES = character(), - # isoSMILES = character(), - InChIKey = character(), - formula = character(), - weight = numeric(), - stringsAsFactors = FALSE + olddrugs <- data.table( + improve_drug_id=integer(), chem_name=character(), + pubchem_id=character(), canSMILES=character(), + InChIKey=character(), formula=character(), weight=numeric() ) - print("No old drug file provided. Created empty olddrugs dataframe.") + message("No old‑drug files provided; starting fresh") } -# Write the initial drug file (old drugs) -write.table(olddrugs, file = newdrugfile, sep = '\t', row.names = FALSE, quote = FALSE,col.names=T) - - -# Define the ignore file path -ignore_file_path <- '/tmp/mpnst_ignore_chems.txt' - - -# ##copy old drug to new drug -# olddrugs<-do.call(rbind,lapply(unique(unlist(strsplit(olddrugfiles,split=','))),function(x) read.table(x,header=T,sep='\t',quote='',comment.char=''))) -# olddrugs<-unique(olddrugs) +# 6) Write placeholder +fwrite(olddrugs, newdrugfile, sep="\t", quote=FALSE) +message("Wrote placeholder to ", newdrugfile) -# print(paste('Read in ',nrow(olddrugs),'old drugs')) -# #file.copy(olddrugfile,newdrugfile) -# write.table(olddrugs,file=newdrugfile,sep='\t',row.names=F,quote=FALSE,col.names=T) +# 7) Augment via Python +ignore_file <- "/tmp/combined_drugs_ignore_chems.txt" +use_python("/opt/venv/bin/python3", required=TRUE) +# use_python("/Users/jaco059/miniconda3/bin/python3", required=TRUE) - -##now load reticulate down here - -library(reticulate) - -use_python("/opt/venv/bin/python3", required = TRUE) +# source_python("build/utils/pubchem_retrieval.py") source_python("pubchem_retrieval.py") - 
-update_dataframe_and_write_tsv(unique_names=alldrugs,output_filename=newdrugfile,ignore_chems=ignore_file_path) - - -tab<-read.table(newdrugfile,sep='\t',header=T,quote="",fill=TRUE) - -newdrugs<-tab|> - subset(chem_name%in%tolower(alldrugs)) - -tab<-tab|> - subset(improve_drug_id%in%newdrugs$improve_drug_id) - -write.table(tab,file=newdrugfile,sep='\t',row.names=FALSE,quote=FALSE) - -print(paste("Final drug table written to", newdrugfile)) - - -##now call the python drug script - - +update_dataframe_and_write_tsv( + unique_names = all_drugs, + output_filename = newdrugfile, + ignore_chems = ignore_file +) + +# 8) Final filter & save +tab <- fread(newdrugfile, sep="\t", header=TRUE) +final_tab <- unique(tab) +fwrite(final_tab, newdrugfile, sep="\t", quote=FALSE) +message("Wrote full synonyms list to ", newdrugfile) \ No newline at end of file diff --git a/build/mpnst/03_get_drug_response_data.R b/build/mpnst/03_get_drug_response_data.R deleted file mode 100644 index 9bbb6f00..00000000 --- a/build/mpnst/03_get_drug_response_data.R +++ /dev/null @@ -1,152 +0,0 @@ -# Load required libraries -library(data.table) -# library(biomaRt)# biomart issues still exist -library(synapser) -library(dplyr) -library(stringr) -# Retrieve command line arguments -args <- commandArgs(trailingOnly = TRUE) - -# Check if a token was provided -if (length(args) == 0) { - stop("No token or sample file provided. Usage: Rscript my_script.R [samples] [drugs]", call. 
= FALSE) -} - -# Set your personal access token -PAT <- args[1] -patients <- args[2] -drugfile <- args[3] - -# Log in to Synapse -synLogin(authToken = PAT) - - -# Read the sample mapping CSV and genes.csv -samples_df <- fread(patients)|> - dplyr::select(improve_sample_id,common_name,model_type)|> - distinct()#"mpnst/synapse_NF-MPNST_samples.csv") -print(head(samples_df)) - -pdx_samps<-subset(samples_df,model_type=='patient derived xenograft') -org_samps<-subset(samples_df,model_type=='organoid') - -##now get the manifest from synapse -manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> - as.data.table()|> - dplyr::rename(common_name='Sample') - - -##PDX contain list of files -pdx<-manifest|> - dplyr::select(common_name,PDX_Drug_Data)|> - left_join(pdx_samps)|> - distinct()|> - subset(!is.na(PDX_Drug_Data)) - - -##MTS contain lists of directories -mts<-manifest|> - dplyr::select(common_name,MicroTissueDrugFolder)|> - left_join(org_samps)|> - distinct()|> - subset(!is.na(MicroTissueDrugFolder)) - - -# Modify the extract_date_hour function to return a named vector -extract_date_hour <- function(experiment_id) { - pattern <- "(\\d{6})_?(\\d{2,3})?" 
- matches <- str_match(experiment_id, pattern) - date <- matches[, 2] - hour <- matches[, 3] - date[is.na(date)] <- NA # Replace with NA instead of blank - hour[is.na(hour)] <- 48 # Replace with 48 instead of blank (default) - return(list(date = date, hour = hour)) -} - - - -##define functions - -##first function to get children from parentId -getDrugDataByParent<-function(parid,sampleId){ - qtab<-synTableQuery(paste('select id,name,experimentalCondition,parentId from syn21993642 where parentId=\'',parid,'\''))$asDataFrame()|> - as.data.frame()|> - subset(!is.na(experimentalCondition))|> - dplyr::select(id,name,experimentalCondition)|> - subset(name!='synapse_storage_manifest.csv') - ##now we need to parse the metadatda table get the info - - res<-do.call(rbind,lapply(qtab$id,function(x){ - sname <- subset(qtab,id==x) - #print(sname) - sname <-extract_date_hour(sname$name) - #print(x) - #print(sname) - data <- fread(synGet(x)$path)|> - filter(response_type=='percent viability')|> - mutate(improve_sample_id=sampleId, - DOSE=(10^dosage)*1000000, ##dosage is log(M), need to move to micromolar - GROWTH=response, #/100, - source = "NF Data Portal", - #CELL = improve_sample_id, - chem_name = compound_name, - study = paste0('MT ',sname$date,' exp'), - time = sname$hour) %>% - select(improve_sample_id,DOSE,GROWTH,source,chem_name,study,time) - - return(data) - })) - return(res) -} - -##now loop through manifest to get all the files -mts_fold <- data.table(mts)[,strsplit(as.character(MicroTissueDrugFolder),","), by = .(improve_sample_id,common_name)] - -mts_fold <- mts_fold[which(!mts_fold$V1%in%c("NA",NA)),] - -print(mts_fold) - -alldrugs<-do.call(rbind,lapply(mts_fold$V1,function(x){ - samp<-subset(mts_fold,V1==x) - print(samp$common_name) - res<-getDrugDataByParent(x,samp$improve_sample_id) - return(res) -})) - -##do the drug matching -drug_df<-fread(drugfile) - -##update drug name PD901 since it's mussing - 
-alldrugs$chem_name[which(alldrugs$chem_name=='PD901')]<-'PD-0325901' - - - #drug_df$chem_name=tolower(drug_df$chem_name) -alldrugs$chem_name<-tolower(alldrugs$chem_name) - -#print(drug_df) -drug_map<-subset(drug_df,chem_name%in%alldrugs$chem_name) - -findrugs<-alldrugs|> - left_join(drug_map)|> - mutate(time_unit='hours')|> - dplyr::select(DOSE,GROWTH,source,study,Drug=improve_drug_id,time,time_unit,improve_sample_id)|> - distinct()|> - subset(!is.na(Drug)) - -missing<-setdiff(alldrugs$chem_name,drug_map$chem_name) -print(paste('missing',length(missing),'drugs:')) -print(paste(missing,collapse=',')) - -#TODO: add in new drug lookup -print(head(findrugs)) -fwrite(findrugs,'/tmp/curve_data.tsv',sep='\t') - -pycmd = '/opt/venv/bin/python fit_curve.py --input /tmp/curve_data.tsv --output /tmp/experiments' -print('running curve fitting') -system(pycmd) - -##mmve file name -file.rename('/tmp/experiments.0','/tmp/mpnst_experiments.tsv') - - diff --git a/build/mpnst/03_get_experiments.R b/build/mpnst/03_get_experiments.R new file mode 100644 index 00000000..db7beb25 --- /dev/null +++ b/build/mpnst/03_get_experiments.R @@ -0,0 +1,280 @@ +library(data.table) +library(synapser) +library(dplyr) +library(stringr) +library(readr) +library(readxl) +library(tidyr) + +# Check that correct number of arguments are present +args <- commandArgs(trailingOnly = TRUE) +if (length(args) != 4) { + stop("Usage: Rscript 03_get_experiments.R ", call. 
= FALSE) +} +PAT <- args[1] +samples <- args[2] +drugfile <- args[3] +out_prefix <- args[4] + +synLogin(authToken = PAT) + +# Read in sampes file +samples_df <- fread(samples) %>% + select(improve_sample_id, common_name, model_type) %>% + distinct() + +pdx_samps <- filter(samples_df, model_type == "patient derived xenograft") +mt_samps <- filter(samples_df, model_type == "xenograft derived organoid") + +# Get manifest table from Synapse +manifest <- synTableQuery("select * from syn53503360")$asDataFrame() %>% + rename(common_name = Sample) %>% + as.data.table() + +# Helper Function to extract date and hour from experiment ID +extract_date_hour <- function(experiment_id) { + pattern <- "(\\d{6})_?(\\d{2,3})?" + m <- str_match(experiment_id, pattern) + date <- m[,2]; hour <- m[,3] + date[is.na(date)] <- NA + hour[is.na(hour)] <- 48 + list(date = date, hour = hour) +} + +# ──────────────────────────────────────────────── +# MicroTissue Experiments +# ──────────────────────────────────────────────── + +getDrugDataByParent <- function(parid, sampleId) { + q <- sprintf( + "select id,name,experimentalCondition,parentId from syn21993642 where parentId='%s'", + parid + ) + qtab <- synTableQuery(q)$asDataFrame() %>% + filter(!is.na(experimentalCondition), name != "synapse_storage_manifest.csv") %>% + select(id, name, experimentalCondition) + do.call(rbind, lapply(qtab$id, function(x) { + info <- filter(qtab, id == x) + d <- extract_date_hour(info$name) + fread(synGet(x)$path) %>% + filter(response_type == "percent viability") %>% + transmute( + improve_sample_id = sampleId, + DOSE = (10^dosage) * 1e6, + GROWTH = response, + source = "NF Data Portal", + chem_name = compound_name, + study = paste0("MT ", d$date, " exp"), + time = d$hour + ) + })) +} + +# Create map of MicroTissue Drug Folders +mts_map <- manifest %>% + select(common_name, MicroTissueDrugFolder) %>% + inner_join(mt_samps, by = "common_name") %>% + separate_rows(MicroTissueDrugFolder, sep = ",") %>% + # keep 
exactly what old script did: drop only "NA" and actual NA + filter( + !is.na(MicroTissueDrugFolder), + MicroTissueDrugFolder != "NA" + ) %>% + select( + improve_sample_id, + folder = MicroTissueDrugFolder + ) + +# Fetch all MicroTissue drug response data +mt_data <- do.call(rbind, lapply(seq_len(nrow(mts_map)), function(i) { + sample_id <- mts_map$improve_sample_id[i] + folder <- mts_map$folder[i] + getDrugDataByParent(folder, sample_id) +})) + +drug_map <- fread(drugfile) %>% + select(improve_drug_id, chem_name) %>% + distinct() + +# Clean up drug names and join with drug_map +mt_curve <- mt_data %>% + mutate( + chem_name = tolower(chem_name), + chem_name = ifelse(chem_name == "pd901", "pd-0325901", chem_name) + ) %>% + left_join(drug_map, by = "chem_name") %>% + filter(!is.na(improve_drug_id)) %>% + transmute( + source = source, + improve_sample_id = improve_sample_id, + Drug = improve_drug_id, + study = study, + time = time, + time_unit = "hours", + DOSE = DOSE, + GROWTH = GROWTH + ) + +# Run curve fitting, Write MicroTissue curve data +fwrite(mt_curve, file.path("/tmp", paste0(out_prefix, "_mt_curve_data.tsv")), sep = "\t") + +message("Wrote MT curve data") + +# Write MT experiments file +system(sprintf( + "/opt/venv/bin/python fit_curve.py --input %s --output %s", + paste0("/tmp/", out_prefix, "_mt_curve_data.tsv"), + paste0("/tmp/", out_prefix, "_mt_experiments") +)) +file.rename( + paste0("/tmp/", out_prefix, "_mt_experiments.0"), + paste0("/tmp/", out_prefix, "_mt_experiments.tsv") +) +message("Wrote MT experiments") + +# ──────────────────────────────────────────────── +# PDX Experiments +# ──────────────────────────────────────────────── + +# Create a map of PDX Drug Data +# This will be used to fetch the drug data for each PDX sample +pdx_map <- do.call(rbind, lapply(seq_len(nrow(manifest)), function(i) { + row <- manifest[i, ] + samp <- pdx_samps[pdx_samps$common_name == row$common_name, ] + if (nrow(samp)==0 || is.na(row$PDX_Drug_Data) || 
row$PDX_Drug_Data %in% c("", "NA")) + return(NULL) + ids <- strsplit(row$PDX_Drug_Data, ",")[[1]] + ids <- trimws(ids[ids!=""]) + data.frame( + improve_sample_id = samp$improve_sample_id, + child_id = ids, + stringsAsFactors = FALSE + ) +})) + +# Create a dataframe of PDX metadata +pdx_meta <- do.call(rbind, lapply(seq_len(nrow(pdx_map)), function(i) { + sid <- pdx_map$improve_sample_id[i] + cid <- pdx_map$child_id[i] + pid <- synGet(cid)$parentId + if (is.null(pid) || pid=="") stop("no parentId for ", cid) + data.frame( + improve_sample_id = sid, + child_id = cid, + parentId = pid, + stringsAsFactors = FALSE + ) +})) + +all_pdx <- do.call(rbind, lapply(seq_len(nrow(pdx_meta)), function(i) { + m <- pdx_meta[i, ] + pth <- synGet(m$child_id)$path + raw <- if (grepl("\\.xlsx?$", pth)) read_xlsx(pth) else read_csv(pth) + + # detect second‐drug column + sec_opts <- c("compound 2_name", "compound_2_name") + drug2_col <- intersect(sec_opts, names(raw))[1] + compound2 <- if (!is.na(drug2_col)) raw[[drug2_col]] else NA_character_ + + df <- data.frame( + child_id = m$child_id, + specimen_id = raw$specimen_id, + compound_name = raw$compound_name, + compound_2_name = compound2, + experimental_time_point = raw$experimental_time_point, + experimental_time_point_unit = raw$experimental_time_point_unit, + assay_value = raw$assay_value, + stringsAsFactors = FALSE + ) + + df <- within(df, { + drug1 <- tolower(trimws(compound_name)) + drug2 <- tolower(trimws(compound_2_name)) + treatment <- ifelse( + is.na(drug1) | drug1 %in% c("", "na", "n/a", "nan"), + "control", + ifelse(!is.na(drug2) & drug2 != "", + paste(drug1, drug2, sep = "+"), + drug1 + ) + ) + time <- experimental_time_point + time_unit <- experimental_time_point_unit + volume <- assay_value + }) + + df[ , c("child_id", "specimen_id", "treatment", "time", "time_unit", "volume")] +})) + +# join on parentId and sample +pdx_data <- merge(all_pdx, pdx_meta, by="child_id") + +pdx_data <- subset(pdx_data, duplicated(child_id) | 
TRUE) +pdx_data <- within(pdx_data, { + experiment <- parentId + model_id <- improve_sample_id +}) + +# Filter out experiments missing a control +has_ctl <- tapply(pdx_data$treatment == "control", pdx_data$experiment, any) +no_ctl_exps <- names(has_ctl)[!has_ctl] +pdx_data <- pdx_data[pdx_data$experiment %in% names(has_ctl)[has_ctl], ] + +# Reorder final columns +pdx_data <- pdx_data[ , c("experiment","specimen_id","treatment", + "time","time_unit","volume","model_id")] + +# Correct doxorubinsin typo across all data +pdx_data$treatment <- gsub("doxorubinsin", + "doxorubicin", + pdx_data$treatment, + ignore.case = TRUE) + +# Drop any remaining NA rows +pdx_data <- na.omit(pdx_data) + +# write & fit +fwrite(pdx_data, file.path("/tmp", paste0(out_prefix, "_pdx_curve_data.tsv")), sep = "\t") + + +message("Wrote PDX curve data") + +system(sprintf( + "/opt/venv/bin/python calc_pdx_metrics.py %s --drugfile %s --outprefix %s", + paste0("/tmp/", out_prefix, "_pdx_curve_data.tsv"), + drugfile, + paste0("/tmp/", out_prefix, "_pdx") +)) + +message("Wrote PDX experiments to ", "/tmp/", out_prefix, "_pdx_experiments.tsv and combinations") + + +# ──────────────────────────────────────────────── +# Combine all Experiments +# ──────────────────────────────────────────────── + +# Read MicroTissue experiments +mt_exp <- fread(paste0("/tmp/", out_prefix, "_mt_experiments.tsv")) %>% + mutate( + dose_response_value = as.character(dose_response_value) + ) + +# Read PDX experiments +pdx_exp <- fread(paste0("/tmp/", out_prefix, "_pdx_experiments.tsv")) %>% + mutate( + dose_response_value = as.character(dose_response_value) + ) + +# Join experiments into one. 
+all_exp <- bind_rows(mt_exp, pdx_exp) + +# Write out Experiments +fwrite(all_exp, paste0("/tmp/", out_prefix, "_experiments.tsv"), sep = "\t") +message("Wrote combined experiments: /tmp/", out_prefix, "_experiments.tsv") + + +# Rename the Drug Combination data file to fit schema naming +file.rename( + paste0("/tmp/", out_prefix, "_pdx_combinations.tsv"), + paste0("/tmp/", out_prefix, "_combinations.tsv") +) \ No newline at end of file diff --git a/build/mpnst/build_exp.sh b/build/mpnst/build_exp.sh index a9a2b763..f83096d4 100644 --- a/build/mpnst/build_exp.sh +++ b/build/mpnst/build_exp.sh @@ -3,5 +3,6 @@ set -euo pipefail trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR -echo "Running 03_get_drug_response_data.R with $SYNAPSE_AUTH_TOKEN, $1, and $2." -Rscript 03_get_drug_response_data.R $SYNAPSE_AUTH_TOKEN $1 $2 +echo "Running 03_get_experiments.R with $SYNAPSE_AUTH_TOKEN, $1, and $2." +Rscript 03_get_experiments.R $SYNAPSE_AUTH_TOKEN $1 $2 mpnst + diff --git a/build/mpnst/build_omics.sh b/build/mpnst/build_omics.sh index b08ac63d..d6d2cec7 100644 --- a/build/mpnst/build_omics.sh +++ b/build/mpnst/build_omics.sh @@ -3,5 +3,5 @@ set -euo pipefail trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR -echo "Running 01_mpnst_get_omics.R with $SYNAPSE_AUTH_TOKEN, $2, and $1." -Rscript 01_mpnst_get_omics.R $SYNAPSE_AUTH_TOKEN $2 $1 +echo "Running 01_combined_omics.R with $SYNAPSE_AUTH_TOKEN, $2, and $1." 
+Rscript 01_combined_omics.R $SYNAPSE_AUTH_TOKEN $2 $1 diff --git a/build/mpnst/requirements.r b/build/mpnst/requirements.r index 7796236d..e8bfac35 100755 --- a/build/mpnst/requirements.r +++ b/build/mpnst/requirements.r @@ -9,3 +9,5 @@ install.packages("data.table") install.packages("R.utils") install.packages("stringr") install.packages("tidyr") +install.packages("readr") +install.packages("readxl") diff --git a/build/mpnst/requirements.txt b/build/mpnst/requirements.txt index 27c4dc2a..8f07cbd2 100755 --- a/build/mpnst/requirements.txt +++ b/build/mpnst/requirements.txt @@ -8,4 +8,5 @@ scikit-learn scipy requests mordredcommunity -rdkit \ No newline at end of file +rdkit +statsmodels \ No newline at end of file diff --git a/build/mpnst/sample_gen.R b/build/mpnst/sample_gen.R deleted file mode 100644 index 3d19fa85..00000000 --- a/build/mpnst/sample_gen.R +++ /dev/null @@ -1,25 +0,0 @@ -# This script generate a new sample table based on pervious beatAML improved sample ID -# It will take the maximum value of beatAML improved sample ID and continue from ID count from there -# Load required libraries -library(data.table) - -main <- fread("mpnst/NF_MPNST_samples.csv") -previous_aml <- fread("beatAML/beataml_samples.csv") -max_id <- max(previous_aml$improve_sample_id) -main$improve_sample_id <- seq(from = max_id + 1, length.out = nrow(main)) - -synapse_main <- fread("mpnst/synapse_NF-MPNST_samples.csv") -# Step 1: Create a dictionary from 'main' -id_dict <- setNames(main$improve_sample_id, main$other_id) - -# Step 2: Update 'ID' in 'synapse_main' -synapse_main$ID <- id_dict[synapse_main$Sample] - -# Handling NA values if any mismatch occurs (Optional based on your data integrity) -# If there are NAs generated, you might need to check for unmatched keys -# synapse_main$ID[is.na(synapse_main$ID)] <- -1 # Assign a placeholder like -1 for unmatched rows - -# Step 3: Save the updated 'synapse_main' -fwrite(synapse_main, "mpnst/synapse_NF-MPNST_samples.csv") -fwrite(main, 
"mpnst/NF_MPNST_samples.csv") # updated sample file - From b3824fb1bf6d9a0a3304c4de0e82da27b54c3fba Mon Sep 17 00:00:00 2001 From: Jeremy Date: Tue, 22 Jul 2025 08:56:10 -0700 Subject: [PATCH 11/20] Drop JH-2-009 Sample due to contamination --- build/mpnst/00_sample_gen.R | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/build/mpnst/00_sample_gen.R b/build/mpnst/00_sample_gen.R index 0cae6d45..db1f238e 100644 --- a/build/mpnst/00_sample_gen.R +++ b/build/mpnst/00_sample_gen.R @@ -10,14 +10,12 @@ if(length(args) > 1 ){ stop("Up to one argument is allowed. This is the filepath to the previously run samples file.") } - if (length(args) == 0 || is.na(args[1]) || args[1] == "" || !file.exists(args[1])) { orig_samples <- "" } else { orig_samples <- fread(args[1]) } - # Check if Synapse token is available from the environment synapse_token <- Sys.getenv("SYNAPSE_AUTH_TOKEN") if (synapse_token == "") { @@ -28,6 +26,10 @@ synapser::synLogin(authToken=synapse_token) manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> as.data.frame() +#Drop contaminated sample JH-2-009 +manifest <- manifest %>% + filter(Sample != "JH-2-009") + ###sample file has a strict schema ## - improve_sample_id @@ -61,9 +63,6 @@ main<-rbind(sampTable,pdxmt)|> dplyr::select(-MicroTissueDrugFolder)|> rbind(tumorTable) -#main <- fread("mpnst/NF_MPNST_samples.csv") -#previous_aml <- fread(args[1])#"beatAML/beataml_samples.csv") - # If there is no previous samples file - start at 1, else, continue where the previous one left off. 
if (identical(orig_samples, "")) { max_id <- 1 @@ -71,21 +70,6 @@ if (identical(orig_samples, "")) { max_id <- max(orig_samples$improve_sample_id, na.rm = TRUE) } - main$improve_sample_id <- seq(from = max_id + 1, length.out = nrow(main)) -#synapse_main <- fread("mpnst/synapse_NF-MPNST_samples.csv") -# Step 1: Create a dictionary from 'main' -#id_dict <- setNames(main$improve_sample_id, main$other_id) - -# Step 2: Update 'ID' in 'synapse_main' -#synapse_main$ID <- id_dict[synapse_main$Sample] - -# Handling NA values if any mismatch occurs (Optional based on your data integrity) -# If there are NAs generated, you might need to check for unmatched keys -# synapse_main$ID[is.na(synapse_main$ID)] <- -1 # Assign a placeholder like -1 for unmatched rows - -# Step 3: Save the updated 'synapse_main' -#fwrite(synapse_main, "mpnst/synapse_NF-MPNST_samples.csv") -#fwrite(main, "mpnst/NF_MPNST_samples.csv") # updated sample file fwrite(main,'/tmp/mpnst_samples.csv') From 65e2f01ee853669c5df3fab33663a273c165dbc1 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Tue, 22 Jul 2025 09:13:22 -0700 Subject: [PATCH 12/20] Should be ready --- build/mpnst/01_combined_omics.R | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/build/mpnst/01_combined_omics.R b/build/mpnst/01_combined_omics.R index a51bafbb..dcbdfbae 100644 --- a/build/mpnst/01_combined_omics.R +++ b/build/mpnst/01_combined_omics.R @@ -37,9 +37,6 @@ mt_samps <- filter(samples_df, model_type == "xenograft derived organoid") # manifest <- synTableQuery("select * from syn53503360")$asDataFrame() %>% rename(common_name = Sample) -print("manifest") -print(manifest) - # Build sample tables pdx_data <- manifest %>% select(common_name, starts_with("PDX")) %>% @@ -51,7 +48,6 @@ pdx_data <- manifest %>% Proteomics = PDX_Proteomics) %>% filter(!is.na(improve_sample_id)) - tumor_data <- manifest %>% select(common_name, starts_with("Tumor")) %>% left_join(tumor_samps, by = "common_name") %>% @@ -62,7 +58,7 @@ 
tumor_data <- manifest %>% mutate(Proteomics = "") %>% filter(!is.na(improve_sample_id)) -mt_data <- manifest %>% #Note, this is the same as pdx_data but I think we default to "xenograft derived organoid" if present. +mt_data <- manifest %>% #Note, this is the same as pdx_data but I think we default to "xenograft derived organoid" if present (based on original files) select(common_name, starts_with("PDX")) %>% left_join(mt_samps, by = "common_name") %>% select(improve_sample_id, common_name, model_type, @@ -89,7 +85,7 @@ study_label <- function(type) { # Helper to pick metadata based on sample ID and column pick_meta <- function(id, column) { - # column {"Proteomics","RNASeq","Mutations","CopyNumber"} + # columns are {"Proteomics","RNASeq","Mutations","CopyNumber"} if (any(tumor_data[[column]] == id, na.rm = TRUE)) { sdf <- tumor_data %>% filter(.data[[column]] == id) %>% slice(1) } else if (any(mt_data[[column]] == id, na.rm = TRUE)) { From 19adfc5c45af5c247d3bd741c67eba11f170ea02 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Tue, 22 Jul 2025 09:19:15 -0700 Subject: [PATCH 13/20] Made final changes so this is ready for build process --- build/build_all.py | 5 +- build/build_dataset.py | 2 - build/mpnstpdx/01_mpnstpdx_get_omics.R | 195 --------------------- build/mpnstpdx/02_get_drug_data.R | 120 ------------- build/mpnstpdx/03_get_drug_response_data.R | 174 ------------------ build/mpnstpdx/README.md | 47 ----- build/mpnstpdx/build_drugs.sh | 4 - build/mpnstpdx/build_exp.sh | 2 - build/mpnstpdx/build_omics.sh | 7 - build/mpnstpdx/build_samples.sh | 1 - build/mpnstpdx/requirements.r | 13 -- build/mpnstpdx/requirements.txt | 12 -- schema/expected_files.yaml | 18 -- 13 files changed, 2 insertions(+), 598 deletions(-) delete mode 100755 build/mpnstpdx/01_mpnstpdx_get_omics.R delete mode 100644 build/mpnstpdx/02_get_drug_data.R delete mode 100644 build/mpnstpdx/03_get_drug_response_data.R delete mode 100755 build/mpnstpdx/README.md delete mode 100644 
build/mpnstpdx/build_drugs.sh delete mode 100644 build/mpnstpdx/build_exp.sh delete mode 100644 build/mpnstpdx/build_omics.sh delete mode 100644 build/mpnstpdx/build_samples.sh delete mode 100755 build/mpnstpdx/requirements.r delete mode 100755 build/mpnstpdx/requirements.txt diff --git a/build/build_all.py b/build/build_all.py index 10b7bc55..0bea6583 100644 --- a/build/build_all.py +++ b/build/build_all.py @@ -40,7 +40,7 @@ def main(): parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.") parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands") parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.") - parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo,bladderpdo,sarcpdo,liverpdo',help='Datasets to process. Defaults to all available.') + parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,pancpdo,bladderpdo,sarcpdo,liverpdo,mpnst',help='Datasets to process. Defaults to all available.') parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. 
This must be a higher version than previously published versions.') parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.') parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.') @@ -119,7 +119,6 @@ def process_docker(datasets): 'hcmi': ['hcmi'], 'beataml': ['beataml'], 'mpnst': ['mpnst'], - 'mpnstpdx': ['mpnstpdx'], 'pancpdo': ['pancpdo'], 'bladderpdo': ['bladderpdo'], 'sarcpdo': ['sarcpdo'], @@ -410,7 +409,7 @@ def get_latest_commit_hash(owner, repo, branch='main'): # if args.figshare or args.validate: # FigShare File Prefixes: - prefixes = ['beataml', 'hcmi', 'cptac', 'mpnst', 'mpnstpdx', 'pancpdo', 'bladderpdo','sarcpdo', 'genes', 'drugs', 'liverpdo'] + prefixes = ['beataml', 'hcmi', 'cptac', 'pancpdo', 'bladderpdo','sarcpdo', 'genes', 'drugs', 'liverpdo','mpnst'] broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"] if "broad_sanger" in datasets: prefixes.extend(broad_sanger_datasets) diff --git a/build/build_dataset.py b/build/build_dataset.py index 7904a43e..780b583b 100644 --- a/build/build_dataset.py +++ b/build/build_dataset.py @@ -41,7 +41,6 @@ def process_docker(dataset,validate): 'hcmi': ['hcmi'], 'beataml': ['beataml'], 'mpnst': ['mpnst'], - 'mpnstpdx': ['mpnstpdx'], 'pancpdo': ['pancpdo'], 'cptac': ['cptac'], 'sarcpdo': ['sarcpdo'], @@ -128,7 +127,6 @@ def process_omics(executor, dataset, should_continue): 'broad_sanger': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'], 'cptac': ['copy_number', 'mutations', 'proteomics', 'transcriptomics'], 'hcmi': ['mutations', 'transcriptomics'], - 'mpnstpdx':['copy_number', 'mutations', 'proteomics', 'transcriptomics'], 'sarcpdo': ['mutations', 'transcriptomics'], 'pancpdo': ['transcriptomics'], 'bladderpdo': ['copy_number', 'mutations', 'transcriptomics'], diff --git a/build/mpnstpdx/01_mpnstpdx_get_omics.R b/build/mpnstpdx/01_mpnstpdx_get_omics.R deleted file 
mode 100755 index 86e3cbb8..00000000 --- a/build/mpnstpdx/01_mpnstpdx_get_omics.R +++ /dev/null @@ -1,195 +0,0 @@ -# Load required libraries -library(data.table) -# library(biomaRt)# biomart issues still exist -library(synapser) -library(dplyr) - -# Retrieve command line arguments -args <- commandArgs(trailingOnly = TRUE) - -# Check if a token was provided -if (length(args) == 0) { - stop("No token or sample file provided. Usage: Rscript my_script.R [samples] [genes]", call. = FALSE) -} - -# Set your personal access token -PAT <- args[1] -patients <- args[2] -genefile <- args[3] - -# Log in to Synapse -synLogin(authToken = PAT) - -# Define the Ensembl mart # biomart issues still exist -# ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl") # biomart issues still exist; fix later... - -# Path to the directory to save .sf files -#path <- "./tmp" -#dir.create(path, showWarnings = FALSE) - -# Read the sample mapping CSV and genes.csv -samples_df <- fread(patients)|> - dplyr::select(improve_sample_id,common_name,model_type)|> - distinct()#"mpnst/synapse_NF-MPNSTpdx_samples.csv") - -pdx_samps<-subset(samples_df,model_type=='patient derived xenograft') -tumor_samps<-subset(samples_df,model_type=='tumor') - -##now get the manifest from synapse -manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> - as.data.frame()|> - dplyr::rename(common_name='Sample') - - -##for now we only have tumor and pdx data -##they each get their own sample identifier -pdx_data<-manifest|>dplyr::select(common_name,starts_with("PDX"))|> - left_join(pdx_samps)|> - dplyr::select(improve_sample_id,RNASeq='PDX_RNASeq',Mutations='PDX_Somatic_Mutations',CopyNumber='PDX_CNV',Proteomics='PDX_Proteomics') - -tumor_data<- manifest|>dplyr::select(common_name,starts_with("Tumor"))|> - left_join(tumor_samps)|> - dplyr::select(improve_sample_id,RNASeq='Tumor_RNASeq',Mutations='Tumor_Somatic_Mutations',CopyNumber='Tumor_CNV')|> - mutate(Proteomics='') ##we dont have tumor 
proteomics from these samples -#print(tumor_data) - - -pdx_data<-rbind(pdx_data,tumor_data)|>distinct() - -# gene mapping table -genes_df <- fread(genefile) - - -##added proteomics first -proteomics<-do.call('rbind',lapply(setdiff(pdx_data$Proteomics,c('',NA,"NA")),function(x){ - # if(x!=""){ - #print(x) - sample<-subset(pdx_data,Proteomics==x) - #print(sample) - res<-fread(synGet(x)$path)|> - #tidyr::separate(Name,into=c('other_id','vers'),sep='\\.')|> - #dplyr::select(-vers)|> - dplyr::rename(gene_symbol='Gene')|> - left_join(genes_df)|> - dplyr::select(entrez_id,proteomics='logRatio')|> - distinct()|> - subset(!is.na(entrez_id))|> - subset(proteomics!=0) - - res$improve_sample_id=rep(sample$improve_sample_id[1],nrow(res)) - res$source=rep('NF Data Portal',nrow(res)) - res$study=rep('MPNST PDX',nrow(res)) - return(distinct(res)) - # } -})) - -fwrite(proteomics,'/tmp/mpnstpdx_proteomics.csv.gz') - - -#### FIRST WE GET RNASeq Data - -rnaseq<-do.call('rbind',lapply(setdiff(pdx_data$RNASeq,c(NA,"NA")),function(x){ - # if(x!=""){ - #print(x) - sample<-subset(pdx_data,RNASeq==x) - #print(sample) - res<-fread(synGet(x)$path)|> - tidyr::separate(Name,into=c('other_id','vers'),sep='\\.')|> - dplyr::select(-vers)|> - left_join(genes_df)|> - dplyr::select(entrez_id,transcriptomics='TPM')|> - subset(!is.na(entrez_id))|> - subset(transcriptomics!=0) - - res$improve_sample_id=rep(sample$improve_sample_id[1],nrow(res)) - res$source=rep('NF Data Portal',nrow(res)) - res$study=rep('MPNST PDX',nrow(res)) - return(distinct(res)) - # } -})) - -fwrite(rnaseq,'/tmp/mpnstpdx_transcriptomics.csv.gz') - - - -#####NEXT WE DO WES DATA -print("Getting WES") -wes<-do.call(rbind,lapply(setdiff(pdx_data$`Mutations`,c(NA,"NA")),function(x){ - - x2=x#gsub('"','',gsub("[",'',gsub("]",'',x,fixed=T),fixed=T),fixed=T) - print(x) - sample<-subset(pdx_data,Mutations==x) - print(sample$improve_sample_id) - res<-NULL - try(res<-fread(synGet(x2)$path)|> - 
dplyr::select(entrez_id='Entrez_Gene_Id',mutation='HGVSc',variant_classification='Variant_Classification')|> - subset(entrez_id%in%genes_df$entrez_id)|> - distinct()) - if(is.null(res)) - return(NULL) - - res$improve_sample_id=rep(sample$improve_sample_id[1],nrow(res)) - res$source=rep('NF Data Portal',nrow(res)) - res$study=rep('MPNST PDX',nrow(res)) - - return(distinct(res)) - # } -})) - -fwrite(wes,'/tmp/mpnstpdx_mutations.csv.gz') - - -print(paste("getting CNV")) -##next let's do CNVs! -cnv<-do.call(rbind,lapply(setdiff(pdx_data$CopyNumber,c(NA,"NA")),function(x){ - - x2=x#gsub('"','',gsub("[",'',gsub("]",'',x,fixed=T),fixed=T),fixed=T) - print(x) - sample<-subset(pdx_data,CopyNumber==x) - print(sample$improve_sample_id) - res<-fread(synGet(x2)$path) - - long_df<- res|> - tidyr::separate_rows(gene,sep=',')|> - dplyr::rename(gene_symbol='gene')|> - dplyr::left_join(genes_df)|> - subset(!is.na(entrez_id))|> - dplyr::select(entrez_id,log2)|> - dplyr::distinct()|> - dplyr::mutate(copy_number=2^log2)|> - dplyr::select(-log2) - - res<-long_df|> ##deep del < 0.5210507 < het loss < 0.7311832 < diploid < 1.214125 < gain < 1.422233 < amp - dplyr::mutate(copy_call=ifelse(copy_number<0.5210507,'deep del', - ifelse(copy_number<0.7311832,'het loss', - ifelse(copy_number<1.214125,'diploid', - ifelse(copy_number<1.422233,'gain','amp')))))|> - mutate(study='MPNST PDX',source='NF Data Portal',improve_sample_id=sample$improve_sample_id[1])|> - dplyr::distinct() - - # long_df <- res[, strsplit(as.character(gene), ","), by = .(chromosome, start, end, depth, log2)] - # filtered_df <- long_df |> - # subset(is.finite(log2))|> - # filter(V1 %in% genes_df$gene) # get only protein coding genes and remove empty gene symbols - # filtered_df <- filtered_df[, .(gene_symbol = V1, - # improve_sample_id = sample$improve_sample_id[1], - # copy_number = 2^log2, - # source = "NF Data Portal", - # study = "MPNST PDX")] - # res<-filtered_df|> ##deep del < 0.5210507 < het loss < 0.7311832 < diploid < 
1.214125 < gain < 1.422233 < amp - # dplyr::mutate(copy_call=ifelse(copy_number<0.5210507,'deep del', - # ifelse(copy_number<0.7311832,'het loss', - # ifelse(copy_number<1.214125,'diploid', - # ifelse(copy_number<1.422233,'gain','amp')))))|> - # left_join(genes_df)|> - # dplyr::select(entrez_id,improve_sample_id,copy_number,copy_call,study,source)|> - # subset(!is.na(entrez_id))|> - # distinct() - # res|>group_by(copy_call)|>summarize(n_distinct(entrez_id)) - return(res) - # } -})) - -fwrite(cnv,'/tmp/mpnstpdx_copy_number.csv.gz') - -##TODO: get proteomics!!! diff --git a/build/mpnstpdx/02_get_drug_data.R b/build/mpnstpdx/02_get_drug_data.R deleted file mode 100644 index 1f6ad47e..00000000 --- a/build/mpnstpdx/02_get_drug_data.R +++ /dev/null @@ -1,120 +0,0 @@ -# Load required libraries -library(data.table) -# library(biomaRt)# biomart issues still exist -library(dplyr) -library(stringr) -library(reticulate) -library(synapser) -library(tidyr) - - -# Retrieve command line arguments -args <- commandArgs(trailingOnly = TRUE) - -# Check if a token was provided -if (length(args) == 0) { - stop("No token or sample file provided. Usage: Rscript my_script.R [olddrugfile] [newdrugfile]", call. 
= FALSE) -} - -# Set your personal access token -PAT <- args[1] -olddrugfiles <- args[2] -newdrugfile <- args[3] -# Log in to Synapse -synLogin(authToken = PAT) - - -##now get the manifest from synapse -manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> - as.data.frame()|> - dplyr::rename(common_name='Sample') - - -##PDX contain list of files -pdx<-manifest|> - dplyr::select(common_name,PDX_Drug_Data)|> - distinct()|> - subset(!is.na(PDX_Drug_Data)) - - - - - -##define functions - -#print(pdx) -##now loop through manifest to get all the files -pdx_fold <- data.table(pdx)[,strsplit(as.character(PDX_Drug_Data),","), by = .(common_name)]|> - subset(!is.na(V1))|> - subset(V1!='NA')|> - dplyr::rename(id='V1') - -#print(pdx_fold) -###this is not all of themju -pdx_meta<-do.call(rbind,lapply(pdx_fold$id, function(x) synapser::synGetAnnotations(x)|> - as.data.frame()|> - dplyr::select('experimentalCondition')|> - dplyr::mutate(id=x)))|> - left_join(pdx_fold)|> - tidyr::separate_rows(experimentalCondition,sep=';')|> - mutate(chem_name=tolower(experimentalCondition)) - -#pdx_drug <- data.table(pdx_meta)[,strsplit(as.character(experimentalCondition),';'),by= .(common_name,id)]|> -# mutate(drug=tolower(experimentalCondition)) -#drugs<-sapply(pdx_meta$experimentalCondition,function(x) tolower(unlist(strsplit(x,split=';'))))|> -# unlist()|> -# unique() - -drugs<-setdiff(pdx_meta$chem_name,'control') - - -print(paste(drugs,collapse=',')) - - -##copy old drug to new drug -olddrugs<-do.call(rbind,lapply(unique(unlist(strsplit(olddrugfiles,split=','))),function(x) read.table(x,header=T,sep='\t',quote='',comment.char=''))) -olddrugs<-unique(olddrugs) - -print(paste('Read in ',nrow(olddrugs),'old drug files')) - -fdrugs<-subset(olddrugs,chem_name%in%drugs) -if(nrow(fdrugs)>0){ - dids<-fdrugs$improve_drug_id -}else{ - dids<-c() -} -newdrugs<-subset(olddrugs,improve_drug_id%in%dids) - -print(paste('Found',length(dids),'improved drug ids that exist, saving 
those')) - - - #file.copy(olddrugfile,newdrugfile) -write.table(newdrugs,file=newdrugfile,sep='\t',row.names=F,quote=FALSE,col.names=T) -output_file_path <- newdrugfile -ignore_file_path <- '/tmp/mpnstpdx_ignore_chems.txt' - - -##now load reticulate down here - - - -use_python("/opt/venv/bin/python3", required = TRUE) -source_python("pubchem_retrieval.py") - -update_dataframe_and_write_tsv(unique_names=drugs,output_filename=output_file_path,ignore_chems=ignore_file_path) - - -tab<-read.table(newdrugfile,sep='\t',header=T,quote="",comment.char="") - -newdrugs<-tab|> - subset(chem_name%in%tolower(alldrugs)) - -tab<-tab|> - subset(improve_drug_id%in%newdrugs$improve_drug_id) - -write.table(tab,file=newdrugfile,sep='\t',row.names=FALSE,quote=FALSE) - - -##now call the python drug script - - diff --git a/build/mpnstpdx/03_get_drug_response_data.R b/build/mpnstpdx/03_get_drug_response_data.R deleted file mode 100644 index 095dba34..00000000 --- a/build/mpnstpdx/03_get_drug_response_data.R +++ /dev/null @@ -1,174 +0,0 @@ -# Load required libraries -library(data.table) -# library(biomaRt)# biomart issues still exist -library(synapser) -library(dplyr) -library(stringr) -# Retrieve command line arguments -args <- commandArgs(trailingOnly = TRUE) - -# Check if a token was provided -if (length(args) == 0) { - stop("No token or sample file provided. Usage: Rscript my_script.R [samples] [drugs]", call. 
= FALSE) -} - -# Set your personal access token -PAT <- args[1] -patients <- args[2] -drugfile <- args[3] - -# Log in to Synapse -synLogin(authToken = PAT) - - -# Read the sample mapping CSV and genes.csv -samples_df <- fread(patients)|> - dplyr::select(improve_sample_id,common_name,model_type)|> - distinct()#"mpnst/synapse_NF-MPNST_samples.csv") -print(head(samples_df)) - -pdx_samps<-subset(samples_df,model_type=='patient derived xenograft') -org_samps<-subset(samples_df,model_type=='organoid') - -##now get the manifest from synapse -manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|> - as.data.frame()|> - dplyr::rename(common_name='Sample') - - -##PDX contain list of files -pdx<-manifest|> - dplyr::select(common_name,PDX_Drug_Data)|> - subset(!PDX_Drug_Data%in%c("NA",NA))|> - left_join(pdx_samps)|> - distinct() - -print(pdx) - - -# Modify the extract_date_hour function to return a named vector -extract_date_hour <- function(experiment_id) { - pattern <- "(\\d{6})_?(\\d{2,3})?" 
- matches <- str_match(experiment_id, pattern) - date <- matches[, 2] - hour <- matches[, 3] - date[is.na(date)] <- NA # Replace with NA instead of blank - hour[is.na(hour)] <- 48 # Replace with 48 instead of blank (default) - return(list(date = date, hour = hour)) -} - - - -##define functions - -##first function to get children from parentId - -##now loop through manifest to get all the files -#mts_fold <- data.table(mts)[,strsplit(as.character(MicroTissueDrugFolder),","), by = .(improve_sample_id,common_name)] - - - -##do the drug matching -drug_df<-fread(drugfile)|> - dplyr::select('improve_drug_id','chem_name')|> - distinct() - -##update drug name PD901 since it's mussing -##now loop through manifest to get all the files -pdx_fold <- data.table(pdx)[,strsplit(as.character(PDX_Drug_Data),","), by = .(common_name)]|> - dplyr::rename(id='V1')|> - subset(!is.na(id)) - -pdx_meta<-do.call(rbind,lapply(pdx_fold$id, function(x) synapser::synGetAnnotations(x)|> - as.data.frame()|> - dplyr::select('experimentalCondition')|> - dplyr::mutate(id=x)))|>left_join(pdx_fold)|> - # tidyr::separate_rows(experimentalCondition,sep=';')|> - # mutate(chem_name=tolower(experimentalCondition))|> - # left_join(drug_df)|> - left_join(pdx_samps)|> - dplyr::select(improve_sample_id,id)|> - distinct()|> - subset(!is.na(id)) -pdx_meta$parentId=unlist(lapply(pdx_meta$id,function(x) synGet(x)$parentId)) - -##the older pdx data is in separate files. the newer is not. 
-#we need to reformat the older to look like the newer -oldfolders=c('syn22018363','syn22024460','syn22024428','syn22024429','syn22024437','syn22024438') - -old_meta<-subset(pdx_meta,parentId%in%oldfolders) - -old_data<-do.call(rbind,lapply(unique(old_meta$parentId),function(x){ - ids<-subset(old_meta,parentId==x)|> - subset(!is.na(id)) - - do.call(rbind,lapply(ids$id,function(y){ - tab<-readr::read_csv(synapser::synGet(y)$path) - print(head(tab)) - tab<-dplyr::select(tab,c('specimen_id','compound_name','dose','dose_unit', - 'experimental_time_point','experimental_time_point_unit', - 'assay_type','assay_value','assay_units'))|> - mutate(id=x)|> - mutate(chem_name=tolower(compound_name)) - - # tab$single_or_combo=sapply(tab$chem_name,function(z) ifelse(length(grep('\\+',z))>0,'combo','single')) - tab$chem_name=gsub('n/a','control',tab$chem_name)|> - tidyr::replace_na('control') - - tab$chem_name=sapply(tab$chem_name,function(z) ifelse(z=='doxorubinsin','doxorubicin',z)) - # tab<-tab|>left_join(drug_df) - #print(head(tab)) - return(tab) - })) -}))|> - left_join(unique(select(old_meta,id=parentId,improve_sample_id)))|> - dplyr::select(experiment=id,model_id=improve_sample_id,specimen_id,treatment=chem_name,time=experimental_time_point,time_unit=experimental_time_point_unit,volume=assay_value)|>distinct() - - - -new_meta<-subset(pdx_meta,!parentId%in%oldfolders) - -##now combine each of the old pdx files into single files -#each file has all experiments in it -new_data<-do.call(rbind,lapply(unique(new_meta$id), function(x){ - fpath=synapser::synGet(x)$path - if(length(grep('xls',fpath))>0){ - tab<-readxl::read_xlsx(fpath) - }else{ - tab<-readr::read_csv(fpath) - } - print(head(tab)) - tab<-dplyr::select(tab,c('specimen_id','compound_name','dose','dose_unit', - 'experimental_time_point','experimental_time_point_unit', - 'assay_type','assay_value','assay_units'))|> - mutate(id=x) - - # tab$single_or_combo=sapply(tab$compound_name,function(x) 
ifelse(length(grep('\\+',x))>0,'combo','single')) - tab$compound_name=gsub('N/A','control',tab$compound_name)|>tidyr::replace_na('control') - tab<-tab|> - mutate(chem_name=tolower(compound_name))#|> - # left_join(drug_df) - #print(head(tab)) - return(tab)}))|> - left_join(pdx_meta)|> - dplyr::select(experiment=id,model_id=improve_sample_id,specimen_id,treatment=chem_name,time=experimental_time_point,time_unit=experimental_time_point_unit,volume=assay_value)|>distinct() - -##maybe tweak the data frame a bit depending on curve fitting script -pdx_data<-rbind(old_data,new_data) - -#single_pdx<-subset(pdx_data,single_or_combo=='single') -#combo_pdx<-subset(pdx_data,single_or_combo=='combo') -#print(head(pdx_data)) -fwrite(pdx_data,'/tmp/curve_data.tsv',sep='\t') - -##TODO: create new curve fitting script in python -pycmd = '/opt/venv/bin/python calc_pdx_metrics.py --input /tmp/curve_data.tsv --outprefix /tmp/mpnstpdx' -print('running curve fitting') -#system(pycmd) - -##now read in data again, separate out by single/combo, then map to drug id - -##mmve file name -#file.rename('/tmp/experiments.0','/tmp/mpnstpdx_experiments.tsv') - - diff --git a/build/mpnstpdx/README.md b/build/mpnstpdx/README.md deleted file mode 100755 index b0059283..00000000 --- a/build/mpnstpdx/README.md +++ /dev/null @@ -1,47 +0,0 @@ -## Build Instructions for MPNST PDX Dataset - -To build the MPNST PDX dataset, follow these steps from the coderdata root -directory. Currently using the test files as input. - -1. Build the Docker image: - ``` - docker build -f build/docker/Dockerfile.mpnstpdx -t mpnstpdx . --build-arg HTTPS_PROXY=$HTTPS_PROXY - ``` - -2. Generate new identifiers for these samples to create a - `mpnstpdx_samples.csv` file. This pulls from the latest synapse - project metadata table. - ``` - docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnstpdx sh build_samples.sh /tmp/build/build_test/test_samples.csv - ``` - -3. Pull the data and map it to the samples. 
This uses the metadata - table pulled above. - ``` - docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnstpdx sh build_omics.sh /tmp/build/build_test/test_genes.csv /tmp/mpnstpdx_samples.csv - ``` - -4. Process drug data - ``` - docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnstpdx sh build_drugs.sh /tmp/build/build_test/test_drugs.tsv - ``` - -5. Process experiment data. This uses the metadata from above as well as the file directory on synapse: - ``` - docker run -v $PWD:/tmp -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN mpnstpdx sh build_exp.sh /tmp/mpnstpdx_samples.csv /tmp/mpnstpdx_drugs.tsv.gz - ``` - -Please ensure that each step is followed in order for correct dataset compilation. - -## MPNST PDX Dataset Structure -The MPNST dataset includes the following output files: -``` -├── mpnstpdx_samples.csv -├── mpnstpdx_transcriptomics.csv -├── mpnstpdx_mutations.csv -├── mpnstpdx_copy_number.csv -├── mpnstpdx_drugs.tsv -├── mpnstpdx_drug_descriptors.tsv.gz -├── mpnstpdx_experiments.tsv.gz -``` - diff --git a/build/mpnstpdx/build_drugs.sh b/build/mpnstpdx/build_drugs.sh deleted file mode 100644 index 78502bc7..00000000 --- a/build/mpnstpdx/build_drugs.sh +++ /dev/null @@ -1,4 +0,0 @@ -##get drug data -Rscript 02_get_drug_data.R $SYNAPSE_AUTH_TOKEN $1 /tmp/mpnstpdx_drugs.tsv -##get drug descriptors -/opt/venv/bin/python3 build_drug_desc.py --drugtable /tmp/mpnstpdx_drugs.tsv --desctable /tmp/mpnstpdx_drug_descriptors.tsv.gz \ No newline at end of file diff --git a/build/mpnstpdx/build_exp.sh b/build/mpnstpdx/build_exp.sh deleted file mode 100644 index 4e34f6b3..00000000 --- a/build/mpnstpdx/build_exp.sh +++ /dev/null @@ -1,2 +0,0 @@ -Rscript 03_get_drug_response_data.R $SYNAPSE_AUTH_TOKEN $1 $2 -/opt/venv/bin/python3 calc_pdx_metrics.py /tmp/curve_data.tsv --drugfile=/tmp/mpnstpdx_drugs.tsv --outprefix=/tmp/mpnstpdx diff --git a/build/mpnstpdx/build_omics.sh b/build/mpnstpdx/build_omics.sh deleted file mode 100644 index 
969b4fba..00000000 --- a/build/mpnstpdx/build_omics.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -set -euo pipefail - -trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR - -echo "Running 01_mpnstpdx_get_omics.R with $SYNAPSE_AUTH_TOKEN, $2, and $1." -Rscript 01_mpnstpdx_get_omics.R $SYNAPSE_AUTH_TOKEN $2 $1 diff --git a/build/mpnstpdx/build_samples.sh b/build/mpnstpdx/build_samples.sh deleted file mode 100644 index aa88aa02..00000000 --- a/build/mpnstpdx/build_samples.sh +++ /dev/null @@ -1 +0,0 @@ -cp /tmp/mpnst_samples.csv /tmp/mpnstpdx_samples.csv diff --git a/build/mpnstpdx/requirements.r b/build/mpnstpdx/requirements.r deleted file mode 100755 index e6139cd4..00000000 --- a/build/mpnstpdx/requirements.r +++ /dev/null @@ -1,13 +0,0 @@ -install.packages('reticulate', repos='https://cloud.r-project.org') -reticulate::use_virtualenv('/opt/venv', required = TRUE) -install.packages('remotes') -remotes::install_version('rjson', version = '0.2.21', repos = 'https://cloud.r-project.org') -install.packages('synapser', repos = c('http://ran.synapse.org', 'https://cloud.r-project.org')) -install.packages("dplyr") -install.packages("data.table") -install.packages("synapser", repos = c("http://ran.synapse.org", "https://cloud.r-project.org")) -install.packages("R.utils") -install.packages("stringr") -install.packages("tidyr") -install.packages('readr') -install.packages("readxl") diff --git a/build/mpnstpdx/requirements.txt b/build/mpnstpdx/requirements.txt deleted file mode 100755 index b0944928..00000000 --- a/build/mpnstpdx/requirements.txt +++ /dev/null @@ -1,12 +0,0 @@ -pyarrow -pandas -matplotlib -numpy==1.26.4 -argparse -tqdm -scikit-learn -scipy -requests -mordredcommunity -rdkit -statsmodels diff --git a/schema/expected_files.yaml b/schema/expected_files.yaml index 4cce4283..8035ff99 100644 --- a/schema/expected_files.yaml +++ b/schema/expected_files.yaml @@ -43,24 +43,6 @@ datasets: - target_class: Drug Descriptor file: 
/tmp/mpnst_drug_descriptors.tsv - mpnstpdx: - - target_class: Sample - file: /tmp/mpnstpdx_samples.csv - - target_class: Transcriptomics - file: /tmp/mpnstpdx_transcriptomics.csv - - target_class: Proteomics - file: /tmp/mpnstpdx_proteomics.csv - - target_class: Mutations - file: /tmp/mpnstpdx_mutations.csv - - target_class: Copy Number - file: /tmp/mpnstpdx_copy_number.csv - - target_class: Experiments - file: /tmp/mpnstpdx_experiments.tsv - - target_class: Drug - file: /tmp/mpnstpdx_drugs.tsv - - target_class: Drug Descriptor - file: /tmp/mpnstpdx_drug_descriptors.tsv - cptac: - target_class: Sample file: /tmp/cptac_samples.csv From e8119697d121108a50ae75fa524cfce2a84cfa09 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Thu, 24 Jul 2025 09:26:15 -0700 Subject: [PATCH 14/20] Made update in response to merge from novartis branch --- build/mpnst/03_get_experiments.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/build/mpnst/03_get_experiments.R b/build/mpnst/03_get_experiments.R index db7beb25..5d1a8c4c 100644 --- a/build/mpnst/03_get_experiments.R +++ b/build/mpnst/03_get_experiments.R @@ -239,13 +239,15 @@ fwrite(pdx_data, file.path("/tmp", paste0(out_prefix, "_pdx_curve_data.tsv")), s message("Wrote PDX curve data") + system(sprintf( - "/opt/venv/bin/python calc_pdx_metrics.py %s --drugfile %s --outprefix %s", - paste0("/tmp/", out_prefix, "_pdx_curve_data.tsv"), - drugfile, - paste0("/tmp/", out_prefix, "_pdx") + "/opt/venv/bin/python calc_pdx_metrics.py %s --drugfile %s --outprefix %s --source 'NF Data Portal' --study 'MPNST PDX'", + paste0("/tmp/", out_prefix, "_pdx_curve_data.tsv"), + drugfile, + paste0("/tmp/", out_prefix, "_pdx") )) + message("Wrote PDX experiments to ", "/tmp/", out_prefix, "_pdx_experiments.tsv and combinations") From 91c90791252ea7b36e1afa0d8b0cc29bcd084abf Mon Sep 17 00:00:00 2001 From: Jeremy Date: Thu, 24 Jul 2025 09:30:40 -0700 Subject: [PATCH 15/20] Import calc_pdx_metrics.py from 
novartisPDX-drugs-experiments --- build/utils/calc_pdx_metrics.py | 61 +++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/build/utils/calc_pdx_metrics.py b/build/utils/calc_pdx_metrics.py index e0f4c05a..b25e4f1e 100755 --- a/build/utils/calc_pdx_metrics.py +++ b/build/utils/calc_pdx_metrics.py @@ -184,7 +184,12 @@ def AUC(time, volume, time_normalize=True): dict: Dictionary containing the AUC value. """ auc = trapz_auc(time, volume) - #print(time) + #print('at line 187') + #print(time.shape) + #print(time.dtype) + #print(np.max(time.astype(int))) + #print('auc is : ') + #print(auc) if time_normalize: auc = auc/np.max(time) return {"metric": "auc", "value": auc, 'time':np.max(time)} @@ -270,10 +275,15 @@ def lmm(time, volume, treatment, drug_name): raise ValueError("These columns must be present: 'model_id', 'volume', 'time', 'exp_type'") data['log_volume'] = np.log(data['volume']) - + #print('drug name is ' + drug_name) + data['exp_type'] = data['exp_type'].astype('category') + data['exp_type']=pd.Categorical(data['exp_type'],categories = ['control',drug_name], ordered=True) + #print(data) + #print(data['exp_type'].cat.categories) # Define the formula for mixed linear model formula = 'log_volume ~ time*exp_type' + #print(data['exp_type'].cat.categories) # Fit the model model = mixedlm(formula, data, groups=data['model_id']) fit = model.fit() @@ -284,6 +294,7 @@ def lmm(time, volume, treatment, drug_name): # time_coef_value = fit.params['time'] #print(fit.params) i_coef_value = fit.params['time:exp_type[T.'+drug_name+']'] + #i_coef_value = fit.params['time:exp_type['+drug_name+']'] # else: # coef_value = None # Handle the case when the interaction term is not present @@ -301,6 +312,8 @@ def main(): parser.add_argument('curvefile') parser.add_argument('--drugfile') parser.add_argument('--outprefix',default='/tmp/') + parser.add_argument('--study') + parser.add_argument('--source') args = parser.parse_args() @@ -314,20 
+327,21 @@ def main(): expsing = expsing.dropna() # source improve_sample_id improve_drug_id study time time_unit dose_response_metric dose_response_value - - combos[['drug1','drug2']]=combos.drug.str.split('+',expand=True) - combos = combos.rename({'metric':'drug_combination_metric','value':'drug_combination_value','sample':'improve_sample_id'},axis=1).dropna() - - expcomb = combos.rename({'drug1':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_1'},axis=1)[['improve_drug_1','drug2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']] - expcomb = expcomb.rename({'drug2':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_2'},axis=1)[['improve_drug_1','improve_drug_2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']] - - expcomb[['source']]='Synapse' - expcomb[['study']]='MPNST PDX in vivo' - - expsing[['source']]='Synapse' - expsing[['study']]='MPNST PDX in vivo' + if combos.shape[0]> 0: + combos[['drug1','drug2']]=combos['drug'].str.split('+',expand=True) + + combos = combos.rename({'metric':'drug_combination_metric','value':'drug_combination_value','sample':'improve_sample_id'},axis=1).dropna() + + expcomb = combos.rename({'drug1':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_1'},axis=1)[['improve_drug_1','drug2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']] + expcomb = expcomb.rename({'drug2':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_2'},axis=1)[['improve_drug_1','improve_drug_2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']] + expcomb[['source']]=args.source + expcomb[['study']]=args.study + expcomb.to_csv(args.outprefix+'_combinations.tsv',index=False, sep="\t") + + 
expsing[['source']]=args.source + expsing[['study']]=args.study expsing.to_csv(args.outprefix+'_experiments.tsv',index=False, sep="\t") - expcomb.to_csv(args.outprefix+'_combinations.tsv',index=False, sep="\t") + #expcomb.to_csv(args.outprefix+'_combinations.tsv',index=False, sep="\t") @@ -341,21 +355,25 @@ def get_drug_stats(df, control='control'): for name, group in tqdm(groups): # Each group contains multiple treatments and a control drugs = set(group.treatment) - set([control]) - print(name[0]) - print(drugs) + #print('line 355') + #print(name[0]) + #print(drugs) mod = list(set(group.model_id))[0] ctl_data = group[group.treatment == control] ctl_time = np.array(ctl_data.time) ctl_volume = np.array(ctl_data.volume) - + if (ctl_volume.shape[0] < 2): + continue ctl_auc = AUC(ctl_time, ctl_volume) for d in drugs: - print(d) - d_data = group[group.treatment == d] + #print('is our drug a string or dict?') + #print(str(d)) + d_data = group[group.treatment == str(d)] treat_time = np.array(d_data.time) treat_volume = np.array(d_data.volume) - + if (treat_volume.shape[0] < 2): + continue # Get ABC for group treat_auc = AUC(treat_time, treat_volume) treat_abc = ABC(ctl_time, ctl_volume, treat_time, treat_volume) @@ -368,6 +386,7 @@ def get_drug_stats(df, control='control'): #llm comb = pd.concat([ctl_data, d_data]) + #print(comb) lmm_res = lmm(comb.time, comb.volume, comb.treatment, d) lmm_res.update({'sample': mod, 'drug': d, 'time': np.max(treat_time), 'time_unit': 'days'}) if '+' in d: From e5f7f3aa7ccebe76fcd9ee655e1b7e14618048f4 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Thu, 24 Jul 2025 09:34:08 -0700 Subject: [PATCH 16/20] Revert "Merge branch 'novartisPDX-drugs-experiments' into mpnst_dataset_join" This reverts commit 1cecc29f82cf973987cd84b399465a7b24ae3949, reversing changes made to 19adfc5c45af5c247d3bd741c67eba11f170ea02. 
--- build/novartispdx/01-samples-novartispdx.py | 58 ----- build/novartispdx/02-omics-novartispdx.py | 214 ------------------ build/novartispdx/03-drugs-novartispdx.py | 70 ------ .../novartispdx/04-experiments-novartispdx.py | 64 ------ build/novartispdx/build_drugs.sh | 15 -- build/novartispdx/build_experiments.sh | 5 - build/novartispdx/build_omics.sh | 12 - build/novartispdx/build_samples.sh | 7 - build/utils/calc_pdx_metrics.py | 61 ++--- build/utils/get_copy_call.py | 22 -- 10 files changed, 21 insertions(+), 507 deletions(-) delete mode 100644 build/novartispdx/01-samples-novartispdx.py delete mode 100644 build/novartispdx/02-omics-novartispdx.py delete mode 100644 build/novartispdx/03-drugs-novartispdx.py delete mode 100644 build/novartispdx/04-experiments-novartispdx.py delete mode 100755 build/novartispdx/build_drugs.sh delete mode 100755 build/novartispdx/build_experiments.sh delete mode 100644 build/novartispdx/build_omics.sh delete mode 100755 build/novartispdx/build_samples.sh delete mode 100644 build/utils/get_copy_call.py diff --git a/build/novartispdx/01-samples-novartispdx.py b/build/novartispdx/01-samples-novartispdx.py deleted file mode 100644 index b46797b9..00000000 --- a/build/novartispdx/01-samples-novartispdx.py +++ /dev/null @@ -1,58 +0,0 @@ -import pandas as pd -import synapseclient -import numpy as np -import argparse -import os - -def get_complete_novartispdx_sample_sheet(synObject): - - files = list(synObject.getChildren(parent='syn66275995', includeTypes=['file'])) - - synIDs = [item['id'] for item in files] - # leave off synIDs for drug info - synIDs.remove('syn66276102') - synIDs.remove('syn66276098') - synIDs.remove("syn66477971") - # create empty dataframe - allsamplesheet = pd.DataFrame() - # iterate through IDs and concatenate - for id in synIDs: - curr = synObject.get(id) - currdf = pd.read_csv(curr.path) - allsamplesheet = pd.concat([allsamplesheet, currdf], ignore_index=True) - # rename columns and reformat cancer type from 
CANCER_HISTOLOGY column - allsamplesheet['other_id'] = allsamplesheet['Sample ID'] - allsamplesheet['common_name'] = allsamplesheet['MODEL_ORIGINATOR_ID'] - allsamplesheet['cancer_type'] = allsamplesheet['CANCER_HISTOLOGY'].str.lower().str.split(pat="^[^\s]*\s", expand=True)[1] - allsamplesheet['species'] = "Homo Sapiens(human)" - allsamplesheet['model_type'] = 'patient derived xenograft' - allsamplesheet['other_id_source'] = 'Synapse' - allsamplesheet['other_names'] = '' - finalsamplesheet = allsamplesheet[['other_id', 'common_name', 'other_id_source', 'other_names', 'cancer_type', 'species', 'model_type']] - return finalsamplesheet - -if __name__ == "__main__": - - parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of sample files for the Novartis PDX data into a single samplesheet") - - parser.add_argument('-t', '--token', type=str, help='Synapse Token') - - parser.add_argument("-p", '--prevSamples', nargs="?", type=str, default ="", const = "", help = "Use this to provide previous sample file, will run sample file generation") - - args = parser.parse_args() - - print("Logging into Synapse") - PAT = args.token - synObject = synapseclient.login(authToken=PAT) - samplesheet = get_complete_novartispdx_sample_sheet(synObject) - - if (args.prevSamples): - prev_max_improve_id = max(pd.read_csv(args.prevSamples).improve_sample_id) - else: - prev_max_improve_id = 0 - - samplesheet['improve_sample_id'] = range(prev_max_improve_id+1, prev_max_improve_id+samplesheet.shape[0]+1) - - samplesheet.to_csv('/tmp/novartispdx_samples.csv', index=False) - - \ No newline at end of file diff --git a/build/novartispdx/02-omics-novartispdx.py b/build/novartispdx/02-omics-novartispdx.py deleted file mode 100644 index edb3a451..00000000 --- a/build/novartispdx/02-omics-novartispdx.py +++ /dev/null @@ -1,214 +0,0 @@ -import pandas as pd -import numpy as np -import os -import math -import argparse - - -def get_copy_call(a): - """ - 
Heler Function - Determine copy call for a value. - """ - - if a is None: - return float('nan') - - if math.isnan(a): - return float('nan') - - a_val = math.log2(float(a)+0.000001) - if a_val < 0.5210507: - return 'deep del' - elif a_val < 0.7311832: - return 'het loss' - elif a_val < 1.214125: - return 'diploid' - elif a_val < 1.422233: - return 'gain' - else: - return 'amp' - - return pd.Series([get_copy_call(a) for a in arr]) - - -def download_parse_omics_novPDX(synID:str , save_path:str = None, synToken:str = None): - """ - Download omics data from Synapse at synapseID syn66364488. Requires a synapse token, which requires you to make a Synapse account - and create a Personal Access Token. More information here: https://help.synapse.org/docs/Managing-Your-Account.2055405596.html#ManagingYourAccount-PersonalAccessTokens - Omics data is an excel file. The excel file is then parsed for the RNAseq, copy number, and mutations data. - - Parameters - ---------- - synID : string - SynapseID of dataset to download. Default is synapseID of the sequencing dataset. - - save_path : string - Local path where the downloaded file will be saved. - - synToken : string - Synapse Personal Access Token of user. Requires a Synapse account. More information at: https://help.synapse.org/docs/Managing-Your-Account.2055405596.html#ManagingYourAccount-PersonalAccessTokens - - Returns - ------- - mutations_data : pd.DataFrame - A DataFrame containing mutations data. - - copy_number_data : pd.DataFrame - A DataFrame containing copy number data. - - rnaseq_data : pd.DataFrame - A DataFrame containing RNAseq data. 
- """ - - syn = synapseclient.Synapse() - syn.login(authToken=synToken) - - # Obtain a pointer and download the data - syn66364488 = syn.get(entity=synID, downloadLocation = save_path) - - # Get the path to the local copy of the data file - sequencing_filepath = syn66364488.path - all_omics_excel = pd.ExcelFile(open(sequencing_filepath, 'rb')) - mutations_data = pd.read_excel(all_omics_excel, 'pdxe_mut_and_cn2') # table with somatic mutation information - copy_number_data = pd.read_excel(all_omics_excel, 'copy number') # table with copy number information - rnaseq_data = pd.read_excel(all_omics_excel, 'RNAseq_fpkm') - - - return(rnaseq_data, copy_number_data, mutations_data) - - -def map_copy_number_novPDX(copy_number_data, improve_id_data, entrez_data): - """ - Maps copy number data to improved sample id's and entrez gene data. Also does some data formatting. - - Parameters - ---------- - copy_number_data : pd.Dataframe OR string - Pandas dataframe object with copy number data OR path to csv with copy number data - - improve_id_data : pd.Dataframe OR string - Pandas dataframe object with improve id data OR path to csv with improve id data. This is one of the outputs of parse_mmc2() - - entrez_data : pd.Dataframe OR string - Pandas dataframe object with entrez gene data OR path to csv with entrez gene data. 
Use this code to get this file: https://github.com/PNNL-CompBio/coderdata/tree/e65634b99d060136190ec5fba0b7798f8d140dfb/build/genes - - Returns - ------- - sample_entrez_cn_df : pd.DataFrame - A DataFrame containing the mapped copy number data with columns: entrez_id, copy_number, copy_call, study, source ,improve_sample_id - - """ - # read in data - if isinstance(copy_number_data, pd.DataFrame) == False: - copy_number_data = pd.read_csv(copy_number_data) - - if isinstance(improve_id_data, pd.DataFrame) == False: - improve_id_data = pd.read_csv(improve_id_data) - - if isinstance(entrez_data, pd.DataFrame) == False: - entrez_data = pd.read_csv(entrez_data) - - # melt dataframe so that there is gene name and improve_sample_id per row - long_cn_df = pd.melt(copy_number_data, id_vars=['Sample'], value_vars=copy_number_data.columns[copy_number_data.columns != 'Sample']) - - # get entrez id's from Sample - entrez_cn_df = pd.merge(long_cn_df, entrez_data[['other_id','entrez_id']].drop_duplicates(), how = 'left', left_on= "Sample", right_on= "other_id") - - # get copy call from value column (aka copy number) - entrez_cn_df['copy_call'] = [get_copy_call(a) for a in entrez_cn_df['value']] - - # get improve sample id - improve_id_data['to_merge'] = improve_id_data['common_name'].str.replace("NIBR","") - sample_entrez_cn_df = pd.merge(entrez_cn_df.drop_duplicates(), improve_id_data[['to_merge','improve_sample_id']].drop_duplicates(), how = 'left', left_on= "variable", right_on= "to_merge") - - # clean up columns and data types - sample_entrez_cn_df = sample_entrez_cn_df.drop(columns=['Sample','variable','other_id','to_merge']) - sample_entrez_cn_df['source'] = "CPDM" - sample_entrez_cn_df['study'] = "novartispdx" - sample_entrez_cn_df = sample_entrez_cn_df.rename(columns={'value':'copy_number'}) - sample_entrez_cn_df = sample_entrez_cn_df.astype({'entrez_id':'int','improve_sample_id':'int'}) - sample_entrez_cn_df = 
sample_entrez_cn_df[['entrez_id','copy_number','copy_call','study','source','improve_sample_id']] - sample_entrez_cn_df = sample_entrez_cn_df.drop_duplicates() - - - return(sample_entrez_cn_df) - - -def map_transcriptomics_novPDX(transcriptomics_data, improve_id_data, entrez_data): - """ - Maps transcriptomics data to improved sample id's and entrez gene data. Also does some data formatting. - - Parameters - ---------- - copy_number_data : pd.Dataframe OR string - Pandas dataframe object with transcriptomics data OR path to csv with transcriptomics data - - improve_id_data : pd.Dataframe OR string - Pandas dataframe object with improve id data OR path to csv with improve id data. This is one of the outputs of parse_mmc2() - - entrez_data : pd.Dataframe OR string - Pandas dataframe object with entrez gene data OR path to csv with entrez gene data. Use this code to get this file: https://github.com/PNNL-CompBio/coderdata/tree/e65634b99d060136190ec5fba0b7798f8d140dfb/build/genes - - Returns - ------- - sample_entrez_cn_df : pd.DataFrame - A DataFrame containing the mapped transcriptomics data with columns: entrez_id, copy_number, copy_call, study, source ,improve_sample_id - - """ - # read in data - if isinstance(transcriptomics_data, pd.DataFrame) == False: - transcriptomics_data = pd.read_csv(transcriptomics_data) - - if isinstance(improve_id_data, pd.DataFrame) == False: - improve_id_data = pd.read_csv(improve_id_data) - - if isinstance(entrez_data, pd.DataFrame) == False: - entrez_data = pd.read_csv(entrez_data) - - # melt dataframe so that there is gene name and improve_sample_id per row - rnaseq_df = rnaseq_df.rename(columns={'Sample':'stable_id'}) - rnaseq_df.to_csv("/tmp/counts_for_tpm_conversion.tsv", sep='\t') - - # run tpmFromCounts.py to convert counts to tpm - os.system("python3 tpmFromCounts.py --counts /tmp/counts_for_tpm_conversion.tsv --genome_build 
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.13_GRCh37/GCF_000001405.13_GRCh37_genomic.gtf.gz --gene_col stable_id --exclude_col stable_id --out_file /tmp/transcriptomics_tpm.tsv") - - # read in amd melt dataframe so that there is an entrez and sample id per row - tpm_transciptomics_data = pd.read_csv("/tmp/transcriptomics_tpm.tsv", sep="\t") - long_rnaseq = pd.melt(tpm_transciptomics_data, id_vars=['stable_id'], value_vars=tpm_transciptomics_data.columns[tpm_transciptomics_data.columns != 'stable_id']) - - # merge entrez id's - entrez_transcriptomics_df = pd.merge(long_rnaseq.drop_duplicates(), entrez_data[['other_id','entrez_id']].drop_duplicates(), how = 'inner', left_on= "stable_id", right_on= "other_id") - - # get improve sample id - improve_id_data['to_merge'] = improve_id_data['common_name'].str.replace("NIBR","") - sample_entrez_transcriptomics_df = pd.merge(entrez_transcriptomics_df.drop_duplicates(), improve_id_data[['to_merge','improve_sample_id']].drop_duplicates(), how = 'inner', left_on= "variable", right_on= "to_merge") - - # clean up columns and data types - sample_entrez_transcriptomics_df = sample_entrez_transcriptomics_df.drop(columns=['stable_id','variable','other_id','to_merge']) - sample_entrez_transcriptomics_df['source'] = "CPDM" - sample_entrez_transcriptomics_df['study'] = "novartispdx" - sample_entrez_transcriptomics_df = sample_entrez_transcriptomics_df.rename(columns={'value':'transcriptomics'}) - sample_entrez_transcriptomics_df = sample_entrez_transcriptomics_df.astype({'entrez_id':'int','improve_sample_id':'int'}) - sample_entrez_transcriptomics_df = sample_entrez_transcriptomics_df[['entrez_id','transcriptomics','improve_sample_id','source','study']] - - return(sample_entrez_transcriptomics_df) - - -if __name__ == "__main__": - print('in main') - parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of omics data files for the Bladder PDO project") - 
parser.add_argument('-s', '--samples', help='Path to sample file',default=None) - parser.add_argument('-g', '--genes', help='Path to genes file', default = None) - parser.add_argument('-c', '--copy', help='Flag to capture copy number data', action='store_true', default=False) - parser.add_argument('-m', '--mutation', help='Flag to capture mutation data', action='store_true', default=False) - parser.add_argument('-e', '--expression', help='Flag to capture transcriptomic data', action='store_true', default=False) - parser.add_argument('-t', '--token', help='Synapse token') - - args = parser.parse_args() - print("Logging into Synapse") - PAT = args.token - - genes=pd.read_csv(args.genes) - samples = pd.read_csv(args.samples) - - data =download_parse_omics_novPDX(syn id,savestring, PAT) \ No newline at end of file diff --git a/build/novartispdx/03-drugs-novartispdx.py b/build/novartispdx/03-drugs-novartispdx.py deleted file mode 100644 index a015bba9..00000000 --- a/build/novartispdx/03-drugs-novartispdx.py +++ /dev/null @@ -1,70 +0,0 @@ -import synapseclient -import pandas as pd -import numpy as np -import argparse -import os -# for testing locally -from utils.pubchem_retrieval import update_dataframe_and_write_tsv -# for building in docker -#from pubchem_retrieval import update_dataframe_and_write_tsv - - -def create_novartis_pdx_drugs_file(synObject, prevDrugFilepath, outputPath): - file = synObject.get('syn66276102') - # read raw drug data from synapse - rawDrugData = pd.read_csv(file.path) - # split on + operator - there are 2- and one 3- way drug combos in this dataset - sepDrugNames = pd.Series(rawDrugData['Treatment'].unique()).str.split("+", expand=True) - - - - # taking the drug names from the first and second column from the split - there is only one - # drug name in the 3rd column (onen 3-way combo) that is replicated in other treatments as well - alldrugnames = pd.Series(pd.concat([sepDrugNames[0], sepDrugNames[1]]).dropna()).str.split('"', 
expand=True)[0].str.split("-", expand=True)[0] - #nodoseinfo = pd.Series(alldrugnames.str.split("-", expand =True)[0]) - #combineddrugames = pd.concat([alldrugnames, nodoseinfo]) - finalDrugNames = pd.Series(alldrugnames.unique()).str.strip().unique() - # get unique drugs - newdrugnames = finalDrugNames[finalDrugNames != 'untreated'] - - #print(finalDrugNames.tolist) - #newdrugnames = finalDrugNames.remove('untreated') - print(2) - print(newdrugnames) - - - # use helper functions in pubchem_retrieval.py - alldrugs = [] - if prevDrugFilepath is not None and prevDrugFilepath is not "": - prevdrugs = [pd.read_csv(t,sep='\t') for t in prevDrugFilepath.split(',')] - alldrugs = pd.concat(prevdrugs).drop_duplicates() - - imps = alldrugs[alldrugs.chem_name.isin(newdrugnames)] - newdrugs = alldrugs[alldrugs.improve_drug_id.isin(imps.improve_drug_id)] - - ##write drugs - newdrugs.to_csv(outputPath, sep='\t', index=False) - - if len(alldrugs)==0 or len(newdrugnames)>len(set(newdrugs.improve_drug_id)): #we have more names we didn't match - print('Missing drugs in existing file, querying pubchem') - update_dataframe_and_write_tsv(newdrugnames,outputPath) - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of drug data files for the Novartis PDX data") - parser.add_argument('-d', '--prevDrugFilePath', help='Path to a previous drug file for bladderpdo', nargs="?", default = None) - parser.add_argument('-o', '--outputPath', help='Output path for updated novartispdx drug file', default = "/tmp/novartispdx_drugs.tsv") - parser.add_argument('-t', '--token', help='Synapse token') - - args = parser.parse_args() - print("Logging into Synapse") - PAT = args.token - print("after PAT assignment") - synObject = synapseclient.login(authToken=PAT) - print('after creating synObject') - if args.prevDrugFilePath: - previousDrugs = args.prevDrugFilePath - else: - previousDrugs = None - 
create_novartis_pdx_drugs_file(synObject, previousDrugs, args.outputPath) \ No newline at end of file diff --git a/build/novartispdx/04-experiments-novartispdx.py b/build/novartispdx/04-experiments-novartispdx.py deleted file mode 100644 index 31cebbdc..00000000 --- a/build/novartispdx/04-experiments-novartispdx.py +++ /dev/null @@ -1,64 +0,0 @@ -import synapseclient -import pandas as pd -import numpy as np -import argparse -import os - - -# add improve IDs - for sample and drug -def get_novartis_pdx_experiments_file(synObject, samples_df): - # input for the calc_pdx_metrics script - - file1 = synObject.get('syn66276102') - rawDrugData = pd.read_csv(file1.path) - # STILL NEED TO : link to improve ids. - # update a few drug ids for greater inclusion - novartispdx_curvefile = rawDrugData[['Model', 'Days Post T0', 'Volume (mm3)', 'Treatment']] - novartispdx_curvefile=novartispdx_curvefile.rename({'Model': 'model_id', 'Days Post T0' : 'time', 'Volume (mm3)': 'volume', 'Treatment':'treatment'}, axis=1) - novartispdx_curvefile['treatment'] = novartispdx_curvefile['treatment'].str.lower() - novartispdx_curvefile['treatment'] = novartispdx_curvefile['treatment'].str.replace('"', '') - novartispdx_curvefile['treatment']=novartispdx_curvefile['treatment'].str.replace('untreated', 'control') - novartispdx_curvefile['experiment'] = novartispdx_curvefile.groupby(['model_id']).ngroup()+1 - # remove triple combination(s) - novartispdx_curvefile = novartispdx_curvefile[~novartispdx_curvefile['treatment'].str.contains(r'\+')] - # remove dose information appended to some drugs in the treatment column and include in dose colum - druganddose = novartispdx_curvefile['treatment'].str.split('-', expand=True) - druganddose = druganddose.rename({0: 'treatment', 1:'dose'}, axis=1) - novartispdx_curvefile['treatment']=druganddose['treatment'] - novartispdx_curvefile['dose'] = druganddose['dose'] - # remove pdxs with only one drug treatment (no control) - unique_vals_tally = 
novartispdx_curvefile.groupby('experiment').nunique() - todiscard = unique_vals_tally[unique_vals_tally['treatment']==1].index - novartispdx_curvefile = novartispdx_curvefile[~novartispdx_curvefile['experiment'].isin(todiscard)] - # remove groups with no 'control' treatment - groupeddf = novartispdx_curvefile.groupby('experiment') - no_control = groupeddf['treatment'].apply(lambda x: x.str.contains('control').any()) - - missingcontrols = no_control.reset_index()[no_control.reset_index()['treatment'] ==False]['experiment'] - nomissingcontrols=novartispdx_curvefile[~novartispdx_curvefile['experiment'].isin(missingcontrols)] - #merge on drug names done in calc_pdx_metrics.py - #final_w_drugIDs = finaldf.merge(drug_df, how='left',right_on='chem_name', left_on="treatment") - final_allIDs = nomissingcontrols.merge(samples_df, how='left', right_on='common_name', left_on='model_id') - final_allIDs = final_allIDs.drop('model_id', axis=1) - finalDF = final_allIDs.rename({'improve_sample_id':'model_id'}, axis=1) - finalcurvefile = finalDF[['model_id', 'time', 'volume', 'treatment', 'experiment', 'dose']] - return finalcurvefile - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('-t', '--token', help='Synapse authentication token') - parser.add_argument('-s', '--curSampleFile', default='/tmp/novartispdx_samples.csv', help='Sample mapping file for bladder pdo samples') - parser.add_argument('-d', '--drugfile', default='/tmp/novartispdx_drugs.tsv', help='Drug mapping file for bladder pdo samples') - parser.add_argument('-o', '--output', default = '/tmp/novartispdx_experiments.tsv',help='Output experiments file') - - args = parser.parse_args() - print("Logging into Synapse") - PAT = args.token - synObject = synapseclient.login(authToken=PAT) - samples_df = pd.read_csv(args.curSampleFile) - - doseresponse_data = get_novartis_pdx_experiments_file(synObject, samples_df) - print(doseresponse_data.head) - 
doseresponse_data.to_csv('/tmp/novartispdx_curvedata.tsv', columns=list({'model_id', 'time', 'volume', 'treatment','experiment', 'dose'}), sep='\t') - diff --git a/build/novartispdx/build_drugs.sh b/build/novartispdx/build_drugs.sh deleted file mode 100755 index 53abb4ec..00000000 --- a/build/novartispdx/build_drugs.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -set -euo pipefail - -trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR - -echo "Running script with token and drugFile $1" -# for running locally (from build directory): -python3 -m novartispdx.03-drugs-novartispdx --token $SYNAPSE_AUTH_TOKEN -d $1 -o /tmp/novartispdx_drugs.tsv -#python3 novar -#python3 03-drugs-novartispdx.py --token $SYNAPSE_AUTH_TOKEN -d $1 -o /tmp/novartispdx_drugs.tsv - -echo "Running build_drug_desc.py..." -#for running locally: -python3 utils/build_drug_desc.py --drugtable /tmp/novartispdx_drugs.tsv --desctable /tmp/novartispdx_drug_descriptors.tsv.gz -#python3 build_drug_desc.py --drugtable /tmp/novartispdx_drugs.tsv --desctable /tmp/novartispdx_drug_descriptors.tsv.gz \ No newline at end of file diff --git a/build/novartispdx/build_experiments.sh b/build/novartispdx/build_experiments.sh deleted file mode 100755 index a333402d..00000000 --- a/build/novartispdx/build_experiments.sh +++ /dev/null @@ -1,5 +0,0 @@ - -#python3 04-experiments-novartispdx.py --token $SYNAPSE_AUTH_TOKEN - -python3 -m novartispdx.04-experiments-novartispdx --token $SYNAPSE_AUTH_TOKEN -o ~/Projects/CoderData/dev-environment/novartispdx/novartispdx_curvedata.tsv -python3 utils/calc_pdx_metrics.py /tmp/novartispdx_curvedata.tsv --drugfile=/tmp/novartispdx_drugs.tsv --outprefix=/tmp/novartispdx --study='Novartis PDX Gao etal 2015' --source='Synapse' \ No newline at end of file diff --git a/build/novartispdx/build_omics.sh b/build/novartispdx/build_omics.sh deleted file mode 100644 index 39585d8c..00000000 --- a/build/novartispdx/build_omics.sh +++ /dev/null @@ -1,12 +0,0 @@ 
-#!/bin/bash -set -euo pipefail - -trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR - -echo "Running script with token, curSamples $2, and genes $1." -# for mutation data (-m) -python3 02-omics-novartispdx.py --token $SYNAPSE_AUTH_TOKEN -s $2 -g $1 -m -# for expressiondata (-e) -python3 02-omics-novartispdx.py --token $SYNAPSE_AUTH_TOKEN -s $2 -g $1 -e -# for copynumber -python3 02-omics-novartispdx.py --token $SYNAPSE_AUTH_TOKEN -s $2 -g $1 -c \ No newline at end of file diff --git a/build/novartispdx/build_samples.sh b/build/novartispdx/build_samples.sh deleted file mode 100755 index 562f74a8..00000000 --- a/build/novartispdx/build_samples.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -set -euo pipefail - -trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR - -echo "Running 01-samples-novartispdx.py with token and previous sample file $1" -python3 01-samples-novartispdx.py --token $SYNAPSE_AUTH_TOKEN -p $1 \ No newline at end of file diff --git a/build/utils/calc_pdx_metrics.py b/build/utils/calc_pdx_metrics.py index b25e4f1e..e0f4c05a 100755 --- a/build/utils/calc_pdx_metrics.py +++ b/build/utils/calc_pdx_metrics.py @@ -184,12 +184,7 @@ def AUC(time, volume, time_normalize=True): dict: Dictionary containing the AUC value. 
""" auc = trapz_auc(time, volume) - #print('at line 187') - #print(time.shape) - #print(time.dtype) - #print(np.max(time.astype(int))) - #print('auc is : ') - #print(auc) + #print(time) if time_normalize: auc = auc/np.max(time) return {"metric": "auc", "value": auc, 'time':np.max(time)} @@ -275,15 +270,10 @@ def lmm(time, volume, treatment, drug_name): raise ValueError("These columns must be present: 'model_id', 'volume', 'time', 'exp_type'") data['log_volume'] = np.log(data['volume']) - #print('drug name is ' + drug_name) - data['exp_type'] = data['exp_type'].astype('category') - data['exp_type']=pd.Categorical(data['exp_type'],categories = ['control',drug_name], ordered=True) - #print(data) - #print(data['exp_type'].cat.categories) + # Define the formula for mixed linear model formula = 'log_volume ~ time*exp_type' - #print(data['exp_type'].cat.categories) # Fit the model model = mixedlm(formula, data, groups=data['model_id']) fit = model.fit() @@ -294,7 +284,6 @@ def lmm(time, volume, treatment, drug_name): # time_coef_value = fit.params['time'] #print(fit.params) i_coef_value = fit.params['time:exp_type[T.'+drug_name+']'] - #i_coef_value = fit.params['time:exp_type['+drug_name+']'] # else: # coef_value = None # Handle the case when the interaction term is not present @@ -312,8 +301,6 @@ def main(): parser.add_argument('curvefile') parser.add_argument('--drugfile') parser.add_argument('--outprefix',default='/tmp/') - parser.add_argument('--study') - parser.add_argument('--source') args = parser.parse_args() @@ -327,21 +314,20 @@ def main(): expsing = expsing.dropna() # source improve_sample_id improve_drug_id study time time_unit dose_response_metric dose_response_value - if combos.shape[0]> 0: - combos[['drug1','drug2']]=combos['drug'].str.split('+',expand=True) - - combos = combos.rename({'metric':'drug_combination_metric','value':'drug_combination_value','sample':'improve_sample_id'},axis=1).dropna() - - expcomb = 
combos.rename({'drug1':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_1'},axis=1)[['improve_drug_1','drug2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']] - expcomb = expcomb.rename({'drug2':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_2'},axis=1)[['improve_drug_1','improve_drug_2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']] - expcomb[['source']]=args.source - expcomb[['study']]=args.study - expcomb.to_csv(args.outprefix+'_combinations.tsv',index=False, sep="\t") - - expsing[['source']]=args.source - expsing[['study']]=args.study + + combos[['drug1','drug2']]=combos.drug.str.split('+',expand=True) + combos = combos.rename({'metric':'drug_combination_metric','value':'drug_combination_value','sample':'improve_sample_id'},axis=1).dropna() + + expcomb = combos.rename({'drug1':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_1'},axis=1)[['improve_drug_1','drug2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']] + expcomb = expcomb.rename({'drug2':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_2'},axis=1)[['improve_drug_1','improve_drug_2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']] + + expcomb[['source']]='Synapse' + expcomb[['study']]='MPNST PDX in vivo' + + expsing[['source']]='Synapse' + expsing[['study']]='MPNST PDX in vivo' expsing.to_csv(args.outprefix+'_experiments.tsv',index=False, sep="\t") - #expcomb.to_csv(args.outprefix+'_combinations.tsv',index=False, sep="\t") + expcomb.to_csv(args.outprefix+'_combinations.tsv',index=False, sep="\t") @@ -355,25 +341,21 @@ def get_drug_stats(df, control='control'): for name, group in tqdm(groups): # Each group contains multiple 
treatments and a control drugs = set(group.treatment) - set([control]) - #print('line 355') - #print(name[0]) - #print(drugs) + print(name[0]) + print(drugs) mod = list(set(group.model_id))[0] ctl_data = group[group.treatment == control] ctl_time = np.array(ctl_data.time) ctl_volume = np.array(ctl_data.volume) - if (ctl_volume.shape[0] < 2): - continue + ctl_auc = AUC(ctl_time, ctl_volume) for d in drugs: - #print('is our drug a string or dict?') - #print(str(d)) - d_data = group[group.treatment == str(d)] + print(d) + d_data = group[group.treatment == d] treat_time = np.array(d_data.time) treat_volume = np.array(d_data.volume) - if (treat_volume.shape[0] < 2): - continue + # Get ABC for group treat_auc = AUC(treat_time, treat_volume) treat_abc = ABC(ctl_time, ctl_volume, treat_time, treat_volume) @@ -386,7 +368,6 @@ def get_drug_stats(df, control='control'): #llm comb = pd.concat([ctl_data, d_data]) - #print(comb) lmm_res = lmm(comb.time, comb.volume, comb.treatment, d) lmm_res.update({'sample': mod, 'drug': d, 'time': np.max(treat_time), 'time_unit': 'days'}) if '+' in d: diff --git a/build/utils/get_copy_call.py b/build/utils/get_copy_call.py deleted file mode 100644 index f2b3864c..00000000 --- a/build/utils/get_copy_call.py +++ /dev/null @@ -1,22 +0,0 @@ -def get_copy_call(a): - """ - Heler Function - Determine copy call for a value. - """ - - if a is None: - return float('nan') - - if math.isnan(a): - return float('nan') - - a_val = math.log2(float(a)+0.000001) - if a_val < 0.5210507: - return 'deep del' - elif a_val < 0.7311832: - return 'het loss' - elif a_val < 1.214125: - return 'diploid' - elif a_val < 1.422233: - return 'gain' - else: - return 'amp' From b958e83a98753c64d2b67e26f47ce95b57743d8c Mon Sep 17 00:00:00 2001 From: Jeremy Date: Thu, 24 Jul 2025 09:34:19 -0700 Subject: [PATCH 17/20] Revert "Made update in response to merge from novartis branch" This reverts commit e8119697d121108a50ae75fa524cfce2a84cfa09. 
--- build/mpnst/03_get_experiments.R | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/build/mpnst/03_get_experiments.R b/build/mpnst/03_get_experiments.R index 5d1a8c4c..db7beb25 100644 --- a/build/mpnst/03_get_experiments.R +++ b/build/mpnst/03_get_experiments.R @@ -239,15 +239,13 @@ fwrite(pdx_data, file.path("/tmp", paste0(out_prefix, "_pdx_curve_data.tsv")), s message("Wrote PDX curve data") - system(sprintf( - "/opt/venv/bin/python calc_pdx_metrics.py %s --drugfile %s --outprefix %s --source 'NF Data Portal' --study 'MPNST PDX'", - paste0("/tmp/", out_prefix, "_pdx_curve_data.tsv"), - drugfile, - paste0("/tmp/", out_prefix, "_pdx") + "/opt/venv/bin/python calc_pdx_metrics.py %s --drugfile %s --outprefix %s", + paste0("/tmp/", out_prefix, "_pdx_curve_data.tsv"), + drugfile, + paste0("/tmp/", out_prefix, "_pdx") )) - message("Wrote PDX experiments to ", "/tmp/", out_prefix, "_pdx_experiments.tsv and combinations") From fca76bdd811bc5df95486ec07f85b511209e799f Mon Sep 17 00:00:00 2001 From: Jeremy Date: Thu, 24 Jul 2025 09:36:35 -0700 Subject: [PATCH 18/20] Fixed git issue. 
Adding pdx update from novartis and local change --- build/mpnst/03_get_experiments.R | 10 +++--- build/utils/calc_pdx_metrics.py | 61 +++++++++++++++++++++----------- 2 files changed, 46 insertions(+), 25 deletions(-) diff --git a/build/mpnst/03_get_experiments.R b/build/mpnst/03_get_experiments.R index db7beb25..a430cae8 100644 --- a/build/mpnst/03_get_experiments.R +++ b/build/mpnst/03_get_experiments.R @@ -240,12 +240,14 @@ fwrite(pdx_data, file.path("/tmp", paste0(out_prefix, "_pdx_curve_data.tsv")), s message("Wrote PDX curve data") system(sprintf( - "/opt/venv/bin/python calc_pdx_metrics.py %s --drugfile %s --outprefix %s", - paste0("/tmp/", out_prefix, "_pdx_curve_data.tsv"), - drugfile, - paste0("/tmp/", out_prefix, "_pdx") + "/opt/venv/bin/python calc_pdx_metrics.py %s --drugfile %s --outprefix %s --source 'NF Data Portal' --study 'MPNST PDX'", + paste0("/tmp/", out_prefix, "_pdx_curve_data.tsv"), + drugfile, + paste0("/tmp/", out_prefix, "_pdx") )) + + message("Wrote PDX experiments to ", "/tmp/", out_prefix, "_pdx_experiments.tsv and combinations") diff --git a/build/utils/calc_pdx_metrics.py b/build/utils/calc_pdx_metrics.py index e0f4c05a..b25e4f1e 100755 --- a/build/utils/calc_pdx_metrics.py +++ b/build/utils/calc_pdx_metrics.py @@ -184,7 +184,12 @@ def AUC(time, volume, time_normalize=True): dict: Dictionary containing the AUC value. 
""" auc = trapz_auc(time, volume) - #print(time) + #print('at line 187') + #print(time.shape) + #print(time.dtype) + #print(np.max(time.astype(int))) + #print('auc is : ') + #print(auc) if time_normalize: auc = auc/np.max(time) return {"metric": "auc", "value": auc, 'time':np.max(time)} @@ -270,10 +275,15 @@ def lmm(time, volume, treatment, drug_name): raise ValueError("These columns must be present: 'model_id', 'volume', 'time', 'exp_type'") data['log_volume'] = np.log(data['volume']) - + #print('drug name is ' + drug_name) + data['exp_type'] = data['exp_type'].astype('category') + data['exp_type']=pd.Categorical(data['exp_type'],categories = ['control',drug_name], ordered=True) + #print(data) + #print(data['exp_type'].cat.categories) # Define the formula for mixed linear model formula = 'log_volume ~ time*exp_type' + #print(data['exp_type'].cat.categories) # Fit the model model = mixedlm(formula, data, groups=data['model_id']) fit = model.fit() @@ -284,6 +294,7 @@ def lmm(time, volume, treatment, drug_name): # time_coef_value = fit.params['time'] #print(fit.params) i_coef_value = fit.params['time:exp_type[T.'+drug_name+']'] + #i_coef_value = fit.params['time:exp_type['+drug_name+']'] # else: # coef_value = None # Handle the case when the interaction term is not present @@ -301,6 +312,8 @@ def main(): parser.add_argument('curvefile') parser.add_argument('--drugfile') parser.add_argument('--outprefix',default='/tmp/') + parser.add_argument('--study') + parser.add_argument('--source') args = parser.parse_args() @@ -314,20 +327,21 @@ def main(): expsing = expsing.dropna() # source improve_sample_id improve_drug_id study time time_unit dose_response_metric dose_response_value - - combos[['drug1','drug2']]=combos.drug.str.split('+',expand=True) - combos = combos.rename({'metric':'drug_combination_metric','value':'drug_combination_value','sample':'improve_sample_id'},axis=1).dropna() - - expcomb = 
combos.rename({'drug1':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_1'},axis=1)[['improve_drug_1','drug2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']] - expcomb = expcomb.rename({'drug2':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_2'},axis=1)[['improve_drug_1','improve_drug_2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']] - - expcomb[['source']]='Synapse' - expcomb[['study']]='MPNST PDX in vivo' - - expsing[['source']]='Synapse' - expsing[['study']]='MPNST PDX in vivo' + if combos.shape[0]> 0: + combos[['drug1','drug2']]=combos['drug'].str.split('+',expand=True) + + combos = combos.rename({'metric':'drug_combination_metric','value':'drug_combination_value','sample':'improve_sample_id'},axis=1).dropna() + + expcomb = combos.rename({'drug1':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_1'},axis=1)[['improve_drug_1','drug2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']] + expcomb = expcomb.rename({'drug2':'chem_name'},axis=1).merge(drugs,on='chem_name',how='left').rename({'improve_drug_id':'improve_drug_2'},axis=1)[['improve_drug_1','improve_drug_2','improve_sample_id','time_unit','time','drug_combination_metric','drug_combination_value']] + expcomb[['source']]=args.source + expcomb[['study']]=args.study + expcomb.to_csv(args.outprefix+'_combinations.tsv',index=False, sep="\t") + + expsing[['source']]=args.source + expsing[['study']]=args.study expsing.to_csv(args.outprefix+'_experiments.tsv',index=False, sep="\t") - expcomb.to_csv(args.outprefix+'_combinations.tsv',index=False, sep="\t") + #expcomb.to_csv(args.outprefix+'_combinations.tsv',index=False, sep="\t") @@ -341,21 +355,25 @@ def get_drug_stats(df, control='control'): for name, group in tqdm(groups): # Each 
group contains multiple treatments and a control drugs = set(group.treatment) - set([control]) - print(name[0]) - print(drugs) + #print('line 355') + #print(name[0]) + #print(drugs) mod = list(set(group.model_id))[0] ctl_data = group[group.treatment == control] ctl_time = np.array(ctl_data.time) ctl_volume = np.array(ctl_data.volume) - + if (ctl_volume.shape[0] < 2): + continue ctl_auc = AUC(ctl_time, ctl_volume) for d in drugs: - print(d) - d_data = group[group.treatment == d] + #print('is our drug a string or dict?') + #print(str(d)) + d_data = group[group.treatment == str(d)] treat_time = np.array(d_data.time) treat_volume = np.array(d_data.volume) - + if (treat_volume.shape[0] < 2): + continue # Get ABC for group treat_auc = AUC(treat_time, treat_volume) treat_abc = ABC(ctl_time, ctl_volume, treat_time, treat_volume) @@ -368,6 +386,7 @@ def get_drug_stats(df, control='control'): #llm comb = pd.concat([ctl_data, d_data]) + #print(comb) lmm_res = lmm(comb.time, comb.volume, comb.treatment, d) lmm_res.update({'sample': mod, 'drug': d, 'time': np.max(treat_time), 'time_unit': 'days'}) if '+' in d: From 953370323d3367604f2f30bfecb022fa03ce7c7f Mon Sep 17 00:00:00 2001 From: Jeremy Date: Thu, 24 Jul 2025 16:54:18 -0700 Subject: [PATCH 19/20] All working now --- build/mpnst/build_exp.sh | 1 + build/utils/calc_pdx_metrics.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/build/mpnst/build_exp.sh b/build/mpnst/build_exp.sh index f83096d4..14506cfe 100644 --- a/build/mpnst/build_exp.sh +++ b/build/mpnst/build_exp.sh @@ -5,4 +5,5 @@ trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit echo "Running 03_get_experiments.R with $SYNAPSE_AUTH_TOKEN, $1, and $2." 
Rscript 03_get_experiments.R $SYNAPSE_AUTH_TOKEN $1 $2 mpnst +rm /tmp/mpnst_pdx_experiments.tsv /tmp/mpnst_mt_experiments.tsv /tmp/mpnst_mt_curve_data.tsv /tmp/mpnst_pdx_curve_data.tsv diff --git a/build/utils/calc_pdx_metrics.py b/build/utils/calc_pdx_metrics.py index b25e4f1e..83876dd3 100755 --- a/build/utils/calc_pdx_metrics.py +++ b/build/utils/calc_pdx_metrics.py @@ -415,4 +415,4 @@ def get_drug_stats(df, control='control'): return sing, comb if __name__=='__main__': - main() + main() \ No newline at end of file From 8c2f4c0a15ad4fe36a0a2bb2bb2a9a88189cb3a3 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Fri, 25 Jul 2025 08:55:28 -0700 Subject: [PATCH 20/20] removed cptac --- build/build_all.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/build_all.py b/build/build_all.py index 0bea6583..7004dd74 100644 --- a/build/build_all.py +++ b/build/build_all.py @@ -40,7 +40,7 @@ def main(): parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.") parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands") parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.") - parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,pancpdo,bladderpdo,sarcpdo,liverpdo,mpnst',help='Datasets to process. Defaults to all available.') + parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,pancpdo,bladderpdo,sarcpdo,liverpdo,mpnst',help='Datasets to process. 
Defaults to all available.') parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. This must be a higher version than previously published versions.') parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.') parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.') pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy