
Commit e237db5

Merge pull request #365 from PNNL-CompBio/bladder_pdo
BladderPDO and Schema Fixes
2 parents 943a399 + 5bb3e62 commit e237db5

File tree

7 files changed: +7671 −6632 lines changed

build/bladderpdo/00_createBladderPDOSampleFile.py

Lines changed: 74 additions & 5 deletions
@@ -1,12 +1,49 @@
+#!/usr/bin/env python3
 import synapseclient
 import pandas as pd
 import numpy as np
 import argparse
 import os
+import re
+import subprocess
 
+# Helper functions
+def _clean_geo_id(s):
+    """
+    Normalise GEO sample IDs so they match Synapse naming.
+    • 11.2 → 11_2
+    • **_Tumor → *_Parental
+    • *_orgP2 → *_Organoid_P2
+    • *_xenoorgP4 → *_XenoOrganoid_P4
+    """
+    s = s.strip()
+    s = re.sub(r"(?<=\d)\.(?=\d)", "_", s)  # dots between digits
+    s = s.replace("_tumor", "_Parental")  # tumour alias
+    # lower-case 'orgP' / 'xenoorgP' fix
+    s = re.sub(r"_(org)P(\d+)", r"_Organoid_P\2", s, flags=re.IGNORECASE)
+    s = re.sub(r"_(xenoorg)P(\d+)", r"_XenoOrganoid_P\2", s, flags=re.IGNORECASE)
+    return s
 
+
+def _parse_model_type(sample_id):
+    """Derive model_type from Sample ID."""
+    low = sample_id.lower()
+    if "_xenoorganoid" in low:
+        return "xenograft derived organoid"
+    if "_organoid" in low:
+        return "organoid"
+    if "_xenograft" in low:
+        return "patient derived xenograft"
+    if "_parental" in low:
+        return "tumor"
+    return "unknown"
+
+#Generate Samples Data
 def get_bladder_pdo_samples(synLoginObject, maxval):
 
+
+    #Part 1: Get Data from Synapse
+
     # download from Synapse..
     samples_syn = synLoginObject.get('syn64765486')
     # and read the file
@@ -19,7 +56,43 @@ def get_bladder_pdo_samples(synLoginObject, maxval):
     samples.loc[:,['other_id_source']] = 'Synapse'
     samples.loc[:,['other_names'] ]= ''
     samples.loc[:,['cancer_type']]=samples['cancer_type'].str.lower()
-    samples.loc[:, ['model_type']] = samples['model_type'].str.lower()
+    samples["model_type"] = samples["other_id"].apply(_parse_model_type)
+
+    #Part 2: Get Data from Geo
+    subprocess.call (["Rscript", "--vanilla", "obtainGSMidLink.R"])
+    GEO_ids_link = "./gsmlinkDf.csv"
+
+    geo_map = pd.read_csv(GEO_ids_link)
+    geo_ids = geo_map["sampleid"].dropna().map(_clean_geo_id).unique()
+    missing = sorted(set(geo_ids) - set(samples["other_id"]))
+
+    if missing:
+        print(f"Adding {len(missing)} GEO samples not in Synapse sheet")
+
+    rows = []
+    for oid in missing:
+        common = oid.split("_")[0]
+        ctype = (
+            samples.loc[samples["common_name"] == common, "cancer_type"]
+            .iloc[0]
+            if (samples["common_name"] == common).any()
+            else "bladder urothelial carcinoma"
+        )
+        rows.append(
+            {
+                "other_id": oid,
+                "common_name": common,
+                "cancer_type": ctype,
+                "model_type": _parse_model_type(oid),
+                "species": "Homo sapiens(Human)",
+                "other_id_source": "GEO",
+                "other_names": "",
+            }
+        )
+    if rows:
+        samples = pd.concat([samples, pd.DataFrame(rows)], ignore_index=True)
+
+    samples = samples.sort_values("other_id").reset_index(drop=True)
 
     samples['improve_sample_id'] = range(maxval+1, maxval+1+samples.shape[0])
 
@@ -29,11 +102,8 @@ def get_bladder_pdo_samples(synLoginObject, maxval):
 if __name__ == "__main__":
 
     parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of sample files for the Sarcoma PDO project into a single samplesheet")
-
     parser.add_argument('-t', '--token', type=str, help='Synapse Token')
-
     parser.add_argument("-p", '--prevSamples', nargs="?", type=str, default ="", const = "", help = "Use this to provide previous sample file, will run sample file generation")
-
     args = parser.parse_args()
 
     print("Logging into Synapse")
@@ -46,5 +116,4 @@ def get_bladder_pdo_samples(synLoginObject, maxval):
     prev_max_improve_id = 0
 
     bladder_pdo_samples = get_bladder_pdo_samples(synObject, prev_max_improve_id)
-
    bladder_pdo_samples.to_csv("/tmp/bladderpdo_samples.csv", index=False)
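
For illustration only, a minimal sketch of what the two new helpers produce, assuming _clean_geo_id and _parse_model_type from the file above are in scope; the input IDs are hypothetical and chosen solely to exercise the patterns listed in the _clean_geo_id docstring:

# Hypothetical GEO-style IDs, not actual dataset entries.
examples = ["11.2_orgP2", "11.2_xenoorgP4", "7_tumor"]
for raw in examples:
    cleaned = _clean_geo_id(raw)
    print(raw, "->", cleaned, "|", _parse_model_type(cleaned))
# Expected mappings:
#   11.2_orgP2     -> 11_2_Organoid_P2      (model_type: organoid)
#   11.2_xenoorgP4 -> 11_2_XenoOrganoid_P4  (model_type: xenograft derived organoid)
#   7_tumor        -> 7_Parental            (model_type: tumor)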

build/bladderpdo/01_createBladderPDOOmicsFiles.py

Lines changed: 50 additions & 20 deletions
@@ -7,6 +7,7 @@
 import gzip
 import subprocess
 import math
+import re
 
 def get_copy_call(a):
     """
@@ -31,7 +32,21 @@ def get_copy_call(a):
     else:
         return 'amp'
 
-    return pd.Series([get_copy_call(a) for a in arr])
+
+def normalise_id(s):
+    """
+    Make GEO sample IDs line up with 'other_id' in bladderpdo_samples.csv.
+    """
+    if pd.isna(s):
+        return s
+    s = s.strip()
+    s = re.sub(r"(?<=\d)\.(?=\d)", "_", s)  # dots → underscore
+    s = s.replace("_tumor", "_Parental")  # tumour alias
+    s = re.sub(r"_(org)P(\d+)", r"_Organoid_P\2", s, flags=re.IGNORECASE)
+    s = re.sub(r"_(xenoorg)P(\d+)", r"_XenoOrganoid_P\2", s, flags=re.IGNORECASE)
+    return s
+
+
 
 def get_bladder_pdo_transcriptomics(GEO_id_link_table, samples, genes):
 
@@ -40,30 +55,42 @@ def get_bladder_pdo_transcriptomics(GEO_id_link_table, samples, genes):
     transcriptomics = pd.read_csv(transcriptomic_txt, compression='gzip', sep="\t")
     subprocess.call (["/usr/bin/Rscript", "--vanilla", "obtainGSMidLink.R"])
 
-    GEO_ids_link = pd.read_csv("./gsmlinkDf.csv")
+    GEO_ids = pd.read_csv(GEO_id_link_table)
+    print(GEO_ids)
     fpkm_totals = transcriptomics.iloc[:, 1:43].sum()
     transcriptomics.iloc[:, 1:43] = transcriptomics.iloc[:, 1:43].div(fpkm_totals).mul(1e6)
     transcriptomics['ensembl'] = transcriptomics['Unnamed: 0'].str.split("_", expand=True)[0]
     mapped_df = transcriptomics.merge(genes[['entrez_id', 'other_id']].drop_duplicates(), left_on='ensembl', right_on='other_id', how='left')
     # transform data to long format
+    print(mapped_df)
 
-    mapped_df.drop('other_id', axis=1)
+    mapped_df = mapped_df.drop('other_id', axis=1)
     value_variables = transcriptomics.columns[transcriptomics.columns.str.contains("M")]
     melted_txomics = mapped_df.melt(id_vars = "entrez_id", value_vars = value_variables, var_name='sample_name')
     # use info from GEO to get Sample IDS
-    txomics_with_GEOid = melted_txomics.merge(GEO_ids_link, how = 'left', left_on = "sample_name", right_on='RNAid')
+    m1 = melted_txomics.merge(GEO_ids, how="left", left_on="sample_name", right_on="RNAid")
+    m1["sampleid"] = m1["sampleid"].apply(normalise_id)
+    print(m1)
+    print(m1.sampleid.unique())
     # use samplesheet to link sample_ids to improve ids
-    txomics_with_GEOid['sampleid'] = txomics_with_GEOid['sampleid'].str.replace("org", "Organoid_")
-    txomics_with_GEOid['sampleid'] = txomics_with_GEOid['sampleid'].str.replace("tumor", "Tumor")
-    txomics_with_improveid = txomics_with_GEOid.merge(samples, left_on="sampleid", right_on="other_id", how="left")
-    final_transcriptomics = txomics_with_improveid[['entrez_id', 'value', 'improve_sample_id']]
-    final_transcriptomics['source'] = "Gene Expression Omnibus"
-    final_transcriptomics['study'] = "Lee etal 2018 Bladder PDOs"
-    final_transcriptomics.rename({'value' : 'transcriptomics' })
-    # remove duplicates
-    toreturn = final_transcriptomics.drop_duplicates()
-
-    return toreturn
+    tx_with_ids = m1.merge(
+        samples, left_on="sampleid", right_on="other_id", how="left"
+    )
+    print(tx_with_ids)
+
+    final_tx = (
+        tx_with_ids[["entrez_id", "value", "improve_sample_id"]]
+        .drop_duplicates()
+        .assign(source="Gene Expression Omnibus",
+                study="Lee et al. 2018 Bladder PDOs")
+    )
+    final_tx.rename(columns= {"value":"transcriptomics"},inplace=True)
+    final_tx = final_tx.drop_duplicates()
+    final_tx = final_tx.dropna(subset=["entrez_id"])
+    final_tx["improve_sample_id"] = final_tx["improve_sample_id"].astype(int)
+    final_tx["entrez_id"] = final_tx["entrez_id"].astype(int)
+
+    return final_tx
 
 def get_bladder_pdo_mutations(synObject, samples, genes):
     print(samples.head)
@@ -74,10 +101,11 @@ def get_bladder_pdo_mutations(synObject, samples, genes):
     selectioncols_mutations = mutations_df[['Entrez_Gene_Id',"Variant_Classification", "Tumor_Sample_Barcode", "mutation"]]
     merged_mutations = selectioncols_mutations.merge(samples, left_on="Tumor_Sample_Barcode", right_on="other_id", how="left")
     merged_mutations_renamed = merged_mutations.rename({"Entrez_Gene_Id" : 'entrez_id', 'Variant_Classification' : "variant_classification"}, axis=1)
-    print(merged_mutations_renamed.head)
     final_mutations = merged_mutations_renamed[['entrez_id', "mutation", "variant_classification", "improve_sample_id"]]
     final_mutations['study'] = "Lee etal 2018 Bladder PDOs"
-    print(final_mutations.head)
+    final_mutations = final_mutations.dropna(subset=["entrez_id"])
+    final_mutations["improve_sample_id"] = final_mutations["improve_sample_id"].astype(int)
+    final_mutations["entrez_id"] = final_mutations["entrez_id"].astype(int)
     return final_mutations
 
 def get_bladder_pdo_copynumber(synObject, samples, genes):
@@ -94,7 +122,9 @@ def get_bladder_pdo_copynumber(synObject, samples, genes):
     final_copynumber = copynumber_with_correct_colnames[['entrez_id', 'improve_sample_id', 'copy_number', 'copy_call']]
     final_copynumber['source'] = "Synapse"
     final_copynumber['study'] = "Lee etal 2018 Bladder PDOs"
-
+    final_copynumber = final_copynumber.dropna(subset=["entrez_id"])
+    final_copynumber["improve_sample_id"] = final_copynumber["improve_sample_id"].astype(int)
+    final_copynumber["entrez_id"] = final_copynumber["entrez_id"].astype(int)
     return final_copynumber
 
 
@@ -108,7 +138,7 @@ def get_bladder_pdo_copynumber(synObject, samples, genes):
     parser.add_argument('-c', '--copy', help='Flag to capture copy number data', action='store_true', default=False)
     parser.add_argument('-m', '--mutation', help='Flag to capture mutation data', action='store_true', default=False)
     parser.add_argument('-e', '--expression', help='Flag to capture transcriptomic data', action='store_true', default=False)
-    parser.add_argument('-i', '--geolink', help=".csv file that is the output of 'CNV-segfile-anotation.R")
+    parser.add_argument('-i', '--geolink', default = "./gsmlinkDf.csv", help=".csv file that is the output of 'CNV-segfile-anotation.R")
    parser.add_argument('-t', '--token', help='Synapse token')
 
     args = parser.parse_args()
@@ -129,4 +159,4 @@ def get_bladder_pdo_copynumber(synObject, samples, genes):
         get_bladder_pdo_mutations(synObject, samples, genes).to_csv('/tmp/bladderpdo_mutations.csv', index=False)
 
     if args.copy:
-        get_bladder_pdo_copynumber(synObject, samples, genes).to_csv("/tmp/bladderpdo_copynumber.csv", index=False)
+        get_bladder_pdo_copynumber(synObject, samples, genes).to_csv("/tmp/bladderpdo_copy_number.csv", index=False)
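
For reference, the per-sample scaling used in get_bladder_pdo_transcriptomics divides each sample column by its column total and multiplies by 1e6, so every sample column sums to one million. A minimal sketch on an invented two-sample matrix (the column names and values below are made up; the real GEO table has roughly 42 sample columns in positions 1:43):

import pandas as pd

# Toy stand-in for the GEO expression table: one gene-ID column plus two sample columns.
toy = pd.DataFrame({
    "Unnamed: 0": ["ENSG000001_A", "ENSG000002_B"],
    "M1": [50.0, 150.0],
    "M2": [10.0, 30.0],
})

totals = toy.iloc[:, 1:].sum()                           # per-sample column totals
toy.iloc[:, 1:] = toy.iloc[:, 1:].div(totals).mul(1e6)   # each sample column now sums to 1e6

print(toy)
# M1 becomes 250000 / 750000 and M2 becomes 250000 / 750000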

build/bladderpdo/build_exp.sh

Lines changed: 3 additions & 1 deletion
@@ -6,4 +6,6 @@ trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit
 echo "Running 04-drug_dosage_and_curves.py with drugfile $2 and curSampleFile $1"
 python3 03_createBladderPDOExperimentFile.py --token $SYNAPSE_AUTH_TOKEN --drugfile $2 --curSampleFile $1 --output /tmp/bladderpdo_doserep.tsv
 
-python3 fit_curve.py --input /tmp/bladderpdo_doserep.tsv --output /tmp/bladderpdo_experiments.tsv
+python3 fit_curve.py --input /tmp/bladderpdo_doserep.tsv --output /tmp/bladderpdo_experiments.tsv
+rm /tmp/bladderpdo_doserep.tsv
+mv /tmp/bladderpdo_experiments.tsv.0 /tmp/bladderpdo_experiments.tsv

build/improve_drug_mapping.json

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
   "metadata": {
     "builds": [
       {
-        "build_date": "01_24_25",
+        "build_date": "2025-01-24",
         "version": "2.0.0"
       },
       {

0 commit comments
