7
7
import gzip
8
8
import subprocess
9
9
import math
10
+ import re
10
11
11
12
def get_copy_call (a ):
12
13
"""
@@ -31,7 +32,21 @@ def get_copy_call(a):
31
32
else :
32
33
return 'amp'
33
34
34
- return pd .Series ([get_copy_call (a ) for a in arr ])
35
+
36
# Separator fix: a dot is rewritten only when it sits between two digits,
# so dots elsewhere in an ID are left untouched.
_DOT_BETWEEN_DIGITS = re.compile(r"(?<=\d)\.(?=\d)")

# Ordered passage-suffix rewrites (short GEO form -> long form). Matching is
# case-insensitive; the replacement always emits the canonical casing.
_PASSAGE_ALIASES = (
    (re.compile(r"_orgP(\d+)", re.IGNORECASE), r"_Organoid_P\1"),
    (re.compile(r"_xenoorgP(\d+)", re.IGNORECASE), r"_XenoOrganoid_P\1"),
)


def normalise_id(s):
    """
    Make GEO sample IDs line up with 'other_id' in bladderpdo_samples.csv.

    The following rewrites are applied, in order, to string inputs:

    1. surrounding whitespace is stripped;
    2. a ``.`` between two digits becomes ``_``;
    3. the literal ``_tumor`` (case-sensitive) becomes ``_Parental``;
    4. ``_orgP<n>`` becomes ``_Organoid_P<n>`` (case-insensitive);
    5. ``_xenoorgP<n>`` becomes ``_XenoOrganoid_P<n>`` (case-insensitive).

    Parameters
    ----------
    s : object
        A single cell value, typically a ``str`` sample ID.

    Returns
    -------
    object
        The normalised ID for ``str`` input. Any non-string value (``None``,
        ``NaN``, numbers, ...) is returned unchanged instead of raising, so
        the function is safe to use with ``Series.apply`` on columns that
        contain missing or non-text cells.
    """
    # Missing values are never str instances, so this single guard covers the
    # NaN/None case and also keeps other non-text cells from hitting .strip().
    if not isinstance(s, str):
        return s

    s = s.strip()
    s = _DOT_BETWEEN_DIGITS.sub("_", s)
    s = s.replace("_tumor", "_Parental")  # tumour alias
    for pattern, replacement in _PASSAGE_ALIASES:
        s = pattern.sub(replacement, s)
    return s
48
+
49
+
35
50
36
51
def get_bladder_pdo_transcriptomics (GEO_id_link_table , samples , genes ):
37
52
@@ -40,30 +55,42 @@ def get_bladder_pdo_transcriptomics(GEO_id_link_table, samples, genes):
40
55
transcriptomics = pd .read_csv (transcriptomic_txt , compression = 'gzip' , sep = "\t " )
41
56
subprocess .call (["/usr/bin/Rscript" , "--vanilla" , "obtainGSMidLink.R" ])
42
57
43
- GEO_ids_link = pd .read_csv ("./gsmlinkDf.csv" )
58
+ GEO_ids = pd .read_csv (GEO_id_link_table )
59
+ print (GEO_ids )
44
60
fpkm_totals = transcriptomics .iloc [:, 1 :43 ].sum ()
45
61
transcriptomics .iloc [:, 1 :43 ] = transcriptomics .iloc [:, 1 :43 ].div (fpkm_totals ).mul (1e6 )
46
62
transcriptomics ['ensembl' ] = transcriptomics ['Unnamed: 0' ].str .split ("_" , expand = True )[0 ]
47
63
mapped_df = transcriptomics .merge (genes [['entrez_id' , 'other_id' ]].drop_duplicates (), left_on = 'ensembl' , right_on = 'other_id' , how = 'left' )
48
64
# transform data to long format
65
+ print (mapped_df )
49
66
50
- mapped_df .drop ('other_id' , axis = 1 )
67
+ mapped_df = mapped_df .drop ('other_id' , axis = 1 )
51
68
value_variables = transcriptomics .columns [transcriptomics .columns .str .contains ("M" )]
52
69
melted_txomics = mapped_df .melt (id_vars = "entrez_id" , value_vars = value_variables , var_name = 'sample_name' )
53
70
# use info from GEO to get Sample IDS
54
- txomics_with_GEOid = melted_txomics .merge (GEO_ids_link , how = 'left' , left_on = "sample_name" , right_on = 'RNAid' )
71
+ m1 = melted_txomics .merge (GEO_ids , how = "left" , left_on = "sample_name" , right_on = "RNAid" )
72
+ m1 ["sampleid" ] = m1 ["sampleid" ].apply (normalise_id )
73
+ print (m1 )
74
+ print (m1 .sampleid .unique ())
55
75
# use samplesheet to link sample_ids to improve ids
56
- txomics_with_GEOid ['sampleid' ] = txomics_with_GEOid ['sampleid' ].str .replace ("org" , "Organoid_" )
57
- txomics_with_GEOid ['sampleid' ] = txomics_with_GEOid ['sampleid' ].str .replace ("tumor" , "Tumor" )
58
- txomics_with_improveid = txomics_with_GEOid .merge (samples , left_on = "sampleid" , right_on = "other_id" , how = "left" )
59
- final_transcriptomics = txomics_with_improveid [['entrez_id' , 'value' , 'improve_sample_id' ]]
60
- final_transcriptomics ['source' ] = "Gene Expression Omnibus"
61
- final_transcriptomics ['study' ] = "Lee etal 2018 Bladder PDOs"
62
- final_transcriptomics .rename ({'value' : 'transcriptomics' })
63
- # remove duplicates
64
- toreturn = final_transcriptomics .drop_duplicates ()
65
-
66
- return toreturn
76
+ tx_with_ids = m1 .merge (
77
+ samples , left_on = "sampleid" , right_on = "other_id" , how = "left"
78
+ )
79
+ print (tx_with_ids )
80
+
81
+ final_tx = (
82
+ tx_with_ids [["entrez_id" , "value" , "improve_sample_id" ]]
83
+ .drop_duplicates ()
84
+ .assign (source = "Gene Expression Omnibus" ,
85
+ study = "Lee et al. 2018 Bladder PDOs" )
86
+ )
87
+ final_tx .rename (columns = {"value" :"transcriptomics" },inplace = True )
88
+ final_tx = final_tx .drop_duplicates ()
89
+ final_tx = final_tx .dropna (subset = ["entrez_id" ])
90
+ final_tx ["improve_sample_id" ] = final_tx ["improve_sample_id" ].astype (int )
91
+ final_tx ["entrez_id" ] = final_tx ["entrez_id" ].astype (int )
92
+
93
+ return final_tx
67
94
68
95
def get_bladder_pdo_mutations (synObject , samples , genes ):
69
96
print (samples .head )
@@ -74,10 +101,11 @@ def get_bladder_pdo_mutations(synObject, samples, genes):
74
101
selectioncols_mutations = mutations_df [['Entrez_Gene_Id' ,"Variant_Classification" , "Tumor_Sample_Barcode" , "mutation" ]]
75
102
merged_mutations = selectioncols_mutations .merge (samples , left_on = "Tumor_Sample_Barcode" , right_on = "other_id" , how = "left" )
76
103
merged_mutations_renamed = merged_mutations .rename ({"Entrez_Gene_Id" : 'entrez_id' , 'Variant_Classification' : "variant_classification" }, axis = 1 )
77
- print (merged_mutations_renamed .head )
78
104
final_mutations = merged_mutations_renamed [['entrez_id' , "mutation" , "variant_classification" , "improve_sample_id" ]]
79
105
final_mutations ['study' ] = "Lee etal 2018 Bladder PDOs"
80
- print (final_mutations .head )
106
+ final_mutations = final_mutations .dropna (subset = ["entrez_id" ])
107
+ final_mutations ["improve_sample_id" ] = final_mutations ["improve_sample_id" ].astype (int )
108
+ final_mutations ["entrez_id" ] = final_mutations ["entrez_id" ].astype (int )
81
109
return final_mutations
82
110
83
111
def get_bladder_pdo_copynumber (synObject , samples , genes ):
@@ -94,7 +122,9 @@ def get_bladder_pdo_copynumber(synObject, samples, genes):
94
122
final_copynumber = copynumber_with_correct_colnames [['entrez_id' , 'improve_sample_id' , 'copy_number' , 'copy_call' ]]
95
123
final_copynumber ['source' ] = "Synapse"
96
124
final_copynumber ['study' ] = "Lee etal 2018 Bladder PDOs"
97
-
125
+ final_copynumber = final_copynumber .dropna (subset = ["entrez_id" ])
126
+ final_copynumber ["improve_sample_id" ] = final_copynumber ["improve_sample_id" ].astype (int )
127
+ final_copynumber ["entrez_id" ] = final_copynumber ["entrez_id" ].astype (int )
98
128
return final_copynumber
99
129
100
130
@@ -108,7 +138,7 @@ def get_bladder_pdo_copynumber(synObject, samples, genes):
108
138
parser .add_argument ('-c' , '--copy' , help = 'Flag to capture copy number data' , action = 'store_true' , default = False )
109
139
parser .add_argument ('-m' , '--mutation' , help = 'Flag to capture mutation data' , action = 'store_true' , default = False )
110
140
parser .add_argument ('-e' , '--expression' , help = 'Flag to capture transcriptomic data' , action = 'store_true' , default = False )
111
- parser .add_argument ('-i' , '--geolink' , help = ".csv file that is the output of 'CNV-segfile-anotation.R" )
141
+ parser .add_argument ('-i' , '--geolink' , default = "./gsmlinkDf.csv" , help = ".csv file that is the output of 'CNV-segfile-anotation.R" )
112
142
parser .add_argument ('-t' , '--token' , help = 'Synapse token' )
113
143
114
144
args = parser .parse_args ()
@@ -129,4 +159,4 @@ def get_bladder_pdo_copynumber(synObject, samples, genes):
129
159
get_bladder_pdo_mutations (synObject , samples , genes ).to_csv ('/tmp/bladderpdo_mutations.csv' , index = False )
130
160
131
161
if args .copy :
132
- get_bladder_pdo_copynumber (synObject , samples , genes ).to_csv ("/tmp/bladderpdo_copynumber .csv" , index = False )
162
+ get_bladder_pdo_copynumber (synObject , samples , genes ).to_csv ("/tmp/bladderpdo_copy_number .csv" , index = False )
0 commit comments