Skip to content

Commit 5bb3e62

Browse files
authored
Merge branch 'main' into bladder_pdo
2 parents f2590a8 + b96e135 commit 5bb3e62

23 files changed

+1340
-171
lines changed

build/beatAML/GetBeatAML.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,9 @@ def map_and_combine(df, data_type, entrez_map_file, improve_map_file, map_file=N
424424
mapped_df.rename(columns={"hgvsc": "mutation"}, inplace=True)
425425
mapped_df.rename(columns={"labId": "sample_id"}, inplace=True)
426426
mapped_df.rename(columns={"Entrez_Gene_Id": "entrez_id"}, inplace=True)
427+
428+
#remove (gene) information preceding the colon - this formats it like other datasets.
429+
mapped_df["mutation"] = mapped_df["mutation"].astype(str).str.split(":", n=1).str[-1]
427430

428431
variant_mapping = {
429432
'frameshift_variant': 'Frameshift_Variant',

build/build_dataset.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@ def process_docker(dataset,validate):
4646
'cptac': ['cptac'],
4747
'sarcpdo': ['sarcpdo'],
4848
'genes': ['genes'],
49-
'upload': ['upload'],
49+
'upload': ['upload'],
50+
'crcpdo': ['crcpdo'],
5051
'bladderpdo': ['bladderpdo']
5152
}
5253

@@ -129,7 +130,8 @@ def process_omics(executor, dataset, should_continue):
129130
'mpnstpdx':['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
130131
'sarcpdo': ['mutations', 'transcriptomics'],
131132
'pancpdo': ['transcriptomics'],
132-
'bladderpdo': ['copy_number', 'mutations', 'transcriptomics']
133+
'bladderpdo': ['copy_number', 'mutations', 'transcriptomics'],
134+
'crcpdo':['copy_number', 'mutations', 'transcriptomics']
133135
}
134136

135137
expected_omics = dataset_omics_files.get(dataset, [])

build/crcpdo/01-samples-crcpdo.py

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
import pandas as pd
2+
import numpy as np
3+
import os
4+
import gzip
5+
import requests
6+
import argparse
7+
import synapseclient
8+
9+
###### NOTES ######
10+
# * need to change all paths to paths relevant to docker image
11+
# * add description to parser
12+
# * run functions in ipynb to test they are working
13+
14+
def download_rnaseq(geo_url:str = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE65253&format=file&file=GSE65253%5Fcol%5Ftum%5Forg%5Fmerge%2Ecsv%2Egz", save_path:str = None):
    """
    Download the RNA-seq supplementary file from GEO and save it locally.

    Uses an HTTP GET request (via the requests library) to fetch the file at
    the given GEO URL and writes the response body to save_path. Raises if the
    server returns an HTTP error status, so a failed download is not silently
    written to disk.

    Parameters
    ----------
    geo_url : str
        The GEO URL pointing to the data to be downloaded. Default is from
        https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE65253

    save_path : str
        Local path where the downloaded file will be saved. Must be provided;
        there is no default.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If the GEO server responds with a 4xx/5xx status.
    """
    response = requests.get(geo_url)
    # fail loudly on HTTP errors instead of saving an error page as data
    response.raise_for_status()
    with open(save_path, 'wb') as f:
        f.write(response.content)
def download_sequencing_data(synID:str , save_path:str = None, synToken:str = None):
    """
    Fetch the sequencing dataset from Synapse and return its local file path.

    Logging in requires a Synapse Personal Access Token (a Synapse account is
    needed to create one). See:
    https://help.synapse.org/docs/Managing-Your-Account.2055405596.html#ManagingYourAccount-PersonalAccessTokens

    Parameters
    ----------
    synID : str
        Synapse ID of the dataset to download (e.g. syn64961953 for the
        sequencing dataset).

    save_path : str
        Directory where the downloaded file will be placed.

    synToken : str
        Synapse Personal Access Token used to authenticate.

    Returns
    -------
    str
        Path to the local copy of the downloaded file.
    """
    # authenticate against Synapse with the provided personal access token
    synapse_client = synapseclient.Synapse()
    synapse_client.login(authToken=synToken)

    # fetch the entity; synapseclient writes the file into save_path
    entity = synapse_client.get(entity=synID, downloadLocation=save_path)

    # hand back where the file landed on disk
    return entity.path
def generate_sample_file(sequencing_data_path:str = None, prev_samples_path:str = "") -> pd.DataFrame:
    """
    Create a samples table from the sequencing-data Excel file.

    Sample names are gathered from the recurrent-mutation, somatic-mutation and
    segmented copy-number sheets of Supplementary Table S1
    (https://www.cell.com/cell/fulltext/S0092-8674(15)00373-6#sec-4),
    de-duplicated, annotated (common_name, model_type, cancer_type, ...) and
    assigned improve_sample_id values continuing from the highest id in a
    previous sample sheet, if one is provided. If any other_id in the previous
    sheet also appears in the new sheet, generation is aborted.

    Parameters
    ----------
    sequencing_data_path : str
        Path to the sequencing-data Excel file (Supplementary Table S1).

    prev_samples_path : str
        Path to a previous sample sheet (CSV). Empty string means none.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the combined samples data.
    """
    # reading in sequencing excel file
    sequencing_excel = pd.ExcelFile(open(sequencing_data_path, 'rb'))
    recurrent_mutations = pd.read_excel(sequencing_excel, 'TableS1I_Recurrent mutations') # table with recurrent mutation information
    somatic_mutations = pd.read_excel(sequencing_excel, 'TableS1J-Somatic mutations') # table with somatic mutation information
    copy_num = pd.read_excel(sequencing_excel, 'TableS1D-Segmented_CN')

    # reading in previous sample file, if any
    prev_samples = None
    if prev_samples_path != "":
        prev_samples = pd.read_csv(prev_samples_path)

    # reading in recurrent mutation info; strip the leading token before the first '-'
    recurrent_tumor = pd.DataFrame({'other_id':recurrent_mutations['Tumor_Sample_Barcode'].str.split('-',n = 1,expand=True).iloc[:,1].unique()})
    recurrent_normal = pd.DataFrame({'other_id':recurrent_mutations['Matched_Norm_Sample_Barcode'].str.split('-',n = 1,expand=True).iloc[:,1].unique()})

    # merging somatic organoids too just in case recurrent excludes some
    somatic_tumor = pd.DataFrame({'other_id':somatic_mutations['Tumor_Sample_Barcode'].str.split('-',n = 1,expand=True).iloc[:,1].unique()})
    somatic_normal = pd.DataFrame({'other_id':somatic_mutations['Matched_Norm_Sample_Barcode'].str.split('-',n = 1,expand=True).iloc[:,1].unique()})

    # also merging from segmented CN bc the other two exclude P18 Tumor biopsy
    # regex=False: replace literal dots, not "match any character"
    copy_num_patients = pd.DataFrame({'other_id':copy_num['Sample'].str.split('.',n = 1,expand=True).iloc[:,1].str.replace(".","-",regex=False).unique()})

    samples_df = pd.concat([recurrent_tumor,recurrent_normal, somatic_tumor, somatic_normal,copy_num_patients])

    # formatting the table
    samples_df = samples_df.drop_duplicates('other_id')
    samples_df = samples_df.reset_index()
    samples_df['common_name'] = samples_df['other_id'].str.split('-', n = 1,expand=True).iloc[:,0] + "-"
    samples_df['model_type'] = ""
    for index, row in samples_df.iterrows():
        if "Tumor-Organoid" in samples_df.loc[index, 'other_id']:
            samples_df.loc[index, 'common_name'] = samples_df.loc[index, 'common_name'] + "T-O"
            samples_df.loc[index, 'model_type'] = "organoid"
        if "Tumor-Biopsy" in samples_df.loc[index, 'other_id']:
            samples_df.loc[index, 'common_name'] = samples_df.loc[index, 'common_name'] + "T-B"
            samples_df.loc[index, 'model_type'] = "ex vivo"
        if "Normal-Organoid" in samples_df.loc[index, 'other_id']:
            samples_df.loc[index, 'common_name'] = samples_df.loc[index, 'common_name'] + "N-O"
            samples_df.loc[index, 'model_type'] = "organoid"
    samples_df['other_id_source'] = "vandeWetering_2015"
    samples_df['cancer_type'] = "Colorectal Carcinoma"
    samples_df['species'] = "Homo sapiens (Human)"

    # check other_id doesn't clash with previous sample names.
    # Fix: the original used `prev.values in new.values`, which is numpy
    # array-in-array membership (broadcast comparison), not an id-overlap test.
    if prev_samples is not None:
        if prev_samples.other_id.isin(samples_df.other_id).any():
            print("Duplicate id names detected. Cannot proceed with generating sample sheet until resolved.")
            exit()

    # continue numbering improve_sample_id from the previous sheet, if any
    if prev_samples is None:
        maxval = 0
    else:
        maxval = max(prev_samples.improve_sample_id)
    samples_df['improve_sample_id'] = samples_df.index + maxval + 1 # take index plus 1 to create counter, start from max value
    samples_df = samples_df.drop(columns = 'index')
    return(samples_df)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Download CRC PDO RNA-seq/sequencing data and generate a samples file.')

    parser.add_argument('-D', '--download',action='store_true', default=False, help='Download RNA seq and sequencing data from GEO and supplemental materials from https://www.cell.com/cell/fulltext/S0092-8674(15)00373-6#mmc2')
    parser.add_argument('-t', '--token', type=str, default=None, help='Synapse Token')
    parser.add_argument('-i', '--synapseID', type=str, default="syn64961953", help='SynapseID of data to download')

    parser.add_argument('-s', '--samples', action = 'store_true', help='Only generate samples, requires previous samples',default=False)
    parser.add_argument('-p', '--prevSamples', nargs='?',type=str, default='', const='', help='Use this to provide previous sample file')

    args = parser.parse_args()

    ###########################

    # set only after a successful download; lets us detect -s without -D below
    sequencing_download_path = None

    if args.download:
        if args.token is None:
            # the Synapse sequencing data cannot be fetched without a token
            print("No synapse download token was provided. Cannot download data.")
            exit()
        else:
            print("Downloading Files from Synapse.")
            # Download RNA seq data
            download_rnaseq(save_path = "/tmp/GSE65253_col_tum_org_merge.csv.gz")
            # Download sequencing data
            sequencing_download_path = download_sequencing_data(synID = args.synapseID, synToken = args.token, save_path = "/tmp")

    if args.samples:
        # Fix: previously this branch crashed with NameError when -s was used
        # without -D, because sequencing_download_path was never assigned.
        if sequencing_download_path is None:
            print("Sequencing data has not been downloaded. Rerun with -D (and -t) to download it before generating samples.")
            exit()
        if args.prevSamples is None or args.prevSamples=='':
            print("No previous samples file provided. Starting improve_sample_id from 1. Running sample file generation")
            sample_sheet = generate_sample_file(sequencing_data_path = sequencing_download_path)
        else:
            print("Previous sample sheet {} detected. Running sample file generation and checking for duplicate IDs.".format(args.prevSamples))
            sample_sheet = generate_sample_file(sequencing_data_path = sequencing_download_path, prev_samples_path= args.prevSamples)
        sample_sheet.to_csv("/tmp/crcpdo_samples.csv", index=False)

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy