-
Notifications
You must be signed in to change notification settings - Fork 3
Open
Description
Currently working on the build process and ran into this issue.
Issues:
- The
genes
variable is used instead ofgenesTable
- The
samples
variable is used instead ofsamplesTable
- The line
mutation_merged[mutation_merged['entrez_id'].isna()]
doesn't properly drop values
Code chunk is located here:
def download_and_format_genomic_mutation(synLoginObject, genesTable, samplesTable):
mutationQuery=synLoginObject.tableQuery("select * from syn61894695")
mutationDF = mutationQuery.asDataFrame()
mutationDF['Sample_ID_Tumor'] = mutationDF['Sample_ID'] + "_Tumor"
# left join with genes table
mutation_merged = mutationDF.merge(genes, left_on='Gene', right_on='gene_symbol', how='left')
# drop null entrez_ids
mutation_merged[mutation_merged['entrez_id'].isna()]
#split gene name to include portion without exon
mutation_merged["Name"] = mutation_merged["Name"].str.split("[ \(|]", expand=True)[0]
# reformat variant classification column to be accepted by linkML and correct
mutation_merged["variant_classification"] =mutation_merged['Canonical_Variant_Classification']
mutation_merged.replace({'variant_classification': "Missense"}, "Missense_Mutation", inplace=True)
mutation_merged.replace({'variant_classification': "Splice_Donor"}, "Splice_Site", inplace=True)
mutation_merged.replace({'variant_classification': "Splice_Acceptor"}, "Splice_Site", inplace=True)
mutation_merged.replace({'variant_classification': "Nonsense"}, "Nonsense_Mutation", inplace=True)
mutation_merged.replace({'variant_classification': "intron"}, "Intron", inplace=True)
mutation_merged.replace({'variant_classification': "synonymous"}, "Silent", inplace=True)
mutation_merged.replace({'variant_classification': "Inframe_Del"}, "In_Frame_Del", inplace=True)
mutation_merged.replace({'variant_classification': "5_prime_UTR"}, "5'UTR", inplace=True)
mutation_merged.replace({'variant_classification': "Frameshift"}, "Frameshift_Variant", inplace=True)
mutation_merged.replace({'variant_classification': "intergenic_variant"}, "Silent", inplace=True)
mutation_merged_select = mutation_merged[['entrez_id', 'Sample_ID_Tumor', 'Name', 'variant_classification']]
#merge with improve_ids
samples['other_id_no_dash'] = samples['other_id'].str.replace("-2", "_2")
mutation_merged_2 = mutation_merged_select.merge(samples, left_on='Sample_ID_Tumor', right_on='other_id_no_dash', how='left')
# select desired columns - entrez_id, improve_sample_id, mutation, variant_classificaton, source, study
mutation_merged_2['other_id_source'] = "Synapse"
mutation_merged_2['study'] = "Landscape of Sarcoma"
mutationData = mutation_merged_2[['entrez_id', 'Name', 'variant_classification', 'improve_sample_id', 'study']]
mutationData =mutationData.rename({"Name": "mutation"}, axis=1)
# drop duplicates
mutationData = mutationData.drop_duplicates()
# make sure entrez_id is in integer format
mutationData['entrez_id'] = mutationData['entrez_id'].astype(int)
return mutationData
Error received:
Traceback (most recent call last):
File "/usr/src/app/01_createSarcPDOOmicsFiles.py", line 115, in <module>
download_and_format_genomic_mutation(synObject, genes, samples).to_csv('/tmp/sarcpdo_mutations.csv', index=False)
File "/usr/src/app/01_createSarcPDOOmicsFiles.py", line 85, in download_and_format_genomic_mutation
mutationData['entrez_id'] = mutationData['entrez_id'].astype(int)
File "/usr/local/lib/python3.9/site-packages/pandas/core/generic.py", line 6662, in astype
new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
File "/usr/local/lib/python3.9/site-packages/pandas/core/internals/managers.py", line 430, in astype
return self.apply(
File "/usr/local/lib/python3.9/site-packages/pandas/core/internals/managers.py", line 363, in apply
applied = getattr(b, f)(**kwargs)
File "/usr/local/lib/python3.9/site-packages/pandas/core/internals/blocks.py", line 784, in astype
new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
File "/usr/local/lib/python3.9/site-packages/pandas/core/dtypes/astype.py", line 237, in astype_array_safe
new_values = astype_array(values, dtype, copy=copy)
File "/usr/local/lib/python3.9/site-packages/pandas/core/dtypes/astype.py", line 182, in astype_array
values = _astype_nansafe(values, dtype, copy=copy)
File "/usr/local/lib/python3.9/site-packages/pandas/core/dtypes/astype.py", line 101, in _astype_nansafe
return _astype_float_to_int_nansafe(arr, dtype, copy)
File "/usr/local/lib/python3.9/site-packages/pandas/core/dtypes/astype.py", line 145, in _astype_float_to_int_nansafe
raise IntCastingNaNError(
pandas.errors.IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer
in main
Metadata
Metadata
Assignees
Labels
Type
Projects
Status
No status