Skip to content

Sarcpdo Mutations Failing #431

@jjacobson95

Description

@jjacobson95

Currently working on the build process and ran into this issue.

Issues:

  • The genes variable is used instead of genesTable
  • The samples variable is used instead of samplesTable
  • The line mutation_merged[mutation_merged['entrez_id'].isna()] doesn't properly drop values

Code chunk is located here:

def download_and_format_genomic_mutation(synLoginObject, genesTable, samplesTable):
    mutationQuery=synLoginObject.tableQuery("select * from syn61894695")
    mutationDF = mutationQuery.asDataFrame()
    mutationDF['Sample_ID_Tumor'] = mutationDF['Sample_ID'] + "_Tumor"
    # left join with genes table 
    mutation_merged = mutationDF.merge(genes, left_on='Gene', right_on='gene_symbol', how='left')
   # drop null entrez_ids
    mutation_merged[mutation_merged['entrez_id'].isna()]
    #split gene name to include portion without exon
    mutation_merged["Name"] = mutation_merged["Name"].str.split("[ \(|]", expand=True)[0]
    # reformat variant classification column to be accepted by linkML and correct
    mutation_merged["variant_classification"] =mutation_merged['Canonical_Variant_Classification']

    
    mutation_merged.replace({'variant_classification': "Missense"}, "Missense_Mutation", inplace=True)
    
    mutation_merged.replace({'variant_classification': "Splice_Donor"}, "Splice_Site", inplace=True)
    mutation_merged.replace({'variant_classification': "Splice_Acceptor"}, "Splice_Site", inplace=True)
    mutation_merged.replace({'variant_classification': "Nonsense"}, "Nonsense_Mutation", inplace=True)
    mutation_merged.replace({'variant_classification': "intron"}, "Intron", inplace=True)
    mutation_merged.replace({'variant_classification': "synonymous"}, "Silent", inplace=True)
    mutation_merged.replace({'variant_classification': "Inframe_Del"}, "In_Frame_Del", inplace=True)
    mutation_merged.replace({'variant_classification': "5_prime_UTR"}, "5'UTR", inplace=True)
    mutation_merged.replace({'variant_classification': "Frameshift"}, "Frameshift_Variant", inplace=True)
    mutation_merged.replace({'variant_classification': "intergenic_variant"}, "Silent", inplace=True)

    mutation_merged_select = mutation_merged[['entrez_id', 'Sample_ID_Tumor', 'Name', 'variant_classification']]
    #merge with improve_ids 
    samples['other_id_no_dash'] = samples['other_id'].str.replace("-2", "_2")
    mutation_merged_2 = mutation_merged_select.merge(samples, left_on='Sample_ID_Tumor', right_on='other_id_no_dash', how='left')
    # select desired columns - entrez_id, improve_sample_id, mutation, variant_classificaton, source, study
    mutation_merged_2['other_id_source'] = "Synapse"
    mutation_merged_2['study'] = "Landscape of Sarcoma"
    mutationData = mutation_merged_2[['entrez_id',  'Name', 'variant_classification',  'improve_sample_id', 'study']]
    mutationData =mutationData.rename({"Name": "mutation"}, axis=1)
    # drop duplicates
    mutationData = mutationData.drop_duplicates()
    # make sure entrez_id is in integer format
    mutationData['entrez_id'] = mutationData['entrez_id'].astype(int)

    return mutationData

Error received:

Traceback (most recent call last):
  File "/usr/src/app/01_createSarcPDOOmicsFiles.py", line 115, in <module>
    download_and_format_genomic_mutation(synObject, genes, samples).to_csv('/tmp/sarcpdo_mutations.csv', index=False)
  File "/usr/src/app/01_createSarcPDOOmicsFiles.py", line 85, in download_and_format_genomic_mutation
    mutationData['entrez_id'] = mutationData['entrez_id'].astype(int)
  File "/usr/local/lib/python3.9/site-packages/pandas/core/generic.py", line 6662, in astype
    new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
  File "/usr/local/lib/python3.9/site-packages/pandas/core/internals/managers.py", line 430, in astype
    return self.apply(
  File "/usr/local/lib/python3.9/site-packages/pandas/core/internals/managers.py", line 363, in apply
    applied = getattr(b, f)(**kwargs)
  File "/usr/local/lib/python3.9/site-packages/pandas/core/internals/blocks.py", line 784, in astype
    new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
  File "/usr/local/lib/python3.9/site-packages/pandas/core/dtypes/astype.py", line 237, in astype_array_safe
    new_values = astype_array(values, dtype, copy=copy)
  File "/usr/local/lib/python3.9/site-packages/pandas/core/dtypes/astype.py", line 182, in astype_array
    values = _astype_nansafe(values, dtype, copy=copy)
  File "/usr/local/lib/python3.9/site-packages/pandas/core/dtypes/astype.py", line 101, in _astype_nansafe
    return _astype_float_to_int_nansafe(arr, dtype, copy)
  File "/usr/local/lib/python3.9/site-packages/pandas/core/dtypes/astype.py", line 145, in _astype_float_to_int_nansafe
    raise IntCastingNaNError(
pandas.errors.IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer
in main

Metadata

Metadata

Assignees

Labels

Type

No type

Projects

Status

No status

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions

    pFad - Phonifier reborn

    Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

    Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


    Alternative Proxies:

    Alternative Proxy

    pFad Proxy

    pFad v3 Proxy

    pFad v4 Proxy