0% found this document useful (0 votes)
10 views13 pages

solutionsExerciseMaster11 23

The Python scripts provide solutions to bioinformatics exercises involving analyzing and manipulating sequence data in various file formats. Script 11 counts lines or reads in a FASTQ or FASTA file. Script 12 transcribes a DNA FASTA file to RNA. Script 13 calculates GC content in a multi-FASTA file. Script 14 calculates the average sequence length in a multi-FASTA file.

Uploaded by

Huy
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
10 views13 pages

solutionsExerciseMaster11 23

The Python scripts provide solutions to bioinformatics exercises involving analyzing and manipulating sequence data in various file formats. Script 11 counts lines or reads in a FASTQ or FASTA file. Script 12 transcribes a DNA FASTA file to RNA. Script 13 calculates GC content in a multi-FASTA file. Script 14 calculates the average sequence length in a multi-FASTA file.

Uploaded by

Huy
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 13

Suggested solutions exercises 11-23

11.py
Code

#!/usr/bin/python3
"""Count the records in a FASTQ or FASTA file.

Usage: 11.py inputfile

The file type is guessed from the file name: a name containing '.fastq'
or '.fq' is treated as FASTQ, anything else as FASTA.
"""

import sys


def count_records(lines, is_fastq):
    """Return the number of sequence records in *lines*.

    lines    -- iterable of text lines (for example an open file handle)
    is_fastq -- True: every record is assumed to span exactly four lines,
                so the result is the line count divided by four (rounded
                down). False: the input is FASTA and every line starting
                with '>' counts as one record.
    """
    if is_fastq:
        # Integer division avoids the float round trip of int(n / 4).
        return sum(1 for _ in lines) // 4
    return sum(1 for line in lines if line.startswith('>'))


def main():
    """Read the file named on the command line and print its record count."""
    if len(sys.argv) < 2:
        print('Error: Not enough arguments. Usage: 11.py inputfile')
        sys.exit(1)

    filename = sys.argv[1]
    # You can test if a substring is found in a string in a similar way
    # as you do a membership test for a list
    is_fastq = '.fastq' in filename or '.fq' in filename

    with open(filename, 'r') as in_fh:
        print(count_records(in_fh, is_fastq))


if __name__ == '__main__':
    main()

1
12.py
Code

#!/usr/bin/python3
"""Transcribe a DNA FASTA file to RNA.

Usage: 12.py inputfile outputfile

Header lines (starting with '>') are copied unchanged; on every other
line each 't'/'T' is replaced by 'u'/'U'.
"""

# Exercise: Transcribe DNA fasta to RNA
# Author: Martin Basterrechea

import sys

# One translation table handles both cases in a single pass over the line.
DNA_TO_RNA = str.maketrans('tT', 'uU')


def transcribe_line(line):
    """Return *line* without trailing whitespace, transcribed to RNA.

    Header lines are returned as they are. Blank lines come back as an
    empty string instead of raising IndexError on line[0].
    """
    line = line.rstrip()
    if line.startswith('>'):
        return line
    return line.translate(DNA_TO_RNA)


def main():
    """Transcribe the input FASTA file into the output file."""
    if len(sys.argv) < 3:
        print('Error: Not enough arguments. Usage: 12.py inputfile outputfile')
        sys.exit(1)  # non-zero status so callers can detect the failure

    inputname = sys.argv[1]
    outputname = sys.argv[2]

    with open(inputname, 'r') as inf, open(outputname, 'w') as outf:
        for line in inf:
            print(transcribe_line(line), file=outf)


if __name__ == '__main__':
    main()

2
13.py
Code

#!/usr/bin/python3
"""Calculate the overall GC content of a multi-FASTA file.

Usage: 13.py inputfile

Prints GC / (GC + AT) over all sequence lines, rounded to two decimals.
"""
# Exercise: Calculate GC content in multi-fasta file
# Author: Martin Basterrechea

import sys


def gc_fraction(lines):
    """Return the GC fraction of the sequence lines in *lines*.

    Lines starting with '>' are headers and are skipped. Counting is
    case-insensitive and only a, c, g and t are considered. Returns None
    when no such base is found, so callers never divide by zero.
    """
    gc = 0
    at = 0
    for line in lines:
        if line.startswith('>'):
            continue
        seq = line.lower()
        # Don't use regular expressions for this, it's slower
        gc += seq.count('c') + seq.count('g')
        at += seq.count('a') + seq.count('t')
    if gc + at == 0:
        return None
    return gc / (gc + at)


def main():
    """Print the GC fraction of the FASTA file named on the command line."""
    if len(sys.argv) < 2:
        print('Error: Not enough arguments. Usage: 13.py inputfile')
        sys.exit(1)

    inputname = sys.argv[1]

    with open(inputname, 'r') as inf:
        fraction = gc_fraction(inf)

    if fraction is None:
        print('Error: No nucleotides found in {}'.format(inputname), file=sys.stderr)
        sys.exit(1)
    print(round(fraction, 2))


if __name__ == '__main__':
    main()

3
14.py
Code

#!/usr/bin/python3
"""Calculate the average sequence length in a multi-FASTA file.

Usage: 14.py inputfile

Prints the summed sequence length divided by the number of header lines,
rounded to one decimal.
"""
# Exercise: Calculate average length of sequences in multi-fasta file
# Author: Martin Basterrechea

import sys


def average_length(lines):
    """Return the mean sequence length of the FASTA records in *lines*.

    We get the sum of the sequence lengths and divide it by the number of
    sequences (one per '>' header). Blank lines are ignored. Returns None
    when there is no header at all, so callers never divide by zero.
    """
    total_length = 0
    seq_count = 0
    for line in lines:
        line = line.rstrip()
        if not line:
            # An empty string has no line[0]; skip it.
            continue
        if line[0] == '>':
            seq_count += 1
        else:
            total_length += len(line)
    if seq_count == 0:
        return None
    return total_length / seq_count


def main():
    """Print the average sequence length of the file named on the command line."""
    if len(sys.argv) < 2:  # First argument is second in the sys.argv list!
        print('Error: Not enough arguments. Usage: 14.py inputfile')
        sys.exit(1)

    inputname = sys.argv[1]

    with open(inputname, 'r') as inf:
        average = average_length(inf)

    if average is None:
        print('Error: No sequences found in {}'.format(inputname), file=sys.stderr)
        sys.exit(1)
    print(round(average, 1))


if __name__ == '__main__':
    main()

4
15.py
Code

#!/usr/bin/python3
"""Calculate the overall GC content of a FASTQ file.

Usage: 15.py fastqFile

Prints GC / (GC + AT) over all sequence lines, rounded to two decimals.
"""
# Exercise: Calculate GC content in fastq file

import sys


def gc_fraction_fastq(lines):
    """Return the GC fraction of the reads in *lines*.

    Each record is assumed to span exactly four lines (header, sequence,
    '+', quality). Only the second line of each record is counted: the
    header and quality lines may contain the letters a, c, g and t
    without being sequence. Returns None when no base is found.
    """
    gc = 0
    at = 0
    for index, line in enumerate(lines):
        if index % 4 != 1:
            # 0 = header, 2 = '+' separator, 3 = quality string
            continue
        seq = line.rstrip().lower()
        gc += seq.count('c') + seq.count('g')
        at += seq.count('a') + seq.count('t')
    if gc + at == 0:
        return None
    return gc / (gc + at)


def main():
    """Print the GC fraction of the FASTQ file named on the command line."""
    if len(sys.argv) < 2:
        print('Error: Not enough arguments. Usage: 15.py fastqFile')
        sys.exit(1)

    inputname = sys.argv[1]

    with open(inputname, 'r') as inf:
        fraction = gc_fraction_fastq(inf)

    if fraction is None:
        print('Error: No nucleotides found in {}'.format(inputname), file=sys.stderr)
        sys.exit(1)
    print(round(fraction, 2))


if __name__ == '__main__':
    main()

5
16.py
Code

#!/usr/bin/python3
"""Compare counted sequence lengths with the lengths stated in FASTA headers.

Usage: 16.py inputfile

Every header is expected to carry a 'length=<number>' field. The script
prints the number of IDs, the total and average counted nucleotides, and
the total and average length according to the header labels.
"""

import sys
import re

# Compiling the pattern is not necessary, but faster than using the string
# directly. The raw string keeps '\d' from being read as a string escape.
lengthPattern = re.compile(r'length=(\d+)')


def summarize(lines):
    """Return (id_count, counted_nt, header_nt) for the FASTA lines given.

    id_count   -- number of lines starting with '>'
    counted_nt -- summed length of all other lines, trailing whitespace removed
    header_nt  -- sum of the 'length=<n>' values found in the headers

    Raises ValueError when a header has no 'length=<n>' field.
    """
    count_id = 0
    count_nt = 0
    header_length = 0
    for line in lines:
        line = line.rstrip()
        # Note that there may be empty lines in the input file. An empty
        # string evaluates to False, falls through to the else branch and
        # adds a length of zero.
        if line and line[0] == '>':
            count_id += 1
            match = lengthPattern.search(line)
            if match is None:
                raise ValueError('No length=<n> field in header: {}'.format(line))
            header_length += int(match.group(1))
        else:
            count_nt += len(line)
    return count_id, count_nt, header_length


def main():
    """Print the length summary for the file named on the command line."""
    if len(sys.argv) < 2:
        print('Error: Not enough arguments. Usage: 16.py inputfile')
        sys.exit(1)

    with open(sys.argv[1], 'r') as in_fh:
        countId, countNt, headerLength = summarize(in_fh)

    if countId == 0:
        print('Error: No sequences found in {}'.format(sys.argv[1]), file=sys.stderr)
        sys.exit(1)

    print('IDs: {}'.format(countId))
    print('Total counted nucl: {} nt, Average: {} nt'.format(countNt, round(countNt / countId, 1)))
    print('Total label length: {} nt, Average: {} nt'.format(headerLength, round(headerLength / countId, 1)))


if __name__ == '__main__':
    main()

6
17.py
Code

#!/usr/bin/python3
"""Print one entry from a single-line FASTA file, selected by its ID.

Usage: 17.py target_id fasta_file

Prints the header line whose ID equals target_id and the line that
follows it, or a message when the ID is not found.
"""

import sys
import re


# A regular expression solution is also possible, for example
#     search_pattern = re.compile('>' + re.escape(target_id) + r'(\s|$)')
#     if search_pattern.match(line): ...
# The trailing (\s|$) prevents a match with only part of the ID.
# The string solution below is much faster for bigger files.


def extract_entry(lines, target_id):
    """Return the header and first sequence line of the entry *target_id*.

    The ID of a header is its first whitespace-separated field without
    the leading '>'. It must equal *target_id* exactly; a substring test
    would let 'seq1' match '>seq10'. Returns an empty list when the ID is
    not present. Reading stops right after the line following the header.
    """
    wanted = '>' + target_id
    found_id = False
    entry = []
    for line in lines:
        line = line.rstrip()
        # We split the line on () which means on any white space.
        # If there was no whitespace the entire line is the first element;
        # a blank line gives an empty list, which is skipped.
        fields = line.split()
        if fields and fields[0] == wanted:
            found_id = True
            entry.append(line)
        elif found_id:
            entry.append(line)
            break
    return entry


def main():
    """Look up the ID given on the command line and print its entry."""
    if len(sys.argv) < 3:
        print('Error: Not enough arguments. Usage: 17.py target_id fasta_file')
        sys.exit(1)

    target_id = sys.argv[1]

    with open(sys.argv[2], 'r') as in_fh:
        entry = extract_entry(in_fh, target_id)

    for line in entry:
        print('{}'.format(line))

    if not entry:
        print('The ID {} was not found'.format(target_id))


if __name__ == '__main__':
    main()

7
18.py
Code

#!/usr/bin/python3
"""Convert a multi-line (wrapped) FASTA file to single-line FASTA.

Usage: ./18.py input_fasta

Each header is printed on its own line, followed by the whole sequence
of that record joined onto one line.
"""

import sys  # needed for sys.argv


def unwrap(lines):
    """Yield output lines: every header, then its sequence as one line.

    Lines starting with '>' are headers; all other lines are sequence
    pieces and are concatenated with trailing whitespace removed. A
    header without sequence is followed by an empty line. An empty input
    yields nothing.
    """
    pieces = []
    seen_header = False
    for line in lines:
        line = line.rstrip()
        if line.startswith('>'):
            # Flush the sequence collected for the previous record.
            if seen_header or pieces:
                yield ''.join(pieces)
            pieces = []
            seen_header = True
            yield line
        else:
            pieces.append(line)
    if seen_header or pieces:
        yield ''.join(pieces)


def main():
    """Print the unwrapped version of the file named on the command line."""
    if len(sys.argv) < 2:
        print('Error: Not enough arguments. Usage: 18.py input_fasta')
        sys.exit(1)

    with open(sys.argv[1], 'r') as f:
        for out_line in unwrap(f):
            print(out_line)


if __name__ == '__main__':
    main()

8
19.py
Code

#!/usr/bin/python3
"""Sum the residue masses of an amino acid sequence typed by the user.

Usage: 19.py   (the sequence is read interactively)
"""
# Exercise: Calculate the total mass of an amino acid sequence
# Author: Martin Basterrechea

import sys

# Mass per amino acid residue, keyed by upper-case one-letter code.
mass_dict = {
    'P': 97.1167, 'D': 115.0886, 'T': 101.1051, 'V': 99.1326,
    'Y': 163.1760, 'M': 131.1926, 'G': 57.0519, 'H': 137.1411,
    'C': 103.1388, 'E': 129.1155, 'S': 87.0782, 'F': 147.1766,
    'I': 113.1594, 'A': 71.0788, 'W': 186.2132, 'N': 114.1038,
    'Q': 128.1307, 'L': 113.1594, 'R': 156.1875, 'K': 128.1741,
}


def protein_mass(prot):
    """Return the summed mass of all residues in *prot*.

    Letters are looked up case-insensitively in mass_dict. An empty
    sequence gives 0. Raises ValueError naming the first character that
    is not a known amino acid code.
    """
    total_mass = 0
    for aa in prot:
        # get() returns the value if it exists, or None if it doesn't
        mass = mass_dict.get(aa.upper())
        if mass is None:
            raise ValueError('Invalid amino acid!: {}'.format(aa))
        total_mass += mass
    return total_mass


def main():
    """Ask for a sequence and print its total mass."""
    prot = input('Enter an amino acid sequence: ')
    try:
        total_mass = protein_mass(prot)
    except ValueError as err:
        print(err)
        sys.exit(1)  # non-zero status so callers can detect the failure
    print('Total mass: ' + str(total_mass))


if __name__ == '__main__':
    main()

9
20.py
Code

#!/usr/bin/python3
"""Count occurrences of an oligo and its reverse complement per FASTA record.

Usage: ./20_getSeq.py oligo fasta_file

Prints a tab-separated table of record ID and abundance for every record
in which the oligo occurs at least once. Single-line FASTA is what the
exercise expects; wrapped records are joined before counting.
"""

import sys
import re

COMPLEMENT = str.maketrans('acgtACGT', 'tgcaTGCA')


def reverse_complement(oligo):
    """Return the reverse complement of *oligo* (a, c, g, t in either case)."""
    return oligo[::-1].translate(COMPLEMENT)


def fasta_records(lines):
    """Yield (record_id, sequence) for every record in *lines*.

    record_id is the first whitespace-separated field of the header,
    including the leading '>'. Sequence lines of one record are joined.
    Blank lines and any sequence before the first header are ignored.
    """
    record_id = None
    pieces = []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        if line[0] == '>':
            if record_id is not None:
                yield record_id, ''.join(pieces)
            record_id = line.split()[0]
            pieces = []
        elif record_id is not None:
            pieces.append(line)
    if record_id is not None:
        yield record_id, ''.join(pieces)


def oligo_abundance(lines, oligo):
    """Yield (record_id, count) for records that contain *oligo*.

    count is the number of non-overlapping, case-insensitive matches of
    the oligo plus those of its reverse complement. A palindromic oligo
    is therefore counted on both strands. The oligo is escaped, so it is
    always matched as literal text and never as a regular expression.
    """
    # Compile once, outside the loop over records.
    forward = re.compile(re.escape(oligo), re.I)
    reverse = re.compile(re.escape(reverse_complement(oligo)), re.I)
    for record_id, sequence in fasta_records(lines):
        count = len(forward.findall(sequence)) + len(reverse.findall(sequence))
        if count > 0:
            yield record_id, count


def main():
    """Print the abundance table for the oligo and file given on the command line."""
    if len(sys.argv) < 3:
        print('Error: Not enough arguments. Usage: 20_getSeq.py oligo fasta_file')
        sys.exit(1)

    targetOligo = sys.argv[1]

    print('#Oligo\t{}'.format(targetOligo))
    print('#id\tabundance')

    with open(sys.argv[2], 'r') as in_fh:
        for idLine, numberOfOligos in oligo_abundance(in_fh, targetOligo):
            print('{}\t{}'.format(idLine, numberOfOligos))


if __name__ == '__main__':
    main()

10
21.py
Code

#!/usr/bin/env python3
"""Translate the DNA sequences of a FASTA file into amino acids.

Usage: 21.py input.fasta output.faa

Note that this script only works with non interleaved sequences: every
sequence line is translated on its own, starting in frame 1.
"""
#Exercise: Translate DNA into AA

import sys

# Codon to one-letter amino acid code; '*' marks a stop codon.
gen_code = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
    'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
    'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
    'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',
    'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
    'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
    'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
    'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
    'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
    'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
}


def translate(seq):
    """Return the amino acid translation of the DNA string *seq*.

    Translation is case-insensitive and starts at the first base. A
    trailing partial codon (sequence length not a multiple of 3) is
    dropped. A codon that is not in gen_code, for example one containing
    N, is written as 'X' instead of raising KeyError.
    """
    seq = seq.upper()
    usable = len(seq) - len(seq) % 3  # ignore an incomplete last codon
    return ''.join(gen_code.get(seq[n:n + 3], 'X') for n in range(0, usable, 3))


def main():
    """Translate the input FASTA file into the output file."""
    if len(sys.argv) < 3:
        print('Error, not enough arguments. Usage: 21.py input.fasta output.faa')
        sys.exit(1)

    with open(sys.argv[1], 'r') as inf, open(sys.argv[2], 'w') as outf:
        for line in inf:
            line = line.rstrip()
            if not line:
                # An empty string has no line[0]; skip blank lines.
                continue
            if line[0] == '>':
                print(line, file=outf)
            else:
                print('{}'.format(translate(line)), file=outf)


if __name__ == '__main__':
    main()


### End of translation block

11
22.py
Code

#!/usr/bin/python3
"""Remove the sequences listed in an ID file from a FASTA file.

Runs as 22.py rRNAlistFile inFastaFile outFastaFile

Every record of inFastaFile whose ID appears in rRNAlistFile (one ID per
line) is dropped; all other records are written to outFastaFile.
"""

import sys


def read_id_set(lines):
    """Return the set of IDs in *lines*, one ID per line.

    Surrounding whitespace is removed and blank lines are ignored, so the
    empty string never ends up in the set.
    """
    return {line.strip() for line in lines if line.strip()}


def filter_fasta(lines, excluded_ids):
    """Yield the lines of all records whose ID is not in *excluded_ids*.

    The ID of a record is the first whitespace-separated field of its
    header without the leading '>'. Lines are yielded without trailing
    whitespace. Blank lines are dropped. Lines before the first header
    are passed through.
    """
    skipping = False  # True while inside a record that must be removed
    for line in lines:
        line = line.rstrip()
        if not line:
            continue
        if line[0] == '>':
            seqid = line.split()[0]
            seqid = seqid[1:]  # Remove the >
            skipping = seqid in excluded_ids
        if not skipping:
            yield line


def main():
    """Filter the FASTA file using the ID list given on the command line."""
    if len(sys.argv) < 4:
        print('Error: Not enough arguments. Usage: 22.py rRNAlistFile inFastaFile outFastaFile')
        sys.exit(1)

    # We create a set of the ids found in the rRNA list
    with open(sys.argv[1], 'r') as inrRNA:
        rRNAset = read_id_set(inrRNA)

    # Parse the fasta file
    with open(sys.argv[2], 'r') as inFasta, open(sys.argv[3], 'w') as outFasta:
        for line in filter_fasta(inFasta, rRNAset):
            print(line, file=outFasta)


if __name__ == '__main__':
    main()

12
23.py
Code

#!/usr/bin/python3
"""Report the GC content at each of the three codon positions.

Usage: 23.py gene_fasta_file

Every sequence in the FASTA file is read in frame, starting at its first
base. Only the letters a, c, g and t (either case) are counted.
"""

import sys

ALLOWED_LETTERS = 'acgt'
GC_LETTERS = 'cg'


def codon_gc_counts(lines):
    """Return (total_count, gc_count), two lists indexed by codon position.

    total_count[p] is the number of a/c/g/t letters seen at codon
    position p (0, 1 or 2); gc_count[p] is how many of them were c or g.
    The position is carried across the lines of one record and reset at
    every '>' header, so wrapped records keep their frame even when the
    line length is not a multiple of 3. Blank lines are ignored.
    """
    codon_total_count = [0, 0, 0]
    codon_gc_count = [0, 0, 0]
    offset = 0  # bases of the current record consumed on earlier lines
    for line in lines:
        line = line.lower().rstrip()
        if not line:
            continue
        if line[0] == '>':
            offset = 0
            continue
        for n, letter in enumerate(line, start=offset):
            # Get the current codon position, here 0, 1 or 2
            current_pos = n % 3
            if letter in ALLOWED_LETTERS:
                codon_total_count[current_pos] += 1
                if letter in GC_LETTERS:
                    codon_gc_count[current_pos] += 1
        offset += len(line)
    return codon_total_count, codon_gc_count


def report_progress(lines):
    """Yield *lines* unchanged, writing a progress note to stderr every 10000 lines."""
    for count_lines, line in enumerate(lines, start=1):
        if count_lines % 10000 == 0:
            print('{} lines processed'.format(count_lines), file=sys.stderr, end='\r')
        yield line


def main():
    """Print the GC percentage per codon position for the given file."""
    if len(sys.argv) != 2:
        print('Usage: {} gene_fasta_file'.format(sys.argv[0]))
        sys.exit(1)  # without this the script ran on and crashed on sys.argv[1]

    with open(sys.argv[1], 'r') as in_fh:
        codon_total_count, codon_gc_count = codon_gc_counts(report_progress(in_fh))

    for pos in range(3):
        if codon_total_count[pos] == 0:
            print('GC content for position {} is undefined (no bases counted)'.format(pos + 1))
            continue
        gc_frac = codon_gc_count[pos] / codon_total_count[pos]
        print('GC content for position {} is {}%'.format(pos + 1, round(100 * gc_frac, 2)))


if __name__ == '__main__':
    main()

13

You might also like

pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy