0% found this document useful (0 votes)

10 views13 pages

solutionsExerciseMaster11 23

The Python scripts provide solutions to bioinformatics exercises that analyze and manipulate sequence data in various file formats. Script 11 counts the reads in a FASTQ file or the sequences in a FASTA file. Script 12 transcribes a DNA FASTA file to RNA. Script 13 calculates GC content in a multi-FASTA file. Script 14 calculates the average sequence length in a multi-FASTA file.

Uploaded by

Huy

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

10 views13 pages

solutionsExerciseMaster11 23

Uploaded by

Huy

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 13

Suggested solutions exercises 11-23

11.py
Code

#!/usr/bin/python3
"""Count the records in a FASTQ or FASTA file.

Usage: 11.py inputfile

A file whose name contains '.fastq' or '.fq' is treated as FASTQ and the
number of lines divided by four is printed. Any other file is treated as
FASTA and the number of lines starting with '>' is printed.
"""

import sys


def count_records(lines, is_fastq):
    """Return the number of records in *lines*.

    lines    -- iterable of text lines, e.g. an open file handle
    is_fastq -- if True count four-line FASTQ records, otherwise count
                FASTA header lines (lines starting with '>')
    """
    if is_fastq:
        # Every FASTQ record occupies four lines
        return sum(1 for _ in lines) // 4
    return sum(1 for line in lines if line.startswith('>'))


def main():
    """Read the file named on the command line and print its record count."""
    if len(sys.argv) < 2:
        # sys.exit with a string prints it to stderr and exits with status 1
        sys.exit('Error: Not enough arguments. Usage: 11.py inputfile')

    filename = sys.argv[1]

    # You can test if a substring is found in a string in a similar way
    # as you do a membership test for a list
    is_fastq = '.fastq' in filename or '.fq' in filename

    with open(filename, 'r') as in_fh:
        print(count_records(in_fh, is_fastq))


if __name__ == '__main__':
    main()

1
12.py
Code

#!/usr/bin/python3
# Exercise: Transcribe DNA fasta to RNA
# Author: Martin Basterrechea
"""Transcribe a DNA FASTA file to RNA.

Usage: 12.py inputfile outputfile

Header lines (starting with '>') are copied unchanged; on every other line
't' becomes 'u' and 'T' becomes 'U'.
"""

import sys

# Replaces thymine with uracil in a single pass, preserving case
DNA_TO_RNA = str.maketrans('tT', 'uU')


def transcribe_line(line):
    """Return *line* without trailing whitespace, transcribed to RNA.

    Header lines are returned untranslated. An empty line is returned as an
    empty string (indexing line[0] would raise IndexError on it).
    """
    line = line.rstrip()
    if line.startswith('>'):
        return line
    return line.translate(DNA_TO_RNA)


def main():
    """Transcribe the input file named in argv[1] into the file argv[2]."""
    if len(sys.argv) < 3:
        # A string argument is printed to stderr and gives exit status 1
        sys.exit('Error: Not enough arguments. Usage: 12.py inputfile outputfile')

    inputname = sys.argv[1]
    outputname = sys.argv[2]

    with open(inputname, 'r') as inf, open(outputname, 'w') as outf:
        for line in inf:
            print(transcribe_line(line), file=outf)


if __name__ == '__main__':
    main()

2
13.py
Code

#!/usr/bin/python3
# Exercise: Calculate GC content in multi-fasta file
# Author: Martin Basterrechea
"""Print the overall GC fraction of all sequences in a multi-FASTA file.

Usage: 13.py inputfile
"""

import sys


def gc_fraction(lines):
    """Return GC / (GC + AT) over all non-header lines of *lines*.

    Counting is case-insensitive and only a, c, g and t are counted.
    Returns None when no such letter is found, so the caller never divides
    by zero.
    """
    gc = 0
    at = 0
    for line in lines:
        if line.startswith('>'):
            continue
        line = line.lower()
        # Don't use regular expressions for this, it's slower
        gc += line.count('c') + line.count('g')
        at += line.count('a') + line.count('t')

    total = gc + at
    if total == 0:
        return None
    return gc / total


def main():
    """Compute and print the GC fraction, rounded to two decimals."""
    if len(sys.argv) < 2:
        # A string argument is printed to stderr and gives exit status 1
        sys.exit('Error: Not enough arguments. Usage: 13.py inputfile')

    inputname = sys.argv[1]

    with open(inputname, 'r') as inf:
        fraction = gc_fraction(inf)

    if fraction is None:
        sys.exit('Error: No A, C, G or T bases found in {}'.format(inputname))
    print(round(fraction, 2))


if __name__ == '__main__':
    main()

3
14.py
Code

#!/usr/bin/python3
# Exercise: Calculate average length of sequences in multi-fasta file
# Author: Martin Basterrechea
"""Print the average sequence length of a multi-FASTA file.

Usage: 14.py inputfile
"""

import sys


def average_length(lines):
    """Return the mean sequence length over the records in *lines*.

    The sum of the lengths of all non-header lines (trailing whitespace
    removed) is divided by the number of header lines. Blank lines are
    skipped. Returns None when there is no header line at all.
    """
    # We get the sum of the sequence lengths and divide it by the number of sequences
    total_length = 0
    seq_count = 0

    for line in lines:
        line = line.rstrip()
        if not line:
            # line[0] would raise IndexError on an empty line
            continue
        if line.startswith('>'):
            seq_count += 1
        else:
            total_length += len(line)

    if seq_count == 0:
        return None
    return total_length / seq_count


def main():
    """Compute and print the average length, rounded to one decimal."""
    if len(sys.argv) < 2:  # First argument is second in the sys.argv list!
        # A string argument is printed to stderr and gives exit status 1
        sys.exit('Error: Not enough arguments. Usage: 14.py inputfile')

    inputname = sys.argv[1]

    with open(inputname, 'r') as inf:
        average = average_length(inf)

    if average is None:
        sys.exit('Error: No sequences found in {}'.format(inputname))
    print(round(average, 1))


if __name__ == '__main__':
    main()

4
15.py
Code

#!/usr/bin/python3
# Exercise: Calculate GC content in fastq file
"""Print the overall GC fraction of the reads in a FASTQ file.

Usage: 15.py fastqFile
"""

import sys


def fastq_gc_fraction(lines):
    """Return GC / (GC + AT) over the sequence lines of a FASTQ stream.

    A FASTQ record is four lines long (header, sequence, '+', quality), so
    only lines 2, 6, 10, ... are counted. Testing for odd line numbers would
    instead pick the header and '+' lines. Counting is case-insensitive.
    Returns None when no a, c, g or t is found.
    """
    gc = 0
    at = 0
    for line_number, line in enumerate(lines, start=1):
        if line_number % 4 != 2:
            continue
        line = line.lower()
        gc += line.count('c') + line.count('g')
        at += line.count('a') + line.count('t')

    total = gc + at
    if total == 0:
        return None
    return gc / total


def main():
    """Compute and print the GC fraction, rounded to two decimals."""
    if len(sys.argv) < 2:
        # A string argument is printed to stderr and gives exit status 1
        sys.exit('Error: Not enough arguments. Usage: 15.py fastqFile')

    inputname = sys.argv[1]

    with open(inputname, 'r') as inf:
        fraction = fastq_gc_fraction(inf)

    if fraction is None:
        sys.exit('Error: No A, C, G or T bases found in {}'.format(inputname))
    print(round(fraction, 2))


if __name__ == '__main__':
    main()

5
16.py
Code

#!/usr/bin/python3
"""Compare counted sequence lengths with the lengths stated in FASTA headers.

Usage: 16.py inputfile

Every header line must contain a 'length=<number>' field.
"""

import sys
import re

# Compiling the pattern is not necessary, but faster than using the string directly
# (raw string, so that \d is not treated as a string escape)
lengthPattern = re.compile(r'length=(\d+)')


def summarize_fasta(lines):
    """Return (id_count, counted_nt, header_length) for the FASTA *lines*.

    id_count      -- number of header lines (starting with '>')
    counted_nt    -- summed length of all other lines, trailing whitespace
                     removed (empty lines contribute 0)
    header_length -- sum of the numbers in the headers' 'length=' fields

    Raises ValueError if a header has no 'length=<number>' field.
    """
    id_count = 0
    counted_nt = 0
    header_length = 0

    for line in lines:
        line = line.rstrip()
        # startswith is safe on the empty string, unlike line[0]
        if line.startswith('>'):
            id_count += 1
            match = lengthPattern.search(line)
            if match is None:
                raise ValueError('No length= field in header: {}'.format(line))
            header_length += int(match.group(1))
        else:
            counted_nt += len(line)

    return id_count, counted_nt, header_length


def main():
    """Print the ID count plus totals and averages of both length measures."""
    if len(sys.argv) < 2:
        # A string argument is printed to stderr and gives exit status 1
        sys.exit('Error: Not enough arguments. Usage: 16.py inputfile')

    with open(sys.argv[1], 'r') as in_fh:
        try:
            countId, countNt, headerLength = summarize_fasta(in_fh)
        except ValueError as err:
            sys.exit('Error: {}'.format(err))

    if countId == 0:
        # Avoid dividing by zero when computing the averages
        sys.exit('Error: No sequences found in {}'.format(sys.argv[1]))

    print('IDs: {}'.format(countId))
    print('Total counted nucl: {} nt, Average: {} nt'.format(countNt, round(countNt / countId, 1)))
    print('Total label length: {} nt, Average: {} nt'.format(headerLength, round(headerLength / countId, 1)))


if __name__ == '__main__':
    main()

6
17.py
Code

#!/usr/bin/python3
"""Print the FASTA record whose ID equals a given target ID.

Usage: 17.py target_id fasta_file
"""

import sys
import re  # only needed for the regular expression variant described below


# A regular expression solution would match each line against
#     re.compile('>' + re.escape(target_id) + r'\s')
# where the added \s matches a trailing space, tab or newline, preventing
# a match with only part of the ID.
# The string solution used here is much faster for bigger files.
def extract_record(lines, target_id):
    """Return the lines of the first record in *lines* with ID *target_id*.

    The ID of a record is the first whitespace-separated word of its header
    without the leading '>'. It must equal target_id exactly; a substring
    test would let 'seq1' match '>seq10'. The result holds the header line
    followed by every line up to the next header, all without trailing
    whitespace. Blank lines are skipped. An empty list means "not found".
    """
    wanted = '>' + target_id
    record = []

    for line in lines:
        line = line.rstrip()
        if not line:
            # split()[0] would raise IndexError on an empty line
            continue
        if line.startswith('>'):
            if record:
                # The next record starts here, so the wanted one is complete
                break
            # We split the line on () which means on any white space.
            # We take the first element. If there was no whitespace
            # the entire line will be in the first element
            if line.split()[0] == wanted:
                record.append(line)
        elif record:
            record.append(line)

    return record


def main():
    """Look up argv[1] in the FASTA file argv[2] and print the record."""
    if len(sys.argv) < 3:
        # A string argument is printed to stderr and gives exit status 1
        sys.exit('Error: Not enough arguments. Usage: 17.py target_id fasta_file')

    target_id = sys.argv[1]

    with open(sys.argv[2], 'r') as in_fh:
        record = extract_record(in_fh, target_id)

    for line in record:
        print('{}'.format(line))

    if not record:
        print('The ID {} was not found'.format(target_id))


if __name__ == '__main__':
    main()

7
18.py
Code

#!/usr/bin/python3
"""Join the wrapped sequence lines of each FASTA record onto a single line.

Usage: ./18.py input_fasta
"""

import sys  # sys.argv and sys.stdout are used below


def unwrap_fasta(lines):
    """Yield the output text for *lines*, piece by piece.

    Header lines (starting with '>') are yielded followed by a newline; every
    header except one on the very first input line is also preceded by a
    newline, which terminates the sequence written before it. All other lines
    are yielded without their trailing whitespace and without a newline, so
    consecutive sequence lines end up on one output line. A final newline is
    always yielded last.
    """
    first_line = True
    for line in lines:
        line = line.rstrip()
        if line.startswith('>'):
            if first_line:
                yield line + '\n'
            else:
                yield '\n' + line + '\n'
        else:
            yield line
        first_line = False

    yield '\n'  # Final newline


def main():
    """Write the unwrapped version of the FASTA file argv[1] to stdout."""
    if len(sys.argv) < 2:
        # A string argument is printed to stderr and gives exit status 1
        sys.exit('Error: Not enough arguments. Usage: ./18.py input_fasta')

    with open(sys.argv[1], 'r') as f:
        sys.stdout.writelines(unwrap_fasta(f))


if __name__ == '__main__':
    main()

8
19.py
Code

#!/usr/bin/python3
# Exercise: Calculate the total mass of an amino acid sequence
# Author: Martin Basterrechea
"""Ask for an amino acid sequence and print the sum of its residue masses."""

import sys

# Mass used for each one-letter amino acid code
mass_dict = {
    'P': 97.1167, 'D': 115.0886, 'T': 101.1051, 'V': 99.1326,
    'Y': 163.1760, 'M': 131.1926, 'G': 57.0519, 'H': 137.1411,
    'C': 103.1388, 'E': 129.1155, 'S': 87.0782, 'F': 147.1766,
    'I': 113.1594, 'A': 71.0788, 'W': 186.2132, 'N': 114.1038,
    'Q': 128.1307, 'L': 113.1594, 'R': 156.1875, 'K': 128.1741,
}


def protein_mass(prot):
    """Return the sum of the mass_dict values for the letters of *prot*.

    Letters are looked up case-insensitively. Raises ValueError, carrying the
    offending character as its only argument, at the first character that is
    not a key of mass_dict.
    """
    total_mass = 0
    for aa in prot:
        # This will return the value if it exists, or None if it doesn't
        mass = mass_dict.get(aa.upper())
        if mass is None:
            raise ValueError(aa)
        total_mass += mass
    return total_mass


def main():
    """Prompt for a sequence and print its total mass."""
    prot = input('Enter an amino acid sequence: ')

    try:
        total_mass = protein_mass(prot)
    except ValueError as err:
        print('Invalid amino acid!: {}'.format(err.args[0]))
        sys.exit(1)  # Non-zero status signals the failure to the caller

    print('Total mass: ' + str(total_mass))


if __name__ == '__main__':
    main()

9
20.py
Code

#!/usr/bin/python3
"""Count occurrences of an oligo and its reverse complement per sequence.

Usage: ./20_getSeq.py oligo fasta_file

Expects single line fasta format
"""

import sys
import re

# Maps each base to its complement, preserving case
COMPLEMENT = str.maketrans('acgtACGT', 'tgcaTGCA')


def reverse_complement(oligo):
    """Return the reverse complement of *oligo*; non-ACGT characters are kept."""
    return oligo[::-1].translate(COMPLEMENT)


def count_oligo_hits(lines, oligo):
    """Yield (id, hits) after every sequence line whose record has hits > 0.

    id is the first whitespace-separated word of the record's header line
    (including the '>'). hits is the running number of non-overlapping,
    case-insensitive matches of *oligo* plus those of its reverse complement
    in the record's sequence lines so far. With single line fasta input this
    gives one pair per record. Lines before the first header are ignored.
    """
    # The oligo is escaped so that it is searched literally, and both
    # patterns are compiled once instead of being looked up on every line
    forward = re.compile(re.escape(oligo), re.I)
    reverse = re.compile(re.escape(reverse_complement(oligo)), re.I)

    seq_id = None
    hits = 0
    for line in lines:
        if line.startswith('>'):
            seq_id = line.split()[0]
            hits = 0
        elif seq_id is not None:
            hits += len(forward.findall(line))
            hits += len(reverse.findall(line))
            if hits > 0:
                yield seq_id, hits


def main():
    """Print a tab-separated id/abundance table for oligo argv[1] in argv[2]."""
    if len(sys.argv) < 3:
        # A string argument is printed to stderr and gives exit status 1
        sys.exit('Error: Not enough arguments. Usage: ./20_getSeq.py oligo fasta_file')

    targetOligo = sys.argv[1]

    print('#Oligo\t{}'.format(targetOligo))
    print('#id\tabundance')

    with open(sys.argv[2], 'r') as in_fh:
        for idLine, numberOfOligos in count_oligo_hits(in_fh, targetOligo):
            print('{}\t{}'.format(idLine, numberOfOligos))


if __name__ == '__main__':
    main()

10
21.py
Code

#!/usr/bin/env python3
#Exercise: Translate DNA into AA
#Note that this script only works with non interleaved sequences
"""Translate every sequence of a DNA FASTA file into amino acids.

Usage: 21.py input.fasta output.faa
"""

import sys

# Codon -> one-letter amino acid code; '*' marks a stop codon
gen_code = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
    'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
    'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
    'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',
    'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
    'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
    'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
    'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
    'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
    'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
}


def translate(seq):
    """Return the amino acid translation of the DNA string *seq*.

    Translation starts at the first base and is case-insensitive. Trailing
    bases that do not form a complete codon are dropped. A codon that is not
    a key of gen_code (e.g. one containing N) is translated as 'X' instead
    of raising KeyError.
    """
    seq = seq.upper()
    # Stopping at len(seq) - 2 leaves out an incomplete codon at the end
    return ''.join(gen_code.get(seq[n:n + 3], 'X')
                   for n in range(0, len(seq) - 2, 3))


def main():
    """Copy headers and write translated sequences from argv[1] to argv[2]."""
    if len(sys.argv) < 3:
        # A string argument is printed to stderr and gives exit status 1
        sys.exit('Error, not enough arguments. Usage: 21.py input.fasta output.faa')

    with open(sys.argv[1], 'r') as inf, open(sys.argv[2], 'w') as outf:
        for line in inf:
            line = line.rstrip()
            if not line:
                # line[0] would raise IndexError on an empty line
                continue
            if line.startswith('>'):
                print(line, file=outf)
            else:
                print('{}'.format(translate(line)), file=outf)


if __name__ == '__main__':
    main()

11
22.py
Code

#!/usr/bin/python3
"""Copy a FASTA file, leaving out the records whose ID is in an rRNA list.

Runs as 22.py rRNAlistFile inFastaFile outFastaFile
"""

import sys


def read_id_set(lines):
    """Return the set of non-empty lines in *lines*, trailing whitespace removed."""
    ids = set()
    for line in lines:
        line = line.rstrip()
        if line:
            ids.add(line)
    return ids


def filter_records(lines, excluded_ids):
    """Yield the lines of every record whose ID is not in *excluded_ids*.

    The ID of a record is the first whitespace-separated word of its header
    line without the leading '>'. Lines are yielded without trailing
    whitespace. Lines before the first header belong to no record and are
    passed through.
    """
    keep = True
    for line in lines:
        line = line.rstrip()
        if line.startswith('>'):
            seqid = line.split()[0]
            seqid = seqid[1:]  # Remove the >
            keep = seqid not in excluded_ids
        if keep:
            yield line


def main():
    """Filter argv[2] into argv[3] using the ID list in argv[1]."""
    if len(sys.argv) < 4:
        # A string argument is printed to stderr and gives exit status 1
        sys.exit('Error: Not enough arguments. '
                 'Usage: 22.py rRNAlistFile inFastaFile outFastaFile')

    # We create a set of the ids found in the rRNA list
    with open(sys.argv[1], 'r') as inrRNA:
        rRNAset = read_id_set(inrRNA)

    # Parse the fasta file
    with open(sys.argv[2], 'r') as inFasta, open(sys.argv[3], 'w') as outFasta:
        for line in filter_records(inFasta, rRNAset):
            print(line, file=outFasta)


if __name__ == '__main__':
    main()

12
23.py
Code

#!/usr/bin/python3
"""Print the GC content of each of the three codon positions of a gene file.

Usage: 23.py gene_fasta_file

The codon position of a base is its index within its line modulo 3, so each
sequence has to be in frame and on a single line.
"""

import sys

allowed_letters = 'acgt'
gc_letters = 'cg'


def codon_position_gc(lines, progress_every=10000):
    """Return (total_counts, gc_counts) for the sequence lines in *lines*.

    Both results are lists of three integers indexed by codon position
    (0, 1 or 2). total_counts counts the a, c, g and t letters at each
    position, gc_counts only the c and g letters; counting is
    case-insensitive and other characters are ignored. Header lines
    (starting with '>') and blank lines are skipped.

    After every *progress_every* input lines a progress message is written
    to stderr.
    """
    codon_total_count = [0, 0, 0]
    codon_gc_count = [0, 0, 0]

    for line_number, line in enumerate(lines, start=1):
        line = line.lower().rstrip()

        if line_number % progress_every == 0:
            print('{} lines processed'.format(line_number), file=sys.stderr, end='\r')

        # startswith is safe on an empty line, unlike line[0]
        if line.startswith('>'):
            continue

        for n, letter in enumerate(line):
            # Get the current codon position, here 0, 1 or 2
            current_pos = n % 3

            if letter in allowed_letters:
                codon_total_count[current_pos] += 1
                if letter in gc_letters:
                    codon_gc_count[current_pos] += 1

    return codon_total_count, codon_gc_count


def main():
    """Print the GC percentage of each codon position of the file argv[1]."""
    if len(sys.argv) != 2:
        # Exit here: carrying on would fail when sys.argv[1] is read.
        # A string argument is printed to stderr and gives exit status 1
        sys.exit('Usage: {} gene_fasta_file'.format(sys.argv[0]))

    with open(sys.argv[1], 'r') as in_fh:
        codon_total_count, codon_gc_count = codon_position_gc(in_fh)

    for pos in range(3):
        if codon_total_count[pos] == 0:
            # Nothing was counted, so the fraction cannot be computed
            print('GC content for position {} is undefined (no bases counted)'.format(pos+1))
            continue
        gc_frac = codon_gc_count[pos] / codon_total_count[pos]
        print('GC content for position {} is {}%'.format(pos+1, round(100*gc_frac, 2)))


if __name__ == '__main__':
    main()

Python For Biologist
No ratings yet
Python For Biologist
24 pages
DWM EXP 1 To 14 C - Merged - Compressed
No ratings yet
DWM EXP 1 To 14 C - Merged - Compressed
104 pages
Computational and Systems Biology Assignment Help
100% (1)
Computational and Systems Biology Assignment Help
15 pages
02 Handling Files
No ratings yet
02 Handling Files
18 pages
Epsc 121 Notes (Revised) - 1
100% (2)
Epsc 121 Notes (Revised) - 1
30 pages
04 Functions
No ratings yet
04 Functions
16 pages
Lab 6 Pseudocode
No ratings yet
Lab 6 Pseudocode
2 pages
Primers For GFP
No ratings yet
Primers For GFP
7 pages
MOOC Project Work - Sequence Analysis - Data Analysis With Python 2021
No ratings yet
MOOC Project Work - Sequence Analysis - Data Analysis With Python 2021
29 pages
DNA RNA Protein
No ratings yet
DNA RNA Protein
5 pages
2B Strings
No ratings yet
2B Strings
23 pages
Bio Lab 1 Set A
No ratings yet
Bio Lab 1 Set A
2 pages
Phylip Via Emboss - Tree Building:: Phylip (Phylogeny Inference Programs)
No ratings yet
Phylip Via Emboss - Tree Building:: Phylip (Phylogeny Inference Programs)
17 pages
Exam Sample Questions
No ratings yet
Exam Sample Questions
6 pages
IDC306 Assignment 5 MS21009
No ratings yet
IDC306 Assignment 5 MS21009
4 pages
Assignment - Idc306
No ratings yet
Assignment - Idc306
6 pages
INFO390C DNDS Pset05
No ratings yet
INFO390C DNDS Pset05
9 pages
Exam Programming Exercises
No ratings yet
Exam Programming Exercises
7 pages
Lab 2
No ratings yet
Lab 2
7 pages
Cs Project
No ratings yet
Cs Project
65 pages
L1 Exercises Solutions
100% (1)
L1 Exercises Solutions
15 pages
Python Basics Exercises
No ratings yet
Python Basics Exercises
4 pages
Group17 2
No ratings yet
Group17 2
9 pages
Function Solutions
No ratings yet
Function Solutions
10 pages
BINP16 Programming Exam 2016-10-25 Solutions
No ratings yet
BINP16 Programming Exam 2016-10-25 Solutions
5 pages
p3 Python Project
No ratings yet
p3 Python Project
4 pages
solutionsExerciseMaster1 10
No ratings yet
solutionsExerciseMaster1 10
9 pages
p2 Python Project
No ratings yet
p2 Python Project
3 pages
Lec 2 PDF
No ratings yet
Lec 2 PDF
28 pages
L1 Chapters 1 2 Solutions
No ratings yet
L1 Chapters 1 2 Solutions
3 pages
Assignment 01
No ratings yet
Assignment 01
4 pages
Week 10 Tutorial Sample Answers
No ratings yet
Week 10 Tutorial Sample Answers
9 pages
RIP Tutorials Bioinformatics
No ratings yet
RIP Tutorials Bioinformatics
19 pages
Python
No ratings yet
Python
9 pages
University of Mauritius
No ratings yet
University of Mauritius
9 pages
CSE 5370: Bioinformatics Homework 2: Due Thursday, February 24th, 2022 at 4:59PM CST
No ratings yet
CSE 5370: Bioinformatics Homework 2: Due Thursday, February 24th, 2022 at 4:59PM CST
3 pages
BT3040 - BIOINFORMATICS - Assignment 4: Question 1
No ratings yet
BT3040 - BIOINFORMATICS - Assignment 4: Question 1
9 pages
In-Linear-Time: Check This Web Site
No ratings yet
In-Linear-Time: Check This Web Site
4 pages
Linux Tutorial
No ratings yet
Linux Tutorial
3 pages
Assignment 1
No ratings yet
Assignment 1
3 pages
BRCA1
No ratings yet
BRCA1
2 pages
Ass 2 Bioinformatics
No ratings yet
Ass 2 Bioinformatics
8 pages
Assignment 1
No ratings yet
Assignment 1
5 pages
BIO Code Report
No ratings yet
BIO Code Report
6 pages
Bioinf575 hw07 Dmeghana
No ratings yet
Bioinf575 hw07 Dmeghana
34 pages
Exercise 1
No ratings yet
Exercise 1
11 pages
Biology Project TRANSGENIC ANIMALS Class 12
No ratings yet
Biology Project TRANSGENIC ANIMALS Class 12
19 pages
2023s2 Cosc122 Assignment1 Handout
No ratings yet
2023s2 Cosc122 Assignment1 Handout
9 pages
A202001006 - PUTRI WAHYUNI - Primer3 Output (Primer3 - Results - Cgi Release 4.1.0)
No ratings yet
A202001006 - PUTRI WAHYUNI - Primer3 Output (Primer3 - Results - Cgi Release 4.1.0)
2 pages
A202001002 Novianti Erlina Ningsi
No ratings yet
A202001002 Novianti Erlina Ningsi
2 pages
Primer3 Output (Primer3 - Results - Cgi Release 0.4.0)
No ratings yet
Primer3 Output (Primer3 - Results - Cgi Release 0.4.0)
2 pages
ATRX Alpha Thalassemia
No ratings yet
ATRX Alpha Thalassemia
3 pages
Florida A Short History Revised Michael Gannon Instant Download
No ratings yet
Florida A Short History Revised Michael Gannon Instant Download
30 pages
Manual de Ejercicios de Python
No ratings yet
Manual de Ejercicios de Python
1 page
HW 13
No ratings yet
HW 13
6 pages
Manual of Myokintics PDF
100% (1)
Manual of Myokintics PDF
24 pages
Ergot The Genus Claviceps (Medicinal and Aromatic Plants - Industrial Profiles) PDF
No ratings yet
Ergot The Genus Claviceps (Medicinal and Aromatic Plants - Industrial Profiles) PDF
501 pages
Loops: Genome 559: Introduction To Statistical and Computational Genomics Prof. James H. Thomas
No ratings yet
Loops: Genome 559: Introduction To Statistical and Computational Genomics Prof. James H. Thomas
27 pages
Nature Magazine 7125 - 2007-01-18
No ratings yet
Nature Magazine 7125 - 2007-01-18
104 pages
Osmosis Tonicity Worksheet
No ratings yet
Osmosis Tonicity Worksheet
2 pages
Dunya Chapter 12 Races of Dunya Complete
No ratings yet
Dunya Chapter 12 Races of Dunya Complete
170 pages
Physics: The Ambitious Combined Entry Test 2019 (Etea/Fmdc/Nums/Uhs-Mdcat)
No ratings yet
Physics: The Ambitious Combined Entry Test 2019 (Etea/Fmdc/Nums/Uhs-Mdcat)
19 pages
Effectiveness of Planned Teaching Programme On Reproductive Health Among Adolescent Girls
100% (2)
Effectiveness of Planned Teaching Programme On Reproductive Health Among Adolescent Girls
5 pages
Strasinger CM
No ratings yet
Strasinger CM
46 pages
? 5 Ways To Improve Your Sleep (B1 - B2)
No ratings yet
? 5 Ways To Improve Your Sleep (B1 - B2)
30 pages
FarAway InstructionManual OCR
No ratings yet
FarAway InstructionManual OCR
28 pages
PHYSIOLOGY OF NOSE & PNS (Recovered)
No ratings yet
PHYSIOLOGY OF NOSE & PNS (Recovered)
72 pages
Paradigm of Human-Environment Systems (Scholzund & Binder, 2003)
No ratings yet
Paradigm of Human-Environment Systems (Scholzund & Binder, 2003)
29 pages
Set 2 Bioenergetics Grades 1 3
No ratings yet
Set 2 Bioenergetics Grades 1 3
21 pages
Ecology
No ratings yet
Ecology
19 pages
BSC Biomedical Science
No ratings yet
BSC Biomedical Science
11 pages
4th Week 4.2 LAS Science 10
No ratings yet
4th Week 4.2 LAS Science 10
8 pages
Protein Synthesis
No ratings yet
Protein Synthesis
19 pages
Module 1
No ratings yet
Module 1
6 pages
Lecture 2 (Variance)
No ratings yet
Lecture 2 (Variance)
20 pages
HbA1C Riview
No ratings yet
HbA1C Riview
10 pages
FASTA Result1
No ratings yet
FASTA Result1
6 pages
Does It Respire or Not
No ratings yet
Does It Respire or Not
9 pages
The Legend of Malin Kundang
No ratings yet
The Legend of Malin Kundang
20 pages
Cambridge Large1 Template Aog and Jog
No ratings yet
Cambridge Large1 Template Aog and Jog
3 pages
Heparin
No ratings yet
Heparin
6 pages
SST 307 Applied Statistical Methods - Cat2 Answer All Question
No ratings yet
SST 307 Applied Statistical Methods - Cat2 Answer All Question
2 pages
Carrying Capacity For TANAW de Rizal Park
No ratings yet
Carrying Capacity For TANAW de Rizal Park
9 pages
Lsm3212 CA3 Essay
No ratings yet
Lsm3212 CA3 Essay
3 pages
Lisp Interpreter in Rust
From Everand
Lisp Interpreter in Rust
Vishal Patil
1/5 (1)
C++ Functions and tutorial
From Everand
C++ Functions and tutorial
Nino Paiotta
No ratings yet
Python For Beginners
From Everand
Python For Beginners
Célio Azevedo
No ratings yet
Introduction to PHP, Part 2, Second Edition
From Everand
Introduction to PHP, Part 2, Second Edition
Adam Majczak
No ratings yet
Profound Python Data Science
From Everand
Profound Python Data Science
Onder Teker
No ratings yet
Computer Engineering Laboratory Solution Primer
From Everand
Computer Engineering Laboratory Solution Primer
Karan Bhandari
No ratings yet

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.

solutionsExerciseMaster11 23

Uploaded by

solutionsExerciseMaster11 23

Uploaded by

Suggested solutions exercises 11-23

with open(sys.argv[1], 'r') as in_fh:

# You can test if a substring is found in a string in a similar way

for line in in_fh:

for line in in_fh:

# Exercise: Transcribe DNA fasta to RNA

with open(inputname, 'r') as inf, open(outputname, 'w') as outf:

with open(inputname, 'r') as inf:

if len(sys.argv) < 2: # First argument is second in the sys.argv list!

with open(inputname, 'r') as inf:

with open(inputname, 'r') as inf:

with open(sys.argv[1], 'r') as in_fh:

for line in in_fh:

# Note that there is an empty line of the input file

with open(sys.argv[2], 'r') as in_fh:

# A string solution, much faster for bigger files.

if found_id is False: # This if statement is really not needed

#Usage: ./18.py input_fasta

print() # Print a final newline

mass_dict = { 'P' : 97.1167, 'D' : 115.0886, 'T' : 101.1051, 'V' : 99.1326,

prot = input('Enter an amino acid sequence: ')

#Usage: ./20_getSeq.py oligo fasta_file

with open(sys.argv[2], 'r') as in_fh:

for line in in_fh:

gen_code = {'TTT':'F', 'TTC':'F', 'TTA':'L', 'TTG':'L',

with open(sys.argv[1], 'r') as inf, open(sys.argv[2], 'w') as outf:

print('{}'.format(aa_seq), file = outf)

# Runs as 22.py rRNAlistFile inFastaFile outFastaFile

# We create a set of the ids found in the rRNA list

# Parse the fasta file

if not line[0] == '>':

# Get the current codon position, here 0, 1 or 2

for pos in range(3):

You might also like

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.