Matt Shirley ([email protected])
18 April 2013
Only half of the class expressed an interest in using Python for sequence analysis. Why should I care?
FASTQ: it's like FASTA, except with quality scores for each base call
@HWI-EAS179_0001:5:1:7:119#0/1
CAGGGCGCGAATGNTTTGAGAGGGANATTGGAAANNNNNGATAGANNGGNCTATNNTGNNNNNNNNNNNNNNNNNN
+HWI-EAS179_0001:5:1:7:119#0/1
HIHHHGHHHFDHH#EHHH?HHHDH>#[email protected]@?##########################################
import sys
with open('1000_reads.fastq', 'rU') as f:
data = f.readlines()
bytes = sys.getsizeof(''.join(data))
kbytes = float(bytes) / 1024
mbytes = kbytes / 1024
print mbytes
## 2.10134029388
Our small sample file takes up 2.1013
MB on disk and 2.1013
MB in memory.
Uh oh! My exome sequence is 10.5 GB. Do I need 10.5 GB of RAM?
Instead of reading in our entire data at once (like ordering a super-size value meal) we can grab and use only as much as we need at once (like making multiple trips to the salad bar).
In Python, this type of interaction is known as a generator object.
def countTo(n):
""" Count to n in increments of one """
num, nums = 0, []
while num <= n:
nums.append(num)
num += 1
return nums
counting = countTo(5)
print counting
## [0, 1, 2, 3, 4, 5]
Let's turn this function into a generator:
def countTo(n):
""" Count to n in increments of one """
num = 0
while num <= n:
yield num
num += 1
counting = countTo(5)
print counting
print [i for i in counting]
## <generator object countTo at 0x1039a2410>
## [0, 1, 2, 3, 4, 5]
We can treat the generator like a list and get our results as we iterate.
def countTo(n):
""" Count to n in increments of one """
num = 0
while num <= n:
yield num
num += 1
counting = countTo(5)
print [i for i in counting]
counting = countTo(5)
print iter(counting).next()
print iter(counting).next()
print iter(counting).next()
print iter(counting).next()
print iter(counting).next()
print iter(counting).next()
## [0, 1, 2, 3, 4, 5]
## 0
## 1
## 2
## 3
## 4
## 5
Each iteration yields
a result when you request it. That way you never make the entire list in memory.
class fastqReader:
"""
A class to read the name, sequence, strand and qualities from a fastq file
file = the file name of a fastq file
"""
def __init__(self, file):
self.file = open(file, 'rU')
self.read = read
def __iter__(self):
"""
Return read class: (name, sequence, strand, qualities).
"""
for i, line in enumerate(self.file):
if i % 4 == 0:
name = line.strip()[1:]
elif i % 4 == 1:
sequence = line.strip()
elif i % 4 == 2:
strand = line.strip()
elif i % 4 == 3:
qualities = line.rstrip('\n\r')
yield self.read(name, sequence, strand, qualities)
def __enter__(self):
return self
def __exit__(self, *args):
self.file.close()
%
= "modulus", more commonly known as the remainder after division
class read(object):
"""
A class to hold features from fastq reads.
"""
def __init__(self, name, seq, strand, qual):
self.name = name
self.seq = seq
self.strand = strand
self.qual = qual
self.dict = {'A':'T', 'T':'A', 'C':'G', 'G':'C', 'N':'N'}
def __getitem__(self, key):
return self.__class__(self.name, self.seq[key], self.strand, self.qual[key])
def __repr__(self):
return 'name: {0}\nseq: {1}\nstrand: {2}\nqual: {3}\n'.format(\
self.name, self.seq, self.strand, self.qual)
def seqlen(self):
return len(self.seq)
def reverse(self):
""" Reverse the order of self """
return self.__class__(self.name, self.seq[::-1], self.strand, self.qual[::-1])
def complement(self):
""" Take the compliment of read.seq """
compseq = ''.join(map(lambda x: self.dict[x], self.seq))
return self.__class__(self.name, compseq, self.strand, self.qual)
import sys
import lecture12
with lecture12.fastqReader('1000_reads.fastq') as fq:
read = iter(fq).next()
print read
read = iter(fq).next()
print read
bytes = sys.getsizeof(read.__repr__())
kbytes = float(bytes) / 1024
mbytes = kbytes / 1024
print mbytes
## name: HWI-EAS179_0001:5:1:7:119#0/1
## seq: CAGGGCGCGAATGNTTTGAGAGGGANATTGGAAANNNNNGATAGANNGGNCTATNNTGNNNNNNNNNNNNNNNNNN
## strand: +HWI-EAS179_0001:5:1:7:119#0/1
## qual: HIHHHGHHHFDHH#EHHH?HHHDH>#[email protected]@?##########################################
##
## name: HWI-EAS179_0001:5:1:7:191#0/1
## seq: GAATCGATGAAGTNCGGCTTTTTGTNGATGATGTNNNNNATGCGGNNGCNCTGTNNGTNNNNNNNNNNNNNNNNNN
## strand: +HWI-EAS179_0001:5:1:7:191#0/1
## qual: G=EFHHDHF=9D<#D9:[email protected];HH=H0#B6?B##############################################
##
## 0.000264167785645
Every time we iterate over our fastqReader
object, we receive a new read
object. Instead of using 2.1013
MB of memory, now we use only 0.0024
MB of memory! That leaves us plenty of to make our program useful.
import lecture12
from textwrap import fill
with lecture12.fastqReader('1000_reads.fastq') as fq:
starts = []
for read in fq:
starts.append(read.seq[0])
print fill(''.join(starts))
## CGTAGTCTTCAATCGTTTAGTAGGGCGACTTATGTTACAGTAGCTATCAAGAACTAAGGGGGACATGAAT
## TACAAAGGGTCCGAAGTAGATGCTAAAAGTAACCACTGGTCCAGGAGGATAAAGGATCTGTAGAAGCTTG
## TTCCTTTTTACCACTGGGAGTCTCAAACTTTGTGGCTAGAGGAAGCTAAAATAAGACGTTCTGGCAGATA
## GAGGGTCTCATTCAGAGATGAAATAGTTTAGTTATGATTTGTGAGAGCGATGACATGACAAAAGTAATAC
## AAATCTATCCTTAAGTAAACTCTTGCACTGAGCCTGAGCCGTTTAATAGAGAGCAGTTGCAGAAGGGGCA
## ACCGGCATCTCTTAGTCGCTGACGGACTGAGAGATTGCAGTTACCTGGTGAGTGCGAAATTGGTCGCAAA
## TGATAAGACATTCACTTGGAATCCATCCCTTTAAACTCGTGCTAGTAGTGGACGGTGGATGCCTAATGGT
## GTGCACCCAACAACCATACGTTAGGGTGAGGGTACTGGACCCGAGCCCAATCCGGTTGGATATGGACCGG
## TAGTGTTGTATCGGCAAAATGCACTGCGGTATTTAATTTCAGGACTTAGCTAGTAGTATGAACCAAAGGA
## CGCATTACTATAGTAGAGGAGACTTTGAATTCGATTTTTGCGANAGAAGTATGTAACGAAATCGAGAAAC
## GGGGGGCAAAGTGGTTATATCTGAGAATACCGCCGTGATCTAGTATCGATTTTGCCGAGGCCCCCGTGAG
## ACCAAAGGACCATGAATCANTGCGCAGGCGGGTTCCACGCCAGCAAGAATGTAGATTACGGACTTTGACC
## TTAAATTGTTATGAATTTTGAAAGATTAGCTGTGTGTGAAGAACTGTTATAATAGTTACAGTAGAAGCTT
## GGGCGCGCACGAGACCTTAATGTTGGTCAGAGTTACGAGTAGCTATGTGCGTTGGGAGAGCGGGAGGGTA
## CTAAAAAGATAGATATTAAGACAGTAGAAACAGCGAGATTAGATTACTAAGCGACCGAGCAATCGGAGCG
## GGGTAGCAGGACGAAAACAACTAAAGACCGAGTGACAGAAGTTTTGAATCGAGGATTTGCATGCGAGCCC
## TAAGAAACCCTAAGACTGCACTTAGAAGGAGTCAATAATAAGNGACCTCCCAGCGGTGGATTCGCGGTAG
## TCCGTGCAAAGCAAAAAACAAATACTCAATGAATCAGACCGGTGGGACCGATAATAGACCATATGATTTG
## AATGTGGATCCACTGGCTGAGATATGAAAAGAGACTCTCTTTTTAGGGTGGCGCAGTCTCCAAGAGGTAT
## TCGCCGTAGCTGAAGGTAAGCAGCCGTGAGACGTTTTGGTATAGGACTGCTCCAAAAGTCGGCTTCCAAT
## GCTTTGTTTTATCGCACACAGGAGGCTATTGCGGTATAGTCGAAAGAATGCATATGATTTATTTCACCAT
## GGGGCGGTTGCGTACGACAAAGTATTTAGAGGAACAACTTGATGGATGGTCCACGCAAGGGCTCAGTTGA
## TAGCAAATAAATACGATTCCCGGTAATGTTCATAAAATTAACTTCGAAACCAAAAGTTAAGCGTGGCACA
## TAGAGAATGACGAGGGACGTGATAGAGCAGAAAAGTTATCTTGTTGGTGCCACTAAACTAACACAGTAGA
## AGTGGACAACTTGAGAGACTGGGACGCTCAGGGTAGTAAAACGACAGAAAGTCTTTGGGGCATCGTACTC
## GGCTGAAGAGAATGGATGTAGGATCGAAATCAAAGCATTTTTACTAGTATCAGTGAATGTTAAGACAGCT
## AAACAAACGTCTTTGTGTAGAACAGATACACGGGGACAGAAATCAACAGAGCGTAGCGCCAGGTGCAGAT
## TCGGGTGATTAGACCAGGCGCGCATGAGTTGAATAATGTAGTAGTAAACGGAAATAAACAGAGTGGCAAC
## TACGACACTAAAAATGGAATATACAAGAGTAAGGGGAAGTCTGCAATAGTACTGTGCCGAATTTGGAGGG
## ATAAAAAACGACTGGGGAAGTTTCGCTGGAAAAGGTATAGAAGTAACGACAGGGTATAGCTCTAACAAGA
## AGTTCACGATAGAAGGACAGGAATAACAAACTCATATTCAGATAAGTTATCGTGGAGGGGGGTGGGAAGT
## CCTACCTGAGTGAGTGGTAGAGTGTGTCTAGCACCTAGTGAGGAACGTGTATTATGAGTGACAGAACGGG
## GACGCATGGGGTCATACTCAGAACAGTAAAGGTGACTATAATAAACGTCGACTGCTTGGGGAGTTTAGTT
## TAGTCTCGGGATCGTACCCAGGGCTAGGTAGGCACATAAACTCAGGTACGTGTGAGGGCGAGCGATGTGA
## CGATATACTCAATTGCTCTCCCCCGGAGACAGAAAGGGGTAGTATGCGAAAGTTGATCGCTCAGTGTGCG
## CACAACGGCAGTGAGTGATCGAGAGCGGCGATATGAGGCGAGTACTGAGCTAGGAGCTATATGTAGAGCT
## GTATGATGATCGAATTTCGAACAAGATGCGTTTTAGCGAACGTCTCTAATAGGTCACTATTACCACTCTT
## AGACAAAGAGACATAAGCTCAAAAATGAGTGACCGCACTTATGAACCTCTACAGAGTGATAAGTAATATA
## CTCTGAGCTTGTATAGAGATTCGTAATGGGCCTGAGATTTCATAGCTTTAAGGTCAACGAATCGGCTTAT
## CATCAGCGATAACTATGCTGGAATGATCCCTAAAAGTTCTGTCTATTGTAGAATTTACTGTATAGAATCG
## GTAAAGACATTGGTATACATTAATCTATTCGGAGATCGCCTATGCTTTGAATGAAGGACAAAATCATGAA
## TATTTTAAAACTTGATGCGGACTTCTAGCTGGTCGCTGGACACAATGTTTGGGAAAAAAATTAAACGGTA
## CGAACAAAGGCCGTGGACAGGAAAGGTTTTGACTTGGGAATTTAAAGGGGAATGAAAATCGATCATAGTC
## AATCGCCAGAAATTGATATGACATGGCCCACTAGATATGATATAGCAGAAGCAAATCTGCCGGCAGAATC
## ACGGTCCGATACTTTTCAGTTAAGGGTAAGTAACCGTTAATCGCGCTCGAGAACCAAATAACATATCCAA
## GATGCATAAACATAGCTGTTGGATCGATGTAGGATGGACTAAATAACAAAAGAAAAGTTATACGCAAGGA
## TCAAGAATAGATAAAGCTTCTAGCTAATTCACTAGAGATTAGGGGTTTCGAGAGAGTACCGAGATACTGC
## CTAGCACCCTGGTAGTAGGCCTGAAACTGGAGGGGCTTATGCTAGAAATATACGATAAGTAATCTTATCT
## GACTAGATCAAACGACACCTAGGTCTGGTCGGTACAAAAATCAGGCATAGTTAGAGGGAACGAGAGAGTG
## GTGTATGCAACTGACGAAGCAAAGTGATATAAAGCAAAGAAGCCCTTCACAGTGAAAGCCAGGTATGATA
## AAGTAGATATAAGGGAGGAAGAGACTCGAGAGAGACACGATTAGTAAGGAAGATTAAAAAGATAACAATA
## TATGTGACGAGCCTAGGCAAAAGAACTCTAGGCTACATAGTGGAACTCTATCGGACCCATCTGAACTGCG
## CTACCGAACAGATTGGTTAACCGTCATAATTGCCCAATAAGAAATTACTAGTAAAAGCAATACTCCGCTG
## GTTTGAACTTCAATGACGGTGAAAACTAGGAAATACGCCAATAGGGGGGCATATGAAGTCCTGTTCTCTC
## CAAGCTGTCGCGACATACAAAAAGAAGAGGAACTCTAGATGATTCCAAGTATAAAATGAGCACATTACAA
## TGTAACGAGGGTGCGGTGAGGGACATCATTAACTTTTATTATAGGAGAATGTTCCATGGAGTCAGGAGAT
## TAGCATCTTCAAGGGCACGTATCTCCCTTATAATGTGCAGAAAGATAGGCCGTTGCCAGCAAGTAATTAA
## AAAAAGAAGCAGTGTGAAGCAGTCGGACCAGAACTGACTTGGGTAAGGCGAAGCATACTGAAATTAGGTC
## GAGAAAAGTAGAATTGGAAAATAAGCGAGGGCAGATCGCCGAAATACGTAAAACAACAGAAAGAACTATC
## TGACCGTTATCTCTTCGTAGTATGCGTAGATTTAGGTAAATTTAAATGCAATAGGCATGGTATAAAATTG
## GAAAACGAATTCAGCCAACATGGATGAACGCCAAATTACAAGGGATCCTATCACTCTTAAAAATACTGAT
## AGCAAGTGTGGACAACAATAAGAGGATTTCAACATGTGGAGTACCCAGATATCCGATTGTATGGGCGTTG
## CATTGTGGGAGTATTGTCTGCGAAGGGGGTACGAATTACCCCAGGACTATATCGACCAAATGAAAAAAAG
## TACTCACAACCAGATGTAAGCCTACATATATCGAGTACCGATGATAAAGAGTCGTCAATATGATACATCA
## AGAGAAAAGGCGACCTATATGTGTAATAAAAGAAGTAAAGCATCCGTACAGTAGGAAGTTATTTATGAAG
## TAGGTATAGACATGAGTCAAAATAAATGATCAAATGATGAGGGAAACCAAATTAACGACCAGGTATAATA
## AGGAATATGAACGCTGATAGTCAAACCTGAAGGAAATGGAATAAATTACTGGATGAATACCTAGCAAAGC
## AAAATAACTAAAGCCAGAATTTACACATAATCCTACTATAAGGGCGGAAGCTAGGTAGAGCAGTATCTCA
## AAATCAGTTGTAAGATCGATATGGGTCGGAGAGAGCATGATATCAATGCGGCACTGGATCAGATCCACTT
## TGAATGAATTTCGGTGAACTAAAAAAGACGTTAAATCCTCCCGTGAACGAAAAATCGCAGATCCAATTAC
## CCGTATAGTCCTTAGAACAAATACAGCACGAGTTCAACTACTACCTCTGTCCGAGACACCGAGTAAAAGA
## AGTACAAGGATTGGACGATTATAAAGAGAGAATAATGTATGAAGTGAGGACAAGGTGAGTCGCGGGGATT
## GGAAATACACGGTAAAAGTCATGGCCGGCTTGCGCATAGCGGTGGAAAATATATTAATACAGCGAGTGTC
## GCCAAGCGCAGCCCAGTATATAACCAGACGCAGCCAAAACAGAAAGGATCTATTATTTTCCGTGATGGTG
## TAAAAAGGCGTACGACATAAGTTTTAATGCTGATTGTTAAGAATGTCAAATTAGTGCTGTGGCTAGATGC
## GAAGCACTTGAATTAACAGCGTGATTTTGTTACTGACCCTACACAAAATAAAGCGTCATTTGAAGAATGA
## TTACGGTATAGGATGAGTATAACGGTATAGTATTAAGGTCACGGATCGGGAAACTAAACCACCGCACGAG
## AGGCATTACGTAAGGTAACTCGGAGGACAAATCCCTCTTTAATTTTTAGGTTACCAAACAGGTAGTAAAA
## CAAGGGCTGTAAAAAAGGGTCTATCAACAGTCATATTCTAACAATCTCAGCGCTTAGATCCAAAAAATAA
## ATCGAATAACTAAGTTGTAGCTGAGAACAGACGGTAACTTAGGAAGGTCCCTCAGAATCATGTGGGAAAC
## GATTCGAATGTACCNCATATATGGAAAAGAGTCCGTTTCCCCAAGCCACATGGATGGATACAGACAATTC
## AAAATCAACTAGAAAACAATTTTGAAAACAGTGTGTTTATTCATTGAAAGGCAAAAGTAAGTACGTCTAC
## GCCCGAGCAACAAACTTACACGCGTAGAAACTTAAGCAAACCACACCGTCAATACATTCCATAAGAATGA
## TTCAGAGCCGAATAAAACGAGAAGAAACATTATAACAGGTGGAAAGACAAAGGGGGAGGCAACACGTGTA
## ACACGAATGGGCATAGGAATCTCACTTTCTACAGAGAATATATTTACTGGTGGTGGCAAGCCAATCCGCA
## TAAAATTGTTCCTTTCTAACGATGAAAGAAAAGGCAACTATCACAATCTAAATGGATAGATGACCTATCC
## GACAAGTTTAATTAGGGTTAGGTCAGGACCAGTAGAGTGAAATGTGACAGGTCCAACACTGAAACAAGAC
## TAGTATGCGCTGTTTAAATACGCAAAAGTGCAAAAAAAAAACTATTTAATTTGAAACATACGAGGTATGA
## AAAACAGGATAGTGGGATGAATGTTAGCTGAACTGTCCCCTAAGATACGCTAAGCTGCCTCAACTTAATA
## TGGGCAGCATAATTAAGCCTGGGAACGAATAGATCCAGAAGCCATTCCCAACCGGTCACCGTAAGAAACT
## TAACGAGGCAGTCAGTCTCAAATTGCGATTTGAGAAATGTGGAAAACTCGAATATGGAAAATTGGAAACC
## TAAATCAAAGGACTAATATCAATTGGCAAATCCTAACAAAAATTTTAGTTACCGTTTGAACAACTGGGAG
## CGGGCACGCTCAAACCAGGCAGACTTTAAAACCTAAGAGAATCGATGAAGGTTTTAGAAAGTGATCAAGT
## AAATGCCATTTAGCTGACAGACGTTGACGGAATGACGCGCACGTAGATTTAGGACAGTTAAAACATAAGG
## ATGTAGTTTCATCCACGGTATCTAAATCTAGCGAAATAACTTAAGGTGTTTGATAGGAAAAAACACAAGG
## TTGAATTGGAATCTTGAACACATTTATTAGCGTCAAAGAGTAAGGGGAAAGGTACACAACTTCACTAACT
## TGCAATTGAGGAATAAGTAGGTCAAGGGTAGAAATCACGAATGGGCAAGAAGTAGTGGGTCTAAACGGCA
## GTAATTTCTAGAATAGCTTCAGTCCGAGTGTGCTAAAGAAATAAACGCAGAAAGGGTGGCAAGCAAAAGT
## GTTCTTTGATTACGCTTTATGGTAGGGACTACTAAACACAGAAGGGAAATAACAAAACATTCCGTCGGCA
## ATCACAGCCATAAACAAATGGAGTGGATTAGCGAGATATACCTAGAAAAGCAAATGCTAAGGGCCGAGTT
## AGCCGTGCAATCATAACATAAAACCATATCCATCCTAAACAAATGTGACGTATTAAGAGAAGCAAAAAAA
## CTACAAGATACCACAACATATAAATTTAGTAAGCAAGGGAGCCAATTTGGGATAAAAGTAAGACAGGTAA
## GATGTGGTCACAGGGAGTCCCAAAGCACGATATAGGTTATCATCCCCCTTGCAAAGAGGATAGAGAAGCC
## AAACAATAAGCGAATCATGAATTGAAATAAACGACATAGTTAGGAATTAGAAATTATATGGGTAAGATAT
## TGTTACTGAACTGATTTATCTTCTTAATGGGACTCGAGTAGGTCCAAAATGCCTATATGGAGTAGACCAT
## ATGTGGCCAAATAGCATGAGCTAACGAGTCGAGTCGATAAACACACAATGTGATCGTTGTGTCGAACCGA
## GGGGGGTGGCTGATGCCTAAGACAACCCTTATAGACAAGGTTACCCTATCATGAAACAAAGTAGATGGGC
## AGTCAGCCATTAGCAAGGACAAGACAAAAGAACATGGGTCAAATGATTGGGCAAATGAGATCTGTAGGTA
## GGCGGGAAAGGAGCATACTCAATAGAAAGACACAGGATGGATATCTATACAGCACTAACGAAGTGCTTAA
## ATAGCGGGAGTAAGAATCAAAAAGGGGAGCCCGAGATTAGGTTAATATTAGAGTAACAATAATTTTACAC
## GAACTATGTGCATCAGAGTCAAATATTCGGAGTGGGACCTTTGGAATTTTCTTAGTAATAGGGCATAGGA
## AGAGATACGGAAAGGGAAATAGATAGTTAGAGCGGCCAATTTTACAAAACATGTTGGGCCAGGTATCACT
## AAAAGAAGGAAGAAGTCAGGTATGACCTAGCGAAGGAGAGGATAGGCCAACCATTGGTAAATTANGACAA
## CGACAACAGTGGCGACACAAACATGTGGAGCAAGAGCTTTAAAAAAGATATTAAAAGGTATACACTCGCA
## ATGTAGAGACAACGGAAATTTTTGACTGATAATAAGCAAACGTATTAAGGTAGTGAATTGCAACCATTCA
## AGATGAAGGCCTCCGGAAAGTACAATACGATAACATCATTAATGAGTGTGTGCTAGAAGACTAAGAAGCA
## TCAGAGGACCGATACATATTGCAGTAGCAAAAACTGTTCATAAGTTAGTTTATGAGAAGATATTAGTACT
## TATAAATGCGGCAAGTCCTGCGCACATAAGAAAGGAAATGCTGAGACTTCGAAGTTGTATGACATATGTG
## GCAAAACATGTGAGGTGTATCTGCAAAAGATCGCGTCATTAGGCAAGTGACTACATTCTTAGAAAAACAT
## TGAGCTAAGCCGAGTGACGTGCAGATTAGTAACGCGCACTTATACAAAATAATACTGGTGATAGNATGTC
## ACATGAACGATATGGTTTGAAAACATGAGTTTGTGCAACAGGCGCGGTACGTGTTATGCCTCCTTAAAGA
## CGAAGGAGTTGAGCATGGGAAAAGTTAGAGTTCTTGAAGATGCCTTTTTGCCTCAGCTAACCAAATAGAA
## GTGCCTACGATGTTGATCGGCGTCCAAATATCAGCAGTCAGGGCTTAAATCTAAAGACTAAATTAAAGGT
## ATTATAGAGTGAGAGTTCACCCAAACTCAACATGTAGAAGGATATCAGGTTTCATTACCATTACCAAGAG
## ATACGAACTGGAAAGGTACATCGAAGACGGAGGGGGGCGCAGCCGAAGATAACTTTTTTGAGACTCATGC
## ATTATGAATACGCGAGGGCTCGATGGGGCTCGGTGTGCAGGGAGTGTGATGAATAATTTAATTGAAGCAG
## ATGGCCCAACTAACGCATAAGAAAAACAGTCGTGATGACATAGTGTTTTGTGATTGGAATATTACAGACT
## CTCCACTACTAAAAGAGCGAGATGTACAGGTAAAGACATATTAGATAAGAGGTAAATGGCGGCTAATAAA
## GATTTATGCTGCAATGAGGAAAGAGGTGTGTGAGGGTTAGAGAGGAGCCCAGGAGGAAAACGCAAGTATT
## GTAATAGCTATCAAGTCATTAAGCGAATAGATACTAGGACTTGATGTCGAAGTAATCATCGAAGCGAGGG
## CCAAGTGAAGAGTCACTGATTCTATGGAGGATATCCGATAAATGAAAAGTAACTAGTTATGCGCCAAGTG
## GAAAAGACAGGACAACGAGGACGCAGGCAATGCATCGCATGGAGCATGATGAACCCAACATGACAACGGT
## TATTCAGTGTCAAGAGTGAGAATGCATCTATGTACTCGCCAAAGGGGAAGGAGAAATTACTGCGTCACGT
## TCAATAAATTTTTGTAAGTTTGTAAACAAGAAACTGAATGCAAAAGCAAGGATTGTATACGGTCATCGAT
## CTTATAGTGAAATAGCGAAAGATTTTCGAGAAACAATTACACAGCAAAAGCATAGAAGTGGACTTAGTCT
## AAAAGGACATGACAACATCCTAAACGGCATTGCATAACGAGAAAAGGAGTAGACGCCATGCCAGTATTTA
## GTCAGTATATAACGGACACTCACCATTAAACATAATTTAGTACCAAATGTAAAAAGCGCACGCGCGTAGC
## AATTATGGTTTAATAGAAAAAACTGGAGAAACCATACATTATAAATATCAAAGGACGGTGAGAGAAGTAG
## ATCGGGGAGGGAGAATTTATACGAAAGTAGCAAGTAGGTACCAAGCTTCGAAACTCAAATAAAGACAACC
## TTGAGAAGATCGATATGAAAGACAGAGGGGTAACTATAATCTACCCGGGGGATCAAATTCAAATTCACTA
## CGTGTGGAAGAGTCGAAGACCGACAGCGGTGGGACTGCACCAAGTCCCACAAGCTAGGCCAACTACATGT
## AATAGACTCCAGACAAGCAAAAGATCATGATTGCGAAGTAAAGCAGCAATTGTGTGGGTATGAATAAAGG
## GACATGCGTTCCGATGCTACAAGGAATGACATTACAGTACGGCTGCAATAAGAGGTTGGG
read
class with a method for calculating the "reverse complement" of the sequence1000_reads.fastq
and for each read with > 40% GC content prints:
Points will be awarded for the following criteria:
The code and python module used to generate these slides can be found at https://github.com/mdshw5/BIOF309.
Start by downloading lecture12.py, which contains the classes and methods detailed in this lecture.
Are you a hacker yet? (check out the image and see if it looks familiar)