Last active
November 20, 2016 22:22
-
-
Save stefanahman/127c1aaac06f006a3c22ec2c8a894276 to your computer and use it in GitHub Desktop.
Python lab
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>where_is_the_exon | |
ATCAATTCGATTCCAAAACTGTTTTTAACTGTATTGCTAAGCTGCCTTCCACTAGGTTGG | |
TCAGGAAGGATAGTAGAGACAGAGGAAATGGCAGAAGGGACTGGGGGGGTGGGGACAAGA | |
TGCTGGCTCTGTTGCATTTTGAAGGCCACATCTGCCAGCCTGGGAGTGGCCTGTGGTGGC | |
CAGGCAACCCAGCTTGATGTTGCCTCTTCTGTCCTGAGGATGGAACAGAGGCAGGTGAGA | |
AGCTTTCTGTGGCTGCTGCAGCAAAACCAACCTATGACCAGGGGCTTTGGCTGCCATTGC | |
CCTCCCTCCAGCCAGAAGTAAGGAGAAGGGAGCCATGCTTTTGGTGGGTAAGCTCCGCGA | |
ACAAGAACTGTGGCCCTGGGAGCTTTACTCTCAGTCCTAGCTTCTCCAGGAGAGGGTCAG | |
CCACTGGATGCTTGGAGTCCACACTAACAGCACAACTGATTCTCGGCAGCTGACGACGGG | |
AGTGGAGTATATAGTCACTGTGAAGATTGGCTGGACCAAATGCAAGAGGAATGACACGAG | |
CAATTCTTCCTGCCCCCTGCAAAGCAAGAAGCTGAGAAAGGTGTGTAGTGGAAGTCATCC | |
CAAAGGGACTTGTGAAGTGAATATTTTAAAAAGCAACAGCTAGAAGTTGGCAGTTTTGCC | |
ACTTGTGTGGATTGGGGCCAGCTGGGGCCCAGCCCTGGGGCTGTCTGGCACCTCCTGAGG | |
CACCTGGGTGAAGGAGGGAGGCTTTATATTTGGGCTGGTACAGCCCTGCACCCAGGCCTC | |
TGGTTGCCTGGACCCTTCTCCAACCTGGCCTGCTCCCCAGACCCTCTGCGTGAAAGCAGT | |
GGTTCCCTTTCCTTGGGGTCCTTCTTCTGTCTTCTTTCTCCCTCTCCAGGTCTCTAAGGT | |
TGTCCAATTTCCTCCTTGTTCCCTTCTATTTTTCCTGGTCCATGAGTGGCACCCCTTGAA | |
AGCTGGCAGATACAAGGGGCTCCCCCATGCCTGGGGAGGTGCAGGAGGAGGTAACGATGC | |
TATTGCTCCTTGCTGGCCTGCCTGGCCCACATGACAGGGTCCGGCCAGGCCATAGACTCC | |
AGGTTCACCTGCTGTGGGTCCCAAGTGGCTGGGTAGGGGGAAGACAGGGTGACATTCCTG | |
TTCCAGACTGGCACTCCTTTCTGTTTCTACCCTTGACTGTGGGCAGCTCCCCTCTCTGCC | |
CATGGGGTTCATGAGCAGCCACAGGGAGAGACCTGGGGAACAGGTGCCTGTTGCTTGCCT | |
TTGAAAAATACTTCCCCCAAAGTCTGTTTTCTTTCTTCTTCTACAGAGTTTAATTTGCGA | |
GTCTTTGATATACACCATGCCCTGGATAAACTATTTCCAGCTCTGGAACAATTCCTGTCT | |
GGAGGCCGAGCATGTGGGCAGAAACCTCAGATGAGGGCTCATATGATTGAGTTGTGCACT | |
GGCTGTTATTAAACTGTAAAGGATCA |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
def random_sequence(length):
    """Return a random DNA sequence of ``length`` nucleotides.

    Each position is drawn independently with ``random.choice`` from the
    four letters A, G, C and T.  A ``length`` of zero or less yields an
    empty string.
    """
    # Build the string with a single join instead of growing it with ``+=``
    # on every iteration.
    return "".join(random.choice("AGCT") for _ in range(length))


if __name__ == "__main__":
    # NOTE(review): on Python 2 ``input()`` evaluates whatever is typed; use
    # ``raw_input()`` there if the prompt can receive untrusted text.
    # ``print(...)`` with a single argument behaves the same on Python 2 and 3.
    print(random_sequence(int(input("Length of sequence: "))))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# How to run: | |
# cat Stockholm-Test.txt | python lab1_3.py | |
import fileinput | |
import re | |
def collect_sequence_names(lines):
    """Return the sequence names found in Stockholm-formatted ``lines``.

    A line contributes a name when it is not blank, does not start with
    ``#`` (markup/comment lines) or ``/`` (the ``//`` terminator), and holds
    at least two whitespace-separated fields.  The first field is taken as
    the sequence name.

    Args:
        lines: any iterable of text lines (a file object, a list, ...).

    Returns:
        The names, in input order, as a list of strings.
    """
    names = []
    for line in lines:
        if not line.strip() or line.startswith(('#', '/')):
            continue
        fields = line.split()
        # NOTE(review): a line holding only a name (no sequence column) is
        # skipped instead of raising ValueError on a two-way unpack; confirm
        # that such entries should not be counted.
        if len(fields) < 2:
            continue
        names.append(fields[0])
    return names


if __name__ == "__main__":
    sequence_names = collect_sequence_names(fileinput.input())
    # Output: the count, an empty line, then one name per line.
    print(str(len(sequence_names)) + '\n')
    print('\n'.join(sequence_names))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# How to run: | |
# python lab1_4.py < Stockholm-Test.txt | |
import sys | |
import re | |
# Name column: word characters plus ".", "/" and "-" (for example
# "X55896.1/1041-966").  Sequence column: word characters plus the gap
# symbols "-" and ".".  Nothing but whitespace may follow the sequence.
_SEQUENCE_LINE_RE = re.compile(r'^[\w./-]+\s+[\w.-]+\s*$')


def is_valid_sequence_line(line):
    """Tell whether ``line`` is a "<name> <sequence>" alignment line.

    Lines starting with ``#``, the ``//`` terminator, blank lines and lines
    with a name but no sequence do not match.  Names may contain dots, so
    identifiers such as ``AE006471.1/61593-61518`` are accepted.  Lines with
    more than two columns are rejected.

    Returns:
        A regex match object (truthy) when the line is valid, else ``None``.
    """
    return _SEQUENCE_LINE_RE.search(line)
def print_line_in_fasta_format(line):
    """Print one "<name> <sequence>" line as a FASTA record on stdout.

    The record is a ``>name`` header followed by the sequence wrapped at
    60 characters per line.

    Args:
        line: text consisting of exactly two whitespace-separated fields.

    Raises:
        ValueError: if ``line`` does not split into exactly two fields.
    """
    sequence_name, sequence = line.split()
    # ``print(...)`` with one argument behaves the same on Python 2 and 3,
    # whereas the bare ``print x`` statement is a SyntaxError on Python 3.
    print('>' + sequence_name)
    chunks = [sequence[i:i + 60] for i in range(0, len(sequence), 60)]
    print('\n'.join(chunks))
def main():
    """Convert the alignment lines read from stdin to FASTA on stdout.

    Every line accepted by ``is_valid_sequence_line`` is handed to
    ``print_line_in_fasta_format``; all other lines are ignored.
    """
    # Iterate the stream directly: ``readlines()`` would load the whole
    # input into memory before the first record is printed.
    for line in sys.stdin:
        if is_valid_sequence_line(line):
            print_line_in_fasta_format(line)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# How to run: | |
# python translate.py < translate.fa | |
import sys | |
import re | |
# Maps each DNA codon (three upper-case letters) to a one-letter amino-acid
# code; the three stop codons TAA, TAG and TGA map to '_'.  Each entry
# below lists one amino acid followed by the codons that encode it.
codontable = {
    codon: amino_acid
    for amino_acid, codons in (
        ('I', 'ATA ATC ATT'),
        ('M', 'ATG'),
        ('T', 'ACA ACC ACG ACT'),
        ('N', 'AAC AAT'),
        ('K', 'AAA AAG'),
        ('S', 'AGC AGT'),
        ('R', 'AGA AGG'),
        ('L', 'CTA CTC CTG CTT'),
        ('P', 'CCA CCC CCG CCT'),
        ('H', 'CAC CAT'),
        ('Q', 'CAA CAG'),
        ('R', 'CGA CGC CGG CGT'),
        ('V', 'GTA GTC GTG GTT'),
        ('A', 'GCA GCC GCG GCT'),
        ('D', 'GAC GAT'),
        ('E', 'GAA GAG'),
        ('G', 'GGA GGC GGG GGT'),
        ('S', 'TCA TCC TCG TCT'),
        ('F', 'TTC TTT'),
        ('L', 'TTA TTG'),
        ('Y', 'TAC TAT'),
        ('_', 'TAA TAG'),
        ('C', 'TGC TGT'),
        ('_', 'TGA'),
        ('W', 'TGG'),
    )
    for codon in codons.split()
}
def prepareForNewSequence():
    """Reset the module-level parsing state ahead of a new record.

    Rebinds four globals: ``stops`` to a fresh empty list, ``sequence`` and
    ``longest_orf`` to the empty string, and ``ongoing_sequence`` to False.
    """
    global ongoing_sequence, longest_orf, sequence, stops
    stops = []
    sequence = longest_orf = ""
    ongoing_sequence = False
def calculateLongestOrf():
    """Find the longest open reading frame in the global ``sequence``.

    Any position may start a reading frame, except one where a stop codon
    (TAA, TAG or TGA, matched case-insensitively) begins.  A frame ends at
    the first later stop codon that lies a multiple of three away from the
    start, or, if there is none, after the last complete codon of the
    sequence.  The longest such slice of ``sequence`` is stored in the
    global ``longest_orf`` when it is longer than the value already there,
    so the earliest start wins ties.  The stop positions used are appended
    to the global ``stops`` list.
    """
    global longest_orf, stops
    length = len(sequence)
    stops += [m.start() for m in re.finditer('(TAA|TAG|TGA)', sequence.upper())]
    # End-of-sequence sentinels for frames that never reach a stop codon.
    # Slices exclude their end index, so a frame running to the end must be
    # cut at length-2, length-1 or length (whichever is in frame) to keep
    # its last complete codon.  Example with length 10 and start 0: the cut
    # is at 9, leaving one nucleotide that cannot be translated.
    stops += [length - 2, length - 1, length]
    # Set copy for O(1) membership tests; the list keeps its order for the
    # "first matching stop" scan below.  The sentinels length-2 and length-1
    # also land in the set, which is harmless: no complete codon can start
    # at those positions.
    stop_positions = set(stops)
    for start in range(length):
        if start in stop_positions:
            # A frame cannot begin on a stop codon; try the next position.
            continue
        for stop in stops:
            if stop > start and (stop - start) % 3 == 0:
                candidate = sequence[start:stop]
                if len(candidate) > len(longest_orf):
                    longest_orf = candidate
                break  # first in-frame stop ends this frame
def translateLongestOrf():
    """Print the translation of the global ``longest_orf`` as a FASTA record.

    ``longest_orf`` is read three letters at a time; each triplet is
    upper-cased and looked up in ``codontable``, with 'X' used for any
    triplet that is not in the table.  The output is a ``>`` header built
    from the global ``sequence_name`` followed by the translated string
    wrapped at 60 characters per line.
    """
    codons = (longest_orf[i:i + 3].upper() for i in range(0, len(longest_orf), 3))
    translated = "".join(codontable.get(codon, 'X') for codon in codons)
    # ``print(...)`` with one argument behaves the same on Python 2 and 3,
    # whereas the bare ``print x`` statement is a SyntaxError on Python 3.
    print('>' + sequence_name)
    print('\n'.join(re.findall('.{1,60}', translated)))
def main():
    """Read FASTA records from stdin and print each one's translated longest ORF.

    A line starting with ``>`` opens a new record.  The lines that follow
    are stripped and concatenated into the global ``sequence`` until the
    next header or the end of input, at which point the record is processed
    with ``calculateLongestOrf`` and ``translateLongestOrf``.  Lines that
    appear before the first header are ignored.
    """
    global sequence, sequence_name, ongoing_sequence
    prepareForNewSequence()
    # Iterate the stream directly rather than loading every line first.
    for line in sys.stdin:
        if line.startswith('>'):
            # Finish the current record before starting the next one.
            if ongoing_sequence:
                calculateLongestOrf()
                translateLongestOrf()
                prepareForNewSequence()
            # group(1) is the leading run of word characters after '>'.
            # Headers that do not start with a word character used to be
            # mistaken for sequence data; they now open a record too, named
            # by the header text.
            header = re.search(r'^>(\w+).*', line)
            sequence_name = header.group(1) if header else line[1:].strip()
            ongoing_sequence = True
        elif ongoing_sequence:
            sequence += line.strip()
    if ongoing_sequence:
        # The last record has no following header, so flush it here.
        calculateLongestOrf()
        translateLongestOrf()


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# STOCKHOLM 1.0 | |
gene4711 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT | |
hubba ACGTACGTACGTACGTACGTACGTANNNNNNNNNNTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT | |
gene4712 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT | |
// |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# STOCKHOLM 1.0 | |
only10chars ACGTACGTAC | |
emptysequence | |
onenucleotide ---------- | |
// |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# STOCKHOLM 1.0 | |
#=GF AC RF00107 | |
#=GF ID FinP | |
#=GF DE FinP | |
#=GF AU Griffiths-Jones SR | |
#=GF SE Griffiths-Jones SR | |
#=GF SS Published; PMID:9917389 | |
#=GF GA 45.00 | |
#=GF TC 50.80 | |
#=GF NC 43.10 | |
#=GF TP Gene; | |
#=GF BM cmbuild -F CM SEED | |
#=GF CB cmcalibrate --mpi CM | |
#=GF SM cmsearch --cpu 4 --verbose --nohmmonly -E 1000 -Z 549862.597050 CM SEQDB | |
#=GF DR SO; 0000644; antisense_RNA; | |
#=GF RN [1] | |
#=GF RM 9917389 | |
#=GF RT Degradation of FinP antisense RNA from F-like plasmids: the RNA-binding | |
#=GF RT protein, FinO, protects FinP from ribonuclease E. | |
#=GF RA Jerome LJ, van Biesen T, Frost LS | |
#=GF RL J Mol Biol 1999;285:1457-1473. | |
#=GF RN [2] | |
#=GF RM 14633993 | |
#=GF RT FinO is an RNA chaperone that facilitates sense-antisense RNA | |
#=GF RT interactions. | |
#=GF RA Arthur DC, Ghetu AF, Gubbins MJ, Edwards RA, Frost LS, Glover JN | |
#=GF RL EMBO J 2003;22:6346-6355. | |
#=GF CC The FinOP system regulates the transfer of F-like plasmids. FinP encodes | |
#=GF CC an antisense RNA product that is complementary to part of the 5' UTR of | |
#=GF CC the traJ mRNA. The traJ gene encodes a protein required for transcription | |
#=GF CC from the major transfer promoter, pY. The FinO protein is essential for | |
#=GF CC effective repression, acting by binding to FinP and protecting it from | |
#=GF CC RNase E degradation. | |
#=GF WK FinP | |
#=GF SQ 6 | |
AE006471.1/61593-61518 GACACAUAGGAACCUCCUCA--AAGGAUUCUAUGGACAGUCGAUGCAGGGAGU-GACAG-CUCCCUGUAUCGGCGAUUUA | |
X55896.1/1041-966 CACACAUAGGAACCUCCUCA--AAGGAUUCUAUG-ACAGUCGAUGCAGGGAGG-GACAAGCUCCCUGCAUCGGCGAUUUU | |
U01159.2/1039-961 GAUACAUAGGAACCUCCUCACAAAGGAUUCUAUGGACAGUCGAUGCAGGGAGUUCACGU-CUCCCUGCAUCGGCGAUUUU | |
X55894.1/1036-958 GAUACAUAGGAACCUCCUCACAAAGGAUUCUAUGGACAGUCGAUGCAGGGAGGAGAGAA-CUCCCUGCAUCGGCGAUUUU | |
AF389529.1/1113-1037 GAUACACAGGAGCCUCCUCA--AAGGAUUCUAUGGGUAGUCGAUGCAGGGAGG-GACCAACUCCCUGCAUCGGCGAUUUA | |
M20941.1/217-140 GAUACAUAGGAACCUCCUCUCAAAGGAUUCUAUGGACAGUCGAUGCGGGGAGGUCGCU--CUCCCUGCAUCGGCGAUUUU | |
#=GC SS_cons ::::<<<<<<<-<<<________>>>->>>>>>>----<<<<<<<<<<<<<<_______.>>>>>>>>>>>>>>:::::: | |
#=GC RF gAuACauaGgaACCuCCUCAcAAaGGAuuCuauGGAcAGuCGauGCaGGGaGgugAcaa.CuCCCuGCauCGgCGAUUUu | |
// |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment