Skip to content

Instantly share code, notes, and snippets.

@stefanahman
Last active November 20, 2016 22:22
Show Gist options
  • Save stefanahman/127c1aaac06f006a3c22ec2c8a894276 to your computer and use it in GitHub Desktop.
Save stefanahman/127c1aaac06f006a3c22ec2c8a894276 to your computer and use it in GitHub Desktop.
Python lab
>where_is_the_exon
ATCAATTCGATTCCAAAACTGTTTTTAACTGTATTGCTAAGCTGCCTTCCACTAGGTTGG
TCAGGAAGGATAGTAGAGACAGAGGAAATGGCAGAAGGGACTGGGGGGGTGGGGACAAGA
TGCTGGCTCTGTTGCATTTTGAAGGCCACATCTGCCAGCCTGGGAGTGGCCTGTGGTGGC
CAGGCAACCCAGCTTGATGTTGCCTCTTCTGTCCTGAGGATGGAACAGAGGCAGGTGAGA
AGCTTTCTGTGGCTGCTGCAGCAAAACCAACCTATGACCAGGGGCTTTGGCTGCCATTGC
CCTCCCTCCAGCCAGAAGTAAGGAGAAGGGAGCCATGCTTTTGGTGGGTAAGCTCCGCGA
ACAAGAACTGTGGCCCTGGGAGCTTTACTCTCAGTCCTAGCTTCTCCAGGAGAGGGTCAG
CCACTGGATGCTTGGAGTCCACACTAACAGCACAACTGATTCTCGGCAGCTGACGACGGG
AGTGGAGTATATAGTCACTGTGAAGATTGGCTGGACCAAATGCAAGAGGAATGACACGAG
CAATTCTTCCTGCCCCCTGCAAAGCAAGAAGCTGAGAAAGGTGTGTAGTGGAAGTCATCC
CAAAGGGACTTGTGAAGTGAATATTTTAAAAAGCAACAGCTAGAAGTTGGCAGTTTTGCC
ACTTGTGTGGATTGGGGCCAGCTGGGGCCCAGCCCTGGGGCTGTCTGGCACCTCCTGAGG
CACCTGGGTGAAGGAGGGAGGCTTTATATTTGGGCTGGTACAGCCCTGCACCCAGGCCTC
TGGTTGCCTGGACCCTTCTCCAACCTGGCCTGCTCCCCAGACCCTCTGCGTGAAAGCAGT
GGTTCCCTTTCCTTGGGGTCCTTCTTCTGTCTTCTTTCTCCCTCTCCAGGTCTCTAAGGT
TGTCCAATTTCCTCCTTGTTCCCTTCTATTTTTCCTGGTCCATGAGTGGCACCCCTTGAA
AGCTGGCAGATACAAGGGGCTCCCCCATGCCTGGGGAGGTGCAGGAGGAGGTAACGATGC
TATTGCTCCTTGCTGGCCTGCCTGGCCCACATGACAGGGTCCGGCCAGGCCATAGACTCC
AGGTTCACCTGCTGTGGGTCCCAAGTGGCTGGGTAGGGGGAAGACAGGGTGACATTCCTG
TTCCAGACTGGCACTCCTTTCTGTTTCTACCCTTGACTGTGGGCAGCTCCCCTCTCTGCC
CATGGGGTTCATGAGCAGCCACAGGGAGAGACCTGGGGAACAGGTGCCTGTTGCTTGCCT
TTGAAAAATACTTCCCCCAAAGTCTGTTTTCTTTCTTCTTCTACAGAGTTTAATTTGCGA
GTCTTTGATATACACCATGCCCTGGATAAACTATTTCCAGCTCTGGAACAATTCCTGTCT
GGAGGCCGAGCATGTGGGCAGAAACCTCAGATGAGGGCTCATATGATTGAGTTGTGCACT
GGCTGTTATTAAACTGTAAAGGATCA
import random
# Ask for a length and print that many randomly chosen nucleotides.
# NOTE(review): on Python 2 the built-in input() evaluates the typed text as
# an expression; only feed this script trusted input there.
length = int(input("Length of sequence: "))
# Build the string in one pass instead of growing it with += in a loop.
sequence = "".join(random.choice("AGCT") for _ in range(length))
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(sequence)
# How to run:
# cat Stockholm-Test.txt | python lab1_3.py
import fileinput
import re
# Collect the name (first whitespace-separated field) of every sequence line.
# fileinput.input() reads the files named on the command line, or stdin when
# none are given.
sequence_names = []
for line in fileinput.input():
    # Skip blank lines, '#' markup lines and the '//' terminator line.
    if not line.strip() or line.startswith(('#', '/')):
        continue
    # Only the first field is needed. Taking it by index (instead of unpacking
    # exactly two fields) keeps lines with a missing or space-containing
    # sequence from raising ValueError.
    sequence_names.append(line.split()[0])

# Output: the number of names, an empty line, then one name per line.
print(str(len(sequence_names)) + '\n')
print('\n'.join(sequence_names))
# How to run:
# python lab1_4.py < Stockholm-Test.txt
import sys
import re
def is_valid_sequence_line(line):
    """Tell whether *line* looks like a ``<name> <sequence>`` alignment line.

    The name may contain word characters, ``.``, ``/`` and ``-`` (so names of
    the form ``AE006471.1/61593-61518`` are accepted); it must be followed by
    whitespace and at least one sequence character (word character, ``-`` or
    ``.``).  Lines starting with ``#``, the ``//`` terminator, blank lines and
    lines with a name but no sequence do not match.

    Returns:
        A match object (truthy) when the line matches, otherwise ``None``.
    """
    # re.match anchors at the start of the string, like the '^' it replaces.
    return re.match(r'[\w./-]+\s+[\w.-]+', line)
def print_line_in_fasta_format(line, width=60):
    """Print one ``<name> <sequence>`` line as a FASTA record on stdout.

    The record is a ``>name`` header followed by the sequence wrapped at
    *width* characters per line.

    Args:
        line: Text holding exactly two whitespace-separated fields, the
            sequence name and the sequence.
        width: Maximum number of sequence characters per output line
            (default 60).  Must be positive.

    Raises:
        ValueError: If *line* does not split into exactly two fields, or if
            *width* is zero.
    """
    sequence_name, sequence = line.split()
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('>' + sequence_name)
    # Plain slicing is enough here: the sequence is a single split() field,
    # so it cannot contain whitespace or newlines.
    sequence_chunks = [
        sequence[i:i + width] for i in range(0, len(sequence), width)
    ]
    print('\n'.join(sequence_chunks))
def main():
    """Convert the sequence lines read from stdin to FASTA on stdout.

    Every line accepted by is_valid_sequence_line() is printed with
    print_line_in_fasta_format(); all other lines are ignored.
    """
    # Iterate the stream directly rather than loading every line up front.
    for line in sys.stdin:
        if is_valid_sequence_line(line):
            print_line_in_fasta_format(line)


if __name__ == "__main__":
    main()
# How to run:
# python translate.py < translate.fa
import sys
import re
# Lookup table from a three-letter DNA codon (upper case) to a one-letter
# amino-acid code.  The three codons TAA, TAG and TGA map to '_'.
codontable = {
    # A..
    'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
    'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
    'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
    # C..
    'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
    'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
    'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
    # G..
    'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
    'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
    'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
    # T..
    'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
    'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
    'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': '_',
    'TGC': 'C', 'TGT': 'C', 'TGA': '_', 'TGG': 'W',
}
def prepareForNewSequence():
    """Reset the module-level parser state before the next record is read.

    Rebinds four globals: ``ongoing_sequence`` to False, ``longest_orf`` and
    ``sequence`` to the empty string, and ``stops`` to a new empty list.
    """
    global ongoing_sequence, longest_orf, sequence, stops
    ongoing_sequence = False
    longest_orf = sequence = ""
    # A fresh list on every call, so positions never carry over between records.
    stops = []
def calculateLongestOrf():
    """Store the longest open reading frame of ``sequence`` in ``longest_orf``.

    An ORF here starts at any position that is not the first base of a stop
    codon and runs, in steps of three, up to (but not including) the next
    in-frame stop codon (TAA, TAG or TGA, matched case-insensitively).  When
    no in-frame stop codon follows, it runs to the end of the last complete
    codon in that frame.  No start codon is required.

    Reads the globals ``sequence`` and ``longest_orf``, extends the global
    ``stops`` list, and rebinds ``longest_orf`` whenever a strictly longer
    ORF is found, so among equally long ORFs the one starting first is kept.
    """
    global longest_orf, stops
    length = len(sequence)
    stops += [m.start() for m in re.finditer('(TAA|TAG|TGA)', sequence.upper())]
    # Positions of real stop codons; a set makes the per-start test O(1).
    stop_codon_starts = set(stops)
    # End-of-sequence sentinels, one per reading frame.  They are exclusive
    # slice ends, so for a length-10 sequence they are 8, 9 and 10: a frame-0
    # ORF ends at 9 (three codons, one base left over).  They are appended
    # after the real stops so a real in-frame stop is always found first.
    stops += [length - 2, length - 1, length]
    for start in range(length):
        if start in stop_codon_starts:
            # An ORF cannot begin on a stop codon; shift to the next position.
            continue
        for stop in stops:
            if stop > start and (stop - start) % 3 == 0:
                if stop - start > len(longest_orf):
                    longest_orf = sequence[start:stop]
                break  # the first in-frame stop ends this ORF; shift
def translateLongestOrf():
    """Print the translation of ``longest_orf`` as a FASTA record on stdout.

    Reads the globals ``longest_orf``, ``sequence_name`` and ``codontable``.
    The ORF is upper-cased and translated three bases at a time; any triplet
    missing from ``codontable`` (including a trailing partial one) becomes
    'X'.  Output is a ``>sequence_name`` header followed by the translation
    wrapped at 60 characters per line.
    """
    orf = longest_orf.upper()
    # Join once instead of growing the string with += for every codon.
    translated = ''.join(
        codontable.get(orf[i:i + 3], 'X') for i in range(0, len(orf), 3)
    )
    translated_chunks = [
        translated[i:i + 60] for i in range(0, len(translated), 60)
    ]
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('>' + sequence_name)
    print('\n'.join(translated_chunks))
def main():
    """Read FASTA records from stdin and print each one's translated ORF.

    A line starting with '>' followed by at least one word character opens a
    new record; the leading run of word characters becomes the record name.
    Every other line is stripped and appended to the current sequence; lines
    seen before the first header are ignored.  Each record is handed to
    calculateLongestOrf() and translateLongestOrf() once it is complete.
    """
    global sequence, sequence_name, ongoing_sequence
    prepareForNewSequence()
    # Iterate the stream directly rather than loading every line up front.
    for line in sys.stdin:
        # re.search returns a match object for a header line, otherwise None.
        newSequenceMatch = re.search(r'^>(\w+).*', line)
        if newSequenceMatch:
            # Finish the current record before starting to parse a new one.
            if ongoing_sequence:
                calculateLongestOrf()
                translateLongestOrf()
                prepareForNewSequence()
            # group(1) is the parenthesised part of the regex, (\w+): the
            # sequence name.  group(0) would be the whole matched line.
            sequence_name = newSequenceMatch.group(1)
            ongoing_sequence = True
        elif ongoing_sequence:
            sequence += line.strip()
    if ongoing_sequence:
        # The last record has no following header, so flush it at end of input.
        calculateLongestOrf()
        translateLongestOrf()


if __name__ == "__main__":
    main()
# STOCKHOLM 1.0
gene4711 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
hubba ACGTACGTACGTACGTACGTACGTANNNNNNNNNNTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTT
gene4712 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
//
# STOCKHOLM 1.0
only10chars ACGTACGTAC
emptysequence
onenucleotide ----------
//
# STOCKHOLM 1.0
#=GF AC RF00107
#=GF ID FinP
#=GF DE FinP
#=GF AU Griffiths-Jones SR
#=GF SE Griffiths-Jones SR
#=GF SS Published; PMID:9917389
#=GF GA 45.00
#=GF TC 50.80
#=GF NC 43.10
#=GF TP Gene;
#=GF BM cmbuild -F CM SEED
#=GF CB cmcalibrate --mpi CM
#=GF SM cmsearch --cpu 4 --verbose --nohmmonly -E 1000 -Z 549862.597050 CM SEQDB
#=GF DR SO; 0000644; antisense_RNA;
#=GF RN [1]
#=GF RM 9917389
#=GF RT Degradation of FinP antisense RNA from F-like plasmids: the RNA-binding
#=GF RT protein, FinO, protects FinP from ribonuclease E.
#=GF RA Jerome LJ, van Biesen T, Frost LS
#=GF RL J Mol Biol 1999;285:1457-1473.
#=GF RN [2]
#=GF RM 14633993
#=GF RT FinO is an RNA chaperone that facilitates sense-antisense RNA
#=GF RT interactions.
#=GF RA Arthur DC, Ghetu AF, Gubbins MJ, Edwards RA, Frost LS, Glover JN
#=GF RL EMBO J 2003;22:6346-6355.
#=GF CC The FinOP system regulates the transfer of F-like plasmids. FinP encodes
#=GF CC an antisense RNA product that is complementary to part of the 5' UTR of
#=GF CC the traJ mRNA. The traJ gene encodes a protein required for transcription
#=GF CC from the major transfer promoter, pY. The FinO protein is essential for
#=GF CC effective repression, acting by binding to FinP and protecting it from
#=GF CC RNase E degradation.
#=GF WK FinP
#=GF SQ 6
AE006471.1/61593-61518 GACACAUAGGAACCUCCUCA--AAGGAUUCUAUGGACAGUCGAUGCAGGGAGU-GACAG-CUCCCUGUAUCGGCGAUUUA
X55896.1/1041-966 CACACAUAGGAACCUCCUCA--AAGGAUUCUAUG-ACAGUCGAUGCAGGGAGG-GACAAGCUCCCUGCAUCGGCGAUUUU
U01159.2/1039-961 GAUACAUAGGAACCUCCUCACAAAGGAUUCUAUGGACAGUCGAUGCAGGGAGUUCACGU-CUCCCUGCAUCGGCGAUUUU
X55894.1/1036-958 GAUACAUAGGAACCUCCUCACAAAGGAUUCUAUGGACAGUCGAUGCAGGGAGGAGAGAA-CUCCCUGCAUCGGCGAUUUU
AF389529.1/1113-1037 GAUACACAGGAGCCUCCUCA--AAGGAUUCUAUGGGUAGUCGAUGCAGGGAGG-GACCAACUCCCUGCAUCGGCGAUUUA
M20941.1/217-140 GAUACAUAGGAACCUCCUCUCAAAGGAUUCUAUGGACAGUCGAUGCGGGGAGGUCGCU--CUCCCUGCAUCGGCGAUUUU
#=GC SS_cons ::::<<<<<<<-<<<________>>>->>>>>>>----<<<<<<<<<<<<<<_______.>>>>>>>>>>>>>>::::::
#=GC RF gAuACauaGgaACCuCCUCAcAAaGGAuuCuauGGAcAGuCGauGCaGGGaGgugAcaa.CuCCCuGCauCGgCGAUUUu
//
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment