Skip to content

Instantly share code, notes, and snippets.

@diallobakary4
Created June 26, 2016 10:19
Show Gist options
  • Save diallobakary4/1e020faf2973866ca66826697c43a848 to your computer and use it in GitHub Desktop.
Save diallobakary4/1e020faf2973866ca66826697c43a848 to your computer and use it in GitHub Desktop.
kj
# -*- coding: utf-8 -*-
# CODE CHALLENGE: Solve the String Composition Problem.
# Input: An integer k and a string Text.
# Output: Compositionk(Text) (the k-mers can be provided in any order).
def StringKmerCompo(sequence, kmerlenght):
compo,i =[],0
while i < (len(sequence)-kmerlenght+1):
compo.append(sequence[i:i+kmerlenght])
i+=1
return compo
#Test de la fonction StringKmerCompo
# seq = "TAATTAGTCTTCGCGAGGGGTTAAATATAGCGTGCAGATGTAATAACCAACCTTCGCAGCAGGTATGGGCGTTATGCATCCTTATACCGAATGATGTTACACATAAATCTCGGAAGAGCGCAGGATACATGGAATGAGCATAACGGGGCGCGCTATAGATAAGTTGTTTGAGCTAAGAGGTACACGTGCCGCTAAATAGGTCGCCGCATGATATTACGATTGCACGTCATAATTTTGTAGTGATAGTTATATGATTTCAACCCACCGCATTACAGCCACTCTGCGTCGCTCCTAAGGACATCCGTGCCCGAATCGCACTATCAGACCCACGAGTCCGTGGTTGACAGCTAGTACCAGGAGATATCGAATTCACCACAGGCGTCAATTCATAGGGACGCAAGTATAAGTTGCACCGTCTAGCCCGCGCCCCTGTCCCACTGCGGATCAGTAGAAAAGTGGTGCCATGGCACAATACCCCTCGACAGACGAATACACCGAGGAGTTATTCTGAGGCAACCTCGTGGATCATCTAAGTCAGCGCGCTAGAAGCGCTTGTATAAGGCAGATAAAATTTAATAAGAGCTTGGAATGCCAGGTCTACGATCACTATCCTTAGATGGCACTAACGATCCGAATTCCCTTTCGCAAAAATGGATAGGACATGTGAAGAAGGTTAATGGCACCTGGGTGTTTCCACCTTCCTGACTCCGACAACGTTTAAAGAGTCCGAGTCGGTCGTTCTAAAGCAGTTGAGGAAAATAACGCGGACTCTCAACAACGTCTCTGTGGCCAGTCTCAAGCTGGGCTTAAGTGACGTAGGAAAGCCATCCCTCGGGTGTACGAGGTGCAACCCCATGATGCTCCCGAGCTCCAGAAACATGGGGCTCGCCGGCCCCCTCGCACTTCTCACCCCTCATATACGGGCTTGCATCTGAGTTGGCTCTAACACTTAGAGAAGACCTCTGGCTACACGGTAGACAATCTGTTTGTTAATAACCTAGGTCCCTCTGGGAGATTAGTGCGATATTGGTATTATTTCATGAATCATATCGGACCGGATCGCTGTTTGGCGATAGGCATCCATACGATACATAAAAAAGATTACGTCTAGGCGCTATGCCCTGATCCTCCCGAGATAATGACATGCTTTGCAGGATTGCTATAGAATGGGTCAGTGCAGCTACATCTGATAATTAATATTCATCACCCGTGAGTGAAACGGTGAGTTCTTCAAAGAATCTCATCGTTTACATACCGGGGCCGGTTACTTATTCAAAAAGGGGGTCAAAACTTAACTGATTTCGTCTCGGCTAGATTAGACCGGACGAAAAATCGAACCGTCAAGTGCATTGTAGGCGACGGACAGTGATCGGCGGTAATCGATCATATGGCGGTTTTCCGAATTAGAAACCGGCATGCAATCACCAGCGTCGACCCAGCTAGCTATGGCGGACTCAGCGGGACCGTGGCGACGGGCACTGGAAACCTTGTTCAGATAGGTAAGGGGTTCTCCCAGTTATGTGTGTCGGTAAGCGAAAATTGGGATAACGAATTTTTGAGGTAAACCTATGTAGGTATTTCATATGTCGCAGTAACAGTCGGTCCGGCTTCGGGCACATACAACACCTAGTTTCCTTCCTGGAGACGCTCACTACGTAGAAGCCGTACTGCCGAGACATCGGGGCTTCGAGTAGGACGGCAAATAATAACAGACGCACCGATCCTCGGTTCTGCGTTGACTAAGGCTGTGCCCGTACCTCTCCGCGGGCTAGTTGATGTCTCAAACTTGTGAAGCACCGCCTCCCCCACATACAATCTGAAGCGAGAAAAATGTTTCCGAAGCATAGTAATTAAGCGAAGGCGTAATTCGGTTTCGAAACGTAGTATTTTTCCTCTACCTCCGTGTCTAGGACCCTCAGGCAGACGATGGCCACAAGACCGGACCAAACACACCAGGCGCGAAGGTATCCGCCCTGCTCTCGTGACAGTGTCCTTTAGTTACCGCCGATGGATCGTTCCACTTGTCCTAACACTCGGCTGAGATGATTATGTTCACAAGCTATATTACCGTTCCGGCCACAGGCTTTTATGCGACAGTAGCGCGCGCTGGTACGGGGTTATAGTTTTTGCATCTGTCCGAGTTGGTGTATGTACTACCACATATCTCCTATGTCGTAGCAATGTACTTATGAAGTTCAATAGTACTCTCCTCTCAACTGGAGCAGCGATCAGTTGTAGAGCCGGTTAAGACGTCAGGCAACTCAATCTGAGAATGGGCTCTCGCCGGCATACGTTGTAGTCGCGGTCCCACTGACTCGAACCTGGATTCGAATTTGACGTCCCTACGACTTAGGGCTTTACACCGGCCAACCAGTTAGAGTGGCTCCGCCGGATACTAACTGAGTGATTAAGGGGGCCGCTATAGTCGGTCATTGAGATTACGTTCGCTACGAACGGGTTAGAAGTTTCGACATTATAAACTAGGCAGAATCTCCCTATTGTGACTTGGCTACTTGAGCGAATCAGGCAGGGCCGCAGACATGGGGCGCAGCGTTAGAAAAGGTTTCCGCCGCTATTAGGCCGGAAGAAATTTTGAGCGCAGGTCTTATTCGATATCCAAAGTGGACTCGTTTGATAGCGCGAGGCCTTTTAGCCTCGATGCTCGAAATTGCCGGCAGTGCGGGGGGTGCAGCTCGCGACATAGTCACCTCCGTCTCACACCGTGAAGCTCAGATTATGCTACCTTCGACGGTACCCGTTTGCCATTCATTAAAATAATTAGGCAACAGTGGAACGTAGTGGCATCCCGGAGTCAGGTACCGATGTGTCGATCGGCTCTGCCATCTAGTGTGACCTCGAACGGAGTTAATCCTGAACATCTACCGATAACGTGCAAGTTACCAGAAGCCGTCGAGGCGAACAGAGCCATTTTCACACCAAGCTGATAAACTGTCGAGTAAACACCAGCATGGATGAGTCAATTATGTGACCAAATACGAGGACTGAAATGCAGATGGCATGCACCGACACTCTTGTGTCTGTACATATACCGTGTTTTAGTTAGATTGTCAGGTTTGGGCCCAGGGTTACGGAGCTGAAGTGGGCTTGATAGGTCAGGGCCGAAAAGACCCGACCTAACAGGTAACCGCGCTGACACATGTCGCCTCTGCCGCCTTTAAAGCGATACGACGGCAACGTCGTCCTGTTCCTGAGATTTTGTCACTTCGAGGCGTAGGAAGGAGAATGTACAGTGAACGGTTTTACTCTCTGCATGGCGGAATCTAGTCCTGTAGCGGTAGATGAAGTTACCAATGGACCGAGCATCTCAAAAGTAGCTTAGAAGCTGAGAGCGCGTCAGGAGGCTTATATGTGTTGGGACTGCCAACTTAGTTCGCGAGCCATGGGTTCTAGTTAATTCTATGAGGCCAGCCTCCCGATTATAAGCGGTTCGTGACATTGCGGGCATTTCGCCGAAGGGGCGCGCTTGCCAGTCGGAGTGTGCATATAGGCTGTCACTACGGGATAGTATCCGCGACGGGACCACTGATCAGGGTGAACTATATAGTTGGACCAGCTGCTAGCTAACCCCAAAATTTAAACATTCATAGCTTAGAATGTTAAGTGTCAGATGCTGCACGTCAGGTCCCTTACAGAATCCTCCGCGAGAGCTCTATCGAGAATGATAAAGTCGCATTCACCTCGCTTAGCATTCGAGATCAGCGATTGTTACGGACACAAGGCCAAGGGTAGGGGATTTTCTCCGATTCTTTCGGCCTCGGCAGTGAGGGCACCGTATTGCGGATCGCTTTATCGAGCGTCGCTGTCTCTCCTGTTAACCGGGCCAGATGATACAATTCGCATAACTTTTTAGCCAGTAGGCCGAAGCTACAGGTGTTAGGTTCACAGTGCAGCATGTATGGAAACCGTTCGTCGCCGGGGAGGGCTCGGCTGAGACGCTGCGTCACGACTTTTGACCTTGATCTCCCGTGATTCGTATTTGAAGACAAAATCGACTGTACAGTACAGTCAGGGTCTTGTGGGCTAGGAACATGTTCCTGTTCGACACTGACCGCAAAGAGCTCCAACGCACGCTGTCGATAGGCGCCCATACAAACCGCTCCGCCTCGGTTACCCTCTCGCTGTGGGTCGGGCGAGACAAGCTGCCTTCCTACGGGGGTTGCAGTTCGGCCGAATGAAGTGTGAATTCCAGAACAACGTAGGCGTTATCACTGCTGGGATAAGCCTTGACTTCTGTTGCCTATACGAAACCTGCGTGCTCACTCTCCGAGTGACGCACGCTATTACCATCGTAGACAAACGGTCGTCACAGCAGCAAGTCTTTGGAGAGATGTATGCCGAAATAGGCACATAATTTCCCGTAATATCCGACACCTGCGCATAAAAATGTGCAAACTAGAACTTGAGTGCTGGCACGGATATATCACGGCGAATGGAATATGAGCACTCGTGGGTAACCGGATACAACATTCCTCAGGTCTGCATAACTACCTGTGGTCTCCAATGACCGCACCGTGGACTTAGTGAGAGGATAAGACCCACCCCCATGCCAACAGAGGAAGGCGGCTTGTGCCTTCTACTAAGGAGAAATGCACTAGAGATGTCGTTTGGCTGTCCGCAGGGTCTAGGAAAATGGGATTGATACTTGGCTTTACCCCGGTGGTACTCCGATCGCGGGAGTAATACACTCAATATCACCTTGCACTGTTGAAGAACTGGGATGGAGAGTACAGGGCATCGAGGACGTTCAATACTCTGAAGCCGTCTACGCTAGGGCGCGAGGTGAGACAGCTTTTCGTCAGTTGTGGCGACAGTACACTATTTGAGGCATCAAGATGCAGACAAGCTGAGAGATGTGTAAACTCGAGAGTGCCCGCTGGTACTTGGCTAATTACCAGGTCGTCTCTTCGAAGAGCAGAGGCACGCGCACGGTGGAGGAATGTGACTGTCCCTCCAGCTCCCCCCTGCTTCTGATACCCAGT"
# for e in StringKmerCompo(seq,100):
# print e
#
# String Spelled by a Genome Path Problem. Reconstruct a string from its genome path.
# Input: A sequence of k-mers Pattern1, … ,Patternn such that the last k - 1 symbols of Patterni are
# equal to the first k-1 symbols of Patterni+1 for 1 ≤ i ≤ n-1.
# Output: A string Text of length k+n-1 such that the i-th k-mer in Text is equal to Patterni (for 1 ≤ i ≤ n).
def GenomePath(reads):
path = reads[0]
for e in reads[1:]:
path += e[len(e)-1:]
return path
#Test de la fonction GenomePath
# DNAstring_open = open("dataset_198_3.txt", 'r')
# reads = DNAstring_open.readlines()
# reads2 = []
# for e in reads:
# reads2.append(e[:-1])
#
# print GenomePath(reads2)
# CODE CHALLENGE: Solve the Overlap Graph Problem (restated below).
# Input: A collection Patterns of k-mers.
# Output: The overlap graph Overlap(Patterns), in the form of an adjacency list.
# (You may return the edges in any order.)
def OverlapGraph(reads):
#key a read, values: all suffix = prefix reads
matrix = {}
for e in reads:
matrix[e] = []
for e in matrix:
for b in reads:
if e[1:]==b[:-1]:
matrix[e].append(b)
return matrix
#Test de la fonction GenomePath
# DNAstring_open = open("dataset_198_3.txt", 'r')
# reads = DNAstring_open.readlines()
# reads2 = []
# for e in reads:
# reads2.append(e[:-1])
#
# result = OverlapGraph(reads2)
# output = open("output.txt", 'w')
# # Formatting the output for coursera
# for e in result:
# if result[e]:
# print e +" - > "+ result[e][0]
# output.writelines(e +" -> "+ result[e][0] +"\n")
# seq = "AAGATTCTCTAAGA"
# result = OverlapGraph(StringKmerCompo(seq,3))
# for e in result:
# if result[e]:
# print e +" - > "+ str(result[e])[1:-1].replace("'","")
# PathGraphk(Text) is the path consisting of |Text| - k + 1 edges,
# where the i-th edge of this path is labeled by the i-th k-mer in Text
# and the i-th node of the path is labeled by the i-th (k - 1)-mer in Text.
def PathGraph(text,k):
#path pattern: [edge,[node, node]]
pathEdge = []
#path of successive k-mers
pathEdge = StringKmerCompo(text,k)
pathGraph = []
for e in pathEdge:
pathGraph.append([e[:-1],e[1:]])
return pathGraph
#Test de la fonction PathGraph
#print PathGraph("AAGATTCTCTAAGA", 3)
# In general, given a genome Text, PathGraphk(Text) is the path consisting of |Text| - k + 1 edges,
# where the i-th edge of this path is labeled by the i-th k-mer in Text and the i-th node
# of the path is labeled by the i-th (k - 1)-mer in Text. The de Bruijn graph DeBruijnk(Text)
# is formed by gluing identically labeled nodes in PathGraphk(Text).
# Input: An integer k and a string Text.
# Output: DeBruijnk(Text), in the form of an adjacency list.
def DeBruijnGraph(seq,k):
path = PathGraph(seq,k)
i = 0
for e in path:
for b in path[i+1:]:
if e[0] == b[0] :
e.append(b[1])
i += 1
# Just remove duplicate edges
finalPath = []
check = {}
check = set(check)
for e in path:
check.add(e[0])
for e in path:
if e[0] in check:
finalPath.append(e)
check.remove(e[0])
return finalPath
#test de la fonction DeBruijnGraph
# for e in DeBruijnGraph("TCGTGTGGTCGTGAGCATTAACGCCTACGCTGCCGGCATCATCTATCCGTTCCCATTAACGGTGAGGTTTTGTTAAGGCAAATAGGCAAAGGCTGATTAGTCCCCCATAGTGCCATTGCAATTGACATATGTTCCGGAAAAAACCACAGGCGTCCCGTGGAACTATAAAGTGTCGATTCGCCGGAAGCTTCAGTTTTGATTAACCAGAGCGTTTTTTCTATGCGCCATCATCCAGATGCACAAATTCTCGATTGTAGGATAGTGACTACATGATGATCACTGGAAGGCCGGCAAGTCCACAATTTCCGGATGCGAATTAGGCTCCTATCGGATCTTATTGTTGTCAGCTCCCCCGTTCTTAATCTTCACAACTTGCAATAATAGTTATCGCGTTACTGACTGTAGCTTAAGCAAGCTTTAACGGCACACCCCAAAACCCGTCTGGTCGGATAACACAAGCGTGTAGGACCAGGAAAGCTGAGCCCTGCATATCCACCACTTCGGTGATCCAGGGGGCGTCAGTTAAGCCACCAGCAAGACCATGATCAACCCGCGGTACGCGACGTTCTTGAACGATCCCTGCAAACACATAACGAATTTTGAGAAATTTGTGGAATGAATGCCAGCTCGCTTGATTCCCAAAACGAATGACACGTCGAACCCCAGGTTACAGCCTCTGAGTAACTGGTACGGTTTACACACTCACGGTGCAGGTTCGCGTCGCGTGCGCTAGGCTATTCCGAGTTTCGGAACGCAACGAGCAACCCCGCTAATCGGAACGCCAGGCGCTCTGGGGTCCTACGCGGGCAGTTCAGTAACACCCTGTATGTGATAGACCTAGTCGGTGGCCTTATATAGGATTAATCCTTTTATGATCAGGAACACTAACTTTAAGTCCGTTAATAGAAGAGTAATCAAATTGTCAAATTCTTGATCGCAGGGAAAGGTCGGGCTCCTGACAGAGCACGATAAGGTCTCCGATCTCGTTTAGGAGCGCGATTTAATTCGCTGGAATTTAACAGGCTCTGCAGATGCGAGATAGGTCCGAATTCCCGACCAATATAGATGCGCCGGGCCCGAGGTGACGTGGTGACATGCAATCGTGCTTCAGAGTGCCCAACGTCCAACTAAGAATCACGGAACACAATATAACACGCTAATTAGGCTAAATGTTCCTCAGACGGTCTGGACGGTTTTGCCGTCTCATGAGTCTCTAAGCGTTCCGATGAGCCATTATGTGTCCCTTGAAAGCCTTTTAGCCCCATCTATGGGACTAAGTCGGGTGGTCCAAAAAAACCCAACTGTCGTTAACAGCCTTGTGGTCCCTATCCGTCCGGAAAGTAACGCGGCAATCGTACTTGCTCATTGCAAAGTCAGCCGCGGTATTGCGCTCGTAGATCTTTGACATTCGCATACGGAATGGTCGGTAGTTCAATTAAATGAGCACTCCAAGGTCTTGCGCTCTCTTGCTAAGCCGGGTAAATACCTATTCTTGCAATCGTGACCCGAGAGAGACATCCTTATCGTAACGGTCGCGGCAAATCTTCAGGGGTCCGGTTCGCTCCGTAATGTCGCCCATATTACGTTGTTTCGGGTGGTGACGTTGGCGTATGAGCGTGACTTCCACTCCGATTCATCACGTAGGGTGGCCCTGATATGACTCTGTTGAAAGGTGGCGGCACCAAGATATTGGATGGTGGCATATGGACCTGATCTAAGGAGGAACTCTGAAAAATCCTAGAAGTCAGGAAGTCTGAGAGGGGACGCCATCATGGTTCCTTCACAATAGCTCGAGACAAAAGGCGTTTTTTTCCCAGCCAGCTCGCTGTGGACACGGCGCTAGGGTCACGCACACTGAGCAGTATACAATTTTGCGGTTGATCAAAGTTAGATGATGTATTTCGTAAGGCTCCTCATTCCAATTGCAGTAGGCCTCGGGCAACCACGGCGACACATTAGGAGTCTATGTGCCAGATCAACACCGTTAGCGTGAGTCGAAG", 12):
# print e[0] +" -> "+ str(e[1:])[1:-1].replace("'","")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment