Last active
October 12, 2018 04:57
-
-
Save leandrosilva/94bc9bc2f01f44a42dc9daa8d382ade3 to your computer and use it in GitHub Desktop.
Simple k-mers algorithms (pattern count, frequent words and clump finding) in JS using little genoma snippets for test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Problem: Count the number of times a string appears as a substring in a longer text. | |
function patternCount(text, pattern) { | |
if (!text || !pattern) return -1; | |
let textLength = text.length; | |
let k = pattern.length; | |
let count = 0; | |
for (let i = 0; i < (textLength - k + 1); i++) { | |
let kmer = text.substring(i, i + k); | |
if (kmer === pattern) count += 1; | |
} | |
return count; | |
} | |
// Problem: Find the most frequent k-mers in a string. | |
function frequentWords(text, k) { | |
if (!text || !k) return {}; | |
let textLength = text.length; | |
let kmersCount = {}; | |
let maxCount = 0; | |
for (let i = 0; i < (textLength - k + 1); i++) { | |
kmer = text.substring(i, i + k); | |
let count = patternCount(text, kmer); | |
if (count > maxCount) maxCount = count; | |
kmersCount[kmer] = count; | |
} | |
let mostFrequent = {}; | |
for (let kmer in kmersCount) { | |
let count = kmersCount[kmer]; | |
if (count === maxCount) { | |
mostFrequent[kmer] = count; | |
} | |
} | |
return mostFrequent; | |
} | |
// Problem: Find patterns forming clumps in a string. All distinct k-mers forming (L, t)-clumps in Genome, i.e. | |
function clumpFinding(genome, k, chunkSize, clumpSize) { | |
if (!genome || !k || !chunkSize || !clumpSize) return {}; | |
let genomeLenth = genome.length; | |
let kmers = []; | |
for (let i = 0; i < (genomeLenth - chunkSize + 1); i++) { | |
genomeWindow = genome.substring(i, i + chunkSize); | |
let kmerCounter = {}; | |
for (let z = 0; z < (chunkSize - k + 1); z++) { | |
let kmer = genomeWindow.substring(z, z + k); | |
if (!kmers.includes(kmer)) { | |
let kmerCount = 1; | |
if (kmerCounter[kmer]) kmerCount = kmerCounter[kmer] + 1; | |
kmerCounter[kmer] = kmerCount; | |
if (kmerCount === clumpSize) kmers.push(kmer); | |
} | |
} | |
} | |
return kmers; | |
} | |
function testPatternCount() { | |
console.log("1st test group - pattern counting: "); | |
let output = patternCount(); | |
console.log(`- constraints1 => no text and pattern [${output === -1}]`) | |
output = patternCount(null, ''); | |
console.log(`- constraints2 => no text [${output === -1}]`) | |
output = patternCount('', null); | |
console.log(`- constraints3 => no pattern [${output === -1}]`) | |
output = patternCount("GCGCG", "GCG"); | |
console.log(`- test1 => ${output} == 2 [${output === 2}]`); | |
output = patternCount("ACGTACGTACGT", "CG"); | |
console.log(`- test2 => ${output} == 3 [${output === 3}]`); | |
output = patternCount("AAAGAGTGTCTGATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAATTTTATTGACTTAGGTCACTAAATACTTTAACCAATATAGGCATAGCGCACAGACAGATAATAATTACAGAGTACACAACATCCAT", "AAA"); | |
console.log(`- test3 => ${output} == 4 [${output === 4}]`); | |
output = patternCount("AGCGTGCCGAAATATGCCGCCAGACCTGCTGCGGTGGCCTCGCCGACTTCACGGATGCCAAGTGCATAGAGGAAGCGAGCAAAGGTGGTTTCTTTCGCTTTATCCAGCGCGTTAACCACGTTCTGTGCCGACTTT", "TTT"); | |
console.log(`- test4 => ${output} == 4 [${output === 4}]`); | |
output = patternCount("GGACTTACTGACGTACG", "ACT"); | |
console.log(`- test5 => ${output} == 2 [${output === 2}]`); | |
output = patternCount("ATCCGATCCCATGCCCATG", "CC"); | |
console.log(`- test6 => ${output} == 5 [${output === 5}]`); | |
output = patternCount("CTGTTTTTGATCCATGATATGTTATCTCTCCGTCATCAGAAGAACAGTGACGGATCGCCCTCTCTCTTGGTCAGGCGACCGTTTGCCATAATGCCCATGCTTTCCAGCCAGCTCTCAAACTCCGGTGACTCGCGCAGGTTGAGTA", "CTC"); | |
console.log(`- test7 => ${output} == 9 [${output === 9}]`); | |
} | |
function testFrequentWords() { | |
console.log("\n2nd test group - frequent words: "); | |
let output = frequentWords(); | |
console.log(`- constraints1 => no text and k [${JSON.stringify(output) === "{}"}]`); | |
output = frequentWords(null, 4); | |
console.log(`- constraints2 => no text [${JSON.stringify(output) === "{}"}]`); | |
output = frequentWords('', null); | |
console.log(`- constraints3 => no k [${JSON.stringify(output) === "{}"}]`); | |
output = frequentWords("ACGTTGCATGTCGCATGATGCATGAGAGCT", 4); | |
console.log(`- test1 => ${JSON.stringify(output)} == 2 [${output['CATG'] !== undefined && output['GCAT'] !== undefined}]`); | |
output = frequentWords("TGGTAGCGACGTTGGTCCCGCCGCTTGAGAATCTGGATGAACATAAGCTCCCACTTGGCTTATTCAGAGAACTGGTCAACACTTGTCTCTCCCAGCCAGGTCTGACCACCGGGCAACTTTTAGAGCACTATCGTGGTACAAATAATGCTGCCAC", 3); | |
console.log(`- test2 => ${JSON.stringify(output)} == 1 [${output["TGG"] !== undefined}]`); | |
output = frequentWords("CAGTGGCAGATGACATTTTGCTGGTCGACTGGTTACAACAACGCCTGGGGCTTTTGAGCAACGAGACTTTTCAATGTTGCACCGTTTGCTGCATGATATTGAAAACAATATCACCAAATAAATAACGCCTTAGTAAGTAGCTTTT", 4); | |
console.log(`- test3 => ${JSON.stringify(output)} == 1 [${output["TTTT"] !== undefined}]`); | |
output = frequentWords("ATACAATTACAGTCTGGAACCGGATGAACTGGCCGCAGGTTAACAACAGAGTTGCCAGGCACTGCCGCTGACCAGCAACAACAACAATGACTTTGACGCGAAGGGGATGGCATGAGCGAACTGATCGTCAGCCGTCAGCAACGAGTATTGTTGCTGACCCTTAACAATCCCGCCGCACGTAATGCGCTAACTAATGCCCTGCTG", 5); | |
console.log(`- test4 => ${JSON.stringify(output)} == 1 [${output["AACAA"] !== undefined}]`); | |
output = frequentWords("CCAGCGGGGGTTGATGCTCTGGGGGTCACAAGATTGCATTTTTATGGGGTTGCAAAAATGTTTTTTACGGCAGATTCATTTAAAATGCCCACTGGCTGGAGACATAGCCCGGATGCGCGTCTTTTACAACGTATTGCGGGGTAAAATCGTAGATGTTTTAAAATAGGCGTAAC", 5); | |
console.log(`- test5 => ${JSON.stringify(output)} == 3 [${output['AAAAT'] !== undefined && output['GGGGT'] !== undefined && output['TTTTA'] !== undefined}]`); | |
} | |
function testClumpFinding() { | |
console.log("\n3rd test group - clump finding: "); | |
let output = clumpFinding(); | |
console.log(`- constraints1 => no parameters [${JSON.stringify(output) === "{}"}]`); | |
output = clumpFinding("CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA", 5, 50, 4); | |
console.log(`- test1 => ${JSON.stringify(output)} == 2 [${output.includes('CGACA') && output.includes('GAAGA')}]`); | |
output = clumpFinding("AAAACGTCGAAAAA", 2, 4, 2); | |
console.log(`- test2 => ${JSON.stringify(output)} == 1 [${output.includes('AA')}]`); | |
output = clumpFinding("ACGTACGT", 1, 5, 2); | |
console.log(`- test3 => ${JSON.stringify(output)} == 4 [${output.includes('A') && output.includes('C') && output.includes('G') && output.includes('T')}]`); | |
output = clumpFinding("CCACGCGGTGTACGCTGCAAAAAGCCTTGCTGAATCAAATAAGGTTCCAGCACATCCTCAATGGTTTCACGTTCTTCGCCAATGGCTGCCGCCAGGTTATCCAGACCTACAGGTCCACCAAAGAACTTATCGATTACCGCCAGCAACAATTTGCGGTCCATATAATCGAAACCTTCAGCATCGACATTCAACATATCCAGCG", 3, 25, 3); | |
console.log(`- test4 => ${JSON.stringify(output)} == 6 [${output.includes('AAA') && output.includes('CAG') && output.includes('CAT') && output.includes('CCA') && output.includes('GCC') && output.includes('TTC')}]`); | |
} | |
testPatternCount(); | |
testFrequentWords(); | |
testClumpFinding(); |
Some sample output would be like this:
1st test group - pattern counting:
- constraints1 => no text and pattern [true]
- constraints2 => no text [true]
- constraints3 => no pattern [true]
- test1 => 2 == 2 [true]
- test2 => 3 == 3 [true]
- test3 => 4 == 4 [true]
- test4 => 4 == 4 [true]
- test5 => 2 == 2 [true]
- test6 => 5 == 5 [true]
- test7 => 9 == 9 [true]
2nd test group - frequent words:
- constraints1 => no text and k [true]
- constraints2 => no text [true]
- constraints3 => no k [true]
- test1 => {"GCAT":3,"CATG":3} == 2 [true]
- test2 => {"TGG":6} == 1 [true]
- test3 => {"TTTT":4} == 1 [true]
- test4 => {"AACAA":5} == 1 [true]
- test5 => {"GGGGT":4,"TTTTA":4,"AAAAT":4} == 3 [true]
3rd test group - clump finding:
- constraints1 => no parameters [true]
- test1 => ["CGACA","GAAGA"] == 2 [true]
- test2 => ["AA"] == 1 [true]
- test3 => ["A","C","G","T"] == 4 [true]
- test4 => ["AAA","TTC","GCC","CCA","CAG","CAT"] == 6 [true]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Find my original Python solution right here.