Skip to content

Instantly share code, notes, and snippets.

@leandrosilva
Last active October 12, 2018 04:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save leandrosilva/94bc9bc2f01f44a42dc9daa8d382ade3 to your computer and use it in GitHub Desktop.
Save leandrosilva/94bc9bc2f01f44a42dc9daa8d382ade3 to your computer and use it in GitHub Desktop.
Simple k-mers algorithms (pattern count, frequent words and clump finding) in JS using little genoma snippets for test
// Problem: Count the number of times a string appears as a substring in a longer text.
function patternCount(text, pattern) {
if (!text || !pattern) return -1;
let textLength = text.length;
let k = pattern.length;
let count = 0;
for (let i = 0; i < (textLength - k + 1); i++) {
let kmer = text.substring(i, i + k);
if (kmer === pattern) count += 1;
}
return count;
}
// Problem: Find the most frequent k-mers in a string.
function frequentWords(text, k) {
if (!text || !k) return {};
let textLength = text.length;
let kmersCount = {};
let maxCount = 0;
for (let i = 0; i < (textLength - k + 1); i++) {
kmer = text.substring(i, i + k);
let count = patternCount(text, kmer);
if (count > maxCount) maxCount = count;
kmersCount[kmer] = count;
}
let mostFrequent = {};
for (let kmer in kmersCount) {
let count = kmersCount[kmer];
if (count === maxCount) {
mostFrequent[kmer] = count;
}
}
return mostFrequent;
}
// Problem: Find patterns forming clumps in a string. All distinct k-mers forming (L, t)-clumps in Genome, i.e.
function clumpFinding(genome, k, chunkSize, clumpSize) {
if (!genome || !k || !chunkSize || !clumpSize) return {};
let genomeLenth = genome.length;
let kmers = [];
for (let i = 0; i < (genomeLenth - chunkSize + 1); i++) {
genomeWindow = genome.substring(i, i + chunkSize);
let kmerCounter = {};
for (let z = 0; z < (chunkSize - k + 1); z++) {
let kmer = genomeWindow.substring(z, z + k);
if (!kmers.includes(kmer)) {
let kmerCount = 1;
if (kmerCounter[kmer]) kmerCount = kmerCounter[kmer] + 1;
kmerCounter[kmer] = kmerCount;
if (kmerCount === clumpSize) kmers.push(kmer);
}
}
}
return kmers;
}
function testPatternCount() {
console.log("1st test group - pattern counting: ");
let output = patternCount();
console.log(`- constraints1 => no text and pattern [${output === -1}]`)
output = patternCount(null, '');
console.log(`- constraints2 => no text [${output === -1}]`)
output = patternCount('', null);
console.log(`- constraints3 => no pattern [${output === -1}]`)
output = patternCount("GCGCG", "GCG");
console.log(`- test1 => ${output} == 2 [${output === 2}]`);
output = patternCount("ACGTACGTACGT", "CG");
console.log(`- test2 => ${output} == 3 [${output === 3}]`);
output = patternCount("AAAGAGTGTCTGATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAATTTTATTGACTTAGGTCACTAAATACTTTAACCAATATAGGCATAGCGCACAGACAGATAATAATTACAGAGTACACAACATCCAT", "AAA");
console.log(`- test3 => ${output} == 4 [${output === 4}]`);
output = patternCount("AGCGTGCCGAAATATGCCGCCAGACCTGCTGCGGTGGCCTCGCCGACTTCACGGATGCCAAGTGCATAGAGGAAGCGAGCAAAGGTGGTTTCTTTCGCTTTATCCAGCGCGTTAACCACGTTCTGTGCCGACTTT", "TTT");
console.log(`- test4 => ${output} == 4 [${output === 4}]`);
output = patternCount("GGACTTACTGACGTACG", "ACT");
console.log(`- test5 => ${output} == 2 [${output === 2}]`);
output = patternCount("ATCCGATCCCATGCCCATG", "CC");
console.log(`- test6 => ${output} == 5 [${output === 5}]`);
output = patternCount("CTGTTTTTGATCCATGATATGTTATCTCTCCGTCATCAGAAGAACAGTGACGGATCGCCCTCTCTCTTGGTCAGGCGACCGTTTGCCATAATGCCCATGCTTTCCAGCCAGCTCTCAAACTCCGGTGACTCGCGCAGGTTGAGTA", "CTC");
console.log(`- test7 => ${output} == 9 [${output === 9}]`);
}
function testFrequentWords() {
console.log("\n2nd test group - frequent words: ");
let output = frequentWords();
console.log(`- constraints1 => no text and k [${JSON.stringify(output) === "{}"}]`);
output = frequentWords(null, 4);
console.log(`- constraints2 => no text [${JSON.stringify(output) === "{}"}]`);
output = frequentWords('', null);
console.log(`- constraints3 => no k [${JSON.stringify(output) === "{}"}]`);
output = frequentWords("ACGTTGCATGTCGCATGATGCATGAGAGCT", 4);
console.log(`- test1 => ${JSON.stringify(output)} == 2 [${output['CATG'] !== undefined && output['GCAT'] !== undefined}]`);
output = frequentWords("TGGTAGCGACGTTGGTCCCGCCGCTTGAGAATCTGGATGAACATAAGCTCCCACTTGGCTTATTCAGAGAACTGGTCAACACTTGTCTCTCCCAGCCAGGTCTGACCACCGGGCAACTTTTAGAGCACTATCGTGGTACAAATAATGCTGCCAC", 3);
console.log(`- test2 => ${JSON.stringify(output)} == 1 [${output["TGG"] !== undefined}]`);
output = frequentWords("CAGTGGCAGATGACATTTTGCTGGTCGACTGGTTACAACAACGCCTGGGGCTTTTGAGCAACGAGACTTTTCAATGTTGCACCGTTTGCTGCATGATATTGAAAACAATATCACCAAATAAATAACGCCTTAGTAAGTAGCTTTT", 4);
console.log(`- test3 => ${JSON.stringify(output)} == 1 [${output["TTTT"] !== undefined}]`);
output = frequentWords("ATACAATTACAGTCTGGAACCGGATGAACTGGCCGCAGGTTAACAACAGAGTTGCCAGGCACTGCCGCTGACCAGCAACAACAACAATGACTTTGACGCGAAGGGGATGGCATGAGCGAACTGATCGTCAGCCGTCAGCAACGAGTATTGTTGCTGACCCTTAACAATCCCGCCGCACGTAATGCGCTAACTAATGCCCTGCTG", 5);
console.log(`- test4 => ${JSON.stringify(output)} == 1 [${output["AACAA"] !== undefined}]`);
output = frequentWords("CCAGCGGGGGTTGATGCTCTGGGGGTCACAAGATTGCATTTTTATGGGGTTGCAAAAATGTTTTTTACGGCAGATTCATTTAAAATGCCCACTGGCTGGAGACATAGCCCGGATGCGCGTCTTTTACAACGTATTGCGGGGTAAAATCGTAGATGTTTTAAAATAGGCGTAAC", 5);
console.log(`- test5 => ${JSON.stringify(output)} == 3 [${output['AAAAT'] !== undefined && output['GGGGT'] !== undefined && output['TTTTA'] !== undefined}]`);
}
function testClumpFinding() {
console.log("\n3rd test group - clump finding: ");
let output = clumpFinding();
console.log(`- constraints1 => no parameters [${JSON.stringify(output) === "{}"}]`);
output = clumpFinding("CGGACTCGACAGATGTGAAGAACGACAATGTGAAGACTCGACACGACAGAGTGAAGAGAAGAGGAAACATTGTAA", 5, 50, 4);
console.log(`- test1 => ${JSON.stringify(output)} == 2 [${output.includes('CGACA') && output.includes('GAAGA')}]`);
output = clumpFinding("AAAACGTCGAAAAA", 2, 4, 2);
console.log(`- test2 => ${JSON.stringify(output)} == 1 [${output.includes('AA')}]`);
output = clumpFinding("ACGTACGT", 1, 5, 2);
console.log(`- test3 => ${JSON.stringify(output)} == 4 [${output.includes('A') && output.includes('C') && output.includes('G') && output.includes('T')}]`);
output = clumpFinding("CCACGCGGTGTACGCTGCAAAAAGCCTTGCTGAATCAAATAAGGTTCCAGCACATCCTCAATGGTTTCACGTTCTTCGCCAATGGCTGCCGCCAGGTTATCCAGACCTACAGGTCCACCAAAGAACTTATCGATTACCGCCAGCAACAATTTGCGGTCCATATAATCGAAACCTTCAGCATCGACATTCAACATATCCAGCG", 3, 25, 3);
console.log(`- test4 => ${JSON.stringify(output)} == 6 [${output.includes('AAA') && output.includes('CAG') && output.includes('CAT') && output.includes('CCA') && output.includes('GCC') && output.includes('TTC')}]`);
}
testPatternCount();
testFrequentWords();
testClumpFinding();
@leandrosilva
Copy link
Author

Find my original Python solution right here.

@leandrosilva
Copy link
Author

Some sample output would be like this:

1st test group - pattern counting:
- constraints1 => no text and pattern [true]
- constraints2 => no text [true]
- constraints3 => no pattern [true]
- test1 => 2 == 2 [true]
- test2 => 3 == 3 [true]
- test3 => 4 == 4 [true]
- test4 => 4 == 4 [true]
- test5 => 2 == 2 [true]
- test6 => 5 == 5 [true]
- test7 => 9 == 9 [true]

2nd test group - frequent words:
- constraints1 => no text and k [true]
- constraints2 => no text [true]
- constraints3 => no k [true]
- test1 => {"GCAT":3,"CATG":3} == 2 [true]
- test2 => {"TGG":6} == 1 [true]
- test3 => {"TTTT":4} == 1 [true]
- test4 => {"AACAA":5} == 1 [true]
- test5 => {"GGGGT":4,"TTTTA":4,"AAAAT":4} == 3 [true]

3rd test group - clump finding:
- constraints1 => no parameters [true]
- test1 => ["CGACA","GAAGA"] == 2 [true]
- test2 => ["AA"] == 1 [true]
- test3 => ["A","C","G","T"] == 4 [true]
- test4 => ["AAA","TTC","GCC","CCA","CAG","CAT"] == 6 [true]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment