Last active
December 30, 2015 12:49
-
-
Save LeslieK/7832001 to your computer and use it in GitHub Desktop.
Fun with string sorting and searching algorithms (suffix arrays, radix sorts, Knuth-Morris-Pratt).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "STRINGS" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": "Strings: Sorting and Searching" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "'''\nIdeas: Suffix Array\n\nH A C K E R S C H O O L\nA C K E R S C H O O L\nC K E R S C H O O L\nK E R S C H O O L\nE R S C H O O L\nR S C H O O L\nS C H O O L\nC H O O L\nH O O L\nO O L\nO L\nL\n\nSorted Suffix Array\n\nA C K E R S C H O O L\nC H O O L\nC K E R S C H O O L\nE R S C H O O L\nH A C K E R S C H O O L\nH O O L\nK E R S C H O O L\nL\nO L\nO O L\nR S C H O O L\nS C H O O L\n'''", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": "Longest Repeated String in PI" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# digits time (secs) to sort suffix array\n400 K 15\n500 K 20\n700 K 29\n800 K 33\n900 K 38\n1 M !! 43\n2 M 95\n5 M 228\n10 M !! 512 (8.5 mins)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": "PI digits: 10,000" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "run LongestRepeatedStringPI.py \"pi10M.txt\" 10000", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "0.25 secs: done sorting SuffixArray\nlongest string: " | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": " 7\n{'7111369': 2, '8530614': 2}\nset([4601, 6115, 3502, 4167])\n" | |
} | |
], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": "PI digits: 20,000" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "run LongestRepeatedStringPI.py \"pi10M.txt\" 20000", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "0.773000001907 secs: done sorting SuffixArray\nlongest string: " | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": " 8\n{'52637962': 2}\nset([15434, 15052])\n" | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": "PI digits: 50,000" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "run LongestRepeatedStringPI.py \"pi10M.txt\" 50000", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "1.94200015068 secs: done sorting SuffixArray\nlongest string: " | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": " 9\n{'201890888': 2}\nset([30627, 33844])\n" | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": "PI digits: 100,000" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "run LongestRepeatedStringPI.py \"pi10M.txt\" 100000", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "4.26400017738 secs: done sorting SuffixArray\nlongest string: " | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": " 9\n{'134926158': 2, '201890888': 2, '041021944': 2}\nset([21761, 30627, 86282, 33844, 69597, 75870])\n" | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": "PI digits: 200,000" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "run LongestRepeatedStringPI.py \"pi10M.txt\" 200000", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "9.10800004005 secs: done sorting SuffixArray\nlongest string: " | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": " 9\n{'357183932': 2, '852515068': 2, '008173039': 2, '626775794': 2, '644801965': 2, '536113462': 2, '835840278': 2, '146061745': 2, '448984071': 2, '577522388': 2, '134926158': 2, '103206503': 2, '361935369': 2, '201890888': 2, '041021944': 2, '404852784': 2, '435858775': 2}\nset([21761, 91782, 165640, 123913, 86282, 62603, 66322, 184982, 194393, 180514, 30627, 33844, 48567, 13880, 137680, 110282, 164799, 16192, 150209, 15426, 198473, 139338, 167244, 114256, 66001, 58325, 116184, 84825, 69597, 75870, 193633, 125416, 93929, 179691])\n" | |
} | |
], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": "PI digits: 500,000" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "In [10]: run LongestRepeatedStringPI.py \"pi10M.txt\"\ndone reading file 500000\n21.8299999237\n1508.71499991\n10\n{'7687268042': 2, '2679637592': 2, '8027590099': 2, '1292345774': 2, '1829116816': 2, '4392366484': 2, '1411868954': 2,\n'7519347205': 2, '3276181873': 2, '3207178938': 2, '8746206252': 2, '8713945527': 2}\nset([34307, 418059, 348945, 59284, 365296, 265512, 240171, 402224, 422322, 333878, 1992, 464335, 259536, 483027, 223831,\n 309848, 182105, 240479, 59495, 234601, 426352, 47859, 389110, 347260])", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": "PI digits: 1 million" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "In [16]: run LongestRepeatedStringPI.py \"pi10M.txt\"\ndone reading file 1000000\n43.0120000839\n8057.57999992\n12\n{'756130190263': 2}\nset([447673, 857982])", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": "Moby Dick: Longest Repeated String" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "In [1]: run LRS_script.py \"mobydick.txt\"\ndone reading file 1193357\n56.0869998932\n12699.1800001\n85\n{',--\\n Such a funny, sporty, gamy, jesty, joky, hoky-poky lad, is the Ocean, oh!\\n\\n Th': 2}\nset([1047255, 1047423])", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": "Bioinformatics: Longest Repeated String" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "texts = [\n\"atcaatgatcaacgtaagcttctaagcatgatcaaggtgctcacacagtttatccacaac\", \n \"ctgagtggatgacatcaagataggtcgttgtatctccttcctctcgtactctcatgacca\",\n \"cggaaagatgatcaagagaggatgatttcttggccatatcgcaatgaatacttgtgactt\",\n \"gtgcttccaattgacatcttcagcgccatattgcgctggccaaggtgacggagcgggatt\",\n \"acgaaagcatgatcatggctgttgttctgtttatcttgttttgactgagacttgttagga\",\n \"tagacggtttttcatcactgactagccaaagccttactctgcctgacatcgaccgtaaat\",\n \"tgataatgaatttacatgcttccgcgacgatttacctcttgatcatcgatccgattgaag\",\n \"atcttcaattgttaattctcttgcctcgactcatagccatgatgagctcttgatcatgtt\",\n \"tccttaaccctctattttttacggaagaatgatcaagctgctgctcttgatcatcgtttc\"]\ntext = ''.join(texts)\nfrom LRS_functions import LRS\nLRS(text)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "ctcttgatcatcg 13\n{'ctcttgatcatcg': 2}\nset([523, 395])\n" | |
} | |
], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": "Bioinformatics: n-grams" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "from LRS_functions import ngram\ntext = \"ACAACTATGCATACTATCGGGAACTATCCT\"\nngram(text, 5)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 1, | |
"text": "{'AACTA': (2, [2, 21]),\n 'ACAAC': (1, [0]),\n 'ACTAT': (3, [3, 12, 22]),\n 'ATACT': (1, [10]),\n 'ATCCT': (1, [25]),\n 'ATCGG': (1, [15]),\n 'ATGCA': (1, [6]),\n 'CAACT': (1, [1]),\n 'CATAC': (1, [9]),\n 'CGGGA': (1, [17]),\n 'CTATC': (2, [13, 23]),\n 'CTATG': (1, [4]),\n 'GAACT': (1, [20]),\n 'GCATA': (1, [8]),\n 'GGAAC': (1, [19]),\n 'GGGAA': (1, [18]),\n 'TACTA': (1, [11]),\n 'TATCC': (1, [24]),\n 'TATCG': (1, [14]),\n 'TATGC': (1, [5]),\n 'TCGGG': (1, [16]),\n 'TGCAT': (1, [7])}" | |
} | |
], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": "Bioinformatics: Thermotoga-petrophila.txt: 1.82 million characters (A, G, C, T)" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "In [13]: \nfrom LRS_functions import LRS\nwith open(\"Thermotoga-petrophila.txt\") as f:\n t = f.read()\nLRS(t[:50000])\n1.60000014305\nCTTCTTCGACGAGTTT 16\nAAATCAACGGAGAAAA 16\n{'CTTCTTCGACGAGTTT': 2, 'AAATCAACGGAGAAAA': 2}\nset([21540, 38178, 1452, 25846])", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "In [15]: \nfrom LRS_functions import LRS\nwith open(\"Thermotoga-petrophila.txt\") as f:\n t = f.read()\nLRS(t[:100000])\n3.44000005722\nGAATCCCTCGGAGTGAAAG 19\n{'GAATCCCTCGGAGTGAAAG': 2}\nset([52209, 53860])", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": "Random Data: 1 million bits: Math.random.choice" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "In [2]: run LRS_script \"mathrandom.txt\"\ndone reading file 1000000\n69.7239999771\n10185.8770001\n37\n{'1010101000010110000101101001101101101': 2}\nset([854477, 161527])", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "'''\npattern: all zeros\nlength # matches\n14 zeros 32\n15 zeros 15\n16 zeros 8\n17 zeros 3\n18 zeros 0\n19 zeros 0\n'''", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "'''\npattern: all ones\nlength # matches\n14 ones 29\n15 ones 15\n16 ones 7\n17 ones 4\n18 ones 4\n19 ones 3\n20 ones 3\n21 ones 3\n22 ones 2\n23 ones 1\n24 ones 1\n25 ones 1\n26 ones 0\n27 ones 0\n'''", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "'''\npattern: '10' repeated 10 times\n# matches\n1 at [787696]\n\npattern: '01' repeated 10 times\n# matches\n2 at [35557, 787695]\n'''", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": "Random Data: 1 million bits: Crypto.Random.random.choice" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "In [2]: run LRS_script.py \"cryptorandom.txt\"\ndone reading file 1000000\n53.4900000095\n7457.4849999\n40\n{'0100110000010101101010000100000000100010': 2}\nset([929537, 968410])", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "'''\npattern: all zeros\nlength # matches\n14 zeros 23\n15 zeros 9\n16 zeros 4\n17 zeros 4\n18 zeros 2\n19 zeros 1\n20 zeros 1\n21 zeros 0\n22 zeros 0\n'''", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "'''\npattern: all ones\nlength # matches\n14 ones 30\n15 ones 14\n16 ones 6\n17 ones 4\n18 ones 1\n19 ones 0\n20 ones 0\n21 ones 0\n22 ones 0\n23 ones 0\n24 ones 0\n25 ones 0\n26 ones 0\n27 ones 0\n'''", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "'''\npattern: '10' repeated 10 times\n# matches\n1 at [790108]\n\npattern: '01' repeated 10 times\n# matches\n0\n\npattern: '01' repeated 9 times\n# matches\n2\n'''", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": "Plagarism: Longest Common String" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "\"HuffPost: $1 Million Lottery Ticket Stolen By New York Deli Owners, Police Allege\"\n\"GuardianExpress: Lottery Ticket Worth $1 Million Dollars Stolen From Winner\"\n\nfrom LRS_functions import LCS\nLCS(\"HuffPost.txt\", \"GuardianExpress.txt\")", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "{' was a simple mistake on a payout ': 2}\nset([1245, 1006])\n" | |
} | |
], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "'''\nGuardianExpress: Their attorney alleges that this was a simple mistake on a payout from a lottery machine, reported Newsday.\n\nHuffPost: According to Newsday, their attorney said it was a simple mistake on a payout on a lottery machine.\n'''", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": "Knuth-Morris-Pratt: Matching a pattern in a Streaming Twitter Feed: No Backup in the Feed!!!" | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": "Don Knuth, Jim Morris, Vaughn Pratt: 2 theoreticians and a hacker INDEPENDENTLY discovered this algorithm (1977)" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "class DFA(object):\n\tdef __init__(self, pat):\n\t\tself.pat = pat\n\n\t\t# build dfa from pattern\n\t\tself.M = len(self.pat)\n\t\tself.dfa = np.zeros((StringUtils.R, self.M+1))\n\t\t# initialize dfa construction\n\t\tself.dfa[ord(self.pat[0])][0] = 1\n\t\t# X refers to state after matching pat on text[1:] (shifted to the right 1 character)\n\t\tX = 0\n\t\tfor j in range(1, self.M):\n\t\t\tfor c in range(StringUtils.R):\n\t\t\t\t# on a mismatch: set the state transitions from j to be identical to those from X\n\t\t\t\tself.dfa[c][j] = self.dfa[c][X]\n\t\t\t# on a match: set state transition from j to j + 1\n\t\t\tself.dfa[ord(self.pat[j])][j] = j + 1\n\t\t\t# update state X\n\t\t\tX = self.dfa[ord(self.pat[j])][X]\n\t\t# needed to find next match after a match has been found\n\t\tfor c in range(StringUtils.R):\n\t\t\tself.dfa[c][self.M] = self.dfa[c][X]", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "from twitter import TwitterStream, OAuth\nfrom KnuthMorrisPratt import KMP\n\npat = \"attack at dawn\"\nkmp = KMP(pat)\npattern_index = 0\n\nid_dict = {\"wikileaks\": \"16589206\", \"aljazeera\": \"76067316\", \"NYFBI\": \"211635204\", \"reutersiran\": \"47633485\", \"WhiteHouse\":\"30313925\"}\npublic_stream = TwitterStream(auth = auth, domain='stream.twitter.com')\niterator = public_stream.statuses.filter(language=\"en\", follow=','.join(id_dict.values()), track=\"terrorism, weapons, drone attack, i want to kill\")\nfor item in iterator:\n\tif item['retweeted'] == False:\n\t\ttry:\n\t\t\t#print (''.join((item['user']['screen_name'], ':', item['text'])))\n\t\t\ttext = item['text']\n\t\t\tprint text\n\t\t\tfor c in text.encode():\n\t\t\t\tpattern_index = kmp.dfa.next(c, pattern_index)\n\t\t\t\tif int(pattern_index) == len(pat):\n\t\t\t\t\tprint \"MATCH!!!!!\"\n\t\texcept:\n\t\t\tcontinue", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment