Skip to content

Instantly share code, notes, and snippets.

@LeslieK
Last active December 30, 2015 12:49
Show Gist options
  • Save LeslieK/7832001 to your computer and use it in GitHub Desktop.
Save LeslieK/7832001 to your computer and use it in GitHub Desktop.
Fun with string sorting and searching algorithms (suffix arrays, radix sorts, Knuth-Morris-Pratt).
{
"metadata": {
"name": "STRINGS"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": "Strings: Sorting and Searching"
},
{
"cell_type": "code",
"collapsed": false,
"input": "'''\nIdeas: Suffix Array\n\nH A C K E R S C H O O L\nA C K E R S C H O O L\nC K E R S C H O O L\nK E R S C H O O L\nE R S C H O O L\nR S C H O O L\nS C H O O L\nC H O O L\nH O O L\nO O L\nO L\nL\n\nSorted Suffix Array\n\nA C K E R S C H O O L\nC H O O L\nC K E R S C H O O L\nE R S C H O O L\nH A C K E R S C H O O L\nH O O L\nK E R S C H O O L\nL\nO L\nO O L\nR S C H O O L\nS C H O O L\n'''",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": "Longest Repeated String in PI"
},
{
"cell_type": "code",
"collapsed": false,
"input": "# digits time (secs) to sort suffix array\n400 K 15\n500 K 20\n700 K 29\n800 K 33\n900 K 38\n1 M !! 43\n2 M 95\n5 M 228\n10 M !! 512 (8.5 mins)",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": "PI digits: 10,000"
},
{
"cell_type": "code",
"collapsed": false,
"input": "run LongestRepeatedStringPI.py \"pi10M.txt\" 10000",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "0.25 secs: done sorting SuffixArray\nlongest string: "
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 7\n{'7111369': 2, '8530614': 2}\nset([4601, 6115, 3502, 4167])\n"
}
],
"prompt_number": 1
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": "PI digits: 20,000"
},
{
"cell_type": "code",
"collapsed": false,
"input": "run LongestRepeatedStringPI.py \"pi10M.txt\" 20000",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "0.773000001907 secs: done sorting SuffixArray\nlongest string: "
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 8\n{'52637962': 2}\nset([15434, 15052])\n"
}
],
"prompt_number": 8
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": "PI digits: 50,000"
},
{
"cell_type": "code",
"collapsed": false,
"input": "run LongestRepeatedStringPI.py \"pi10M.txt\" 50000",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "1.94200015068 secs: done sorting SuffixArray\nlongest string: "
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 9\n{'201890888': 2}\nset([30627, 33844])\n"
}
],
"prompt_number": 9
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": "PI digits: 100,000"
},
{
"cell_type": "code",
"collapsed": false,
"input": "run LongestRepeatedStringPI.py \"pi10M.txt\" 100000",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "4.26400017738 secs: done sorting SuffixArray\nlongest string: "
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 9\n{'134926158': 2, '201890888': 2, '041021944': 2}\nset([21761, 30627, 86282, 33844, 69597, 75870])\n"
}
],
"prompt_number": 10
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": "PI digits: 200,000"
},
{
"cell_type": "code",
"collapsed": false,
"input": "run LongestRepeatedStringPI.py \"pi10M.txt\" 200000",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "9.10800004005 secs: done sorting SuffixArray\nlongest string: "
},
{
"output_type": "stream",
"stream": "stdout",
"text": " 9\n{'357183932': 2, '852515068': 2, '008173039': 2, '626775794': 2, '644801965': 2, '536113462': 2, '835840278': 2, '146061745': 2, '448984071': 2, '577522388': 2, '134926158': 2, '103206503': 2, '361935369': 2, '201890888': 2, '041021944': 2, '404852784': 2, '435858775': 2}\nset([21761, 91782, 165640, 123913, 86282, 62603, 66322, 184982, 194393, 180514, 30627, 33844, 48567, 13880, 137680, 110282, 164799, 16192, 150209, 15426, 198473, 139338, 167244, 114256, 66001, 58325, 116184, 84825, 69597, 75870, 193633, 125416, 93929, 179691])\n"
}
],
"prompt_number": 11
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": "PI digits: 500,000"
},
{
"cell_type": "code",
"collapsed": false,
"input": "In [10]: run LongestRepeatedStringPI.py \"pi10M.txt\"\ndone reading file 500000\n21.8299999237\n1508.71499991\n10\n{'7687268042': 2, '2679637592': 2, '8027590099': 2, '1292345774': 2, '1829116816': 2, '4392366484': 2, '1411868954': 2,\n'7519347205': 2, '3276181873': 2, '3207178938': 2, '8746206252': 2, '8713945527': 2}\nset([34307, 418059, 348945, 59284, 365296, 265512, 240171, 402224, 422322, 333878, 1992, 464335, 259536, 483027, 223831,\n 309848, 182105, 240479, 59495, 234601, 426352, 47859, 389110, 347260])",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": "PI digits: 1 million"
},
{
"cell_type": "code",
"collapsed": false,
"input": "In [16]: run LongestRepeatedStringPI.py \"pi10M.txt\"\ndone reading file 1000000\n43.0120000839\n8057.57999992\n12\n{'756130190263': 2}\nset([447673, 857982])",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": "Moby Dick: Longest Repeated String"
},
{
"cell_type": "code",
"collapsed": false,
"input": "In [1]: run LRS_script.py \"mobydick.txt\"\ndone reading file 1193357\n56.0869998932\n12699.1800001\n85\n{',--\\n Such a funny, sporty, gamy, jesty, joky, hoky-poky lad, is the Ocean, oh!\\n\\n Th': 2}\nset([1047255, 1047423])",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": "Bioinformatics: Longest Repeated String"
},
{
"cell_type": "code",
"collapsed": false,
"input": "texts = [\n\"atcaatgatcaacgtaagcttctaagcatgatcaaggtgctcacacagtttatccacaac\", \n \"ctgagtggatgacatcaagataggtcgttgtatctccttcctctcgtactctcatgacca\",\n \"cggaaagatgatcaagagaggatgatttcttggccatatcgcaatgaatacttgtgactt\",\n \"gtgcttccaattgacatcttcagcgccatattgcgctggccaaggtgacggagcgggatt\",\n \"acgaaagcatgatcatggctgttgttctgtttatcttgttttgactgagacttgttagga\",\n \"tagacggtttttcatcactgactagccaaagccttactctgcctgacatcgaccgtaaat\",\n \"tgataatgaatttacatgcttccgcgacgatttacctcttgatcatcgatccgattgaag\",\n \"atcttcaattgttaattctcttgcctcgactcatagccatgatgagctcttgatcatgtt\",\n \"tccttaaccctctattttttacggaagaatgatcaagctgctgctcttgatcatcgtttc\"]\ntext = ''.join(texts)\nfrom LRS_functions import LRS\nLRS(text)",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "ctcttgatcatcg 13\n{'ctcttgatcatcg': 2}\nset([523, 395])\n"
}
],
"prompt_number": 12
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": "Bioinformatics: n-grams"
},
{
"cell_type": "code",
"collapsed": false,
"input": "from LRS_functions import ngram\ntext = \"ACAACTATGCATACTATCGGGAACTATCCT\"\nngram(text, 5)",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 1,
"text": "{'AACTA': (2, [2, 21]),\n 'ACAAC': (1, [0]),\n 'ACTAT': (3, [3, 12, 22]),\n 'ATACT': (1, [10]),\n 'ATCCT': (1, [25]),\n 'ATCGG': (1, [15]),\n 'ATGCA': (1, [6]),\n 'CAACT': (1, [1]),\n 'CATAC': (1, [9]),\n 'CGGGA': (1, [17]),\n 'CTATC': (2, [13, 23]),\n 'CTATG': (1, [4]),\n 'GAACT': (1, [20]),\n 'GCATA': (1, [8]),\n 'GGAAC': (1, [19]),\n 'GGGAA': (1, [18]),\n 'TACTA': (1, [11]),\n 'TATCC': (1, [24]),\n 'TATCG': (1, [14]),\n 'TATGC': (1, [5]),\n 'TCGGG': (1, [16]),\n 'TGCAT': (1, [7])}"
}
],
"prompt_number": 1
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": "Bioinformatics: Thermotoga-petrophila.txt: 1.82 million characters (A, G, C, T)"
},
{
"cell_type": "code",
"collapsed": false,
"input": "In [13]: \nfrom LRS_functions import LRS\nwith open(\"Thermotoga-petrophila.txt\") as f:\n t = f.read()\nLRS(t[:50000])\n1.60000014305\nCTTCTTCGACGAGTTT 16\nAAATCAACGGAGAAAA 16\n{'CTTCTTCGACGAGTTT': 2, 'AAATCAACGGAGAAAA': 2}\nset([21540, 38178, 1452, 25846])",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": "In [15]: \nfrom LRS_functions import LRS\nwith open(\"Thermotoga-petrophila.txt\") as f:\n t = f.read()\nLRS(t[:100000])\n3.44000005722\nGAATCCCTCGGAGTGAAAG 19\n{'GAATCCCTCGGAGTGAAAG': 2}\nset([52209, 53860])",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": "Random Data: 1 million bits: Math.random.choice"
},
{
"cell_type": "code",
"collapsed": false,
"input": "In [2]: run LRS_script \"mathrandom.txt\"\ndone reading file 1000000\n69.7239999771\n10185.8770001\n37\n{'1010101000010110000101101001101101101': 2}\nset([854477, 161527])",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": "'''\npattern: all zeros\nlength # matches\n14 zeros 32\n15 zeros 15\n16 zeros 8\n17 zeros 3\n18 zeros 0\n19 zeros 0\n'''",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": "'''\npattern: all ones\nlength # matches\n14 ones 29\n15 ones 15\n16 ones 7\n17 ones 4\n18 ones 4\n19 ones 3\n20 ones 3\n21 ones 3\n22 ones 2\n23 ones 1\n24 ones 1\n25 ones 1\n26 ones 0\n27 ones 0\n'''",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": "'''\npattern: '10' repeated 10 times\n# matches\n1 at [787696]\n\npattern: '01' repeated 10 times\n# matches\n2 at [35557, 787695]\n'''",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": "Random Data: 1 million bits: Crypto.Random.random.choice"
},
{
"cell_type": "code",
"collapsed": false,
"input": "In [2]: run LRS_script.py \"cryptorandom.txt\"\ndone reading file 1000000\n53.4900000095\n7457.4849999\n40\n{'0100110000010101101010000100000000100010': 2}\nset([929537, 968410])",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": "'''\npattern: all zeros\nlength # matches\n14 zeros 23\n15 zeros 9\n16 zeros 4\n17 zeros 4\n18 zeros 2\n19 zeros 1\n20 zeros 1\n21 zeros 0\n22 zeros 0\n'''",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": "'''\npattern: all ones\nlength # matches\n14 ones 30\n15 ones 14\n16 ones 6\n17 ones 4\n18 ones 1\n19 ones 0\n20 ones 0\n21 ones 0\n22 ones 0\n23 ones 0\n24 ones 0\n25 ones 0\n26 ones 0\n27 ones 0\n'''",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": "'''\npattern: '10' repeated 10 times\n# matches\n1 at [790108]\n\npattern: '01' repeated 10 times\n# matches\n0\n\npattern: '01' repeated 9 times\n# matches\n2\n'''",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": "Plagarism: Longest Common String"
},
{
"cell_type": "code",
"collapsed": false,
"input": "\"HuffPost: $1 Million Lottery Ticket Stolen By New York Deli Owners, Police Allege\"\n\"GuardianExpress: Lottery Ticket Worth $1 Million Dollars Stolen From Winner\"\n\nfrom LRS_functions import LCS\nLCS(\"HuffPost.txt\", \"GuardianExpress.txt\")",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "{' was a simple mistake on a payout ': 2}\nset([1245, 1006])\n"
}
],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": "'''\nGuardianExpress: Their attorney alleges that this was a simple mistake on a payout from a lottery machine, reported Newsday.\n\nHuffPost: According to Newsday, their attorney said it was a simple mistake on a payout on a lottery machine.\n'''",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": "Knuth-Morris-Pratt: Matching a pattern in a Streaming Twitter Feed: No Backup in the Feed!!!"
},
{
"cell_type": "raw",
"metadata": {},
"source": "Don Knuth, Jim Morris, Vaughn Pratt: 2 theoreticians and a hacker INDEPENDENTLY discovered this algorithm (1977)"
},
{
"cell_type": "code",
"collapsed": false,
"input": "class DFA(object):\n\tdef __init__(self, pat):\n\t\tself.pat = pat\n\n\t\t# build dfa from pattern\n\t\tself.M = len(self.pat)\n\t\tself.dfa = np.zeros((StringUtils.R, self.M+1))\n\t\t# initialize dfa construction\n\t\tself.dfa[ord(self.pat[0])][0] = 1\n\t\t# X refers to state after matching pat on text[1:] (shifted to the right 1 character)\n\t\tX = 0\n\t\tfor j in range(1, self.M):\n\t\t\tfor c in range(StringUtils.R):\n\t\t\t\t# on a mismatch: set the state transitions from j to be identical to those from X\n\t\t\t\tself.dfa[c][j] = self.dfa[c][X]\n\t\t\t# on a match: set state transition from j to j + 1\n\t\t\tself.dfa[ord(self.pat[j])][j] = j + 1\n\t\t\t# update state X\n\t\t\tX = self.dfa[ord(self.pat[j])][X]\n\t\t# needed to find next match after a match has been found\n\t\tfor c in range(StringUtils.R):\n\t\t\tself.dfa[c][self.M] = self.dfa[c][X]",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": "from twitter import TwitterStream, OAuth\nfrom KnuthMorrisPratt import KMP\n\npat = \"attack at dawn\"\nkmp = KMP(pat)\npattern_index = 0\n\nid_dict = {\"wikileaks\": \"16589206\", \"aljazeera\": \"76067316\", \"NYFBI\": \"211635204\", \"reutersiran\": \"47633485\", \"WhiteHouse\":\"30313925\"}\npublic_stream = TwitterStream(auth = auth, domain='stream.twitter.com')\niterator = public_stream.statuses.filter(language=\"en\", follow=','.join(id_dict.values()), track=\"terrorism, weapons, drone attack, i want to kill\")\nfor item in iterator:\n\tif item['retweeted'] == False:\n\t\ttry:\n\t\t\t#print (''.join((item['user']['screen_name'], ':', item['text'])))\n\t\t\ttext = item['text']\n\t\t\tprint text\n\t\t\tfor c in text.encode():\n\t\t\t\tpattern_index = kmp.dfa.next(c, pattern_index)\n\t\t\t\tif int(pattern_index) == len(pat):\n\t\t\t\t\tprint \"MATCH!!!!!\"\n\t\texcept:\n\t\t\tcontinue",
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment