Skip to content

Instantly share code, notes, and snippets.

@jdfreder
Last active December 21, 2015 21:49
Show Gist options
  • Save jdfreder/6371184 to your computer and use it in GitHub Desktop.
Digraph possibilities
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"alpha = 'abcdefghijklmnopqrstuvwxyz'\n",
"# Every ordered pair of letters, in lexicographic order ('aa', 'ab', ... 'zz').\n",
"digraphs = [first + second for first in alpha for second in alpha]\n",
"len(digraphs)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 1,
"text": [
"676"
]
}
],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Read the installed English word list and, for every word of two or more\n",
"# characters, record the indices (into `digraphs`) of the distinct digraphs\n",
"# it contains.  A dict gives O(1) digraph lookup instead of list.index(),\n",
"# and filtering here makes len(words) match the printed claim (the old code\n",
"# also counted one-character words and blank lines).\n",
"digraph_index = dict((digraph, i) for i, digraph in enumerate(digraphs))\n",
"with open('wordsEn.txt', 'r') as f:\n",
"    words = {}\n",
"    for word in f.readlines():\n",
"        word = word.strip()\n",
"        if len(word) > 1:\n",
"            words[word] = []\n",
"            for i in range(len(word) - 1):\n",
"                index = digraph_index.get(word[i:i+2])\n",
"                if index is not None and index not in words[word]:\n",
"                    words[word].append(index)\n",
"print(\"There are %d words with two or more characters in the installed English dictionary\" % len(words))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"There are 109583 words with two or more characters in the installed English dictionary\n"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Scan every word for the one carrying the largest number of distinct digraphs.\n",
"best_word = ''\n",
"best_count = 0\n",
"for candidate, digraph_indicies in words.items():\n",
"    count = len(digraph_indicies)\n",
"    if count > best_count:\n",
"        best_word, best_count = candidate, count\n",
"\n",
"print(\"The single english word containing the most digraphs is %s, with a total of %d digraphs\" % (best_word, best_count))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"The single english word containing the most digraphs is antidisestablishmentarianism, with a total of 27 digraphs\n"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Greedily lay digraphs end to end; whenever the seam between the last two\n",
"# digraphs happens to form a not-yet-used digraph, cross that one off too.\n",
"from copy import copy\n",
"remaining_digraphs = copy(digraphs)\n",
"digraph_sentence = []\n",
"while remaining_digraphs:\n",
"    digraph_sentence.append(remaining_digraphs.pop(0))\n",
"    if len(digraph_sentence) > 1:\n",
"        seam = digraph_sentence[-2][1] + digraph_sentence[-1][0]\n",
"        if seam in remaining_digraphs:\n",
"            remaining_digraphs.remove(seam)\n",
"digraph_sentence = ''.join(digraph_sentence)\n",
"\n",
"# Sanity check: every digraph really does appear somewhere in the string.\n",
"for digraph in digraphs:\n",
"    assert digraph in digraph_sentence\n",
"\n",
"print(\"The following string of %d characters contains all %d digraphs:\\n\" % (len(digraph_sentence), len(digraphs)))\n",
"print(digraph_sentence)\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"The following string of 702 characters contains all 676 digraphs:\n",
"\n",
"aaabacadaeafagahaiajakalamanaoapaqarasatauavawaxayazbbbcbdbebfbgbhbibjbkblbmbnbobpbqbrbsbtbubvbwbxbybzcccdcecfcgchcicjckclcmcncocpcqcrcsctcucvcwcxcyczdddedfdgdhdidjdkdldmdndodpdqdrdsdtdudvdwdxdydzeeefegeheiejekelemeneoepeqereseteuevewexeyezfffgfhfifjfkflfmfnfofpfqfrfsftfufvfwfxfyfzggghgigjgkglgmgngogpgqgrgsgtgugvgwgxgygzhhhihjhkhlhmhnhohphqhrhshthuhvhwhxhyhziiijikiliminioipiqirisitiuiviwixiyizjjjkjljmjnjojpjqjrjsjtjujvjwjxjyjzkkklkmknkokpkqkrksktkukvkwkxkykzlllmlnlolplqlrlsltlulvlwlxlylzmmmnmompmqmrmsmtmumvmwmxmymznnnonpnqnrnsntnunvnwnxnynzooopoqorosotouovowoxoyozpppqprpsptpupvpwpxpypzqqqrqsqtquqvqwqxqyqzrrrsrtrurvrwrxryrzssstsusvswsxsysztttutvtwtxtytzuuuvuwuxuyuzvvvwvxvyvzwwwxwywzxxxyxzyyyzza\n"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Walk the sentence looking for digraphs that occur more than once; every\n",
"# repeat marks a spot where the string can be split without losing coverage.\n",
"seen = []\n",
"split_at = []\n",
"for i in range(len(digraph_sentence) - 1):\n",
"    pair = digraph_sentence[i:i+2]\n",
"    if pair in seen:\n",
"        split_at.append(i + 1)\n",
"    else:\n",
"        seen.append(pair)\n",
"print(\"There are %d extra digraphs in the above string.\" % len(split_at))\n",
"# Insert spaces back-to-front so earlier split positions stay valid.\n",
"for split_index in reversed(split_at):\n",
"    digraph_sentence = digraph_sentence[:split_index] + ' ' + digraph_sentence[split_index:]\n",
"print(\"Splitting at those positions yields:\\n\" + digraph_sentence)\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"There are 25 extra digraphs in the above string.\n",
"Splitting at those positions yields:\n",
"aa abacadaeafagahaiajakalamanaoapaqarasatauavawaxayazbb bcbdbebfbgbhbibjbkblbmbnbobpbqbrbsbtbubvbwbxbybzcc cdcecfcgchcicjckclcmcncocpcqcrcsctcucvcwcxcyczdd dedfdgdhdidjdkdldmdndodpdqdrdsdtdudvdwdxdydzee efegeheiejekelemeneoepeqereseteuevewexeyezff fgfhfifjfkflfmfnfofpfqfrfsftfufvfwfxfyfzgg ghgigjgkglgmgngogpgqgrgsgtgugvgwgxgygzhh hihjhkhlhmhnhohphqhrhshthuhvhwhxhyhzii ijikiliminioipiqirisitiuiviwixiyizjj jkjljmjnjojpjqjrjsjtjujvjwjxjyjzkk klkmknkokpkqkrksktkukvkwkxkykzll lmlnlolplqlrlsltlulvlwlxlylzmm mnmompmqmrmsmtmumvmwmxmymznn nonpnqnrnsntnunvnwnxnynzoo opoqorosotouovowoxoyozpp pqprpsptpupvpwpxpypzqq qrqsqtquqvqwqxqyqzrr rsrtrurvrwrxryrzss stsusvswsxsysztt tutvtwtxtytzuu uvuwuxuyuzvv vwvxvyvzww wxwywzxx xyxzyy yzza\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def get_missing_digraphs(wordset, digraphset=digraphs):\n",
"    # Return the digraphs of `digraphset` that no word in `wordset` covers,\n",
"    # in ascending `digraphs`-index order.  A set of index positions gives\n",
"    # O(1) membership/removal; the old `range(len(digraphs))` followed by\n",
"    # .remove() only worked under Python 2, where range() returned a list.\n",
"    remaining_digraphs = set(range(len(digraphs)))\n",
"    for word in wordset:\n",
"        for index in words[word]:\n",
"            remaining_digraphs.discard(index)\n",
"        if not remaining_digraphs:\n",
"            return []\n",
"\n",
"    # Keep only the survivors the caller cares about, preserving index order.\n",
"    missing_digraphs = []\n",
"    for i in sorted(remaining_digraphs):\n",
"        if digraphs[i] in digraphset:\n",
"            missing_digraphs.append(digraphs[i])\n",
"    return missing_digraphs\n",
"\n",
"missing_digraphs = get_missing_digraphs(words.keys())\n",
"print(\"There are %d digraphs missing from the installed English dictionary:\" % len(missing_digraphs))\n",
"print(', '.join(missing_digraphs))\n",
"\n",
"english_digraphs = copy(digraphs)\n",
"for missing_digraph in missing_digraphs:\n",
"    english_digraphs.remove(missing_digraph)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"There are 83 digraphs missing from the installed English dictionary:\n",
"bq, bz, cf, cj, cv, cx, fq, fv, fx, fz, gq, gv, gx, hx, hz, jb, jd, jf, jg, jh, jl, jm, jp, jq, jr, js, jt, jv, jw, jx, jy, jz, kq, kx, kz, mx, mz, pq, pv, px, qb, qc, qd, qf, qg, qh, qj, qk, ql, qm, qn, qp, qq, qv, qw, qx, qy, qz, sx, tq, vb, vf, vh, vj, vk, vm, vp, vq, vw, vx, wq, wv, wx, xd, xj, xk, xr, xz, yq, yy, zf, zr, zx\n"
]
}
],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from IPython.utils.text import wrap_paragraphs\n",
"# Materialize the keys: under Python 3 dict.keys() returns a view that\n",
"# cannot be indexed by position, which the loop below relies on.\n",
"english_words = list(words.keys())\n",
"\n",
"def get_words(start_index):\n",
"    # Greedily collect words, starting at start_index and wrapping around the\n",
"    # dictionary at most once, until every English digraph is covered.  A word\n",
"    # is kept only if it contributes at least one digraph not yet covered.\n",
"    index = start_index\n",
"\n",
"    sentence_words = []\n",
"    used_digraphs = []\n",
"    missing_digraph_count = len(get_missing_digraphs(sentence_words, english_digraphs))\n",
"\n",
"    while missing_digraph_count > 0 and (index != start_index or len(sentence_words) == 0):\n",
"        # Try adding a word to the sentence\n",
"        new_missing_digraph_count = missing_digraph_count\n",
"        for digraph_index in words[english_words[index]]:\n",
"            if digraph_index not in used_digraphs:\n",
"                used_digraphs.append(digraph_index)\n",
"                new_missing_digraph_count -= 1\n",
"\n",
"        # Only keep the word if it lowered the number of missing digraphs.\n",
"        if new_missing_digraph_count < missing_digraph_count:\n",
"            sentence_words.append(english_words[index])\n",
"\n",
"        missing_digraph_count = new_missing_digraph_count\n",
"        index += 1\n",
"        if index == len(english_words):\n",
"            index = 0\n",
"    return sentence_words\n",
"\n",
"print(wrap_paragraphs(\" \".join(get_words(4037)), 80)[0])"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"spectrometric brilliantly bilious xerography furrieries imbued disparages\n",
"federating prejudgment welched addressees survival disciplining wheedlers\n",
"moussaka crockery ferrules unready pylori mantlings heartbroken pollywog rasping\n",
"unequivocally vermont arointed whereby hurriers supervisal humble vocalizers\n",
"client focusers casbah sullenly entwist tabling wops bereave daguerreotypes\n",
"trawleys taipei suttees newton cesiums poteens calypsos extol beautifying\n",
"blowback similarities corniche wardens cowhand slouchingly deponents monotheists\n",
"evertor roentgenogram argonauts totipotency percussional balmiest hemorrhoidal\n",
"designers umbels eroded axels surgical bhangs freshens temporally nighs punkier\n",
"fluoride cirrus mazer hypnic dualities brutishness refastens finitudes\n",
"eluviating stickums zanies bobbinets dolce overgrazes slipways bandmasters\n",
"rechristens subclassified barfly swervers skidooing jaggeder disarrayed catchall\n",
"candlesticks obligate overblown plummiest overblows algicides snoopiest\n",
"illegitimating nonbeings emulous divisibleness hautboys kennelled ragtag aorta\n",
"postscript xiii foamily nimrods uglify vaultier schmaltziest jodhpur infielder\n",
"authoritarianism biorhythmicities skinny chairperson banquets gonococcic\n",
"bunkhouse silverware pachyderm wholeheartedness murkest flyby puppyish subahdars\n",
"ophthalmologist dismalness gyving standpat kludge alfa gonofs neuropsychiatry\n",
"rondeaux pigeonholing alertness knickknacks quixotes sleepwalkers workwoman\n",
"barbwire puffer wraths fbi yoni ibm hobnailed syntactically shaftings gymnosperm\n",
"exclude snapdragon peerless songbook heterosexuals endemics barnyard thumbprint\n",
"wreckful hardcovers maxwell hallways reconveys paroquets advantaged nonmalicious\n",
"methaqualone nonprofit outdistanced overexpands reinvestment fleshly carboxyl\n",
"fishhooks mawkishness nuthouse shrikes snowdrop unneighborly circumventions\n",
"telekinesis clipboards muzzling overjoy tramlines midwived aery reacquired\n",
"czechoslovakian outpayment rubaiyat skoaled squatted jujitsu bagpiper\n",
"copyrighting oxygenated folktales skycap thoroughgoing bazaar benzol albinism\n",
"chevrons unlawfulness pooh thoughtful handball unsubtly mezuzahs kishkas\n",
"substantialness exactest disjointedly reject comfortless liquefies dozenth\n",
"dogwatch disdaining heavy subjectiveness cajaputs dishwasher picnicking\n",
"spoonsful swimmy handfasted wallpapered songfully upchucks ballyhoo chowchow\n",
"vulnerable unadjudicated obviously thewy leafhoppers subfloors healthful\n",
"foolhardier jimsonweed cowgirl zemstvos sackcloth abductors israelis beachcomber\n",
"evzone weltanschauung blackbird oxgall hardihood uncircumcised folkmotes boxlike\n",
"cowpoke blancmanges fizgig jnana cavalryman kvetch auk gulfweed fjord dogcart\n",
"clerkdom exhorter blowzy ebcdic campfires fellowmen hafniums bootjacks hijacker\n",
"iraqi jellyfish dojo marquess asphyxy thanksgivings knackwursts scherzos xxv\n",
"hundredth yules thruways blackguards injector blowjob inkpots dummkopfs qaid\n",
"bowwow foxskin taiwanese exfoliate dykes overbuy handkerchief zigzags\n",
"pocketknives dogdom highjacked syzygial muckrakers dumdums farmhouses skyjackers\n",
"skivvies huh propjet gazpacho gizmo bumpkinish escarpment jinxing filmgoer\n",
"subkingdoms boomtown afghanis xmas stockjobber cpi halfpenny offcut blvd\n",
"pavlovian gingko complexness engulfment transverse ramjets mpg gumwood\n",
"exquisiteness subgenera oxbow serfdom grosz calxes tuque cwt hindquarters\n",
"pirozhki schmalz zwieback blitzkrieged mezcals cumquats gadzooks hajj czarevnas\n",
"muzjiks mitzvahs marxists sacbuts qts qophs kafka gjetost mezquites cgs whizbang\n",
"aztecan avg earthquakes dostoevsky tx pbx pulque qed hdqrs dx vt killjoys vc zn\n",
"leipzig iqs nashville samizdat nietzsche jct pirojki\n"
]
}
],
"prompt_number": 40
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment