Skip to content

Instantly share code, notes, and snippets.

@daiplusplus
Created May 5, 2023 04:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save daiplusplus/d9f0df6ecd17657e382f90213db3db71 to your computer and use it in GitHub Desktop.
Save daiplusplus/d9f0df6ecd17657e382f90213db3db71 to your computer and use it in GitHub Desktop.
Compound word finder for StackOverflow 76178746
// https://stackoverflow.com/questions/76178746/finding-compound-words-in-a-text-file-of-150-000-words
Trie trie = new Trie(); // From NuGet `rm.Trie`: https://www.nuget.org/packages/rm.Trie
Stopwatch sw = Stopwatch.StartNew();
// From https://github.com/dwyl/english-words/blob/master/words.txt
const String PATH = @".\english-words-400k.txt";
// Pass 1: Population.
using( FileStream fs = OpenAsyncSequentialReadFileStream( PATH ) )
using( StreamReader rdr = new StreamReader( fs, Encoding2.Utf8NoBom ) )
{
String? line;
while( ( line = await rdr.ReadLineAsync().ConfigureAwait(false) ) != null )
{
trie.AddWord( line );
}
}
sw.Elapsed.Dump( "Read words." );
sw.Restart();
trie.UniqueCount().Dump( "Unique Count" );
trie.GetLongestWords().Dump( "Longest word" );
// Pass 2: Matching prefixes.
List<( String fullWord, String word1, String remainder )> p2 = new List<(String fullWord, String word1, String remainder)>();
using( FileStream fs = OpenAsyncSequentialReadFileStream( PATH ) )
using( StreamReader rdr = new StreamReader( fs, Encoding2.Utf8NoBom ) )
{
String? line;
while( ( line = await rdr.ReadLineAsync().ConfigureAwait(false) ) != null )
{
// For each word:
// For each prefix length (from 4 to word.Length - 4): (using 4 as a min length) (hmm, maybe work backwards?)
// if isWord( prefix ) then suffices.Add( ( word, word-without-prefix ) );
if( line.Length >= 8 )
{
for( Int32 i = 4; i < line.Length - 4; i++ )
{
String prefix = line.Substring( startIndex: 0, length: i );
if( trie.HasWord( prefix ) )
{
p2.Add( ( fullWord: line, word1: prefix, line.Substring( startIndex: i ) ) );
}
}
}
}
}
sw.Elapsed.Dump( "Find words prefixed with other words." );
sw.Restart();
//p2.Dump(); // 179680 results
// Pass 3 and 4: words from p2 where the remaining suffix is also a word:
List<( String fullWord, String word1, String word2, String remainder )> p3 = new List<(String fullWord, String word1, String word2, String remainder)>();
foreach( ( String word, String prefixIsWord, String remaining ) in p2 )
{
for( Int32 i = 4; i < remaining.Length - 4; i++ )
{
String prefixOfRemainder = remaining.Substring( startIndex: 0, length: i );
if( trie.HasWord( prefixOfRemainder ) )
{
p3.Add( (
fullWord: word,
word1: prefixIsWord,
word2: prefixOfRemainder,
remainder: remaining.Substring( startIndex: i )
) );
}
}
}
sw.Elapsed.Dump( "Pass 3: Find words prefixed with other words and also sufficed by other words." );
sw.Restart();
// p3.Dump();
//
List<( String fullWord, String word1, String word2, String word3, String remainder )> p4 = new List<(String fullWord, String word1, String word2, String word3, String remainder)>();
foreach( ( String fullWord, String word1, String word2, String remainder ) in p3 )
{
for( Int32 i = 4; i < remainder.Length - 4; i++ )
{
String prefixOfRemainder = remainder.Substring( startIndex: 0, length: i );
if( trie.HasWord( prefixOfRemainder ) )
{
p4.Add( (
fullWord: fullWord,
word1: word1,
word2: word2,
word3: prefixOfRemainder,
remainder: remainder.Substring( startIndex: i )
) );
}
}
}
sw.Elapsed.Dump( "Pass 4 (again)." );
sw.Restart();
p4.Dump();
//
static FileStream OpenAsyncSequentialReadFileStream(string filePath, int bufferSize = 1048576)
{
return OpenAsyncFileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read, bufferSize);
}
static FileStream OpenAsyncFileStream(string filePath, FileMode mode, FileAccess access, FileShare share, int bufferSize = 1048576, bool sequential = true)
{
FileOptions fo = FileOptions.Asynchronous;
fo = ((!sequential) ? (fo | FileOptions.RandomAccess) : (fo | FileOptions.SequentialScan));
return new FileStream(filePath, mode, access, share, bufferSize, fo);
}
/* `p4` has results like this, unfortunately many of these found "words" in the words.txt file aren't proper words - if I used a more selective words.txt file the results would be more impressive, I think.
// Also, there's a bug in this program: matches in p2 and p3 that are too short for p4 won't be in the `p4.Dump()` output - fixing that is an exercise for the reader.
acetylcholinesterase acetyl choli nest erase
adenylpyrophosphate adenyl pyro phos phate
aerothermodynamic aero ther mody namic
aerothermodynamics aero ther mody namics
alkylbenzenesulfonates alkyl benzene sulfo nates
allotransplantation allo trans plant ation
allotransplantation allo transp lant ation
anti-Aristotelianism anti- Aristo teli anism
anti-immigrationist anti- immi grat ionist
anti-intellectualism anti- intel lect ualism
anti-intellectualist anti- intel lect ualist
anti-intellectuality anti- intel lect uality
anti-pre-existentiary anti- pre- exist entiary
anti-Trinitarianism anti- Trini tari anism
archconfraternity arch conf rate rnity
archconfraternities arch conf rate rnities
archconfraternities arch conf rater nities
astrochronological astr ochro nolo gical
astrospherecentrosomic astr osphere cent rosomic
astrospherecentrosomic astr osphere centro somic
autotransplantation auto trans plant ation
autotransplantation auto transp lant ation
bio-electrogenesis bio- elec trog enesis
centimeter-gram-second centime ter- gram -second
chitino-arenaceous chit ino- aren aceous
cyclotrimethylenetrinitramine cyclo trim ethylene trinitramine
counterinterpretation counter inter pret ation
counterpronunciamento counter pron unci amento
counterpronunciamento counter pron uncia mento
crystallographically cryst allo graph ically
crystalloluminescence cryst allo lumine scence
diphenylaminechlorarsine diphenyl amine chlor arsine
electroamalgamation elect roam alga mation
electrocapillarity elec troca pill arity
electrocauterization elec troca uteri zation
electrodesiccation elec trode sicc ation
electrodesiccation elect rode sicc ation
electrophilically elec trop hili cally
electrophysiological elec trophy siol ogical
electrophysiologically elec trophy siol ogically
electrophysiologist elec trophy siol ogist
electrophoretically elec trop hore tically
electrophoretogram elec trop hore togram
electropneumatically elec trop neum atically
electropneumatically elec trop neuma tically
electropuncturation elec trop unct uration
electropuncturing elec trop unct uring
electrotelethermometer electro tele ther mometer
electrotelethermometer electro tele therm ometer
electrotelethermometer electro tele thermo meter
electro-ultrafiltration electro- ultra filt ration
establishmentarian estab lish ment arian
establishmentarianism estab lish ment arianism
establishmentarianism estab lish menta rianism
extrathermodynamic extra ther mody namic
fire-and-brimstone fire- and- brim stone
formaldehydesulphoxylate form aldehyde sulpho xylate
formaldehydesulphoxylic form aldehyde sulpho xylic
gentlemen-pensioners gentle men- pens ioners
half-intellectually half- intel lect ually
half-questioningly half- ques tion ingly
heterointoxication hete roin toxic ation
heterotransplantation hetero trans plant ation
heterotransplantation hetero transp lant ation
hexachlorocyclohexane hexa chloro cycl ohexane
hexachlorocyclohexane hexa chloro cyclo hexane
hexahydroxycyclohexane hexa hydroxy cycl ohexane
hexahydroxycyclohexane hexa hydroxy cyclo hexane
hydrochlorplatinous hydro chlor plat inous
hydrometallurgically hydro metal lurg ically
hyperconcentration hyper conc entr ation
hyperidealistically hype ride alist ically
hyperintellectually hyper intel lect ually
hyperintellectualness hyper intel lect ualness
hyperphosphorescence hyper phos phore scence
hyperpolysyllabically hyper poly syll abically
hyperpolysyllabically hyper poly syllab ically
hyperpolysyllabically hyper poly syllabi cally
hyperscholastically hyper scho last ically
hyperthermesthesia hyper ther mest hesia
hyposuprarenalism hypo supr aren alism
intellectualisation intel lect ualis ation
intellectualistically intel lect ualis tically
intercomplimentary inter comp lime ntary
intertransformability inter trans form ability
intertransformability inter trans forma bility
lymphosarcomatosis lymph osar coma tosis
magnetofluidmechanics magneto fluid mech anics
magnetohydrodynamically magneto hydro dynam ically
magnetothermoelectricity magneto thermo elec tricity
magnetothermoelectricity magneto thermo elect ricity
myxobacteriaceous myxo bact eria ceous
non-co-operationist non- co-op erat ionist
overconcentrating over conc entr ating
overconcentration over conc entr ation
overdeliberateness over deli berat eness
overeditorialized over edit oria lized
overeditorializing over edit oria lizing
overeditorializing over edit orial izing
overindividualistically over individ ualis tically
overintellectualism over intel lect ualism
overintellectuality over intel lect uality
overintellectualization over intel lect ualization
overintellectualization over intel lectual ization
overintellectualize over intel lect ualize
overintellectualized over intel lect ualized
overintellectualizing over intel lect ualizing
overintellectualizing over intel lectual izing
overintellectually over intel lect ually
overintellectualness over intel lect ualness
overmilitaristically over milit arist ically
overparticularness over parti cula rness
overpolemicalness over pole mica lness
overpronunciation over pron unci ation
oversentimentalism over senti ment alism
oversentimentality over senti ment ality
oversentimentalize over senti ment alize
oversentimentalized over senti ment alized
oversentimentalized over senti menta lized
oversentimentalizing over senti ment alizing
oversentimentalizing over senti menta lizing
oversentimentalizing over senti mental izing
oversystematicalness over system atic alness
oversuperstitiously over supers titi ously
oversuperstitiousness over supers titi ousness
parachromatophorous para chroma toph orous
paraphenylenediamine para phenyl ened iamine
pentamethylenediamine penta methyl ened iamine
phenylthiocarbamide phenyl thio carb amide
philosophicopsychological philos ophic opsy chological
phonautographically phon auto graph ically
phosphatidylcholine phos phat idyl choline
phosphoglyceraldehyde phospho glyc eral dehyde
photoautotrophically photo auto trop hically
photoautotrophically photo auto trophi cally
photolithographically photo litho graph ically
photophosphorescent photo phos phore scent
phototelegraphically photo tele graph ically
polytetrafluoroethylene poly tetra fluor oethylene
polyvinylpyrrolidone poly vinyl pyrrol idone
post-Transcendental post- Tran scend ental
private-enterprise priv ate- enter prise
pro-immigrationist pro- immi grat ionist
pro-pre-existentiary pro- pre- exist entiary
pseudochromesthesia pseud ochro mest hesia
pseudointellectually pseudo intel lect ually
pseudointernationalistic pseudo inter nation alistic
pseudointernationalistic pseudo intern ation alistic
pseudolamellibranchiate pseudo lamel libr anchiate
pseudophilanthropically pseudo phil anthrop ically
pseudoscholastically pseudo scho last ically
quasi-complimentary quasi- comp lime ntary
quasi-confidentially quasi- conf ident ially
quasi-everlastingly quasi- ever last ingly
quasi-extraterritorial quasi- extra territ orial
quasi-extraterritorially quasi- extra territ orially
quasi-intellectually quasi- intel lect ually
quasi-internationalistic quasi- inter nation alistic
quasi-internationalistic quasi- intern ation alistic
quasi-militaristically quasi- milit arist ically
quasi-scholastically quasi- scho last ically
self-complacential self- comp lace ntial
self-concentration self- conc entr ation
self-interpretative self- inter pret ative
self-transformation self- trans form ation
semibureaucratically semi bureau crat ically
semidictatorially semi dict ator ially
semidictatorialness semi dict ator ialness
semi-intellectualized semi- intel lect ualized
semi-intellectually semi- intel lect ually
semimountainously semi moun tain ously
semiphosphorescence semi phos phore scence
semiphosphorescent semi phos phore scent
semischolastically semi scho last ically
semisentimentalized semi senti ment alized
semisentimentalized semi senti menta lized
semisupernaturally semi super natu rally
semisupernaturalness semi super natu ralness
semisupernaturalness semi super natura lness
semitransparentness semi trans paren tness
semitransparentness semi transp aren tness
stereophotogrammetry stereo photo gram metry
subject-objectivity subj ect- object ivity
superintellectually super intel lect ually
supersubstantiality super subs tanti ality
tetrabromofluorescein tetra bromo fluor escein
tetraiodophenolphthalein tetra iodo phenol phthalein
thermohyperesthesia thermo hype rest hesia
thermophosphorescence thermo phos phore scence
thermophosphorescent thermo phos phore scent
transformationalist trans form ation alist
transformationalist trans forma tion alist
transversovertical trans vers over tical
trinitrophenylmethylnitramine trinitro phenyl methyl nitramine
ultranationalistically ultra nation alist ically
undenominationalism unde nomina tion alism
undenominationalism unden omina tion alism
undenominationalist unde nomina tion alist
undenominationalist unden omina tion alist
undenominationalize unde nomina tion alize
undenominationalize unden omina tion alize
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment