Skip to content

Instantly share code, notes, and snippets.

Created August 11, 2020 20:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Voidsay/d84a64fc94e79a31c1a005da1245c2cc to your computer and use it in GitHub Desktop.
Save Voidsay/d84a64fc94e79a31c1a005da1245c2cc to your computer and use it in GitHub Desktop.
Super inefficient text comparator based on Jaccard index with toggleable spellcheck
using System;
using WeCantSpell.Hunspell;
using System.Collections.Generic;
using System.IO;
using System.Text;
using NHunspell;//doesn't work on linux or mac
namespace TextAnalysis
{
    /// <summary>
    /// Compares two text files using a word-count-weighted Jaccard index,
    /// optionally spell-correcting both texts before counting.
    /// </summary>
    public class Analysis
    {
        // Loaded on first use by Correct(), so runs without spellcheck never
        // pay for (or depend on) the dictionary files.
        private WordList dictionary;
        private readonly string dictionaryFile = "en_US.dic";

        /// <summary>
        /// Reads both files, optionally spell-corrects them, and prints their
        /// Jaccard index to the console.
        /// </summary>
        /// <param name="path1">Path of the first text file.</param>
        /// <param name="path2">Path of the second text file.</param>
        /// <param name="spellcheck">
        /// When true, every word is run through the spell checker first. This
        /// is much slower than a plain comparison.
        /// </param>
        public Analysis(string path1, string path2, bool spellcheck)
        {
            var dic1 = LoadWordCounts(path1, spellcheck);
            var dic2 = LoadWordCounts(path2, spellcheck);

            float similarity = Jaccard(dic1, dic2);
            Console.WriteLine("Jaccard index of " + path1 + " and " + path2 + " is " + similarity);
        }

        /// <summary>
        /// Computes a count-weighted Jaccard index: the summed counts of words
        /// present in both dictionaries divided by the summed counts of all
        /// words. Also prints both partial sums to the console.
        /// </summary>
        /// <param name="dictionary1">Word counts of the first text.</param>
        /// <param name="dictionary2">Word counts of the second text.</param>
        /// <returns>
        /// A value between 0 (no shared words) and 1 (identical vocabularies).
        /// Two empty dictionaries are treated as identical and yield 1.
        /// </returns>
        /// <exception cref="ArgumentNullException">Either argument is null.</exception>
        public float Jaccard(Dictionary<string, int> dictionary1, Dictionary<string, int> dictionary2)
        {
            if (dictionary1 == null)
            {
                throw new ArgumentNullException(nameof(dictionary1));
            }
            if (dictionary2 == null)
            {
                throw new ArgumentNullException(nameof(dictionary2));
            }

            int unionSum = 0;       // counts of words found in both texts
            int differenceSum = 0;  // counts of words found in only one text

            foreach (var entry in dictionary1)
            {
                if (dictionary2.TryGetValue(entry.Key, out int otherCount))
                {
                    unionSum += entry.Value + otherCount;
                }
                else
                {
                    differenceSum += entry.Value;
                }
            }

            // Words shared with dictionary1 were already handled above.
            foreach (var entry in dictionary2)
            {
                if (!dictionary1.ContainsKey(entry.Key))
                {
                    differenceSum += entry.Value;
                }
            }

            Console.WriteLine("\n\n\nunion score: " + unionSum + " difference score: " + differenceSum);

            int total = unionSum + differenceSum;
            if (total == 0)
            {
                // Avoid 0/0 (NaN): two empty word sets are considered identical.
                return 1f;
            }

            return (float)unionSum / total;
        }

        /// <summary>
        /// Replaces every word the dictionary does not recognise with the
        /// spell checker's first suggestion; words without any suggestion are
        /// kept unchanged.
        /// </summary>
        /// <param name="input">Text to correct.</param>
        /// <returns>The corrected words joined by single spaces.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
        public string Correct(string input)
        {
            if (input == null)
            {
                throw new ArgumentNullException(nameof(input));
            }

            Console.WriteLine("\npreforming spellcheck, this might take a while\n");

            if (dictionary == null)
            {
                dictionary = WordList.CreateFromFiles(dictionaryFile);
            }

            // A null separator splits on any whitespace, so words separated by
            // newlines or tabs are checked individually instead of as one token.
            var words = input.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

            for (int i = 0; i < words.Length; i++)
            {
                if (dictionary.Check(words[i]))
                {
                    continue; // spelled correctly
                }

                foreach (string suggestion in dictionary.Suggest(words[i]))
                {
                    words[i] = suggestion;
                    break; // only the first suggestion is used
                }
            }

            return string.Join(" ", words);
        }

        /// <summary>
        /// Counts how often each whitespace-separated word occurs in the text.
        /// The comparison is case-sensitive and punctuation is not stripped.
        /// </summary>
        /// <param name="input">Text to analyse.</param>
        /// <returns>A map from each distinct word to its number of occurrences.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
        public Dictionary<string, int> CountWords(string input)
        {
            if (input == null)
            {
                throw new ArgumentNullException(nameof(input));
            }

            var counts = new Dictionary<string, int>();
            var words = input.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

            foreach (string word in words)
            {
                // TryGetValue leaves 'seen' at 0 for a new word, so the first
                // occurrence is stored as exactly 1.
                counts.TryGetValue(word, out int seen);
                counts[word] = seen + 1;
            }

            return counts;
        }

        /// <summary>
        /// Reads the whole file as UTF-8 text.
        /// </summary>
        /// <param name="path">Path of the file to read.</param>
        /// <returns>The file content; never empty.</returns>
        /// <exception cref="FileNotFoundException">The file does not exist.</exception>
        /// <exception cref="InvalidDataException">The file contains no text.</exception>
        public string OpenFile(string path)
        {
            if (!File.Exists(path))
            {
                throw new FileNotFoundException("Textfile not found: " + path, path);
            }

            // ReadAllText reads to the end of the file; a single Stream.Read
            // call is allowed to return fewer bytes than requested.
            string text = File.ReadAllText(path, Encoding.UTF8);

            if (text.Length <= 0)
            {
                throw new InvalidDataException("text file empty, nothing to do. " + path);
            }

            return text;
        }

        /// <summary>
        /// Reads one file, optionally spell-corrects it, and counts its words.
        /// </summary>
        private Dictionary<string, int> LoadWordCounts(string path, bool spellcheck)
        {
            var text = OpenFile(path);
            if (spellcheck)
            {
                text = Correct(text);
            }

            return CountWords(text);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment