Skip to content

Instantly share code, notes, and snippets.

@sleemer
Last active June 28, 2018 15:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sleemer/f9849b7a8a2c17a30e771df30ed8c55c to your computer and use it in GitHub Desktop.
Save sleemer/f9849b7a8a2c17a30e771df30ed8c55c to your computer and use it in GitHub Desktop.
Word frequency counter algorithm based on MemoryMappedFile
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.MemoryMappedFiles;
using System.Linq;
namespace WordFrequencyCounter
{
public sealed class WordCounter
{
private static readonly string[] Separator = { " ", Environment.NewLine };
public static IDictionary<string, int> Process(string fileName)
{
var fileSize = new FileInfo(fileName).Length;
var chunkCount = 5;
var chunkSize = (int)fileSize / chunkCount;
var chunkInfos = Enumerable.Range(0, chunkCount)
.Select(i => new Tuple<long, long>(i * chunkSize, i == chunkCount - 1 ? fileSize - i * chunkSize : chunkSize))
.ToArray();
var chunks = chunkInfos.Select(info => ProcessChunk(fileName, info.Item1, info.Item2))
.ToArray();
return MergeChunkResults(chunks);
}
private static ChunkResult ProcessChunk(string fileName, long offset, long size)
{
using (var mmf = MemoryMappedFile.CreateFromFile(fileName))
using (var stream = mmf.CreateViewStream(offset, size))
using (var reader = new StreamReader(stream))
{
var dictionary = new Dictionary<string, int>(StringComparer.InvariantCultureIgnoreCase);
string line;
string firstWord = null;
string lastWord = null;
string raw = string.Empty;
while ((line = reader.ReadLine()) != null)
{
raw = string.Join(" ", raw, line);
var words = line.Split(Separator, StringSplitOptions.None);
for (var i = 0; i < words.Length; i++)
{
if (firstWord == null && words.Length > 0) firstWord = words[i];
if (!string.IsNullOrWhiteSpace(lastWord))
{
if (dictionary.ContainsKey(lastWord)) dictionary[lastWord]++;
else dictionary[lastWord.ToLower()] = 1;
}
lastWord = words[i];
}
}
return new ChunkResult
{
FirstWord = firstWord,
LastWord = lastWord,
Words = dictionary,
Raw = raw
};
}
}
private static IDictionary<string, int> MergeChunkResults(ChunkResult[] chunkResults)
{
var dictionary = new Dictionary<string, int>();
var words = chunkResults.SelectMany(x => x.Words.AsEnumerable());
foreach (var word in words)
{
if (dictionary.ContainsKey(word.Key)) dictionary[word.Key] += word.Value;
else dictionary.Add(word.Key, word.Value);
}
var additionalWords = new List<string>();
additionalWords.Add(chunkResults.First().FirstWord);
additionalWords.Add(chunkResults.Last().LastWord);
for (var i = 1; i < chunkResults.Length; i++)
{
additionalWords.Add($"{chunkResults[i - 1].LastWord}{chunkResults[i].FirstWord}".Trim());
}
foreach (var word in additionalWords)
{
if (dictionary.ContainsKey(word)) dictionary[word] += 1;
else dictionary.Add(word, 1);
}
return dictionary;
}
private struct ChunkResult
{
public string FirstWord;
public string LastWord;
public Dictionary<string, int> Words;
public string Raw;
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment