Last active
June 28, 2018 15:49
-
-
Save sleemer/f9849b7a8a2c17a30e771df30ed8c55c to your computer and use it in GitHub Desktop.
Word frequency counter algorithm based on MemoryMappedFile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.IO.MemoryMappedFiles; | |
using System.Linq; | |
namespace WordFrequencyCounter | |
{ | |
public sealed class WordCounter | |
{ | |
private static readonly string[] Separator = { " ", Environment.NewLine }; | |
public static IDictionary<string, int> Process(string fileName) | |
{ | |
var fileSize = new FileInfo(fileName).Length; | |
var chunkCount = 5; | |
var chunkSize = (int)fileSize / chunkCount; | |
var chunkInfos = Enumerable.Range(0, chunkCount) | |
.Select(i => new Tuple<long, long>(i * chunkSize, i == chunkCount - 1 ? fileSize - i * chunkSize : chunkSize)) | |
.ToArray(); | |
var chunks = chunkInfos.Select(info => ProcessChunk(fileName, info.Item1, info.Item2)) | |
.ToArray(); | |
return MergeChunkResults(chunks); | |
} | |
private static ChunkResult ProcessChunk(string fileName, long offset, long size) | |
{ | |
using (var mmf = MemoryMappedFile.CreateFromFile(fileName)) | |
using (var stream = mmf.CreateViewStream(offset, size)) | |
using (var reader = new StreamReader(stream)) | |
{ | |
var dictionary = new Dictionary<string, int>(StringComparer.InvariantCultureIgnoreCase); | |
string line; | |
string firstWord = null; | |
string lastWord = null; | |
string raw = string.Empty; | |
while ((line = reader.ReadLine()) != null) | |
{ | |
raw = string.Join(" ", raw, line); | |
var words = line.Split(Separator, StringSplitOptions.None); | |
for (var i = 0; i < words.Length; i++) | |
{ | |
if (firstWord == null && words.Length > 0) firstWord = words[i]; | |
if (!string.IsNullOrWhiteSpace(lastWord)) | |
{ | |
if (dictionary.ContainsKey(lastWord)) dictionary[lastWord]++; | |
else dictionary[lastWord.ToLower()] = 1; | |
} | |
lastWord = words[i]; | |
} | |
} | |
return new ChunkResult | |
{ | |
FirstWord = firstWord, | |
LastWord = lastWord, | |
Words = dictionary, | |
Raw = raw | |
}; | |
} | |
} | |
private static IDictionary<string, int> MergeChunkResults(ChunkResult[] chunkResults) | |
{ | |
var dictionary = new Dictionary<string, int>(); | |
var words = chunkResults.SelectMany(x => x.Words.AsEnumerable()); | |
foreach (var word in words) | |
{ | |
if (dictionary.ContainsKey(word.Key)) dictionary[word.Key] += word.Value; | |
else dictionary.Add(word.Key, word.Value); | |
} | |
var additionalWords = new List<string>(); | |
additionalWords.Add(chunkResults.First().FirstWord); | |
additionalWords.Add(chunkResults.Last().LastWord); | |
for (var i = 1; i < chunkResults.Length; i++) | |
{ | |
additionalWords.Add($"{chunkResults[i - 1].LastWord}{chunkResults[i].FirstWord}".Trim()); | |
} | |
foreach (var word in additionalWords) | |
{ | |
if (dictionary.ContainsKey(word)) dictionary[word] += 1; | |
else dictionary.Add(word, 1); | |
} | |
return dictionary; | |
} | |
private struct ChunkResult | |
{ | |
public string FirstWord; | |
public string LastWord; | |
public Dictionary<string, int> Words; | |
public string Raw; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment