Skip to content

Instantly share code, notes, and snippets.

@SaschaDittmann
Created April 10, 2015 08:19
Show Gist options
  • Save SaschaDittmann/4053058f042b95b06f4b to your computer and use it in GitHub Desktop.
Save SaschaDittmann/4053058f042b95b06f4b to your computer and use it in GitHub Desktop.
MapReduce Entwurfsmuster – Numerische Aggregation (Standardabweichung 2/2)
/// <summary>
/// Combiner that collapses the raw string values of a key into one
/// <see cref="MedianStdDevData"/> record per distinct integer value.
/// </summary>
public class MedianStdDevCombiner
    : JsonOutReducerCombinerBase<MedianStdDevData>
{
    /// <summary>
    /// Parses every incoming value as an integer, buckets equal integers
    /// together and emits, for each bucket, the integer and its occurrence count.
    /// </summary>
    /// <param name="key">Key the values belong to; re-emitted unchanged.</param>
    /// <param name="values">Integer values in their string form.</param>
    /// <param name="context">Context that receives one record per distinct value.</param>
    public override void Reduce(string key,
        IEnumerable<string> values,
        JsonReducerCombinerContext<MedianStdDevData> context)
    {
        // The lookup keeps buckets in order of first appearance, and all
        // parsing happens before the first record is emitted.
        var occurrences = values.ToLookup(raw => int.Parse(raw));

        foreach (var bucket in occurrences)
        {
            var summary = new MedianStdDevData();
            summary.Value = bucket.Key;
            summary.Count = bucket.Count();
            context.EmitKeyValue(key, summary);
        }
    }
}
/// <summary>
/// Pairs an integer value with the number of times that value was observed.
/// Produced by MedianStdDevCombiner and consumed by MedianStdDevReducer.
/// </summary>
public class MedianStdDevData
{
/// <summary>The observed integer value (the combiner stores the parsed input value here).</summary>
public int Value { get; set; }
/// <summary>How many times <see cref="Value"/> occurred in the combined input.</summary>
public int Count { get; set; }
}
/// <summary>
/// Mapper that emits, for every valid input row, the hour of the row's
/// creation date as key and the length of the row's text as value.
/// </summary>
public class MedianStdDevMapper : MapperBase
{
    // Counter group shared by all counters this mapper reports.
    private const string CounterGroup = "Median / Std. Dev. Mapper";

    /// <summary>
    /// Parses one input line and emits (hour, text length). Rows that cannot be
    /// parsed or lack the required attributes are counted and skipped.
    /// </summary>
    /// <param name="inputLine">One line of input, handed to XmlUtils.ParseXml.</param>
    /// <param name="context">Context used to emit the pair and to report counters.</param>
    public override void Map(string inputLine, MapperContext context)
    {
        var parsed = XmlUtils.ParseXml(inputLine);
        if (parsed == null
            || !parsed.ContainsKey("CreationDate")
            || !parsed.ContainsKey("Text"))
        {
            context.CoreContext.IncrementCounter(CounterGroup, "Invalid Rows", 1);
            return;
        }

        // The date is machine-readable data, so parse it with the invariant
        // culture; otherwise the result depends on the culture of the machine
        // the task happens to run on.
        DateTime creationDate;
        if (!DateTime.TryParse(
                parsed["CreationDate"],
                CultureInfo.InvariantCulture,
                DateTimeStyles.None,
                out creationDate))
        {
            context.CoreContext.IncrementCounter(CounterGroup, "Invalid Creation Dates", 1);
            return;
        }

        var text = parsed["Text"];
        if (text == null)
        {
            // A present-but-null text would otherwise crash on .Length below;
            // treat it like any other malformed row.
            context.CoreContext.IncrementCounter(CounterGroup, "Invalid Rows", 1);
            return;
        }

        context.EmitKeyValue(
            creationDate.Hour.ToString(CultureInfo.InvariantCulture),
            text.Length.ToString(CultureInfo.InvariantCulture));
    }
}
/// <summary>
/// Reducer that merges the (value, count) records of one key and emits the
/// median and the sample standard deviation of the underlying values.
/// </summary>
public class MedianStdDevReducer
    : JsonInReducerCombinerBase<MedianStdDevData>
{
    /// <summary>
    /// Aggregates all records of <paramref name="key"/> and emits a single line
    /// of the form "median&lt;TAB&gt;stdDev", formatted with the invariant culture.
    /// Nothing is emitted when the records contain no observations at all.
    /// </summary>
    /// <param name="key">Key shared by all records; re-emitted unchanged.</param>
    /// <param name="values">Records carrying a value and its occurrence count.</param>
    /// <param name="context">Context that receives the result line.</param>
    public override void Reduce(string key,
        IEnumerable<MedianStdDevData> values,
        ReducerCombinerContext context)
    {
        // double/long accumulators: a float sum loses precision quickly and
        // the int*int product Value * Count can overflow.
        double sum = 0;
        long totalComments = 0;
        var commentLengthCounts = new Dictionary<int, long>();

        foreach (var data in values)
        {
            totalComments += data.Count;
            sum += (double)data.Value * data.Count;

            // TryGetValue leaves 'seen' at 0 for a value not encountered yet.
            long seen;
            commentLengthCounts.TryGetValue(data.Value, out seen);
            commentLengthCounts[data.Value] = seen + data.Count;
        }

        if (totalComments <= 0)
        {
            // No observations: mean, median and deviation are all undefined.
            return;
        }

        var orderedCounts = commentLengthCounts.OrderBy(e => e.Key).ToList();
        var median = ComputeMedian(orderedCounts, totalComments);
        var stdDev = ComputeStandardDeviation(
            orderedCounts, sum / totalComments, totalComments);

        // Tab-separated and culture-invariant so the output stays machine-readable
        // regardless of the locale of the node running the task.
        context.EmitKeyValue(
            key,
            String.Format(
                System.Globalization.CultureInfo.InvariantCulture,
                "{0}\t{1}",
                median,
                stdDev));
    }

    // Walks the value histogram in ascending order and returns the median of the
    // expanded sample (mean of the two middle elements when the total is even).
    private static double ComputeMedian(
        IEnumerable<KeyValuePair<int, long>> orderedCounts, long total)
    {
        long medianIndex = total / 2;
        long previousComments = 0;
        int prevKey = 0;

        foreach (var entry in orderedCounts)
        {
            if (previousComments <= medianIndex
                && medianIndex < previousComments + entry.Value)
            {
                // With an even total the median straddles two elements; when the
                // upper one is the first of this bucket, the lower one is the
                // last element of the previous bucket.
                if (total % 2 == 0 && previousComments == medianIndex)
                {
                    return ((double)entry.Key + prevKey) / 2.0;
                }

                return entry.Key;
            }

            previousComments += entry.Value;
            prevKey = entry.Key;
        }

        return 0;
    }

    // Sample standard deviation (n - 1 denominator) computed from the histogram.
    // Returns 0 for fewer than two observations instead of dividing by zero.
    private static double ComputeStandardDeviation(
        IEnumerable<KeyValuePair<int, long>> counts, double avg, long total)
    {
        if (total < 2)
        {
            return 0;
        }

        double sumOfSquares = counts.Sum(
            entry => Math.Pow(entry.Key - avg, 2) * entry.Value);
        return Math.Sqrt(sumOfSquares / (total - 1));
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment