Skip to content

Instantly share code, notes, and snippets.

@zaus
Last active August 29, 2015 14:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zaus/4cae1a5b42475a083287 to your computer and use it in GitHub Desktop.
Save zaus/4cae1a5b42475a083287 to your computer and use it in GitHub Desktop.
Demonstrating potential side effects (multiple evaluation) due to fast partitioning method from Stack Overflow answer http://stackoverflow.com/a/13745058/1037948

Explanation

After using the suggested Partition solution on a set of data that had calculations applied to each element before being enumerated (see scenarios below), I noticed the application took much longer. Investigated via tests.cs and discovered that each item will be evaluated twice. See differences in sideeffect in results.

Original Scenario

var items = database.Fetch(filter).ToList();
var calculatedStuff = items.Select(item => new CalculatedItem(item));
// other stuff
var file = new SomeFileHelperThing("filename");
foreach(var item in calculatedStuff) file.SerializeAndAppend(item);

Partitioned Scenario

var items = database.Fetch(filter).ToList();
var calculatedStuff = items.Select(item => new CalculatedItem(item));
// other stuff
int i = 0;
foreach(var group in calculatedStuff.Partition(100)) {	// <-- enumerates `calculatedStuff`, creating a `CalculatedItem` for each, before returning each group
	var file = new SomeFileHelperThing("filename" + ++i);
	foreach(var item in group) file.SerializeAndAppend(item);  // <-- enumerates the group, again creating a `CalculatedItem` for each
}

Results

Also note that inner/outer enumeration affects the results...

Linear

result = 132, sideeffect = 12

Nawfal Partition

result = 132, sideeffect = 39

Safe Partition

result = 132, sideeffect = 12

Split

result = 132, sideeffect = 12

Segment

result = 132, sideeffect = 12

Chunks

result = 132, sideeffect = 12

void Main()
{
// investigating side effects + performance of some suggestions
// * http://stackoverflow.com/questions/3773403/linq-partition-list-into-lists-of-8-members
// * http://stackoverflow.com/questions/419019/split-list-into-sublists-with-linq
// * http://stackoverflow.com/questions/438188/split-a-collection-into-n-parts-with-linq
// examine side effects
int sideEffect = 0, result = 0, size = 5;
var list = Enumerable.Range(0, (int)Math.Round(size*2.5)).Select(i => {
sideEffect++; // just to mark how many times this occurs
return i * 2; // simulate some delayed work
});
// control case
sideEffect = 0;
result = list.Sum();
string.Format("result = {0}, sideeffect = {1}", result, sideEffect).Dump("Linear");
// compare results -- some mentions that actually evaluating may screw with ordering...
var results = new Dictionary<string, IEnumerable<IEnumerable<int>>>() {
{ "Nawfal Partition", list.Partition(size) }
, { "Safe Partition", list.SafePartition(size) }
, { "Split", list.Split(size) }
, { "Segment", list.Segment(size) }
, { "Chunks", list.Chunks(size) }
};
foreach(var r in results) {
sideEffect = 0;
result = r.Value.Select(g => g.Sum()).Sum();
string.Format("result = {0}, sideeffect = {1}", result, sideEffect).Dump(r.Key);
}
Util.HorizontalRun(string.Join(",", results.Keys), results.Values).Dump("Results");
Util.HorizontalRun(string.Join(",", results.Keys), results.Values.Select(r => r.ToList()).ToArray()).Dump("Enumerated Results");
Util.HorizontalRun(string.Join(",", results.Keys), results.Values.Select(r => r.Select(o => o.ToArray()).ToArray()).ToArray()).Dump("Inner-Enumerated Results");
Util.HorizontalRun(string.Join(",", results.Keys), results.Values.Select(r => r.ToArray().Select(o => o.ToArray())).ToArray()).Dump("Outer-Enumerated Results");
}
public static class CollectionExtensions {
// http://stackoverflow.com/a/13745058/1037948 nawfal
public static IEnumerable<IEnumerable<T>> Partition<T>(this IEnumerable<T> items,
int partitionSize)
{
if (partitionSize <= 0)
throw new ArgumentOutOfRangeException("partitionSize");
int innerListCounter = 0;
int numberOfPackets = 0;
foreach (var item in items)
{
innerListCounter++;
if (innerListCounter == partitionSize)
{
yield return items.Skip(numberOfPackets * partitionSize).Take(partitionSize);
innerListCounter = 0;
numberOfPackets++;
}
}
if (innerListCounter > 0)
yield return items.Skip(numberOfPackets * partitionSize);
}
public static IEnumerable<IEnumerable<T>> SafePartition<T>(this IEnumerable<T> items,
int partitionSize)
{
int i = 0;
return items.GroupBy(x => i++ / partitionSize);
}
// modified from http://stackoverflow.com/a/28682295/1037948
public static IEnumerable<IEnumerable<T>> Split<T>(this IEnumerable<T> items, int partitionSize) {
if (partitionSize < 1) throw new ArgumentException("Must be positive", "partitionSize");
using (var enumer = items.GetEnumerator())
while (enumer.MoveNext())
{
// weird -- if you don't enumerate the inner list on consumption, it will return a list of each original item as an enumerable using this line...
//yield return Take(enumer.Current, enumer, partitionSize);
// but if you use the following, without enumerating the inner list on consumption it will return the appropriate number of chunks with 1 item each (the first from that segment)
var remaining = partitionSize;
yield return Take(enumer.Current, enumer, () => --remaining == 0);
// must discard elements skipped by inner iterator so that `.ToList()` (rather than `.Select(o => o.ToList()).ToList()`) doesn't return n segments instead
while (--remaining > 0 && enumer.MoveNext()) { }
}
}
private static IEnumerable<T> Take<T>(T head, IEnumerator<T> tail, int len) {
return Take(head, tail, () => --len == 0);
}
private static IEnumerable<T> Take<T>(T head, IEnumerator<T> tail, Func<bool> stop) {
while (true) {
yield return head;
if (stop()) break;
if (tail.MoveNext()) head = tail.Current;
else break;
}
}
// http://stackoverflow.com/a/4835541/1037948
private static IEnumerable<T> TakeFromCurrent<T>(this IEnumerator<T> enumerator, int count) {
while (count > 0) {
yield return enumerator.Current;
if (--count > 0 && !enumerator.MoveNext()) yield break;
}
}
public static IEnumerable<IEnumerable<T>> Segment<T>(this IEnumerable<T> seq, int partitionSize) {
var enumerator = seq.GetEnumerator();
while (enumerator.MoveNext()) {
yield return enumerator.TakeFromCurrent(partitionSize);
}
}
// http://stackoverflow.com/a/20953521/1037948
public static IEnumerable<IEnumerable<T>> Chunks<T>(this IEnumerable<T> enumerable, int chunkSize) {
if (chunkSize < 1) throw new ArgumentException("chunkSize must be positive");
using (var e = enumerable.GetEnumerator())
while (e.MoveNext()) {
var remaining = chunkSize; // elements remaining in the current chunk
var innerMoveNext = new Func<bool>(() => --remaining > 0 && e.MoveNext());
yield return e.GetChunk(innerMoveNext);
while (innerMoveNext()) {/* discard elements skipped by inner iterator */}
}
}
private static IEnumerable<T> GetChunk<T>(this IEnumerator<T> e, Func<bool> innerMoveNext) {
do yield return e.Current;
while (innerMoveNext());
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment