Skip to content

Instantly share code, notes, and snippets.

@embix
Created October 29, 2015 21:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save embix/1cfaaddfee934c3a7969 to your computer and use it in GitHub Desktop.
Save embix/1cfaaddfee934c3a7969 to your computer and use it in GitHub Desktop.
//public static class MyExtensions
//{
// // Write custom extension methods here. They will be available to all queries.
// public static IEnumerable<String> ReadLines(this FileInfo file)
// {
// if (null == file) throw new ArgumentNullException(nameof(file));
// return File.ReadLines(file.FullName, Encoding.UTF8);
// }
//}
// Input file for the whole script: a CSV export downloaded from the UNHCR site.
// datasource: http://popstats.unhcr.org/en/asylum_seekers_monthly
// all years, all months, all countries of asylum, all origins, all data items => csv
// NOTE: the location is hard-coded to one user's Downloads folder; adjust it before running elsewhere.
FileInfo dataFile = new FileInfo(
Path.Combine(
@"C:\", "Users", "embix", "Downloads",
"unhcr_popstats_export_asylum_seekers_monthly_2015_10_29_214011.csv"
));
// Quick and Dirty LINQPad script to fire queries against that data
/// <summary>
/// Entry point. Parses the export, keeps the movements whose origin name starts with
/// "united states" or "USA " (ignoring case), groups them by destination country and
/// dumps the groups ordered by their summed count, largest first.
/// </summary>
void Main()
{
    var entries = ParseUnhcrFile();
    entries
        .Where(e =>
            // Compare case-insensitively instead of lower-casing the name first:
            // a lower-cased name can never start with the upper-case literal "USA ",
            // and ordinal comparison keeps the result independent of the current culture.
            e.From.Name.StartsWith("united states", StringComparison.OrdinalIgnoreCase)
            || e.From.Name.StartsWith("USA ", StringComparison.OrdinalIgnoreCase)
        )
        .GroupBy(e => e.To, (k, g) => new
        {
            To = k.Name,
            Count = g.Sum(e => e.Count),
            ByMonth = g.Select(e => new { e.When, e.Count, From = e.From.Name }).OrderBy(e => e.When)
        })
        .OrderByDescending(g => g.Count)
        .Dump("fleed from US", 1);
    // I somehow miss the one who fled from US to Russia.
}
/// <summary>
/// Reads every line of <c>dataFile</c>, skips the header line and converts the remaining
/// lines into movements. Lines that fail to parse and movements with a count of zero are
/// left out of the result.
/// </summary>
/// <remarks>
/// Relies on a <c>ReadLines(this FileInfo)</c> extension method that is not defined in the
/// active code of this script.
/// </remarks>
/// <returns>The parsed, non-zero movements in file order.</returns>
public RefugyMovement[] ParseUnhcrFile()
{
    //dataFile.Length.Dump("Bytes");
    String[] allLines = dataFile.ReadLines().ToArray();
    //allLines.Length.Dump("Lines incl. header");

    var movements = new List<RefugyMovement>();
    foreach (String dataLine in allLines.Skip(1)) // first line is the header
    {
        RefugyMovement movement = ParseRefugyMovementOrNull(dataLine);
        if (movement == null)
        {
            continue; // unparsable line
        }
        if (movement.Count == 0)
        {
            continue; // nothing to report for this month
        }
        movements.Add(movement);
    }

    // debug support:
    //Country.All.OrderBy(c => c.Name).Dump();
    //_lastFieldErrors
    //    .GroupBy(f => f, (k, g) => new { LastFieldEntry = k, Count = g.Count() })
    //    .OrderByDescending(lfe => lfe.Count)
    //    .Dump("input data errors: Last field should be a number for count of refugees");
    //_anomalies
    //    .GroupBy(a => a.Message, (k, g) =>
    //        new { Message = k, Count = g.Count(), Lines = g.Select(e => e.Line).ToArray() })
    //    .OrderByDescending(ag => ag.Count)
    //    .Dump("input data errors: other");
    return movements.ToArray();
}
/// <summary>
/// Converts one CSV line into a <see cref="RefugyMovement"/>.
/// </summary>
/// <param name="line">
/// Raw line; it has to split into exactly five fields, used in this order:
/// destination country, origin country, year, month name, count.
/// </param>
/// <returns>
/// The parsed movement, or <c>null</c> when any step throws. In that case the exception
/// message and the line are appended to <c>_anomalies</c>.
/// </returns>
public RefugyMovement ParseRefugyMovementOrNull(String line)
{
    RefugyMovement movement;
    try
    {
        String[] columns = ParseCsvLine(",", "\"", line);
        if (columns.Length != 5)
        {
            throw new ArgumentException(line, nameof(line));
        }

        // Evaluation order is kept on purpose: both countries are looked up (and thereby
        // registered) before the numeric columns are validated.
        Country destination = Country.FromString(columns[0]);
        Country origin = Country.FromString(columns[1]);
        Int32 yearNumber = Int32.Parse(columns[2]);
        Int32 monthNumber = ParseMonth(columns[3]);
        Int32 headCount = Int32.Parse(columns[4]);
        DateTime firstOfMonth = new DateTime(yearNumber, monthNumber, 1);

        movement = new RefugyMovement(destination, origin, firstOfMonth, headCount);
    }
    catch (Exception failure)
    {
        // best effort: remember what went wrong and carry on with the next line
        _anomalies.Add(new Anomaly(failure.Message, line));
        movement = null;
    }
    return movement;
}
// Collects one Anomaly (exception message + raw line) for every line ParseRefugyMovementOrNull could not convert.
List<Anomaly> _anomalies = new List<Anomaly>();
/// <summary>
/// Pairs an error message with the input line it was recorded for.
/// </summary>
public class Anomaly
{
    /// <summary>Creates an anomaly from a message and the line it belongs to.</summary>
    /// <param name="message">Description of the problem.</param>
    /// <param name="line">The input line associated with the problem.</param>
    public Anomaly(String message, String line)
    {
        this.Message = message;
        this.Line = line;
    }

    /// <summary>Description of the problem; fixed at construction.</summary>
    public String Message { get; }

    /// <summary>The input line associated with the problem; may be reassigned.</summary>
    public String Line { get; set; }
}
/// <summary>
/// Splits one CSV line into its fields using a <see cref="LineConsumer"/>.
/// If the last field is not a valid <see cref="Int32"/>, its text is recorded in
/// <c>_lastFieldErrors</c> and the field is overwritten with "0".
/// </summary>
/// <param name="separator">Field separator; must contain exactly one character.</param>
/// <param name="escape">Quote character; must contain exactly one character.</param>
/// <param name="line">The line to split.</param>
/// <returns>The fields of the line, with the last one guaranteed to parse as an integer.</returns>
public String[] ParseCsvLine(String separator, String escape, String line)
{
    // Single() throws unless the string holds exactly one character - works for me :P
    Char separatorChar = separator.ToCharArray().Single();
    Char escapeChar = escape.ToCharArray().Single();

    String[] columns = new LineConsumer(separatorChar, escapeChar).Parse(line);

    Int32 tailIndex = columns.Length - 1;
    String tail = columns.Last();
    Int32 ignored;
    Boolean tailIsNumber = Int32.TryParse(tail, out ignored);
    if (tailIsNumber)
    {
        return columns;
    }

    _lastFieldErrors.Add(tail);
    // correct field to at least have a valid entry
    columns[tailIndex] = "0";
    return columns;
}
// Raw text of every last CSV field that ParseCsvLine could not parse as an Int32 (it substitutes "0" for those).
List<String> _lastFieldErrors = new List<String>();
/// <summary>
/// Translates an English month name into its month number (1-12).
/// The lookup is an exact, case-sensitive match against <c>_months</c>.
/// </summary>
/// <param name="name">Month name such as "January".</param>
/// <returns>The month number for <paramref name="name"/>.</returns>
/// <exception cref="KeyNotFoundException">The name is not one of the twelve keys.</exception>
public Int32 ParseMonth(String name) => _months[name];

// Month-name lookup table used by ParseMonth.
private Dictionary<String, Int32> _months = new Dictionary<String, Int32>
{
    ["January"] = 1,
    ["February"] = 2,
    ["March"] = 3,
    ["April"] = 4,
    ["May"] = 5,
    ["June"] = 6,
    ["July"] = 7,
    ["August"] = 8,
    ["September"] = 9,
    ["October"] = 10,
    ["November"] = 11,
    ["December"] = 12,
};
// Define other methods and classes here
/// <summary>
/// Immutable record of one data row: how many people moved from one country to another
/// at a given point in time.
/// </summary>
public class RefugyMovement
{
    /// <summary>Creates a movement from its four components.</summary>
    /// <param name="to">Destination country.</param>
    /// <param name="from">Origin country.</param>
    /// <param name="when">Point in time the count refers to.</param>
    /// <param name="count">Number reported for this combination.</param>
    public RefugyMovement(Country to, Country from, DateTime when, Int32 count)
    {
        this.To = to;
        this.From = from;
        this.When = when;
        this.Count = count;
    }

    /// <summary>Destination country.</summary>
    public Country To { get; }

    /// <summary>Origin country.</summary>
    public Country From { get; }

    /// <summary>Point in time the count refers to.</summary>
    public DateTime When { get; }

    /// <summary>Number reported for this combination.</summary>
    public Int32 Count { get; }
}
/// <summary>
/// A country identified by its name. Instances are only handed out by
/// <see cref="FromString"/>, which returns the same object for the same (case-sensitive)
/// name every time.
/// </summary>
public class Country
{
    // not thread safe!
    private static Dictionary<String, Country> _registry = new Dictionary<String, Country>();

    private Country(String countryName)
    {
        Name = countryName;
    }

    /// <summary>The name this country was created with.</summary>
    public String Name { get; }

    /// <summary>A fresh array holding every country handed out so far.</summary>
    public static Country[] All
    {
        get
        {
            var snapshot = new Country[_registry.Count];
            _registry.Values.CopyTo(snapshot, 0);
            return snapshot;
        }
    }

    /// <summary>
    /// Returns the country registered under <paramref name="countryName"/>, creating and
    /// registering it on first use.
    /// </summary>
    /// <param name="countryName">Exact name used as the lookup key.</param>
    /// <returns>The single instance for that name.</returns>
    public static Country FromString(String countryName)
    {
        Country known;
        if (_registry.TryGetValue(countryName, out known))
        {
            return known;
        }

        var created = new Country(countryName);
        _registry[countryName] = created;
        return created;
    }
}
// generate month lookup
//
//var enUs = CultureInfo.GetCultureInfo("en-US");
//Thread.CurrentThread.CurrentCulture = enUs;
//
//Enumerable.Range(1, 12)
//.Select(i => new DateTime(2015, i, 1))
//.Select(d => "{\"" + d.ToString("MMMM") + "\", " + d.Month + "},")
//.Aggregate((a, b) => a + "\n" + b)
//.Dump();
/// <summary>
/// Small state machine that splits a single CSV line into fields.
/// A field is either unquoted (ends at the next separator) or enclosed in the escape
/// character; inside a quoted field separators are literal and a doubled escape
/// character stands for one literal escape character.
/// </summary>
public class LineConsumer
{
    /// <summary>Character that separates fields.</summary>
    public Char Separator { get; }

    /// <summary>Character that opens and closes a quoted field.</summary>
    public Char Escape { get; }

    private State _state;
    private readonly State SStart;      // at the beginning of a field
    private readonly State SEsc;        // inside a quoted field
    private readonly State SUnesc;      // inside an unquoted field
    private readonly State SExpectSep;  // just saw an escape character inside a quoted field
    private System.Text.StringBuilder CurrentField;
    private readonly List<String> Fields = new List<String>();

    /// <summary>Creates a parser for the given separator and escape character.</summary>
    public LineConsumer(Char separator, Char escape)
    {
        Separator = separator;
        Escape = escape;
        SStart = new FieldStarting(this);
        SEsc = new Escaping(this);
        SUnesc = new Unescaped(this);
        SExpectSep = new ExpectSeparator(this);
        Reset();
    }

    /// <summary>
    /// Splits <paramref name="line"/> into its fields. An empty line yields one empty field.
    /// </summary>
    /// <param name="line">The line to split; must not be null.</param>
    /// <returns>The fields in order of appearance.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="line"/> is null.</exception>
    /// <exception cref="FormatException">The line is not well-formed.</exception>
    public String[] Parse(String line)
    {
        if (line == null) throw new ArgumentNullException(nameof(line));

        // Start from a clean slate so that an instance can be reused, also after a failed call.
        Reset();
        foreach (var c in line)
        {
            _state.Parse(c);
        }
        // the last field has no trailing separator
        _state.Flush();
        return Fields.ToArray();
    }

    // Drops collected fields and returns to the start state with an empty buffer.
    private void Reset()
    {
        Fields.Clear();
        CurrentField = new System.Text.StringBuilder();
        _state = SStart;
    }

    /// <summary>
    /// Base class of the parser states. Classifies each character as separator, escape
    /// or ordinary letter and dispatches to the matching handler.
    /// </summary>
    public abstract class State
    {
        protected LineConsumer _consumer { get; }
        protected State(LineConsumer consumer) { _consumer = consumer; }

        /// <summary>Dispatches one character; the separator test takes precedence.</summary>
        public void Parse(Char character)
        {
            if (character.Equals(_consumer.Separator))
            {
                Sep(character);
                return;
            }
            if (character.Equals(_consumer.Escape))
            {
                Esc(character);
                return;
            }
            Letter(character);
        }

        public abstract void Sep(Char letter);
        public abstract void Esc(Char letter);
        public abstract void Letter(Char letter);

        /// <summary>Signals malformed input, naming the state that rejected the character.</summary>
        public virtual void Error()
        {
            throw new FormatException($"Malformed CSV line: unexpected character in state {GetType().Name}.");
        }

        /// <summary>Appends the buffered characters as a finished field.</summary>
        public void Flush() { _consumer.Fields.Add(_consumer.CurrentField.ToString()); }

        /// <summary>Starts a new, empty field buffer.</summary>
        public void Init() { _consumer.CurrentField = new System.Text.StringBuilder(); }

        /// <summary>Adds one character to the current field.</summary>
        public void ToBuffer(Char letter)
        {
            _consumer.CurrentField.Append(letter);
        }
    }

    /// <summary>State at the beginning of a field, before any of its characters was seen.</summary>
    public class FieldStarting : State
    {
        public FieldStarting(LineConsumer consumer) : base(consumer) { }

        // empty field
        public override void Sep(Char letter)
        {
            Flush();
            Init();
        }

        // opening quote
        public override void Esc(Char letter)
        {
            _consumer._state = _consumer.SEsc;
        }

        public override void Letter(Char letter)
        {
            ToBuffer(letter);
            _consumer._state = _consumer.SUnesc;
        }
    }

    /// <summary>State inside a quoted field: separators are ordinary content.</summary>
    public class Escaping : State
    {
        public Escaping(LineConsumer consumer) : base(consumer) { }

        public override void Sep(Char letter)
        {
            ToBuffer(letter);
        }

        // either the closing quote or the first half of a doubled quote
        public override void Esc(Char letter)
        {
            _consumer._state = _consumer.SExpectSep;
        }

        public override void Letter(Char letter)
        {
            ToBuffer(letter);
        }
    }

    /// <summary>State inside an unquoted field.</summary>
    public class Unescaped : State
    {
        public Unescaped(LineConsumer consumer) : base(consumer) { }

        public override void Sep(Char letter)
        {
            Flush();
            Init();
            _consumer._state = _consumer.SStart;
        }

        // a quote in the middle of an unquoted field is not accepted
        public override void Esc(Char letter)
        {
            Error();
        }

        public override void Letter(Char letter)
        {
            ToBuffer(letter);
        }
    }

    /// <summary>State right after an escape character was seen inside a quoted field.</summary>
    public class ExpectSeparator : State
    {
        public ExpectSeparator(LineConsumer consumer) : base(consumer) { }

        // the previous escape character closed the field
        public override void Sep(Char letter)
        {
            Flush();
            Init();
            _consumer._state = _consumer.SStart;
        }

        // doubled escape character: one literal escape character, field stays open
        public override void Esc(Char letter)
        {
            ToBuffer(letter);
            _consumer._state = _consumer.SEsc;
        }

        // anything else after a closing quote is malformed
        public override void Letter(Char letter)
        {
            Error();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment