Created
October 29, 2015 21:21
-
-
Save embix/1cfaaddfee934c3a7969 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//public static class MyExtensions | |
//{ | |
// // Write custom extension methods here. They will be available to all queries. | |
// public static IEnumerable<String> ReadLines(this FileInfo file) | |
// { | |
// if (null == file) throw new ArgumentNullException(nameof(file)); | |
// return File.ReadLines(file.FullName, Encoding.UTF8); | |
// } | |
//} | |
// datasource: http://popstats.unhcr.org/en/asylum_seekers_monthly | |
// all years, all months, all countries of asylum, all origins, all data items => csv | |
// Location of the downloaded UNHCR CSV export that ParseUnhcrFile() reads.
// The path is assembled piecewise so Path.Combine supplies the separators.
FileInfo dataFile = new FileInfo(Path.Combine(
    @"C:\",
    "Users",
    "embix",
    "Downloads",
    "unhcr_popstats_export_asylum_seekers_monthly_2015_10_29_214011.csv"));
// Quick and Dirty LINQPad script to fire queries against that data | |
/// <summary>
/// LINQPad entry point: parses the UNHCR export, keeps the movements whose
/// country of origin is the United States, groups them by destination country
/// and dumps the groups ordered by total count (largest first).
/// </summary>
void Main()
{
    var entries = ParseUnhcrFile();

    entries
        .Where(e =>
            // Case-insensitive ordinal prefix checks. The previous code lower-cased
            // the name and then tested for the upper-case prefix "USA ", which could
            // never match; ToLower() was also culture-sensitive.
            e.From.Name.StartsWith("united states", StringComparison.OrdinalIgnoreCase)
            || e.From.Name.StartsWith("USA ", StringComparison.OrdinalIgnoreCase)
        )
        .GroupBy(e => e.To, (k, g) => new
        {
            To = k.Name,
            Count = g.Sum(e => e.Count),
            // Per-month breakdown for the destination, oldest month first.
            ByMonth = g.Select(e => new { e.When, e.Count, From = e.From.Name }).OrderBy(e => e.When)
        })
        .OrderByDescending(g => g.Count)
        .Dump("fleed from US", 1);
    // I somehow miss the one who fled from US to Russia.
}
/// <summary>
/// Reads the whole data file, skips the header line and converts every remaining
/// line into a <see cref="RefugyMovement"/>. Lines that could not be parsed
/// (null result) and movements with a count of zero are left out.
/// </summary>
/// <returns>The usable movements in file order.</returns>
public RefugyMovement[] ParseUnhcrFile()
{
    //dataFile.Length.Dump("Bytes");
    String[] allLines = dataFile.ReadLines().ToArray();
    //allLines.Length.Dump("Lines incl. header");

    var movements = new List<RefugyMovement>();
    foreach (String row in allLines.Skip(1)) // first line is the CSV header
    {
        RefugyMovement movement = ParseRefugyMovementOrNull(row);
        if (movement == null)
        {
            continue; // unparsable line
        }
        if (movement.Count == 0)
        {
            continue; // nothing to report for this month
        }
        movements.Add(movement);
    }

    // debug support:
    //Country.All.OrderBy(c => c.Name).Dump();
    //_lastFieldErrors
    //    .GroupBy(f => f, (k, g) => new { LastFieldEntry = k, Count = g.Count() })
    //    .OrderByDescending(lfe => lfe.Count)
    //    .Dump("input data errors: Last field should be a number for count of refugees");
    //_anomalies
    //    .GroupBy(a => a.Message, (k, g) =>
    //        new { Message = k, Count = g.Count(), Lines = g.Select(e => e.Line).ToArray() })
    //    .OrderByDescending(ag => ag.Count)
    //    .Dump("input data errors: other");
    return movements.ToArray();
}
/// <summary>
/// Converts one CSV data line into a <see cref="RefugyMovement"/>.
/// The five expected columns are: destination country, origin country,
/// year, month name, count.
/// </summary>
/// <param name="line">A single data line of the export (not the header).</param>
/// <returns>
/// The parsed movement, or <c>null</c> when anything goes wrong; in that case
/// the exception message and the line are appended to <c>_anomalies</c>.
/// </returns>
public RefugyMovement ParseRefugyMovementOrNull(String line)
{
    try
    {
        String[] columns = ParseCsvLine(",", "\"", line);
        if (columns.Length != 5)
        {
            throw new ArgumentException(line, nameof(line));
        }

        Country destination = Country.FromString(columns[0]);
        Country origin = Country.FromString(columns[1]);
        Int32 year = Int32.Parse(columns[2]);
        Int32 month = ParseMonth(columns[3]);
        Int32 count = Int32.Parse(columns[4]);

        // Data is monthly, so each movement is pinned to the first day of its month.
        return new RefugyMovement(destination, origin, new DateTime(year, month, 1), count);
    }
    catch (Exception problem)
    {
        // Best effort: remember what went wrong and let the caller skip the line.
        _anomalies.Add(new Anomaly(problem.Message, line));
        return null;
    }
}

// Lines that could not be parsed, together with the reason.
List<Anomaly> _anomalies = new List<Anomaly>();
/// <summary>
/// Pairs an error message with the input line that triggered it.
/// </summary>
public class Anomaly
{
    /// <summary>Creates an anomaly for the given message and offending line.</summary>
    public Anomaly(String message, String line)
    {
        this.Message = message;
        this.Line = line;
    }

    /// <summary>Description of what went wrong (read-only after construction).</summary>
    public String Message { get; }

    /// <summary>The raw input line the message refers to.</summary>
    public String Line { get; set; }
}
/// <summary>
/// Splits one CSV line into its fields and makes sure the last field is numeric.
/// A last field that is not a valid Int32 is recorded in <c>_lastFieldErrors</c>
/// and replaced by "0".
/// </summary>
/// <param name="separator">Field separator; must consist of exactly one character.</param>
/// <param name="escape">Quote/escape marker; must consist of exactly one character.</param>
/// <param name="line">The line to split.</param>
/// <returns>The fields of the line, with a numeric last entry.</returns>
public String[] ParseCsvLine(String separator, String escape, String line)
{
    // Single() throws unless the string holds exactly one character. works for me :P
    Char separatorChar = separator.ToCharArray().Single();
    Char escapeChar = escape.ToCharArray().Single();

    String[] cells = new LineConsumer(separatorChar, escapeChar).Parse(line);

    String tail = cells.Last();
    Int32 ignored;
    if (Int32.TryParse(tail, out ignored))
    {
        return cells;
    }

    // Not a number: remember the raw value, then patch in a valid entry.
    _lastFieldErrors.Add(tail);
    cells[cells.Length - 1] = "0";
    return cells;
}

// Raw values of last fields that were not numeric.
List<String> _lastFieldErrors = new List<String>();
/// <summary>
/// Translates an English month name (e.g. "October") into its number 1-12.
/// The lookup is an exact, case-sensitive dictionary access, so an unknown
/// name makes the dictionary indexer throw.
/// </summary>
/// <param name="name">Full English month name.</param>
/// <returns>The month number, 1 for January through 12 for December.</returns>
public Int32 ParseMonth(String name) => _months[name];

// English month name -> month number.
private Dictionary<String, Int32> _months = new Dictionary<String, Int32>
{
    ["January"] = 1,
    ["February"] = 2,
    ["March"] = 3,
    ["April"] = 4,
    ["May"] = 5,
    ["June"] = 6,
    ["July"] = 7,
    ["August"] = 8,
    ["September"] = 9,
    ["October"] = 10,
    ["November"] = 11,
    ["December"] = 12,
};
// Define other methods and classes here | |
/// <summary>
/// Immutable record of one parsed data row: how many people moved from one
/// country to another at a given point in time.
/// </summary>
public class RefugyMovement
{
    /// <summary>Creates a movement from its four components.</summary>
    public RefugyMovement(Country to, Country from, DateTime when, Int32 count)
    {
        this.To = to;
        this.From = from;
        this.When = when;
        this.Count = count;
    }

    /// <summary>Destination country.</summary>
    public Country To { get; }

    /// <summary>Country of origin.</summary>
    public Country From { get; }

    /// <summary>Point in time the count refers to.</summary>
    public DateTime When { get; }

    /// <summary>Number of people counted for this movement.</summary>
    public Int32 Count { get; }
}
/// <summary>
/// A country identified by its name. Instances are interned: asking for the
/// same name twice through <see cref="FromString"/> yields the same object.
/// </summary>
public class Country
{
    // Registry of every country created so far, keyed by name.
    // not thread safe!
    private static Dictionary<String, Country> _countries = new Dictionary<String, Country>();

    // Only FromString may create instances, so the registry stays complete.
    private Country(String countryName)
    {
        Name = countryName;
    }

    /// <summary>The name this country was registered under.</summary>
    public String Name { get; }

    /// <summary>Snapshot of all countries registered so far.</summary>
    public static Country[] All
    {
        get { return _countries.Values.ToArray(); }
    }

    /// <summary>
    /// Returns the country registered under <paramref name="countryName"/>,
    /// creating and registering it on first use.
    /// </summary>
    public static Country FromString(String countryName)
    {
        Country known;
        if (_countries.TryGetValue(countryName, out known))
        {
            return known;
        }

        var created = new Country(countryName);
        _countries.Add(countryName, created);
        return created;
    }
}
// generate month lookup | |
// | |
//var enUs = CultureInfo.GetCultureInfo("en-US"); | |
//Thread.CurrentThread.CurrentCulture = enUs; | |
// | |
//Enumerable.Range(1, 12) | |
//.Select(i => new DateTime(2015, i, 1)) | |
//.Select(d => "{\"" + d.ToString("MMMM") + "\", " + d.Month + "},") | |
//.Aggregate((a, b) => a + "\n" + b) | |
//.Dump(); | |
/// <summary>
/// Small character-driven state machine that splits a single CSV line into fields.
/// A field is either unquoted (ends at the next separator) or quoted (starts with
/// the escape character, may contain separators, ends at the closing escape
/// character). Inside a quoted field a doubled escape character stands for one
/// literal escape character, as in RFC 4180.
/// </summary>
/// <remarks>
/// An instance can be reused: every call to <see cref="Parse"/> starts from a clean
/// state. A quoted field that is still open at the end of the line is returned
/// as-is without an error. Instances are not safe for concurrent use.
/// </remarks>
public class LineConsumer
{
    /// <summary>Character that separates fields.</summary>
    public Char Separator { get; }

    /// <summary>Character that opens/closes a quoted field.</summary>
    public Char Escape { get; }

    private State _state;
    private State SStart;      // at the first character of a field
    private State SEsc;        // inside a quoted field
    private State SUnesc;      // inside an unquoted field
    private State SExpectSep;  // just saw an escape character inside a quoted field
    private System.Text.StringBuilder CurrentField;
    private List<String> Fields = new List<String>();

    /// <summary>Creates a parser for the given separator and escape characters.</summary>
    public LineConsumer(Char separator, Char escape)
    {
        Separator = separator;
        Escape = escape;
        SStart = new FieldStarting(this);
        SEsc = new Escaping(this);
        SUnesc = new Unescaped(this);
        SExpectSep = new ExpectSeparator(this);
        Reset();
    }

    /// <summary>Splits <paramref name="line"/> into its fields.</summary>
    /// <param name="line">One line of CSV text without the line terminator.</param>
    /// <returns>The fields in order; always at least one (possibly empty) field.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="line"/> is null.</exception>
    /// <exception cref="FormatException">The line violates the quoting rules.</exception>
    public String[] Parse(String line)
    {
        if (line == null) throw new ArgumentNullException(nameof(line));

        // Start clean so results of an earlier call (or a call that threw
        // half-way through) cannot leak into this one.
        Reset();
        foreach (var c in line)
        {
            _state.Parse(c);
        }
        // The last field has no trailing separator; emit it explicitly.
        _state.Flush();
        return Fields.ToArray();
    }

    // Puts the machine back into its initial configuration.
    private void Reset()
    {
        Fields.Clear();
        CurrentField = new System.Text.StringBuilder();
        _state = SStart;
    }

    /// <summary>
    /// Base class of the parser states. Classifies each character as separator,
    /// escape or ordinary letter and forwards it to the matching handler.
    /// </summary>
    public abstract class State
    {
        protected LineConsumer _consumer { get; }
        protected State(LineConsumer consumer) { _consumer = consumer; }

        /// <summary>Dispatches one input character to Sep, Esc or Letter.</summary>
        public void Parse(Char character)
        {
            // The separator check comes first, so it wins if both characters are equal.
            if (character.Equals(_consumer.Separator))
            {
                Sep(character);
                return;
            }
            if (character.Equals(_consumer.Escape))
            {
                Esc(character);
                return;
            }
            Letter(character);
        }

        public abstract void Sep(Char letter);
        public abstract void Esc(Char letter);
        public abstract void Letter(Char letter);

        /// <summary>Reports a character that is not allowed in the current state.</summary>
        public virtual void Error()
        {
            throw new FormatException(
                $"Malformed CSV line: unexpected character in parser state {GetType().Name}.");
        }

        /// <summary>Appends the field collected so far to the result.</summary>
        public void Flush() { _consumer.Fields.Add(_consumer.CurrentField.ToString()); }

        /// <summary>Starts collecting a new, empty field.</summary>
        public void Init() { _consumer.CurrentField = new System.Text.StringBuilder(); }

        /// <summary>Adds one character to the field being collected.</summary>
        public void ToBuffer(Char letter)
        {
            _consumer.CurrentField.Append(letter);
        }
    }

    /// <summary>State at the very beginning of a field.</summary>
    public class FieldStarting : State
    {
        public FieldStarting(LineConsumer consumer) : base(consumer) { }

        // Separator right away: the field is empty.
        public override void Sep(Char letter)
        {
            Flush();
            Init();
        }

        // Opening quote: switch to quoted mode, the quote itself is not data.
        public override void Esc(Char letter)
        {
            _consumer._state = _consumer.SEsc;
        }

        public override void Letter(Char letter)
        {
            ToBuffer(letter);
            _consumer._state = _consumer.SUnesc;
        }
    }

    /// <summary>State inside a quoted field.</summary>
    public class Escaping : State
    {
        public Escaping(LineConsumer consumer) : base(consumer) { }

        // Separators are ordinary data while quoted.
        public override void Sep(Char letter)
        {
            ToBuffer(letter);
        }

        // Either the closing quote or the first half of a doubled quote;
        // the next character decides.
        public override void Esc(Char letter)
        {
            _consumer._state = _consumer.SExpectSep;
        }

        public override void Letter(Char letter)
        {
            ToBuffer(letter);
        }
    }

    /// <summary>State inside an unquoted field.</summary>
    public class Unescaped : State
    {
        public Unescaped(LineConsumer consumer) : base(consumer) { }

        public override void Sep(Char letter)
        {
            Flush();
            Init();
            _consumer._state = _consumer.SStart;
        }

        // A quote in the middle of an unquoted field is not allowed.
        public override void Esc(Char letter)
        {
            Error();
        }

        public override void Letter(Char letter)
        {
            ToBuffer(letter);
        }
    }

    /// <summary>
    /// State right after an escape character was seen inside a quoted field.
    /// </summary>
    public class ExpectSeparator : State
    {
        public ExpectSeparator(LineConsumer consumer) : base(consumer) { }

        // The previous escape character was the closing quote.
        public override void Sep(Char letter)
        {
            Flush();
            Init();
            _consumer._state = _consumer.SStart;
        }

        // Doubled escape character: it stands for one literal escape character
        // and the quoted field continues.
        public override void Esc(Char letter)
        {
            ToBuffer(letter);
            _consumer._state = _consumer.SEsc;
        }

        // Data directly after a closing quote is malformed.
        public override void Letter(Char letter)
        {
            Error();
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment