Created
July 5, 2019 15:15
-
-
Save maltegoetz/58045cf6562d56ff7c2309d62aa316dc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using Newtonsoft.Json.Linq; | |
using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Linq; | |
using System.Net; | |
using System.Text; | |
using System.Text.RegularExpressions; | |
namespace SemModAnabinParser
{
    /// <summary>
    /// Console tool that downloads an anabin institution list and writes two CSV files:
    /// one row per institution, and one row per degree listed on each institution's detail page.
    /// </summary>
    class Program
    {
        // Institution list queried when no URL is passed on the command line.
        // The query string selects the country (sSearch_6 / land).
        private const string DefaultListUrl = "https://anabin.kmk.org/index.php?eID=user_anabin_institutionen&conf=institutionsergebnisliste&sEcho=7&iColumns=13&iDisplayStart=0&iDisplayLength=99999&bRegex=false&sSearch_6=Finnland&iSortingCols=1&iSortCol_0=2&sSortDir_0=asc&land=15";

        // Default output locations; identical to the paths that used to be hard-coded.
        private const string DefaultUniversitiesPath = @"C:\Users\xxx\universities.csv";
        private const string DefaultDegreesPath = @"C:\Users\xxx\degrees.csv";

        // No space after the comma: with ", " the opening quote of an escaped field would not
        // be the first character of the field, and CSV readers would not treat it as quoted.
        private const string FieldSeparator = ",";

        // Characters that force a CSV field to be wrapped in double quotes.
        private static readonly char[] CharsRequiringQuotes = { ',', '"', '\r', '\n' };

        // Singleline lets '.' match newlines, so the table body may span several lines.
        private static readonly Regex TableBodyRegex = new Regex("<tbody>(.*)</tbody>", RegexOptions.Singleline);

        // Matches one attribute-less <td> cell whose content stays on a single line.
        private static readonly Regex TableCellRegex = new Regex("<td>(.*?)</td>");

        /// <summary>
        /// Entry point. Uses the first command-line argument as the institution list URL when
        /// one is given, otherwise the built-in default, then waits for Enter before exiting.
        /// </summary>
        /// <param name="args">Optional: <c>args[0]</c> overrides the institution list URL.</param>
        static void Main(string[] args)
        {
            var url = args != null && args.Length > 0 ? args[0] : DefaultListUrl;
            ParseUniversityList(url);
            Console.ReadLine();
        }

        /// <summary>
        /// Downloads the institution list from <paramref name="url"/>, writes the institutions
        /// to <paramref name="universitiesPath"/>, then downloads the detail page of every
        /// listed institution and writes the degrees found to <paramref name="degreesPath"/>.
        /// Both files are written as UTF-16 (<see cref="Encoding.Unicode"/>).
        /// </summary>
        /// <param name="url">URL returning JSON with an <c>aaData</c> array of rows.</param>
        /// <param name="universitiesPath">Target file for the institution CSV.</param>
        /// <param name="degreesPath">Target file for the degree CSV.</param>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        /// <exception cref="InvalidDataException">The response has no <c>aaData</c> property.</exception>
        static void ParseUniversityList(string url, string universitiesPath = DefaultUniversitiesPath, string degreesPath = DefaultDegreesPath)
        {
            if (url == null)
            {
                throw new ArgumentNullException(nameof(url));
            }

            Console.WriteLine("Generating universities.csv...");
            var unilist = JObject.Parse(DownloadString(url))["aaData"];
            if (unilist == null)
            {
                throw new InvalidDataException("The institution list response does not contain an \"aaData\" property.");
            }

            var ids = new List<string>();
            var unicsv = new StringBuilder();
            AppendCsvLine(unicsv, "Id", "Name", "Type", "City");
            foreach (var uni in unilist)
            {
                // Row columns are read by position: 1 = id, 2 = name, 3 = city, 4 = type.
                var type = uni[4].ToString();
                if (type == "H+")
                {
                    // Rows whose type column is exactly "H+" are left out of both files.
                    continue;
                }

                var id = uni[1].ToString();
                ids.Add(id);
                AppendCsvLine(unicsv, id, uni[2].ToString(), type, uni[3].ToString());
            }
            File.WriteAllText(universitiesPath, unicsv.ToString(), Encoding.Unicode);
            Console.WriteLine("Generated universities.csv");

            Console.WriteLine("Generating degrees.csv...");
            var degreecsv = new StringBuilder();
            AppendCsvLine(degreecsv, "Degree", "Type", "University Id");
            foreach (var id in ids)
            {
                Console.WriteLine($"Parsing university (id:{id})");
                degreecsv.Append(ParseUniversityDetails(id));
            }
            File.WriteAllText(degreesPath, degreecsv.ToString(), Encoding.Unicode);
            Console.WriteLine("Generated degrees.csv");
        }

        /// <summary>
        /// Downloads the detail page of the institution with the given id and returns its
        /// degrees as CSV rows (degree, type, institution id), without a header line.
        /// </summary>
        /// <param name="id">Institution id as it appears in the list response.</param>
        /// <returns>Zero or more CSV lines, each terminated by <see cref="Environment.NewLine"/>.</returns>
        static string ParseUniversityDetails(string id)
        {
            // The id comes from a remote response; escape it so it cannot alter the query string.
            var url = $"https://anabin.kmk.org/index.php?eID=user_anabin_institutionen&conf=institutionen&uid={Uri.EscapeDataString(id)}";
            return ExtractDegreeRows(DownloadString(url), id);
        }

        /// <summary>
        /// Extracts degree rows from the HTML of a detail page. Only the content of the
        /// <c>&lt;tbody&gt;</c> element is inspected; every <c>&lt;tr</c> fragment contributes
        /// its first two <c>&lt;td&gt;</c> cells as degree name and degree type.
        /// </summary>
        /// <param name="html">HTML of the detail page.</param>
        /// <param name="id">Institution id written as the third CSV column.</param>
        /// <returns>CSV lines, or an empty string when no table body or no usable row exists.</returns>
        internal static string ExtractDegreeRows(string html, string id)
        {
            var csv = new StringBuilder();
            var tableBody = TableBodyRegex.Match(html).Groups[1].Value;

            // The text before the first "<tr" is not a row, hence Skip(1).
            foreach (var degreeEntry in tableBody.Split(new[] { "<tr" }, StringSplitOptions.None).Skip(1))
            {
                var cells = TableCellRegex.Matches(degreeEntry);
                if (cells.Count < 2)
                {
                    // Header rows or otherwise incomplete rows carry no degree data.
                    continue;
                }

                // Cell text is HTML, so entities such as &amp; must be decoded before export.
                var degreeName = WebUtility.HtmlDecode(cells[0].Groups[1].Value);
                var degreeType = WebUtility.HtmlDecode(cells[1].Groups[1].Value);
                AppendCsvLine(csv, degreeName, degreeType, id);
            }

            return csv.ToString();
        }

        /// <summary>Downloads <paramref name="url"/> as text and releases the client afterwards.</summary>
        private static string DownloadString(string url)
        {
            using (var client = new WebClient())
            {
                return client.DownloadString(new Uri(url));
            }
        }

        /// <summary>Appends one CSV record built from <paramref name="fields"/>, escaping each field.</summary>
        private static void AppendCsvLine(StringBuilder target, params string[] fields)
        {
            target.Append(string.Join(FieldSeparator, fields.Select(EscapeString)));
            target.Append(Environment.NewLine);
        }

        /// <summary>
        /// Escapes a value for use as a CSV field: values containing a comma, a double quote,
        /// a carriage return or a line feed are wrapped in double quotes with embedded quotes
        /// doubled; all other values are returned unchanged. Null yields an empty field.
        /// </summary>
        private static string EscapeString(string str)
        {
            if (str == null)
            {
                return string.Empty;
            }

            if (str.IndexOfAny(CharsRequiringQuotes) < 0)
            {
                return str;
            }

            return "\"" + str.Replace("\"", "\"\"") + "\"";
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment