Created
March 27, 2016 03:59
-
-
Save mindingdata/b2f5bd5e996728e7f039 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using HtmlAgilityPack; | |
using LinqToWiki.Generated; | |
using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Linq; | |
using System.Text; | |
using System.Threading.Tasks; | |
namespace UFCKnockouts.Downloader | |
{ | |
/// <summary>
/// Console tool that builds a list of UFC "Performance of the Night" (or KO/Submission
/// of the Night) bonus winners from Wikipedia, looks up who each winner beat on the
/// corresponding event page, and writes the result to <c>output.txt</c> as CSV.
/// </summary>
class Program
{
    // User agent sent with every Wikipedia API request.
    private const string UserAgent = "UFCKnockoutLosers/1.0 (http://mindingdata.com, wade@mindingdata.com)";
    private const string WikiHost = "en.wikipedia.org";
    private const string AwardPageTitle = "List of UFC bonus award recipients";
    private const string OutputPath = "output.txt";

    // Cell text used on the award page when a bonus was not handed out.
    private const string NotAwarded = "N/A";

    // Characters that force a CSV field to be quoted.
    private static readonly char[] CsvSpecialCharacters = { ',', '"', '\r', '\n' };

    /// <summary>
    /// Entry point: downloads the award list, resolves the loser of every bonus-winning
    /// fight, and writes one CSV row per winner.
    /// </summary>
    /// <param name="args">Command line arguments (unused).</param>
    /// <exception cref="InvalidOperationException">
    /// The award page does not contain the expected second table.
    /// </exception>
    static void Main(string[] args)
    {
        //A set of redirects (Some events are labelled completely differently than their wikipedia pages).
        var eventRedirects = new Dictionary<string, string>
        {
            { "TUF 22 Finale", "The Ultimate Fighter: Team McGregor vs. Team Faber" },
            { "TUF 21 Finale", "The Ultimate Fighter: American Top Team vs. Blackzilians" },
            { "TUF 20 Finale", "The Ultimate Fighter: A Champion Will Be Crowned" },
            { "TUF 19 Finale", "The Ultimate Fighter: Team Edgar vs. Team Penn" },
            { "TUF Brazil 3 Finale", "The Ultimate Fighter: Brazil 3" },
            { "TUF China Finale", "The Ultimate Fighter: China" },
            { "TUF 18 Finale", "The Ultimate Fighter: Team Rousey vs. Team Tate" },
            { "UFC on FX 1", "UFC on FX: Guillard vs. Miller" }
        };

        var wiki = new Wiki(UserAgent, WikiHost);

        var awardPage = new HtmlDocument();
        awardPage.LoadHtml(FetchParsedPageHtml(wiki, AwardPageTitle));

        // SelectNodes returns null (not an empty collection) when nothing matches.
        var tables = awardPage.DocumentNode.SelectNodes("//table");
        if (tables == null || tables.Count < 2)
        {
            throw new InvalidOperationException(
                "The award page does not contain the expected recipients table.");
        }

        //First we get all the fighters who have won Performance of the night (Or Sub/KO), and the event it happened at.
        List<UFCPerformanceResult> resultSet = ParseAwardWinners(tables[1]);

        //Now we have a list of events and their respective KO/Sub artists. Go to each event page, and pull the losers.
        PopulateLosers(wiki, resultSet, eventRedirects);

        //Now we just write out the results to a CSV file.
        WriteResults(resultSet);
    }

    /// <summary>
    /// Downloads the parsed (HTML) content of a single Wikipedia page.
    /// </summary>
    /// <param name="wiki">The wiki client to query.</param>
    /// <param name="title">Title of the page to fetch.</param>
    /// <returns>The value of the first revision returned for the page.</returns>
    private static string FetchParsedPageHtml(Wiki wiki, string title)
    {
        return wiki.CreateTitlesSource(title)
            .Select(p => p.revisions().Where(r => r.parse).ToEnumerable().FirstOrDefault().value)
            .ToEnumerable()
            .Single();
    }

    /// <summary>
    /// Walks the award recipients table and returns one result per bonus winner,
    /// with <see cref="UFCPerformanceResult.Event"/> and
    /// <see cref="UFCPerformanceResult.Winner"/> filled in.
    /// </summary>
    /// <param name="awardTable">The table node holding the award rows.</param>
    /// <returns>The winners in table order; empty when the table has no rows.</returns>
    private static List<UFCPerformanceResult> ParseAwardWinners(HtmlNode awardTable)
    {
        var winners = new List<UFCPerformanceResult>();

        // Evaluate the row XPath once instead of on every access inside the loop.
        var rows = awardTable.SelectNodes("tr");
        if (rows == null)
        {
            return winners;
        }

        string currentEvent = string.Empty;

        //Start at 1 to skip header row.
        //The HTML for wikipedia has tonnes of special row/col span rules, hence the cell-count checks below.
        for (int i = 1; i < rows.Count; i++)
        {
            var cells = rows[i].SelectNodes("td");

            // Rows without any <td> (e.g. extra header rows) carry no winners.
            if (cells == null)
            {
                continue;
            }

            //This means there was double FOTN for the event, which means there was no POTN on this row.
            if (cells.Count == 3)
            {
                continue;
            }

            //Sometimes the rowspan is 2, so a short row keeps the event of the previous row
            //and its winners start in the first cell.
            int firstWinnerIndex = 0;
            if (cells.Count > 2)
            {
                currentEvent = cells[0].InnerText.Trim();

                //If there is no fight of the night, the table is a bit different:
                //the winners then start at cell 2 instead of cell 4.
                firstWinnerIndex = cells[1].InnerText.Trim() != NotAwarded ? 4 : 2;
            }

            AddWinner(winners, currentEvent, cells, firstWinnerIndex);
            if (cells.Count > 1)
            {
                AddWinner(winners, currentEvent, cells, firstWinnerIndex + 1);
            }
        }

        return winners;
    }

    /// <summary>
    /// Adds the winner found in <paramref name="cells"/> at <paramref name="cellIndex"/>
    /// unless the cell is missing or marked as not awarded.
    /// </summary>
    private static void AddWinner(List<UFCPerformanceResult> winners, string ufcEvent, HtmlNodeCollection cells, int cellIndex)
    {
        // An unexpected row layout should cost us one row, not the whole run.
        if (cellIndex >= cells.Count)
        {
            Console.WriteLine(ufcEvent + " : unexpected award row layout, winner cell " + cellIndex + " is missing");
            return;
        }

        string winner = cells[cellIndex].InnerText.Trim();
        if (winner == NotAwarded)
        {
            return;
        }

        var result = new UFCPerformanceResult();
        result.Event = ufcEvent;
        result.Winner = winner;
        winners.Add(result);
    }

    /// <summary>
    /// Fills in <see cref="UFCPerformanceResult.Loser"/> for every result by reading the
    /// fight card on the event's Wikipedia page. Each event page is downloaded once.
    /// </summary>
    /// <param name="wiki">The wiki client to query.</param>
    /// <param name="resultSet">Results to update in place.</param>
    /// <param name="eventRedirects">Maps award-page event labels to Wikipedia page titles.</param>
    private static void PopulateLosers(Wiki wiki, List<UFCPerformanceResult> resultSet, Dictionary<string, string> eventRedirects)
    {
        // GroupBy yields groups in order of first appearance, so events are still
        // processed in table order.
        foreach (var eventGroup in resultSet.GroupBy(r => r.Event))
        {
            string eventLabel = eventGroup.Key;

            string pageTitle;
            if (!eventRedirects.TryGetValue(eventLabel, out pageTitle))
            {
                pageTitle = eventLabel;
            }

            HtmlDocument eventPage = TryLoadEventPage(wiki, eventLabel, pageTitle);
            if (eventPage == null)
            {
                continue;
            }

            var fightRows = eventPage.DocumentNode.SelectNodes("//table[@class='toccolours']/tr");
            if (fightRows == null)
            {
                Console.WriteLine(eventLabel + " : no fight card table found");
                continue;
            }

            var eventResults = eventGroup.ToList();
            foreach (var fightRow in fightRows)
            {
                var cells = fightRow.SelectNodes("td");

                // Header rows have no <td>; the loser lives in the 4th cell.
                if (cells == null || cells.Count < 4)
                {
                    continue;
                }

                string winnerName = CleanFighterName(cells[1].InnerText);
                string loserName = CleanFighterName(cells[3].InnerText);

                foreach (var result in eventResults)
                {
                    if (result.Winner == winnerName)
                    {
                        result.Loser = loserName;
                    }
                }
            }
        }
    }

    /// <summary>
    /// Downloads and parses an event page, following a wiki redirect stub if one is returned.
    /// </summary>
    /// <param name="wiki">The wiki client to query.</param>
    /// <param name="eventLabel">Event label used as the prefix of any logged failure.</param>
    /// <param name="pageTitle">Wikipedia page title to download.</param>
    /// <returns>The parsed page, or <c>null</c> when it could not be loaded (the reason is logged).</returns>
    private static HtmlDocument TryLoadEventPage(Wiki wiki, string eventLabel, string pageTitle)
    {
        try
        {
            var eventPage = new HtmlDocument();
            eventPage.LoadHtml(FetchParsedPageHtml(wiki, pageTitle));

            //There is a redirect in play, we need to reload the "new" document.
            var firstParagraph = eventPage.DocumentNode.SelectSingleNode("//p");
            if (firstParagraph != null && firstParagraph.InnerText == "Redirect to:")
            {
                var redirectTarget = eventPage.DocumentNode.SelectSingleNode("//a");
                if (redirectTarget == null)
                {
                    Console.WriteLine(eventLabel + " : redirect page has no target link");
                    return null;
                }

                eventPage.LoadHtml(FetchParsedPageHtml(wiki, redirectTarget.InnerText));
            }

            return eventPage;
        }
        catch (Exception ex)
        {
            // Best effort: a single event that fails to download must not abort the run.
            Console.WriteLine(eventLabel + " : " + ex.Message);
            return null;
        }
    }

    /// <summary>
    /// Strips the champion "(c)" and interim champion "(ic)" markers and surrounding
    /// whitespace from a fight card cell.
    /// </summary>
    private static string CleanFighterName(string cellText)
    {
        return cellText.Replace("(c)", "").Replace("(ic)", "").Trim();
    }

    /// <summary>
    /// Writes a header line followed by one "event,winner,loser" line per result.
    /// </summary>
    private static void WriteResults(List<UFCPerformanceResult> resultSet)
    {
        var outputLines = new List<string>(resultSet.Count + 1);
        outputLines.Add("Event, Winner, Loser");
        foreach (var result in resultSet)
        {
            outputLines.Add(string.Join(",",
                EscapeCsvField(result.Event),
                EscapeCsvField(result.Winner),
                EscapeCsvField(result.Loser)));
        }

        File.WriteAllLines(OutputPath, outputLines);
    }

    /// <summary>
    /// Returns <paramref name="field"/> unchanged when it is safe to embed in a CSV line,
    /// otherwise wraps it in double quotes with embedded quotes doubled.
    /// A <c>null</c> field becomes an empty string.
    /// </summary>
    private static string EscapeCsvField(string field)
    {
        if (string.IsNullOrEmpty(field))
        {
            return string.Empty;
        }

        if (field.IndexOfAny(CsvSpecialCharacters) < 0)
        {
            return field;
        }

        return "\"" + field.Replace("\"", "\"\"") + "\"";
    }
}
/// <summary>
/// One bonus-winning performance: the event it happened at, the fighter who
/// received the bonus, and the opponent that fighter beat.
/// </summary>
internal class UFCPerformanceResult
{
    /// <summary>Event label as it appears on the award list.</summary>
    public string Event { get; set; }

    /// <summary>Name of the fighter who received the bonus.</summary>
    public string Winner { get; set; }

    /// <summary>Name of the beaten opponent; stays <c>null</c> until it has been looked up.</summary>
    public string Loser { get; set; }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment