Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
using HtmlAgilityPack;
using LinqToWiki.Generated;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace UFCKnockouts.Downloader
{
/// <summary>
/// Console tool that reads Wikipedia's "List of UFC bonus award recipients" page, collects the
/// fighters listed in the performance (or KO/submission) bonus columns together with the event
/// they are listed under, then opens each event's own page to find the opponent each of those
/// fighters beat. The event/winner/loser triples are written to <c>output.txt</c>.
/// </summary>
class Program
{
    /// <summary>Cell text the award table uses for an award slot that has no recipient.</summary>
    private const string NotAwarded = "N/A";

    /// <summary>
    /// A set of redirects: some events are labelled completely differently in the award table
    /// than the title of their Wikipedia page. Key = label in the table, value = page title.
    /// </summary>
    private static readonly Dictionary<string, string> EventRedirects = new Dictionary<string, string>
    {
        { "TUF 22 Finale", "The Ultimate Fighter: Team McGregor vs. Team Faber" },
        { "TUF 21 Finale", "The Ultimate Fighter: American Top Team vs. Blackzilians" },
        { "TUF 20 Finale", "The Ultimate Fighter: A Champion Will Be Crowned" },
        { "TUF 19 Finale", "The Ultimate Fighter: Team Edgar vs. Team Penn" },
        { "TUF Brazil 3 Finale", "The Ultimate Fighter: Brazil 3" },
        { "TUF China Finale", "The Ultimate Fighter: China" },
        { "TUF 18 Finale", "The Ultimate Fighter: Team Rousey vs. Team Tate" },
        { "UFC on FX 1", "UFC on FX: Guillard vs. Miller" }
    };

    /// <summary>
    /// Entry point: downloads the award list, resolves the loser for every listed winner and
    /// writes the result file into the current working directory.
    /// </summary>
    /// <param name="args">Command line arguments (unused).</param>
    static void Main(string[] args)
    {
        var wiki = new Wiki("UFCKnockoutLosers/1.0 (http://mindingdata.com, wade@mindingdata.com)", "en.wikipedia.org");

        var ufcAwardPage = new HtmlDocument();
        ufcAwardPage.LoadHtml(FetchParsedPage(wiki, "List of UFC bonus award recipients"));

        // The recipients are read from the second table on the page. SelectNodes returns null
        // (not an empty collection) when nothing matches, so check before indexing.
        var tables = ufcAwardPage.DocumentNode.SelectNodes("//table");
        if (tables == null || tables.Count < 2)
        {
            Console.WriteLine("List of UFC bonus award recipients : award table not found");
            return;
        }

        // So first we get all the fighters who have won Performance of the night (or Sub/KO),
        // and the event it happened at.
        List<UFCPerformanceResult> resultSet = ReadAwardWinners(tables[1]);

        // Now we have a list of events and their respective winners; go to each event page
        // and pull the losers.
        FillLosers(wiki, resultSet);

        // Now we just write out the results to a CSV file.
        WriteResults(resultSet);
    }

    /// <summary>
    /// Asks the wiki for the parsed (HTML) content of the page with the given title.
    /// </summary>
    /// <param name="wiki">LinqToWiki client to query.</param>
    /// <param name="title">Title of the page to fetch.</param>
    /// <returns>The value of the first parsed revision returned for the title.</returns>
    /// <remarks>
    /// Any exception raised by the query (for example when the title yields no single result)
    /// propagates to the caller.
    /// </remarks>
    private static string FetchParsedPage(Wiki wiki, string title)
    {
        return wiki.CreateTitlesSource(title).Select(p => p.revisions().Where(r => r.parse).ToEnumerable().FirstOrDefault().value).ToEnumerable().Single();
    }

    /// <summary>
    /// Walks the rows of the award table and returns one result (event + winner, loser unset)
    /// per listed winner.
    /// </summary>
    /// <param name="awardTable">The table node holding the award recipients.</param>
    /// <returns>The winners in table order; empty when the table has no rows.</returns>
    private static List<UFCPerformanceResult> ReadAwardWinners(HtmlNode awardTable)
    {
        var winners = new List<UFCPerformanceResult>();

        // Query the rows once instead of re-running the XPath on every access.
        var rows = awardTable.SelectNodes("tr");
        if (rows == null)
        {
            return winners;
        }

        // The row/col span rules in the table mean not every row carries the event cell, so the
        // last seen event is carried forward.
        string currentEvent = string.Empty;

        // Start at 1 to skip header row.
        for (int i = 1; i < rows.Count; i++)
        {
            var cells = rows[i].SelectNodes("td");

            // A row without any <td> (e.g. a header-only row) has nothing to read.
            if (cells == null)
            {
                continue;
            }

            // This means there was double FOTN for the event, which means there was no POTN on this row.
            if (cells.Count == 3)
            {
                continue;
            }

            // Rows with at most two cells continue the previous event (rowspan), and the
            // winners start in the first cell.
            int firstWinnerColumn = 0;
            if (cells.Count > 2)
            {
                currentEvent = cells[0].InnerText.Trim();

                // If there is no fight of the night, the table is a bit different: the winner
                // cells start at index 2 instead of index 4.
                firstWinnerColumn = cells[1].InnerText.Trim() != NotAwarded ? 4 : 2;
            }

            AddWinner(winners, currentEvent, cells, firstWinnerColumn);
            AddWinner(winners, currentEvent, cells, firstWinnerColumn + 1);
        }

        return winners;
    }

    /// <summary>
    /// Adds the fighter named in <paramref name="cells"/>[<paramref name="column"/>] to
    /// <paramref name="winners"/>, unless the row is too short to have that column or the cell
    /// holds the "N/A" placeholder.
    /// </summary>
    private static void AddWinner(List<UFCPerformanceResult> winners, string eventName, HtmlNodeCollection cells, int column)
    {
        // Rows that are shorter than expected are skipped instead of throwing.
        if (column >= cells.Count)
        {
            return;
        }

        string name = cells[column].InnerText.Trim();
        if (name == NotAwarded)
        {
            return;
        }

        winners.Add(new UFCPerformanceResult { Event = eventName, Winner = name });
    }

    /// <summary>
    /// For every event in <paramref name="resultSet"/>, downloads the event page once and sets
    /// <see cref="UFCPerformanceResult.Loser"/> on each result whose winner is found in the
    /// page's fight table. Events whose page cannot be loaded are reported on the console and
    /// skipped; their results keep an empty loser.
    /// </summary>
    private static void FillLosers(Wiki wiki, List<UFCPerformanceResult> resultSet)
    {
        // Grouping by event means each event page is fetched a single time, even when some of
        // its winners cannot be matched.
        foreach (var eventResults in resultSet.GroupBy(r => r.Event))
        {
            string pageTitle;
            if (!EventRedirects.TryGetValue(eventResults.Key, out pageTitle))
            {
                pageTitle = eventResults.Key;
            }

            HtmlDocument eventPage;
            try
            {
                eventPage = LoadEventPage(wiki, pageTitle);
            }
            catch (Exception ex)
            {
                // Best effort: one unreachable page must not stop the remaining events.
                Console.WriteLine(eventResults.Key + " : " + ex.Message);
                continue;
            }

            var fightRows = eventPage.DocumentNode.SelectNodes("//table[@class='toccolours']/tr");
            if (fightRows == null)
            {
                Console.WriteLine(eventResults.Key + " : no fight table found on the event page");
                continue;
            }

            foreach (var fightRow in fightRows)
            {
                // Cell 1 is read as the winner and cell 3 as the loser, so rows with fewer
                // than four cells (headers, separators) are ignored.
                var cells = fightRow.SelectNodes("td");
                if (cells == null || cells.Count < 4)
                {
                    continue;
                }

                string rowWinner = StripTitleMarkers(cells[1].InnerText);
                foreach (var result in eventResults)
                {
                    // Plain loop rather than Single(): a fighter listed twice for the same
                    // event gets the loser set on both entries instead of throwing.
                    if (result.Winner == rowWinner)
                    {
                        result.Loser = StripTitleMarkers(cells[3].InnerText);
                    }
                }
            }
        }
    }

    /// <summary>
    /// Downloads and parses the page with the given title. If the first paragraph of the result
    /// reads "Redirect to:", the page named by the first link is loaded in its place.
    /// </summary>
    /// <returns>The parsed document of the page (or of its redirect target).</returns>
    private static HtmlDocument LoadEventPage(Wiki wiki, string title)
    {
        var page = new HtmlDocument();
        page.LoadHtml(FetchParsedPage(wiki, title));

        // There is a redirect in play, we need to reload the "new" document.
        var firstParagraph = page.DocumentNode.SelectSingleNode("//p");
        if (firstParagraph != null && firstParagraph.InnerText == "Redirect to:")
        {
            var redirectLink = page.DocumentNode.SelectSingleNode("//a");
            if (redirectLink != null)
            {
                page.LoadHtml(FetchParsedPage(wiki, redirectLink.InnerText));
            }
        }

        return page;
    }

    /// <summary>Removes the "(c)" and "(ic)" markers from a fighter cell and trims the rest.</summary>
    private static string StripTitleMarkers(string cellText)
    {
        return cellText.Replace("(c)", "").Replace("(ic)", "").Trim();
    }

    /// <summary>
    /// Writes a header line plus one comma-separated line per result to <c>output.txt</c>.
    /// </summary>
    private static void WriteResults(List<UFCPerformanceResult> resultSet)
    {
        var outputLines = new List<string>();
        outputLines.Add("Event, Winner, Loser");
        foreach (var result in resultSet)
        {
            outputLines.Add(string.Join(",", new[]
            {
                EscapeCsvField(result.Event),
                EscapeCsvField(result.Winner),
                EscapeCsvField(result.Loser)
            }));
        }

        File.WriteAllLines("output.txt", outputLines);
    }

    /// <summary>
    /// Returns <paramref name="field"/> ready for a CSV line: null becomes an empty string, and a
    /// value containing a comma, double quote or line break is wrapped in double quotes with
    /// embedded quotes doubled. Values without those characters are returned unchanged.
    /// </summary>
    private static string EscapeCsvField(string field)
    {
        if (string.IsNullOrEmpty(field))
        {
            return string.Empty;
        }

        if (field.IndexOfAny(new[] { ',', '"', '\r', '\n' }) < 0)
        {
            return field;
        }

        return "\"" + field.Replace("\"", "\"\"") + "\"";
    }
}
/// <summary>
/// One row of the generated report: a bonus-winning fighter, the event the bonus is listed
/// under, and the opponent that fighter beat.
/// </summary>
internal class UFCPerformanceResult
{
    /// <summary>Label of the event the result belongs to.</summary>
    public string Event { get; set; }

    /// <summary>Name of the fighter who received the bonus.</summary>
    public string Winner { get; set; }

    /// <summary>Name of the beaten opponent; stays null until it has been assigned.</summary>
    public string Loser { get; set; }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment