Created
May 14, 2011 11:07
-
-
Save hibri/972117 to your computer and use it in GitHub Desktop.
Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Net; | |
using NUnit.Framework; | |
namespace Songkick
{
    /// <summary>
    /// NUnit fixture that downloads a WeGotTickets search-results page and prints
    /// the artist name and city of each listing to the console as CSV lines.
    /// Extraction is done by plain substring search on fixed HTML markers.
    /// </summary>
    [TestFixture]
    public class Scraper
    {
        // Markers delimiting the city text of a listing.
        private const string cityEndMarker = "</span>";
        private const string cityStartMarker = "class=\"venuetown\">";

        // Markers delimiting one listing block; the act link is searched inside it.
        const string listingActStartMarker = "<div class=\"ListingAct\">";
        const string listingActEndMarker = "</h3>";

        // Markers delimiting the act (artist) name inside a listing block.
        const string actNameStartMarker = "class=\"event_link\">";
        const string actNameEndMarker = "</a>";

        /// <summary>
        /// Downloads the search-results page and writes an "Artist,City" header
        /// followed by one "artist,city" line per artist found.
        /// The method makes no assertions; it only produces console output.
        /// </summary>
        [Test]
        public void ScrapeWeGotTickets()
        {
            string url = "http://www.wegottickets.com/searchresults/page/1/all";

            // WebClient is IDisposable; release it as soon as the page is downloaded.
            string html;
            using (WebClient webClient = new WebClient())
            {
                html = webClient.DownloadString(url);
            }

            List<string> artistNames = ScrapeArtistNames(html);
            List<string> scrapeCities = ScrapeCities(html);

            Console.WriteLine("Artist,City");
            for (int count = 0; count < artistNames.Count; count++)
            {
                // The two lists are scraped independently, so there may be fewer
                // cities than artists; print an empty city instead of throwing.
                string city = count < scrapeCities.Count ? scrapeCities[count] : string.Empty;
                Console.WriteLine("{0},{1}", artistNames[count], city);
            }
        }

        /// <summary>
        /// Collects one artist name per listing block found in <paramref name="html"/>.
        /// </summary>
        /// <param name="html">Page markup to search; null or empty yields an empty list.</param>
        /// <returns>
        /// Artist names in document order. A listing block that has no act link
        /// contributes an empty string so the entry count still matches the listing count.
        /// </returns>
        private List<string> ScrapeArtistNames(string html)
        {
            List<string> artistNames = new List<string>();
            if (string.IsNullOrEmpty(html))
            {
                return artistNames;
            }

            int searchFrom = 0;
            string listingAct;
            int listingEnd;
            while (TryExtractBetween(html, listingActStartMarker, listingActEndMarker, searchFrom, out listingAct, out listingEnd))
            {
                artistNames.Add(GetActName(listingAct));

                // Resume at the end marker so the next search cannot re-match this listing.
                searchFrom = listingEnd;
            }

            return artistNames;
        }

        /// <summary>
        /// Collects the city text of every city span found in <paramref name="html"/>.
        /// </summary>
        /// <param name="html">Page markup to search; null or empty yields an empty list.</param>
        /// <returns>City strings in document order, with every ':' character removed.</returns>
        private List<string> ScrapeCities(string html)
        {
            List<string> cities = new List<string>();
            if (string.IsNullOrEmpty(html))
            {
                return cities;
            }

            int searchFrom = 0;
            string city;
            int cityEnd;
            while (TryExtractBetween(html, cityStartMarker, cityEndMarker, searchFrom, out city, out cityEnd))
            {
                cities.Add(city.Replace(":", ""));
                searchFrom = cityEnd;
            }

            return cities;
        }

        /// <summary>
        /// Extracts the act name from the markup of a single listing block.
        /// </summary>
        /// <param name="listingAct">Markup of one listing block.</param>
        /// <returns>
        /// The text between the act-link start marker and the following end marker,
        /// or an empty string when either marker is missing.
        /// </returns>
        private string GetActName(string listingAct)
        {
            if (string.IsNullOrEmpty(listingAct))
            {
                return string.Empty;
            }

            string actName;
            int actNameEnd;
            return TryExtractBetween(listingAct, actNameStartMarker, actNameEndMarker, 0, out actName, out actNameEnd)
                ? actName
                : string.Empty;
        }

        /// <summary>
        /// Finds the first <paramref name="startMarker"/> at or after
        /// <paramref name="searchFrom"/> and returns the text between it and the next
        /// <paramref name="endMarker"/>. Both searches are ordinal.
        /// </summary>
        /// <param name="text">Text to search; must not be null.</param>
        /// <param name="startMarker">Marker that precedes the wanted content.</param>
        /// <param name="endMarker">Marker that follows the wanted content.</param>
        /// <param name="searchFrom">Index in <paramref name="text"/> at which to start searching.</param>
        /// <param name="content">The text between the two markers, or null on failure.</param>
        /// <param name="endIndex">Index of the end marker in <paramref name="text"/>, or -1 on failure.</param>
        /// <returns>True when both markers were found; otherwise false.</returns>
        private static bool TryExtractBetween(string text, string startMarker, string endMarker, int searchFrom, out string content, out int endIndex)
        {
            content = null;
            endIndex = -1;

            int markerIndex = text.IndexOf(startMarker, searchFrom, StringComparison.Ordinal);
            if (markerIndex == -1)
            {
                return false;
            }

            // Skip past the start marker itself rather than stripping it afterwards.
            int contentStart = markerIndex + startMarker.Length;
            endIndex = text.IndexOf(endMarker, contentStart, StringComparison.Ordinal);
            if (endIndex == -1)
            {
                // Start marker without a matching end marker (e.g. truncated markup).
                return false;
            }

            content = text.Substring(contentStart, endIndex - contentStart);
            return true;
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment