Skip to content

Instantly share code, notes, and snippets.

@hibri
Created May 14, 2011 11:07
Show Gist options
  • Save hibri/972117 to your computer and use it in GitHub Desktop.
Save hibri/972117 to your computer and use it in GitHub Desktop.
Scraper
using System;
using System.Collections.Generic;
using System.Net;
using NUnit.Framework;
namespace Songkick
{
/// <summary>
/// NUnit fixture that downloads the first WeGotTickets search-results page and
/// writes an "Artist,City" listing to the console. Values are located by
/// scanning the raw HTML for fixed marker strings; no HTML parser is involved.
/// </summary>
[TestFixture]
public class Scraper
{
    // Markers surrounding the venue town text.
    private const string CityStartMarker = "class=\"venuetown\">";
    private const string CityEndMarker = "</span>";

    // Markers surrounding one whole listing; the act name is searched for inside it.
    private const string ListingActStartMarker = "<div class=\"ListingAct\">";
    private const string ListingActEndMarker = "</h3>";

    // Markers surrounding the act name within a single listing.
    private const string ActNameStartMarker = "class=\"event_link\">";
    private const string ActNameEndMarker = "</a>";

    /// <summary>
    /// Downloads the search-results page and prints one "artist,city" line per
    /// scraped artist, preceded by an "Artist,City" header line.
    /// </summary>
    [Test]
    public void ScrapeWeGotTickets()
    {
        const string url = "http://www.wegottickets.com/searchresults/page/1/all";

        string html;
        // WebClient is IDisposable; release it as soon as the page text is in memory.
        using (WebClient webClient = new WebClient())
        {
            html = webClient.DownloadString(url);
        }

        List<string> artistNames = ScrapeArtistNames(html);
        List<string> cities = ScrapeCities(html);

        Console.WriteLine("Artist,City");
        for (int index = 0; index < artistNames.Count; index++)
        {
            // The two lists are scraped independently, so the city list may be
            // shorter; print an empty city rather than indexing out of range.
            string city = index < cities.Count ? cities[index] : string.Empty;
            Console.WriteLine("{0},{1}", artistNames[index], city);
        }
    }

    /// <summary>
    /// Collects the act name of every complete listing found in <paramref name="html"/>.
    /// </summary>
    /// <param name="html">Page markup to scan; null or empty yields an empty list.</param>
    /// <returns>
    /// One entry per listing, in document order. A listing without a recognisable
    /// act name contributes an empty string so positions stay aligned with listings.
    /// </returns>
    private List<string> ScrapeArtistNames(string html)
    {
        List<string> artistNames = new List<string>();
        foreach (string listingAct in ExtractAll(html, ListingActStartMarker, ListingActEndMarker))
        {
            artistNames.Add(GetActName(listingAct));
        }
        return artistNames;
    }

    /// <summary>
    /// Collects every venue town found in <paramref name="html"/>, with all
    /// colon characters removed from each value.
    /// </summary>
    /// <param name="html">Page markup to scan; null or empty yields an empty list.</param>
    /// <returns>The town strings in document order.</returns>
    private List<string> ScrapeCities(string html)
    {
        List<string> cities = new List<string>();
        foreach (string rawCity in ExtractAll(html, CityStartMarker, CityEndMarker))
        {
            cities.Add(rawCity.Replace(":", ""));
        }
        return cities;
    }

    /// <summary>
    /// Extracts the act name from the markup of a single listing.
    /// </summary>
    /// <param name="listingAct">Markup of one listing.</param>
    /// <returns>
    /// The text between the act-name markers, or an empty string when the input
    /// is null/empty or either marker is missing.
    /// </returns>
    private string GetActName(string listingAct)
    {
        if (string.IsNullOrEmpty(listingAct))
        {
            return string.Empty;
        }

        int position = 0;
        string actName;
        return TryExtractNext(listingAct, ActNameStartMarker, ActNameEndMarker, ref position, out actName)
            ? actName
            : string.Empty;
    }

    /// <summary>
    /// Returns the text of every complete start/end marker pair in <paramref name="text"/>.
    /// Scanning stops at the first start marker that has no matching end marker.
    /// </summary>
    private static List<string> ExtractAll(string text, string startMarker, string endMarker)
    {
        List<string> segments = new List<string>();
        if (string.IsNullOrEmpty(text))
        {
            return segments;
        }

        int position = 0;
        string content;
        while (TryExtractNext(text, startMarker, endMarker, ref position, out content))
        {
            segments.Add(content);
        }
        return segments;
    }

    /// <summary>
    /// Finds the next start/end marker pair at or after <paramref name="position"/>.
    /// </summary>
    /// <param name="text">Non-null text to search.</param>
    /// <param name="startMarker">Marker that precedes the wanted content.</param>
    /// <param name="endMarker">Marker that follows the wanted content.</param>
    /// <param name="position">
    /// Index to start searching from; on success it is advanced past the end marker.
    /// </param>
    /// <param name="content">
    /// On success, the text strictly between the two markers; otherwise null.
    /// </param>
    /// <returns>True when both markers were found; false otherwise.</returns>
    private static bool TryExtractNext(string text, string startMarker, string endMarker, ref int position, out string content)
    {
        content = null;

        // Ordinal comparison: the markers are fixed markup, not linguistic text.
        int markerIndex = text.IndexOf(startMarker, position, StringComparison.Ordinal);
        if (markerIndex == -1)
        {
            return false;
        }

        int contentStart = markerIndex + startMarker.Length;
        int contentEnd = text.IndexOf(endMarker, contentStart, StringComparison.Ordinal);
        if (contentEnd == -1)
        {
            // Unterminated segment (e.g. truncated page): report "not found"
            // instead of handing Substring a negative length.
            return false;
        }

        content = text.Substring(contentStart, contentEnd - contentStart);
        position = contentEnd + endMarker.Length;
        return true;
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment