Skip to content

Instantly share code, notes, and snippets.

@AnEmortalKid
Last active October 20, 2015 01:33
Show Gist options
  • Save AnEmortalKid/db109459c0f05959b2dd to your computer and use it in GitHub Desktop.
Save AnEmortalKid/db109459c0f05959b2dd to your computer and use it in GitHub Desktop.
package com.anemortalid.essex.whatson.scrape;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Scrapes the "whatson" listing page of the Essex student site and turns each
 * listed item into an {@link EssexEvent}.
 */
public class Scraper {

    /** Scheme and host of the site; page and image paths are appended to it. */
    private static final String ESSEX_DOMAIN = "http://www.essexstudent.com";

    /**
     * Runs one scrape and prints the event count followed by one line per event.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        List<EssexEvent> events = new Scraper().scrubWhatson();
        System.out.println("There are " + events.size() + " events.");
        for (EssexEvent essexEvent : events) {
            System.out.println(essexEvent);
        }
    }

    /**
     * Fetches the {@code /whatson/} page and extracts one {@link EssexEvent} per
     * element that carries the {@code event_item} class.
     *
     * @return the events found; empty (never {@code null}) when the page could
     *         not be fetched
     */
    public List<EssexEvent> scrubWhatson() {
        List<EssexEvent> events = new ArrayList<EssexEvent>();
        String url = ESSEX_DOMAIN + "/whatson/";

        Document document;
        try {
            document = Jsoup.connect(url).get();
        } catch (IOException e) {
            // Best effort: a failed fetch is reported and yields an empty result
            // rather than an exception for the caller.
            System.err.println("Could not fetch " + url);
            e.printStackTrace();
            return events;
        }

        // Only the listing itself is read; the individual event links are not visited.
        for (Element eventItem : document.getElementsByClass("event_item")) {
            events.add(parseEvent(eventItem));
        }
        return events;
    }

    /**
     * Builds an event from a single {@code event_item} element. Pieces that are
     * missing from the markup are left unset instead of failing, so one malformed
     * item cannot abort the whole scrape.
     *
     * @param eventItem the element wrapping one event
     * @return the populated event
     */
    private static EssexEvent parseEvent(Element eventItem) {
        EssexEvent event = new EssexEvent();

        // first() returns null when nothing matched; title and link then stay null.
        Element nameLink = eventItem.select("a.msl_event_name").first();
        if (nameLink != null) {
            event.setTitle(nameLink.text());
            event.setEventLink(nameLink.attr("href"));
        }

        // The remaining details live in a div->dl->dd section of the item.
        event.setTime(eventItem.getElementsByClass("msl_event_time").text());
        event.setLocation(eventItem.getElementsByClass("msl_event_location").text());
        event.setDescription(eventItem.getElementsByClass("msl_event_description").text());

        // Not every item has a span.msl_event_image.
        // NOTE(review): assumes the span's first child is the <img> - confirm against the page markup.
        Element imageSpan = eventItem.select("span.msl_event_image").first();
        if (imageSpan != null) {
            Element image = imageSpan.children().first();
            if (image != null) {
                event.setImgSrc(image.attr("src"));
            }
        }
        return event;
    }

    /**
     * Represents an Essex event as scraped from the listing page. All fields are
     * plain strings and may be {@code null} when never set.
     */
    public static class EssexEvent {
        private String title;
        private String eventLink;
        private String imgSrc;
        private String time;
        private String location;
        private String description;

        /** @return the event title, or {@code null} if not set */
        public String getTitle() {
            return title;
        }

        /** @param title the event title */
        public void setTitle(String title) {
            this.title = title;
        }

        /** @return the {@code href} of the event's name link, or {@code null} if not set */
        public String getEventLink() {
            return eventLink;
        }

        /** @param eventLink the {@code href} of the event's name link */
        public void setEventLink(String eventLink) {
            this.eventLink = eventLink;
        }

        /** @return the raw {@code src} of the event image, or {@code null} if not set */
        public String getImgSrc() {
            return imgSrc;
        }

        /** @param imgSrc the raw {@code src} of the event image */
        public void setImgSrc(String imgSrc) {
            this.imgSrc = imgSrc;
        }

        /** @return the event time text, or {@code null} if not set */
        public String getTime() {
            return time;
        }

        /** @param time the event time text */
        public void setTime(String time) {
            this.time = time;
        }

        /** @return the event location text, or {@code null} if not set */
        public String getLocation() {
            return location;
        }

        /** @param location the event location text */
        public void setLocation(String location) {
            this.location = location;
        }

        /** @return the event description text, or {@code null} if not set */
        public String getDescription() {
            return description;
        }

        /** @param description the event description text */
        public void setDescription(String description) {
            this.description = description;
        }

        /**
         * Returns the image address with any query string removed and the site
         * domain prepended.
         *
         * @return the domain followed by the image source up to (not including)
         *         the first {@code ?}; the whole source when it has no query
         *         string; or the literal {@code "NO_IMAGE"} when the source is
         *         {@code null} or empty
         */
        public String getImageLocationLink() {
            if (imgSrc == null || imgSrc.isEmpty()) {
                return "NO_IMAGE";
            }
            // indexOf returns -1 when there is no query string; substring(0, -1)
            // would throw, so the source is used unchanged in that case.
            int questionMark = imgSrc.indexOf('?');
            String path = questionMark < 0 ? imgSrc : imgSrc.substring(0, questionMark);
            // NOTE(review): assumes imgSrc is a site-relative path - confirm.
            return ESSEX_DOMAIN + path;
        }

        /**
         * @return a single-line dump of all fields plus the derived
         *         {@link #getImageLocationLink() image link}
         */
        @Override
        public String toString() {
            StringBuilder builder = new StringBuilder();
            builder.append("EssexEvent [title=");
            builder.append(title);
            builder.append(", eventLink=");
            builder.append(eventLink);
            builder.append(", imgSrc=");
            builder.append(imgSrc);
            builder.append(", imgOnlyLink=");
            builder.append(getImageLocationLink());
            builder.append(", time=");
            builder.append(time);
            builder.append(", location=");
            builder.append(location);
            builder.append(", description=");
            builder.append(description);
            builder.append("]");
            return builder.toString();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment