collegeman/cleveland-international-film-festival-crawler-2016.js

## cleveland-international-film-festival-crawler-2016.js
/*
To use this script, open Chrome and browse to
http://www.clevelandfilm.org/schedule
Open the JavaScript console (CMD + ALT + J on Mac), and
paste the whole script into the command line.
The script is asynchronous, meaning that when you run it,
it will appear to finish immediately, but in fact the
script makes one request for each film-like thing it finds
on the schedule page. You can see it running by clicking
on the Network tab of the debugger window (already open
if you did CMD + ALT + J above). When all of the
network requests stop, it's safe to dump the contents
of the global variable "films". You dump by running
JSON.stringify(films). The output will be JSON encoded—
a string that you can then use to generate a CSV. The PHP
script I use to transform this JSON file is here:
https://gist.github.com/collegeman/e243e774d70bb80f7b98
*/
// this array will store the films: one object per film, created with
// title and url by the schedule scan below; getFilmDetails later adds
// email, info, and website to it. It is a global so that it can be
// dumped from the console with JSON.stringify(films).
var films = [];
/**
 * Download, parse, and store details for the film
 * at the given index in films.
 *
 * The download is asynchronous: this function returns right away, and the
 * film's email, info, and website properties are filled in once the details
 * page has arrived. A failed download is reported with console.error and
 * leaves the film with just the properties it already had.
 *
 * @param {number} i The index of the film in the global films array
 */
function getFilmDetails(i) {
	// get the film at i
	var film = films[i];
	// do an async get request to the details page on cleveland's site
	jQuery.get(film.url, function(html) {
		// parse the HTML so that we can extract data from it
		var details = jQuery(html);
		// the column that holds the contact details and links
		var column = details.find('#film-detail');

		// try to get the e-mail address from a mailto link; only the first
		// link is read, because .text() on several links would glue all of
		// their addresses together into one string
		film.email = jQuery.trim(column.find('a[href^="mailto"]').first().text());
		// couldn't find it? we'll look harder
		if (!film.email) {
			// use a regular expression to seek out e-mail addresses anywhere
			// in the column's markup (an absent column is searched as '')
			var found = (column.html() || '').match(/\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/gi);
			if (found) {
				// grab the first one we find
				film.email = found[0];
			} else {
				// no e-mail address (or no column at all)? report as an error in the log
				console.error("Couldn't identify e-mail", film.url);
			}
		}

		// next look for info elements: year released, run-time, country
		film.info = [];
		var info = details.find('p.info').html();
		if (info) {
			// these elements are in a block of text, separated by <br> (linebreak) elements
			film.info = info.split(/<br\s*\/?>/i).map(function(piece) {
				// we have to wrap each section in an HTML tag so that we can parse it
				var snip = jQuery.trim(jQuery('<span>' + piece + '</span>').text());
				// keep everything after the first colon (the value, not the label);
				// the value may itself contain colons, e.g. a run time of 1:45
				var colon = snip.indexOf(':');
				// a section without a label yields an empty string
				return colon < 0 ? '' : jQuery.trim(snip.slice(colon + 1));
			});
		}

		// sometimes the website URL for the film is listed
		// if it is, it's usually the last anchor tag in the column
		var website = jQuery.trim(column.find('a').last().attr('href'));
		// only a URL that starts with http:// or https:// counts as a film
		// website; anything else (a mailto link, a reference to a page on
		// cleveland's site, or no link at all) is stored as null
		film.website = /^https?:\/\//i.test(website) ? website : null;

		// update the data we have on the film (film is the same object that
		// was read from films[i] above, so this only matters if that slot
		// was replaced while the request was in flight)
		films[i] = film;
	}).fail(function() {
		// the details page could not be downloaded; say so instead of
		// silently leaving the film without details
		console.error("Couldn't download details", film.url);
	});
}

// this is the bit that goes through the
// schedule page and finds all the films
jQuery('.film .title a').each(function() {
  var link = jQuery(this);
  var href = link.attr('href');
  var url;
  try {
    // resolve the link against the site root rather than gluing strings
    // together, so that "films/x", "/films/x" and already-absolute links
    // all end up as one well-formed URL
    url = new URL(href, 'http://www.clevelandfilm.org/').href;
  } catch (e) {
    // a link that can't be turned into a URL has no details page to
    // download; report it and move on to the next film
    console.error("Couldn't resolve details URL", href);
    return;
  }
  // for every film we find, grab the title and
  // the URL for the details page, and stash it
  // for later
  films.push({
    title: link.text(),
    url: url
  });
  // then kick off the process that downloads
  // the details and parses them
  getFilmDetails(films.length - 1);
});
// wait for crawler to finish, then run JSON.stringify(films)
	/*
	To use this script, open Chrome and browse to
	http://www.clevelandfilm.org/schedule
	Open the JavaScript console (CMD + ALT + J on Mac), and
	paste the whole script into the command line.
	The script is asynchronous, meaning that when you run it,
	it will appear to finish immediately, but in fact the
	script makes one request for each film-like thing it finds
	on the schedule page. You can see it running by clicking
	on the Network tab of the debugger window (already open
	if you did CMD + ALT + J above). When all of the
	network requests stop, it's safe to dump the contents
	of the global variable "films". You dump by running
	JSON.stringify(films). The output will be JSON encoded—
	a string that you can then use to generate a CSV. The PHP
	script I use to transform this JSON file is here:
	https://gist.github.com/collegeman/e243e774d70bb80f7b98
	*/
	// this array will store the films: one object per film, created with
	// title and url by the schedule scan below; getFilmDetails later adds
	// email, info, and website to it.
	// NOTE(review): the script appears twice in this file, and this second
	// declaration replaces the array that the first copy started filling --
	// confirm whether the duplicated copy is intended.
	var films = [];
	/**
	 * Download, parse, and store details for the film
	 * at the given index in films.
	 *
	 * The download is asynchronous: this function returns right away, and the
	 * film's email, info, and website properties are filled in once the details
	 * page has arrived. A failed download is reported with console.error and
	 * leaves the film with just the properties it already had.
	 *
	 * @param {number} i The index of the film in the global films array
	 */
	function getFilmDetails(i) {
		// get the film at i
		var film = films[i];
		// do an async get request to the details page on cleveland's site
		jQuery.get(film.url, function(html) {
			// parse the HTML so that we can extract data from it
			var details = jQuery(html);
			// the column that holds the contact details and links
			var column = details.find('#film-detail');

			// try to get the e-mail address from a mailto link; only the first
			// link is read, because .text() on several links would glue all of
			// their addresses together into one string
			film.email = jQuery.trim(column.find('a[href^="mailto"]').first().text());
			// couldn't find it? we'll look harder
			if (!film.email) {
				// use a regular expression to seek out e-mail addresses anywhere
				// in the column's markup (an absent column is searched as '')
				var found = (column.html() || '').match(/\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/gi);
				if (found) {
					// grab the first one we find
					film.email = found[0];
				} else {
					// no e-mail address (or no column at all)? report as an error in the log
					console.error("Couldn't identify e-mail", film.url);
				}
			}

			// next look for info elements: year released, run-time, country
			film.info = [];
			var info = details.find('p.info').html();
			if (info) {
				// these elements are in a block of text, separated by <br> (linebreak) elements
				film.info = info.split(/<br\s*\/?>/i).map(function(piece) {
					// we have to wrap each section in an HTML tag so that we can parse it
					var snip = jQuery.trim(jQuery('<span>' + piece + '</span>').text());
					// keep everything after the first colon (the value, not the label);
					// the value may itself contain colons, e.g. a run time of 1:45
					var colon = snip.indexOf(':');
					// a section without a label yields an empty string
					return colon < 0 ? '' : jQuery.trim(snip.slice(colon + 1));
				});
			}

			// sometimes the website URL for the film is listed
			// if it is, it's usually the last anchor tag in the column
			var website = jQuery.trim(column.find('a').last().attr('href'));
			// only a URL that starts with http:// or https:// counts as a film
			// website; anything else (a mailto link, a reference to a page on
			// cleveland's site, or no link at all) is stored as null
			film.website = /^https?:\/\//i.test(website) ? website : null;

			// update the data we have on the film (film is the same object that
			// was read from films[i] above, so this only matters if that slot
			// was replaced while the request was in flight)
			films[i] = film;
		}).fail(function() {
			// the details page could not be downloaded; say so instead of
			// silently leaving the film without details
			console.error("Couldn't download details", film.url);
		});
	}

	// this is the bit that goes through the
	// schedule page and finds all the films
	jQuery('.film .title a').each(function() {
		var link = jQuery(this);
		var href = link.attr('href');
		var url;
		try {
			// resolve the link against the site root rather than gluing strings
			// together, so that "films/x", "/films/x" and already-absolute links
			// all end up as one well-formed URL
			url = new URL(href, 'http://www.clevelandfilm.org/').href;
		} catch (e) {
			// a link that can't be turned into a URL has no details page to
			// download; report it and move on to the next film
			console.error("Couldn't resolve details URL", href);
			return;
		}
		// for every film we find, grab the title and
		// the URL for the details page, and stash it
		// for later
		films.push({
			title: link.text(),
			url: url
		});
		// then kick off the process that downloads
		// the details and parses them
		getFilmDetails(films.length - 1);
	});
	// wait for crawler to finish, then run JSON.stringify(films)