Skip to content

Instantly share code, notes, and snippets.

@axemclion
Created July 26, 2009 16:56
Show Gist options
  • Save axemclion/155859 to your computer and use it in GitHub Desktop.
Save axemclion/155859 to your computer and use it in GitHub Desktop.
Screen Scraper Template
// ==UserScript==
// @name ScreenScraperTemplate
// @namespace http://dy-verse.blogspot.com
// @include http://page.you.wanna.scrape/path?pageName?pageNumber=*
// ==/UserScript==
/**
 * Submits one parsed record to the Google Spreadsheets form.
 *
 * A hidden-target iframe and a form are appended to the row; the form is
 * posted into the iframe so the response can be inspected per row.
 *
 * @param {Object} parsedData - values to submit, keyed by "title", "finalPrice",
 *     "listPrice", "time", "discount" and "misc". Missing (null/undefined)
 *     values are submitted as an empty string.
 * @param {Element} row - the element the record was parsed from; the iframe
 *     and the form are appended to it and it is highlighted once submitted.
 */
function submitData(parsedData, row){
    /*
     * Form fields in the order they are posted: the key at index N is sent as
     * "entry.N.single". Adapt this list to the keys your parser produces.
     */
    var fieldKeys = ["title", "finalPrice", "listPrice", "time", "discount", "misc"];
    var i, input, value;

    /*
     * Create a frame. This is where you will know if your submission was correct or not.
     * It is added to the current row so the response shows up next to the data it belongs to.
     */
    var targetFrame = document.createElement("iframe");
    targetFrame.name = "targetFrame" + Math.random();
    row.appendChild(targetFrame);

    /*
     * This is the form that will be used to submit data. One form is created per row
     * so that we know exactly what gets submitted.
     */
    var form = document.createElement("form");
    /*
     * This URL is available on the page where you create the form on Google Spreadsheets.
     * If you want to know what exactly gets submitted, use a tool such as Tamper Data.
     */
    form.action = "http://spreadsheets.google.com/formResponse?formkey=yourKey";
    form.method = "POST";
    form.target = targetFrame.name;

    /*
     * Build the inputs through DOM properties instead of an HTML string, so that
     * quotes or angle brackets in the scraped values cannot break the markup.
     */
    for (i = 0; i < fieldKeys.length; i++) {
        value = parsedData[fieldKeys[i]];
        input = document.createElement("input");
        input.name = "entry." + i + ".single";
        input.value = (value === undefined || value === null) ? "" : String(value);
        form.appendChild(input);
    }

    row.appendChild(form);
    form.submit();

    /*
     * Some visual indication that the current row actually gets submitted.
     * Scrolling to the element where we are working helps in a long page.
     */
    row.style.backgroundColor = "GREEN";
    targetFrame.scrollIntoView(true);
}
/**
 * Extracts the interesting values from one record.
 *
 * In the example case all the data is in a table, so the record is a TR
 * element and the values are read from its TD cells.
 *
 * @param {Element} record - the TR element to parse
 * @return {Object} the extracted values, keyed "field1", "field2", "field3"
 */
function parseData(record){
    // Declared locally: the original leaked this as an implicit global.
    var cells = record.getElementsByTagName("td");
    var result = {
        "field1": cells[0].getElementsByTagName("a")[0].href,
        "field2": cells[1].getElementsByTagName("a")[0].innerHTML,
        /*
         * And so on... Get all the elements and put them in the result object.
         */
        // Dashes are turned into slashes in the seventh cell's text.
        "field3": cells[6].getElementsByTagName("div")[0].innerHTML.replace(/-/g, "/")
    };
    return result;
}
/**
 * Loads the next page so that we can start scraping it.
 *
 * The current page number is taken from whatever follows the last "=" in the
 * current URL. If that is not a number, the current page is treated as page 1,
 * so page 2 is loaded next.
 */
function loadNextPage(){
    /*
     * Base URL without a page number. Note that this must NOT end with the "*"
     * wildcard used in the @include rule, or the next URL would be "...=*2".
     */
    var url = "http://page.you.wanna.scrape/path?pageName?pageNumber=";
    var href = document.location.href;
    // Always pass the radix so the number is never parsed as octal/hex.
    var num = parseInt(href.substring(href.lastIndexOf("=") + 1), 10);
    if (isNaN(num)) {
        num = 1;
    }
    document.location = url + (num + 1);
}
/**
 * Starts the web scraping logic: parses and submits one row per timer tick,
 * then schedules loading of the next page once every row has been handled.
 *
 * @param {Object} allRows - array-like list of the topmost level elements
 *     whose child elements we want to parse
 */
function start(allRows){
    var i = 0;
    /*
     * Don't run a plain loop; a timer spaces the submissions out so that
     * we do not put load on the server.
     */
    var timerHandler = window.setInterval(function(){
        var row;
        if (i >= allRows.length) {
            window.clearInterval(timerHandler);
            // Give the last submission some time before navigating away.
            window.setTimeout(loadNextPage, 2000);
            return;
        }
        /*
         * Advance before processing: if this row throws, the next tick must move
         * on instead of retrying the same broken row forever.
         */
        row = allRows[i];
        i++;
        try {
            submitData(parseData(row), row);
        }
        catch (err) {
            // A single malformed row should not stop the rest of the page.
            if (typeof console !== "undefined" && console.error) {
                console.error("Skipping row " + (i - 1) + ": " + err);
            }
        }
    }, 1000); // delay between two consecutive row submissions
}
// Entry point: collect every top level row by its shared class name and start scraping.
start(document.getElementsByClassName("common class name of all top level rows"));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment