Created
July 26, 2009 16:56
-
-
Save axemclion/155859 to your computer and use it in GitHub Desktop.
Screen Scraper Template
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ==UserScript==
// @name ScreenScraperTemplate
// @namespace http://dy-verse.blogspot.com
// @include http://page.you.wanna.scrape/path?pageName?pageNumber=*
// ==/UserScript==
/**
 * Submits one scraped record to the Google Spreadsheets form.
 *
 * An iframe and a form are appended to `row`; the form is POSTed with the
 * iframe as its target, so the response for each record is displayed inside
 * the row it came from.
 *
 * @param {Object} parsedData - Scraped values. The keys read are title,
 *     finalPrice, listPrice, time, discount and misc, in that order.
 * @param {Element} row - Element that receives the iframe and the form.
 */
function submitData(parsedData, row){
    /*
     * Create a frame. This is where you will know if your submission was correct or not.
     * It is added to the current row so the response sits next to its record.
     * The random suffix keeps the frame names of different rows distinct.
     */
    var targetFrame = document.createElement("iframe");
    targetFrame.name = "targetFrame" + Math.random();
    row.appendChild(targetFrame);

    /*
     * This is the form that will be used to submit data. One form is created per
     * record so that we know exactly what gets submitted.
     */
    var form = document.createElement("form");
    /*
     * The action URL is available on the page where you create the form on
     * Google Spreadsheets. If you want to know exactly what gets submitted,
     * use Tamper Data.
     */
    form.action = "http://spreadsheets.google.com/formResponse?formkey=yourKey";
    form.method = "POST";
    form.target = targetFrame.name;

    /*
     * Position in this list is the index used in the "entry.N.single" field name.
     * Inputs are built through the DOM instead of an HTML string so that quotes
     * or angle brackets in scraped values cannot break the markup.
     */
    var fieldNames = ["title", "finalPrice", "listPrice", "time", "discount", "misc"];
    for (var i = 0; i < fieldNames.length; i++) {
        var input = document.createElement("input");
        input.name = "entry." + i + ".single";
        input.value = String(parsedData[fieldNames[i]]);
        form.appendChild(input);
    }

    // The form has to be attached to the document before it is submitted.
    row.appendChild(form);
    form.submit();

    /*
     * Some visual indication that the current row actually gets submitted.
     * Scrolling to the element where we are working helps in a long page.
     */
    row.style.backgroundColor = "GREEN";
    targetFrame.scrollIntoView(true);
}
/**
 * Extracts the values of interest from one record.
 *
 * In the example case all data lives in a table, so `record` is a TR element
 * and the values are pulled out of its TD cells. Cells 0, 1 and 6 must exist
 * and contain the expected child element, otherwise a TypeError is thrown.
 *
 * @param {Element} record - Row element whose TD cells are read.
 * @returns {Object} Object with field1 (href of the link in cell 0), field2
 *     (innerHTML of the link in cell 1) and field3 (innerHTML of the div in
 *     cell 6 with every "-" replaced by "/").
 */
function parseData(record){
    var cells = record.getElementsByTagName("td");
    var result = {
        "field1": cells[0].getElementsByTagName("a")[0].href,
        "field2": cells[1].getElementsByTagName("a")[0].innerHTML,
        /*
         * And so on... Get all the elements and put them in the result object.
         */
        "field3": cells[6].getElementsByTagName("div")[0].innerHTML.replace(/-/g, "/")
    };
    return result;
}
/**
 * Loads the next page so that we can continue scraping.
 *
 * The current page number is taken from whatever follows the last "=" in the
 * current URL. If that is not a number the current page is treated as page 1.
 */
function loadNextPage(){
    // Same address as the @include pattern, minus the trailing "*" wildcard,
    // so that the page number can be appended directly.
    var baseUrl = "http://page.you.wanna.scrape/path?pageName?pageNumber=";
    var href = document.location.href;
    var num = parseInt(href.substring(href.lastIndexOf("=") + 1), 10);
    if (isNaN(num)) {
        num = 1;
    }
    document.location = baseUrl + (num + 1);
}
/**
 * Starts the web scraping logic.
 *
 * One row is parsed and submitted per timer tick. On the first tick after the
 * last row the timer is cancelled and loading of the next page is scheduled.
 *
 * @param {Object} allRows - Topmost level elements whose children we want to parse
 */
function start(allRows){
    var cursor = 0;
    var ticker;

    /*
     * A timer is used instead of a plain loop so that the submissions are
     * spread out and do not put load on the server.
     */
    function processNext(){
        if (cursor < allRows.length) {
            var current = allRows[cursor];
            submitData(parseData(current), current);
            cursor += 1;
            return;
        }
        // Every row has been handled: stop ticking and move on after 2 seconds.
        window.clearInterval(ticker);
        window.setTimeout(loadNextPage, 2000);
    }

    // One row every 1000 ms.
    ticker = window.setInterval(processNext, 1000);
}
// Entry point: collect every top level row by its shared class name and
// hand the collection over to the scraper.
var records = document.getElementsByClassName(
    "common class name of all top level rows"
);
start(records);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment