Skip to content

Instantly share code, notes, and snippets.

@brucemcpherson
Created September 5, 2012 15:10
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 4 You must be signed in to fork a gist
  • Save brucemcpherson/3638087 to your computer and use it in GitHub Desktop.
Save brucemcpherson/3638087 to your computer and use it in GitHub Desktop.
scraper wiki code for google apps script
/** @description
* get data from scraperwiki into google apps script
* See http://ramblings.mcpher.com/Home/excelquirks/codeuse for more details
* @author <a href="mailto:bruce@mcpher.com">Bruce McPherson</a><a href="http://ramblings.mcpher.com"> ramblings.mcpher.com</a>
*/
/**
* swSeewhatworks see which scraperwikis have tables and update a list (as returned by rest entry scraperwiki) with default sql
* @param {string} ws the worksheet name with the scraperwiki list of shortnames
* @return {void} null
*/
function swSeewhatworks(ws) {
  // load the sheet containing the scraperwiki shortname list into a dataset
  var dataSet = new cDataSet().populateData(wholeSheet(ws),
    undefined, undefined, undefined, undefined, undefined, true);
  var cache = sheetCache(dataSet.headingRow().where());
  // for each shortname row, write its default sql into the column after the last one
  dataSet.rows().forEach(function (row) {
    var sql = swGetDefaultTableSql(row.cell("short_name").toString(), false);
    cache.setValue(sql, row.where().getRow(), row.columns().count() + 1);
  });
  cache.close();
}
/**
* swGetTables return the cRest result of query for table names
* @param {string} shortName the scraperWiki key
* @return {cRest} the result of the query for table names
*/
function swGetTables(shortName) {
  // sqlite catalog query: every table/view (including temporary ones), excluding sqlite internals
  var tableDirectory = [
    "SELECT name FROM sqlite_master ",
    "WHERE type IN ('table','view') AND name NOT LIKE 'sqlite_%' ",
    "Union all ",
    "SELECT name FROM sqlite_temp_master ",
    "WHERE type IN ('table','view') ",
    "ORDER BY 1"
  ].join("");
  return restQuery(undefined, "scraperwikidata",
    shortName + "&query=" + tableDirectory,
    undefined, undefined, undefined, undefined, false);
}
/**
* swGetDefaultTableSql look up to see what tables are defined in a given scraperwiki and return sql to get the first one
* @param {string} shortName the scraperWiki key
* @param {boolean} optComplain whether to complain if there is a problem
* @return {string} the sql query to get data from first table
*/
function swGetDefaultTableSql(shortName, optComplain) {
  var complain = fixOptional(optComplain, true);
  var tables = swGetTables(shortName);
  if (!tables) {
    MsgBox("could get no info on " + shortName);
    return "";
  }
  var job = tables.jObjects().count() ? tables.jObjects().item(1) : null;
  if (job && job.hasChildren()) {
    // this is hokey - for the moment just take from the first table found
    return "select * from '" +
      job.children(1).child("name").toString() + "'";
  }
  // nothing usable came back - report what we got
  DebugPrint(shortName, " did not return any tables: got this:", tables.responseData());
  if (complain) {
    MsgBox("could not find any valid tables for " +
      shortName + "(" + (job ? job.serialize() : "no data") + ")");
  }
  return "";
}
/**
* swCleanSheet create a clean results sheet with column headings
* @param {cJobject} job contains the list of columns headings as keys
* @param {string} ws the worksheet name to populate
* @return {cDataSet} the dataset with the headings populated
*/
function swCleanSheet(job, ws) {
  // put headers to a clean sheet
  var cache = sheetCache(ws);
  cache.clear();
  var ds = null;
  if (job.hasChildren()) {
    // one heading per key along row 1
    job.children().forEach(function (child, index) {
      cache.setValue(child.key(), 1, index);
    });
    ds = new cDataSet().populateData(
      vResize(wholeSheet(ws), 1, job.children().count()));
  }
  cache.commit();
  return ds;
}
/**
* swGetHeaders organize what headers are needed given the scraperWIki response
* @param {cJobject} job contains the query response
* @return {cJobject} a jobject with a list of keys for column headings
*/
function swGetHeaders(job) {
  // take scraper wiki data and generate an organized dataset using the headers found
  var headings = new cJobject().init(null);
  // walk every field of every row so no column key is missed
  job.children().forEach(function (row) {
    row.children().forEach(function (field) {
      headings.add(field.key());
    });
  });
  return headings;
}
/**
* scraperWikiStuff do the query and populate the data
* @param {string} shortName the scraperwiki key
* @param {string} ws the worksheet name to populate
* @param {string} optSql the optional sql string to get the data
* @param {number} optLimit the optional limit to number of rows to get
* @return {cDataSet} the finished data
*/
function scraperWikiStuff(shortName, ws, optSql, optLimit) {
  // sort out the optional args
  var sql = fixOptional(optSql, swGetDefaultTableSql(shortName));
  var limit = IsMissing(optLimit) ? "" : "&limit=" + CStr(optLimit);
  var ds = null;
  // get the data
  var cr = restQuery(undefined, "scraperwikidata",
    shortName + "&query=" + sql + limit,
    undefined, undefined, undefined, undefined, false);
  // now organize it
  if (cr) {
    // get the unique headers and put them to a clean data set
    var crj = cr.jObject();
    var headJob = swGetHeaders(crj);
    if (!headJob) {
      MsgBox("didnt work at all " + crj.serialize());
    }
    else {
      ds = swCleanSheet(headJob, ws);
      if (!ds) {
        // BUGFIX: was "… " & crj.serialize() — a leftover VBA concatenation
        // operator; in JavaScript & is bitwise AND, which coerced the string
        // to NaN and displayed a number instead of the diagnostic message.
        MsgBox("failed to get the expected data " + crj.serialize());
      }
      else {
        var cache = sheetCache(ds.headingRow().where());
        var r = ds.headingRow().where().getRow();
        // we know how big the cache needs to be so do it once off
        cache.extend(crj.children().count() + 1, ds.columns().count());
        // this is the data returned - each array member is a row
        crj.children().forEach(
          function (cj, rIndex) {
            cj.children().forEach(
              function (job, cIndex) {
                cache.setValue(job.value(), r + rIndex, cIndex);
              }
            );
          }
        );
        cache.close();
      }
    }
  }
  return ds;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment