Extract Merged Cells and Formatted Values for Google Spreadsheets in Apps Script
function columnToLetter(column) | |
{ | |
var temp, letter = ''; | |
while (column > 0) | |
{ | |
temp = (column - 1) % 26; | |
letter = String.fromCharCode(temp + 65) + letter; | |
column = (column - temp - 1) / 26; | |
} | |
return letter; | |
} | |
function createBooleanArray(length) { | |
if (arguments.length === 0) { return false } | |
var arr = new Array(length), | |
i = length; | |
var args = Array.prototype.slice.call(arguments, 1); | |
while(i--) arr[length-1 - i] = createBooleanArray.apply(this, args); | |
return arr; | |
} | |
var _ = underscorejs.load(); | |
var docId = '<insert doc id here>'; | |
//Download the spreadsheet as HTML | |
var zipUrl = Utilities.formatString('https://docs.google.com/spreadsheets/d/%s/export?format=zip', docId); | |
var zipUrlParams = { | |
method : "get", | |
headers : {"Authorization": "Bearer " + ScriptApp.getOAuthToken()}, | |
muteHttpExceptions:true, | |
}; | |
//Build a hash of cell index (e.g. 'Sheet1:A3') to rowspan and colspan. If a cell is part of a merged range and | |
//not the top left cell in that range it will not appear in the hash. | |
var parsedHtml = _.chain(Utilities.unzip(UrlFetchApp.fetch(zipUrl,zipUrlParams).getBlob())) | |
.filter(function (x) { return _.contains(sheetNames, x.getName().slice(0, -5)) }) | |
.map(function (x) { | |
var sheetName = x.getName().slice(0, -5); | |
var sheet = document.getSheetByName(sheetName); | |
var dataRange = sheet.getDataRange(); | |
var visited = createBooleanArray(dataRange.getNumRows(), dataRange.getNumColumns()); | |
var xmlString = x.getDataAsString().match(/<tbody>(.*?)<\/tbody>/g); | |
if (!xmlString || xmlString.length === 0) { throw ('Could not find tbody element') } | |
//Clean up unclosed tags. | |
xmlString = xmlString[0].replace(/<br>/g, ''); | |
return _.map( | |
XmlService.parse(xmlString).getRootElement().getChildren('tr'), | |
function (tableRow, tableRowIndex) { | |
var rowIndices = _.chain(this.visited[tableRowIndex]) | |
.map(function (value, index) { return {index: index, value: value }}) | |
.filter(function (x) { return !x.value }) | |
.pluck('index') | |
.value(); | |
return _.map( | |
tableRow.getChildren('td'), | |
function (tableCell, tableColumnIndex) { | |
if (tableColumnIndex + this.increaseBy >= rowIndices.length) { return null } | |
var rowspan = tableCell.getAttribute('rowspan') ? parseInt(tableCell.getAttribute('rowspan').getValue()) : 1; | |
var colspan = tableCell.getAttribute('colspan') ? parseInt(tableCell.getAttribute('colspan').getValue()) : 1; | |
var columnLetter = columnToLetter(rowIndices[tableColumnIndex + this.increaseBy] + 1); | |
for (var i = 0; i < rowspan; i++) { | |
if (tableRowIndex + i > this.visited.length) { break; } | |
for (var j = 0; j < colspan; j++) { | |
if (tableColumnIndex + this.increaseBy >= rowIndices.length | |
|| rowIndices[tableColumnIndex + this.increaseBy]+j >= this.visited[0].length) { | |
break; | |
} | |
this.visited[tableRowIndex+i][rowIndices[tableColumnIndex + this.increaseBy]+j] = true; | |
} | |
} | |
if (colspan > 1) { this.increaseBy += colspan-1 } | |
return { | |
index: Utilities.formatString('%s!%s%s', sheetName, columnLetter, (tableRowIndex+1)), | |
text: _.chain(tableCell.getDescendants()) | |
.filter(function (x) { return x.getType() == XmlService.ContentTypes.TEXT }) | |
.map(function (x) { return x.asText().getText() }) | |
.value().join(' ').trim(), | |
rowspan: rowspan, | |
colspan: colspan | |
}; | |
}, | |
{increaseBy: 0, visited: this.visited}) | |
}, | |
{visited: visited}) | |
}) | |
.flatten() | |
.filter(function (x) { return x }) | |
.indexBy(function (x) { return x.index }) | |
.value(); | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment