Skip to content

Instantly share code, notes, and snippets.

@khanhkhuu
Created December 9, 2022 04:02
Show Gist options
  • Save khanhkhuu/605929648f82c0bcb8517391fe471370 to your computer and use it in GitHub Desktop.
Save khanhkhuu/605929648f82c0bcb8517391fe471370 to your computer and use it in GitHub Desktop.
Extract Table From PDF
/**
 * Manual smoke test: runs `extractDataFromPdf` against a hard-coded Drive
 * file ID and writes the extracted rows to the log.
 */
function test() {
  const samplePdfId = '1jVppnsxpiK56RY7vVFMkTa1_GV8SCWxo';
  const extracted = extractDataFromPdf(samplePdfId);
  console.log(extracted);
}
/**
 * Extracts the contents of the last table found in a PDF stored on Google Drive.
 *
 * The PDF is re-uploaded through the Drive advanced service (`Drive.Files.insert`)
 * with OCR enabled, which yields a temporary Google Doc. The last table in that
 * document is read cell by cell. The temporary document is always moved to the
 * trash afterwards, including when reading it fails.
 *
 * @param {string} pdfId - Drive file ID of the source PDF.
 * @returns {string[][]} One array per table row, each holding the text of its cells.
 * @throws {Error} If the OCR-converted document contains no table.
 */
function extractDataFromPdf(pdfId) {
  // OCR language hint passed to Drive ('th' is the language code for Thai).
  const PDF_LANGUAGE = 'th';

  const pdfFile = DriveApp.getFileById(pdfId);
  const { id } = Drive.Files.insert(
    {
      // Strip the extension regardless of case so "Report.PDF" becomes "Report".
      title: pdfFile.getName().replace(/\.pdf$/i, ''),
      mimeType: pdfFile.getMimeType() || 'application/pdf',
    },
    pdfFile.getBlob(),
    {
      ocr: true,
      ocrLanguage: PDF_LANGUAGE,
      fields: 'id',
    }
  );

  try {
    const tables = DocumentApp.openById(id).getBody().getTables();
    if (tables.length === 0) {
      throw new Error(`No table found in the OCR output of PDF ${pdfId}`);
    }

    // Only the last table of the converted document is of interest.
    const table = tables[tables.length - 1];
    const data = [];
    const numberOfRows = table.getNumRows();
    for (let rowIndex = 0; rowIndex < numberOfRows; rowIndex++) {
      const row = table.getRow(rowIndex);
      const numberOfCells = row.getNumCells();
      const rowData = [];
      for (let cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
        rowData.push(row.getCell(cellIndex).getText());
      }
      data.push(rowData);
    }
    return data;
  } finally {
    // Remove the temporary OCR document even if extraction threw, so failed
    // runs do not leave stray documents in the user's Drive.
    DriveApp.getFileById(id).setTrashed(true);
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment