// Source: GitHub Gist by @sesn (sesn/0767b0cfb38220e3cb857e3ee67fe872), created June 14, 2019 08:42.
// The GitHub page navigation text captured along with this snippet is not part of the code.
const config = require('config');
const AWS = require('aws-sdk');
const fs = require('fs');
const path = require('path');
// Set the SDK-wide region from the `region` field of the `textract` config entry.
AWS.config.region = config.get('textract').region;
// Build the SDK-wide credentials from the `s3` config entry.
// NOTE(review): presumably that entry holds accessKeyId/secretAccessKey — confirm.
AWS.config.credentials = new AWS.Credentials(config.get('s3'));
// Textract client used by generateAwsTextract; constructed after the global
// region and credentials above have been assigned.
const awsTextract = new AWS.Textract();
/**
 * Generate CSV data from an image using AWS Textract.
 *
 * Reads the file at `inputFile`, submits its bytes to
 * `awsTextract.analyzeDocument` with the `TABLES` feature, and renders each
 * block whose `BlockType` is `TABLE` through `generateAwsTextractTableCsv`.
 *
 * @param {Object} options
 * @param {string} options.inputFile - Path of the input file.
 * @returns {Promise<string|Object>} Resolves with the concatenated CSV text of
 *   every table (each followed by a blank line), or with an empty object when
 *   the response contains no table.
 * @throws Rejects when the file cannot be read or when Textract reports an
 *   error.
 */
async function generateAwsTextract({ inputFile }) {
  const inputBuffer = await fs.promises.readFile(inputFile);
  const params = {
    Document: {
      Bytes: inputBuffer,
    },
    FeatureTypes: ['TABLES'],
  };

  // Adapt the callback-style SDK call. Returning right after reject ensures a
  // failed request never falls through to reading `Blocks` off a missing
  // response.
  const data = await new Promise((resolve, reject) => {
    awsTextract.analyzeDocument(params, (err, result) => {
      if (err) {
        reject(err);
        return;
      }
      resolve(result);
    });
  });

  const blocks = (data && data.Blocks) || [];
  const blocks_map = {};
  const table_blocks = [];
  for (const block of blocks) {
    // Index every block by Id so tables can resolve their child cells/words.
    blocks_map[block.Id] = block;
    if (block.BlockType === 'TABLE') {
      table_blocks.push(block);
    }
  }

  if (table_blocks.length === 0) {
    // Kept for backward compatibility: callers receive {} rather than ''.
    return {};
  }

  return table_blocks
    .map((table, index) => `${generateAwsTextractTableCsv(table, blocks_map, index)}\n\n`)
    .join('');
}
/**
 * Collect the text of a block from its CHILD relationships.
 *
 * Each child WORD contributes its `Text` followed by a space; each child
 * SELECTION_ELEMENT whose `SelectionStatus` is `SELECTED` contributes `'X '`.
 * Child ids that are not present in `blocks_map` are skipped instead of
 * raising a TypeError.
 *
 * @param {Object} result - Block whose `Relationships` are inspected.
 * @param {Object} blocks_map - Blocks keyed by block id.
 * @returns {string} Space-terminated tokens in relationship order, or `''`
 *   when the block has no CHILD relationships.
 */
function getAwsTextractText(result, blocks_map) {
  let text = '';
  for (const relationship of result.Relationships || []) {
    if (!relationship || relationship.Type !== 'CHILD') {
      continue;
    }
    for (const child_id of relationship.Ids || []) {
      const child = blocks_map[child_id];
      if (!child) {
        // Referenced block is not in the map; nothing to read from it.
        continue;
      }
      if (child.BlockType === 'WORD') {
        text += `${child.Text} `;
      } else if (child.BlockType === 'SELECTION_ELEMENT' && child.SelectionStatus === 'SELECTED') {
        text += 'X ';
      }
    }
  }
  return text;
}
/**
 * Build a row -> column -> text map for one table block.
 *
 * Walks every CHILD relationship of the table, and for each referenced block
 * whose `BlockType` is `CELL` stores the text returned by
 * `getAwsTextractText` under `rows[RowIndex][ColumnIndex]`.
 *
 * The map is returned only after ALL relationships have been visited, so a
 * table whose first relationship is not of type CHILD still yields its cells,
 * and a table without relationships yields an empty object.
 *
 * @param {Object} table_result - Table block whose cells are collected.
 * @param {Object} blocks_map - Blocks keyed by block id.
 * @returns {Object} Nested object `{ [rowIndex]: { [columnIndex]: text } }`.
 */
function generateAwsTextractRowsColumnMap(table_result, blocks_map) {
  const rows = {};
  for (const relationship of table_result.Relationships || []) {
    if (!relationship || relationship.Type !== 'CHILD') {
      continue;
    }
    for (const id of relationship.Ids || []) {
      const cell = blocks_map[id];
      if (!cell || cell.BlockType !== 'CELL') {
        continue;
      }
      const row_index = cell.RowIndex;
      const col_index = cell.ColumnIndex;
      if (typeof rows[row_index] === 'undefined') {
        rows[row_index] = {};
      }
      rows[row_index][col_index] = getAwsTextractText(cell, blocks_map);
    }
  }
  return rows;
}
/**
 * Render one table block as CSV text.
 *
 * The output starts with a `BOM Table: Table_<n>` heading, where `<n>` is the
 * 1-based table number, followed by one line per row. Every cell is followed
 * by a comma (so each row line ends with a trailing comma), and the table is
 * terminated by three extra newlines.
 *
 * Cells containing a comma, double quote, or line break are wrapped in double
 * quotes with inner quotes doubled, so such text cannot split or merge
 * columns; all other cells are written unchanged.
 *
 * @param {Object} table_result - Table block to render.
 * @param {Object} blocks_map - Blocks keyed by block id.
 * @param {number} table_index - Zero-based position of the table in the document.
 * @returns {string} CSV text for the table.
 */
function generateAwsTextractTableCsv(table_result, blocks_map, table_index) {
  const quoteCell = (value) => {
    const text = String(value);
    return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
  };

  // Fall back to an empty map so a table without cells renders as a bare heading.
  const rows = generateAwsTextractRowsColumnMap(table_result, blocks_map) || {};

  // Numeric addition happens before string interpolation: index 0 -> "Table_1".
  let csv = `BOM Table: Table_${table_index + 1}\n`;

  // Object.keys lists integer-like keys in ascending numeric order, so rows
  // and columns come out in index order.
  for (const rowKey of Object.keys(rows)) {
    for (const colKey of Object.keys(rows[rowKey])) {
      csv += `${quoteCell(rows[rowKey][colKey])},`;
    }
    csv += '\n';
  }
  csv += '\n\n\n';
  return csv;
}
// Public API of this module: only generateAwsTextract is exported; the other
// functions in this file are not exposed to importers.
module.exports = {
generateAwsTextract
};
// (GitHub page footer, not part of the code) Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment