Created
January 1, 2020 17:20
-
-
Save hatemalimam/30260c3ae783d24b5751ab2286e085a9 to your computer and use it in GitHub Desktop.
Textract Form extraction example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const _ = require("lodash");
const aws = require("aws-sdk");
const config = require("./config");

// Configure the AWS SDK globally with the access key, secret key and region
// exported by the local ./config module.
// NOTE(review): credentials are read from a project file here; confirm that
// ./config is kept out of version control.
aws.config.update({
  accessKeyId: config.awsAccesskeyID,
  secretAccessKey: config.awsSecretAccessKey,
  region: config.awsRegion
});

// Single Textract client instance shared by the code in this module.
const textract = new aws.Textract();
/**
 * Builds the text of a block from the blocks listed in its CHILD relationships.
 *
 * Children with BlockType "WORD" contribute their `Text`; children with
 * BlockType "SELECTION_ELEMENT" contribute "X" when their `SelectionStatus`
 * is "SELECTED". Pieces are separated by one space and the result is trimmed.
 *
 * @param {Object} [result] - Block whose `Relationships` are read. May be
 *   undefined (e.g. when no value block was found for a key).
 * @param {Object} blocksMap - Lookup of block Id -> block.
 * @returns {string} The assembled text, or "" when there is nothing to read.
 */
const getText = (result, blocksMap) => {
  const relationships = result && result.Relationships;
  // Covers a missing block, a missing property and a null/non-array value.
  if (!Array.isArray(relationships)) {
    return "";
  }

  const lookup = blocksMap || {};
  let text = "";

  relationships.forEach(relationship => {
    if (relationship.Type !== "CHILD" || !Array.isArray(relationship.Ids)) {
      return;
    }
    relationship.Ids.forEach(childId => {
      const child = lookup[childId];
      if (!child) {
        // The referenced block is not in the lookup; skip it instead of
        // throwing on a property access of undefined.
        return;
      }
      if (child.BlockType === "WORD") {
        text += `${child.Text} `;
      } else if (
        child.BlockType === "SELECTION_ELEMENT" &&
        child.SelectionStatus === "SELECTED"
      ) {
        text += "X ";
      }
    });
  });

  return text.trim();
};
/**
 * Finds the value block that a key block points to.
 *
 * For each relationship of Type "VALUE" on the key block, the first Id that is
 * present in `valueMap` is selected. If several VALUE relationships match, the
 * block from the last matching relationship is returned.
 *
 * @param {Object} keyBlock - Key block whose `Relationships` are searched.
 * @param {Object} valueMap - Lookup of block Id -> value block.
 * @returns {Object|undefined} The matching value block, or undefined when the
 *   key block has no VALUE relationship resolving to an entry of `valueMap`.
 */
const findValueBlock = (keyBlock, valueMap) => {
  const relationships = keyBlock && keyBlock.Relationships;
  if (!Array.isArray(relationships) || !valueMap) {
    return undefined;
  }

  const isKnownValue = valueId =>
    Object.prototype.hasOwnProperty.call(valueMap, valueId);

  let valueBlock;
  relationships.forEach(relationship => {
    if (relationship.Type !== "VALUE" || !Array.isArray(relationship.Ids)) {
      return;
    }
    // `find` keeps scanning past Ids that are not in valueMap. The previous
    // `every` callback returned undefined on a miss, which ended the scan
    // after the first Id.
    const matchedId = relationship.Ids.find(isKnownValue);
    if (matchedId !== undefined) {
      valueBlock = valueMap[matchedId];
    }
  });
  return valueBlock;
};
/**
 * Produces a plain object mapping each key block's text to the text of its
 * associated value block.
 *
 * When two key blocks yield the same text, the later one overwrites the
 * earlier entry. A key without a resolvable value block is passed to
 * `getText` as undefined.
 *
 * @param {Object} keyMap - Lookup of block Id -> key block.
 * @param {Object} valueMap - Lookup of block Id -> value block.
 * @param {Object} blockMap - Lookup of block Id -> block (all blocks).
 * @returns {Object} Key text -> value text.
 */
const getKeyValueRelationship = (keyMap, valueMap, blockMap) => {
  const keyValues = {};
  Object.values(keyMap || {}).forEach(keyBlock => {
    const valueBlock = findValueBlock(keyBlock, valueMap);
    const key = getText(keyBlock, blockMap);
    const value = getText(valueBlock, blockMap);
    keyValues[key] = value;
  });
  return keyValues;
};
/**
 * Indexes a list of blocks by Id and separates KEY_VALUE_SET blocks into keys
 * and values.
 *
 * Every block is stored in `blockMap`. A block whose BlockType is
 * "KEY_VALUE_SET" is additionally stored in `keyMap` when its `EntityTypes`
 * contains "KEY", and in `valueMap` otherwise.
 *
 * @param {Object[]} blocks - Blocks to index; each is read for `Id`,
 *   `BlockType` and `EntityTypes`.
 * @returns {{keyMap: Object, valueMap: Object, blockMap: Object}} Lookups
 *   keyed by block Id.
 */
const getKeyValueMap = blocks => {
  const keyMap = {};
  const valueMap = {};
  const blockMap = {};

  blocks.forEach(block => {
    const blockId = block.Id;
    blockMap[blockId] = block;

    if (block.BlockType !== "KEY_VALUE_SET") {
      return;
    }
    const isKey =
      Array.isArray(block.EntityTypes) && block.EntityTypes.includes("KEY");
    if (isKey) {
      keyMap[blockId] = block;
    } else {
      valueMap[blockId] = block;
    }
  });

  return { keyMap, valueMap, blockMap };
};
module.exports = async buffer => { | |
const params = { | |
Document: { | |
/* required */ | |
Bytes: buffer | |
}, | |
FeatureTypes: ["FORMS"] | |
}; | |
const request = textract.analyzeDocument(params); | |
const data = await request.promise(); | |
if (data && data.Blocks) { | |
const { keyMap, valueMap, blockMap } = getKeyValueMap(data.Blocks); | |
const keyValues = getKeyValueRelationship(keyMap, valueMap, blockMap); | |
return keyValues; | |
} | |
// in case no blocks are found return undefined | |
return undefined; | |
}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment