karimsa/getReceipt.js

## getReceipt.js
/**
 * getReceipt.js - Split
 * Logic to convert a JSON response from
 * the MS Cognitive Services OCR API into
 * proper receiptable data.
 *
 * Licensed under MIT license.
 * Copyright (C) 2017 Karim Alibhai.
 */

'use strict'

/**
 * Algorithm for vertical distance.
 * Assumes that 'a' is the line that is higher.
 */
const vdist = (a, b) => (b.boundingBox.y - (a.boundingBox.y + a.boundingBox.height))

/**
 * Algorithm for horizontal distance.
 * Assumes that 'a' is the line that is on the left.
 */
const hdist = (a, b) => (b.boundingBox.x - (a.boundingBox.x + a.boundingBox.width))

/**
 * Verify that text includes a number.
 */
const isNumber = text => (
  // it's a valid string
  text &&

  // it's got numbers and a decimal
  /[$0-9.\s]+/g.test(text) &&

  // it doesn't have letters
  !/[a-z]/gi.test(text)
)

/**
 * Transforms a response from MS cognitive services OCR
 * into a receipt items dictionary.
 */
module.exports = res => new Promise((resolve, reject) => {
  const data = {}, lines = []
  let docWidth = 0, docHeight = 0
  let averageVDist = 0, averageHDist = 0

  if (res.language !== 'en') return reject('Unsupported language: ' + res.language)
  if (res.textAngle !== 0 || res.orientation !== 'Up') {
    data.warning = 'Photo is a bit unclear. Please try to ensure that' +
                   'your receipt is straight and facing the right way.'
  }

  // we don't care about word-by-word, so let's
  // collapse lines
  //
  // let's also collapse regions because the default
  // region decomposition algorithm is based on general
  // euclidean distance and we just want to decompose
  // based upon vertical distance
  res.regions.forEach(region => {
    region.lines.reduce((arr, line) => {
      // collapse words into line
      line.words = line.words.map(word => word.text).join(' ')
                      // fix spacing issues
                      .replace(/\s+/g, ' ').trim()

      // destructure the bounding box, since we're looping anyways
      let [ x, y, width, height ] = line.boundingBox.split(',').map(i => parseInt(i, 10))
      line.boundingBox = { x, y, width, height }

      // update docWidth and docHeight
      docWidth = Math.max(docWidth, x + width)
      docHeight = Math.max(docHeight, y + height)

      // faster than concat
      arr.push(line)
      return arr
    }, lines)
  })

  // let's sort, so that our algorithms always work
  // sorting is ascending y, then ascending x
  lines
    .sort((a, b) => a.boundingBox.x - b.boundingBox.x)
    .sort((a, b) => a.boundingBox.y - b.boundingBox.y)

  // find distances between lines
  lines.forEach((line, index, self) => {
    if (index === 0) {
      line.boundingBox.fromTop = 0
    } else {
      averageVDist += Math.abs(line.boundingBox.fromTop = vdist( self[index - 1], line ))
    }
  })

  // find average
  averageVDist /= lines.length

  // decompose regions by average
  let region = [ lines[0] ]
  let regions = [region]
  lines.forEach((line, index) => {
    if (index !== 0) {
      if (line.boundingBox.fromTop > averageVDist) {
        regions.push(region = [line])
      } else {
        region.push(line)
      }
    }
  })

  // identify average line differences in regions,
  // and use them to collapse lines into sub-regions
  regions = regions.map((region, ri) => {
    const averageLineDiff = region.reduce((sum, line) => {
      return sum + Math.max(line.boundingBox.fromTop, 0)
    }, 0) / region.length

    //console.log('region #%s : diff of %s', ri, averageLineDiff)
    let subreg = []
    const lines = [subreg]

    for (let i = 0; i < region.length; ++ i) {
      if (i !== 0 && region[i].boundingBox.fromTop > averageLineDiff) {
        lines.push(subreg = [region[i]])
      } else {
        subreg.push(region[i])
      }
    }

    return lines
  })

  // do some clean-up and sorting
  regions = regions.map(region =>
    region.map(subreg => {
      let nstrings = 0, nints = 0

      subreg = subreg.sort((a, b) =>
        (a.boundingBox.x - b.boundingBox.x)
      ).map(line => {
        line = line.words

        // parse numbers
        if (isNumber(line)) {
          nints += 1
          line = parseFloat(line.replace(/[$\s]/g, ''))
        } else {
          nstrings += 1
        }

        return line
      })

      // some merging, required for some receipts
      if (nstrings === nints) {
        let sub = []

        for (let i = 0; typeof subreg[i] === 'string' && i < subreg.length; ++ i) {
          sub.push([ subreg[i], subreg[i + nstrings] ])
        }

        return sub
      }

      return subreg
    }).reduce((a,b) => a.concat(b[0] instanceof Array ? b : [b]), [])
  )

  // debugging
  //console.log(JSON.stringify(regions, null, 2))

  // find what is most probably a list of items
  let max = [-Infinity,-1]
  regions.forEach((region, index) => {
    let num = 0

    region.forEach(subreg =>
      num += (subreg.length === 2 && typeof subreg[0] === 'string' && typeof subreg[1] === 'number')
    )

    if (num > max[0]) max = [num,index]
  })

  // if not found, we have failed
  if (max[1] === -1) {
    return reject('Unable to read receipt.')
  }

  // create dictionary
  const items = {}
  regions[max[1]].forEach(subreg => {
    if (typeof subreg[0] === 'string') {
      items[subreg[0]] = subreg[1]
    }
  })

  // resolve with items list
  resolve(items)
})
	/**
	* getReceipt.js - Split
	* Logic to convert a JSON response from
	* the MS Cognitive Services OCR API into
	* proper receiptable data.
	*
	* Licensed under MIT license.
	* Copyright (C) 2017 Karim Alibhai.
	*/

	'use strict'

	/**
	* Algorithm for vertical distance.
	* Assumes that 'a' is the line that is higher.
	*/
	const vdist = (a, b) => (b.boundingBox.y - (a.boundingBox.y + a.boundingBox.height))

	/**
	* Algorithm for horizontal distance.
	* Assumes that 'a' is the line that is on the left.
	*/
	const hdist = (a, b) => (b.boundingBox.x - (a.boundingBox.x + a.boundingBox.width))

	/**
	* Verify that text includes a number.
	*/
	const isNumber = text => (
	// it's a valid string
	text &&

	// it's got numbers and a decimal
	/[$0-9.\s]+/g.test(text) &&

	// it doesn't have letters
	!/[a-z]/gi.test(text)
	)

	/**
	* Transforms a response from MS cognitive services OCR
	* into a receipt items dictionary.
	*/
	module.exports = res => new Promise((resolve, reject) => {
	const data = {}, lines = []
	let docWidth = 0, docHeight = 0
	let averageVDist = 0, averageHDist = 0

	if (res.language !== 'en') return reject('Unsupported language: ' + res.language)
	if (res.textAngle !== 0 \|\| res.orientation !== 'Up') {
	data.warning = 'Photo is a bit unclear. Please try to ensure that' +
	'your receipt is straight and facing the right way.'
	}

	// we don't care about word-by-word, so let's
	// collapse lines
	//
	// let's also collapse regions because the default
	// region decomposition algorithm is based on general
	// euclidean distance and we just want to decompose
	// based upon vertical distance
	res.regions.forEach(region => {
	region.lines.reduce((arr, line) => {
	// collapse words into line
	line.words = line.words.map(word => word.text).join(' ')
	// fix spacing issues
	.replace(/\s+/g, ' ').trim()

	// destructure the bounding box, since we're looping anyways
	let [ x, y, width, height ] = line.boundingBox.split(',').map(i => parseInt(i, 10))
	line.boundingBox = { x, y, width, height }

	// update docWidth and docHeight
	docWidth = Math.max(docWidth, x + width)
	docHeight = Math.max(docHeight, y + height)

	// faster than concat
	arr.push(line)
	return arr
	}, lines)
	})

	// let's sort, so that our algorithms always work
	// sorting is ascending y, then ascending x
	lines
	.sort((a, b) => a.boundingBox.x - b.boundingBox.x)
	.sort((a, b) => a.boundingBox.y - b.boundingBox.y)

	// find distances between lines
	lines.forEach((line, index, self) => {
	if (index === 0) {
	line.boundingBox.fromTop = 0
	} else {
	averageVDist += Math.abs(line.boundingBox.fromTop = vdist( self[index - 1], line ))
	}
	})

	// find average
	averageVDist /= lines.length

	// decompose regions by average
	let region = [ lines[0] ]
	let regions = [region]
	lines.forEach((line, index) => {
	if (index !== 0) {
	if (line.boundingBox.fromTop > averageVDist) {
	regions.push(region = [line])
	} else {
	region.push(line)
	}
	}
	})

	// identify average line differences in regions,
	// and use them to collapse lines into sub-regions
	regions = regions.map((region, ri) => {
	const averageLineDiff = region.reduce((sum, line) => {
	return sum + Math.max(line.boundingBox.fromTop, 0)
	}, 0) / region.length

	//console.log('region #%s : diff of %s', ri, averageLineDiff)
	let subreg = []
	const lines = [subreg]

	for (let i = 0; i < region.length; ++ i) {
	if (i !== 0 && region[i].boundingBox.fromTop > averageLineDiff) {
	lines.push(subreg = [region[i]])
	} else {
	subreg.push(region[i])
	}
	}

	return lines
	})

	// do some clean-up and sorting
	regions = regions.map(region =>
	region.map(subreg => {
	let nstrings = 0, nints = 0

	subreg = subreg.sort((a, b) =>
	(a.boundingBox.x - b.boundingBox.x)
	).map(line => {
	line = line.words

	// parse numbers
	if (isNumber(line)) {
	nints += 1
	line = parseFloat(line.replace(/[$\s]/g, ''))
	} else {
	nstrings += 1
	}

	return line
	})

	// some merging, required for some receipts
	if (nstrings === nints) {
	let sub = []

	for (let i = 0; typeof subreg[i] === 'string' && i < subreg.length; ++ i) {
	sub.push([ subreg[i], subreg[i + nstrings] ])
	}

	return sub
	}

	return subreg
	}).reduce((a,b) => a.concat(b[0] instanceof Array ? b : [b]), [])
	)

	// debugging
	//console.log(JSON.stringify(regions, null, 2))

	// find what is most probably a list of items
	let max = [-Infinity,-1]
	regions.forEach((region, index) => {
	let num = 0

	region.forEach(subreg =>
	num += (subreg.length === 2 && typeof subreg[0] === 'string' && typeof subreg[1] === 'number')
	)

	if (num > max[0]) max = [num,index]
	})

	// if not found, we have failed
	if (max[1] === -1) {
	return reject('Unable to read receipt.')
	}

	// create dictionary
	const items = {}
	regions[max[1]].forEach(subreg => {
	if (typeof subreg[0] === 'string') {
	items[subreg[0]] = subreg[1]
	}
	})

	// resolve with items list
	resolve(items)
	})