bruceCzK/parse-pdx-txt.js

## parse-pdx-txt.js
const {set, get, isString} = require('lodash')

/**
 * Entry point: pulls the text to parse out of the incoming event and returns
 * the object produced by `parse`.
 *
 * The event is either a Buffer holding a JSON document or an already decoded
 * value; the text is read from its `text` property, falling back to `content`.
 * The outcome is reported both through `callback` (when one is supplied) and
 * through the normal return value / thrown error.
 *
 * @param {Buffer|Object} event - JSON buffer or object carrying `text` or `content`.
 * @param {*} context - Not used by this handler.
 * @param {Function} [callback] - Optional Node-style callback `(error, result)`.
 * @returns {Object} The result of `parse(text)`.
 * @throws {Error} 'Invalid Arguments' when the event carries no text.
 */
module.exports.handler = function (event, context, callback) {
  const data = Buffer.isBuffer(event) ? JSON.parse(event) : event
  // A missing event (or a buffer holding JSON `null`) has no properties to
  // read; route it to the same "Invalid Arguments" path instead of crashing
  // with a TypeError before the callback can be notified.
  const text = data ? data.text || data.content : undefined
  if (!text) {
    const error = new Error('Invalid Arguments')
    if (callback) {
      callback(error, null)
    }
    throw error
  }
  const result = parse(text)
  if (callback) {
    callback(null, result)
  }
  return result
}

/**
 * Parses brace-delimited `key = value` text into a nested plain object.
 *
 * The text is first normalised line by line (tabs to spaces, comments
 * stripped, comparison operators rewritten to `= <placeholder>`), then
 * scanned one code point at a time. `paths` holds the key path of the entry
 * currently being read; values are written into `json` at that path.
 *
 * Conventions visible in the result:
 * - every scalar value is a string, with any surrounding double quotes kept;
 * - a `{ ... }` block that contains no operator becomes an array of its
 *   whitespace-separated items (a quoted item stays in one piece);
 * - an empty `{ }` becomes an empty object;
 * - a key made only of digits gets `#` appended, and a block opened without
 *   a key is stored under `#`;
 * - a key that already holds a value at the same level gets `$$` plus an
 *   upper-case hexadecimal counter appended (`$$1`, `$$2`, ...);
 * - `<`, `>`, `<=`, `>=` and `==` act as assignments whose value starts with
 *   the operator, e.g. `a >= 1` yields key `a` with value `>= 1`.
 *
 * @param {*} content - Text to parse; coerced with `String()`.
 * @returns {Object} The nested object built from the text.
 */
function parse(content) {
  const SPACE = ' '.codePointAt()
  const EQUAL = '='.codePointAt()
  const POUND = '#'.codePointAt()
  const LBRACE = '{'.codePointAt()
  const RBRACE = '}'.codePointAt()
  const BR = '\n'.codePointAt()
  const DBQUOTE = '"'.codePointAt()
  const SLASH = '\\'.codePointAt()

  // Placeholders written by the normalisation pass below, and the operators
  // they stand for. The optional `= ` prefix only survives inside quoted
  // strings, where the scanner copies `=` verbatim instead of consuming it.
  const PLACEHOLDER = /(?:= )?&(gte|lte|gt|lt|eqeq);/g
  const OPERATORS = {gt: '>', lt: '<', gte: '>=', lte: '<=', eqeq: '=='}

  const json = {}
  const paths = []

  content = String(content).split(/\r\n|\n|\r/)
    .map(line => line.replace(/\t/g, ' ')) // replace tabs with spaces
    .map(line => line.replace(/(["\s]\w+?")(\w)/g, '$1 $2')) // separate a closing quote from a word glued to it
    .map(line => line.replace(/(\s=)(\[.+])(\s|$)/g, '$1"$2"$3')) // quote bracketed values such as [From.GetID]
    .map(line => line.replace(/^\s+/, '')) // remove leading spaces
    .map(line => line.replace(/#(?=(?:(?:[^"]*"){2})*[^"]*$).+(?:(\s|$))/g, '')) // remove comment
    .map(line => line.replace(/#\s*$/g, '')) // remove comment
    .map(line => line.replace(/^\s*#.+/, ''))
    // escape operator other than equal
    .map(line => line.replace(/<=/g, '= &lte;'))
    .map(line => line.replace(/>=/g, '= &gte;'))
    .map(line => line.replace(/>/g, '= &gt;'))
    .map(line => line.replace(/</g, '= &lt;'))
    .map(line => line.replace(/==/g, '= &eqeq;'))
    .join('\n')

  content += '\n' // extra line so the last entry is flushed by the newline case

  let token = '' // text collected since the last structural character
  let leftHand = true // true while reading a key / block content, false after an operator
  let mayBeArray = false // true after `{` until an operator shows the block holds key/value pairs
  let string = false // inside a double-quoted string
  let escape = false // previous character was an unescaped backslash
  let comment = false // inside a `#` comment, until the end of the line

  for (const char of content) {
    const codePoint = char.codePointAt()
    if (comment) {
      if (codePoint === BR) {
        // a comment only ends at the line break
        comment = false
      }
      continue
    }
    switch (codePoint) {
      case EQUAL:
        if (string) {
          token += char
          // `break` (not `continue`) so the escape flag is cleared below
          break
        }
        if (!leftHand) {
          // A second operator before the previous value ended: a single-line
          // structure such as { a = 1  b = 2  c = 3 }. The last word of the
          // token is the next key, everything before it is the pending value.
          const parts = token.trim().split(' ')
          const key = parts.pop() // get last part as key
          const val = parts.join(' ').trim() // join the rest

          correctPaths()
          setValue(val)

          paths.pop()
          paths.push(key.trim())
          correctPaths()

          token = ''
          break
        }
        if (token) {
          let key = token.trim()
          if (/^\d+$/.test(key)) {
            // suffix all-digit keys so they stay ordinary string keys and
            // keep their insertion order when serialised to JSON
            key += '#'
          }
          paths.push(key)
          token = ''
        }
        correctPaths()
        leftHand = false // an operator starts the right-hand side
        mayBeArray = false // and what follows cannot be plain array items
        break
      case LBRACE:
        if (string) {
          token += char
          break
        }
        // `{` puts us back on the left-hand side; the content may be an array
        if (leftHand) {
          if (token) {
            // `{` appeared before any operator: treat it as ordinary content
            token += ' '
          } else {
            paths.push('#')
          }
          correctPaths()
        }
        leftHand = true
        mayBeArray = true
        break
      case RBRACE:
        if (string) {
          token += char
          break
        }
        if (token) {
          // end of the pending content
          token = token.trim()
          correctPaths()
          if (mayBeArray) {
            // No operator was seen since `{`, so this block is an array.
            // Split on whitespace, keeping quoted items in one piece.
            const array = token.match(/(?:[^\s"]+|"[^"]*")+/g)
            setValue(array)
          } else {
            setValue(token)
            /* b_aaaa = {
             *   #1
             *   c = { a = 'xxxx' #2}
             * }
             *
             * In a structure like this the path stack holds [b_aaaa, c, a]
             * at #2, so getting back to #1 means leaving two levels.
             */
            paths.pop() // leave the first level here
          }
          token = ''
        } else if (leftHand && mayBeArray) {
          // nothing between `{` and `}`: store an empty object
          setValue({})
        }
        // back to the parent level
        paths.pop()
        mayBeArray = false
        leftHand = true
        break
      case BR:
        if (string) {
          token += char
          break
        }
        if (leftHand && token) {
          // inside array content, which only ends at `}`
          token += ' '
        } else if (token) {
          correctPaths()
          leftHand = true
          setValue(token) // ordinary single-line value
          token = ''
          paths.pop()
        }
        break
      case DBQUOTE:
        if (escape) {
          token += char
          escape = false
          continue
        }
        string = !string
        token += char
        break
      case POUND:
        if (!string) {
          comment = true
        } else {
          token += char
        }
        break
      case SLASH:
        // A backslash that is itself escaped must not escape what follows,
        // otherwise a string ending in `\\` would never be closed.
        escape = !escape
        token += char
        continue
      default:
        if (!token && codePoint === SPACE) {
          // ignore spaces while the token is still empty
          break
        }
        token += char
    }
    escape = false // an escape only applies to the very next character
  }

  return json

  /**
   * Turns the operator placeholders in `text` back into the operators,
   * dropping the `= ` that was injected in front of them when present.
   */
  function unescapeOperators(text) {
    return text.replace(PLACEHOLDER, (match, name) => OPERATORS[name])
  }

  /**
   * Stores `val` in `json` at the current `paths`, restoring comparison
   * operators in string values and in the items of array values.
   */
  function setValue(val) {
    if (isString(val)) {
      val = unescapeOperators(val).trim()
    } else if (Array.isArray(val)) {
      val = val.map(item => unescapeOperators(item))
    }

    // Create every missing level as a plain object first, so that numeric
    // looking keys are not turned into array indices by `set`.
    const prefix = []
    for (const key of paths) {
      prefix.push(key)
      if (!get(json, prefix)) {
        set(json, prefix, {})
      }
    }

    set(json, paths, val)
  }

  /**
   * Renames the last key in `paths` until it no longer points at an existing
   * (truthy) value: the part before `$$` is kept and `$$1`, `$$2`, ... in
   * upper-case hexadecimal are tried in turn. Repeated keys therefore do not
   * overwrite each other.
   */
  function correctPaths() {
    const splitter = '$$'
    let num = 1
    // Iterative on purpose: heavily repeated keys would otherwise need one
    // stack frame per existing duplicate.
    while (get(json, paths)) {
      const last = paths.length - 1
      const suffix = splitter + num.toString(16).toUpperCase()
      paths[last] = paths[last].split(splitter)[0] + suffix
      num += 1
    }
  }
}
	const {set, get, isString} = require('lodash')

	module.exports.handler = function (event, context, callback) {
	const data = Buffer.isBuffer(event) ? JSON.parse(event) : event
	const text = data.text || data.content
	if (!text) {
	const error = new Error('Invalid Arguments')
	if (callback) {
	callback(error, null)
	}
	throw error
	}
	const result = parse(text)
	if (callback) {
	callback(null, result)
	}
	return result
	}

	function parse(content) {
	const SPACE = ' '.codePointAt()
	const EQUAL = '='.codePointAt()
	const POUND = '#'.codePointAt()
	const LBRACE = '{'.codePointAt()
	const RBRACE = '}'.codePointAt()
	const BR = '\n'.codePointAt()
	const DBQUOTE = '"'.codePointAt()
	const SLASH = '\\'.codePointAt()

	const json = {}
	const paths = []

	content = String(content).split(/\r\n|\n|\r/)
	.map(line => line.replace(/\t/g, ' ')) // replace tabs with spaces
	.map(line => line.replace(/(["\s]\w+?")(\w)/g, '$1 $2')) // 修复引号及字符粘连的问题
	.map(line => line.replace(/(\s=)(\[.+])(\s|$)/g, '$1"$2"$3')) // 修复方括号值的问题 [From.GetID]
	.map(line => line.replace(/^\s+/, '')) // remove leading spaces
	.map(line => line.replace(/#(?=(?:(?:[^"]*"){2})*[^"]*$).+(?:(\s|$))/g, '')) // remove comment
	.map(line => line.replace(/#\s*$/g, '')) // remove comment
	.map(line => line.replace(/^\s*#.+/, ''))
	// escape operator other than equal
	.map(line => line.replace(/<=/g, '= &lte;'))
	.map(line => line.replace(/>=/g, '= &gte;'))
	.map(line => line.replace(/>/g, '= &gt;'))
	.map(line => line.replace(/</g, '= &lt;'))
	.map(line => line.replace(/==/g, '= &eqeq;'))
	.join('\n')

	content += '\n' // add extra line


	let token = ''
	let leftHand = true
	let mayBeArray = false
	let string = false
	let escape = false
	let comment = false

	for (const char of content) {
	const codePoint = char.codePointAt()
	if (comment) {
	if (codePoint === BR) {
	// 换行后注释才结束
	comment = false
	}
	continue
	}
	switch (codePoint) {
	case EQUAL:
	if (string) {
	token += char
	continue
	}
	if (!leftHand) {
	// 等号后面未结束又出现了等号
	// 即单行的结构，如 { a = 1 b = 2 c = 3 }
	const parts = token.trim().split(' ')
	const key = parts.pop() // get last part as key
	const val = parts.join(' ').trim() // join the rest

	correctPaths()
	setValue(val)

	paths.pop()
	paths.push(key.trim())
	correctPaths()

	token = ''
	break
	}
	if (token) {
	let key = token.trim()
	if (/^\d+$/.test(key)) {
	// 全为数字的 key 后面加后缀防止转 JSON 时顺序错误
	key += '#'
	}
	paths.push(key)
	token = ''
	}
	correctPaths()
	leftHand = false // 遇到等号说明右侧开始
	mayBeArray = false // 而且接下来的内容不可能是数组的值
	break
	case LBRACE:
	if (string) {
	token += char
	continue
	}
	// 遇到 { 说明到了左侧，而且有可能接下来的内容是一个数组
	if (leftHand) {
	if (token) {
	// 未出现操作符之前出现了 {
	// 按普通内容处理
	token += ' '
	} else {
	paths.push('#')
	}
	correctPaths()
	}
	leftHand = true
	mayBeArray = true
	break
	case RBRACE:
	if (string) {
	token += char
	continue
	}
	if (token) {
	// 内容结束
	token = token.trim()
	correctPaths()
	if (mayBeArray) {
	// 如果没有遇到等号
	// mayBeArray 没有变为 false
	// 那这就是个数组
	// 按空格拆分内容，但是不匹配引号内的空格
	const array = token.match(/(?:[^\s"]+|"[^"]*")+/g)
	setValue(array)
	} else {
	setValue(token)
	/* b_aaaa = {
	* #1
	* c = { a = 'xxxx' #2}
	* }
	*
	* 类似这种结构，此时层级栈中存储的是 [b_aaaa, c, a]
	* 所以从 #2 回到 #1 的位置需要跳出两级
	* */
	paths.pop() // 这里跳出一级
	}
	token = ''
	} else if (leftHand && mayBeArray) {
	// { 到 } 之间没有任何内容，需要填充一个空对象
	setValue({})
	}
	// 这里回到父级
	paths.pop()
	mayBeArray = false
	leftHand = true
	break
	case BR:
	if (string) {
	token += char
	continue
	}
	if (leftHand && token) {
	// 说明这里进入了数组
	// 直到遇到 } 才能结束
	token += ' '
	} else if (token) {
	correctPaths()
	leftHand = true
	setValue(token) // 正常的字符串内容
	token = ''
	paths.pop()
	}
	comment = false
	break
	case DBQUOTE:
	if (escape) {
	token += char
	escape = false
	continue
	}
	string = !string
	token += char
	break
	case POUND:
	if (!string) {
	comment = true
	} else {
	token += char
	}
	break
	case SLASH:
	escape = true
	token += char
	continue
	default:
	if (comment) {
	// 注释开始后未结束
	break
	}
	if (!token && codePoint === SPACE) {
	// text 没有内容时忽略空格
	break
	}
	token += char
	}
	escape = false // 仅接下来一个字符触发escape
	}

	return json

	function setValue(val) {
	if (isString(val)) {
	val = val
	.replace('&gt;', '>')
	.replace('&lt;', '<')
	.replace('&gte;', '>=')
	.replace('&lte;', '<=')
	.replace('&eqeq;', '==')
	.trim()
	}

	// 填充占位对象，防止数字键值直接设置变成数组
	paths.reduce((path, key) => {
	path.push(key)
	if (!get(json, path)) {
	set(json, path, {})
	}
	return path
	}, [])

	set(json, paths, val)
	}

	function correctPaths(num = 1) {
	const splitter = '$$'
	if (get(json, paths)) {
	// 如果有 key 重复则在后面添加一个 #
	// 比如耶路撒冷是多教圣地, 会出现很多 holy_site 的 key
	const suffix = splitter + Number(num).toString(16).toUpperCase()
	paths[paths.length - 1] = paths[paths.length - 1].split(splitter)[0] + suffix
	// 递归
	correctPaths(num + 1)
	}
	}
	}