laughinghan/contacts_extractor.js

## contacts_extractor.js
#!/usr/bin/env node

// When stdin is an interactive terminal nothing has been piped in, so print
// the intended pipeline and quit rather than wait for input.
if (process.stdin.isTTY) {
  [
    'Usage: Inside the directory with all the sent text message files:',
    '',
    'ls | path/to/contacts_extractor.js | sort | uniq | ./mkvcards -o /path/to/output_dir/'
  ].forEach(function(usageLine) {
    console.log(usageLine);
  });
  process.exit();
}

// Debug output is on only when the LAST command-line argument is -d or --debug.
var DEBUG_MODE = ['-d', '--debug'].indexOf(process.argv[process.argv.length - 1]) !== -1;

//////////////// read stdin ////////////////
// Accumulate everything piped in on stdin, decoded as UTF-8, into `data`.
var data = '';
process.stdin.setEncoding('utf8');
process.stdin.on('data', function(piece) {
  data = data + piece;
});
process.stdin.resume();
process.stdin.on('end', function() {
  var lines = data.trim().split('\n');

  //////////////// read max number of files at a time ////////////////
  var fs = require('fs');
  var q = [];       // paths whose read hit EMFILE and must be retried later
  var inFlight = 0; // reads started whose callback has not run yet

  // Start reading one file; parse it on success, requeue it on EMFILE.
  // Any other read error is thrown, as before.
  function readAndParse(path) {
    inFlight += 1;
    fs.readFile(path, function(e, data) {
      inFlight -= 1;
      if (e && e.code === 'EMFILE') {
        // Out of file descriptors: park the path until one frees up.
        q.push(path);
        // Normally a still-pending read restarts the queue when it finishes.
        // If none is pending, retry from here, otherwise the queued paths
        // would never be read.
        if (inFlight === 0) readAndParse(q.shift());
        return;
      }
      if (e) throw e;
      parseFile(path, data);
      // This read has completed, so hand its slot to a queued path.
      if (q.length) readAndParse(q.shift());
    });
  }

  lines.forEach(function(path) {
    if (!path) return; // blank line, e.g. when stdin was empty
    if (q.length) q.push(path);
    else readAndParse(path);
  });

  //////////////// parse message file ////////////////

  // by inspection of message format
  // (empirical byte offsets; parseFile adds the per-file header size to them)
  const TEXT_OFFSET_BASE = 245;   // + headerSize = offset of the text's 2-byte length field
  const PHONE_OFFSET_BASE = 252;  // + headerSize + textSize = offset of the first phone string
  const CONTACT_HEADER_SIZE = 29; // added, with the separator size, to step from a name to the next phone

  var assert = require('assert'); // used by tenDigitize to sanity-check phone prefixes
  /**
   * Extract the recipients of one sent-message file and print them.
   *
   * In normal mode, prints one "<phone> <name>" line to stdout for every
   * recipient that has a name. In debug mode (DEBUG_MODE), prints the message
   * text followed by every recipient, named or not.
   *
   * @param {string} path - File name; used only in diagnostics.
   * @param {Buffer} data - Raw file contents; left unmodified.
   * @throws Rethrows any error raised while decoding (for example a RangeError
   *   from reading past the end of the buffer, or an AssertionError from an
   *   unexpected phone prefix) after logging which file caused it.
   */
  function parseFile(path, data) {
    // Decode a length-prefixed big-endian UCS-2 string. The 2-byte length at
    // `offset` is treated as including a 2-byte null terminator. Returns the
    // text plus `end`, the offset at which that terminator starts.
    function readUCS2BEString(offset) {
      var size = data.readUInt16BE(offset);
      var start = offset + 2, end = start + size - 2; // omit null terminator
      var limit = Math.min(end, data.length);
      var chars = [];
      // Build the code units by hand: Buffer#toString('ucs2') only decodes
      // little-endian, and swapping bytes in place would corrupt `data` for
      // any later read of the same region.
      for (var i = start; i + 1 < limit; i += 2) {
        chars.push(String.fromCharCode((data[i] << 8) | data[i + 1]));
      }
      return { str: chars.join(''), end: end };
    }

    // Strip a leading "1", "+" (11 characters) or "+1" (12 characters) so the
    // number is 10 characters long; warn on stderr when it still is not.
    function tenDigitize(phone) {
      if (phone.length === 11) {
        assert(phone[0] === '1' || phone[0] === '+',
               '"'+phone[0]+'" === "1" || "'+phone[0]+'" === "+"');
        phone = phone.slice(1);
      }
      else if (phone.length === 12) {
        assert.strictEqual(phone.slice(0,2), '+1');
        phone = phone.slice(2);
      }
      if (phone.length !== 10) {
        console.error('Unusual phone length: "' + phone + '" in', path);
      }
      return phone;
    }

    // English ordinal suffix for a positive integer: 1st, 2nd, 3rd, 4th,
    // 11th-13th, 21st, 22nd, ...
    function ordinalSuffix(n) {
      var lastTwo = n % 100;
      if (lastTwo >= 11 && lastTwo <= 13) return 'th';
      return ['th', 'st', 'nd', 'rd'][n % 10] || 'th';
    }

    try {
      var headerSize = data.readUInt32BE(4);

      var textOffset = TEXT_OFFSET_BASE + headerSize;
      // The text begins with a 2-byte length field, so both of its bytes must
      // lie inside the buffer for the file to be parseable.
      if (textOffset + 2 > data.length) {
        console.error('Text offset', textOffset, '+ 2 > buffer size', data.length, 'in', path);
        return;
      }
      if (DEBUG_MODE) {
        console.log('Sent "' + readUCS2BEString(textOffset).str + '" to:');
      }
      var textSize = data.readUInt16BE(textOffset);

      var contacts = [], phoneOffset = PHONE_OFFSET_BASE + headerSize + textSize;
      do {
        var phone = readUCS2BEString(phoneOffset);
        var name;
        // A byte value of 12 where the comma is expected means this
        // recipient has no name field.
        if (data[phone.end + 2] === 12) name = { str: '', end: phone.end };
        else name = readUCS2BEString(phone.end + 3); // skip null and comma
                                            // ^ why is the comma only one byte?!
        contacts.push({ name: name.str, phone: tenDigitize(phone.str) });

        var contactSeparatorSize = data.readUInt16BE(name.end + 3);
        // NOTE(review): indexOf compares with ===, and contactSeparatorSize is
        // a number while the array holds the strings '2' and '3', so this
        // warning can never fire. Left unchanged because the intended set of
        // usual sizes is not evident from the code -- TODO confirm and compare
        // against numbers.
        if ('23'.split('').indexOf(contactSeparatorSize) > -1) {
          console.error('Unusual contact separator size', contactSeparatorSize,
                        'at byte', name.end + 3, 'in', path);
        }
        phoneOffset = name.end + contactSeparatorSize + CONTACT_HEADER_SIZE;
      } while (phoneOffset < data.length);

      contacts.forEach(function(contact, i) {
        if (DEBUG_MODE) {
          console.log('\t"' + contact.phone + '", "' + contact.name + '"' +
                      (i > 0
                        ? ' (' + (i+1) + ordinalSuffix(i+1) + ')'
                        : ''));
        }
        else if (contact.name) {
          console.log(contact.phone, contact.name);
        }
      });
    }
    catch (e) {
      console.error('Error while parsing ' + path + ':');
      throw e;
    }
  }
});

## mkvcards
#!/bin/bash

# Directory the .vcf files are written to; override with -o/--output-dir DIR.
OUTPUT_DIR=.

while [[ $# -gt 0 ]]; do
  case "$1" in
    -o|--output-dir)
      # `shift 2` fails and shifts nothing when the value is missing, which
      # would make this loop spin forever, so reject that case explicitly.
      if [[ $# -lt 2 ]]; then
        echo "option $1 requires a directory argument" >&2
        exit 1
      fi
      OUTPUT_DIR="$2"; shift 2
    ;;
    *)
      echo "unrecognized option $1" >&2; shift
    ;;
  esac
done

# `--` keeps a directory name starting with a dash from being read as an
# option; stop if the directory cannot be created, since every write would fail.
mkdir -p -- "$OUTPUT_DIR" || exit 1

# Read "<phone> <first name> <remaining names>" lines from stdin and write one
# vCard 2.1 file per line into $OUTPUT_DIR.
#   -r                  keeps backslashes in names literal
#   || [[ -n $PHONE ]]  still handles a final line that lacks a newline
while read -r PHONE FIRSTNAME OTHERNAMES || [[ -n "$PHONE" ]]; do
  [[ -n "$PHONE" ]] || continue  # skip blank lines

  # A "/" in a name would be taken as a directory separator in the file name.
  FILENAME="$FIRSTNAME $OTHERNAMES $PHONE"
  FILENAME="${FILENAME//\//_}"

  cat <<EOF > "$OUTPUT_DIR/$FILENAME.vcf"
BEGIN:VCARD
VERSION:2.1
N;CHARSET=UTF-8;ENCODING=8BIT:$OTHERNAMES;$FIRSTNAME
TEL;PREF;CELL;VOICE;ENCODING=8BIT:$PHONE
END:VCARD
EOF
done
	#!/usr/bin/env node

	if (process.stdin.isTTY) {
	console.log('Usage: Inside the directory with all the sent text message files:');
	console.log();
	console.log('ls | path/to/contacts_extractor.js | sort | uniq | ./mkvcards -o /path/to/output_dir/');
	process.exit();
	}

	var DEBUG_MODE = '-d --debug'.split(' ').indexOf(process.argv.slice(-1)[0]) > -1;

	//////////////// read stdin ////////////////
	process.stdin.resume();
	process.stdin.setEncoding('utf8');

	var data = '';
	process.stdin.on('data', function(chunk) { data += chunk; });
	process.stdin.on('end', function() {
	var lines = data.trim().split('\n');

	//////////////// read max number of files at a time ////////////////
	var fs = require('fs');
	var q = [];
	lines.forEach(function(path) {
	if (q.length) q.push(path);
	else {
	fs.readFile(path, function(e, data) {
	if (e && e.code === 'EMFILE') q.push(path);
	else (function onFileData(path, e, data) {
	if (e) throw e;
	parseFile(path, data);
	if (q.length) {
	var path = q.shift();
	fs.readFile(path, onFileData.bind(null, path));
	}
	}(path, e, data));
	});
	}
	});

	//////////////// parse message file ////////////////

	// by inspection of message format
	const TEXT_OFFSET_BASE = 245;
	const PHONE_OFFSET_BASE = 252;
	const CONTACT_HEADER_SIZE = 29;

	var assert = require('assert');
	function parseFile(path, data) {
	try {
	var headerSize = data.readUInt32BE(4);

	var textOffset = TEXT_OFFSET_BASE + headerSize;
	if (textOffset > data.length) {
	console.error('Text offset', textOffset, '> buffer size', data.length, 'in', path);
	return;
	}
	if (DEBUG_MODE) {
	console.log('Sent "' + readUCS2BEString(textOffset).str + '" to:');
	}
	var textSize = data.readUInt16BE(textOffset);

	function readUCS2BEString(offset) {
	var size = data.readUInt16BE(offset);
	var start = offset + 2, end = start + size - 2; // omit null terminator
	for (var i = start; i < end; i += 2) { // swap byte order >:(
	var swap = data[i];
	data[i] = data[i+1];
	data[i+1] = swap;
	}
	return { str: data.toString('ucs2', start, end), end: end };
	}

	var contacts = [], phoneOffset = PHONE_OFFSET_BASE + headerSize + textSize;
	do {
	var phone = readUCS2BEString(phoneOffset);
	if (data[phone.end + 2] === 12) var name = { str: '', end: phone.end };
	else var name = readUCS2BEString(phone.end + 3); // skip null and comma
	// ^ why is the comma only one byte?!
	contacts.push({ name: name.str, phone: tenDigitize(phone.str) });

	var contactSeparatorSize = data.readUInt16BE(name.end + 3);
	if ('23'.split('').indexOf(contactSeparatorSize) > -1) {
	console.error('Unusual contact separator size', contactSeparatorSize,
	'at byte', name.end + 3, 'in', path);
	}
	phoneOffset = name.end + contactSeparatorSize + CONTACT_HEADER_SIZE;
	} while (phoneOffset < data.length);

	function tenDigitize(phone) {
	if (phone.length === 11) {
	assert(phone[0] === '1' || phone[0] === '+',
	'"'+phone[0]+'" === "1" || "'+phone[0]+'" === "+"');
	phone = phone.slice(1);
	}
	else if (phone.length === 12) {
	assert.strictEqual(phone.slice(0,2), '+1');
	phone = phone.slice(2);
	}
	if (phone.length !== 10) {
	console.error('Unusual phone length: "' + phone + '" in', path);
	}
	return phone;
	}

	contacts.forEach(function(contact, i) {
	if (DEBUG_MODE) {
	console.log('\t"' + contact.phone + '", "' + contact.name + '"' +
	(i > 0
	? ' (' + (i+1) + ('nd rd'.split(' ')[i-1] || 'th') + ')'
	: ''));
	}
	else if (contact.name) {
	console.log(contact.phone, contact.name);
	}
	});
	}
	catch (e) {
	console.error('Error while parsing ' + path + ':');
	throw e;
	}
	}
	});
	#!/bin/bash

	OUTPUT_DIR=.

	while [[ $# -gt 0 ]]; do
	case "$1" in
	-o|--output-dir)
	OUTPUT_DIR="$2"; shift 2
	;;
	*)
	echo "unrecognized option $1" >&2; shift
	;;
	esac
	done

	mkdir -p "$OUTPUT_DIR"

	while read PHONE FIRSTNAME OTHERNAMES; do
	cat <<EOF > "$OUTPUT_DIR/$FIRSTNAME $OTHERNAMES $PHONE.vcf"
	BEGIN:VCARD
	VERSION:2.1
	N;CHARSET=UTF-8;ENCODING=8BIT:$OTHERNAMES;$FIRSTNAME
	TEL;PREF;CELL;VOICE;ENCODING=8BIT:$PHONE
	END:VCARD
	EOF
	done