Created
October 7, 2013 01:51
-
-
Save laughinghan/6861452 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env node
// Nothing is piped in when stdin is a terminal, so there is no file list to
// read: print the intended pipeline and stop.
if (process.stdin.isTTY) {
  console.log('Usage: Inside the directory with all the sent text message files:');
  console.log();
  console.log('ls | path/to/contacts_extractor.js | sort | uniq | ./mkvcards -o /path/to/output_dir/');
  process.exit();
}

// Debug mode prints every recipient together with the message text. It is
// enabled by passing -d or --debug anywhere after the script name.
const DEBUG_MODE = process.argv.slice(2).some(function(arg) {
  return arg === '-d' || arg === '--debug';
});
//////////////// read stdin ////////////////
// Accumulate everything arriving on stdin into one UTF-8 string. The 'end'
// handler registered below splits it into one file path per line.
process.stdin.resume();
process.stdin.setEncoding('utf8');
var data = '';
process.stdin.on('data', function(chunk) { data += chunk; });
process.stdin.on('end', function() {
// One file path per line. Blank lines are dropped so that empty input is a
// no-op instead of an attempt to read the file named ''.
var lines = data.trim().split('\n').filter(function(line) { return line !== ''; });

//////////////// read max number of files at a time ////////////////
var fs = require('fs');
var q = [];        // paths waiting for a file descriptor to free up
var inFlight = 0;  // reads started whose callback has not run yet

// Start reading `path`. If the process is out of file descriptors (EMFILE)
// while other reads are still running, park the path in `q`; each read that
// finishes successfully then starts one parked path. Any other error, or an
// EMFILE with nothing left in flight to free a descriptor, is thrown.
function readOrQueue(path) {
  inFlight += 1;
  fs.readFile(path, function(e, fileData) {
    inFlight -= 1;
    if (e && e.code === 'EMFILE' && inFlight > 0) {
      q.push(path);
      return;
    }
    if (e) throw e;
    parseFile(path, fileData);
    if (q.length) readOrQueue(q.shift());
  });
}

lines.forEach(function(path) {
  // Once paths are waiting, later ones line up behind them.
  if (q.length) q.push(path);
  else readOrQueue(path);
});
//////////////// parse message file ////////////////
// by inspection of message format

// Added to the header size to locate the size-prefixed message text.
const TEXT_OFFSET_BASE = 245;
// Added to header size + text size to locate the first recipient's phone string.
const PHONE_OFFSET_BASE = 252;
// Added to the end of a contact's name + its separator size to locate the
// next recipient's phone string.
const CONTACT_HEADER_SIZE = 29;
var assert = require('assert');
/**
 * Parse one sent-message file and print the recipients it lists.
 *
 * Layout as read here (offsets were found by inspection of the format):
 *   - a big-endian uint32 at byte 4 holds the header size;
 *   - the message text is a size-prefixed big-endian UCS-2 string located at
 *     TEXT_OFFSET_BASE + headerSize;
 *   - the first recipient's phone string is located at
 *     PHONE_OFFSET_BASE + headerSize + textSize and may be followed by a name.
 *
 * In debug mode the message text and every recipient are printed; otherwise
 * only recipients that have a name are printed, as "<phone> <name>".
 *
 * @param {string} path  File name, used only in diagnostics.
 * @param {Buffer} data  Raw file contents; the buffer is not modified.
 * @throws Rethrows whatever error occurs while parsing, after logging `path`.
 */
function parseFile(path, data) {
  // Decode the size-prefixed big-endian UCS-2 string at `offset`.
  // Returns the text and the offset of its null terminator. Code units are
  // read one at a time so the caller's buffer never has to be byte-swapped.
  function readUCS2BEString(offset) {
    var size = data.readUInt16BE(offset);
    var start = offset + 2, end = start + size - 2; // omit null terminator
    var limit = Math.min(end, data.length); // only decode bytes that exist
    var str = '';
    for (var i = start; i + 1 < limit; i += 2) {
      str += String.fromCharCode(data.readUInt16BE(i));
    }
    return { str: str, end: end };
  }

  // Strip a leading "1", "+" or "+1" so the number has ten characters;
  // anything that still is not ten characters long is reported.
  function tenDigitize(phone) {
    if (phone.length === 11) {
      assert(phone[0] === '1' || phone[0] === '+',
        '"'+phone[0]+'" === "1" || "'+phone[0]+'" === "+"');
      phone = phone.slice(1);
    }
    else if (phone.length === 12) {
      assert.strictEqual(phone.slice(0,2), '+1');
      phone = phone.slice(2);
    }
    if (phone.length !== 10) {
      console.error('Unusual phone length: "' + phone + '" in', path);
    }
    return phone;
  }

  // English ordinal suffix for a positive integer (1st, 2nd, 3rd, 4th, 11th, 21st, ...).
  function ordinalSuffix(n) {
    var lastTwo = n % 100;
    if (lastTwo >= 11 && lastTwo <= 13) return 'th';
    switch (n % 10) {
      case 1: return 'st';
      case 2: return 'nd';
      case 3: return 'rd';
      default: return 'th';
    }
  }

  try {
    var headerSize = data.readUInt32BE(4);
    var textOffset = TEXT_OFFSET_BASE + headerSize;
    if (textOffset > data.length) {
      console.error('Text offset', textOffset, '> buffer size', data.length, 'in', path);
      return;
    }
    if (DEBUG_MODE) {
      console.log('Sent "' + readUCS2BEString(textOffset).str + '" to:');
    }
    var textSize = data.readUInt16BE(textOffset);

    var contacts = [];
    var phoneOffset = PHONE_OFFSET_BASE + headerSize + textSize;
    do {
      var phone = readUCS2BEString(phoneOffset);
      var name;
      // The byte after the phone's null terminator is 12 when no name follows.
      if (data[phone.end + 2] === 12) name = { str: '', end: phone.end };
      else name = readUCS2BEString(phone.end + 3); // skip null and comma
      // ^ why is the comma only one byte?!
      contacts.push({ name: name.str, phone: tenDigitize(phone.str) });

      var contactSeparatorSize = data.readUInt16BE(name.end + 3);
      // NOTE(review): indexOf uses strict equality, and the size is a number
      // while the candidates are the strings '2' and '3', so this warning can
      // never fire. Left as is until the intended values/polarity are confirmed.
      if ('23'.split('').indexOf(contactSeparatorSize) > -1) {
        console.error('Unusual contact separator size', contactSeparatorSize,
          'at byte', name.end + 3, 'in', path);
      }
      phoneOffset = name.end + contactSeparatorSize + CONTACT_HEADER_SIZE;
    } while (phoneOffset < data.length);

    contacts.forEach(function(contact, i) {
      if (DEBUG_MODE) {
        // Recipients after the first are tagged with their position, e.g. "(2nd)".
        console.log('\t"' + contact.phone + '", "' + contact.name + '"' +
          (i > 0 ? ' (' + (i+1) + ordinalSuffix(i+1) + ')' : ''));
      }
      else if (contact.name) {
        console.log(contact.phone, contact.name);
      }
    });
  }
  catch (e) {
    console.error('Error while parsing ' + path + ':');
    throw e;
  }
}
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Write one vCard 2.1 file per input line. Each line read from stdin is
# "PHONE FIRSTNAME OTHERNAMES...", where everything after the first name is
# treated as the remaining names.
#
# Usage: ... | ./mkvcards [-o|--output-dir DIR]
#   DIR defaults to the current directory and is created if it does not exist.

OUTPUT_DIR=.
while [[ $# -gt 0 ]]; do
  case "$1" in
    -o|--output-dir)
      # `shift 2` fails without shifting anything when the value is missing,
      # which would leave this loop spinning forever on the same option.
      if [[ $# -lt 2 ]]; then
        echo "option $1 requires a directory argument" >&2
        exit 1
      fi
      OUTPUT_DIR="$2"; shift 2
      ;;
    *)
      echo "unrecognized option $1" >&2; shift
      ;;
  esac
done

mkdir -p "$OUTPUT_DIR"

# -r keeps backslashes in names literal instead of treating them as escapes.
while read -r PHONE FIRSTNAME OTHERNAMES; do
  # Skip blank lines rather than writing an empty card.
  [[ -n "$PHONE" ]] || continue

  # A slash in a name would be taken as a directory separator in the path.
  FILENAME="$FIRSTNAME $OTHERNAMES $PHONE"
  FILENAME="${FILENAME//\//_}"

  cat <<EOF > "$OUTPUT_DIR/$FILENAME.vcf"
BEGIN:VCARD
VERSION:2.1
N;CHARSET=UTF-8;ENCODING=8BIT:$OTHERNAMES;$FIRSTNAME
TEL;PREF;CELL;VOICE;ENCODING=8BIT:$PHONE
END:VCARD
EOF
done
I really should have written the contacts extractor in Python, but I wanted to play with Node.js and I knew that JS has native support for UTF-16. Little did I know it was only little-endian, and to read big-endian UTF-16 you have to manually go through and swap the fucking byte order. Seriously. Plus, Python is an actual good language.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Inspired by http://developer.nokia.com/Community/Discussion/showthread.php/204463-Raw-text-message-file-format-(from-*-nbf)-description, this was written to extract contacts from the Sent text messages on my old Nokia C3-00. (I somehow deleted all the contacts, and received text messages don't have the name recorded, only the phone number, for some reason.)