Last active
August 29, 2015 14:17
-
-
Save dweinstein/0ccadd68af64a096b53f to your computer and use it in GitHub Desktop.
extract emails from apple app store descriptions itunes search emails
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
jq -s '[.] | .[0]' results | jq -c 'unique_by(.trackId) | sort_by(.artistName) | .[]' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var q = require('hyperquest'); | |
var timeout = require('hyperquest-timeout'); | |
var pullSplit = require('pull-split'); | |
var split = require('split'); | |
var parse = require('JSONStream').parse; | |
var pull = require('pull-stream'); | |
var toPull = require('stream-to-pull-stream'); | |
var fs = require('fs'); | |
var through = require('pull-through'); | |
var paramap = require('pull-paramap'); | |
var _ = require('lodash'); | |
var assert = require('assert'); | |
// Read stream over the dictionary file that feeds the search terms.
// NOTE(review): presumably one word per line, since producer() runs it through pull-split — confirm.
var dict = fs.createReadStream('./ES.dic');

// Matches e-mail addresses anywhere in a string (global, case-insensitive).
// The top-level domain is matched as "two or more letters": capping it at
// four letters truncated addresses on longer TLDs (e.g. "x@y.support" was
// captured as "x@y.supp").
var EMAIL_REGEX = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}/ig;
/**
 * Word producer: wraps the dictionary read stream as a pull-stream source
 * and pipes it through pull-split.
 *
 * @returns {*} the stream returned by piping the source through pullSplit().
 */
function producer() {
  var source = toPull.source(dict);
  return source.pipe(pullSplit());
}
// word consumer, queries apple. | |
/**
 * Builds the iTunes Search API URL for a software search on `term`.
 *
 * The term is percent-encoded so that accented characters, spaces and
 * reserved characters such as `&` or `#` cannot corrupt the query string.
 * Plain ASCII words produce exactly the same URL as before.
 *
 * @param {string} term - search term (a dictionary word).
 * @returns {string} the search URL, limited to 200 software results.
 */
function urlForTerm(term) {
  return "https://itunes.apple.com/search?term=" + encodeURIComponent(term) + "&media=software&limit=200";
}
//var running = 0; | |
//setInterval(function () { | |
// console.log('%d queries running', running); | |
//}, 2000) | |
//.unref(); | |
/**
 * Queries the iTunes search endpoint for `term` and collects every element
 * of the response's `results` array.
 *
 * The request is wrapped with a 60 second timeout. Errors from either the
 * request stream or the JSON parser are logged and reported to `cb`.
 * `cb` is invoked at most once: an error on the request followed by an
 * error or `end` on the parser no longer calls it a second time.
 *
 * @param {string} term - search term.
 * @param {function(Error=, Array=)} cb - called with `(err)` on failure or
 *   `(null, results)` once the parser stream ends.
 */
function queryApple(term, cb) {
  var url = urlForTerm(term);
  var res = [];
  var finished = false;

  function fail(err) {
    // Every error is still logged, but only the first outcome reaches cb.
    console.error(err);
    if (finished) { return; }
    finished = true;
    cb(err);
  }

  function succeed() {
    if (finished) { return; }
    finished = true;
    cb(null, res);
  }

  timeout(q(url), 60*1000)
    .on('error', fail)
    .pipe(split())
    .pipe(parse('results.*'))
    .on('error', fail)
    .on('data', function (data) {
      res.push(data);
    })
    .on('end', succeed);
}
/**
 * Through-stream that drops the first `count` items and forwards the rest.
 *
 * @param {number} count - how many leading items to discard.
 * @returns {*} the stream created by pull-through.
 */
function skip(count) {
  var remaining = count;
  return through(function (item) {
    if (remaining > 0) {
      remaining -= 1;
      return;
    }
    this.queue(item);
  });
}
/**
 * Through-stream that discards items up to and including the first item
 * strictly equal to `word`, then forwards everything after it. Used to
 * resume a run after the last word already processed.
 *
 * When `word` is null or undefined there is nothing to resume from, so all
 * items are forwarded (previously every item was silently dropped).
 *
 * @param {string} [word] - the last word already handled.
 * @returns {*} the stream created by pull-through.
 */
function skipUntilWord(word) {
  // Named `skipping` so it does not shadow the sibling skip() function.
  var skipping = word != null;
  return through(function (data) {
    if (!skipping) {
      this.queue(data);
    }
    if (data === word) {
      skipping = false;
    }
  });
}
/**
 * Through-stream that flattens a `{word, results}` chunk into one item per
 * result. Each result object is tagged in place with the chunk's `word`
 * before being queued.
 *
 * @returns {*} the stream created by pull-through.
 */
function explode() {
  return through(function (chunk) {
    var stream = this;
    _.each(chunk.results, function (entry) {
      entry.word = chunk.word;
      stream.queue(entry);
    });
  });
}
/**
 * Finds every e-mail address in `str` using EMAIL_REGEX.
 *
 * @param {string} str - text to scan; anything but a string fails the assertion.
 * @returns {?Array<string>} the matches, or null when there are none.
 */
function getEmails(str) {
  assert(typeof str === 'string');
  return str.match(EMAIL_REGEX);
}
/**
 * Map stage that reduces each `{word, results}` chunk to a summary: for every
 * result it keeps `artistName`, `bundleId` and `trackId`, and adds `emails`
 * with the addresses found in the result's description (empty string when
 * the description is missing).
 *
 * @returns {*} the stream created by pull.map.
 */
function extractEmails() {
  // Appends the trimmed-down record for one search result to the accumulator.
  function summarize(accum, item) {
    var summary = _.pick(
      item,
      ['artistName', 'bundleId', 'trackId']
    );
    summary.emails = getEmails(item.description || '');
    accum.push(summary);
  }

  return pull.map(function (chunk) {
    var word = chunk.word;
    var results = _.transform(chunk.results, summarize);
    return {
      word: word,
      results: results
    };
  });
}
/**
 * Sink that prints every item as JSON on its own line via console.log('%j').
 * Stops quietly when the source ends (`end === true`) and throws any other
 * truthy end value as an error.
 */
var logJ = pull.Sink(function (read) {
  function onItem(end, data) {
    if(end === true) { return; }
    if(end) { throw end; }
    console.log('%j', data);
    read(null, onItem);
  }

  read(null, onItem);
});
/**
 * Word consumer: a pull-paramap stage that queries Apple for each incoming
 * word with up to `limit` queries in flight.
 *
 * A failed query is logged and deliberately does not abort the stream.
 * It now yields `{word, results: []}` instead of `undefined`, so later
 * stages that read `chunk.word` / `chunk.results` keep working and simply
 * emit nothing for that word.
 *
 * @param {number} [limit=5] - width passed to pull-paramap.
 * @returns {*} the stream created by pull-paramap.
 */
function consumer(limit) {
  limit = limit || 5;
  function one(word, cb) {
    queryApple(word, function (err, res) {
      if (err) {
        console.error(err);
        // Best-effort: swallow the error but keep the chunk shape intact.
        return cb(null, {word: word, results: []});
      }
      cb(null, {word: word, results: res});
    });
  }
  return paramap(one, limit);
}
/**
 * Filter stage that keeps only items carrying a non-empty `emails` list.
 *
 * @returns {*} the stream created by pull.filter.
 */
function keepWithEmail() {
  function hasEmails(data) {
    if (!data || !data.emails) {
      return false;
    }
    return data.emails.length > 0;
  }
  return pull.filter(hasEmails);
}
/**
 * Filter stage that drops empty items (those whose `length` is not above 0).
 *
 * @returns {*} the stream created by pull.filter.
 */
function words() {
  function isNonEmpty(data) {
    return data.length > 0;
  }
  return pull.filter(isNonEmpty);
}
// Entry point: look up the word on the last line of ./results, then run the
// pipeline from the dictionary entry that follows it.
require('./lastWord')(__dirname+'/results', function onLastWord(err, lastWord) {
  if (err) { throw err; }

  // Stages are created in pipeline order, exactly as they are wired together.
  var source = producer();
  var resume = skipUntilWord(lastWord);
  var nonEmpty = words();
  var search = consumer(12);
  var summarize = extractEmails();
  var flatten = explode();
  var withEmail = keepWithEmail();
  var sink = logJ();

  pull(
    source,
    resume,
    nonEmpty,
    search,
    summarize,
    flatten,
    withEmail,
    sink
  );
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Reads the last line of `file` (via the `last-line` module), parses it as
 * JSON and reports its `word` property.
 *
 * A last line that is empty or not valid JSON is now reported through `cb`
 * as an error instead of throwing from inside the asynchronous callback.
 *
 * @param {string} file - path of the results file.
 * @param {function(Error=, *=)} [cb] - called with `(err)` or `(null, word)`;
 *   defaults to logging whichever of the two is set.
 */
function lastWord(file, cb) {
  cb = cb || function (err, res) {
    console.log(err?err:res);
  };
  require('last-line')(file, function (err, res) {
    if (err) {
      return cb(err);
    }
    var word;
    try {
      word = JSON.parse(res).word;
    } catch (parseErr) {
      return cb(parseErr);
    }
    // Called outside the try so an exception thrown by cb itself is never
    // mistaken for a parse failure and fed back into cb.
    cb(null, word);
  });
}
module.exports = lastWord; | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env sh
# Run the scraper, append each JSON line it prints to ./results,
# and pretty-print the same output with jq.
node index.js | tee -a results | jq .
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env sh
# Slurp ./results into one array, de-duplicate by trackId, order by search
# word, and write one compact JSON object per line to ./filtered; then print
# the line counts of both files.
# Alternative ordering by artist: use sort_by(.artistName) in place of sort_by(.word).
jq -s '.' results | jq -c 'unique_by(.trackId) | sort_by(.word) | .[]' > filtered && wc -l results && wc -l filtered
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment