Skip to content

Instantly share code, notes, and snippets.

@dweinstein
Last active August 29, 2015 14:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dweinstein/0ccadd68af64a096b53f to your computer and use it in GitHub Desktop.
Save dweinstein/0ccadd68af64a096b53f to your computer and use it in GitHub Desktop.
extract emails from apple app store descriptions itunes search emails
# Slurp every record in `results`, keep one record per trackId, sort by artistName,
# and print one compact JSON object per line.
jq -s '[.] | .[0]' results | jq -c 'unique_by(.trackId) | sort_by(.artistName) | .[]'
var q = require('hyperquest');
var timeout = require('hyperquest-timeout');
var pullSplit = require('pull-split');
var split = require('split');
var parse = require('JSONStream').parse;
var pull = require('pull-stream');
var toPull = require('stream-to-pull-stream');
var fs = require('fs');
var through = require('pull-through');
var paramap = require('pull-paramap');
var _ = require('lodash');
var assert = require('assert');
// Read stream over the local dictionary file; producer() wraps it as the pull-stream source.
var dict = fs.createReadStream('./ES.dic');
// Case-insensitive, global pattern for e-mail-like tokens in free text.
// The final label accepts 2-63 letters (63 is the DNS label limit) so that
// addresses on long TLDs such as ".technology" are captured whole instead of
// being cut off after four letters.
var EMAIL_REGEX = /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,63}/ig;
/**
 * Word producer: wraps the dictionary read stream as a pull-stream source
 * and pipes it through pullSplit().
 *
 * @returns {*} the piped pull-stream source.
 */
function producer() {
  var source = toPull.source(dict);
  return source.pipe(pullSplit());
}
// word consumer, queries apple.
/**
 * Build the iTunes software-search URL for one search term.
 *
 * The term is percent-encoded so that accented letters, spaces, and
 * reserved characters such as `&` or `/` cannot break the query string.
 *
 * @param {string} term - raw search term.
 * @returns {string} the search URL, limited to 200 software results.
 */
function urlForTerm(term) {
  return "https://itunes.apple.com/search?term=" + encodeURIComponent(term) + "&media=software&limit=200";
}
//var running = 0;
//setInterval(function () {
// console.log('%d queries running', running);
//}, 2000)
//.unref();
/**
 * Query the iTunes search API for `term` and collect every element of the
 * response's `results` array.
 *
 * `cb` is invoked exactly once: with the first error seen on either the
 * request stream or the JSON parser, or with the collected results when the
 * parser ends. Later errors are still logged but no longer re-invoke `cb`,
 * and an `end` that follows an error is ignored.
 *
 * @param {string} term - search term, passed to urlForTerm().
 * @param {function(Error=, Array=)} cb - node-style completion callback.
 */
function queryApple(term, cb) {
  var url = urlForTerm(term);
  var res = [];
  var settled = false;

  function fail(err) {
    console.error(err);
    if (settled) { return; }
    settled = true;
    cb(err);
  }

  function succeed() {
    if (settled) { return; }
    settled = true;
    cb(null, res);
  }

  // pipe() does not forward errors, so both the request stream and the
  // parser get their own handler.
  timeout(q(url), 60*1000)
    .on('error', fail)
    .pipe(split())
    .pipe(parse('results.*'))
    .on('error', fail)
    .on('data', function (data) {
      res.push(data);
    })
    .on('end', succeed);
}
/**
 * Through-stream that drops the first `count` items and forwards the rest.
 *
 * @param {number} count - number of leading items to discard.
 * @returns {*} the stream created by through().
 */
function skip(count) {
  var remaining = count;
  return through(function (item) {
    if (remaining > 0) {
      remaining -= 1;
      return;
    }
    this.queue(item);
  });
}
/**
 * Through-stream that discards items up to and including the first one
 * strictly equal to `word`, then forwards everything after it.
 *
 * @param {string} word - marker item; it is itself not forwarded.
 * @returns {*} the stream created by through().
 */
function skipUntilWord(word) {
  var markerSeen = false;
  return through(function (entry) {
    if (markerSeen) {
      this.queue(entry);
    } else if (entry === word) {
      // The marker is swallowed; forwarding starts with the next item.
      markerSeen = true;
    }
  });
}
/**
 * Through-stream that flattens a `{word, results}` item into one item per
 * result. Each result is tagged in place with the item's `word` before it
 * is queued.
 *
 * @returns {*} the stream created by through().
 */
function explode() {
  return through(function (batch) {
    var emit = this.queue.bind(this);
    _.each(batch.results, function (entry) {
      entry.word = batch.word;
      emit(entry);
    });
  });
}
/**
 * Find every substring of `str` that matches EMAIL_REGEX.
 *
 * @param {string} str - text to scan; anything but a string fails the assertion.
 * @returns {?Array<string>} the matches, or null when there are none.
 */
function getEmails(str) {
  assert(typeof str === 'string');
  return str.match(EMAIL_REGEX);
}
/**
 * Map stage that reduces every result of a `{word, results}` item to its
 * artistName, bundleId and trackId, plus an `emails` field holding whatever
 * getEmails() finds in the result's description (empty string when the
 * description is missing or falsy).
 *
 * @returns {*} the stream created by pull.map().
 */
function extractEmails() {
  var KEPT_FIELDS = ['artistName', 'bundleId', 'trackId'];

  function summarize(accum, item) {
    var summary = _.pick(item, KEPT_FIELDS);
    summary.emails = getEmails(item.description || '');
    accum.push(summary);
  }

  return pull.map(function (chunk) {
    var word = chunk.word;
    var results = _.transform(chunk.results, summarize);
    return { word: word, results: results };
  });
}
// Sink that prints every item as JSON (console.log '%j') and keeps reading
// until the source ends. `end === true` stops quietly; any other truthy
// `end` value is thrown.
var logJ = pull.Sink(function (read) {
  function onItem(end, data) {
    if (end === true) {
      return;
    }
    if (end) {
      throw end;
    }
    console.log('%j', data);
    read(null, onItem);
  }
  read(null, onItem);
});
/**
 * Word consumer: builds a paramap stage that looks up each word with
 * queryApple() and emits `{word, results}`.
 *
 * A failed lookup is logged and emitted as `{word: word, results: []}`
 * rather than as an empty value: later stages read `.word` and `.results`
 * from every item, so each item must be an object. The stream itself is
 * never failed by a single bad query.
 *
 * @param {number} [limit=5] - second argument handed to paramap().
 * @returns {*} the stream created by paramap().
 */
function consumer(limit) {
  limit = limit || 5;
  function one(word, cb) {
    queryApple(word, function (err, res) {
      if (err) {
        console.error(err);
        // Best effort: keep the pipeline running with no results for this word.
        return cb(null, {word: word, results: []});
      }
      cb(null, {word: word, results: res});
    });
  }
  return paramap(one, limit);
}
/**
 * Filter stage that keeps only items carrying a non-empty `emails` list.
 *
 * @returns {*} the stream created by pull.filter().
 */
function keepWithEmail() {
  function hasEmails(record) {
    if (!record || !record.emails) {
      return false;
    }
    return record.emails.length > 0;
  }
  return pull.filter(hasEmails);
}
/**
 * Filter stage that drops items whose `length` is not greater than zero
 * (for strings: empty lines).
 *
 * @returns {*} the stream created by pull.filter().
 */
function words() {
  function isNonEmpty(entry) {
    var size = entry.length;
    return size > 0;
  }
  return pull.filter(isNonEmpty);
}
// Entry point: read the last word recorded in ./results, then run the
// pipeline, skipping dictionary entries up to and including that word.
require('./lastWord')(__dirname+'/results', function onLastWord(err, lastWord) {
  if (err) {
    throw err;
  }
  // Stages are created in pipeline order before being wired together.
  var source = producer();
  var resume = skipUntilWord(lastWord);
  var nonEmpty = words();
  var lookup = consumer(12);
  var emails = extractEmails();
  var flatten = explode();
  var onlyWithEmail = keepWithEmail();
  var sink = logJ();
  pull(source, resume, nonEmpty, lookup, emails, flatten, onlyWithEmail, sink);
});
function lastWord(file, cb) {
cb = cb || function (err, res) {
console.log(err?err:res);
};
require('last-line')(file, function (err, res) {
if (err) {
cb(err);
} else {
var json = JSON.parse(res);
cb(null, json.word);
}
});
}
module.exports = lastWord;
#!/usr/bin/env sh
# Run the scraper, append its stdout to the `results` file, and pretty-print
# each record with jq.
node index.js | tee -a results | jq '.'
#!/usr/bin/env sh
# Slurp every record in `results`, keep one record per trackId, sort, and write
# one compact JSON object per line to `filtered`; then print the line counts of
# both files.
# Earlier variant, sorted by artistName instead of word:
#jq -s '[.] | .[0]' results | jq -c 'unique_by(.trackId) | sort_by(.artistName) | .[]' > filtered && wc -l results && wc -l filtered
jq -s '[.] | .[0]' results | jq -c 'unique_by(.trackId) | sort_by(.word) | .[]' > filtered && wc -l results && wc -l filtered
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment