Skip to content

Instantly share code, notes, and snippets.

@eddking
Last active February 13, 2019 19:02
Show Gist options
  • Save eddking/97c26012f5fdad603ef02c2b4bc5c4bb to your computer and use it in GitHub Desktop.
Save eddking/97c26012f5fdad603ef02c2b4bc5c4bb to your computer and use it in GitHub Desktop.
export pull request & review data from github via the http api
const request = require('request-promise-native');
const parseLinkHeader = require('parse-link-header');
const fs = require('fs');
/*
 * Idempotent exporter: downloads pull-request & review data for one repository.
 * Set `repo` and `auth` below, then run and wait — it is throttled, so it is
 * slow but reliable.
 *
 * Safe to restart after a crash or an abuse-detection trip: only data that is
 * still missing gets downloaded, and any requests that were in flight when the
 * process died are retried on the next run.
 *
 * Re-running with a data.json already present fetches only new pull requests,
 * or pulls whose updated_at changed since the last run.
 *
 * Everything is written to one file ('data.json'): pull-request data lives
 * under the 'pulls' key; the 'queue' and 'pending' keys record recovery state.
 */
const repo = 'YieldifyLabs/khaleesi-tag';
const auth = {
  'user': 'usernameHere', // Your github username
  'pass': 'githubPersonalAuthTokenHere' // A personal auth token you created
};
const dataFile = './data.json';
// Requests run in parallel, but at most one starts per WAIT_BETWEEN_REQ ms —
// any faster and GitHub's abuse detection kicks in (~200ms is about the floor).
const WAIT_BETWEEN_REQ = 250;
// Shared options for every API call; the full response object is needed so
// rate-limit and Link headers can be inspected.
const opts = {
  auth,
  headers: {
    'User-Agent': 'curl/7.43.0' // Why not?
  },
  resolveWithFullResponse: true
};
// In-memory state; replaced wholesale by readData() when data.json exists.
let data = {
  pulls: {},
  queue: [],
  pending: {}
};
/**
 * Add a work item to data.queue unless it is already queued or done.
 *
 * An item's identity is its JSON serialization. In data.pending:
 *   false     => queued (or already completed) — skip re-adding it;
 *   true      => was in flight when a previous run died — re-queue it;
 *   undefined => never seen — queue it.
 *
 * BUG FIX: `id` was assigned without a declaration, creating an implicit
 * global (a ReferenceError under strict mode / ES modules).
 */
function enqueue(obj) {
  const id = JSON.stringify(obj);
  if (data.pending[id] !== undefined && !data.pending[id]) {
    return;
  }
  data.queue.push(obj);
  data.pending[id] = false;
}
// Throttle chain: doApiRequest appends one slot per request so successive
// requests are spaced out over time.
let waitpromise = Promise.resolve();

/**
 * Resolve after `delay` milliseconds (defaults to the global request spacing).
 */
function wait(delay = WAIT_BETWEEN_REQ) {
  return new Promise((done) => setTimeout(done, delay));
}
// Gate that suspends all callers while we sit out a rate-limit window.
let ratelimitPromise = Promise.resolve();

/**
 * GET `url` against the GitHub API, throttled.
 *
 * Joins a promise chain (`waitpromise`) so that at most one request starts
 * per WAIT_BETWEEN_REQ ms even though callers run concurrently, and also
 * waits on `ratelimitPromise` in case a previous response showed the rate
 * limit nearly exhausted.
 *
 * @param {string} url - fully-formed API URL
 * @returns the full response object (status, headers, body)
 */
async function doApiRequest(url) {
  // Wait behind every previously-scheduled slot, then extend the chain by
  // one slot so the next caller waits behind us.
  const tmp = waitpromise;
  waitpromise = waitpromise.then(wait);
  await tmp;
  await ratelimitPromise;
  const result = await request.get(url, opts);
  const rateLimitRemaining = parseInt(result.headers['x-ratelimit-remaining'], 10);
  if (rateLimitRemaining < 10) {
    console.log('About to hit rate limit, stopping for now .....');
    writeData(); // persist progress before the long pause
    // BUG FIX: message previously read "reset be at".
    console.log('The next rate limit reset will be at:');
    const nextReset = new Date(parseInt(result.headers['x-ratelimit-reset'], 10) * 1000);
    console.log(nextReset);
    // Sleep until 5s past the reset; every in-flight caller awaits this too.
    ratelimitPromise = wait(nextReset - new Date().getTime() + 5000);
    await ratelimitPromise;
  }
  if (rateLimitRemaining % 50 === 0) {
    console.log(`----- Rate limit remaining: ${rateLimitRemaining}`);
  }
  return result;
}
/**
 * Read the response's Link header and invoke `callback(pageNumber)` once for
 * every page after `currentPage`, up to the advertised last page. Does
 * nothing when there is no Link header (single-page result).
 */
function forEachRemainingPage(result, currentPage, callback) {
  const links = parseLinkHeader(result.headers.link);
  if (links === null) {
    return;
  }
  // Fall back to currentPage (empty loop) when no "last" relation is present.
  const lastPage = (links.last || {}).page || currentPage;
  const stop = parseInt(lastPage, 10);
  for (let pageNumber = currentPage + 1; pageNumber <= stop; pageNumber += 1) {
    callback(pageNumber);
  }
}
// Default column width for the aligned console log output.
const columnSpacing = 15;

/**
 * Right-pad `string` with spaces to at least `chars` characters; strings
 * already longer than `chars` are returned unchanged.
 *
 * Replaces the original hand-rolled append loop with the equivalent
 * stdlib String.prototype.padEnd.
 */
function pad(string, chars = columnSpacing) {
  return string.padEnd(chars);
}
/**
 * Fetch one page of the repo's closed pull-request list. On the first page,
 * queue fetches for all remaining pages; for every listed pull that is new
 * or whose updated_at differs from the stored copy, queue a detail fetch.
 *
 * @param {number} [page] - 1-based page number; defaults to 1
 */
async function readPulls(page) {
  page = page || 1;
  const url = `https://api.github.com/repos/${repo}/pulls?state=closed&page=${page}`;
  // FIX: doApiRequest takes only a url; the stray `opts` argument is removed.
  const result = await doApiRequest(url);
  console.log(progress() + pad('[OK]', 5) + pad('[List]') + `[${repo}/pulls]` + (page > 1 ? `[Page:${page}]` : ''));
  if (page === 1) { // If we're on the starting page, queue requests for the other pages
    forEachRemainingPage(result, page, (i) => {
      enqueue({
        type: 'pulls',
        page: i,
      });
    });
  }
  const body = JSON.parse(result.body);
  for (const pull of body) {
    const number = pull.number;
    const prev = data.pulls[number];
    // Only re-fetch pulls that are new or changed since the stored snapshot.
    // (The original also tracked an `anyDiff` flag that was never read.)
    const isChanged = prev === undefined || prev.updated_at !== pull.updated_at;
    if (!isChanged) {
      continue;
    }
    enqueue({
      type: 'pull',
      pull: number,
    });
  }
}
/**
 * Download the full record for one pull request and store a trimmed snapshot
 * under data.pulls[number]; then queue page-1 fetches for its reviews, diff
 * comments and conversation comments.
 */
async function readPull(number) {
const url = `https://api.github.com/repos/${repo}/pulls/${number}`;
// NOTE(review): doApiRequest only declares a `url` parameter; the extra
// `opts` argument here is ignored.
const result = await doApiRequest(url, opts);
const pull = JSON.parse(result.body);
console.log(progress() + pad('[OK]', 5) + pad('[Pull]') + `[${repo}/pulls/${number}]`);
// Keep only the fields we analyze; the three arrays at the bottom are filled
// in later by readReviews / readComments / readIssueComments.
data.pulls[number] = {
number: pull.number,
state: pull.state,
title: pull.title,
user: pull.user.login,
body: pull.body,
commits: pull.commits,
additions: pull.additions,
deletions: pull.deletions,
changed_files: pull.changed_files,
created_at: pull.created_at,
updated_at: pull.updated_at,
// NOTE(review): pushed_at does not appear to be a pull-request field in the
// GitHub API response — likely always undefined here; confirm.
pushed_at: pull.pushed_at,
closed_at: pull.closed_at,
merged_at: pull.merged_at,
merged: pull.merged,
// merged_by is null when the PR was closed without merging.
merged_by: (pull.merged_by || {}).login,
merge_commit_sha: pull.merge_commit_sha,
head: {
label: pull.head.label,
ref: pull.head.ref,
sha: pull.head.sha
},
base: {
label: pull.base.label,
ref: pull.base.ref,
sha: pull.base.sha
},
comments: pull.comments,
review_comments: pull.review_comments,
reviews: [],
diffComments: [],
prComments: [],
};
// Queue the per-pull follow-ups, each starting at page 1.
// NOTE(review): the 'comments'/'prcomments' type names are crossed relative
// to the handlers go() dispatches them to — verify against go()'s switch.
enqueue({
type: 'reviews',
pull: number,
page: 1
});
enqueue({
type: 'comments',
pull: number,
page: 1
});
enqueue({
type: 'prcomments',
pull: number,
page: 1
});
}
/**
 * Fetch one page of review ("diff") comments for a pull request and merge
 * them into data.pulls[pull].diffComments, keeping the list ordered by
 * creation time.
 */
async function readComments(pull, page) {
  const url = `https://api.github.com/repos/${repo}/pulls/${pull}/comments?page=${page}`;
  const result = await doApiRequest(url, opts);
  console.log(progress() + pad('[OK]', 5) + pad('[DiffComments]') + `[${repo}/pulls/${pull}]` + (page > 1 ? `[Page:${page}]` : ''));
  // Only the first page inspects the Link header and queues the rest.
  if (page === 1) {
    forEachRemainingPage(result, page, (nextPage) => {
      enqueue({
        type: 'comments',
        pull: pull,
        page: nextPage,
      });
    });
  }
  const target = data.pulls[pull].diffComments;
  for (const comment of JSON.parse(result.body)) {
    target.push({
      pull_request_review_id: comment.pull_request_review_id,
      user: comment.user.login,
      body: comment.body,
      created_at: comment.created_at,
      updated_at: comment.updated_at
    });
  }
  // Pages download concurrently, so restore chronological order each time.
  target.sort((a, b) => new Date(a.created_at) - new Date(b.created_at));
}
/**
 * Fetch one page of issue ("conversation") comments for a pull request and
 * append them to data.pulls[pull].prComments, re-sorting by creation time.
 */
async function readIssueComments(pull, page) {
  const url = `https://api.github.com/repos/${repo}/issues/${pull}/comments?page=${page}`;
  const result = await doApiRequest(url, opts);
  const pageTag = page > 1 ? `[Page:${page}]` : '';
  console.log(progress() + pad('[OK]', 5) + pad('[PrComments]') + `[${repo}/pulls/${pull}]` + pageTag);
  if (page === 1) {
    // First page discovers the remaining pages via the Link header.
    forEachRemainingPage(result, page, (extraPage) => {
      enqueue({
        type: 'prcomments',
        pull: pull,
        page: extraPage
      });
    });
  }
  const target = data.pulls[pull].prComments;
  for (const { user, body, created_at, updated_at } of JSON.parse(result.body)) {
    target.push({
      user: user.login,
      body,
      created_at,
      updated_at
    });
  }
  // Pages arrive out of order, so keep the list sorted by created_at.
  target.sort((first, second) => {
    const firstDate = new Date(first.created_at);
    const secondDate = new Date(second.created_at);
    if (firstDate > secondDate) {
      return 1;
    }
    if (secondDate > firstDate) {
      return -1;
    }
    return 0;
  });
}
/**
 * Fetch one page of reviews for a pull request and append them to
 * data.pulls[pull].reviews, keeping the list ordered by submission time.
 */
async function readReviews(pull, page) {
  const url = `https://api.github.com/repos/${repo}/pulls/${pull}/reviews?page=${page}`;
  const result = await doApiRequest(url, opts);
  const suffix = page > 1 ? `[Page:${page}]` : '';
  console.log(progress() + pad('[OK]', 5) + pad('[Reviews]') + `[${repo}/pulls/${pull}]` + suffix);
  // Pagination is discovered once, from the first page's Link header.
  if (page === 1) {
    forEachRemainingPage(result, page, (p) => enqueue({
      type: 'reviews',
      pull: pull,
      page: p
    }));
  }
  JSON.parse(result.body).forEach((review) => {
    data.pulls[pull].reviews.push({
      id: review.id,
      user: review.user.login,
      body: review.body,
      state: review.state,
      submitted_at: review.submitted_at
    });
  });
  // Concurrent page downloads can interleave; re-sort by submitted_at.
  data.pulls[pull].reviews.sort(
    (a, b) => new Date(a.submitted_at).getTime() - new Date(b.submitted_at).getTime()
  );
}
// Counters shared with go(): completed items and items currently in flight.
let totalProcessed = 0;
let pending = 0;

/**
 * Render a "[done/total]" progress column padded to width 14, where total
 * counts completed + queued + in-flight items.
 */
function progress() {
  const total = totalProcessed + data.queue.length + pending;
  return pad(`[${totalProcessed}/${total}]`, 14);
}
// Counts processed items mod 10 to trigger a periodic checkpoint to disk.
let processedCounter = 0;

/**
 * Worker loop: drains data.queue, dispatching each queued item to the handler
 * for its type. When the queue holds more work, spawns one parallel sibling
 * worker (each worker may spawn one more, so concurrency grows with backlog).
 * Marks items pending=true while in flight and false when done, checkpointing
 * state to disk every 10 completions.
 *
 * @throws rethrows any handler error after logging the failing item id
 */
async function go() {
  let fork = null;
  // BUG FIX: `id` was const-declared inside the while body but referenced in
  // the catch block, which would itself throw a ReferenceError on failure.
  let id = null;
  try {
    while (data.queue.length !== 0) {
      const item = data.queue.pop();
      id = JSON.stringify(item);
      data.pending[id] = true;
      pending = pending + 1;
      if (fork === null && data.queue.length !== 0) {
        // lets do some work in parallel
        fork = go();
      }
      switch (item.type) {
        case 'pulls':
          await readPulls(item.page);
          break;
        case 'pull':
          await readPull(item.pull);
          break;
        case 'reviews':
          await readReviews(item.pull, item.page);
          break;
        // BUG FIX: the 'comments'/'prcomments' cases were crossed. Pagination
        // in readComments enqueues type 'comments' and readIssueComments
        // enqueues type 'prcomments', so each type must dispatch back to the
        // handler that enqueued it; previously page 2+ of each comment
        // endpoint was fetched by the wrong handler.
        case 'comments':
          await readComments(item.pull, item.page);
          break;
        case 'prcomments':
          await readIssueComments(item.pull, item.page);
          break;
        default:
          throw new Error('unknown: ' + id);
      }
      data.pending[id] = false;
      pending = pending - 1;
      processedCounter = (processedCounter + 1) % 10;
      if (processedCounter === 0) { // Write to disk after every 10 requests
        writeData();
      }
      totalProcessed = totalProcessed + 1;
    }
  } catch (errr) {
    console.log(pad('', 10) + pad('[FAIL]', 5) + id);
    throw errr;
  } finally {
    // Never orphan the sibling worker, even when rethrowing.
    if (fork !== null) {
      await fork;
    }
  }
}
/**
 * Load persisted state from dataFile into the module-level `data`; when the
 * file does not exist, the in-memory defaults are kept.
 */
function readData() {
  if (!fs.existsSync(dataFile)) {
    return;
  }
  const raw = fs.readFileSync(dataFile, 'utf8');
  data = JSON.parse(raw);
}
/**
 * Persist the whole state object (pulls + queue + pending) to dataFile,
 * overwriting any previous checkpoint.
 */
function writeData() {
  const serialized = JSON.stringify(data);
  fs.writeFileSync(dataFile, serialized);
}
/**
 * Entry point: restore persisted state, re-queue any requests that were in
 * flight when a previous run died, otherwise start from the pull list, then
 * drain the queue. State is always written back on exit, even on failure.
 */
async function main() {
  readData();
  try {
    // Re-enqueue any pending requests from the state, i guess they
    // need to be retried
    for (const key in data.pending) {
      if (data.pending[key]) {
        enqueue(JSON.parse(key));
      }
    }
    // If there are no pending requests from a crash, read the list of pull requests again
    // Only the changed pull requests will be re-requested
    if (data.queue.length === 0) {
      data.pending = {}; // wipe out pending, so we can re-request things we need to
      enqueue({
        type: 'pulls',
        page: 1
      });
    }
    await go();
  } catch (err) {
    console.log(err);
    throw err;
  } finally {
    writeData();
  }
}
// BUG FIX: don't leave the top-level promise floating — an error already
// logged above would otherwise surface as an unhandled rejection. Flag
// failure via the exit code instead.
main().catch(() => {
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment