Skip to content

Instantly share code, notes, and snippets.

@tcrosen
Last active December 11, 2016 01:24
Show Gist options
  • Save tcrosen/d5c3729ffe6c3447f5b2 to your computer and use it in GitHub Desktop.
Save tcrosen/d5c3729ffe6c3447f5b2 to your computer and use it in GitHub Desktop.
Kimono Example
var moment = require('moment-range');
var request = require('request');
var _ = require('lodash');
var url = require('url');
// Date window for the scrape. The Date constructor takes a 0-indexed month,
// so (2014, 9, 8) is October 8, 2014.
var start = new Date(2014, 9, 8);
// Upper bound of the window: the moment the script is run.
var end = new Date();
// Range covering start..end. In the visible code it is only referenced by the
// commented-out `range.by('days', ...)` loop below.
var range = moment().range(start, end);
// Gets boxscore links from scores page
// The target date is appended to this URL as MM/DD/YYYY by collectBoxscoreUrls.
// NOTE(review): the API key is embedded in the URL literal — consider loading it from configuration.
var scoresApi = 'https://www.kimonolabs.com/api/czde4e6c?apikey=abe6b22285a4d123b8d3ed875ac78331&date=';
// Gets data from PBP report
// The report filename (kimpath4, e.g. PL020004.HTM) is appended to this URL by collectPlayByPlayData.
var pbpApi = 'https://www.kimonolabs.com/api/adflv7dk?apikey=abe6b22285a4d123b8d3ed875ac78331&kimpath4=';
// Loop through each date from `start` to `end`
// range.by('days', function(m) {
// m is simply the moment.js object (eg. Date)
// });
// Scrapes the scores page to locate boxscore URLs and extract game IDs for games on a given date
// http://www.nhl.com/ice/scores.htm?date=10/17/2014
// Requests the scores API for `date` and extracts the play-by-play game IDs
// from the boxscore links in the response.
//
// date - object exposing format(pattern); called with 'MM/DD/YYYY'
// done - node-style callback, invoked exactly once:
//          done(err)            on transport error or unusable response
//          done(null, gameIds)  on success, e.g. [ '020001', '020002' ]
function collectBoxscoreUrls(date, done) {
  request(scoresApi + date.format('MM/DD/YYYY'), function(err, response, body) {
    if (err) {
      // Return here: without it the code below would try to parse a missing
      // body and `done` could be reached a second time.
      return done(err);
    }

    var gameIds;
    try {
      var parsed = JSON.parse(body);

      // now parse each url and pull out the game IDs to pass to the next Kimono API
      gameIds = _.map(parsed.results.collection1, function(o) {
        // http://www.nhl.com/gamecenter/en/boxscore?id=2014020004 ==> 2014020004
        var id = url.parse(o.boxscoreLink.href, true).query.id;

        // the game ID used to retrieve the play-by-play report does not include the year (first 4 characters)
        // 2014020004 ==> 020004
        return id.slice(4);
      });
    } catch (parseErr) {
      // Malformed JSON or an unexpected payload shape is reported through the
      // callback instead of being thrown from inside the request callback.
      return done(parseErr);
    }

    // gameIds ==> [ '020001', '020002', '020003', '020004' ]
    done(null, gameIds);
  });
}
// Scrape data from a play-by-play report.
// The parameter passed to Kimono (kimpath4) is the HTML filename which uses the game ID
// http://www.nhl.com/scores/htmlreports/20142015/PL010060.HTM
// Requests the play-by-play API for one game and passes the entries of
// `results.collection1` to `done`.
//
// gameId - game ID without the year prefix, e.g. '020004'
// done   - node-style callback, invoked exactly once:
//            done(err)            on transport error or unusable response
//            done(null, pbpLogs)  on success
function collectPlayByPlayData(gameId, done) {
  // The API parameter is the report's HTML filename, which embeds the game ID.
  var kimpath4 = 'PL' + gameId + '.HTM';

  request(pbpApi + kimpath4, function(err, response, body) {
    if (err) {
      // Return here so the missing body is never parsed and `done` fires once.
      return done(err);
    }

    var pbpLogs;
    try {
      var parsed = JSON.parse(body);
      pbpLogs = _.map(parsed.results.collection1, function(o) {
        return o;
      });
    } catch (parseErr) {
      // Malformed JSON or an unexpected payload shape goes to the callback.
      return done(parseErr);
    }

    done(null, pbpLogs);
  });
}
// Get all game data for a single date
// Looks up the games played on `date` and logs the play-by-play data of the
// first one. Failures of either step are logged and stop the chain.
//
// date - passed through to collectBoxscoreUrls
function getGames(date) {
  collectBoxscoreUrls(date, function(err, gameIds) {
    if (err) {
      console.error(err);
      return;
    }

    // Without this guard an empty day would request the report for game `undefined`.
    if (!gameIds || gameIds.length === 0) {
      console.log('No games found');
      return;
    }

    collectPlayByPlayData(gameIds[0], function(pbpErr, pbpLogs) {
      if (pbpErr) {
        console.error(pbpErr);
        return;
      }

      console.log(pbpLogs);
    });
  });
}
// Entry point: run the scrape for the `start` date only.
getGames(moment(start));
@bgoldste
Copy link

For the last function, the one that starts with `range.by('days', ...`, is there a way to slow down the calls? How many calls are being made, and over what period of time?

@tcrosen
Copy link
Author

tcrosen commented Dec 17, 2014

Okay, I did a bit of work to clean it up and updated it to a working version. At the moment it collects the games for a single date, then gets the play-by-play data for just one of those games. So now the questions are:

  1. How to run for each game in a day?
  2. How to run for multiple days?

@tcrosen
Copy link
Author

tcrosen commented Dec 17, 2014

Okay I think I've figured it out. I didn't realize I could update my APIs via API (confusing!).
Here's an updated version that sets the list of Play by Play report URLs to crawl from games for a day. The only question now is what is the best way to do this for many days...

var moment = require('moment-range');
var request = require('request');
var _ = require('lodash');
var url = require('url');
var fs = require('fs');

// Date window for the scrape. The Date constructor takes a 0-indexed month,
// so (2014, 9, 8) is October 8, 2014.
var start = new Date(2014, 9, 8);
// Upper bound of the window: the moment the script is run.
var end = new Date();
// Range covering start..end. In the visible code it is only referenced by the
// commented-out `range.by('days', ...)` loop below.
var range = moment().range(start, end);

// Key sent with every API call in this file (query string for reads, request body for updates).
// NOTE(review): the key is committed in source — consider loading it from configuration.
var apiKey = 'abe6b22285a4d123b8d3ed875ac78331';

// Gets boxscore links from scores page
// The target date is appended to this URL as MM/DD/YYYY by collectBoxscoreUrls.
var scoresApi = 'https://www.kimonolabs.com/api/czde4e6c?apikey=' + apiKey + '&date=';

// Gets data from PBP report
// Requested as-is by collectPlayByPlayData; the pages to crawl are set via setPbpApiSourceUrls.
var pbpApi = 'https://www.kimonolabs.com/api/adflv7dk?apikey=' + apiKey;

// Loop through each date from `start` to `end`
// range.by('days', function(m) {
  // m is simply the moment.js object (eg. Date)
// });

// Scrapes the scores page to locate boxscore URLs and extract game IDs for games on a given date
// http://www.nhl.com/ice/scores.htm?date=10/17/2014
// Requests the scores API for `date` and extracts the play-by-play game IDs
// from the boxscore links in the response.
//
// date - object exposing format(pattern); called with 'MM/DD/YYYY'
// done - node-style callback, invoked exactly once:
//          done(err)            on transport error or unusable response
//          done(null, gameIds)  on success, e.g. [ '020001', '020002' ]
function collectBoxscoreUrls(date, done) {
  request(scoresApi + date.format('MM/DD/YYYY'), function(err, response, body) {
    if (err) {
      // Return here: without it the code below would try to parse a missing
      // body and `done` could be reached a second time.
      return done(err);
    }

    var gameIds;
    try {
      var parsed = JSON.parse(body);

      // now parse each url and pull out the game IDs to pass to the next Kimono API
      gameIds = _.map(parsed.results.collection1, function(o) {
        // http://www.nhl.com/gamecenter/en/boxscore?id=2014020004 ==> 2014020004
        var id = url.parse(o.boxscoreLink.href, true).query.id;

        // the game ID used to retrieve the play-by-play report does not include the year (first 4 characters)
        // 2014020004 ==> 020004
        return id.slice(4);
      });
    } catch (parseErr) {
      // Malformed JSON or an unexpected payload shape is reported through the
      // callback instead of being thrown from inside the request callback.
      return done(parseErr);
    }

    // gameIds ==> [ '020001', '020002', '020003', '020004' ]
    done(null, gameIds);
  });
}

// Update the PBP API source URLs to crawl each play-by-play report
// Points the PBP API at the play-by-play report page of every given game.
//
// gameIds - game IDs without the year prefix, e.g. [ '020001', '020002' ]
// done    - handed to `request` unchanged as its completion callback
function setPbpApiSourceUrls(gameIds, done) {
  var reportPrefix = 'http://www.nhl.com/scores/htmlreports/20142015/PL';
  var reportSuffix = '.HTM';

  var reportUrls = _.map(gameIds, function(id) {
    return reportPrefix + id + reportSuffix;
  });

  var updateCall = {
    url: 'https://www.kimonolabs.com/kimonoapis/adflv7dk/update',
    method: 'POST',
    json: true,
    body: { apikey: apiKey, urls: reportUrls }
  };

  request(updateCall, done);
}

// Start crawling PBP reports
// Asks the PBP API to start crawling its configured source URLs.
//
// done - handed to `request` unchanged as its completion callback
function startPbpCrawl(done) {
  var crawlCall = {
    url: 'https://www.kimonolabs.com/kimonoapis/adflv7dk/startcrawl',
    method: 'POST',
    json: true,
    body: { apikey: apiKey }
  };

  request(crawlCall, done);
}

// Scrape data from play-by-play reports.
// Requests the PBP API, saves the raw response to pbp.json and passes the
// entries of `results.collection1` to `done`.
//
// done - node-style callback, invoked exactly once:
//          done(err)            on transport error or unusable response
//          done(null, pbpLogs)  on success
function collectPlayByPlayData(done) {
  request(pbpApi, function(err, response, body) {
    if (err) {
      // Return here so the missing body is never parsed and `done` fires once.
      return done(err);
    }

    var pbpLogs;
    try {
      var parsed = JSON.parse(body);
      pbpLogs = _.map(parsed.results.collection1, function(o) {
        return o;
      });
    } catch (parseErr) {
      // Malformed JSON or an unexpected payload shape goes to the callback.
      return done(parseErr);
    }

    // Best-effort snapshot of the raw response. fs.writeFile requires a
    // callback (omitting it throws on current Node versions); a failed write
    // is logged but does not fail the scrape.
    fs.writeFile('pbp.json', body, function(writeErr) {
      if (writeErr) {
        console.error('Could not write pbp.json:', writeErr);
      }
    });

    done(null, pbpLogs);
  });
}

// Get all game data for a single date
// Collects the games played on `date`, points the PBP API at their
// play-by-play reports and starts a crawl. Each step runs only when the
// previous one succeeded; failures are logged and stop the chain.
//
// date - passed through to collectBoxscoreUrls
function getGames(date) {
  collectBoxscoreUrls(date, function(err, gameIds) {
    if (err) {
      console.log(err);
      return;
    }

    // Nothing to crawl; also avoids replacing the API's URL list with an empty one.
    if (!gameIds || gameIds.length === 0) {
      console.log('No games found');
      return;
    }

    setPbpApiSourceUrls(gameIds, function(updateErr, resp, body) {
      if (updateErr) {
        // Do not start a crawl against URLs that were never updated.
        console.log(updateErr);
        return;
      }

      // Guarded lookup: an unexpected response body must not throw here.
      var instructions = body && body.api && body.api.instructions;
      console.log('API updated to crawl these URLS: ', instructions && instructions.urls);

      startPbpCrawl(function(crawlErr, crawlResp, crawlBody) {
        if (crawlErr) {
          console.log(crawlErr);
        } else {
          console.log(crawlBody);
        }

        // NOTE(review): the original kept a commented-out collectPlayByPlayData
        // call at this point; presumably results can only be fetched once the
        // crawl has finished — confirm before wiring it in.
      });
    });
  });
}

// Entry point: run the scrape for the `start` date only.
getGames(moment(start));

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment