Skip to content

Instantly share code, notes, and snippets.

@NeKzor
Last active July 12, 2021 20:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save NeKzor/ef166f9d7e48690dabcd712f54f9d1b1 to your computer and use it in GitHub Desktop.
Save NeKzor/ef166f9d7e48690dabcd712f54f9d1b1 to your computer and use it in GitHub Desktop.
Every Mario Kart World Record from 1996-2021. Datasets available at https://www.dolthub.com/repositories/nekz/mkwrs/data/master
const fetch = require('node-fetch');
const fs = require('fs');
const { JSDOM } = require('jsdom');
const path = require('path');
// Root URL of the scraped site; game pages are requested as `${baseApi}/<game>`.
const baseApi = 'https://mkwrs.com';

// Options handed to every fetch() call made by the scraper.
const fetchOptions = {
    headers: {
        'User-Agent': 'ne^',
    },
};

// Game identifiers processed by main(). Each one is used as the URL path
// segment, as the data file name (`<game>.json`) and as the table name.
const games = [
    'mk8dx',
    'mk8',
    'mk7',
    'mkwii',
    'mkds',
    'mkdd',
    'mksc',
    'mk64',
    'smk',
];
/**
 * Picks the row object that has the largest number of own enumerable keys.
 *
 * When several rows share the largest key count, the earliest one wins.
 * An empty `rows` list throws a TypeError because the first element is
 * `undefined` and its keys cannot be listed.
 *
 * @param {object[]} rows - Row objects to compare.
 * @returns {object} The row with the most keys (the same object, not a copy).
 */
const findMostColumns = (rows) => {
    const columnCount = (row) => Object.keys(row).length;

    const [firstRow] = rows;
    const widest = { row: firstRow, size: columnCount(firstRow) };

    rows.forEach((candidate) => {
        const size = columnCount(candidate);
        if (size <= widest.size) {
            return;
        }
        widest.size = size;
        widest.row = candidate;
    });

    return widest.row;
};
// True when the script was started with a `--refresh` command-line argument.
const refresh = process.argv.slice(2).includes('--refresh');
/**
 * Generates `../import.sh` (relative to this script) with one `CREATE TABLE`
 * statement and one `dolt table import` command per game.
 *
 * With `--refresh` every game is scraped again and its rows are written to
 * `../data/<game>.json`; otherwise the rows are read from that existing file.
 *
 * @returns {Promise<void>} Resolves once the import script has been written.
 */
const main = async () => {
    const importScript = path.join(__dirname, '/../import.sh');
    const dataFolder = path.join(__dirname, '/../data');

    // Start from a fresh script on every run; per-game commands are appended below.
    fs.writeFileSync(importScript, '#!/bin/bash\n', 'utf-8');

    if (refresh) {
        // writeFileSync does not create missing directories, so make sure the
        // data folder exists before the first game is written.
        fs.mkdirSync(dataFolder, { recursive: true });
    }

    for (const game of games) {
        const dataFile = path.join(dataFolder, `/${game}.json`);
        let rows = [];

        if (refresh) {
            const tracks = await scrapeTracks(game);
            rows = await scrapeGame(game, tracks);
            fs.writeFileSync(dataFile, JSON.stringify({ rows }, null, 4), 'utf-8');
        } else {
            console.log('importing', game);
            rows = JSON.parse(fs.readFileSync(dataFile, 'utf-8')).rows;
        }

        // Without rows there are no column names to build a table from;
        // skip the game instead of failing inside findMostColumns.
        if (!Array.isArray(rows) || rows.length === 0) {
            console.warn('no rows for', game, '- skipping table creation and import');
            continue;
        }

        // The table schema is taken from the row with the most keys.
        // NOTE(review): column names come from scraped data and are placed
        // unescaped inside a single-quoted shell argument.
        const columnDefinitions = Object.keys(findMostColumns(rows))
            .map((column) => ` ${column} varchar(255)`)
            .join(',\n');

        // Single-quote the path for bash so that spaces or shell
        // metacharacters in the directory name cannot split the command.
        const quotedDataFile = `'${dataFile.replace(/'/g, "'\\''")}'`;

        const commands = [
            '',
            `dolt sql -q 'CREATE TABLE ${game} (`,
            columnDefinitions,
            ")'",
            `dolt table import -u ${game} ${quotedDataFile}`,
            '',
        ].join('\n');

        fs.appendFileSync(importScript, commands, 'utf-8');
    }
};
/**
 * Downloads the overview page of a game and collects the track links found in
 * its `.wr` tables.
 *
 * Two row layouts are handled:
 *  - the first cell starts with a link: one track per row;
 *  - the first cell starts with a nested table: the nested table's first cell
 *    is the track name and every following cell is one category link.
 *
 * Links whose href starts with `http` are skipped. Rows without a usable link
 * are skipped as well instead of aborting the scrape.
 *
 * @param {string} game - Game identifier, appended to `baseApi` to build the URL.
 * @returns {Promise<Array<{name: string, id: string, category?: string}>>}
 *   Tracks in page order; `category` is only set for nested-table rows.
 */
const scrapeTracks = async (game) => {
    console.log('scraping', game, '...');

    const route = `${baseApi}/${game}`;
    const res = await fetch(route, fetchOptions);
    console.log('[GET]', route, ':', res.status);

    const text = await res.text();
    const dom = new JSDOM(text);
    const document = dom.window.document;

    // The track id is whatever follows this prefix in a track link's href.
    const trackLinkPrefix = 'display.php?track=';

    const tracks = [];
    const tables = document.querySelectorAll('.wr');
    console.log('found', tables.length, 'tables');

    // Set once the first nested-table row is seen and never reset: from then
    // on plain link rows are ignored, also in the tables that follow.
    let tableMode = false;

    for (const table of tables) {
        // Drop the first row and keep only rows that belong to the `.wr`
        // table itself (not to a table nested inside one of its cells).
        const rows = [...table.querySelectorAll('tr')]
            .slice(1)
            .filter((tr) => tr.parentElement.parentElement.className === 'wr');
        console.log('found', rows.length, 'rows');

        for (const row of rows) {
            const track = row.querySelector('td');
            const firstChild = track ? track.children[0] : undefined;
            if (!firstChild) {
                continue;
            }

            if (firstChild.tagName === 'A' && !tableMode) {
                const name = firstChild.textContent;
                const link = firstChild.getAttribute('href');
                // getAttribute returns null for an anchor without href.
                if (!link || link.startsWith('http')) {
                    continue;
                }

                console.log(name, link);
                tracks.push({
                    name,
                    id: link.slice(trackLinkPrefix.length),
                });
            } else if (firstChild.tagName === 'TABLE') {
                tableMode = true;

                // querySelectorAll always returns a (truthy) list, so the
                // emptiness check has to look at the number of cells.
                const nestedBody = firstChild.children[0];
                const cells = nestedBody ? [...nestedBody.querySelectorAll('td')] : [];
                if (cells.length === 0) {
                    continue;
                }
                if (track.getAttribute('colspan') !== '2') {
                    continue;
                }

                const [trackTd, ...tds] = cells;
                const name = trackTd.textContent;
                console.log(name);

                for (const td of tds) {
                    const anchor = td.firstElementChild;
                    if (!anchor) {
                        continue;
                    }

                    const link = anchor.getAttribute('href');
                    const category = anchor.textContent;
                    if (!link || link.startsWith('http')) {
                        continue;
                    }

                    console.log(link);
                    tracks.push({
                        name,
                        id: link.slice(trackLinkPrefix.length),
                        category,
                    });
                }
            }
        }
    }

    return tracks;
};
// Number of leading characters dropped from a player link's href to get the player id.
// NOTE(review): presumably the length of 'profile.php?player=' - confirm against the site markup.
const PLAYER_HREF_PREFIX_LENGTH = 19;

// Text of the cell's first child element if it has one, otherwise the cell's own text.
const linkedText = (td) => (td.firstElementChild ? td.firstElementChild.textContent : td.textContent);

// Attribute of the cell's first child element, or null when the cell has no child element.
const linkedAttribute = (td, attribute) =>
    td.firstElementChild ? td.firstElementChild.getAttribute(attribute) : null;

// `date` and `note` fields read from a date cell.
const dateFields = (dateTd) => ({
    date: linkedText(dateTd),
    note: linkedAttribute(dateTd, 'title'),
});

// Time text and video link read from a time cell, stored under the given keys.
const timeFields = (timeTd, timeKey, videoKey) => ({
    [timeKey]: linkedText(timeTd),
    [videoKey]: linkedAttribute(timeTd, 'href'),
});

// Player id, name and nation read from the player cell and the nation cell.
const playerFields = (playerTd, nationTd) => {
    const href = linkedAttribute(playerTd, 'href');
    const flag = nationTd.firstElementChild && nationTd.firstElementChild.firstElementChild;
    return {
        // A child element without href yields no id instead of a TypeError.
        player_id: typeof href === 'string' ? href.slice(PLAYER_HREF_PREFIX_LENGTH) : null,
        player_name: linkedText(playerTd),
        player_nation: flag ? flag.getAttribute('title') : null,
    };
};

// Maps each column name to the text of the cell at the same index.
const cellsToRecord = (columnNames, tds) => {
    const record = {};
    columnNames.forEach((column, idx) => {
        record[column] = tds[idx].textContent;
    });
    return record;
};

/**
 * Downloads the page of every given track and turns the rows of the second
 * `.wr` table on each page into record objects.
 *
 * The first cells of a row (5 for `mk64` and `mkwii`, 4 otherwise) are parsed
 * into fixed fields (date, time(s), player, ...); the remaining cells are
 * stored under the lower-cased, underscore-separated header names.
 *
 * @param {string} gameName - Game identifier used in the URL and to select the row layout.
 * @param {Array<{id: string, name: string, category?: string}>} tracks - Tracks to scrape.
 * @returns {Promise<object[]>} One record per table row, in page order.
 * @throws {Error} When a track page has no second `.wr` table or that table has no rows.
 */
const scrapeGame = async (gameName, tracks) => {
    const isMk64 = gameName === 'mk64';
    const isMkwii = gameName === 'mkwii';
    const columnOffset = isMk64 || isMkwii ? 5 : 4;

    console.log('scraping tracks for', gameName, '...');

    const result = [];

    for (const track of tracks) {
        const route = `${baseApi}/${gameName}/display.php?track=${track.id}`;
        const res = await fetch(route, fetchOptions);
        console.log('[GET]', route, ':', res.status);

        const text = await res.text();
        const dom = new JSDOM(text);
        const document = dom.window.document;

        // Records are read from the second `.wr` element of the page.
        const recordTable = document.querySelectorAll('.wr')[1];
        const [columnRow, ...rows] = recordTable ? [...recordTable.querySelectorAll('tr')] : [];
        if (!columnRow) {
            throw new Error(`no record table found at ${route} (status ${res.status})`);
        }

        const columnNames = [...columnRow.querySelectorAll('th')]
            .map((th) => th.textContent)
            .slice(columnOffset)
            .map((column) => column.replace(/ /g, '_').toLowerCase());

        // Layout in which every record spans two table rows; detected purely
        // by the last header being `combination`.
        const isMk8DxGcnBabyPark = columnNames[columnNames.length - 1] === 'combination';
        let insertSchroomsAndCombination = false;
        let record = null;

        for (const row of rows) {
            const allTds = [...row.querySelectorAll('td')];
            if (allTds.length <= 2) {
                continue;
            }

            if (isMk8DxGcnBabyPark && insertSchroomsAndCombination) {
                // Follow-up row: complete the record pushed for the previous
                // row (same object reference, so `result` sees the update).
                insertSchroomsAndCombination = false;
                record.shrooms = allTds[0].textContent;
                record.tires = allTds[1].textContent;
                record.glider = allTds[2].textContent;
                continue;
            }

            const tds = allTds.slice(columnOffset);

            if (isMk8DxGcnBabyPark) {
                insertSchroomsAndCombination = true;
                record = cellsToRecord(columnNames.slice(0, 7), tds);
                // NOTE(review): cell index 7 is skipped on purpose here;
                // presumably it is covered by the follow-up row - confirm.
                record.coins = tds[8].textContent;
                record.character = tds[9].textContent;
                record.kart = tds[10].textContent;
            } else {
                record = cellsToRecord(columnNames, tds);
            }

            if (isMk64) {
                const [dateTd, ntscTimeTd, palTimeTd, playerTd, nationTd] = allTds;
                Object.assign(
                    record,
                    dateFields(dateTd),
                    timeFields(ntscTimeTd, 'ntscTime', 'ntscVideo'),
                    timeFields(palTimeTd, 'palTime', 'palVideo'),
                    playerFields(playerTd, nationTd)
                );
            } else if (isMkwii) {
                const [dateTd, timeTd, playerTd, miiNameTd, nationTd] = allTds;
                const ghostTd = allTds[allTds.length - 1];
                Object.assign(
                    record,
                    dateFields(dateTd),
                    timeFields(timeTd, 'time', 'video'),
                    { ghost: linkedAttribute(ghostTd, 'href') },
                    playerFields(playerTd, nationTd),
                    { player_mii: miiNameTd.textContent }
                );
            } else {
                const [dateTd, timeTd, playerTd, nationTd] = allTds;
                const deviceEl = timeTd.children[1];
                Object.assign(
                    record,
                    dateFields(dateTd),
                    timeFields(timeTd, 'time', 'video'),
                    { device: deviceEl ? deviceEl.getAttribute('title') : null },
                    playerFields(playerTd, nationTd)
                );
            }

            record.track_id = track.id;
            record.track_name = track.name;
            record.track_category = track.category || null;

            result.push(record);
        }
    }

    return result;
};
// Run the script. A rejected promise is reported and turned into a non-zero
// exit status instead of being left as an unhandled rejection.
main().catch((error) => {
    console.error(error);
    process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment