Skip to content

Instantly share code, notes, and snippets.

@zhirzh
Last active September 2, 2017 15:24
Show Gist options
  • Save zhirzh/7666c2742a2c7dfbd9859b05ee76aef1 to your computer and use it in GitHub Desktop.
Save zhirzh/7666c2742a2c7dfbd9859b05ee76aef1 to your computer and use it in GitHub Desktop.
google doodles
const fs = require('fs');
const path = require('path');
// Directory holding the downloaded JSON files, and the merged output file.
const rawDirPath = path.join(__dirname, 'raw');
const allDoodlesPath = path.join(__dirname, 'doodles.all.json');

/**
 * Load and parse one entry of the raw directory.
 * @param {string} fileName - Name of the entry inside the raw directory.
 * @returns {any} Parsed contents, or an empty array when the entry cannot be
 *   read or parsed (the failure is reported on stderr).
 */
function readDoodlesFile(fileName) {
  const filePath = path.join(rawDirPath, fileName);

  try {
    return JSON.parse(fs.readFileSync(filePath));
  } catch (err) {
    console.error('Improper JSON:', filePath);
    return [];
  }
}

// Concatenate the contents of every entry, in directory-listing order.
const perFileDoodles = fs.readdirSync(rawDirPath).map(name => readDoodlesFile(name));
const allDoodles = [].concat(...perFileDoodles);

fs.writeFileSync(allDoodlesPath, JSON.stringify(allDoodles));
# Fetch the doodle JSON for every month from January 1998 up to and including
# the current month. `fetch` is called without its force flag, so months that
# already have a file on disk are not downloaded again.
source "./fetch.sh"

CURRENT_YEAR=$(date +%Y)
CURRENT_MONTH=$(date +%m)

# Read by `fetch` when it classifies an existing file as "NULL".
NULL_FILESIZE=2

for (( YEAR = 1998; YEAR <= CURRENT_YEAR; YEAR++ )); do
  for (( MONTH = 1; MONTH <= 12; MONTH++ )); do
    # fetch JSON
    fetch "$YEAR" "$MONTH"

    # bail once the current month has been handled.
    # The loop's own MONTH is compared (instead of a variable that `fetch`
    # happens to leave behind); 10# forces base 10 because `date +%m` is
    # zero-padded and "08"/"09" are not valid octal numbers.
    if (( YEAR == CURRENT_YEAR && MONTH == 10#$CURRENT_MONTH )); then
      echo "BAIL"
      break
    fi
  done
done
# Download the current month's doodle JSON. The third argument switches on
# fetch's force mode, so an existing file for this month is fetched again.
source "./fetch.sh"

YEAR="$(date +%Y)"
MONTH="$(date +%m)"

fetch "$YEAR" "$MONTH" true
# Directory the downloaded JSON files are written to; created if missing.
DATA_PATH="raw"
mkdir -p "$DATA_PATH"

# Byte size at which fetch reports an existing file as "NULL" instead of "SKIP".
NULL_FILESIZE=2
# fetch YEAR MONTH [FORCE]
#
# Download the doodle JSON for one month into "$DATA_PATH/YEAR-MM.json".
#
#   YEAR   four-digit year
#   MONTH  month number, with or without a leading zero
#   FORCE  "true" to download even when the file already exists
#          (default: false)
#
# Without FORCE an existing file is left alone and reported as "NULL" (its
# size equals $NULL_FILESIZE) or "SKIP" (any other size).
#
# Returns 0 on success or skip, otherwise wget's exit status.
#
# Side effect: sets the global ZERO_MONTH (zero-padded month); it is kept
# global because the full-range fetch script reads it after calling fetch.
fetch() {
  local YEAR="$1"
  local MONTH="$2"
  local FORCE="${3:-false}"

  # zero padding; 10# forces base 10 so "08"/"09" are not parsed as octal
  ZERO_MONTH=$(printf %02d "$(( 10#$MONTH ))")

  local URL="https://www.google.com/doodles/json/$YEAR/$ZERO_MONTH?full=1"
  local FILEPATH="$DATA_PATH/$YEAR-$ZERO_MONTH.json"

  if [[ $FORCE != true && -f "$FILEPATH" ]]; then
    local FILESIZE
    FILESIZE=$(wc -c < "$FILEPATH")

    if [[ $FILESIZE -eq $NULL_FILESIZE ]]; then
      echo "NULL: $FILEPATH"
    else
      echo "SKIP: $FILEPATH"
    fi

    return
  fi

  echo "FETCH: $FILEPATH"

  # Download to a side file first: `wget -O` creates/truncates its target
  # even when the request fails, which would leave an empty file that later
  # runs skip, or destroy a good file during a forced refetch.
  local TMP_FILEPATH="$FILEPATH.part"
  local STATUS=0

  # URL is quoted because it contains "?", a glob character.
  wget "$URL" -O "$TMP_FILEPATH" -q --show-progress || STATUS=$?

  if (( STATUS != 0 )); then
    rm -f "$TMP_FILEPATH"
    echo "FAIL: $FILEPATH (wget exit status $STATUS)" >&2
    return "$STATUS"
  fi

  mv -f "$TMP_FILEPATH" "$FILEPATH"
}
const fs = require('fs');
const path = require('path');
const crypto = require('crypto');
const allDoodles = require('./doodles.all.json');
// Doodle fields that hold links. The cleaning pipeline below visits each of
// these fields on every doodle and shortens recognised URL prefixes.
const linkTypes = [
'alternate_url',
'call_to_action_image_url',
'hires_url',
'standalone_html',
'url',
];
// URL prefixes published in meta.json. The cleaning pipeline replaces
// recognised prefixes with the numbers 0, 1 and 2, which appear to be
// positions in this list — NOTE(review): confirm the consumer decodes them
// that way. Do not reorder without updating the pipeline.
const urlPrefixes = [
'lh3.googleusercontent.com',
'www.google.com/logos',
'www.google.com/logos/doodles',
];
// Fields kept for each cleaned doodle. Every doodle is written out as an
// array holding its values for these keys, in this order, and the list itself
// is published in meta.json. Keys inside the block comment are not emitted.
const schema = [
/* 'alternate_url',
'blog_text',
'call_to_action_image_url',
'collection_id',
'countries',
'doodle_args',
'doodle_type',
'height',
'hires_height',
'hires_width',
'history_doodles',
'id',
'is_animated_gif',
'is_dynamic',
'is_global',
'is_highlighted',
'name',
'persistent_id',
'query',
'related_doodles',
'share_text',
'standalone_html',
'tags',
'translations',
'width',
'youtube_id',
*/
'hires_url',
'next_doodle',
'prev_doodle',
'run_date_array',
'title',
'url',
'_id', // unique ID for each doodle
];
/**
 * Generate a deterministic hash for a doodle from its `name` and `url`.
 *
 * The text `[name](url)` is hashed as UTF-8. Encoding it as 'ascii' would
 * drop the high bits of every character outside Latin-1, so doodles whose
 * names or URLs differ only in such characters would share a hash. For pure
 * ASCII input the result is the same under both encodings.
 *
 * @param {object} doodle - Doodle object to generate hash for.
 * @param {string} doodle.name - Doodle name.
 * @param {string} doodle.url - Doodle URL.
 * @returns {string} MD5 digest as 32 lowercase hex characters.
 */
function generateDoodleHash(doodle) {
  return crypto
    .createHash('md5')
    .update(`[${doodle.name}](${doodle.url})`, 'utf8')
    .digest('hex');
}
/**
 * Serialise a value as JSON and write it to a file synchronously.
 * @param {string} filepath - Path of the output file.
 * @param {any} json - Value to serialise.
 * @param {bool} pretty - Indent the output by two spaces when true;
 *   otherwise write it without extra whitespace.
 */
function writeJSON(filepath, json, pretty = false) {
  const indent = pretty ? 2 : 0;
  const serialised = JSON.stringify(json, null, indent);

  fs.writeFileSync(filepath, serialised);
}
// Doodles keyed by their generated hash; when two doodles produce the same
// hash the later one replaces the earlier.
const uniqueDoodles = {};
// Distinct country and tag names, trimmed and lower-cased.
const allCountriesSet = new Set();
const allTagsSet = new Set();

for (const doodle of allDoodles) {
  // Stamp every doodle with its hash so it can be emitted as `_id`.
  const hash = generateDoodleHash(doodle);
  doodle._id = hash;
  uniqueDoodles[hash] = doodle;

  for (const rawCountry of doodle.countries) {
    allCountriesSet.add(rawCountry.trim().toLowerCase());
  }

  for (const rawTag of doodle.tags) {
    allTagsSet.add(rawTag.trim().toLowerCase());
  }
}

// Array forms (insertion order) so that names can be referred to by index.
const allCountries = [...allCountriesSet];
const allTags = [...allTagsSet];
// Recognised link prefixes paired with the number that replaces them.
// A longer prefix must be listed before any shorter prefix it begins with:
// tested the other way round, '//www.google.com/logos' always wins and the
// '//www.google.com/logos/doodles' rule can never apply.
const linkPrefixRules = [
  ['https://lh3.googleusercontent.com', 0],
  ['//www.google.com/logos/doodles', 2],
  ['//www.google.com/logos', 1],
  ['/logos', 1],
];

/**
 * Replace a recognised URL prefix with its number.
 * @param {any} link - Value of a link field.
 * @returns {any} The shortened link, or `link` unchanged when it is not a
 *   string or starts with none of the recognised prefixes.
 */
function compressLink(link) {
  // Link fields that are absent or not strings are passed through.
  if (typeof link !== 'string') {
    return link;
  }

  const rule = linkPrefixRules.find(([prefix]) => link.startsWith(prefix));
  if (rule === undefined) {
    return link;
  }

  const [prefix, code] = rule;
  return `${code}${link.slice(prefix.length)}`;
}

// Rewrite every doodle in place (the objects in `allDoodles` are modified):
//   1. embedded doodle objects are replaced by their hashes,
//   2. country and tag names are replaced by their index in
//      `allCountries` / `allTags`,
//   3. link fields get their recognised prefix shortened.
allDoodles.forEach(doodle => {
  // `!= null` skips both null and undefined references.
  if (doodle.next_doodle != null) {
    doodle.next_doodle = generateDoodleHash(doodle.next_doodle);
  }
  if (doodle.prev_doodle != null) {
    doodle.prev_doodle = generateDoodleHash(doodle.prev_doodle);
  }

  doodle.related_doodles = doodle.related_doodles.map(relatedDoodle =>
    generateDoodleHash(relatedDoodle),
  );
  doodle.history_doodles = doodle.history_doodles.map(historyDoodle =>
    generateDoodleHash(historyDoodle),
  );

  doodle.countries = doodle.countries.map(country =>
    allCountries.indexOf(country.trim().toLowerCase()),
  );
  doodle.tags = doodle.tags.map(tag =>
    allTags.indexOf(tag.trim().toLowerCase()),
  );

  linkTypes.forEach(linkType => {
    doodle[linkType] = compressLink(doodle[linkType]);
  });
});

// Project each doodle onto the schema: one array of values per doodle, in
// schema order.
const cleanDoodles = allDoodles.map(doodle => schema.map(key => doodle[key]));
// Lookup tables and field list that accompany the cleaned doodle rows.
const metaPayload = {
  countries: allCountries,
  tags: allTags,
  schema,
  urlPrefixes,
};

// Write the cleaned rows first, then the metadata.
writeJSON('doodles.clean.json', cleanDoodles);
writeJSON('meta.json', metaPayload);
const fs = require('fs');
const path = require('path');
// Print, for every key that occurs on any doodle, the constructor of a value
// seen under that key (e.g. String, Array, Object).
const allDoodlesPath = path.join(__dirname, 'doodles.all.json');
const allDoodles = require(allDoodlesPath);

// Key name -> constructor of the most recent non-null value seen for that
// key, or null when only null/undefined values have been seen so far.
const keys = {};

allDoodles.forEach(doodle => {
  Object.keys(doodle).forEach(k => {
    const value = doodle[k];

    // null and undefined have no `.constructor`; record the key without
    // overwriting a constructor learned from another doodle.
    if (value == null) {
      if (!(k in keys)) {
        keys[k] = null;
      }
      return;
    }

    keys[k] = value.constructor;
  });
});

console.log(keys);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment