Skip to content

Instantly share code, notes, and snippets.

@liquidx
Created June 10, 2018 08:28
Show Gist options
  • Save liquidx/b1d923847c6d0af31fb77bac8b74c8d5 to your computer and use it in GitHub Desktop.
Save liquidx/b1d923847c6d0af31fb77bac8b74c8d5 to your computer and use it in GitHub Desktop.
Looking up station names on Wikipedia using Google Spreadsheets AppScript
// debug:
// https://ja.wikipedia.org/w/api.php?action=query&format=rawfm&titles=%E4%B8%AD%E4%BA%95%E9%A7%85&prop=revisions&rvprop=content
//https://www.mediawiki.org/wiki/API:Revisions
var COL_NAME = 1;
var COL_LINES = 2;
var COL_IDS = 3;
var COL_LAT = 4;
var COL_LNG = 5;
var COL_WIKINAME = 6;
var COL_HASDATA = 8;
var COL_WIKISECTIONS = 9;
function querySerialize(obj) {
var str = [];
for (var p in obj)
if (obj.hasOwnProperty(p)) {
str.push(encodeURIComponent(p) + "=" + encodeURIComponent(obj[p]));
}
return str.join("&");
}
function parseCoordinate(result) {
if (result.query.pages) {
var pagesKey = Object.keys(result.query.pages);
var coordinates = result.query.pages[pagesKey[0]]['coordinates'];
if (!coordinates) {
return null;
}
var coordKey = Object.keys(coordinates);
var coordinate = coordinates[coordKey[0]];
Logger.log(coordinates);
Logger.log(coordKey);
return coordinate;
}
return null;
}
function parseWikiFormat(content) {
var sections = [];
var remaining = content;
var startEkiSection = new RegExp('\\{\\{\u99c5\u60c5\u5831', 'mg');
var startSection = new RegExp('\\{\\{', 'mg');
var endSection = new RegExp('\\}\\}', 'mg');
var found = remaining.search(startEkiSection);
while (found != -1) {
remaining = remaining.slice(found);
var section = '';
var endPos = remaining.search(endSection);
if (endPos == -1) {
Logger.log('Error: cannont find any end sections (1)');
break; // error case, can't find the end of the section.
}
section = remaining.slice(0, endPos + 2);
remaining = remaining.slice(endPos + 2);
//Logger.log('Initial end found at ' + endPos);
//Logger.log('section.length = ' + section.length);
// Count how many section starts there are.
var scanForNestedSection = section.slice(2);
var foundStart = scanForNestedSection.search(startSection);
//Logger.log('Found a nested start at ' + foundStart);
while (foundStart != -1) {
var foundEnd = remaining.search(endSection);
if (foundEnd == -1) {
Logger.log('Error: cannot find end.');
break; // error case, should handle.
}
//Logger.log('Nested found end at ' + foundEnd);
// Add all the characters to the next }}.
section += remaining.slice(0, foundEnd + 2);
// Add all the characters to the next }} to scanForNestedSection too.
scanForNestedSection += remaining.slice(0, foundEnd + 2);
// Cut the remaining string.
remaining = remaining.slice(foundEnd + 2);
// Cut the scanForNextSection to move past the nested start.
scanForNestedSection = scanForNestedSection.slice(foundStart + 2);
// Scan again to see if there are more nested starts, and repeat.
foundStart = scanForNestedSection.search(startSection);
}
//Logger.log(section);
sections.push(section);
found = remaining.search(startEkiSection);
}
return sections;
}
function parseMeta(result) {
var metadata = {};
if (result.query.pages) {
var pagesKey = Object.keys(result.query.pages);
var revisions = result.query.pages[pagesKey[0]]['revisions'];
if (!revisions) {
return null;
}
var revisionKeys = Object.keys(revisions);
var revision = revisions[revisionKeys[0]];
var header = revision['*'];
metadata['full'] = header;
var latLngPattern = /\|([0-9]+)\|([0-9]+)\|([0-9\.]+)\|.\|([0-9]+)\|([0-9]+)\|([0-9\.]+)\|./;
var latLngMatch = header.match(latLngPattern);
if (latLngMatch) {
metadata['lat'] = latLngMatch[1] + '.' + ((latLngMatch[2]/60 * 100) + (latLngMatch[3] / 60));
metadata['lon'] = latLngMatch[4] + '.' + ((latLngMatch[5]/60 * 100) + (latLngMatch[6] / 60));
}
return metadata;
}
return null;
}
function updateLatLngForAllRows() {
var stationList = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Station List");
var startRow = 2;
var listRows = stationList.getLastRow();
var listRange = stationList.getRange(startRow, 1, listRows, COL_WIKINAME);
for (var row = 1; row < listRows; row++) {
updateLatLngForRow(startRow + row);
}
}
function updateLatLngForActiveRow() {
updateLatLngForRow(146);
}
function updateLatLngForRow(rowNumber) {
var stationList = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Station List");
var RESULTS_MAX = 10;
var rowRange = stationList.getRange(rowNumber, 1, 1, COL_WIKISECTIONS + RESULTS_MAX);
var rowValues = rowRange.getValues();
for (var c = COL_WIKISECTIONS - 1; c < COL_WIKISECTIONS + RESULTS_MAX; c++) {
var value = rowValues[0][c];
var latlng = getGeoFromWikiSection(value);
if (latlng) {
var latLngRange = stationList.getRange(rowNumber, COL_LAT, 1, 2);
latLngRange.setValues([[latlng['lat'], latlng['lon']]]);
return true;
}
}
return false;
}
function getGeoFromWikiSection(sectionValue) {
// latlng pattern
// {{ウィキ座標度分秒
// {{ウィキ座標2段度分秒
var latLngPattern = new RegExp('ウィキ.*度分秒\\|([0-9]+)\\|([0-9]+)\\|([0-9\\.]+)\\|.\\|([0-9]+)\\|([0-9]+)\\|([0-9\\.]+)\\|.');
var latLngMatch = sectionValue.match(latLngPattern);
Logger.log(latLngMatch);
if (latLngMatch) {
return {
'lat': parseInt(latLngMatch[1]) + parseInt(latLngMatch[2])/60 + parseFloat(latLngMatch[3])/3600,
'lon': parseInt(latLngMatch[4]) + parseInt(latLngMatch[5])/60 + parseFloat(latLngMatch[6])/3600
};
}
// alternate pattern
var latPattern = new RegExp('緯度度[\\s]*\\=[\\s]*([0-9]+)[\\s\|]+緯度分[ ]*\\=[\\s]*([0-9]+)[\\s\|]+緯度秒[\\s]*\\=[ ]*([0-9\\.]+)');
var lngPattern = new RegExp('経度度[\\s]*\\=[\\s]*([0-9]+)[\\s\|]+経度分[ ]*\\=[\\s]*([0-9]+)[\\s\|]+経度秒[\\s]*\\=[ ]*([0-9\\.]+)');
var latMatch = sectionValue.match(latPattern);
var lngMatch = sectionValue.match(lngPattern);
if (latMatch && lngMatch) {
return {
'lat': parseInt(latMatch[1]) + parseInt(latMatch[2])/60 + parseFloat(latMatch[3])/3600,
'lon': parseInt(lngMatch[1]) + parseInt(lngMatch[2])/60 + parseFloat(lngMatch[3])/3600
};
}
return null;
}
function fetchAllStationsFromWiki() {
var startRow = 497;
var onlyMissing = true;
var onlyOneRow = true;
var stationList = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Station List");
var listRows = stationList.getLastRow() - startRow;
var listRange = stationList.getRange(startRow, 1, listRows, COL_HASDATA);
var hasDataValues = stationList.getRange(startRow, COL_HASDATA, listRows, 1).getValues();
for (var row = 0; row < listRows; row++) {
// Skip rows
if (onlyMissing) {
var hasData = hasDataValues[row][0];
if (hasData == 1) {
continue;
}
}
Logger.log('Fetch row ' + (startRow + row));
var result = fetchDataFromWiki(listRange, row);
// debug
var resultCell = stationList.getRange(startRow + row, COL_WIKISECTIONS);
resultCell.setValue(result['query']);
if (result) {
var meta = parseMeta(result);
if (meta) {
var sections = parseWikiFormat(meta['full']);
for (var i = 0; i < sections.length; i++) {
var resultCell = stationList.getRange(startRow + row, COL_WIKISECTIONS + i);
resultCell.setValue(sections[i]);
}
// Update lat/lng if we updated the result.
updateLatLngForRow(startRow + row);
}
}
if (onlyOneRow) {
break;
}
}
}
var REQUEST_TYPE_GEO = 'REQUEST_TYPE_GEO';
var REQUEST_TYPE_SECTIONS = 'REQUEST_TYPE_SECTIONS';
function fetchDataFromWiki(range, row0, requestType) {
var wikiName = range.getCell(row0 + 1, COL_WIKINAME).getValue();
var name = range.getCell(row0 + 1, COL_NAME).getValue();
var stationName = name + '駅';
if (wikiName) {
stationName = wikiName;
}
var geoQuery = {
'action': 'query',
'prop': 'coordinates',
'format': 'json',
'titles': stationName,
'coprimary': 'all'
};
var metaQuery = {
'action': 'query',
'prop': 'revisions',
'rvprop': 'content',
'format': 'json',
'titles': stationName,
};
var options = {
'method' : 'get',
'headers': {
'User-Agent': 'OneOff alastair@liquidx.net',
}
};
Logger.log('Making request for ' + stationName);
var query = metaQuery;
if (requestType == REQUEST_TYPE_GEO) {
query = geoQuery;
}
var response = UrlFetchApp.fetch("https://ja.wikipedia.org/w/api.php?" + querySerialize(query), options);
var result = JSON.parse(response.getContentText('UTF-8'));
return result;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment