Created
June 10, 2018 08:28
-
-
Save liquidx/b1d923847c6d0af31fb77bac8b74c8d5 to your computer and use it in GitHub Desktop.
Looking up station names on Wikipedia using Google Spreadsheets AppScript
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// debug: | |
// https://ja.wikipedia.org/w/api.php?action=query&format=rawfm&titles=%E4%B8%AD%E4%BA%95%E9%A7%85&prop=revisions&rvprop=content | |
//https://www.mediawiki.org/wiki/API:Revisions | |
var COL_NAME = 1; | |
var COL_LINES = 2; | |
var COL_IDS = 3; | |
var COL_LAT = 4; | |
var COL_LNG = 5; | |
var COL_WIKINAME = 6; | |
var COL_HASDATA = 8; | |
var COL_WIKISECTIONS = 9; | |
function querySerialize(obj) { | |
var str = []; | |
for (var p in obj) | |
if (obj.hasOwnProperty(p)) { | |
str.push(encodeURIComponent(p) + "=" + encodeURIComponent(obj[p])); | |
} | |
return str.join("&"); | |
} | |
function parseCoordinate(result) { | |
if (result.query.pages) { | |
var pagesKey = Object.keys(result.query.pages); | |
var coordinates = result.query.pages[pagesKey[0]]['coordinates']; | |
if (!coordinates) { | |
return null; | |
} | |
var coordKey = Object.keys(coordinates); | |
var coordinate = coordinates[coordKey[0]]; | |
Logger.log(coordinates); | |
Logger.log(coordKey); | |
return coordinate; | |
} | |
return null; | |
} | |
function parseWikiFormat(content) { | |
var sections = []; | |
var remaining = content; | |
var startEkiSection = new RegExp('\\{\\{\u99c5\u60c5\u5831', 'mg'); | |
var startSection = new RegExp('\\{\\{', 'mg'); | |
var endSection = new RegExp('\\}\\}', 'mg'); | |
var found = remaining.search(startEkiSection); | |
while (found != -1) { | |
remaining = remaining.slice(found); | |
var section = ''; | |
var endPos = remaining.search(endSection); | |
if (endPos == -1) { | |
Logger.log('Error: cannont find any end sections (1)'); | |
break; // error case, can't find the end of the section. | |
} | |
section = remaining.slice(0, endPos + 2); | |
remaining = remaining.slice(endPos + 2); | |
//Logger.log('Initial end found at ' + endPos); | |
//Logger.log('section.length = ' + section.length); | |
// Count how many section starts there are. | |
var scanForNestedSection = section.slice(2); | |
var foundStart = scanForNestedSection.search(startSection); | |
//Logger.log('Found a nested start at ' + foundStart); | |
while (foundStart != -1) { | |
var foundEnd = remaining.search(endSection); | |
if (foundEnd == -1) { | |
Logger.log('Error: cannot find end.'); | |
break; // error case, should handle. | |
} | |
//Logger.log('Nested found end at ' + foundEnd); | |
// Add all the characters to the next }}. | |
section += remaining.slice(0, foundEnd + 2); | |
// Add all the characters to the next }} to scanForNestedSection too. | |
scanForNestedSection += remaining.slice(0, foundEnd + 2); | |
// Cut the remaining string. | |
remaining = remaining.slice(foundEnd + 2); | |
// Cut the scanForNextSection to move past the nested start. | |
scanForNestedSection = scanForNestedSection.slice(foundStart + 2); | |
// Scan again to see if there are more nested starts, and repeat. | |
foundStart = scanForNestedSection.search(startSection); | |
} | |
//Logger.log(section); | |
sections.push(section); | |
found = remaining.search(startEkiSection); | |
} | |
return sections; | |
} | |
function parseMeta(result) { | |
var metadata = {}; | |
if (result.query.pages) { | |
var pagesKey = Object.keys(result.query.pages); | |
var revisions = result.query.pages[pagesKey[0]]['revisions']; | |
if (!revisions) { | |
return null; | |
} | |
var revisionKeys = Object.keys(revisions); | |
var revision = revisions[revisionKeys[0]]; | |
var header = revision['*']; | |
metadata['full'] = header; | |
var latLngPattern = /\|([0-9]+)\|([0-9]+)\|([0-9\.]+)\|.\|([0-9]+)\|([0-9]+)\|([0-9\.]+)\|./; | |
var latLngMatch = header.match(latLngPattern); | |
if (latLngMatch) { | |
metadata['lat'] = latLngMatch[1] + '.' + ((latLngMatch[2]/60 * 100) + (latLngMatch[3] / 60)); | |
metadata['lon'] = latLngMatch[4] + '.' + ((latLngMatch[5]/60 * 100) + (latLngMatch[6] / 60)); | |
} | |
return metadata; | |
} | |
return null; | |
} | |
function updateLatLngForAllRows() { | |
var stationList = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Station List"); | |
var startRow = 2; | |
var listRows = stationList.getLastRow(); | |
var listRange = stationList.getRange(startRow, 1, listRows, COL_WIKINAME); | |
for (var row = 1; row < listRows; row++) { | |
updateLatLngForRow(startRow + row); | |
} | |
} | |
function updateLatLngForActiveRow() { | |
updateLatLngForRow(146); | |
} | |
function updateLatLngForRow(rowNumber) { | |
var stationList = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Station List"); | |
var RESULTS_MAX = 10; | |
var rowRange = stationList.getRange(rowNumber, 1, 1, COL_WIKISECTIONS + RESULTS_MAX); | |
var rowValues = rowRange.getValues(); | |
for (var c = COL_WIKISECTIONS - 1; c < COL_WIKISECTIONS + RESULTS_MAX; c++) { | |
var value = rowValues[0][c]; | |
var latlng = getGeoFromWikiSection(value); | |
if (latlng) { | |
var latLngRange = stationList.getRange(rowNumber, COL_LAT, 1, 2); | |
latLngRange.setValues([[latlng['lat'], latlng['lon']]]); | |
return true; | |
} | |
} | |
return false; | |
} | |
function getGeoFromWikiSection(sectionValue) { | |
// latlng pattern | |
// {{ウィキ座標度分秒 | |
// {{ウィキ座標2段度分秒 | |
var latLngPattern = new RegExp('ウィキ.*度分秒\\|([0-9]+)\\|([0-9]+)\\|([0-9\\.]+)\\|.\\|([0-9]+)\\|([0-9]+)\\|([0-9\\.]+)\\|.'); | |
var latLngMatch = sectionValue.match(latLngPattern); | |
Logger.log(latLngMatch); | |
if (latLngMatch) { | |
return { | |
'lat': parseInt(latLngMatch[1]) + parseInt(latLngMatch[2])/60 + parseFloat(latLngMatch[3])/3600, | |
'lon': parseInt(latLngMatch[4]) + parseInt(latLngMatch[5])/60 + parseFloat(latLngMatch[6])/3600 | |
}; | |
} | |
// alternate pattern | |
var latPattern = new RegExp('緯度度[\\s]*\\=[\\s]*([0-9]+)[\\s\|]+緯度分[ ]*\\=[\\s]*([0-9]+)[\\s\|]+緯度秒[\\s]*\\=[ ]*([0-9\\.]+)'); | |
var lngPattern = new RegExp('経度度[\\s]*\\=[\\s]*([0-9]+)[\\s\|]+経度分[ ]*\\=[\\s]*([0-9]+)[\\s\|]+経度秒[\\s]*\\=[ ]*([0-9\\.]+)'); | |
var latMatch = sectionValue.match(latPattern); | |
var lngMatch = sectionValue.match(lngPattern); | |
if (latMatch && lngMatch) { | |
return { | |
'lat': parseInt(latMatch[1]) + parseInt(latMatch[2])/60 + parseFloat(latMatch[3])/3600, | |
'lon': parseInt(lngMatch[1]) + parseInt(lngMatch[2])/60 + parseFloat(lngMatch[3])/3600 | |
}; | |
} | |
return null; | |
} | |
function fetchAllStationsFromWiki() { | |
var startRow = 497; | |
var onlyMissing = true; | |
var onlyOneRow = true; | |
var stationList = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Station List"); | |
var listRows = stationList.getLastRow() - startRow; | |
var listRange = stationList.getRange(startRow, 1, listRows, COL_HASDATA); | |
var hasDataValues = stationList.getRange(startRow, COL_HASDATA, listRows, 1).getValues(); | |
for (var row = 0; row < listRows; row++) { | |
// Skip rows | |
if (onlyMissing) { | |
var hasData = hasDataValues[row][0]; | |
if (hasData == 1) { | |
continue; | |
} | |
} | |
Logger.log('Fetch row ' + (startRow + row)); | |
var result = fetchDataFromWiki(listRange, row); | |
// debug | |
var resultCell = stationList.getRange(startRow + row, COL_WIKISECTIONS); | |
resultCell.setValue(result['query']); | |
if (result) { | |
var meta = parseMeta(result); | |
if (meta) { | |
var sections = parseWikiFormat(meta['full']); | |
for (var i = 0; i < sections.length; i++) { | |
var resultCell = stationList.getRange(startRow + row, COL_WIKISECTIONS + i); | |
resultCell.setValue(sections[i]); | |
} | |
// Update lat/lng if we updated the result. | |
updateLatLngForRow(startRow + row); | |
} | |
} | |
if (onlyOneRow) { | |
break; | |
} | |
} | |
} | |
var REQUEST_TYPE_GEO = 'REQUEST_TYPE_GEO'; | |
var REQUEST_TYPE_SECTIONS = 'REQUEST_TYPE_SECTIONS'; | |
function fetchDataFromWiki(range, row0, requestType) { | |
var wikiName = range.getCell(row0 + 1, COL_WIKINAME).getValue(); | |
var name = range.getCell(row0 + 1, COL_NAME).getValue(); | |
var stationName = name + '駅'; | |
if (wikiName) { | |
stationName = wikiName; | |
} | |
var geoQuery = { | |
'action': 'query', | |
'prop': 'coordinates', | |
'format': 'json', | |
'titles': stationName, | |
'coprimary': 'all' | |
}; | |
var metaQuery = { | |
'action': 'query', | |
'prop': 'revisions', | |
'rvprop': 'content', | |
'format': 'json', | |
'titles': stationName, | |
}; | |
var options = { | |
'method' : 'get', | |
'headers': { | |
'User-Agent': 'OneOff alastair@liquidx.net', | |
} | |
}; | |
Logger.log('Making request for ' + stationName); | |
var query = metaQuery; | |
if (requestType == REQUEST_TYPE_GEO) { | |
query = geoQuery; | |
} | |
var response = UrlFetchApp.fetch("https://ja.wikipedia.org/w/api.php?" + querySerialize(query), options); | |
var result = JSON.parse(response.getContentText('UTF-8')); | |
return result; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment