Skip to content

Instantly share code, notes, and snippets.

@ChlodAlejandro
Last active May 14, 2020 10:50
Show Gist options
  • Save ChlodAlejandro/35e709747bcfe107b115e13c0d61bdf2 to your computer and use it in GitHub Desktop.
Save ChlodAlejandro/35e709747bcfe107b115e13c0d61bdf2 to your computer and use it in GitHub Desktop.
Scrapes the PAGASA Severe Weather Bulletin webpage and turns it into a usable JavaScript object.
"use strict";
/**
*
* PAGASA Severe Weather Bulletin Scraper
*
* @author Chlod Alejandro <chlod@chlod.net>
* @license Apache-2.0
* @copyright Copyright 2020 Chlod Alejandro
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use
* this file except in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language governing permissions
* and limitations under the License.
*
* You can download a copy of the license here:
* https://www.apache.org/licenses/LICENSE-2.0.txt
*
* Feel free to use this however you want. But please, be respectful of PAGASA, and avoid
* requesting from their webpage every second.
*
**/
const cheerio = require("cheerio");
const axios = require("axios");
/**
 * Scraper for the PAGASA Severe Weather Bulletin page.
 *
 * After `pullBulletin()` has loaded the page, `this.$` holds the cheerio
 * instance that every other method queries.
 */
class PagasaScraper {

    /**
     * Downloads the bulletin page, loads it into cheerio (stored on `this.$`)
     * and returns the parsed bulletin.
     *
     * @returns {Promise<Object>} The object produced by `parseBulletin()`.
     */
    async pullBulletin() {
        const page = await axios.get("http://bagong.pagasa.dost.gov.ph/tropical-cyclone/severe-weather-bulletin/2");
        this.$ = cheerio.load(page.data);
        return this.parseBulletin();
    }

    /**
     * Extracts entries of the form
     * "[the] <part> <term> of [mainland] <province> (<municipality>, ...)".
     *
     * @param {string} areas Comma-separated area text.
     * @returns {{new: string, list: Object[]}} `list` holds one entry per match;
     *     `new` is `areas` with the matches removed and doubled commas collapsed.
     */
    extractSections(areas) {
        const extractionRegex = /(?:the\s)?([a-z]+)\s([a-z]+)\sof(?:\smainland)?\s((?:[\xF1\w]+|\s)+?)\s?\((.+?)\)/gi;
        const matchList = [];
        let match;
        while ((match = extractionRegex.exec(areas)) !== null) {
            // Municipalities may be separated by ", " or by a bare ",".
            const municipalities = [];
            for (const group of match[4].split(", ")) {
                for (const name of group.split(",")) {
                    // Drop a trailing "City" so only the place name remains.
                    municipalities.push(name.replace(/\s?city$/gi, ""));
                }
            }
            matchList.push({
                province: match[3],
                part: true,
                includes: {
                    part: match[1],
                    term: match[2],
                    municipalities: municipalities
                }
            });
        }
        return {
            new: areas.replace(extractionRegex, "").replace(/,{2}/g, ","),
            list: matchList
        };
    }

    /**
     * Extracts entries of the form "[the] rest of [mainland] <province>".
     *
     * @param {string} areas Comma-separated area text.
     * @returns {{new: string, list: Object[]}} `list` holds one entry per match;
     *     `new` is `areas` with the matches removed and doubled commas collapsed.
     */
    extractRests(areas) {
        const extractionRegex = /(?:the\s)?rest\sof(?:\smainland)?\s((?:[\xF1\w\s]+|\s)+?)\b/gi;
        const matchList = [];
        let match;
        while ((match = extractionRegex.exec(areas)) !== null) {
            matchList.push({
                province: match[1],
                part: true,
                includes: {
                    part: "rest",
                    term: "rest"
                }
            });
        }
        return {
            new: areas.replace(extractionRegex, "").replace(/,{2}/g, ","),
            list: matchList
        };
    }

    /**
     * Extracts every remaining run of word characters and whitespace as a
     * province that is affected as a whole.
     *
     * @param {string} areas Comma-separated area text.
     * @returns {{new: string, list: Object[]}} `list` holds one entry per match;
     *     `new` is whatever text the pattern did not consume.
     */
    extractWholes(areas) {
        const extractionRegex = /\b((?:[\xF1\w\s]+|\s)+)\b/gi;
        const matchList = [];
        let match;
        while ((match = extractionRegex.exec(areas)) !== null) {
            matchList.push({
                province: match[1],
                part: false
            });
        }
        return {
            new: areas.replace(extractionRegex, "").replace(/,{2}/g, ","),
            list: matchList
        };
    }

    /**
     * Maps each landmass listed under `areas` to the comma-joined text of its
     * nested list items.
     *
     * @param {*} areas Anything accepted by `this.$()` whose `li` children each
     *     hold a label as first child and a nested `ul` of areas.
     * @returns {Object<string, string>} Lower-cased landmass label => area text.
     */
    mapLandmasses(areas) {
        const final = {};
        this.$(areas).children("li").each((index, landmassElement) => {
            const landmass = this.$(landmassElement).children(":first-child");
            const items = this.$(landmassElement).children("ul").children("li");
            let joined = "";
            items.each((itemIndex, item) => {
                joined += this.$(item).text() + ",";
            });
            // Strip the separator(s) left over after the last item.
            final[landmass.text().toLowerCase()] = joined.replace(/,+$/g, "");
        });
        return final;
    }

    /**
     * Parses the affected-areas element into structured entries per landmass.
     *
     * @param {*} areasElement Element handed on to `mapLandmasses()`.
     * @returns {Object} Landmass => array of whole, "rest of" and section
     *     entries (in that order). Text that none of the extractors consumed is
     *     reported under `extras[landmass]`.
     */
    parseAffectedAreas(areasElement) {
        const landmasses = this.mapLandmasses(areasElement);
        const finalAffectedAreas = {};
        for (const [landmass, rawAreas] of Object.entries(landmasses)) {
            // Turn ", and " into a plain separator; drop double spaces and a final period.
            let areas = rawAreas.replace(/,\s?and\s/g, ",").replace(/(\s{2}|\.$)/g, "");

            // Each extractor consumes what it recognises and passes on the remainder.
            const sections = this.extractSections(areas);
            areas = sections.new;
            const rests = this.extractRests(areas);
            areas = rests.new;
            const wholes = this.extractWholes(areas);
            // The extractors return the remainder under "new"; reading any other
            // key yields undefined and makes every landmass look like it has leftovers.
            areas = wholes.new;

            finalAffectedAreas[landmass] = [...wholes.list, ...rests.list, ...sections.list];

            // Anything besides separators at this point could not be parsed.
            if (/[^,. ]/gi.test(areas)) {
                if (finalAffectedAreas["extras"] === undefined) {
                    finalAffectedAreas["extras"] = {};
                }
                finalAffectedAreas["extras"][landmass] = areas;
            }
        }
        return finalAffectedAreas;
    }

    /**
     * Converts the raw HTML of each signal table row into usable data.
     *
     * @param {Object<string, string>} rawRows Row key => cell HTML.
     * @returns {Object} Row key => parsed value: `affected_areas` is parsed by
     *     `parseAffectedAreas()`, every other row becomes an array holding the
     *     trimmed text of its `li` children.
     */
    parseTCWSRows(rawRows) {
        const finalRows = {};
        for (const [key, html] of Object.entries(rawRows)) {
            const element = this.$(html);
            if (key === "affected_areas") {
                finalRows[key] = this.parseAffectedAreas(element);
            } else {
                const items = [];
                this.$(element.children("li")).each((index, item) => {
                    items.push(this.$(item).text().trim());
                });
                finalRows[key] = items;
            }
        }
        return finalRows;
    }

    /**
     * Reads the table belonging to one tropical cyclone wind signal level.
     *
     * @param {number} level Signal number, used to build the `.signalno<level>` selector.
     * @returns {?Object} Parsed rows, or `null` when the page has no element
     *     for that level.
     */
    grabTCWSLevel(level) {
        // Get signal table from DOM
        const indicator = this.$(`.signalno${level}`);
        if (indicator == null || indicator.length === 0) {
            return null;
        }
        const header = indicator.parent().parent();
        const body = header.next();

        // Parse table rows into object
        const rawRows = {};
        body.children("tr").each((index, row) => {
            const title = this.$(row).children("td:first-child");
            const content = this.$(row).children("td:not(:first-child)");
            if (content.length > 1) {
                // NOTE(review): the merged HTML is seeded with the first child's
                // HTML before every cell is appended, as the original did —
                // confirm this matches the intended markup.
                let mergedHTML = this.$(content).children(":first-child").html() || "";
                content.each((cellIndex, cell) => {
                    // .each() hands over raw DOM nodes, which have no .html();
                    // they must be wrapped before their HTML can be read.
                    mergedHTML += this.$(cell).html() || "";
                });
                content.html(mergedHTML);
            }
            const key = title.text().trim().toLowerCase().replace(/\s/g, "_");
            rawRows[key] = content.html();
        });
        return this.parseTCWSRows(rawRows);
    }

    /**
     * Reads the cyclone name, centre coordinates and movement from the page.
     *
     * @returns {{name: string, center: {lat: number, lon: number}, movement: string}}
     *     Coordinates are always numbers: positive for N/E, negative for S/W.
     * @throws {Error} When the centre coordinates or the quoted name are missing.
     */
    extractTyphoonDetails() {
        const centerText = this.$(".panel-heading:contains('Location of Eye/center') + .panel-body").text();
        const centerInfo = /\(([0-9.]+).+?([NS]).+?([0-9.]+).+?([EW]).*?\)\s*$/gi.exec(centerText);
        if (centerInfo === null) {
            throw new Error("Could not find the cyclone center coordinates in the bulletin.");
        }
        const nameMatch = /\"(.+)\"/g.exec(this.$("#tcwb-1 h3").text());
        if (nameMatch === null) {
            throw new Error("Could not find the cyclone name in the bulletin.");
        }

        // Convert explicitly so both hemispheres yield numbers, and normalise
        // the case because the pattern above matches case-insensitively.
        const latitude = Number(centerInfo[1]);
        const longitude = Number(centerInfo[3]);
        return {
            name: nameMatch[1],
            center: {
                lat: centerInfo[2].toUpperCase() === "N" ? latitude : -latitude,
                lon: centerInfo[4].toUpperCase() === "E" ? longitude : -longitude
            },
            movement: this.$(".panel-heading:contains('Movement') + .panel-body").text().trim()
        };
    }

    /**
     * Reads the issue time and the summary paragraph of the bulletin.
     *
     * @param {{name: string}} typhoonDetails Result of `extractTyphoonDetails()`.
     * @returns {{issued_timestamp: number, issued: Date, summary: string}}
     *     `issued_timestamp` is whatever `Date.parse` makes of the
     *     "Issued at ..." line (NaN when that line is missing).
     */
    extractBulletinDetails(typhoonDetails) {
        const issuedText = this.$(":contains('Issued at')").filter((index, element) => {
            return /Issued at [0-9]+:[0-9]+\s?[apm]+,?\s?[0-9]+\s[a-z]+\s[0-9]+/gi.test(this.$(element).text());
        }).text();
        const issuedMatch = /Issued at .+/gi.exec(issuedText);
        // NOTE(review): the non-ISO string is parsed as-is, "Issued at" prefix
        // included; this relies on the engine's lenient date parsing.
        const issued = issuedMatch === null ? NaN : Date.parse(issuedMatch[0]);
        const quotedName = typhoonDetails.name.toUpperCase();
        return {
            issued_timestamp: issued,
            issued: new Date(issued),
            summary: this.$(`.row h5:contains('"${quotedName}"')`).text()
        };
    }

    /**
     * Assembles the complete bulletin from the page loaded into `this.$`.
     *
     * @returns {{typhoon: Object, bulletin: Object, storm_signals: Object}}
     *     `storm_signals` has one entry per signal level 1-5, `null` for levels
     *     not present on the page.
     */
    parseBulletin() {
        const typhoonDetails = this.extractTyphoonDetails();
        return {
            typhoon: typhoonDetails,
            bulletin: this.extractBulletinDetails(typhoonDetails),
            storm_signals: {
                1: this.grabTCWSLevel(1),
                2: this.grabTCWSLevel(2),
                3: this.grabTCWSLevel(3),
                4: this.grabTCWSLevel(4),
                5: this.grabTCWSLevel(5)
            }
        };
    }
}
// Comment this line if you're using it as a module.
// Script entry point: pull the bulletin once and pretty-print it.
// Failures (network errors, unexpected page layout) are reported explicitly
// instead of surfacing as an unhandled promise rejection.
(async () => {
    try {
        const bulletin = await (new PagasaScraper()).pullBulletin();
        console.log(require("util").inspect(bulletin, undefined, 7, true));
    } catch (error) {
        console.error(error);
        process.exitCode = 1;
    }
})();
// Uncomment this line if you're using it as a module.
//module.exports = PagasaScraper;
@ChlodAlejandro
Copy link
Author

Requires the packages cheerio and axios.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment