Skip to content

Instantly share code, notes, and snippets.

@hacknightly
Last active May 23, 2017 02:29
Show Gist options
  • Save hacknightly/c4733c4f081bc66f3111518eecbe3ebc to your computer and use it in GitHub Desktop.
Save hacknightly/c4733c4f081bc66f3111518eecbe3ebc to your computer and use it in GitHub Desktop.
Wikipedia Page Downloader
import * as https from "../lib/https";
import * as http from "http";
import * as htmlparser from "htmlparser2";
import { HTML, DownloaderResult } from "./types";
/**
 * Fetches a page over HTTPS and reduces its HTML to a small subset of
 * elements, grouped into `<section>` blocks that are split at every `<h2>`.
 *
 * Collection is switched on by any tag listed in `whitelist`, paused after
 * each closing `</p>`, and switched off for good once an element whose id is
 * `External_links` is seen. Tags listed in `blacklist` are never emitted.
 *
 * A single instance can be reused for several pages, but only one `parse`
 * call may be in flight at a time because the parser and buffer are shared.
 */
export default class Downloader {
  parser: htmlparser.Parser;
  collecting: boolean;
  done: boolean;
  buffer: string[];
  whitelist: string[];
  blacklist: string[];
  /** True while the parser is inside `<script>`/`<style>`, whose text must not be escaped. */
  private insideRawText: boolean;

  constructor() {
    this.collecting = false;
    this.done = false;
    this.buffer = [];
    this.insideRawText = false;
    this.whitelist = [`h2`, `h3`, `h4`, `p`, `ul`, `ol`, `table`, `audio`];
    this.blacklist = [`br`];
    this.parser = new htmlparser.Parser(
      {
        onopentag: this.handleOpenTag.bind(this),
        ontext: this.handleText.bind(this),
        onclosetag: this.handleCloseTag.bind(this),
      },
      { decodeEntities: true }
    );
  }

  /**
   * Requests `url` and resolves with the response stream.
   *
   * @param url - Address of the page to download.
   * @returns The response, ready to be handed to {@link Downloader.parse}.
   * @throws Whatever the underlying request rejects with. The error is logged
   *   and rethrown so callers never receive `undefined` in place of a response.
   */
  async get(url: string): Promise<http.IncomingMessage> {
    try {
      return await https.get(url);
    } catch (err) {
      console.error(`Downloader.get failed for ${url}:`, err);
      throw err;
    }
  }

  /**
   * Streams the response body through the HTML parser and resolves with the
   * collected markup once the stream ends.
   *
   * @param res - Response stream whose body is the HTML document.
   * @returns The collected HTML and the elapsed time in seconds (the value
   *   wraps at 60, as it always has).
   * @throws Rejects with the stream's error if the response emits `error`.
   */
  parse(res: http.IncomingMessage): Promise<DownloaderResult> {
    const start = Date.now();

    // Start from a clean slate so a reused instance does not append to the
    // previous page's output or stay stuck in the `done` state.
    this.collecting = false;
    this.done = false;
    this.insideRawText = false;
    this.buffer = [`<section>`];
    this.parser.reset();

    return new Promise((resolve, reject) => {
      // Let the stream decode UTF-8 itself: decoding each chunk on its own
      // corrupts multi-byte characters that straddle a chunk boundary.
      res.setEncoding(`utf8`);
      res.on(`data`, (chunk: string | Buffer) => {
        this.parser.write(typeof chunk === `string` ? chunk : chunk.toString(`utf8`));
      });
      // Without this handler a failed download would leave the promise pending forever.
      res.on(`error`, reject);
      res.on(`end`, () => {
        // Signal end of input so the parser can flush whatever it still holds.
        this.parser.end();
        // Exactly one <section> is open at any time; close it to keep the output balanced.
        this.buffer.push(`</section>`);
        resolve({
          html: this.buffer.join(""),
          elapsed: ((Date.now() - start) / 1000) % 60,
        });
      });
    });
  }

  /** Whether parser events should currently be written to the buffer. */
  private isCollecting(): boolean {
    return this.collecting && !this.done;
  }

  /** Suspends collection until the next whitelisted opening tag. */
  private pauseCollecting() {
    this.collecting = false;
  }

  /** Enables collection (has no visible effect once `done` is set). */
  private startCollecting() {
    this.collecting = true;
  }

  /** Ends collection permanently for the current document. */
  private stopCollecting() {
    this.collecting = false;
    this.done = true;
  }

  /** Elements whose text content is raw (not entity-encoded) in HTML. */
  private isRawTextTag(name: string): boolean {
    return name === `script` || name === `style`;
  }

  /** Escapes the characters that would otherwise be read as markup in text content. */
  private escapeText(text: string): string {
    return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
  }

  /** Renders one `name="value"` pair with the value safely quoted. */
  private formatAttribute(name: string, value: string): string {
    return `${name}="${this.escapeText(value).replace(/"/g, "&quot;")}"`;
  }

  /**
   * Makes a `src` value absolute: protocol-relative URLs (`//host/path`) get
   * an `https:` scheme, `http(s):` and `data:` URLs are kept unchanged, and
   * anything else is treated as a scheme-less host/path and prefixed with `https://`.
   */
  private toAbsoluteSrc(src: string): string {
    if (src.startsWith("//")) {
      return `https:${src}`;
    }
    if (/^(https?:\/\/|data:)/i.test(src)) {
      return src;
    }
    return `https://${src}`;
  }

  /**
   * Parser callback for an opening tag: updates the collection state and, when
   * collecting, writes the tag with its `id`, selected `class`, `href` and
   * `src` attributes.
   */
  private handleOpenTag(name: string, attrs: Record<string, string | undefined>) {
    if (this.isRawTextTag(name)) {
      this.insideRawText = true;
    }
    if (this.whitelist.includes(name)) {
      this.startCollecting();
    }
    if (this.blacklist.includes(name)) {
      return;
    }
    if (attrs.id === "External_links") {
      this.stopCollecting();
    }
    if (!this.isCollecting()) {
      return;
    }

    // Build the tag as separate, quoted parts; gluing them together without
    // separators produced output such as `id=xclass=infoboxhref=/wiki/...`.
    const parts: string[] = [name];
    const { id, href, src } = attrs;
    const klass = attrs.class;
    if (typeof id === "string") {
      parts.push(this.formatAttribute("id", id));
    }
    // Only infobox-related classes and the exact class `sortkey` are kept.
    if (typeof klass === "string" && (klass.includes("infobox") || klass === "sortkey")) {
      parts.push(this.formatAttribute("class", klass));
    }
    if (typeof href === "string") {
      parts.push(this.formatAttribute("href", href));
    }
    if (typeof src === "string") {
      parts.push(this.formatAttribute("src", this.toAbsoluteSrc(src)));
    }

    // Every <h2> starts a new section.
    if (name === "h2") {
      this.buffer.push(`</section>`);
      this.buffer.push(`<section>`);
    }
    this.buffer.push(`<${parts.join(" ")}>`);
  }

  /**
   * Parser callback for a text node. The parser is configured with
   * `decodeEntities: true`, so text is re-escaped before it is written back
   * out as HTML; `<script>`/`<style>` content is passed through verbatim.
   */
  private handleText(text: string) {
    if (this.isCollecting()) {
      this.buffer.push(this.insideRawText ? text : this.escapeText(text));
    }
  }

  /**
   * Parser callback for a closing tag: writes the closing tag when collecting
   * and pauses collection after every `</p>`.
   */
  private handleCloseTag(name: string) {
    if (this.isRawTextTag(name)) {
      this.insideRawText = false;
    }
    if (this.blacklist.includes(name)) {
      return;
    }
    if (this.isCollecting()) {
      this.buffer.push(`</${name}>`);
    }
    if (name === `p`) {
      this.pauseCollecting();
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment