Skip to content

Instantly share code, notes, and snippets.

@lezsakdomi
Last active January 23, 2022 22:01
Show Gist options
  • Save lezsakdomi/eb98d654a9dec278cddd4f721da9809e to your computer and use it in GitHub Desktop.
Save lezsakdomi/eb98d654a9dec278cddd4f721da9809e to your computer and use it in GitHub Desktop.
Neptun scraper
import fs from 'fs';
import repl from 'repl';
import 'dotenv/config';
import FileCookieStore from 'tough-cookie-filestore';
import {JSDOM, CookieJar} from 'jsdom';
import got from 'got';
import jsdomDevtoolsFormatter from 'jsdom-devtools-formatter';
// Install the custom devtools formatter for jsdom objects (helps when debugging with an inspector).
jsdomDevtoolsFormatter.install();

// File that persists cookies between runs so a logged-in session can be reused.
const cookieFile = 'cookies.json';

// NOTE(review): per the tough-cookie-filestore README the store file must
// already exist when the store is constructed — confirm for the pinned
// version. Creating an empty JSON object up front makes a first run work.
if (!fs.existsSync(cookieFile)) {
  fs.writeFileSync(cookieFile, '{}');
}

// Cookie jar shared by every JSDOM page load and by the `got` POST requests.
const cookieJar = new CookieJar(new FileCookieStore(cookieFile));

// Options applied to every JSDOM instance created by this script.
const options = {
  cookieJar,
  // Page scripts are executed; only use against sites you trust.
  runScripts: 'dangerously',
  // Let jsdom load subresources (scripts, stylesheets, ...) referenced by the page.
  resources: 'usable',
};

// Property-descriptor map of values collected for interactive inspection.
// The form `submit` handler in addNavigation adds `fd` and `form` entries.
const properties = {
  JSDOM: {
    writable: true,
    value: JSDOM,
  },
};
/**
 * Instruments a loaded JSDOM page so that form submissions, link clicks and
 * assignments to `window.location` load the target page into a new JSDOM
 * instance (jsdom itself does not implement navigation).
 *
 * Every newly loaded page is closed over the previous one: the old window is
 * closed and the new page is passed through `addNavigation` again, so the
 * hooks persist across navigations.
 *
 * @param {JSDOM} dom - The page to instrument.
 * @param {(dom: JSDOM) => void} cb - Called synchronously with `dom` once the
 *   hooks are installed, and again with each page reached by navigation.
 */
function addNavigation(dom, cb) {
  // Resolves a possibly relative URL against the current page location.
  function resolveUrl(url) {
    return (new dom.window.URL(url, dom.window.location)).toString();
  }

  // Navigation runs detached from the caller, so failures (network errors, or
  // exceptions thrown while handling the new page) are logged here instead of
  // becoming unhandled promise rejections.
  function reportNavigationError(method, url, err) {
    console.error(`Error while navigating (${method} ${url}):`, err);
  }

  // Loads `url` with a GET request and replaces the current page with it.
  function navigateGet(url) {
    const target = resolveUrl(url);
    JSDOM.fromURL(target, {
      referrer: dom.window.location.href,
      ...options,
    }).then(newDom => {
      dom.window.close();
      addNavigation(newDom, cb);
    }).catch(err => reportNavigationError('GET', target, err));
  }

  // Sends `data` as a URL-encoded form POST and replaces the current page
  // with the response document.
  function navigatePost(url, data) {
    const target = resolveUrl(url);
    got({
      url: target,
      method: 'post',
      form: data,
      cookieJar,
      // simply specifying 'buffer' here didn't work :(
      responseType: 'text',
      headers: {
        // Neptun does not work without this user-agent, it doesn't include MicrosoftAjaxWebForms.js
        'User-Agent': "Mozilla/5.0 (X11; Linux x86_64; rv:96.0) Gecko/20100101 Firefox/96.0",
      },
    }).then(response => {
      // Keep a copy of the raw POST response on disk for debugging.
      fs.writeFileSync('spage.html', response.body);
      const referrer = dom.window.location.href;
      dom.window.close();
      addNavigation(new JSDOM(response.body, {
        url: response.url,
        referrer,
        ...options,
      }), cb);
    }).catch(err => reportNavigationError('POST', target, err));
  }

  console.log(`+ ${dom.window.location.href}`);

  // Intercept form submissions and perform them ourselves.
  dom.window.document.querySelectorAll('form').forEach(form => {
    form.addEventListener('submit', (e) => {
      console.log(`Form submission detected: ${dom.window.location.href} -> ${form.method.toUpperCase()} ${form.action}`, form);
      // Cancel the built-in submission first, so it is suppressed even when
      // the method turns out to be unsupported below.
      e.preventDefault();
      const fd = new dom.window.FormData(form);
      // Remember the last submission for interactive inspection.
      properties.fd = {value: fd};
      properties.form = {value: form};
      switch (form.method) {
        case 'post':
          // For repeated field names the last value wins.
          navigatePost(form.action, Object.fromEntries(fd.entries()));
          break;
        default:
          throw new Error("Not implemented form method " + JSON.stringify(form.method));
      }
    });
  });

  // Intercept link clicks; anchors without an href keep their default behaviour.
  dom.window.document.querySelectorAll('a').forEach(element => {
    element.addEventListener('click', (event) => {
      if (element.href) {
        navigateGet(element.href);
        event.preventDefault();
      }
    });
  });

  // Replace the `location` accessor so that `window.location = "..."` triggers
  // a GET navigation. Reads are forwarded to the original getter.
  const originalLocationPropertyDescriptor = Object.getOwnPropertyDescriptor(dom.window, 'location');
  Object.defineProperty(dom.window, 'location', {
    get() {
      return originalLocationPropertyDescriptor.get.call(dom.window);
    },
    set(newLocation) {
      // Only string assignments are handled; anything else is ignored.
      if (typeof newLocation === 'string') {
        navigateGet(newLocation);
      }
    },
  });

  cb(dom);
}
// Most recently loaded page; undefined until the first page callback has run.
let _dom;

/**
 * Defines a getter-only `dom` property on `context` that always yields the
 * current value of `_dom`, so consumers see the latest page after navigation.
 *
 * @param {object} context - Object to receive the `dom` accessor.
 */
function initializeContext(context) {
  Object.defineProperties(context, {
    dom: {
      get() {
        return _dom;
      },
    },
  });
}
// Set to true to submit the portal's logout form on the landing page instead
// of following the login / "go to Neptun" links.
let logout = false;

// Alternative entry point: https://neptun.elte.hu/ToNeptunWeb/ToNeptunHWeb
JSDOM.fromURL("https://neptun.elte.hu/", options)
  .then(initialDom => addNavigation(initialDom, dom => {
    // Runs for the first page and for every page reached through navigation.
    _dom = dom;
    const {document} = dom.window;
    const href = dom.window.location.href;
    console.log("@ " + href);
    if (document.querySelector('base') === null) {
      // only for debugging purposes
      console.log(` Adding <base> tag`);
      document.head.appendChild(document.createElement('base')).href = href;
    }
    // Snapshot of the current page for debugging.
    fs.writeFileSync('page.html', dom.serialize());
    try {
      switch (href) {
        case 'https://neptun.elte.hu/Account/Login':
          // Without this check the literal string "undefined" would be
          // submitted as the login name / password.
          if (!process.env.NEPTUN || !process.env.NEPTUN_PASSWORD) {
            throw new Error('NEPTUN and NEPTUN_PASSWORD environment variables must be set to log in');
          }
          document.querySelector('input[name="ReturnUrl"]').value = document.referrer;
          document.querySelector('input[name="LoginName"]').value = process.env.NEPTUN;
          document.querySelector('input[name="Password"]').value = process.env.NEPTUN_PASSWORD;
          document.querySelector('input[type="submit"]').click();
          break;
        case 'https://neptun.elte.hu/ToNeptunWeb/ToNeptunHWeb':
          // we'll be transferred automatically
          break;
        case 'https://neptun.elte.hu/':
          if (logout) {
            // requestSubmit() fires the `submit` event, so the handler that
            // addNavigation installed performs the request; submit() would
            // bypass it.
            document.querySelector('#logoutForm').requestSubmit();
          } else {
            const login = document.querySelector('a[href="/Account/Login"]');
            const go = document.querySelector('a[href="/ToNeptunWeb/ToNeptunHWeb"]');
            if (go) go.click();
            else if (login) login.click();
            else console.error("Neither login or go link");
          }
          break;
        default:
          // Match on the host name only, with the dots escaped, so unrelated
          // URLs that merely contain a similar string are not treated as Neptun.
          if (/^hallgato\d+\.neptun\.elte\.hu$/.test(dom.window.location.hostname)) {
            // Redefine `menubar` as a plain writable property.
            // NOTE(review): presumably Neptun's scripts assign to it — confirm.
            Object.defineProperty(dom.window, 'menubar', {
              writable: true,
            });
          } else {
            console.log('Unexpected location, navigation finished');
          }
      }
    } catch (e) {
      console.error(e);
      debugger;
    }
  }))
  .catch(err => {
    // The initial page load failed, or the first page handler threw outside its try block.
    console.error(err);
  });
// Interactive prompt for inspecting the current page through the `dom` accessor.
const r = repl.start('> ');
// A REPL reset creates a fresh context, so the accessor has to be re-installed.
r.on('reset', initializeContext);
r.on('exit', () => {
  // The user may quit before the first page has loaded, in which case there
  // is no window to close yet.
  if (_dom) {
    _dom.window.close();
  }
});
initializeContext(r.context);
// Also expose `dom` on the Node global object.
initializeContext(global);
{
"name": "neptun-scraper",
"type": "module",
"version": "0.1.0",
"description": "Exports data from Neptun using a lightweight headless browser (just JSDOM, no full WebKit)",
"scripts": {
"run": "node jsdom.js"
},
"author": "Domonkos Lezsák",
"license": "TBD-something-GPLish",
"dependencies": {
"dotenv": "^14.2.0",
"got": "^12.0.1",
"jsdom": "^19.0.0",
"node-fetch": "^3.2.0",
"tough-cookie-filestore": "^0.0.1",
"zombie": "^6.1.4"
}
}
// Zombie-based variant of the scraper.
// NOTE(review): this script uses CommonJS `require`, while the package manifest
// shown alongside it declares "type": "module" — confirm it is loaded as
// CommonJS (e.g. via a .cjs extension), otherwise `require` is not defined.
const Browser = require('zombie');
// Single browser instance shared by the whole session.
const browser = new Browser();
const repl = require('repl');
// Load environment variables from a .env file; main() reads NEPTUN and NEPTUN_PASSWORD.
require('dotenv').config()
/**
 * Logs in to the Neptun portal with the credentials from the environment,
 * opens the Neptun web client, dumps the browser state and then hands control
 * to an interactive REPL. Resolves when the REPL exits.
 */
async function main() {
  // Copies the browser's own enumerable properties, plus the browser itself,
  // onto the REPL's current context.
  const publishBrowser = (server) => {
    Object.assign(server.context, {
      ...browser,
      browser,
    });
  };

  console.log("Navigating to login form");
  await browser.visit("https://neptun.elte.hu/Account/Login");
  await browser.fill('LoginName', process.env.NEPTUN);
  await browser.fill('Password', process.env.NEPTUN_PASSWORD);

  console.log("Logging in");
  // The form is submitted directly rather than by pressing its submit button.
  const loginForm = browser.querySelector('form[action="/Account/Login"]');
  loginForm.submit();
  await browser.wait();

  console.log("Opening Neptun");
  // Redefine `menubar` as a plain writable property on every window opened from here on.
  browser.on('opened', (openedWindow) => {
    Object.defineProperty(openedWindow, 'menubar', {
      writable: true,
    });
  });
  try {
    await browser.visit("https://neptun.elte.hu/ToNeptunWeb/ToNeptunHWeb");
  } catch (visitError) {
    // Errors while loading the client are logged; the session continues regardless.
    console.error(visitError);
  }

  browser.dump();

  const r = repl.start('> ');
  publishBrowser(r);
  // A reset replaces the context, so publish the browser again.
  r.on('reset', () => {
    publishBrowser(r);
  });

  // Stay alive until the user leaves the REPL.
  await new Promise((done) => {
    r.on('exit', done);
  });
}
// Run the session and terminate the process explicitly once it settles:
// default exit status on success, status 1 after logging the error on failure.
main().then(
  () => {
    process.exit();
  },
  (failure) => {
    console.error(failure);
    process.exit(1);
  },
);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment