Last active
January 23, 2022 22:01
-
-
Save lezsakdomi/eb98d654a9dec278cddd4f721da9809e to your computer and use it in GitHub Desktop.
Neptun scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import fs from 'fs'; | |
import repl from 'repl'; | |
import 'dotenv/config'; | |
import FileCookieStore from 'tough-cookie-filestore'; | |
import {JSDOM, CookieJar} from 'jsdom'; | |
import got from 'got'; | |
import jsdomDevtoolsFormatter from 'jsdom-devtools-formatter'; | |
jsdomDevtoolsFormatter.install(); | |
// Cookie jar backed by the `cookies.json` file store; the same jar is handed
// to both JSDOM (through `options`) and to the manual POST requests.
const cookieStore = new FileCookieStore('cookies.json');
const cookieJar = new CookieJar(cookieStore);

// Options spread into every JSDOM instance created by this script.
const options = {
  cookieJar,
  // NOTE(review): 'dangerously' lets page scripts run inside this Node
  // process; only point this scraper at sites you trust.
  runScripts: 'dangerously',
  resources: 'usable',
};

// Property-descriptor map; the form submit hook adds `fd` and `form` entries
// to it at runtime.
const properties = {
  JSDOM: { writable: true, value: JSDOM },
};
/**
 * Emulates browser navigation on top of a loaded JSDOM page.
 *
 * Hooks form submissions, anchor clicks and string assignments to
 * `window.location`; each of them loads the target into a brand-new JSDOM
 * instance, closes the old window and re-installs the same hooks on the new
 * page. `cb` is invoked synchronously with `dom` once the hooks are in place,
 * and again with every page that is subsequently navigated to.
 *
 * Navigation failures (network errors, invalid URLs, errors thrown by `cb`
 * for the new page) are logged with `console.error` instead of surfacing as
 * unhandled promise rejections.
 *
 * @param {JSDOM} dom - The page to instrument.
 * @param {(dom: JSDOM) => void} cb - Called for this page and for every page reached from it.
 */
function addNavigation(dom, cb) {
  const { window } = dom;
  const { document } = window;

  /** Resolves a possibly relative URL against the current page location. */
  function resolveUrl(url) {
    return new window.URL(url, window.location).toString();
  }

  /**
   * Loads `url` with a GET request and replaces the current page with it.
   * Never rejects: failures are logged, so callers may fire and forget.
   */
  async function navigateGet(url) {
    try {
      const newDom = await JSDOM.fromURL(resolveUrl(url), {
        referrer: window.location.href,
        ...options,
      });
      window.close();
      addNavigation(newDom, cb);
    } catch (error) {
      console.error(`GET navigation to ${url} failed:`, error);
    }
  }

  /**
   * Posts `data` as a URL-encoded form to `url` and replaces the current page
   * with the response. Never rejects: failures are logged.
   */
  async function navigatePost(url, data) {
    try {
      const response = await got({
        url: resolveUrl(url),
        method: 'post',
        form: data,
        cookieJar,
        // simply specifying 'buffer' here didn't work :(
        responseType: 'text',
        headers: {
          // Neptun does not work without this user-agent, it doesn't include MicrosoftAjaxWebForms.js
          'User-Agent': "Mozilla/5.0 (X11; Linux x86_64; rv:96.0) Gecko/20100101 Firefox/96.0",
        },
      });
      // Debugging aid: keep the raw response of the last POST on disk.
      fs.writeFileSync('spage.html', response.body);
      const referrer = window.location.href;
      window.close();
      addNavigation(new JSDOM(response.body, {
        url: response.url,
        referrer,
        ...options,
      }), cb);
    } catch (error) {
      console.error(`POST navigation to ${url} failed:`, error);
    }
  }

  console.log(`+ ${window.location.href}`);

  for (const form of document.querySelectorAll('form')) {
    form.addEventListener('submit', (event) => {
      // Cancel the built-in submission first, so it stays cancelled even when
      // the method turns out to be unsupported and we throw below.
      event.preventDefault();
      console.log(`Form submission detected: ${window.location.href} -> ${form.method.toUpperCase()} ${form.action}`, form);
      const fd = new window.FormData(form);
      properties.fd = { value: fd };
      properties.form = { value: form };
      if (form.method !== 'post') {
        throw new Error("Not implemented form method " + JSON.stringify(form.method));
      }
      // A plain object keeps only the last value of a repeated field name.
      navigatePost(form.action, Object.fromEntries(fd.entries()));
    });
  }

  for (const anchor of document.querySelectorAll('a')) {
    anchor.addEventListener('click', (event) => {
      if (!anchor.href) {
        return;
      }
      navigateGet(anchor.href);
      event.preventDefault();
    });
  }

  // Intercept `window.location = "..."` so that script-driven redirects go
  // through navigateGet; reads are delegated to the original accessor.
  const originalLocationPropertyDescriptor = Object.getOwnPropertyDescriptor(window, 'location');
  Object.defineProperty(window, 'location', {
    get() {
      return originalLocationPropertyDescriptor.get.call(window);
    },
    set(newLocation) {
      // Only string assignments are treated as navigation requests.
      if (typeof newLocation === 'string') {
        navigateGet(newLocation);
      }
    },
  });

  cb(dom);
}
// Most recently loaded page; reassigned by the navigation callback.
let _dom;

/**
 * Exposes the current page as a live, read-only `dom` property on `context`.
 *
 * The accessor is configurable so that calling this again on an object that
 * already carries `dom` (for example a REPL context re-created from the
 * globals on reset) replaces the accessor instead of throwing
 * "TypeError: Cannot redefine property".
 *
 * @param {object} context - Object to decorate (REPL context or global object).
 */
const initializeContext = (context) => {
  Object.defineProperty(context, 'dom', {
    configurable: true,
    get() {
      return _dom;
    },
  });
};
// Flip to true to sign out from the landing page instead of entering Neptun.
const logout = false;

// Hostname of the student web front-ends: "hallgato" + digits under
// neptun.elte.hu. Anchored and with escaped dots so look-alike hosts or URLs
// that merely contain the text do not match.
const STUDENT_HOST_PATTERN = /^hallgato\d+\.neptun\.elte\.hu$/;

/**
 * Per-page driver, invoked for every page the emulated browser lands on.
 * Records the page in `_dom`, dumps it to `page.html`, then performs the
 * action appropriate for the current URL (log in, follow the entry link, ...).
 * Errors raised while acting on a page are logged and do not propagate.
 *
 * @param {JSDOM} dom - The page that has just been loaded.
 */
function handlePage(dom) {
  const { window } = dom;
  const { document } = window;

  _dom = dom;
  console.log("@ " + window.location.href);
  if (document.querySelector('base') === null) {
    // only for debugging purposes
    console.log(` Adding <base> tag`);
    document.head.appendChild(document.createElement('base')).href = window.location.href;
  }
  fs.writeFileSync('page.html', dom.serialize());

  try {
    switch (window.location.href) {
      case 'https://neptun.elte.hu/Account/Login': {
        const { NEPTUN: loginName, NEPTUN_PASSWORD: password } = process.env;
        if (!loginName || !password) {
          // Otherwise the literal string "undefined" would be submitted.
          throw new Error('NEPTUN and NEPTUN_PASSWORD must be set in the environment (e.g. via .env)');
        }
        document.querySelector('input[name="ReturnUrl"]').value = document.referrer;
        document.querySelector('input[name="LoginName"]').value = loginName;
        document.querySelector('input[name="Password"]').value = password;
        document.querySelector('input[type="submit"]').click();
        break;
      }
      case 'https://neptun.elte.hu/ToNeptunWeb/ToNeptunHWeb':
        // we'll be transferred automatically
        break;
      case 'https://neptun.elte.hu/':
        if (logout) {
          // requestSubmit() dispatches a `submit` event, which is what the
          // navigation hook listens for; plain submit() fires no event.
          document.querySelector('#logoutForm').requestSubmit();
        } else {
          const login = document.querySelector('a[href="/Account/Login"]');
          const go = document.querySelector('a[href="/ToNeptunWeb/ToNeptunHWeb"]');
          if (go) {
            go.click();
          } else if (login) {
            login.click();
          } else {
            console.error("Neither login or go link");
          }
        }
        break;
      default:
        if (STUDENT_HOST_PATTERN.test(window.location.hostname)) {
          // NOTE(review): redefines `menubar` as writable, presumably because
          // the Neptun page scripts assign to it; confirm.
          Object.defineProperty(window, 'menubar', {
            writable: true,
          });
        } else {
          console.log('Unexpected location, navigation finished');
        }
    }
  } catch (e) {
    console.error(e);
    debugger;
  }
}

//JSDOM.fromURL("https://neptun.elte.hu/ToNeptunWeb/ToNeptunHWeb", options)
JSDOM.fromURL("https://neptun.elte.hu/", options)
  .then((dom) => addNavigation(dom, handlePage))
  .catch((error) => {
    // Without this, a failed first load is an unhandled promise rejection.
    console.error('Initial navigation failed:', error);
  });

// Interactive shell for poking at the current page through the `dom` getter.
const r = repl.start('> ');
r.on('reset', initializeContext);
r.on('exit', () => {
  // The REPL can be closed before any page has finished loading.
  _dom?.window.close();
});
initializeContext(r.context);
initializeContext(global);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "neptun-scraper", | |
"type": "module", | |
"version": "0.1.0", | |
"description": "Exports data from Neptun using a lightweight headless browser (just JSDOM, no full WebKit)", | |
"scripts": { | |
"run": "node jsdom.js" | |
}, | |
"author": "Domonkos Lezsák", | |
"license": "TBD-something-GPLish", | |
"dependencies": { | |
"dotenv": "^14.2.0", | |
"got": "^12.0.1", | |
"jsdom": "^19.0.0", | |
"node-fetch": "^3.2.0", | |
"tough-cookie-filestore": "^0.0.1", | |
"zombie": "^6.1.4" | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const Browser = require('zombie'); | |
const browser = new Browser(); | |
const repl = require('repl'); | |
require('dotenv').config() | |
/**
 * Logs in to Neptun with the Zombie browser, opens the student web and then
 * hands control to an interactive REPL that has `browser` (and its own
 * enumerable properties) in scope.
 *
 * Reads the credentials from the NEPTUN and NEPTUN_PASSWORD environment
 * variables.
 *
 * @returns {Promise<void>} Resolves when the REPL exits.
 * @throws {Error} If either credential variable is missing or empty.
 */
async function main() {
  const { NEPTUN: loginName, NEPTUN_PASSWORD: password } = process.env;
  if (!loginName || !password) {
    // Fail before touching the network rather than typing "undefined" into the form.
    throw new Error('NEPTUN and NEPTUN_PASSWORD must be set in the environment (e.g. via .env)');
  }

  console.log("Navigating to login form");
  await browser.visit("https://neptun.elte.hu/Account/Login");
  await browser.fill('LoginName', loginName);
  await browser.fill('Password', password);

  console.log("Logging in");
  browser.querySelector('form[action="/Account/Login"]').submit();
  await browser.wait();

  console.log("Opening Neptun");
  browser.on('opened', (window) => {
    // NOTE(review): makes `menubar` writable on each opened window,
    // presumably because the Neptun page scripts assign to it; confirm.
    Object.defineProperty(window, 'menubar', {
      writable: true,
    });
  });
  try {
    await browser.visit("https://neptun.elte.hu/ToNeptunWeb/ToNeptunHWeb");
  } catch (e) {
    // Logged but deliberately not fatal: the REPL below is still started.
    console.error(e);
  }
  browser.dump();

  const r = repl.start('> ');
  // (Re)populates the REPL context; `r.context` is read at call time so a
  // reset context receives the bindings as well.
  const exposeBrowser = () => {
    Object.assign(r.context, {
      ...browser,
      browser,
    });
  };
  exposeBrowser();
  r.on('reset', exposeBrowser);

  await new Promise((resolve) => {
    r.on('exit', resolve);
  });
}
// Script entry point: terminate with the default exit code once main()
// resolves, or print the error and terminate with exit code 1 when it rejects.
const exitAfterSuccess = () => {
  process.exit();
};
const exitAfterFailure = (error) => {
  console.error(error);
  process.exit(1);
};
main().then(exitAfterSuccess, exitAfterFailure);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment