Skip to content

Instantly share code, notes, and snippets.

@limitedeternity
Last active June 11, 2019 15:12
Show Gist options
  • Save limitedeternity/fd08d6a988aa4acc58b50f58c7bcab58 to your computer and use it in GitHub Desktop.
Save limitedeternity/fd08d6a988aa4acc58b50f58c7bcab58 to your computer and use it in GitHub Desktop.
Utility to fetch data from https://opop.herzen.spb.ru/upload/scanned_docs/ by faculty code
// FileSaver.min.js
//
// Vendored, minified UMD bundle that provides `saveAs(blobOrUrl, name, opts)`.
// Kept byte-for-byte; do not hand-edit the minified code below.
//
// Module wrapper:
//   - AMD (`define.amd`): registers the factory with `define([], factory)`.
//   - `exports` defined: just runs the factory.
//   - otherwise: runs the factory and sets `this.FileSaver` to an empty object.
//   In every case the factory itself assigns `saveAs` onto the detected global
//   object (`window`, `self` or `global`) and, when `module` exists, onto
//   `module.exports`.
//
// Internal helpers (minified names):
//   b(blob, opts) - when `opts.autoBom` is set and the blob type is a UTF-8
//                   text/XML type, returns a new Blob prefixed with "\uFEFF";
//                   otherwise returns the blob unchanged. A non-object `opts`
//                   triggers a console warning and is converted to
//                   `{autoBom: !opts}`.
//   c(url, name, opts) - GETs `url` as a blob via XMLHttpRequest and passes the
//                   response to saveAs; logs "could not download file" on error.
//   d(url)        - issues a *synchronous* HEAD request and returns true for a
//                   2xx status.
//   e(node)       - dispatches a synthetic "click" on `node`, falling back to
//                   `document.createEvent("MouseEvents")` if `new MouseEvent`
//                   throws.
//
// Selection of the saveAs implementation (first matching branch wins):
//   1. no-op function     - when the global already has `saveAs`, or `window`
//                           is missing / is not the global object.
//   2. <a download>       - when `download` exists on HTMLAnchorElement.prototype.
//                           String URLs are clicked directly if same-origin,
//                           fetched through c() if the HEAD check passes, or
//                           opened in a new tab otherwise. Blobs go through
//                           `createObjectURL`; the URL is revoked after 4E4 ms.
//   3. msSaveOrOpenBlob   - when present on `navigator`.
//   4. fallback           - opens a blank popup ("downloading...") and navigates
//                           it to a FileReader data URL or an object URL.
//
// NOTE(review): because `?:` binds more loosely than `||`, the expression
// `f.saveAs||"object"!=typeof window||window!==f?function(){}:...` picks the
// no-op (branch 1) when a global `saveAs` is already defined, and then
// overwrites that global with the no-op. This looks like missing parentheses in
// this build of the library - confirm against the upstream release before
// relying on a pre-existing `saveAs`.
(function(a,b){if("function"==typeof define&&define.amd)define([],b);else if("undefined"!=typeof exports)b();else{b(),a.FileSaver={exports:{}}.exports}})(this,function(){"use strict";function b(a,b){return"undefined"==typeof b?b={autoBom:!1}:"object"!=typeof b&&(console.warn("Depricated: Expected third argument to be a object"),b={autoBom:!b}),b.autoBom&&/^\s*(?:text\/\S*|application\/xml|\S*\/\S*\+xml)\s*;.*charset\s*=\s*utf-8/i.test(a.type)?new Blob(["\uFEFF",a],{type:a.type}):a}function c(b,c,d){var e=new XMLHttpRequest;e.open("GET",b),e.responseType="blob",e.onload=function(){a(e.response,c,d)},e.onerror=function(){console.error("could not download file")},e.send()}function d(a){var b=new XMLHttpRequest;return b.open("HEAD",a,!1),b.send(),200<=b.status&&299>=b.status}function e(a){try{a.dispatchEvent(new MouseEvent("click"))}catch(c){var b=document.createEvent("MouseEvents");b.initMouseEvent("click",!0,!0,window,0,0,0,80,20,!1,!1,!1,!1,0,null),a.dispatchEvent(b)}}var f="object"==typeof window&&window.window===window?window:"object"==typeof self&&self.self===self?self:"object"==typeof global&&global.global===global?global:void 0,a=f.saveAs||"object"!=typeof window||window!==f?function(){}:"download"in HTMLAnchorElement.prototype?function(b,g,h){var i=f.URL||f.webkitURL,j=document.createElement("a");g=g||b.name||"download",j.download=g,j.rel="noopener","string"==typeof b?(j.href=b,j.origin===location.origin?e(j):d(j.href)?c(b,g,h):e(j,j.target="_blank")):(j.href=i.createObjectURL(b),setTimeout(function(){i.revokeObjectURL(j.href)},4E4),setTimeout(function(){e(j)},0))}:"msSaveOrOpenBlob"in navigator?function(f,g,h){if(g=g||f.name||"download","string"!=typeof f)navigator.msSaveOrOpenBlob(b(f,h),g);else if(d(f))c(f,g,h);else{var i=document.createElement("a");i.href=f,i.target="_blank",setTimeout(function(){e(i)})}}:function(a,b,d,e){if(e=e||open("","_blank"),e&&(e.document.title=e.document.body.innerText="downloading..."),"string"==typeof a)return c(a,b,d);var 
g="application/octet-stream"===a.type,h=/constructor/i.test(f.HTMLElement)||f.safari,i=/CriOS\/[\d]+/.test(navigator.userAgent);if((i||g&&h)&&"object"==typeof FileReader){var j=new FileReader;j.onloadend=function(){var a=j.result;a=i?a:a.replace(/^data:[^;]*;/,"data:attachment/file;"),e?e.location.href=a:location=a,e=null},j.readAsDataURL(a)}else{var k=f.URL||f.webkitURL,l=k.createObjectURL(a);e?e.location=l:location.href=l,e=null,setTimeout(function(){k.revokeObjectURL(l)},4E4)}};f.saveAs=a.saveAs=a,"undefined"!=typeof module&&(module.exports=a)});
// when-dom-ready.min.js
//
// Vendored, minified UMD bundle that exposes `whenDomReady`.
// Kept byte-for-byte; do not hand-edit the minified code below.
//
// Export target: `module.exports` (CommonJS), `define(factory)` (AMD), or
// `this.whenDomReady` when neither module system is present.
//
// whenDomReady(callback?, doc?) -> Promise
//   - `doc` defaults to `window.document`. If the first argument is truthy but
//     not a function it is treated as `doc` and no callback is used.
//   - If `doc.readyState` is already "interactive" or "complete" the promise
//     resolves right away; otherwise it resolves on `DOMContentLoaded`.
//   - When a callback is given it is scheduled with `setTimeout(callback)` at
//     that moment; the promise itself always resolves with `undefined`.
//
// whenDomReady.resume(doc) -> (value) => Promise<value>
//   Returns a function meant for use inside a promise chain: it waits for
//   `doc` to be ready and then resolves with the value it was given.
!function(e,n){"object"==typeof exports&&"undefined"!=typeof module?module.exports=n():"function"==typeof define&&define.amd?define(n):e.whenDomReady=n()}(this,function(){"use strict";var i=["interactive","complete"],t=function(t,o){return new Promise(function(e){t&&"function"!=typeof t&&(o=t,t=null),o=o||window.document;var n=function(){return e(void(t&&setTimeout(t)))};-1!==i.indexOf(o.readyState)?n():o.addEventListener("DOMContentLoaded",n)})};return t.resume=function(n){return function(e){return t(n).then(function(){return e})}},t});
/**
 * Collects the targets of every `<a href>` on the current page whose visible
 * text starts with the given year and ends with the literal suffix `.pdf`.
 *
 * @param {number|string} [year] - Year prefix the link text must start with.
 *   Defaults to the current calendar year (the original behavior).
 * @returns {string} The matching `href` values, one per line, without leading
 *   or trailing whitespace; an empty string when nothing matches.
 */
function createLinkList(year = new Date().getFullYear()) {
  // The backslash has to be doubled: inside a template literal `\.` collapses
  // to a bare `.`, which in a regex matches ANY character (so "2019fooXpdf"
  // used to be accepted). Built once instead of once per link.
  const pdfLinkText = new RegExp(`^${year}(.+)\\.pdf$`);

  return Array.from(document.querySelectorAll('a[href]'))
    .filter((el) => pdfLinkText.test(el.innerText))
    .map((el) => el.href)
    .join('\n')
    .trim();
}
/**
 * Builds the list of matching PDF links and offers it to the user as a
 * plain-text download named `result.txt`.
 */
function downloadResult() {
  const linkList = createLinkList();
  const blobOptions = { type: 'text/plain;charset=utf-8;' };
  saveAs(new Blob([linkList], blobOptions), 'result.txt');
}
// Entry point: once the DOM is ready, jump to the bottom of the page and then
// trigger the download of the collected link list.
(async () => {
  await whenDomReady();
  const pageBottom = document.body.scrollHeight;
  window.scrollTo(0, pageBottom);
  downloadResult();
})();
from uuid import uuid4
from os import unlink
import asyncio
from tqdm import tqdm
from tika import parser
from aiohttp import ClientSession
from aiofiles import open as aiopen
def analyzePdf(fname, code='09.03.02'):
    """Keep the PDF at ``fname`` only if its extracted text mentions ``code``.

    The file is parsed with Tika; when no text could be extracted, or the
    text does not contain ``code``, the file is deleted from disk.

    Args:
        fname: Path of the downloaded PDF file.
        code: Substring that must occur in the extracted text for the file to
            be kept. Defaults to ``'09.03.02'`` (the original hard-coded value).

    Returns:
        None.
    """
    parsed = parser.from_file(fname)
    # .get(): a result without a 'content' entry is treated like empty text
    # (file removed) instead of raising KeyError.
    text = parsed.get('content')
    if not text or code not in text:
        unlink(fname)
async def retrieveAndProcess(url, semaphore):
    """Download ``url`` into a randomly named ``.pdf`` file and analyse it.

    The whole operation runs while holding ``semaphore``, so the semaphore's
    limit bounds how many downloads are in flight at once. After a successful
    download the file is handed to ``analyzePdf``.

    Args:
        url: Address of the document to fetch.
        semaphore: ``asyncio.Semaphore`` shared by all download tasks.

    Raises:
        Any exception raised while downloading is propagated unchanged; the
        partially written file is removed first.
    """
    async with semaphore:
        fname = uuid4().hex[:15] + '.pdf'
        try:
            async with ClientSession() as session:
                # NOTE(review): ssl=False turns off TLS certificate
                # verification for this request - confirm this is still needed.
                async with session.get(url, ssl=False) as response:
                    async with aiopen(fname, mode='wb') as pdfWriteObj:
                        # Stream the body in 32 KiB chunks.
                        async for chunk in response.content.iter_chunked(1 << 15):
                            await pdfWriteObj.write(chunk)
        except BaseException:
            # Do not leave a truncated download behind on disk.
            try:
                unlink(fname)
            except FileNotFoundError:
                pass
            raise
        analyzePdf(fname)
async def main(listPath='result.txt', concurrency=4):
    """Download and analyse every URL listed in ``listPath``.

    Args:
        listPath: Text file with one URL per line. Defaults to
            ``'result.txt'``.
        concurrency: Size of the semaphore handed to every download task.
            Defaults to 4.

    Blank lines are ignored and surrounding whitespace (including a trailing
    ``\\r`` from CRLF line endings) is stripped, so they can no longer turn
    into empty or malformed URLs.
    """
    semaphore = asyncio.Semaphore(concurrency)

    urls = []
    async with aiopen(listPath, mode='r') as lines:
        async for line in lines:
            url = line.strip()
            if url:
                urls.append(url)

    processTasks = [retrieveAndProcess(url, semaphore) for url in urls]
    # as_completed yields results in completion order, which lets tqdm
    # advance the progress bar as soon as any download finishes.
    for f in tqdm(asyncio.as_completed(processTasks), total=len(processTasks)):
        await f
if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(main())
        # Presumably a grace period that lets the underlying connections shut
        # down before the loop is closed - TODO confirm the exact duration.
        loop.run_until_complete(asyncio.sleep(5.250))
    finally:
        # Close the loop even when main() raised.
        loop.close()
aiofiles
aiohttp
tika
tqdm
@limitedeternity
Copy link
Author

A JRE (Java Runtime Environment) is required for Tika to work properly.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment