Skip to content

Instantly share code, notes, and snippets.

@kokes
Last active March 24, 2016 18:26
Show Gist options
  • Save kokes/bf609d1b90c45feec1d1 to your computer and use it in GitHub Desktop.
Save kokes/bf609d1b90c45feec1d1 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Stenoprotokoly - první pokus\n",
"První prohrabání se 2013- schůzema."
]
},
{
"cell_type": "code",
"execution_count": 273,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import urllib.request as ur\n",
"import urllib.parse as up\n",
"from pyquery import PyQuery as pq\n",
"import os.path\n",
"import zipfile\n",
"import glob\n",
"from collections import OrderedDict\n",
"import json"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"naštěstí to maj zipovaný"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# index page that links the zipped stenographic records\n",
"base = 'http://www.psp.cz/eknih/2013ps/stenprot/zip/index.htm'\n",
"# local folders: downloaded archives (zp) and their extracted content (st)\n",
"zp, st = './zip/', './html/'"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# fetch and parse the index page; the links on it are read by the download step\n",
"ht = pq(url=base)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"stahuj"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Download every archive linked from the index page into the folder zp.\n",
"os.makedirs(zp, exist_ok=True)\n",
"for ln in ht.find('div#main-content a'):\n",
"    href = ln.attrib.get('href')\n",
"    if not href:\n",
"        continue  # anchor without a target\n",
"    # local name = last path component of the link, without any query string\n",
"    nm = os.path.basename(up.urlparse(href).path)\n",
"    # the extraction step only picks up *.zip files, so skip everything else\n",
"    if not nm.lower().endswith('.zip'):\n",
"        continue\n",
"    dst = os.path.join(zp, nm)\n",
"    if os.path.exists(dst):\n",
"        continue  # fetched on an earlier run\n",
"    # download under a temporary name so that an interrupted transfer\n",
"    # is never mistaken for a finished archive on the next run\n",
"    ur.urlretrieve(up.urljoin(base, href), dst + '.part')\n",
"    os.replace(dst + '.part', dst)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"extrahuj"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Unpack every downloaded archive into its own folder below st.\n",
"fns = sorted(glob.glob(os.path.join(zp, '*.zip')))\n",
"\n",
"for fn in fns:\n",
"    # folder name = archive file name without its extension\n",
"    zfn = os.path.splitext(os.path.basename(fn))[0]\n",
"    pth = os.path.join(st, zfn)\n",
"    # makedirs also creates st itself, which no other cell creates\n",
"    os.makedirs(pth, exist_ok=True)\n",
"\n",
"    with zipfile.ZipFile(fn) as zf:\n",
"        zf.extractall(pth)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Do textu"
]
},
{
"cell_type": "code",
"execution_count": 242,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# folder that receives the JSON output\n",
"jsp = 'json/'\n",
"# one entry per folder created by the extraction step; sorted because the\n",
"# glob order depends on the file system and the running ids depend on it\n",
"schuze = sorted(glob.glob(os.path.join(st, '*')))"
]
},
{
"cell_type": "code",
"execution_count": 277,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def zapisuj(nm, dt):\n",
"    '''Serialise dt as JSON into the file <jsp>/<nm>.json.\n",
"\n",
"    nm -- file name without the .json extension\n",
"    dt -- object accepted by json.dump\n",
"\n",
"    The file is always written as UTF-8: ensure_ascii=False leaves\n",
"    non-ASCII characters unescaped, so the platform default encoding\n",
"    must not be relied on.\n",
"    '''\n",
"    os.makedirs(jsp, exist_ok=True)  # no other cell creates the output folder\n",
"    fn = os.path.join(jsp, '%s.json' % nm)\n",
"\n",
"    with open(fn, 'w', encoding='utf-8') as f:\n",
"        json.dump(dt, f, ensure_ascii=False)\n",
"\n",
"# Split the extracted HTML into blocks of text per speaker and write one\n",
"# JSON file per folder in schuze.\n",
"def _blok(pid, aut, sch, tema, buf):\n",
"    '''Build one output record from the paragraphs collected in buf.'''\n",
"    # a list of pairs keeps the key order on every Python version;\n",
"    # keyword arguments only do so from Python 3.6 on\n",
"    return OrderedDict([('id', pid), ('autor', aut), ('schuze', sch),\n",
"                        ('tema', tema), ('text', '\\n'.join(buf))])\n",
"\n",
"pid = 0      # running id of the text blocks, shared by all folders\n",
"tema = None  # no topic is extracted yet, every record stores null\n",
"for sch in schuze:\n",
"    schd = dict()\n",
"    aut = None  # text of the link that opened the current block\n",
"    buf = []    # paragraphs collected for the current block\n",
"    # sorted so that the ids do not depend on the file system order\n",
"    for fn in sorted(glob.glob(os.path.join(sch, '*.htm'))):\n",
"        # the with-block closes the file handle again\n",
"        with open(fn, encoding='cp1250') as f:\n",
"            h = pq(f.read())\n",
"        for p in h.find('p'):\n",
"            pt = p.text_content().strip()\n",
"            if not pt:\n",
"                continue\n",
"            pt = pt.replace('\\xa0', ' ')\n",
"\n",
"            od = p.find('a')\n",
"            if od is not None:\n",
"                # a link starts a new block: store the finished one first so\n",
"                # that this paragraph is kept together with its own link text\n",
"                if buf:\n",
"                    schd[str(pid)] = _blok(pid, aut, sch, tema, buf)\n",
"                    pid += 1\n",
"                    buf = []\n",
"                aut = od.text\n",
"            buf.append(pt)\n",
"    # store the block that was still open when the folder ended, otherwise it\n",
"    # would be lost or end up in the output of the next folder\n",
"    if buf:\n",
"        schd[str(pid)] = _blok(pid, aut, sch, tema, buf)\n",
"        pid += 1\n",
"    # output file is named after the folder itself\n",
"    zapisuj(os.path.basename(os.path.normpath(sch)), schd)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment