@kokes
Created March 28, 2018 19:35
Downloading subtitles from iVysílání
"""
Stáhni seznam pořadů z webu iVysílání
"""
import json
from urllib.parse import urljoin
import lxml.html
burl = 'http://www.ceskatelevize.cz/ivysilani/podle-abecedy'
ht = lxml.html.parse(burl).getroot()
abc = ht.cssselect('ul#programmeAlphabet a')
urls = [(j.text_content().strip(), j.attrib['href']) for j in abc]
dt = dict()
for psm, url in urls:
print('Stahuju: %s ' % psm, end='\r')
ht = lxml.html.parse(urljoin(burl, url)).getroot()
seznam = ht.cssselect('div#programmeAlphabetContent ul li a')
porady = [(j.text_content().strip(), urljoin(burl, j.attrib['href'])) for j in seznam]
dt[psm] = porady
print('Staženo %d názvů pořadů' % sum([len(k) for j,k in dt.items()]))
with open('data/porady.json', 'w') as f:
json.dump(dt, f, ensure_ascii=False, indent=2)
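# The resulting data/porady.json maps each letter to a list of
# [name, full URL] pairs, roughly like this (illustrative values,
# not real scraped data):
#
# {
#   "A": [
#     ["AZ-kvíz", "http://www.ceskatelevize.cz/ivysilani/..."],
#     ...
#   ],
#   ...
# }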
"""
Download episode lists for every show
"""
import csv
import json
import os
import re
import unicodedata
from urllib.parse import urljoin

import lxml.html

with open('data/porady.json') as f:
    dt = json.load(f)

# the whitelist holds the only shows we download; the file need not exist
wl = set()
if os.path.isfile('whitelist.txt'):
    with open('whitelist.txt') as f:
        wl = set(j[0] for j in csv.reader(f) if j)
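# whitelist.txt is read as a CSV whose first column holds the numeric show
# IDs to keep, one per line - e.g. (hypothetical IDs):
#
# 1097181328
# 1126672097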
# walk the letters
# TODO: this cannot handle the old layout - http://www.ceskatelevize.cz/porady/1181831094-vaclav-belohradsky-nikdo-neposloucha/20456226263
# so walk all the shows and look at those where we don't have a single episode
for p, pr in dt.items():
    tdr = 'data/dily/%s' % p
    # and the shows within each letter
    for por in pr:
        nazev = unicodedata.normalize('NFD', por[0]).encode('ascii', 'ignore').decode().lower()
        mt = '/ivysilani/'
        idd = por[1][por[1].index(mt) + len(mt):por[1].index('-')]
        if not idd.isdigit():
            print('Skipping %s (%s)' % (por[0], por[1]))
            continue
        # we use a whitelist, but this ID is not in it? Skip
        if len(wl) > 0 and (idd not in wl):
            continue
        fn = os.path.join(tdr, '%s-%s.json' % ('-'.join(re.findall(r'\w+', nazev)), idd))
        if os.path.isfile(fn):
            continue  # we don't do any updates - those only happen if you delete the raw data, TODO
        ht = lxml.html.parse(por[1]).getroot()
        urlp = urljoin(por[1], ht.cssselect('div#programmeInfo a')[0].attrib['href'])
        ret = []
        for pg in range(1, 10000):  # at most 10,000 pages
            print('Downloading %s, page %d%s' % (por[0], pg, 10 * ' '), end='\r')
            urlstr = urlp + '/dily/%d' % pg  # page URL (urljoin deliberately not used)
            ht = lxml.html.parse(urlstr).getroot()
            els = ht.cssselect('div.episodes-broadcast-content a')
            if len(els) == 0:
                break
            for el in els:
                datum = el.find('time').text.replace('\xa0', ' ')
                nazev_dilu = el.find('h3').text
                popis = el.find('p').text if el.find('p') is not None else ''
                furl = urljoin(por[1], el.attrib['href'])  # full URL of the episode
                porid = furl[:-1] if furl.endswith('/') else furl
                porid = porid[porid.rindex('/') + 1:]
                if '-' in porid:  # chop off the show name, if present
                    porid = porid[:porid.index('-')]
                assert porid.isdigit(), furl
                porid = int(porid)
                # we used to check that IDs go back in time (the last page sometimes
                # has future episodes, e.g. in OVM), but the site now uses a different
                # selector for those, so the `len(els) == 0` check above breaks first
                ret.append({
                    'datum': datum,
                    'nazev': nazev_dilu,
                    'popis': popis.replace('\r', ''),
                    'url': furl,
                    'id': porid,
                })
        if len(ret) == 0:
            continue  # no episodes
        if not os.path.isdir(tdr):
            os.makedirs(tdr)
        with open(fn, 'w') as f:
            json.dump(ret, f, ensure_ascii=False, indent=2)
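# This script leaves one JSON file per show, e.g. (hypothetical name and ID)
# data/dily/V/vecernicek-1234567890.json, holding a list of episodes with
# the keys datum, nazev, popis, url and id.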
"""
Download subtitles for every episode found
"""
from collections import defaultdict
from glob import glob
import gzip
import json
import os

import requests

fns = glob('data/dily/*/*.json')
sub_url = 'http://imgct.ceskatelevize.cz/cache/data/ivysilani/subtitles/%s/%s/sub.vtt'
fl = defaultdict(int)  # number of failures per file
nfl = 10  # maximum number of failures
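# The subtitle store is sharded by the first three digits of the episode ID,
# e.g. for a hypothetical ID '217562261234':
# >>> sub_url % ('217562261234'[:3], '217562261234')
# 'http://imgct.ceskatelevize.cz/cache/data/ivysilani/subtitles/217/217562261234/sub.vtt'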
for fn in fns:
    print('Downloading', fn)
    nm = os.path.split(fn)[-1]
    tdr = 'data/titulky/raw/'
    os.makedirs(os.path.join(tdr, 'mame'), exist_ok=True)
    os.makedirs(os.path.join(tdr, 'titulky'), exist_ok=True)
    # what we have downloaded already
    mfn = os.path.join(tdr, 'mame', nm)
    mame = set()
    if os.path.isfile(mfn):
        with open(mfn) as f:
            mame = set(json.load(f))
    # previously downloaded data
    dfn = os.path.join(tdr, 'titulky', nm + '.gz')
    tt = dict()
    if os.path.isfile(dfn):
        with gzip.open(dfn, 'rt') as f:
            tt = json.load(f)
    # load the episode list
    with open(fn) as f:
        dt = json.load(f)
    for j, el in enumerate(dt[:2000]):
        print('Downloading: %d/%d' % (j, len(dt)), end='\r')
        porid = str(el['id'])
        if porid in mame:
            continue
        turl = sub_url % (porid[:3], porid)
        r = requests.get(turl)
        st = r.status_code
        if st == 200:
            tt[porid] = r.text
            mame.add(porid)
        elif st == 404:
            # we only stop once the failure count exceeds the limit
            fl[fn] += 1
            if fl[fn] > nfl:
                print('Failed at', el['datum'])
                break
        else:
            print('unexpected error with', turl)
    with gzip.open(dfn, 'wt') as f:
        json.dump(tt, f, ensure_ascii=False)
    with open(mfn, 'w') as f:
        json.dump(sorted(list(mame)), f, ensure_ascii=False)
"""
Parse the subtitles and index them into Elasticsearch
"""
from glob import glob
import gzip
import json
import os
import re

from elasticsearch import Elasticsearch, helpers
def parsuj_titulky(dt):
    """Parse a WEBVTT document into a list of {od, do, text} cues."""
    ws = re.compile(r'\s+')
    dt = dt.split('\n')  # not very large, so no need to be lazy here
    assert dt[0].strip() == 'WEBVTT'
    dt = dt[2:]  # chop off the header
    # TODO: merge sentences spanning multiple cues
    res = []
    bf = []  # intermediate buffer
    od = do = None
    for rw in dt:
        if '-->' in rw:
            # a timing header terminates the previous cue's text
            if bf:
                res.append({
                    'od': od,
                    'do': do,
                    'text': re.sub(ws, ' ', ' '.join(bf).strip()),
                })
                bf = []  # reset
            od, do = rw[:8], rw[17:25]
        else:
            bf.append(rw)
    if bf:  # flush the final cue (the loop only flushes on headers)
        res.append({
            'od': od,
            'do': do,
            'text': re.sub(ws, ' ', ' '.join(bf).strip()),
        })
    return res
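# A quick sanity check on a hand-made WEBVTT snippet (hypothetical subtitle
# text, not real broadcast data); uncomment to run:
# _vtt = ('WEBVTT\n\n'
#         '00:00:01.000 --> 00:00:03.000\nDobrý večer,\n\n'
#         '00:00:03.000 --> 00:00:06.000\ndámy a pánové.\n')
# assert parsuj_titulky(_vtt) == [
#     {'od': '00:00:01', 'do': '00:00:03', 'text': 'Dobrý večer,'},
#     {'od': '00:00:03', 'do': '00:00:06', 'text': 'dámy a pánové.'},
# ]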
es = Elasticsearch()
ind = 'ctgrep'
if es.indices.exists(ind):
    es.indices.delete(ind)

mp = {
    "mappings": {
        "titulky": {
            "properties": {
                "titulky": {
                    "type": "nested",
                    "properties": {
                        "od": {"type": "date", "format": "hour_minute_second"},
                        "do": {"type": "date", "format": "hour_minute_second"},
                        "text": {"type": "string"}
                    }
                }
            }
        }
    }
}

# create the index
es.indices.create(index=ind, ignore=400, body=mp)
dfns = glob('data/dily/*/*.json')
actions = []  # es queue

# loop over the shows
for fn in dfns:
    with open(fn) as f:
        dily = json.load(f)
    # load the subtitles; skip shows for which we downloaded none
    tfn = os.path.join('data/titulky/raw/titulky/', os.path.split(fn)[-1] + '.gz')
    if not os.path.isfile(tfn):
        continue
    with gzip.open(tfn, 'rt') as f:
        dt = json.load(f)
    tk = set(dt.keys())  # what we actually have
    # and loop over the episodes
    for dl in dily:
        porid = str(dl['id'])
        if porid not in tk:
            continue  # no subtitles for this episode, move on
        edt = dl
        edt['titulky'] = parsuj_titulky(dt[porid])
        res = es.index(index=ind, doc_type='titulky', id=edt['id'], body=edt)
        # bulk-indexing alternative:
        # actions.append({
        #     '_index': ind,
        #     '_type': 'titulky',
        #     '_id': edt['id'],
        #     '_source': edt
        # })

# if len(actions) > 0:
#     helpers.bulk(es, actions)
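# A minimal sketch of querying the index (assumes the nested mapping above;
# the search phrase is just an example):
es.indices.refresh(index=ind)  # make freshly indexed documents searchable
q = {
    'query': {
        'nested': {
            'path': 'titulky',
            'query': {'match_phrase': {'titulky.text': 'dobrý večer'}}
        }
    }
}
print(es.search(index=ind, body=q)['hits']['total'])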