Skip to content

Instantly share code, notes, and snippets.

@kgadek
Created January 22, 2014 22:05
Show Gist options
  • Save kgadek/8568337 to your computer and use it in GitHub Desktop.
Save kgadek/8568337 to your computer and use it in GitHub Desktop.
Current state of mrwhite::parser.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import lxml
import lxml.html
from itertools import count
def clipboard_to_cells():
    """Read ``logging_1.txt`` as HTML and return its table cells as dicts.

    Every ``<tr>`` found under any ``<table>`` becomes one inner list, and
    every ``<td>`` below that row becomes a dict with the keys:

    * ``'bgcolor'``  -- value of the cell's ``bgcolor`` attribute;
    * ``'content'``  -- the cell's text with surrounding whitespace stripped,
      or ``None`` when nothing is left after stripping;
    * ``'is_empty'`` -- ``True`` exactly when ``'content'`` is ``None``.

    The number of rows found is printed to stdout.
    """
    with open("logging_1.txt", 'r') as source:
        markup = source.read()

    document = lxml.html.fromstring(markup)
    table_rows = document.xpath('//table//tr')
    print(len(table_rows))

    def describe(td):
        """Build the dict describing a single ``<td>`` element."""
        background = td.get('bgcolor')
        # XPath string() yields the text of the cell and of its descendants.
        text = td.xpath('string()').strip()
        return {
            'bgcolor': background,
            'content': text if text else None,
            'is_empty': not text,
        }

    return [
        [describe(td) for td in tr.xpath('.//td')]
        for tr in table_rows
    ]
def parse(cells=None):
    """Return the timetable cell grid, loading it from disk when none is given.

    Args:
        cells: Grid of cells in the shape produced by ``clipboard_to_cells()``:
            a list of rows, each row a list of dicts whose ``'content'`` entry
            holds the cell text or ``None``. When falsy (``None`` or empty),
            ``clipboard_to_cells()`` is called to obtain the grid.

    Returns:
        The cell grid, unchanged. The area detection defined inside this
        function is not applied to the result yet.
    """
    if not cells:
        cells = clipboard_to_cells()

    # NOTE: the pattern bodies below are kept exactly as found (flush-left,
    # with their original inline comments). They are compiled with re.VERBOSE,
    # which ignores that whitespace and the '#' comments.

    # Subject name, optionally followed by " (comment)".
    pa_przedmiot_komentarz = r"""
([a-zA-Z_-]+) # trivia: nazwa przedmiotu
( # capture group komentarza
( # łap zero lub jeden
\s # biały znak
\( # nawias
[a-zA-Z. 0-9]+ # treść komentarza
\) # nawias
)?
)
"""
    # Room: digits, a dot, digits, optional letter suffix.
    pa_sala = r"""
(\d+\.\d+([a-zA-Z]+)?)
"""
    # Day of the week: a run of ASCII letters.
    pa_dzien_tygodnia = r"""
[a-zA-Z]+
"""
    # Date: either YYYY-MM-DD or digits.digits.
    pa_data = r"""
(\d\d\d\d-\d\d-\d\d)|(\d+\.\d+)
"""
    # Hours: "H", "H:MM", or a range such as "H:MM-H:MM".
    pa_godziny = r"""
\d{1,2} # godzina
(:\d\d)? # minuty
( # opcjonalnie: przedział
- # dywiz
\d{1,2} # godzina
(:\d\d)? # minuty
)?
"""
    # The first two compiled patterns are not used yet; they are kept for the
    # parsing steps that are still to be written.
    re_przedmiot_komentarz = re.compile(pa_przedmiot_komentarz, re.VERBOSE)
    re_sala = re.compile(pa_sala, re.VERBOSE)
    re_dzien_tygodnia = re.compile(pa_dzien_tygodnia, re.VERBOSE)
    re_data = re.compile(pa_data, re.VERBOSE)
    re_godziny = re.compile(pa_godziny, re.VERBOSE)

    def cell_matches(pattern, cell):
        """Tell whether the text of *cell* begins with a match for *pattern*.

        Cells are dicts, so the regex must be applied to ``cell['content']``;
        an empty cell (content ``None``) never matches.
        """
        content = cell['content']
        return content is not None and pattern.match(content) is not None

    # Area rule:
    # we look for a row N such that:
    # - cell (N, 0) matches pa_dzien_tygodnia
    # - cells (N, 1..K) match dates, K >= 1
    # - cells (N+1..L, 0) match hours
    def is_area_starting_here(rowid):
        """Return the corners of the area whose header row is *rowid*.

        Returns:
            ``((rowid, 0), (L, K))`` where ``K`` is the last consecutive date
            column of the header row and ``L`` the last consecutive hours row
            below it, or ``False`` when the area rule does not hold at
            *rowid*.
        """
        header = cells[rowid]
        if not header or not cell_matches(re_dzien_tygodnia, header[0]):
            return False

        # Scan date columns, bounded by the row length so that a header made
        # only of dates cannot run past its end.
        last_date_col = 0
        for col in range(1, len(header)):
            if not cell_matches(re_data, header[col]):
                break
            last_date_col = col
        if not last_date_col:
            return False

        # Scan the first column of the rows *below* the header for hours.
        last_hours_row = rowid
        for row in range(rowid + 1, len(cells)):
            if not cells[row] or not cell_matches(re_godziny, cells[row][0]):
                break
            last_hours_row = row
        if last_hours_row == rowid:
            return False

        return ((rowid, 0), (last_hours_row, last_date_col))

    # TODO: is_area_starting_here() is not called yet; the grid is returned
    # as-is.
    return cells
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment