Created
January 22, 2014 22:05
-
-
Save kgadek/8568337 to your computer and use it in GitHub Desktop.
Current state of mrwhite::parser.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import re | |
import lxml | |
import lxml.html | |
from itertools import count | |
def clipboard_to_cells():
    """Load ``logging_1.txt`` and turn its HTML table rows into cell records.

    The file is parsed as HTML; every ``<tr>`` found below a ``<table>``
    yields one list, and every ``<td>`` below that row yields one dict:

    * ``'bgcolor'``  -- value of the cell's ``bgcolor`` attribute (or None),
    * ``'content'``  -- whitespace-stripped text of the cell, None if blank,
    * ``'is_empty'`` -- True when the cell has no text after stripping.

    The number of rows found is printed to stdout.

    Returns:
        A list of rows, each row being a list of the dicts described above.
    """
    with open("logging_1.txt", 'r') as source:
        markup = source.read()

    document = lxml.html.fromstring(markup)
    table_rows = document.xpath('//table//tr')
    print(len(table_rows))

    def describe(cell):
        # Build the record for a single <td>; key order matches insertion
        # order of the fields: bgcolor, content, is_empty.
        background = cell.get('bgcolor')
        text = cell.xpath('string()').strip()
        return {
            'bgcolor': background,
            'content': text if text else None,
            'is_empty': not text,
        }

    return [
        [describe(cell) for cell in table_row.xpath('.//td')]
        for table_row in table_rows
    ]
def parse(cells=None):
    """Return the timetable cell grid, loading it when none is supplied.

    Args:
        cells: A list of rows, each row a list of cells. A cell is either a
            dict with a ``'content'`` entry (the shape produced by
            ``clipboard_to_cells``) or a plain string. When ``cells`` is
            falsy (None or empty) the grid is obtained from
            ``clipboard_to_cells()``.

    Returns:
        The cell grid. The regular expressions and the area detector defined
        below are groundwork that is not yet applied to the grid, so the
        grid is returned unchanged.
    """
    if not cells:
        cells = clipboard_to_cells()

    # Subject name followed by an optional parenthesised comment.
    pa_przedmiot_komentarz = r"""
        ([a-zA-Z_-]+) # trivia: nazwa przedmiotu
        ( # capture group komentarza
        ( # łap zero lub jeden
        \s # biały znak
        \( # nawias
        [a-zA-Z. 0-9]+ # treść komentarza
        \) # nawias
        )?
        )
    """
    # Room identifier: digits, a dot, digits, optional letter suffix.
    pa_sala = r"""
        (\d+\.\d+([a-zA-Z]+)?)
    """
    # Day-of-week label: a run of ASCII letters.
    pa_dzien_tygodnia = r"""
        [a-zA-Z]+
    """
    # Date: either YYYY-MM-DD or a dotted pair of numbers.
    pa_data = r"""
        (\d\d\d\d-\d\d-\d\d)|(\d+\.\d+)
    """
    # Hour, optional minutes, optionally followed by "-<hour>[:minutes]".
    pa_godziny = r"""
        \d{1,2} # godzina
        (:\d\d)? # minuty
        ( # opcjonalnie: przedział
        - # dywiz
        \d{1,2} # godzina
        (:\d\d)? # minuty
        )?
    """

    re_przedmiot_komentarz = re.compile(pa_przedmiot_komentarz, re.VERBOSE)
    re_sala = re.compile(pa_sala, re.VERBOSE)
    re_dzien_tygodnia = re.compile(pa_dzien_tygodnia, re.VERBOSE)
    re_data = re.compile(pa_data, re.VERBOSE)
    re_godziny = re.compile(pa_godziny, re.VERBOSE)

    def cell_text(row, col):
        """Return the text at ``cells[row][col]``; "" if missing or blank."""
        # Out-of-range coordinates simply end a scan instead of raising
        # IndexError at the edge of the grid.
        if row >= len(cells) or col >= len(cells[row]):
            return ""
        cell = cells[row][col]
        if isinstance(cell, dict):
            # Cells from clipboard_to_cells() keep their text under
            # 'content', which is None for empty cells.
            cell = cell.get('content')
        return cell if isinstance(cell, str) else ""

    # Area rule:
    # look for a row N such that:
    #  - N,0 contains a day-of-week label (pa_dzien_tygodnia)
    #  - N,[1..K] contain dates, K >= 1
    #  - [N+1..L],0 contain hours
    def is_area_starting_here(rowid):
        """Check whether a timetable area has its header in row ``rowid``.

        Returns:
            False when the rule above is not satisfied; otherwise
            ``((rowid, 0), (end_row, end_col))`` where ``end_row`` is the
            first row below the header whose column 0 is not an hour and
            ``end_col`` is the first header column that is not a date
            (both bounds are exclusive).
        """
        if not re_dzien_tygodnia.match(cell_text(rowid, 0)):
            return False

        # Walk right along the header row while the cells hold dates.
        end_col = 1
        while re_data.match(cell_text(rowid, end_col)):
            end_col += 1
        if end_col == 1:
            # K >= 1 is required: no date column at all.
            return False

        # Walk down column 0, starting below the header, while it holds hours.
        end_row = rowid + 1
        while re_godziny.match(cell_text(end_row, 0)):
            end_row += 1
        if end_row == rowid + 1:
            # No hour rows under the header.
            return False

        return ((rowid, 0), (end_row, end_col))

    return cells
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment