Last active
May 6, 2020 16:47
-
-
Save fantods/dea6ab40f7cb15777d71eb8f6fa06d3d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def strip_header(row):
    """Return ``row`` with every "Numéro inscription" header removed.

    A header is: an optional ``", "`` separator, the literal text
    ``Numéro inscription``, a colon-delimited field, a ``YYYY-MM-DD HH:MM``
    timestamp and an optional trailing one- or two-digit number
    (presumably a page counter -- confirm against the input data).
    """
    header_pattern = r'(", ")?Numéro inscription\s:\s(.*?):\s\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}(\s\d{1,2})?'
    cleaned = re.sub(header_pattern, "", row)
    return cleaned
# Lower-case abbreviations that end in a period (plus bare "bro"), mapped to
# their expansions. The sentence splitter consults only the keys, so that a
# period closing one of these is not treated as the end of a sentence.
abbreviations = {
    'dr.': 'doctor',
    'mr.': 'mister',
    'bro.': 'brother',
    'bro': 'brother',
    'mrs.': 'mistress',
    'ms.': 'miss',
    'jr.': 'junior',
    'sr.': 'senior',
    'i.e.': 'that is',  # "id est"; was wrongly expanded as "for example"
    'e.g.': 'for example',
    'vs.': 'versus',
}

# Characters that may end a sentence.
terminators = ['.', '!', '?', ';']

# Closing quotes/brackets that may directly follow a terminator and still
# belong to the same sentence ending, e.g. `."` or `?)`.
wrappers = ['"', "'", ')', ']', '}']
def find_sentences(paragraph):
    """Split ``paragraph`` into a list of sentences in reading order.

    The text is cut repeatedly at the position reported by
    ``find_sentence_end``, peeling sentences off the tail until no further
    boundary is found. Every piece except the first is whitespace-stripped;
    the leading remainder is returned as-is.
    """
    pieces = []
    remaining = paragraph
    while True:
        cut = find_sentence_end(remaining)
        if cut <= -1:
            break
        # Everything after the boundary is one complete trailing sentence.
        pieces.append(remaining[cut:].strip())
        remaining = remaining[:cut]
    pieces.append(remaining)
    # Pieces were collected back to front.
    return pieces[::-1]
def find_sentence_end(paragraph):
    """Return the index just past the last interior sentence boundary.

    A boundary is a terminator (optionally followed by one wrapper
    character) that is immediately followed by a space and does not end at
    the same position as one of the known abbreviations. Returns ``-1``
    when ``paragraph`` contains no such boundary.
    """
    size = len(paragraph)

    # Bare terminators first, then every terminator + wrapper combination.
    markers = list(terminators)
    for closer in wrappers:
        for stop in terminators:
            markers.append(stop + closer)

    # (start, width) for each occurrence of each marker.
    candidates = [
        (position, len(marker))
        for marker in markers
        for position in find_all(paragraph, marker)
    ]

    # Positions at which an abbreviation ends; a marker ending at the same
    # place is part of the abbreviation, not a sentence end.
    abbreviation_ends = {
        position + len(abbreviation)
        for abbreviation in abbreviations
        for position in find_all(paragraph, abbreviation)
    }
    candidates = [
        (start, width)
        for start, width in candidates
        if start + width not in abbreviation_ends
    ]

    # When some marker closes the paragraph itself, drop every candidate that
    # shares the greatest start index so the final terminator is not a cut.
    if any(start + width == size for start, width in candidates):
        last_start = max(start for start, _ in candidates)
        candidates = [
            (start, width)
            for start, width in candidates
            if start != last_start
        ]

    # Keep only boundaries that are followed by a space.
    stops = [
        start + width
        for start, width in candidates
        if start + width > size
        or (start + width < size and paragraph[start + width] == ' ')
    ]
    return max(stops) if stops else -1
def find_all(a_str, sub):
    """Yield the start index of each non-overlapping occurrence of ``sub``.

    Occurrences are reported left to right; after a hit the search resumes
    just past the matched text, so ``find_all("aaaa", "aa")`` yields 0 and 2.

    An empty ``sub`` yields nothing. Without this guard ``str.find`` would
    keep returning the same index and the generator would never terminate.
    """
    if not sub:
        return
    step = len(sub)
    start = a_str.find(sub)
    while start != -1:
        yield start
        start = a_str.find(sub, start + step)
# Phrases (English and French) that flag a sentence as describing the
# property around a lot number. The alternatives are joined with "|" in the
# order listed; the final entry is a postal-code-shaped pattern (looks like
# the Canadian letter-digit-letter digit-letter-digit format -- confirm).
re_context_flags = re.compile("|".join([
    "civic",
    "Address",
    "parking space",
    "parking unit",
    "zoning regulations",
    "vacant land",
    "locker unit",
    "vacant lot",
    "party wall",
    "building",
    "underground emergency exit",
    "exclusive use of a filing space",
    "private portion",
    "declaration of co-ownership",
    "portions",
    "property",
    "storage space",
    "les numéros civiques",
    "servitudes actives et passives",
    "d’eau aux termes",
    "parties privatives communes",
    "case de rangement",
    "mur mitoyen",
    "dudit immeuble",
    "numéro domiciliaire",
    "ensemble immobilier",
    "Sans bâtisse",
    "portant le numéro",
    "bâtisse érigée",
    "Adresse de l'immeuble",
    "declarations of co-ownership",
    "Declaration of co-ownership",
    "Declaration of Co-ownership",
    "Declaration of Co-Ownership",
    "les parties communes",
    "espaces de stationnement",
    "L'entrée de gravier",
    "espace de stationnement intérieur",
    "six mètres",
    "sans bâtisse dessus",
    "indivis du lot",
    "marches de béton",
    "rattachant à cette fraction",
    "unité de copropriété",
    "bâtisses dessus construites",
    r"[ABCEGHJKLMNPRSTVXY][0-9][ABCEGHJKLMNPRSTVWXYZ] ?[0-9][ABCEGHJKLMNPRSTVWXYZ][0-9]",
]))
def _strip_lot_prefix(label):
    """Drop a leading ``Lot``/``lot`` word from ``label`` and trim whitespace.

    Labels that do not start with that word are returned unchanged.
    """
    trimmed = label.strip()
    for prefix in ("Lot", "lot"):
        if trimmed.startswith(prefix):
            # Slice off the prefix itself; str.strip("Lot ") would treat the
            # argument as a set of characters and eat them from both ends.
            return trimmed[len(prefix):].strip()
    return label


def parse_lot_context(row):
    """Extract lot numbers from ``row`` together with their nearby context.

    The header is removed with ``strip_header`` first. If ``re_lot`` matches,
    lot numbers are collected with ``re_lot_num`` from the 5000 characters
    starting at that match; when that yields nothing, or when ``re_lot`` does
    not match at all, ``parse_backup_lots`` is used instead.

    Returns whatever ``find_lot_context`` produces for the collected lots.
    Lots are processed in order of first appearance, so the result does not
    vary with string hash randomisation.
    """
    row = strip_header(row)
    strip_parens = str.maketrans(dict.fromkeys("()"))

    anchor = re_lot.search(row)
    if not anchor:
        backup = parse_backup_lots(row)
        # Use one list for both roles so each lot is paired with itself.
        return find_lot_context(backup, backup, row)

    start = anchor.start()
    around = row[start:start + 5000]

    # dict.fromkeys de-duplicates while keeping first-seen order.
    originals = list(dict.fromkeys(re_lot_num.findall(around)))
    labels = [
        _strip_lot_prefix(original.translate(strip_parens))
        for original in originals
    ]

    if not labels:
        # The backup matches serve as both search text and label; passing
        # the empty primary list here made the fallback return nothing.
        originals = labels = parse_backup_lots(around)

    return find_lot_context(originals, labels, row)
# Fallback lot-number shapes: "(ddd)", "dd ddd ddd", "d ddd ddd" and
# "dd-w-dddd" (d = digit, w = one word character).
re_lot_backup = re.compile(r'\(\d{3}\)|\d{2}\s\d{3}\s\d{3}|\d{1}\s\d{3}\s\d{3}|\d{2}\-\w\-\d{4}')


def parse_backup_lots(row):
    """Return the distinct fallback lot numbers found in ``row``.

    Matches of ``re_lot_backup`` are de-duplicated and returned with any
    parentheses removed, in order of first appearance. (A plain ``set`` would
    make the order depend on string hash randomisation.)
    """
    strip_parens = str.maketrans(dict.fromkeys("()"))
    distinct = dict.fromkeys(re_lot_backup.findall(row))
    return [match.translate(strip_parens) for match in distinct]
def find_lot_context(uniques, lots, row):
    """Collect the flagged sentences surrounding each lot number in ``row``.

    Args:
        uniques: lot strings exactly as they appear in ``row``; each one is
            located with ``row.index``.
        lots: display labels, paired positionally with ``uniques``.
        row: the text to search.

    Returns:
        A list of ``(label, context)`` tuples, one per lot whose surrounding
        window (100 characters before to 750 after the first occurrence)
        contains at least one sentence matching ``re_context_flags``. The
        context is those sentences joined by single spaces. Returns ``[]``
        when either input is empty.

    Raises:
        ValueError: if an entry of ``uniques`` does not occur in ``row``.
    """
    if not (uniques and lots):
        return []

    result = []
    for original, stripped in zip(uniques, lots):
        idx = row.index(original)
        # Clamp at 0: a negative start would make the slice count from the
        # end of the string when the lot sits in the first 100 characters.
        window = row[max(idx - 100, 0):idx + 750]
        flagged = [
            sentence
            for sentence in find_sentences(window)
            if re_context_flags.search(sentence)
        ]
        if flagged:
            result.append((stripped, " ".join(flagged)))
    # Return only after every lot has been examined; returning from inside
    # the loop reported at most one lot.
    return result
# Derive the lot-number context for each row of both frames from its
# combined page text.
dfe['lot_number_context'] = dfe['pages_combined'].apply(parse_lot_context)
dff['lot_number_context'] = dff['pages_combined'].apply(parse_lot_context)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment