import re


def strip_header(row):
    """Remove the registry header ("Numéro inscription : ... : YYYY-MM-DD HH:MM") from a row."""
    return re.sub(r'(", ")?Numéro inscription\s:\s(.*?):\s\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}(\s\d{1,2})?', "", row)
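# Illustrative check (the header below is a made-up example of the registry
# format, not real data): everything up to the timestamp should be stripped.
print(strip_header("Numéro inscription : 12 345 678 DHM de présentation : 2020-05-06 16:47 Lot 1 234 567"))
# ' Lot 1 234 567'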
# Expansions for common abbreviations, so the trailing "." inside them is not
# mistaken for a sentence terminator.
abbreviations = {
    'dr.': 'doctor', 'mr.': 'mister', 'bro.': 'brother', 'bro': 'brother',
    'mrs.': 'mistress', 'ms.': 'miss', 'jr.': 'junior', 'sr.': 'senior',
    'i.e.': 'that is', 'e.g.': 'for example', 'vs.': 'versus',
}

# Characters that can end a sentence, and closing wrappers that may follow them.
terminators = ['.', '!', '?', ';']
wrappers = ['"', "'", ')', ']', '}']
def find_sentences(paragraph):
    """Split a paragraph into sentences by repeatedly locating the last sentence end."""
    end = True
    sentences = []
    while end > -1:
        end = find_sentence_end(paragraph)
        if end > -1:
            sentences.append(paragraph[end:].strip())
            paragraph = paragraph[:end]
    sentences.append(paragraph)
    sentences.reverse()
    return sentences
def find_sentence_end(paragraph):
    """Return the index just past the last sentence boundary, or -1 if none is found."""
    possible_endings, contraction_locations = [], []
    contractions = abbreviations.keys()
    # A terminator may be followed by a closing quote/bracket, e.g. '."' or '?)'.
    sentence_terminators = terminators + [terminator + wrapper
                                          for wrapper in wrappers
                                          for terminator in terminators]
    for sentence_terminator in sentence_terminators:
        t_indices = list(find_all(paragraph, sentence_terminator))
        possible_endings.extend([[i, len(sentence_terminator)] for i in t_indices])
    for contraction in contractions:
        c_indices = list(find_all(paragraph, contraction))
        contraction_locations.extend([i + len(contraction) for i in c_indices])
    # Discard terminators that are really the final "." of a known abbreviation.
    possible_endings = [pe for pe in possible_endings if pe[0] + pe[1] not in contraction_locations]
    # If the paragraph itself ends at a terminator, drop that ending so the
    # final sentence stays with the remainder of the paragraph.
    if len(paragraph) in [pe[0] + pe[1] for pe in possible_endings]:
        max_end_start = max([pe[0] for pe in possible_endings])
        possible_endings = [pe for pe in possible_endings if pe[0] != max_end_start]
    # Keep only endings followed by a space (a plausible sentence boundary).
    possible_endings = [pe[0] + pe[1] for pe in possible_endings
                        if sum(pe) > len(paragraph) or (sum(pe) < len(paragraph) and paragraph[sum(pe)] == ' ')]
    return -1 if not possible_endings else max(possible_endings)
def find_all(a_str, sub):
    """Yield every non-overlapping start index of `sub` within `a_str`."""
    start = 0
    while True:
        start = a_str.find(sub, start)
        if start == -1:
            return
        yield start
        start += len(sub)
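# Illustrative check of the splitter on a synthetic paragraph: the "vs."
# abbreviation should not produce a sentence break, giving two sentences here.
print(find_sentences('The ruling in Smith vs. Jones was final. The lot was sold!'))
# ['The ruling in Smith vs. Jones was final.', 'The lot was sold!']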
# Phrases (English and French) that mark a sentence as useful context for a
# lot number; the final alternative matches a Canadian postal code.
re_context_flags = re.compile(r"civic|Address|parking space|parking unit|zoning regulations|vacant land|locker unit|vacant lot|party wall|building|underground emergency exit|exclusive use of a filing space|private portion|declaration of co-ownership|portions|property|storage space|les numéros civiques|servitudes actives et passives|d’eau aux termes|parties privatives communes|case de rangement|mur mitoyen|dudit immeuble|numéro domiciliaire|ensemble immobilier|Sans bâtisse|portant le numéro|bâtisse érigée|Adresse de l'immeuble|declarations of co-ownership|Declaration of co-ownership|Declaration of Co-ownership|Declaration of Co-Ownership|les parties communes|espaces de stationnement|L'entrée de gravier|espace de stationnement intérieur|six mètres|sans bâtisse dessus|indivis du lot|marches de béton|rattachant à cette fraction|unité de copropriété|bâtisses dessus construites|[ABCEGHJKLMNPRSTVXY][0-9][ABCEGHJKLMNPRSTVWXYZ] ?[0-9][ABCEGHJKLMNPRSTVWXYZ][0-9]")
# NOTE: `re_lot` and `re_lot_num` are compiled elsewhere (not included in this
# gist); they are assumed to match the start of a lot description and the lot
# numbers themselves.
def parse_lot_context(row):
    """Extract (lot number, surrounding context) pairs from a combined-pages row."""
    row = strip_header(row)
    table = str.maketrans(dict.fromkeys("()"))
    start = re_lot.search(row)
    if start:
        idx = start.span()[0]
        result = []
        around = row[idx:idx + 5000]
        lots = re_lot_num.findall(around)
        uniqs = list(set(lots))
        for uniq in uniqs:
            stripped = uniq.translate(table)
            # str.strip() removes a character *set*, not a prefix, so drop the
            # "Lot "/"lot " label explicitly instead.
            stripped = stripped.replace("Lot ", "").replace("lot ", "")
            result.append(stripped)
        if len(result) == 0:
            result = parse_backup_lots(around)
            uniqs = list(set(result))  # use the backup matches themselves as lookup keys
        contexts = find_lot_context(uniqs, result, row)
        return contexts
    else:
        temp_result = parse_backup_lots(row)
        uniqs = list(set(temp_result))
        contexts = find_lot_context(uniqs, temp_result, row)
        return contexts
# Fallback lot-number patterns: "(123)", "12 345 678", "1 234 567", "12-A-3456".
re_lot_backup = re.compile(r'\(\d{3}\)|\d{2}\s\d{3}\s\d{3}|\d{1}\s\d{3}\s\d{3}|\d{2}\-\w\-\d{4}')


def parse_backup_lots(row):
    """Find lot numbers with the backup patterns, stripping any parentheses."""
    table = str.maketrans(dict.fromkeys("()"))
    matches = re_lot_backup.findall(row)
    result = []
    uniqs = list(set(matches))
    for uniq in uniqs:
        result.append(uniq.translate(table))
    return result
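# Illustrative check (synthetic input): the backup patterns pick up both bare
# and parenthesised lot numbers.
print(parse_backup_lots("lots (123) and 1 234 567 of the cadastre"))
# e.g. ['123', '1 234 567'] (order varies because of the set() pass)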
def find_lot_context(uniques, lots, row):
    """Pair each stripped lot number with nearby sentences that match a context flag."""
    result = []
    if not (uniques and lots):
        return []
    for original, stripped in zip(uniques, lots):
        idx = row.index(original)
        # Grab a window around the lot number, clamped so a negative start
        # index does not wrap around to the end of the string.
        before = max(0, idx - 100)
        after = idx + 750
        around_lot = row[before:after]
        sentences = find_sentences(around_lot)
        matches = []
        for sent in sentences:
            match = re_context_flags.search(sent)
            if match:
                matches.append(sent)
        if matches:
            result.append((stripped, " ".join(matches)))
    return result
# Apply to both dataframes (dfe / dff presumably hold the English and French documents).
dfe['lot_number_context'] = dfe['pages_combined'].apply(parse_lot_context)
dff['lot_number_context'] = dff['pages_combined'].apply(parse_lot_context)
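# End-to-end illustration with a synthetic row (since re_lot / re_lot_num are
# not in this gist, find_lot_context is called directly here):
row = "Lot 1 234 567. A building is erected on the lot. Nothing else here."
print(find_lot_context(["1 234 567"], ["1 234 567"], row))
# [('1 234 567', 'A building is erected on the lot.')]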