# fantods/context.py

## context.py
def strip_header(row):
    """Return ``row`` with every "Numéro inscription" header stamp removed.

    A stamp has the shape ``Numéro inscription : <text> : YYYY-MM-DD HH:MM``,
    optionally preceded by the literal ``", "`` and optionally followed by
    one whitespace character and a one- or two-digit number. All occurrences
    are replaced by the empty string; the surrounding text is left untouched.
    """
    header_stamp = (
        r'(", ")?Numéro inscription\s:\s(.*?):\s'
        r'\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}(\s\d{1,2})?'
    )
    return re.sub(pattern=header_stamp, repl="", string=row)

# Abbreviations (lower-case keys) mapped to their spelled-out form.
# find_sentence_end uses the keys to avoid treating the period that closes an
# abbreviation as the end of a sentence.
abbreviations = {
    'dr.': 'doctor',
    'mr.': 'mister',
    'bro.': 'brother',
    'bro': 'brother',
    'mrs.': 'mistress',
    'ms.': 'miss',
    'jr.': 'junior',
    'sr.': 'senior',
    # "i.e." (id est) means "that is"; "for example" is the meaning of "e.g.".
    'i.e.': 'that is',
    'e.g.': 'for example',
    'vs.': 'versus',
}
# Characters that may end a sentence.
terminators = ['.', '!', '?', ';']
# Closing quotes/brackets that may directly follow a terminator and still
# belong to the sentence being closed.
wrappers = ['"', "'", ')', ']', '}']

def find_sentences(paragraph):
    """Split ``paragraph`` into a list of sentences, in reading order.

    The text is taken apart from the back: ``find_sentence_end`` reports the
    offset of the last usable sentence boundary, everything after that offset
    becomes one sentence, and the search repeats on the text before it.

    Every sentence except the first is stripped of surrounding whitespace;
    the first one is returned exactly as it is left after the final cut.
    Text without any boundary comes back as a one-element list.
    """
    pieces = []
    remaining = paragraph
    while True:
        cut = find_sentence_end(remaining)
        if cut < 0:
            # No further boundary: whatever is left is the leading sentence.
            break
        pieces.append(remaining[cut:].strip())
        remaining = remaining[:cut]
    pieces.append(remaining)
    # Sentences were collected last-to-first; hand them back in text order.
    return pieces[::-1]


def find_sentence_end(paragraph):
    """Return the offset just past the last interior sentence ending, or -1.

    A candidate ending is an occurrence of one of ``terminators``, optionally
    followed directly by one of ``wrappers``. It qualifies only when

    * the character right after it is a space (so an ending flush with the
      end of ``paragraph`` is never reported), and
    * it does not coincide with the end of one of the ``abbreviations`` keys.

    Abbreviations are matched case-insensitively and only at the start of a
    word: "Dr. Smith" is not split after the title, while "three items. Next"
    still is (a plain substring test would mistake the latter for "ms.").

    Args:
        paragraph: Text to scan.

    Returns:
        The largest qualifying end offset, usable as ``paragraph[:end]`` /
        ``paragraph[end:]``, or -1 when there is none.
    """
    import re  # function-scope import so this block does not rely on the file's import list

    size = len(paragraph)

    # Offsets at which an abbreviation ends; a terminator ending at the same
    # offset is the abbreviation's own punctuation, not a sentence boundary.
    abbreviation_ends = set()
    for abbreviation in abbreviations:
        # (?<!\w) rejects hits that are merely the tail of a longer word.
        pattern = r"(?<!\w)" + re.escape(abbreviation)
        for hit in re.finditer(pattern, paragraph, re.IGNORECASE):
            abbreviation_ends.add(hit.end())

    # Bare terminators plus every terminator+wrapper pair (e.g. '."', '?)').
    sentence_terminators = terminators + [
        terminator + wrapper for wrapper in wrappers for terminator in terminators
    ]
    candidate_ends = set()
    for token in sentence_terminators:
        for start in find_all(paragraph, token):
            candidate_ends.add(start + len(token))

    # `end < size` both guards the index and drops the paragraph's own final
    # terminator, which has nothing after it to split off.
    usable = [
        end
        for end in candidate_ends
        if end not in abbreviation_ends and end < size and paragraph[end] == ' '
    ]
    return max(usable, default=-1)


def find_all(a_str, sub):
    """Yield the start index of each non-overlapping occurrence of ``sub``.

    Occurrences are reported left to right; after a hit the scan resumes
    behind the matched text, so ``find_all("aaaa", "aa")`` yields 0 and 2.

    An empty ``sub`` matches at every offset from 0 to ``len(a_str)``
    inclusive (the same positions ``str.count`` counts) instead of yielding
    index 0 forever.

    Args:
        a_str: String to search in.
        sub: Substring to look for.

    Yields:
        Integer start offsets into ``a_str``.
    """
    # Always advance by at least one character so an empty needle cannot
    # pin the scan to the same offset.
    step = len(sub) or 1
    start = a_str.find(sub)
    while start != -1:
        yield start
        start = a_str.find(sub, start + step)

 re_context_flags = re.compile(r"civic|Address|parking space|parking unit|zoning regulations|vacant land|locker unit|vacant lot|party wall|building|underground emergency exit|exclusive use of a filing space|private portion|declaration of co-ownership|portions|property|storage space|les numéros civiques|servitudes actives et passives|d’eau aux termes|parties privatives communes|case de rangement|mur mitoyen|dudit immeuble|numéro domiciliaire|ensemble immobilier|Sans bâtisse|portant le numéro|bâtisse érigée|Adresse de l'immeuble|declarations of co-ownership|Declaration of co-ownership|Declaration of Co-ownership|Declaration of Co-Ownership|les parties communes|espaces de stationnement|L'entrée de gravier|espace de stationnement intérieur|six mètres|sans bâtisse dessus|indivis du lot|marches de béton|rattachant à cette fraction|unité de copropriété|bâtisses dessus construites|[ABCEGHJKLMNPRSTVXY][0-9][ABCEGHJKLMNPRSTVWXYZ] ?[0-9][ABCEGHJKLMNPRSTVWXYZ][0-9]")

def parse_lot_context(row):
    """Find the lot numbers mentioned in ``row`` and the text that gives them context.

    After the header stamp is stripped, ``re_lot`` is used to find where lot
    information begins. Lot numbers are then collected with ``re_lot_num``
    from the 5000 characters that follow; if that yields nothing, or if
    ``re_lot`` never matched, the looser ``parse_backup_lots`` patterns are
    used instead (on the same 5000-character window, or on the whole text
    respectively).

    Args:
        row: Text to analyse.

    Returns:
        Whatever ``find_lot_context`` returns for the lots found: a list of
        ``(lot, context)`` tuples, possibly empty.
    """
    row = strip_header(row)
    table = str.maketrans(dict.fromkeys("()"))  # deletes "(" and ")"
    start = re_lot.search(row)

    if not start:
        backup = parse_backup_lots(row)
        # Backup lots only lose parentheses, so each one still occurs verbatim
        # in row and can serve both as the search key and as the label.
        # Passing the same list twice keeps keys and labels aligned.
        return find_lot_context(backup, backup, row)

    idx = start.start()
    around = row[idx:(idx + 5000)]
    # De-duplicate in first-seen order so the result does not depend on the
    # hash ordering of a set.
    uniqs = list(dict.fromkeys(re_lot_num.findall(around)))

    result = []
    for uniq in uniqs:
        stripped = uniq.translate(table)
        # NOTE(review): str.strip removes any of the listed characters from
        # both ends rather than the literal prefix "Lot " - confirm this is
        # safe for every shape re_lot_num can return.
        if "Lot" in stripped:
            stripped = stripped.strip("Lot ")
        elif "lot" in stripped:
            stripped = stripped.strip("lot ")
        result.append(stripped)

    if not result:
        # Nothing from the primary pattern: the backup lots must also become
        # the search keys, otherwise find_lot_context has nothing to locate.
        uniqs = result = parse_backup_lots(around)
    return find_lot_context(uniqs, result, row)

# Fallback lot-number shapes, tried in this order at each position:
#   (ddd)        three digits in parentheses
#   dd ddd ddd   2+3+3 digits separated by single whitespace characters
#   d ddd ddd    1+3+3 digits separated by single whitespace characters
#   dd-w-dddd    two digits, dash, one word character, dash, four digits
# NOTE(review): there are no boundary anchors, so a longer digit run can match
# from its middle (e.g. "123 456 789" matches as "23 456 789") - confirm intended.
re_lot_backup = re.compile(r'\(\d{3}\)|\d{2}\s\d{3}\s\d{3}|\d{1}\s\d{3}\s\d{3}|\d{2}\-\w\-\d{4}')

def parse_backup_lots(row):
    """Return the distinct ``re_lot_backup`` matches in ``row``, parentheses removed.

    Matches are de-duplicated (before the parentheses are removed) and
    returned in order of first appearance, so the result is the same on
    every run.

    Args:
        row: Text to scan.

    Returns:
        A list of lot strings; empty when nothing matches.
    """
    table = str.maketrans(dict.fromkeys("()"))  # deletes "(" and ")"
    # dict.fromkeys keeps first-seen order; a set would order by string hash,
    # which changes between interpreter runs.
    distinct = dict.fromkeys(re_lot_backup.findall(row))
    return [match.translate(table) for match in distinct]

def find_lot_context(uniques, lots, row):
    """Collect, for each lot, the nearby sentences that mention a context flag.

    For every lot the text from 100 characters before to 750 characters after
    its first occurrence in ``row`` is split into sentences, and the sentences
    matching ``re_context_flags`` are joined with single spaces.

    Args:
        uniques: Lot strings exactly as they occur in ``row``; used to locate
            each lot.
        lots: Cleaned lot strings, parallel to ``uniques``; used to label the
            results.
        row: Text to search.

    Returns:
        A list of ``(lot, context)`` tuples, one per lot that has at least one
        matching sentence. Empty when either input is empty or nothing matches.

    Raises:
        ValueError: If an entry of ``uniques`` does not occur in ``row``.
    """
    result = []
    if not (uniques and lots):
        return result

    for original, stripped in zip(uniques, lots):
        idx = row.index(original)
        # Clamp at 0: a negative start would make the slice count from the
        # end of row and usually come back empty.
        before = max(idx - 100, 0)
        after = idx + 750
        sentences = find_sentences(row[before:after])
        matches = [sent for sent in sentences if re_context_flags.search(sent)]
        if matches:
            result.append((stripped, " ".join(matches)))
    # Return only after every lot has been examined, not after the first one.
    return result

# Store the parsed lot/context pairs for each row's combined page text.
# NOTE(review): dfe and dff are defined outside this view - presumably pandas
# DataFrames with a 'pages_combined' text column; confirm.
dfe['lot_number_context'] = dfe['pages_combined'].apply(parse_lot_context)
dff['lot_number_context'] = dff['pages_combined'].apply(parse_lot_context)