Skip to content

Instantly share code, notes, and snippets.

@Filnor

Filnor/dict.py

Created Jun 14, 2018
Embed
What would you like to do?
# Too much Java and Kotlin <.<
import re
from collections import OrderedDict
def println(what):
    """Write ``what`` to standard output followed by a newline.

    Thin convenience alias around the built-in ``print`` for a single value.
    """
    print(what)
# Dataset prep
def getLineIds(path="movie_lines.txt", encoding=None):
    """Load a mapping of line id -> utterance text from a movie-lines file.

    Every record is expected to hold exactly five fields separated by the
    literal delimiter `` +++$+++ ``. The first field becomes the key and the
    fifth field the value. Lines with any other number of fields (blank
    lines included) are silently skipped.

    Args:
        path: File to read. Defaults to ``"movie_lines.txt"`` relative to
            the current working directory, the location that used to be
            hard-coded.
        encoding: Text encoding handed to ``open``. ``None`` keeps the
            platform default, i.e. the previous behaviour.

    Returns:
        dict mapping the first field of each well-formed line to its fifth
        field. If a key occurs more than once, the last occurrence wins.

    Raises:
        OSError: if the file cannot be opened.
    """
    lineIds = {}
    # The context manager closes the handle even if decoding fails midway;
    # the previous bare open().read() left that to the garbage collector.
    with open(path, encoding=encoding) as handle:
        for raw in handle:
            fields = raw.rstrip("\n").split(" +++$+++ ")
            if len(fields) == 5:
                lineIds[fields[0]] = fields[4]
    return lineIds
def removeGendered(line):
    """Strip gendered forms of address from ``line``.

    Removes "sir", "sire", "siree", "ma'am" and "maam" (the pattern is
    ``sire{,2}`` / ``ma'?am``) and rewrites "yessir" to "yes". The original
    author's rationale: these words are uncommon, could lower accuracy, and
    the model would otherwise have to guess which one to use.

    The input is expected to be lower-case already; the patterns are
    case-sensitive.

    Args:
        line: Text to clean. May contain several newline-separated lines;
            a leading address is removed at the start of each of them.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # count/flags are passed by keyword: handing them to re.sub positionally
    # is deprecated as of Python 3.13.

    # 1) Address at the start of a line, together with the punctuation or
    #    whitespace that follows it ("sir, yes" -> "yes").
    reformatted = re.sub(
        r"^(sire{,2}|ma'?am)\W+", "", line, flags=re.MULTILINE
    ).strip()
    # 2) Address elsewhere, together with the non-word characters that
    #    precede it ("yes, sir." -> "yes."). The look-ahead keeps words such
    #    as "siren" intact.
    reformatted = re.sub(
        r"(\W+|\b)(ma'?am|sire{,2}(?![a-z]))",
        "",
        reformatted,
        flags=re.MULTILINE,
    )
    # 3) The fused form is a plain literal, so no regex is needed.
    reformatted = reformatted.replace("yessir", "yes")
    return reformatted.strip()
# Whitelist of characters that survive removeUnwanted(): digits, lower-case
# ASCII letters, space and a handful of punctuation marks.
supportedChars = "0123456789abcdefghijklmnopqrstuvwxyz-_' ,.!?"


def removeUnwanted(line):
    """Return ``line`` with every character outside ``supportedChars`` dropped.

    Upper-case letters are not in the whitelist, so callers that want to
    keep them must lower-case the text first.
    """
    return "".join(filter(supportedChars.__contains__, line))
def tokenizeLine(line):
    """Split one utterance into a list of word and punctuation tokens.

    Steps:
      1. Lower-case and strip the text, then drop unsupported characters
         via ``removeUnwanted`` and gendered address words via
         ``removeGendered``.
      2. Collapse runs of ``?`` or ``!`` into a single mark.
      3. Surround each punctuation mark with spaces so that it becomes its
         own token; a double dash ``--`` is kept together as one token.
      4. Split on spaces and discard empty strings.

    Args:
        line: Raw utterance text.

    Returns:
        list of non-empty token strings, in order of appearance.
    """
    cleaned = removeUnwanted(line.lower().strip())
    cleaned = removeGendered(cleaned)

    # Collapse repeated marks BEFORE padding. Previously this ran after the
    # padding step, when every mark was already separated by spaces, so the
    # patterns could never match.
    cleaned = re.sub(r"\?{2,}", "?", cleaned)
    cleaned = re.sub(r"!{2,}", "!", cleaned)

    # "--" is listed first in the alternation so a double dash is padded as
    # one unit instead of being torn into two "-" tokens.
    cleaned = re.sub(r"(--|[?!.,^:;\-+_'\"])", r" \1 ", cleaned)

    # str.split(" ") only ever yields strings, so filtering out the empty
    # ones is sufficient.
    return [token for token in cleaned.split(" ") if token]
def prepQA():
    """Build the token vocabulary of the movie-lines dataset and print it.

    Loads the id -> text mapping with ``getLineIds``, tokenizes every
    utterance with ``tokenizeLine`` and collects the distinct tokens. The
    vocabulary list is printed first, followed by ``"Size = <count>"``.

    The order of the printed vocabulary is unspecified (it comes from
    iterating a set), as was already the case before.

    Returns:
        None. The result is only printed.
    """
    lineIds = getLineIds()

    # A set gives O(1) membership and de-duplicates as it goes. The earlier
    # list-based "t not in vocab" scan was quadratic in the vocabulary size
    # and still needed a set() pass afterwards.
    vocab = set()
    for text in lineIds.values():
        # tokenizeLine lower-cases its input itself.
        vocab.update(tokenizeLine(text))

    vocab = list(vocab)
    println(vocab)
    println("Size = " + str(len(vocab)))
if __name__ == "__main__":
    # Build and print the vocabulary only when run as a script, not when the
    # module is imported.
    prepQA()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment