This report summarizes the work I did during Google Summer of Code 2021, along with the results, the scope for improvement, and future work.
It also serves as the final project report and lists all of the contributions.
# IN: file_path
# Read the value stored under the "messages" key of the JSON file at
# file_path. events stays an empty list when the path does not exist.
events: list[dict[str, Any]] = []
if path.exists(file_path):
    with open(file_path, encoding="utf-8") as file:
        # Plain rebinding: the name is already annotated above, and a second
        # annotation is reported by type checkers as a redefinition.
        events = json.load(file)["messages"]
# OUT: events
# IN: events
# Map event_id -> body for every "m.room.message" event whose content
# carries a "body" entry; every other event is skipped.
messages: dict[str, str] = {}
for event in events:
    if event["type"] != "m.room.message":
        continue
    if "content" not in event or "body" not in event["content"]:
        continue
    messages[event["event_id"]] = event["content"]["body"]
# OUT: messages
# IN: string
# Normalise the text: blank out punctuation, lower-case it, tokenize it,
# then drop stop words.
# One translate() pass maps each listed punctuation character to a space.
punctuation_to_space = str.maketrans(
    dict.fromkeys("!()-[]{};:, <>./?@#$%^&*_~'\"\\", " ")
)
string = string.translate(punctuation_to_space).lower()
# The stop-word lookup does not depend on the token, so fetch it once and
# keep it as a set instead of re-evaluating stopwords.words() per token.
# NOTE(review): assumes stopwords.words() yields hashable strings - confirm.
stop_words = set(stopwords.words())
tokens = [token for token in word_tokenize(string) if token not in stop_words]
# OUT: tokens
# IN: documents
# Collect every distinct item that occurs in any document.
keywords: set[str] = set()
for document in documents.values():
    # update() grows the set in place, so the accumulated set is not
    # copied again for every document.
    keywords.update(document)
# OUT: keywords
# IN: documents, keywords
# Build keyword -> tuple of ids of the documents containing that keyword.
# Every keyword gets an entry (an empty tuple when no document contains it),
# and ids appear in the iteration order of documents.
# The index is filled in one pass over the documents rather than rescanning
# every document for every keyword.
# NOTE(review): assumes each document is a sequence of hashable tokens
# (not a raw string, where `in` would mean substring search) - confirm.
postings = {keyword: [] for keyword in keywords}
for document_id, document_content in documents.items():
    # set() so a token repeated within a document lists that document once.
    for token in set(document_content):
        if token in postings:
            postings[token].append(document_id)
inverted_index = {
    keyword: tuple(document_ids) for keyword, document_ids in postings.items()
}
# OUT: inverted_index
[] |