Created
January 3, 2019 15:58
-
-
Save MagnetonBora/8051b3fc6fcbb75197d74f4a7c14f392 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import numpy as np | |
from collections import Counter | |
def read_data(filename, encoding=None):
    """Read a text file and return its lines.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding handed to ``open``. The default ``None``
            keeps the platform's default encoding (the previous behavior);
            pass e.g. ``"utf-8"`` to make decoding independent of the
            machine the code runs on.

    Returns:
        A list with one string per line; line terminators are kept.
    """
    # Read mode is the default for open(), so only the encoding is spelled out.
    with open(filename, encoding=encoding) as f:
        return f.readlines()
def tokenize(sentence):
    """Split a sentence into lowercase words made only of ASCII letters.

    Every character outside ``A-Z``/``a-z`` acts as a separator, so digits,
    punctuation and whitespace never appear in the returned list.
    """
    # Matching maximal runs of letters yields the same pieces as splitting on
    # every non-letter and discarding the empty strings.
    return [word.lower() for word in re.findall('[A-Za-z]+', sentence)]
def frequencies(tokens):
    """Count how many times each token occurs.

    Returns a ``collections.Counter`` mapping each token to its number of
    occurrences in ``tokens``.
    """
    counts = Counter()
    counts.update(tokens)
    return counts
def get_words(list_of_tokenized):
    """Collect the distinct tokens found across all token sequences.

    ``list_of_tokenized`` is an iterable of token iterables; the result is a
    set holding every token that appears in any of them.
    """
    return {token for tokens in list_of_tokenized for token in tokens}
def get_index_map(words):
    """Map each distinct word to a unique, contiguous integer index.

    Args:
        words: Any iterable of hashable words (set, list, generator, ...).

    Returns:
        A dict ``{word: index}`` whose indices run from ``0`` to
        ``len(result) - 1``, assigned in the order words are first seen.
    """
    # dict.fromkeys drops repeated words while keeping first-seen order, so
    # duplicates cannot leave gaps in the index range. It also consumes any
    # iterable, so the input no longer needs to support len().
    unique_words = dict.fromkeys(words)
    return {word: index for index, word in enumerate(unique_words)}
def create_matrix(rows, cols):
    """Build a zero-filled integer NumPy array of shape ``(rows, cols)``."""
    dimensions = (rows, cols)
    return np.zeros(dimensions, dtype=int)
def main(filename):
    """Return the set of distinct words found in the file ``filename``.

    Each line obtained from ``read_data`` is tokenized with ``tokenize``, and
    the per-line token lists are merged into one set by ``get_words``.
    """
    tokenized = []
    for sentence in read_data(filename):
        tokenized.append(tokenize(sentence))
    return get_words(tokenized)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment