Skip to content

Instantly share code, notes, and snippets.

@MagnetonBora
Created January 3, 2019 15:58
Show Gist options
  • Save MagnetonBora/8051b3fc6fcbb75197d74f4a7c14f392 to your computer and use it in GitHub Desktop.
Save MagnetonBora/8051b3fc6fcbb75197d74f4a7c14f392 to your computer and use it in GitHub Desktop.
import re
import numpy as np
from collections import Counter
def read_data(filename):
    """Return the lines of the text file *filename* as a list.

    Each element keeps its trailing newline, exactly as the file object
    yields it.
    """
    with open(filename, 'r') as handle:
        # Iterating a text file yields one line at a time.
        return list(handle)
def tokenize(sentence):
    """Break *sentence* into lowercase words made only of ASCII letters.

    Every character outside ``A-Z``/``a-z`` acts as a separator; the empty
    pieces produced by adjacent separators are dropped.
    """
    words = []
    for piece in re.split('[^A-Za-z]', sentence):
        if piece:
            words.append(piece.lower())
    return words
def frequencies(tokens):
    """Count how many times each token occurs and return a ``Counter``."""
    counts = Counter()
    counts.update(tokens)
    return counts
def get_words(list_of_tokenized):
    """Collect the distinct tokens found across all token lists.

    *list_of_tokenized* is an iterable of token iterables; the result is a
    set containing every token that appears in any of them.
    """
    return {word for sentence_tokens in list_of_tokenized
            for word in sentence_tokens}
def get_index_map(words):
    """Map each word to a distinct integer index starting at 0.

    Indices follow the iteration order of *words*, so for an unordered
    collection such as a set the particular assignment is arbitrary.
    """
    total = len(words)
    return {word: position for word, position in zip(words, range(total))}
def create_matrix(rows, cols):
    """Return a ``rows`` x ``cols`` NumPy array of integer zeros."""
    dimensions = (rows, cols)
    return np.zeros(dimensions, dtype=int)
def main(filename):
    """Return the set of distinct tokens found in the file *filename*.

    The file is read line by line, each line is tokenized, and the tokens
    of all lines are merged into one set.
    """
    per_line_tokens = [tokenize(line) for line in read_data(filename)]
    return get_words(per_line_tokens)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment