Skip to content

Instantly share code, notes, and snippets.

View eisenjulian's full-sized avatar

Julian Eisenschlos eisenjulian

View GitHub Profile
import pdfminer.high_level
import datetime
import requests
import sys
import os
import re
import unidecode
import collections
def split(delimiters, string, maxsplit=0):
    """Split *string* on any of several literal delimiter substrings.

    The scraped line had no body (a SyntaxError); this restores the
    standard multi-delimiter split: escape each delimiter and join them
    into a single alternation pattern for ``re.split``.

    Args:
        delimiters: iterable of literal delimiter strings.
        string: text to split.
        maxsplit: maximum number of splits; 0 means no limit (as in
            ``re.split``).

    Returns:
        List of substrings of *string* between delimiters.
    """
    pattern = '|'.join(map(re.escape, delimiters))
    return re.split(pattern, string, maxsplit)
def text_to_index(sentence):
    """Convert a sentence into a numpy array of vocabulary indices.

    Punctuation is stripped (apostrophes are kept so contractions
    survive), the text is lowercased and whitespace-tokenized, and each
    token is mapped through the module-level ``word_index``. Index 1 is
    prepended and unknown tokens map to 2 — presumably the <START> and
    OOV ids of the vocabulary; TODO confirm against where ``word_index``
    is built.

    The scraped snippet had lost its indentation (SyntaxError); logic is
    unchanged.
    """
    # Remove punctuation characters except for the apostrophe
    translator = str.maketrans('', '', string.punctuation.replace("'", ''))
    tokens = sentence.translate(translator).lower().split()
    return np.array([1] + [word_index[t] if t in word_index else 2 for t in tokens])
# NOTE(review): this snippet is TRUNCATED by the scrape -- the
# `sequence.pad_sequences(` call below is never closed, and the rest of the
# function (presumably running `classifier` on the padded batch and printing
# the results) is missing. Restore from the original gist before use; also
# note the body has lost its indentation.
def print_predictions(sentences, classifier):
indexes = [text_to_index(sentence) for sentence in sentences]
x = sequence.pad_sequences(indexes,
maxlen=sentence_size,
def my_initializer(shape=None, dtype=tf.float32, partition_info=None):
    """Embedding initializer that returns the pretrained GloVe matrix.

    Matches the TF 1.x variable-initializer signature; ``shape`` and
    ``partition_info`` are accepted but ignored — the module-level
    ``embedding_matrix`` is returned verbatim, so the embedding variable
    is seeded with the pretrained vectors.

    The scraped snippet had lost its indentation (SyntaxError); logic is
    unchanged.
    """
    assert dtype is tf.float32
    return embedding_matrix
# Build and train a CNN classifier whose embedding layer is seeded with the
# pretrained GloVe matrix via `my_initializer` (passed through Estimator
# params), checkpointed under <model_dir>/cnn_pretrained.
# NOTE(review): `cnn_model_fn`, `model_dir` and `train_and_evaluate` are
# defined in other snippets of this scraped collection, not visible here.
params = {'embedding_initializer': my_initializer}
cnn_pretrained_classifier = tf.estimator.Estimator(
model_fn=cnn_model_fn,
model_dir=os.path.join(model_dir, 'cnn_pretrained'),
params=params)
train_and_evaluate(cnn_pretrained_classifier)
# Initialize the embedding matrix uniformly in [-1, 1), then overwrite the
# row of every in-vocabulary word that has a pretrained GloVe vector.
# NOTE(review): depends on `embeddings`, `word_index`, `vocab_size` and
# `embedding_size` defined in other snippets of this scrape (the snippets
# appear out of dependency order here).
# The scraped loop had lost its indentation (SyntaxError); logic unchanged.
embedding_matrix = np.random.uniform(-1, 1, size=(vocab_size, embedding_size))
for w, i in word_index.items():
    v = embeddings.get(w)
    if v is not None and i < vocab_size:
        embedding_matrix[i] = v
# Load pretrained GloVe vectors (6B-token, 50-dimensional release) into a
# dict mapping word -> float32 numpy vector. Each file line is
# "<word> <v1> ... <v50>", whitespace-separated.
# The scraped with/for bodies had lost their indentation (SyntaxError);
# logic unchanged.
embeddings = {}
with open('glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.strip().split()
        w = values[0]
        vectors = np.asarray(values[1:], dtype='float32')
        embeddings[w] = vectors
# Fragment of an LSTM model_fn: run a single 100-unit BasicLSTMCell over
# `inputs` (presumably the embedded token sequence -- confirm in the full
# model_fn), masked by per-example lengths from features['len'], then
# project the final hidden state (final_states.h) to one logit for binary
# classification.
# NOTE(review): `inputs` and `features` come from the enclosing model_fn,
# which is not part of this snippet.
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(100)
_, final_states = tf.nn.dynamic_rnn(
lstm_cell, inputs, sequence_length=features['len'], dtype=tf.float32)
logits = tf.layers.dense(inputs=final_states.h, units=1)
# Randomly-initialized-embedding variant of the CNN classifier,
# checkpointed under <model_dir>/cnn.
# Bug fix: the scraped line had an unbalanced extra closing parenthesis
# after tf.random_uniform(...), a SyntaxError.
# NOTE(review): `model_fn`, `model_dir`, `vocab_size`, `embedding_size`
# and `train_and_evaluate` are defined in other snippets of this scrape.
initializer = tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0)
params = {'embedding_initializer': initializer}
cnn_classifier = tf.estimator.Estimator(
    model_fn=model_fn,
    model_dir=os.path.join(model_dir, 'cnn'),
    params=params)
train_and_evaluate(cnn_classifier)
# Fragment of a model_fn tail: build a binary-classification head, an Adam
# optimizer, and a train-op callback that logs the loss scalar before
# minimizing.
# NOTE(review): this snippet is TRUNCATED and indentation-mangled by the
# scrape -- `_train_op_fn`'s body has lost its indent, and the final
# `head.create_estimator_spec(` call is never closed (its remaining keyword
# arguments, e.g. mode/logits/labels/train_op_fn, are missing). The bare
# `return` statements also only make sense inside the enclosing model_fn.
# Restore from the original gist before use.
head = tf.contrib.estimator.binary_classification_head()
optimizer = tf.train.AdamOptimizer()
def _train_op_fn(loss):
tf.summary.scalar('loss', loss)
return optimizer.minimize(
loss=loss,
global_step=tf.train.get_global_step())
return head.create_estimator_spec(
features=features,
# Fragment of the CNN model_fn: apply dropout (rate 0.2, active only in
# TRAIN mode) to the embedded input, then a 1-D convolution with 32 filters
# of kernel width 3, 'same' padding and ReLU activation.
# NOTE(review): `mode` and `input_layer` come from the enclosing model_fn;
# in this scrape `input_layer` is actually defined in a later snippet, so
# the snippets are out of dependency order.
training = (mode == tf.estimator.ModeKeys.TRAIN)
dropout_emb = tf.layers.dropout(inputs=input_layer,
rate=0.2,
training=training)
conv = tf.layers.conv1d(
inputs=dropout_emb,
filters=32,
kernel_size=3,
padding="same",
activation=tf.nn.relu)
# Embedding lookup for the integer token ids in features['x']: builds a
# (vocab_size x embedding_size) embedding variable using the initializer
# supplied through Estimator params -- either a random initializer or the
# pretrained-GloVe `my_initializer` -- and returns the embedded sequence.
# NOTE(review): `features`, `vocab_size`, `embedding_size` and `params`
# come from the enclosing model_fn, not visible in this snippet.
input_layer = tf.contrib.layers.embed_sequence(
features['x'],
vocab_size,
embedding_size,
initializer=params['embedding_initializer'])