Created
February 10, 2019 07:18
-
-
Save shantanuo/4e35baba144ee658e4dd4d1f87e19f3a to your computer and use it in GitHub Desktop.
tensorflow code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Source article: https://towardsdatascience.com/transfer-learning-using-elmo-embedding-c4a7e415103c
import re

import keras
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, Lambda
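# One-time setup (commented-out notebook shell commands): unpack the ELMo v2
# module into module/module_elmo2 and unzip the sentiment dataset.
#!mkdir module/
#!mkdir module/module_elmo2
#!curl -L "https://tfhub.dev/google/elmo/2?tf-hub-format=compressed" | tar -zxvC module/module_elmo2
#!unzip first-gop-debate-twitter-sentiment.zip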
# Sentence-level demo: pass raw strings through the module's "default"
# signature and keep the "elmo" entry of the returned output dict.
elmo = hub.Module("module/module_elmo2/", trainable=False)
embeddings = elmo(
    ["the cat is on the mat", "what are you doing in evening"],
    as_dict=True,
    signature="default",
)["elmo"]

# Graph-mode TensorFlow: run the variable and table initialisers in the
# session before evaluating the embedding tensor.
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    session.run(tf.tables_initializer())
    message_embeddings = session.run(embeddings)
# Token-level demo: pass pre-tokenised sentences through the "tokens"
# signature, which takes the token matrix together with each row's length.
elmo = hub.Module("module/module_elmo2/", trainable=False)
tokens_input = [
    ["the", "cat", "is", "on", "the", "mat"],
    ["what", "are", "you", "doing", "in", "evening"],
]
# Derive the lengths from the data instead of hard-coding them. The previous
# value [6, 5] under-counted the second sentence (it has six tokens), so the
# module was told that sentence ends before "evening".
tokens_length = list(map(len, tokens_input))
embeddings = elmo(
    inputs={"tokens": tokens_input, "sequence_len": tokens_length},
    signature="tokens",
    as_dict=True,
)["elmo"]

# Run the variable and table initialisers, then evaluate the embeddings.
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    message_embeddings = session.run(embeddings)
def embed_elmo2(module):
    """Build a reusable embedding callable backed by a TF-Hub module.

    A dedicated graph is created that holds a string placeholder and the
    module applied to it, and a ``tf.train.MonitoredSession`` is opened while
    that graph is the default. The session is not closed here; the returned
    callable keeps using it.

    Args:
        module: Path or handle passed straight to ``hub.Module``.

    Returns:
        A one-argument function that feeds its argument to the placeholder
        and returns the evaluated module output.
    """
    graph = tf.Graph()
    with graph.as_default():
        text_in = tf.placeholder(tf.string)
        hub_module = hub.Module(module)
        text_out = hub_module(text_in)
        session = tf.train.MonitoredSession()

    def run_embedding(x):
        # Evaluate the module output for the supplied batch of strings.
        return session.run(text_out, {text_in: x})

    return run_embedding
# Smoke-test the helper on a single sentence. The shape is printed because a
# bare expression only displays its value in a notebook/REPL; when this file
# runs as a script the result would otherwise be silently discarded.
embed_fn = embed_elmo2("module/module_elmo2")
print(embed_fn(["i am sambit"]).shape)
# Load Sentiment.csv, drop the rows labelled "Neutral", and recode the
# "Negative" / "Positive" labels in the sentiment column as 0 / 1.
df = pd.read_csv("Sentiment.csv", encoding="latin")
df = df.loc[df["sentiment"] != "Neutral"]
df.loc[df["sentiment"] == "Positive", "sentiment"] = 1
df.loc[df["sentiment"] == "Negative", "sentiment"] = 0
def cleanText(text):
    """Normalise one raw text string before it is embedded.

    Leading and trailing whitespace is trimmed, every newline and carriage
    return is replaced by a space, the punctuation characters listed in the
    pattern below are deleted, and the result is lower-cased.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    flattened = text.strip()
    for line_break in ("\n", "\r"):
        flattened = flattened.replace(line_break, " ")
    # NOTE: contraction expansion, link replacement and number removal were
    # left disabled in the original and are intentionally not applied here.
    depunctuated = re.sub(r'[,!@#$%^&*)(|/><";:.?\'\\}{]', "", flattened)
    return depunctuated.lower()
# Model inputs: the sentiment column as y, the cleaned text column as X.
y = np.array(df["sentiment"])
X = np.array(df["text"].apply(cleanText))
# Module instance shared by every call to ELMoEmbedding.
embed = hub.Module("module/module_elmo2")


def ELMoEmbedding(x):
    """Embed a batch of sentences with ELMo for use inside a Keras Lambda.

    Args:
        x: Tensor of sentences; build_model feeds it from an Input declared
            with shape (1,) and dtype "string", i.e. one sentence per row.

    Returns:
        The "default" entry of the module's default-signature output
        (build_model declares it as 1024 values per sentence).
    """
    # Squeeze only the trailing singleton axis. A bare tf.squeeze drops every
    # size-1 dimension, so a batch holding a single sentence would collapse
    # to a scalar instead of staying a 1-D tensor of strings.
    sentences = tf.squeeze(tf.cast(x, tf.string), axis=1)
    return embed(sentences, signature="default", as_dict=True)["default"]
def build_model():
    """Build and compile the ELMo-based binary classifier.

    Architecture: string input of shape (1,) -> ELMoEmbedding wrapped in a
    Lambda layer (declared output size 1024) -> Dense(256, relu, L2 0.001)
    -> Dense(1, sigmoid). Compiled with binary cross-entropy loss, the
    RMSprop optimizer and an accuracy metric.

    Returns:
        The compiled model.
    """
    # Input/Dense/Lambda are imported from TensorFlow's bundled Keras, so the
    # regularizer and the Model class are taken from tf.keras as well. Mixing
    # standalone `keras` objects with tf.keras layers in one graph is not a
    # supported combination and can fail when the model is built or trained.
    input_text = Input(shape=(1,), dtype="string")
    embedding = Lambda(ELMoEmbedding, output_shape=(1024,))(input_text)
    dense = Dense(
        256,
        activation="relu",
        kernel_regularizer=tf.keras.regularizers.l2(0.001),
    )(embedding)
    pred = Dense(1, activation="sigmoid")(dense)

    model = tf.keras.models.Model(inputs=[input_text], outputs=pred)
    model.compile(
        loss="binary_crossentropy", optimizer="rmsprop", metrics=["accuracy"]
    )
    return model


model_elmo = build_model()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment