This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read the two CSV files into pandas DataFrames.
df_business = pd.read_csv('../dataset/business.csv')
df_review = pd.read_csv('../dataset/review.csv')

# Keep only businesses whose 'categories' text mentions 'Restaurants'.
# na=False makes rows with a missing 'categories' value count as non-matching;
# without it the mask contains NaN and boolean indexing raises ValueError.
is_restaurant = df_business['categories'].str.contains('Restaurants', na=False)
restaurants = df_business[is_restaurant]

# Keep only the 5-star reviews.
five_star = df_review[df_review['stars'] == 5]

# Join the reviews with the restaurants on 'business_id' so that only
# 5-star reviews of restaurants remain.
# NOTE(review): `restaurants_clean` is not defined in the visible code (only
# `restaurants` is) -- presumably a cleaning step elsewhere produces it; confirm.
combo = pd.merge(restaurants_clean, five_star, on='business_id')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Strip every run of newline characters (regex `\n+`) from the review data.
rnn_fivestar_reviews_only = rnn_fivestar_reviews_only.replace(
    to_replace={r'\n+': ''},
    regex=True,
)

# Drop duplicated reviews; the de-duplicated result is kept in `final`.
final = rnn_fivestar_reviews_only.drop_duplicates()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import keras | |
from keras import layers | |
# Two stacked 1024-unit LSTM layers followed by a 95-way softmax layer.
# Input shape (60, 95): presumably 60 timesteps (the window length) by 95
# one-hot character classes -- TODO confirm against the vectorization code.
model = keras.models.Sequential([
    # return_sequences=True so the next LSTM receives the whole sequence.
    layers.LSTM(1024, input_shape=(60, 95), return_sequences=True),
    # NOTE(review): input_shape on a non-first layer looks redundant -- confirm.
    layers.LSTM(1024, input_shape=(60, 95)),
    layers.Dense(95, activation='softmax'),
])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sorted list of the unique characters in the corpus.
chars = sorted(set(text))
print('Unique characters:', len(chars))

# Dictionary mapping each unique character to its index in `chars`.
# enumerate() gives the position directly; calling chars.index() for every
# character rescans the list each time (quadratic in the alphabet size).
char_indices = {char: index for index, char in enumerate(chars)}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def getDataFromChunk(txtChunk, maxlen=60, step=1): | |
sentences = [] | |
next_chars = [] | |
for i in range(0, len(txtChunk) - maxlen, step): | |
sentences.append(txtChunk[i : i + maxlen]) | |
next_chars.append(txtChunk[i + maxlen]) | |
print('nb sequences:', len(sentences)) | |
print('Vectorization...') | |
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool) | |
y = np.zeros((len(sentences), len(chars)), dtype=np.bool) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Callbacks that let training run unattended:
#   * write a checkpoint file each time the training loss reaches a new minimum
#     (epoch number and loss value are embedded in the file name);
#   * multiply the learning rate by 0.5 when the loss has not improved for one
#     epoch, never going below 0.00001.
filepath = "Feb-22-all-{epoch:02d}-{loss:.4f}.hdf5"

checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

reduce_lr = ReduceLROnPlateau(
    monitor='loss',
    factor=0.5,
    patience=1,
    min_lr=0.00001,
)

callbacks_list = [checkpoint, reduce_lr]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Make 19 passes (iterations 1..19) over the review corpus, training on one
# chunk of text at a time.
for iteration in range(1, 20):
    print('Iteration', iteration)
    with open("../dataset/short_reviews_shuffle.txt") as f:
        # Read the file in 90,000-character chunks; f.read() returns "" at end
        # of file, which is the sentinel that stops the iterator.
        for chunk in iter(lambda: f.read(90000), ""):
            X, y = getDataFromChunk(chunk)
            if len(X) == 0:
                # getDataFromChunk builds no sequences when the chunk is not
                # longer than its window (maxlen, default 60), which can happen
                # for the last chunk of the file; there is nothing to fit on.
                continue
            model.fit(X, y, batch_size=128, epochs=1, callbacks=callbacks_list)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def sample(preds, temperature=1.0): | |
''' | |
Generate some randomness with the given preds | |
which is a list of numbers, if the temperature | |
is very small, it will always pick the index | |
with highest pred value | |
''' | |
preds = np.asarray(preds).astype('float64') | |
preds = np.log(preds) / temperature | |
exp_preds = np.exp(preds) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# We generate 300 characters | |
for i in range(300): | |
sampled = np.zeros((1, maxlen, len(chars))) | |
# Turn each char to char index. | |
for t, char in enumerate(generated_text): | |
sampled[0, t, char_indices[char]] = 1. | |
# Predict next char probabilities | |
preds = model.predict(sampled, verbose=0)[0] | |
# Add some randomness by sampling given probabilities. | |
next_index = sample(preds, temperature) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def model(input_shape): | |
""" | |
Function creating the model's graph in Keras. | |
Argument: | |
input_shape -- shape of the model's input data (using Keras conventions) | |
Returns: | |
model -- Keras model instance | |
""" |
OlderNewer