Chengwei Zhang Tony607

## .py
# Load the business and review CSV exports into pandas DataFrames.
df_business = pd.read_csv('../dataset/business.csv')
df_review = pd.read_csv('../dataset/review.csv')
# Keep only businesses whose 'categories' text mentions 'Restaurants'.
# na=False counts a missing category as "no match"; without it the mask
# contains NaN for those rows and boolean indexing raises ValueError.
restaurants = df_business[
    df_business['categories'].str.contains('Restaurants', na=False)
]
# Keep only the 5-star reviews.
five_star = df_review[df_review['stars'] == 5]
# Merge on 'business_id' so that only 5-star reviews of restaurants remain.
# NOTE(review): `restaurants_clean` is not defined in the visible code;
# presumably it is a cleaned-up version of `restaurants` -- confirm.
combo = pd.merge(restaurants_clean, five_star, on='business_id')

## .py
# Strip every run of newline characters (regex match, replaced by the empty
# string) from the review data.
rnn_fivestar_reviews_only = rnn_fivestar_reviews_only.replace(
    regex={r'\n+': ''}
)
# Drop duplicated reviews.
final = rnn_fivestar_reviews_only.drop_duplicates()

## model.py
import keras
from keras import layers

# Two stacked 1024-unit LSTM layers followed by a 95-way softmax output
# (presumably 95 == number of distinct characters -- confirm).
model = keras.models.Sequential()
# The first layer declares the model input: 60 time steps of 95 features each.
# return_sequences=True makes it emit one vector per time step, which is what
# the next LSTM layer consumes.
model.add(layers.LSTM(1024, input_shape=(60, 95), return_sequences=True))
# Only the first layer of a Sequential model needs an input shape; this
# layer's input is the previous layer's output, so the argument is omitted.
model.add(layers.LSTM(1024))
model.add(layers.Dense(95, activation='softmax'))

## char_indices.py
# Sorted list of the distinct characters occurring in the corpus `text`.
chars = sorted(set(text))
print('Unique characters:', len(chars))
# Dictionary mapping each character to its position in `chars`.
# enumerate() yields the same indices as chars.index() (entries are unique)
# without rescanning the list for every character.
char_indices = {char: index for index, char in enumerate(chars)}

## getDataFromChunk.py
def getDataFromChunk(txtChunk, maxlen=60, step=1):
    """
    Turn a chunk of text into one-hot encoded training samples.

    A window of `maxlen` characters slides over `txtChunk` in increments of
    `step`; each window is one input sample and the character that follows
    it is the target.

    Arguments:
    txtChunk -- string to cut into samples
    maxlen -- number of characters in each input window
    step -- distance between the starts of consecutive windows

    Returns:
    (X, y) -- boolean arrays; X has shape (n, maxlen, len(chars)) with the
              one-hot encoded windows, y has shape (n, len(chars)) with the
              one-hot encoded next characters. n is 0 when the chunk is not
              longer than `maxlen`.

    Relies on the module-level `chars` and `char_indices`; a character that
    is missing from `char_indices` raises KeyError.
    """
    starts = range(0, len(txtChunk) - maxlen, step)
    sentences = [txtChunk[i : i + maxlen] for i in starts]
    next_chars = [txtChunk[i + maxlen] for i in starts]
    print('nb sequences:', len(sentences))
    print('Vectorization...')
    # Plain `bool` replaces the `np.bool` alias, which NumPy has removed.
    X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
    y = np.zeros((len(sentences), len(chars)), dtype=bool)
    for i, (sentence, next_char) in enumerate(zip(sentences, next_chars)):
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1
        y[i, char_indices[next_char]] = 1
    # The training loop unpacks two values from this call.
    return X, y

## callbacks_list.py
# Save the weights whenever the monitored training loss improves, so training
# can be left running, and halve the learning rate when the loss plateaus.
filepath = "Feb-22-all-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
reduce_lr = ReduceLROnPlateau(
    monitor='loss',
    factor=0.5,
    patience=1,
    min_lr=0.00001,
)
callbacks_list = [checkpoint, reduce_lr]

## train.py
# Iterations 1 through 19: each one re-reads the corpus file from the start
# and trains on it one 90000-character chunk at a time.
for iteration in range(1, 20):
    print('Iteration', iteration)
    with open("../dataset/short_reviews_shuffle.txt") as f:
        while True:
            chunk = f.read(90000)
            if chunk == "":
                # read() returns the empty string at end of file.
                break
            X, y = getDataFromChunk(chunk)
            model.fit(X, y, batch_size=128, epochs=1, callbacks=callbacks_list)

## sample.py
def sample(preds, temperature=1.0):
    '''
    Draw a random index from the distribution `preds`, reshaped by
    `temperature`.

    preds -- 1-D sequence of non-negative numbers (e.g. softmax output)
    temperature -- positive float; values below 1 sharpen the distribution,
                   values above 1 flatten it. If the temperature is very
                   small, the index with the highest pred value is always
                   picked.

    Returns the sampled index as an int.
    '''
    preds = np.asarray(preds).astype('float64')
    # log(0) is -inf, which becomes probability 0 again after exp(); silence
    # the divide-by-zero warning NumPy emits for it.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0: the result is unchanged after
    # normalisation, but exp() can no longer underflow to all zeros (0/0)
    # when the temperature is tiny.
    exp_preds = np.exp(logits - np.max(logits))
    probs = exp_preds / np.sum(exp_preds)
    return int(np.random.choice(len(probs), p=probs))

## generate.py
# Run 300 prediction steps, one per character to generate.
# NOTE(review): `next_index` is not consumed inside the visible loop body;
# presumably the code that appends the sampled character to `generated_text`
# follows outside this excerpt -- confirm.
for i in range(300):
    # One-hot encode the current text as a batch holding a single sample.
    one_hot_shape = (1, maxlen, len(chars))
    sampled = np.zeros(one_hot_shape)
    for t, char in enumerate(generated_text):
        column = char_indices[char]
        sampled[0, t, column] = 1.
    # Next-character probabilities for that single sample.
    batch_preds = model.predict(sampled, verbose=0)
    preds = batch_preds[0]
    # Sample from the probabilities instead of always taking the maximum.
    next_index = sample(preds, temperature)

## model.py
# NOTE(review): the tab-indented body below appears to be a copy of the
# standalone snippets earlier in this file (data loading, model definition,
# vectorization, callbacks, training loop, sampling, generation) with the
# nested indentation flattened to a single tab. As written it does not parse:
# the inner `def`/`for`/`with` headers are followed by lines at the same
# indentation, and the docstring is space-indented while the body uses tabs.
# It also ends with a second `def model(input_shape)` that has only a
# docstring. Nothing is returned, despite what the docstring says.
def model(input_shape):
    """
    Function creating the model's graph in Keras.

    Argument:
    input_shape -- shape of the model's input data (using Keras conventions)

    Returns:
    model -- Keras model instance
    """
	# Read the two CSV files into pandas dataframes
	df_business=pd.read_csv('../dataset/business.csv')
	df_review=pd.read_csv('../dataset/review.csv')
	# Filter 'Restaurants' businesses
	restaurants = df_business[df_business['categories'].str.contains('Restaurants')]
	# Filter 5-star reviews
	five_star=df_review[df_review['stars']==5]
	# Merge the reviews with restaurants by key 'business_id'
	# This keeps only 5-star restaurant reviews
	# NOTE(review): `restaurants_clean` is not defined in the visible code -- confirm
	combo=pd.merge(restaurants_clean, five_star, on='business_id')
	# Remove newline characters
	rnn_fivestar_reviews_only=rnn_fivestar_reviews_only.replace({r'\n+': ''}, regex=True)
	# Remove duplicated reviews
	final=rnn_fivestar_reviews_only.drop_duplicates()
	import keras
	from keras import layers

	model = keras.models.Sequential()
	model.add(layers.LSTM(1024, input_shape=(60, 95),return_sequences=True))
	model.add(layers.LSTM(1024, input_shape=(60, 95)))
	model.add(layers.Dense(95, activation='softmax'))
	# List of unique characters in the corpus
	chars = sorted(list(set(text)))
	print('Unique characters:', len(chars))
	# Dictionary mapping unique characters to their index in `chars`
	char_indices = dict((char, chars.index(char)) for char in chars)
	def getDataFromChunk(txtChunk, maxlen=60, step=1):
	sentences = []
	next_chars = []
	for i in range(0, len(txtChunk) - maxlen, step):
	sentences.append(txtChunk[i : i + maxlen])
	next_chars.append(txtChunk[i + maxlen])
	print('nb sequences:', len(sentences))
	print('Vectorization...')
	X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
	y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
	# Save the weights every time the loss improves so training can be left
	# running; also reduce the learning rate when the loss plateaus
	filepath="Feb-22-all-{epoch:02d}-{loss:.4f}.hdf5"
	checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
	reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5,
	patience=1, min_lr=0.00001)
	callbacks_list = [checkpoint, reduce_lr]
	for iteration in range(1, 20):
	print('Iteration', iteration)
	with open("../dataset/short_reviews_shuffle.txt") as f:
	for chunk in iter(lambda: f.read(90000), ""):
	X, y = getDataFromChunk(chunk)
	model.fit(X, y, batch_size=128, epochs=1, callbacks=callbacks_list)
	def sample(preds, temperature=1.0):
	'''
	Generate some randomness with the given preds
	which is a list of numbers, if the temperature
	is very small, it will always pick the index
	with highest pred value
	'''
	preds = np.asarray(preds).astype('float64')
	preds = np.log(preds) / temperature
	exp_preds = np.exp(preds)
	# We generate 300 characters
	for i in range(300):
	sampled = np.zeros((1, maxlen, len(chars)))
	# Turn each char to char index.
	for t, char in enumerate(generated_text):
	sampled[0, t, char_indices[char]] = 1.
	# Predict next char probabilities
	preds = model.predict(sampled, verbose=0)[0]
	# Add some randomness by sampling given probabilities.
	next_index = sample(preds, temperature)
	def model(input_shape):
	"""
	Function creating the model's graph in Keras.

	Argument:
	input_shape -- shape of the model's input data (using Keras conventions)

	Returns:
	model -- Keras model instance
	"""