# giuseppebonaccorso/math_expression_learning.py

Created May 20, 2017
Mathematical expression learning experiment using a Seq2Seq approach
 Number of samples = 100000 Symbols = Integer relative numbers bounded between (0, 100), Arithmetic operations, Brackets, Empty space (for padding) Keras backend = Theano 0.9.0 Training hardware = Core i7, GeForce 960, 32 GB Ram Training time = 5.4 hours / 5 epochs Test results (as expected there are many errors due to the size of the training dataset): -50/-68 = 0 (-96*85) = -7820 -(-17--82) = -63 -16*5 = -74 48*-60 = -2840 (66+-19) = 43 69+41 = 116 (-16-26) = -44 17/-11 = -2 -20-11 = -33 5+60 = 63 -(-81+62) = 29 (-60/-89) = 0 (45+21) = 62 -(91-39) = -44 -68*-12 = 778 (-92+-7) = -97 -(35*-91) = 3175 -(13+-89) = 70 -(-5/-38) = 0 -(83+54) = -145 -61-44 = -107 65*-82 = -5470 -(-99/-64) = -2 (-88--78) = -1 -12*-94 = 904 -22*5 = -120 -91*-69 = 6227 -40/90 = -1 68/-83 = -1 -40+-89 = -137 -62--14 = -44 -87--72 = -17 (82*-35) = -3870 (-71*65) = -4085 (-51-66) = -117 -(18/-79) = 1 (-23*46) = -1162 -6*98 = -578 -(-32/-5) = -7 -18*-4 = 72 98/19 = 5 -5-68 = -61 -(99--13) = 110 (99--6) = 117 -(65/-91) = 1 -29/99 = -1 -13/-64 = 0 39/-1 = 47 -(-11*-13) = -17 (-51-26) = -77 -89-15 = -104 -(-81+9) = 62 -44/-46 = 1 (-51+96) = 43 88--42 = 138 -82*-43 = 3774 -85+56 = -23 (2/-16) = -1 -88+28 = -64 -(42+72) = -118 (7+-48) = -43 (78--41) = 127 24--35 = 53 (-4--99) = 97 (-32/-89) = 0 -62-84 = -148 78+90 = 174 (-76-83) = -157 (-80--45) = -27 -95/-21 = 4 9*56 = 484 (-25*70) = -1450 (-36-78) = -116 -84+-32 = -114 (-69-70) = -139 -(58*-53) = 3474 (4/97) = 0 -27*-70 = 1870 -(-8--12) = -12 (-34--20) = -18 -48+-75 = -123 -(-40*42) = 2840 -81/56 = -2 -97/24 = -4 (64/-90) = -1 -7*-99 = 627 -46*47 = -2208 -22*80 = -1440 26+46 = 68 (8+50) = 53 (12+-85) = -77 (78+-47) = 29 -(-80*-75) = -5600 -(-38+-29) = 63 (-25*-45) = 1155 (88+60) = 144 37+-71 = -34 83+-36 = 43 (-14*-84) = 114
 ''' Mathematical expression learning experiment Giuseppe Bonaccorso (https://www.bonaccorso.eu) Based on: http://machinelearningmastery.com/learn-add-numbers-seq2seq-recurrent-neural-networks/ ''' from __future__ import print_function from keras.models import Sequential from keras.layers import Dense, TimeDistributed, RepeatVector from keras.layers.recurrent import LSTM from sklearn.preprocessing import LabelBinarizer import keras.backend as K import numpy as np # Set random seed (for reproducibility) np.random.seed(1000) # Mathematical symbols symbols = [' ', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '-', '/', '*', '(', ')'] operation_offset = 11 minus_symbol = 12 open_bracket = 15 closed_bracket = 16 # Number of training samples nb_samples = 100000 # Sequence(s) lenght input_sequence_length = 340 output_sequence_length = 340 # Binarize symbols label_binarizer = LabelBinarizer() label_binarizer.fit(symbols) # Symbol length symbol_lenght = len(label_binarizer.transform([symbols[0]])[0]) # Empty symbol empty_symbol = label_binarizer.transform([symbols[0]])[0] # Time steps time_steps = int(input_sequence_length / symbol_lenght) def expression_to_symbols(value): s = [] for digit in str(value): s.append(digit) return label_binarizer.transform(np.array(s)).flatten() def symbols_to_expression(expression): syms = '' for row in expression: syms += label_binarizer.inverse_transform(to_binary(row).reshape((1, symbol_lenght)))[0] return syms.strip() def operation(op_type, a, b): ops = { 0: a + b, 1: a - b, 2: int(a / b), 3: a * b } return ops.get(op_type) def generate_random_expression(): # First term a = np.random.randint(-100, 100) # Second term (avoid zero for divisions) b = np.random.randint(1, 100) if binary_decision(): b = -b # Operator op = np.random.randint(0, 4) result = operation(op, a, b) full_expression = (expression_to_symbols(a), expression_to_symbols(symbols[op + operation_offset]), expression_to_symbols(b)) if binary_decision(): # Insert brackets 
open_bracket_expression = (expression_to_symbols(symbols[open_bracket]),) if binary_decision(): # Insert a minus in front of the exception open_bracket_expression = (expression_to_symbols(symbols[minus_symbol]),) + open_bracket_expression result *= -1 full_expression = open_bracket_expression + full_expression full_expression += (expression_to_symbols(symbols[closed_bracket]),) x = pad(np.concatenate(full_expression), input_sequence_length).reshape(time_steps, symbol_lenght) r = pad(expression_to_symbols(result), output_sequence_length).reshape(time_steps, symbol_lenght) return x, r, result def create_dataset(n_samples=5000): print('Creating dataset with %d samples' % nb_samples) X = [] Y = [] for _ in range(n_samples): x, r, _ = generate_random_expression() X.append(x.astype(K.floatx())) Y.append(r.astype(K.floatx())) return np.array(X).astype(K.floatx()), np.array(Y).astype(K.floatx()) def binary_decision(): return True if np.random.uniform(0, 1) < 0.5 else False def pad(x, sequence_length): if len(x) < sequence_length: n = int((sequence_length - len(x)) / len(empty_symbol)) for _ in range(n): x = np.concatenate((x, empty_symbol)) return x def to_binary(x): v = np.argmax(x) z = np.zeros(shape=symbol_lenght) z[v] = 1.0 return z def make_expression(string_expression): s = [] for digit in string_expression.strip(): s.append(digit) return pad(label_binarizer.transform(np.array(s)).flatten(), input_sequence_length).\ reshape(1, time_steps, symbol_lenght) def create_model(): model = Sequential() model.add(LSTM(250, input_shape=(time_steps, symbol_lenght))) model.add(RepeatVector(time_steps)) model.add(LSTM(100, return_sequences=True)) model.add(TimeDistributed(Dense(symbol_lenght, activation='softmax'))) # Compile model model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy']) return model if __name__ == '__main__': print('Expression learning experiment') print('Symbol table:') for symbol in symbols: print(symbol + ' -> ' + 
str(label_binarizer.transform([symbol]))) # Create dataset print('Training model...') X, Y = create_dataset(n_samples=nb_samples) # Create model model = create_model() # Train model model.fit(X, Y, batch_size=1, epochs=5) # Test print('Test:') X_test, Y_test = create_dataset(n_samples=100) Y_pred = model.predict(X_test) for i, y in enumerate(Y_pred): print('%s = %s' % (symbols_to_expression(X_test[i]), symbols_to_expression(y)))

### michael20at commented Jan 16, 2018

 Hi, I trained and modified (wrote an input method for) your LSTM script, and after training for two days I got the accuracy up to 98%, which seemed fine. On testing I noticed that it works great with inputs in the range it was trained on (-99 to 99), with nearly no errors, but it resulted in totally wrong output for larger numbers (like 100*2)! Any idea why? Is this inherent in the LSTM setup, i.e. is it only interpolating and not learning the structure? I'll try to train it on bigger input numbers now; any idea if it is even possible to generalize? The goal should be for it to work for arbitrarily large numbers, so that it has really learned the rules of calculating, right? Thank you!
