Ferry Djaja ferrygun

## table_prediction_df.py
# Build one "x1,y1,x2,y2" string per detected box and collect them.
interesting_areas = []

output = [[x1, y1, x2, y2]]
for x in output:
    # Map the box through bboxes_pdf; per the original note, the result is
    # (x1, y1) -> left-top and (x2, y2) -> right-bottom in PDF coordinate space.
    x1, y1, x2, y2 = bboxes_pdf(img, pdf_page, x)
    bbox_camelot = ",".join(str(coord) for coord in (x1, y1, x2, y2))
    interesting_areas.append(bbox_camelot)

## table_prediction.py
import numpy as np
import cv2
import matplotlib.pyplot as plt

# Read the image with read_image_bgr (used here instead of cv2.imread) and
# convert its channel order from BGR to RGB in one step.
image_path = imgfname
image = cv2.cvtColor(read_image_bgr(image_path), cv2.COLOR_BGR2RGB)

# Separate copy of the RGB image, bound to `output`.
output = image.copy()

## ner_11.py
# Predict tags for the test sequence at index `i` and print them next to
# the corresponding words.
i = 1586

# argmax over the last axis turns per-class scores into one tag index per token.
p = np.argmax(model.predict(np.array([X_test[i]])), axis=-1)

print("{:15} {:5}".format("Word", "Pred"))
for w, pred in zip(X_test[i], p[0]):
    print("{:15}: {}".format(words[w], tags[pred]))

## ner_10.py
import matplotlib.pyplot as plt

def plot_graphs(history, string):
    """Plot a recorded metric together with its ``val_`` counterpart.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value sequences (presumably the object returned by
            ``model.fit`` -- confirm against the caller).
        string: Metric name; both ``string`` and ``'val_' + string`` are
            looked up and drawn.
    """
    val_key = 'val_' + string
    for key in (string, val_key):
        plt.plot(history.history[key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()


## ner_09.py
# Fit the model on the training split; 20% of it is set aside for validation
# and the per-epoch results are kept in `history`.
history = model.fit(
    X_train,
    np.array(y_train),
    epochs=10,
    batch_size=32,
    verbose=1,
    validation_split=0.2,
)

## ner_08.py
# Sequence tagger: token ids -> 50-dim embeddings -> dropout -> bidirectional
# LSTM -> per-timestep softmax over the n_tags classes.
# NOTE(review): `input` shadows the Python builtin of the same name; the name
# is kept so that any code referring to it keeps working.
input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words, output_dim=50, input_length=max_len)(input)
model = Dropout(0.5)(model)
# return_sequences=True keeps one output vector per timestep for the tagger head.
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)  # softmax output layer

# `model` is rebound from the intermediate tensor to the final Model object.
model = Model(input, out)

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
# NOTE(review): `decay` is a legacy argument as well -- confirm that the
# installed Keras version still accepts it.
opt = tf.keras.optimizers.Adam(learning_rate=0.01, decay=1e-6)
model.compile(optimizer=opt, loss="categorical_crossentropy", metrics=["accuracy"])

## ner_07.py
from sklearn.model_selection import train_test_split

# Split features and labels, holding out 10% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
)

## ner_06.py
# One-hot encode each entry of y into n_tags classes
# (the original note records n_tags = 17).
y = [to_categorical(tag_ids, num_classes=n_tags) for tag_ids in y]

## ner_05.py
# Replace every token's tag (element 2 of each token entry) by its integer id.
# Presumably each token is a (word, pos, tag) tuple -- confirm against the loader.
y = [[tag2idx[token[2]] for token in sentence] for sentence in sentences]

## ner_04.py
# Fixed sequence length used for every sentence.
max_len = 50

# Pad at the end of each sequence with n_words - 1, which per the original
# note is the 'ENDPAD' entry (index 35178).
X = pad_sequences(
    sequences=X,
    maxlen=max_len,
    padding="post",
    value=n_words - 1,
)
	# Build one "x1,y1,x2,y2" string per detected box and collect them.
	interesting_areas = []

	output = [[x1, y1, x2, y2]]
	for x in output:
		# Loop body re-indented: it had been flattened to the level of the
		# `for` statement, which is not valid Python.
		# Per the original note, (x1, y1) -> left-top and (x2, y2) ->
		# right-bottom in PDF coordinate space.
		x1, y1, x2, y2 = bboxes_pdf(img, pdf_page, x)
		bbox_camelot = ",".join([str(x1), str(y1), str(x2), str(y2)])
		interesting_areas.append(bbox_camelot)
	import numpy as np
	import cv2
	import matplotlib.pyplot as plt

	# Read the image with read_image_bgr (used here instead of cv2.imread) and
	# convert its channel order from BGR to RGB in one step.
	image_path = imgfname
	image = cv2.cvtColor(read_image_bgr(image_path), cv2.COLOR_BGR2RGB)

	# Separate copy of the RGB image, bound to `output`.
	output = image.copy()
	# Predict tags for the test sequence at index `i` and print them next to
	# the corresponding words.
	i = 1586
	p = model.predict(np.array([X_test[i]]))

	# argmax over the last axis turns per-class scores into one tag index per token.
	p = np.argmax(p, axis=-1)

	print("{:15} {:5}".format("Word", "Pred"))
	for w, pred in zip(X_test[i], p[0]):
		# Body re-indented: it had been flattened to the level of the `for`.
		print("{:15}: {}".format(words[w], tags[pred]))
	import matplotlib.pyplot as plt

	def plot_graphs(history, string):
		"""Plot a recorded metric together with its ``val_`` counterpart.

		The function body is indented under the ``def`` again; it had been
		flattened to the same level, which is not valid Python.

		Args:
			history: Object whose ``history`` attribute maps metric names to
				per-epoch value sequences (presumably the object returned by
				``model.fit`` -- confirm against the caller).
			string: Metric name; both ``string`` and ``'val_' + string`` are
				looked up and drawn.
		"""
		plt.plot(history.history[string])
		plt.plot(history.history['val_'+string])
		plt.xlabel("Epochs")
		plt.ylabel(string)
		plt.legend([string, 'val_'+string])
		plt.show()
	# Fit the model on the training split; 20% of it is set aside for validation
	# and the per-epoch results are kept in `history`.
	history = model.fit(
		X_train,
		np.array(y_train),
		epochs=10,
		batch_size=32,
		verbose=1,
		validation_split=0.2,
	)
	# Sequence tagger: token ids -> 50-dim embeddings -> dropout -> bidirectional
	# LSTM -> per-timestep softmax over the n_tags classes.
	# NOTE(review): `input` shadows the Python builtin of the same name; the name
	# is kept so that any code referring to it keeps working.
	input = Input(shape=(max_len,))
	model = Embedding(input_dim=n_words, output_dim=50, input_length=max_len)(input)
	model = Dropout(0.5)(model)
	# return_sequences=True keeps one output vector per timestep for the tagger head.
	model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
	out = TimeDistributed(Dense(n_tags, activation="softmax"))(model) # softmax output layer

	# `model` is rebound from the intermediate tensor to the final Model object.
	model = Model(input, out)

	# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
	# NOTE(review): `decay` is a legacy argument as well -- confirm that the
	# installed Keras version still accepts it.
	opt = tf.keras.optimizers.Adam(learning_rate=0.01, decay=1e-6)
	model.compile(optimizer=opt, loss="categorical_crossentropy", metrics=["accuracy"])
	from sklearn.model_selection import train_test_split

	# Split features and labels, holding out 10% of the samples as the test set.
	X_train, X_test, y_train, y_test = train_test_split(
		X,
		y,
		test_size=0.1,
	)
	# One-hot encode each entry of y into n_tags classes
	# (the original note records n_tags = 17).
	y = [to_categorical(tag_ids, num_classes=n_tags) for tag_ids in y]
	# Fixed sequence length used for every sentence.
	max_len = 50

	# Pad at the end of each sequence with n_words - 1, which per the original
	# note is the 'ENDPAD' entry (index 35178).
	X = pad_sequences(
		sequences=X,
		maxlen=max_len,
		padding="post",
		value=n_words - 1,
	)