Skip to content

Instantly share code, notes, and snippets.

View aravindpai's full-sized avatar

Aravind Pai aravindpai

View GitHub Profile
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation, stratified on the labels,
# with a fixed seed so the split is reproducible.
x_tr, x_val, y_tr, y_val = train_test_split(
    features, labels, test_size=0.2, stratify=labels, random_state=0
)

# NOTE(review): the original line read `RandomForestClassifier(max_depth=3),y_tr)`,
# which is not valid Python. The dangling `,y_tr)` looks like the tail of a
# `fit(x_tr, y_tr)` call that was fused onto the constructor line -- confirm
# against the original snippet.
rfc = RandomForestClassifier(max_depth=3)
rfc.fit(x_tr, y_tr)

# Predict on the validation split (must run after the model is fitted).
y_pred = rfc.predict(x_val)
# Find the contours in `mask` first: they are needed by the drawing call below.
# OpenCV 3.x returns (image, contours, hierarchy) while 4.x returns
# (contours, hierarchy); taking the last two items works with both.
found = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
contours, _ = found[-2:]
# Keep `image` bound for any later code that reads it. When no image is
# returned (two-value form) fall back to the input mask --
# NOTE(review): confirm this fallback is acceptable for downstream use.
image = found[0] if len(found) == 3 else mask

# Draw every contour (index -1) on a copy so `gray` itself is left untouched.
img_copy = np.copy(gray)
cv2.drawContours(img_copy, contours, -1, (0, 255, 0), 3)
plt.imshow(img_copy, cmap='gray')
# Compute the pair statistics for the OOV token and keep only the keys
# (the pairs themselves); the associated counts are discarded.
# NOTE(review): `oov` is assigned further down in this file view, so this
# statement presumably belongs after that setup -- confirm the intended order.
pairs = get_stats(oov).keys()
# find the pairs available in the learned operations
# (the statement this comment introduced is not present in this view)
# applying BPE to OOV
oov = 'lowest'
# Tokenize the OOV word into space-separated characters and append the
# end-of-word marker. `str.join` iterates the string's characters directly,
# so no intermediate list is needed.
oov = " ".join(oov) + ' </w>'
# create a dictionary
# (the statement this comment introduced is not present in this view)
# Number of merge iterations to run over the corpus.
num_merges = 10
for i in range(num_merges):
    # compute frequency of bigrams in a corpus
    pairs = get_stats(corpus)
    # pick the pair with the highest count
    best = max(pairs, key=pairs.get)
    # merge the frequent pair in corpus
    corpus = merge_vocab(best, corpus)
    print("After Merging:", corpus)
    # convert a tuple to a string (join accepts the tuple directly)
    best = "".join(best)
    # NOTE(review): the original snippet is cut off here; the step that
    # records `best` after each merge is not visible in this view.
#append to merge list and vocabulary
merges = []