Skip to content

Instantly share code, notes, and snippets.

Avatar

Aravind Pai aravindpai

View GitHub Profile
View bpe_8.py
# Merge the most frequent pair everywhere it occurs in the corpus.
corpus = merge_vocab(best, corpus)
# The label and the value must be separate print() arguments; the original
# `"...": corpus` form is a syntax error.
print("Updated Corpus (After Merge operation):", corpus)
# Collapse the pair tuple into a single merged-symbol string.
best = "".join(best)
# Record the merged symbol in the list of merges.
merges = []
merges.append(best)
View bpe_7.py
# Select the pair whose frequency count is highest.
best = max(pairs, key=lambda pair: pairs[pair])
print("Most Frequent pair:", best)
View bpe_6.py
# Tally the frequency of every adjacent symbol pair (bigram) in the corpus
# and display the resulting pair -> count mapping.
pairs = get_stats(corpus)
print(pairs)
View bpe_5.py
#merges the most frequent pair in the corpus
#accepts the corpus and best pair
#returns the modified corpus
import re
def merge_vocab(pair, corpus_in):
corpus_out = {}
bigram = re.escape(' '.join(pair))
p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
for word in corpus_in:
View bpe_4.py
# Compute the frequency of each pair of adjacent characters or character
# sequences in the corpus.
def get_stats(corpus):
    """Count how often each adjacent symbol pair occurs in the corpus.

    Args:
        corpus: Mapping from a word, written as whitespace-separated
            symbols, to that word's frequency.

    Returns:
        A ``collections.defaultdict(int)`` keyed by ``(left, right)`` symbol
        tuples. Each occurrence of a pair inside a word adds that word's
        frequency to the pair's count.
    """
    pairs = collections.defaultdict(int)
    for word, freq in corpus.items():
        symbols = word.split()
        # Pair every symbol with its successor; a word with fewer than two
        # symbols contributes nothing.
        for left, right in zip(symbols, symbols[1:]):
            pairs[left, right] += freq
    return pairs
View bpe_3.py
import collections
# Count how many times each word occurs, keeping the result as a plain dict
# rather than a Counter object.
corpus = dict(collections.Counter(corpus))
print("Corpus:", corpus)
View bpe_2.py
# Initialize the vocabulary with every distinct character in the corpus.
vocab_chars = set(" ".join(corpus))
# The joining space is not a vocabulary symbol. discard() is used instead of
# list.remove() so a corpus with fewer than two tokens (no space in the
# joined string) does not raise ValueError.
vocab_chars.discard(' ')
vocab = list(vocab_chars)
# Split each word into space-separated characters and append the
# end-of-word marker </w>.
corpus = [" ".join(token) + ' </w>' for token in corpus]
View bpe_1.py
#importing library
import pandas as pd
#reading .txt file
text = pd.read_csv("sample.txt",header=None)
#converting a dataframe into a single list
corpus=[]
for row in text.values:
tokens = row[0].split(" ")
View 10_26.py
# NOTE(review): compute_distance, midpoints and num are defined outside this
# snippet; presumably this yields distances between the midpoints — confirm.
dist= compute_distance(midpoints,num)
View 10_26.py
# Unpack the four values stored in the first entry of `person` and print them.
# NOTE(review): presumably bounding-box corner coordinates — confirm.
x1,y1,x2,y2 = person[0]
print(x1,y1,x2,y2)