This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Merge the most frequent pair everywhere it occurs in the corpus.
corpus = merge_vocab(best, corpus)
print("Updated Corpus (After Merge operation):", corpus)
# Collapse the pair (a tuple of symbols) into one merged string.
best = "".join(best)
# Record the merged symbol in the list of merges.
# NOTE(review): the original comment also mentioned updating the vocabulary,
# but no vocabulary update is visible in this snippet - confirm.
merges = []
merges.append(best)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Select the pair with the highest frequency count.
# max() raises ValueError when `pairs` is empty.
best = max(pairs, key=pairs.get)
print("Most Frequent pair:", best)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Count how often each pair of adjacent symbols (bigram) occurs in the corpus.
pairs = get_stats(corpus)
print(pairs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Merge the most frequent pair in the corpus.
import re


def merge_vocab(pair, corpus_in):
    """Return a new corpus in which every occurrence of ``pair`` is merged.

    Args:
        pair: Two symbols (strings) that should be fused into one symbol.
        corpus_in: Mapping of space-separated symbol strings to frequencies.

    Returns:
        A new dict with the same frequencies, where each adjacent occurrence
        of the two symbols in a key is replaced by their concatenation.
        ``corpus_in`` itself is not modified.
    """
    merged_symbol = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    # The lookarounds require whitespace (or the string edge) on both sides,
    # so only whole symbols match, never the tail or head of a longer symbol.
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    corpus_out = {}
    for word, freq in corpus_in.items():
        # A callable replacement keeps backslashes in the merged symbol
        # literal instead of being read as regex escape sequences.
        corpus_out[p.sub(lambda _match: merged_symbol, word)] = freq
    return corpus_out
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compute the frequency of each pair of adjacent characters or character sequences.
def get_stats(corpus):
    """Count adjacent symbol pairs across the corpus, weighted by frequency.

    Args:
        corpus: Mapping of whitespace-separated symbol strings to frequencies.

    Returns:
        A ``collections.defaultdict(int)`` keyed by ``(left, right)`` symbol
        tuples; each value is the sum of the frequencies of every key in
        which that pair occurs, counted once per occurrence.
    """
    pairs = collections.defaultdict(int)
    for word, freq in corpus.items():
        symbols = word.split()
        # Pair each symbol with its successor; keys with fewer than two
        # symbols contribute nothing.
        for left, right in zip(symbols, symbols[1:]):
            pairs[left, right] += freq
    return pairs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections

# Count how many times each word occurs in the corpus.
corpus = collections.Counter(corpus)
# Convert the Counter object into a plain dictionary.
corpus = dict(corpus)
print("Corpus:", corpus)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Initialize the vocabulary with every distinct character in the corpus.
# The set difference drops the joining space and, unlike list.remove(' '),
# does not raise when the corpus holds fewer than two tokens.
vocab = list(set(" ".join(corpus)) - {" "})
# Split each word into space-separated characters.
corpus = [" ".join(token) for token in corpus]
# Append the end-of-word marker </w> to every word.
corpus = [token + ' </w>' for token in corpus]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import the library.
import pandas as pd

# Read the .txt file, one row per line, with no header row.
# NOTE(review): read_csv splits on commas, so only the text before the first
# comma of each line lands in column 0 - confirm sample.txt contains none.
text = pd.read_csv("sample.txt", header=None)
# Flatten the dataframe into a single list of space-separated tokens.
corpus = []
for row in text.values:
    tokens = row[0].split(" ")
    # Without this step the tokens are discarded and corpus stays empty.
    corpus.extend(tokens)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Store the result of compute_distance for the given midpoints and num.
# NOTE(review): compute_distance is defined outside this snippet; presumably
# it returns distances between the midpoints - confirm.
dist = compute_distance(midpoints, num)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Unpack the first element of `person` into four values and print them.
# NOTE(review): presumably the corner coordinates of a bounding box - confirm.
x1, y1, x2, y2 = person[0]
print(x1, y1, x2, y2)