Pinak Chakraborty Pinak-Chakraborty

## role1.csv

          
            nodein
            nodeout
            role

            
              bank_of_america_corporation
              financial_service
              belong

            
              bank_of_america_corporation
              five_segment
              operate

            
              bank_of_america_corporation
              government
              product_offering

            
              bank_of_america_corporation
              banking
              product_offering

            
              bank_of_america_corporation
              middle-market_business
              product_offering

            
              bank_of_america_corporation
              service_for_individual_consumer
              product_offering

            
              bank_of_america_corporation
              "1874"
              founded

            
              bank_of_america_corporation
              north_carolina
              founded

            
              bank_of_america_corporation
              large_corporation
              product_offering

## node1
node
financial_service
"1874"
"16,000_atm"
"4,700_financial_center"
account
asset-based_lending
bank_of_america_corporation
banking
brokerage

## flight.csv

          
            flight 
            airline
            depart 
            arrive

            
              23
              Indigo
              DEL
              MUM

            
              24
              Indigo
              HYD
              MUM

            
              25
              spice
              MUM
              DEL

            
              26
              air india
              DEL
              CCU

            
              27
              air india
              CCU
              DEL

## airport.csv

          
            label
            city
            state

            
              ISX
              Silchar
              Assam

            
              CCU
              Kolkata
              West Bengal

            
              BOM
              Mumbai
              Maharastra

            
              DEL
              Delhi
              Delhi

            
              HYD
              Hyderabad
              Telangana

## Smth_Backoff
#-------------------------------------------------------------------------------
#
# This module determines the sentence probability for all sentences in the test
# data set. It uses the back-off method in determining the probability
#
#   prob(w1,w2,w3) is calculated as follows:
#
#   if trigram count(w1,w2,w3) not zero
#       prob (w1,w2,w3) = coff1*trigram count (w1,w2,w3)/bigram count (w1,w2)
#   else if bigram count (w1,w2) not zero

## Del_Inter
#-------------------------------------------------------------------------------
#
# This module determines the sentence probability for all sentences in the test
# data set. It uses deleted interpolation as: (for bigram)
#
#   prob(w1,w2,w3) = coff1*(trigram freq(w1,w2,w3)/bigram freq(w1,w2)) +
#                    coff2*(bigram freq(w1,w2)/unigram freq(w1)) +
#                    coff3*(unigram freq(w1)/total no of unigrams)
#
#-------------------------------------------------------------------------------

## K-means implementation
import re, random, numpy as np

def centers (X, K):
    # Initialize to K random centers
    oldmu = random.sample(list(X), K)
    mu = random.sample(list(X), K)
    while not has_converged(mu, oldmu):
        oldmu = mu
        # Assign all points in X to clusters
        clusters = cluster_points(X, mu)

## Tokenizer-2
import sys, os, os.path, glob, codecs

# Set the codecs
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.detach())

# number of highest-frequency unigrams and bigrams that will be written out
writemax = 100

def wordTokenizier(line):
    #delimiters = "[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+"

## gist:0c7e8e6f505d45f012e5
import sys, os, os.path, glob, codecs

# Set the codecs
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.detach())

# set the delimiters
delimiterSet = ";.,!?\"()':[]\n/+-—=≤≥{}><*’”“|"
digits = "0123456789"
chars = "abcdefghijklmnopqrstuvwxyz"
chars = "".join( (chars, chars.upper()) )
nodein	nodeout	role
bank_of_america_corporation	financial_service	belong
bank_of_america_corporation	five_segment	operate
bank_of_america_corporation	government	product_offering
bank_of_america_corporation	banking	product_offering
bank_of_america_corporation	middle-market_business	product_offering
bank_of_america_corporation	service_for_individual_consumer	product_offering
bank_of_america_corporation	"1874"	founded
bank_of_america_corporation	north_carolina	founded
bank_of_america_corporation	large_corporation	product_offering
	node
	financial_service
	"1874"
	"16,000_atm"
	"4,700_financial_center"
	account
	asset-based_lending
	bank_of_america_corporation
	banking
	brokerage
flight	airline	depart	arrive
23	Indigo	DEL	MUM
24	Indigo	HYD	MUM
25	spice	MUM	DEL
26	air india	DEL	CCU
27	air india	CCU	DEL
label	city	state
ISX	Silchar	Assam
CCU	Kolkata	West Bengal
BOM	Mumbai	Maharastra
DEL	Delhi	Delhi
HYD	Hyderabad	Telangana
	#-------------------------------------------------------------------------------
	#
	# This module determines the sentence probability for all sentences in the test
	# data set. It uses the back-off method in determining the probability
	#
	# prob(w1,w2,w3) is calculated as follows:
	#
	# if trigram count(w1,w2,w3) not zero
	# prob (w1,w2,w3) = coff1*trigram count (w1,w2,w3)/bigram count (w1,w2)
	# else if bigram count (w1,w2) not zero
	#-------------------------------------------------------------------------------
	#
	# This module determines the sentence probability for all sentences in the test
	# data set. It uses deleted interpolation as: (for bigram)
	#
	# prob(w1,w2,w3) = coff1*(trigram freq(w1,w2,w3)/bigram freq(w1,w2)) +
	# coff2*(bigram freq(w1,w2)/unigram freq(w1)) +
	# coff3*(unigram freq(w1)/total no of unigrams)
	#
	#-------------------------------------------------------------------------------
	import re, random, numpy as np

	def centers (X, K):
	# Initialize to K random centers
	oldmu = random.sample(list(X), K)
	mu = random.sample(list(X), K)
	while not has_converged(mu, oldmu):
	oldmu = mu
	# Assign all points in X to clusters
	clusters = cluster_points(X, mu)
	import sys, os, os.path, glob, codecs

	# Set the codecs
	sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.detach())

	# number of highest-frequency unigrams and bigrams that will be written out
	writemax = 100

	def wordTokenizier(line):
	#delimiters = "[A-Z]{2,}(?![a-z])\|[A-Z][a-z]+(?=[A-Z])\|[\'\w\-]+"