# sunilmallya/cosinescore

## cosinescore
import math
import stemmer

def irange(sequence):
    """Return a list of ``(index, item)`` pairs for ``sequence``.

    Indices start at 0. The pairs are built with ``enumerate``, so any
    iterable is accepted (not only objects that support ``len()``), and the
    result is always a concrete list that can be iterated more than once.
    """
    return list(enumerate(sequence))

class CosineScore(object):
    """Score a fixed list of documents against a query using tf-idf weights.

    Every document is split on whitespace and each word is passed through
    ``stemmer.PorterStemmer``; the stems are counted per document in an
    in-memory posting list. ``compute`` then returns one score per document
    for a given query.
    """

    def __init__(self, all_docs):
        """Index ``all_docs``; a document's position in the list is its doc id."""
        self.documents = all_docs  # list of all docs [doc1, doc2, ...]
        self.ndocs = len(all_docs)
        # term => {doc_id => frequency}; term positions are not recorded
        self.posting_list = {}
        self.pstemmer = stemmer.PorterStemmer()

        self._term_indexer()

    def _term_indexer(self):
        """Fill ``self.posting_list`` with per-document counts of each stemmed word."""
        for doc_id, document in enumerate(self.documents):
            # split() without an argument skips the empty tokens that
            # split(' ') produces for leading, trailing or repeated spaces.
            for word in document.split():
                s_word = self.pstemmer.stem(word)
                doc_id_mapping = self.posting_list.setdefault(s_word, {})
                doc_id_mapping[doc_id] = doc_id_mapping.get(doc_id, 0) + 1

    def _term_frequency(self, term):
        """Return the ``{doc_id: frequency}`` dict for ``term``, or -1 if it is not indexed."""
        return self.posting_list.get(term, -1)

    def _listToString(self, arg):
        """Return the query ``arg`` as a list of words.

        Despite its name this turns a string into a list: a string is split
        on whitespace, and any other iterable is copied into a list and
        assumed to already hold one word per item.
        """
        try:
            string_types = basestring  # Python 2: covers str and unicode
        except NameError:
            string_types = str  # Python 3 has no basestring
        if isinstance(arg, string_types):
            return arg.split()
        return list(arg)

    def __qTermFrequency(self, term, bWords):
        """Return how many entries of ``bWords`` are equal to ``term``."""
        return sum(1 for word in bWords if word == term)

    def _docListWeights(self):
        """Return the Euclidean length of each document's term-frequency vector.

        The result is a list indexed by doc id; entry ``d`` is the square
        root of the sum of ``tf * tf`` over every indexed term that occurs in
        document ``d``. Documents without any indexed term keep 0.0.
        """
        squared_sums = [0.0] * self.ndocs
        for doc_freqs in self.posting_list.values():
            for doc_id, tf in doc_freqs.items():
                squared_sums[doc_id] += tf * tf
        # The square root is taken exactly once, after all terms have been
        # accumulated; rooting inside the per-term loop would keep adding
        # squares onto already-rooted partial sums.
        return [math.sqrt(total) for total in squared_sums]

    def compute(self, query, mIDF=0):
        '''
        Return a list with one score per document (indexed by doc id).

        query - string of whitespace-separated words, or an iterable of words
        mIDF  - when non-zero, used instead of each term's document
                frequency in the idf ratio
        dft   - document frequency of a term (number of docs containing it)
        idf   - inverse document frequency, log10(ndocs / dft)
        wTQ   - weight of a query term (its idf)
        wFTD  - weight of a term in a document (tf / document vector length)

        Query words are stemmed before lookup. Words that are not in the
        posting list contribute nothing, so a document sharing no term with
        the query scores 0.0.
        '''
        scores = [0.0] * self.ndocs
        bWords = self._listToString(query)
        normalizationFactor = self._docListWeights()

        for qterm in bWords:
            term = self.pstemmer.stem(qterm)
            doc_freqs = self.posting_list.get(term)
            if doc_freqs is None:
                # Query term does not occur in any document.
                continue

            dft = mIDF if mIDF != 0 else len(doc_freqs)
            wTQ = math.log10(float(self.ndocs) / float(dft))  # idf

            # cosinescore algorithm: accumulate wTQ * wFTD per document
            for doc_id, tf in doc_freqs.items():
                norm = normalizationFactor[doc_id]
                wFTD = tf / float(norm) if norm != 0 else 0.0
                scores[doc_id] += wTQ * wFTD
        return scores

if __name__ == "__main__":
    # Demo run: index three toy documents and print one score per document
    # for a two-word query.
    docs = ["mallya", "mallya mallya in hawaii", "sunil"]
    q = "hawaii mallya"
    cs = CosineScore(docs)
    # The parenthesised form prints the same list on Python 2 and is also
    # valid syntax on Python 3.
    print(cs.compute(q))
	import math
	import stemmer

	def irange(sequence):
	    """Return a list of ``(index, item)`` pairs for ``sequence``.

	    Indices start at 0. The pairs are built with ``enumerate``, so any
	    iterable is accepted (not only objects that support ``len()``), and the
	    result is always a concrete list that can be iterated more than once.
	    """
	    return list(enumerate(sequence))

	class CosineScore(object):
	    """Score a fixed list of documents against a query using tf-idf weights.

	    Every document is split on whitespace and each word is passed through
	    ``stemmer.PorterStemmer``; the stems are counted per document in an
	    in-memory posting list. ``compute`` then returns one score per document
	    for a given query.
	    """

	    def __init__(self, all_docs):
	        """Index ``all_docs``; a document's position in the list is its doc id."""
	        self.documents = all_docs  # list of all docs [doc1, doc2, ...]
	        self.ndocs = len(all_docs)
	        # term => {doc_id => frequency}; term positions are not recorded
	        self.posting_list = {}
	        self.pstemmer = stemmer.PorterStemmer()

	        self._term_indexer()

	    def _term_indexer(self):
	        """Fill ``self.posting_list`` with per-document counts of each stemmed word."""
	        for doc_id, document in enumerate(self.documents):
	            # split() without an argument skips the empty tokens that
	            # split(' ') produces for leading, trailing or repeated spaces.
	            for word in document.split():
	                s_word = self.pstemmer.stem(word)
	                doc_id_mapping = self.posting_list.setdefault(s_word, {})
	                doc_id_mapping[doc_id] = doc_id_mapping.get(doc_id, 0) + 1

	    def _term_frequency(self, term):
	        """Return the ``{doc_id: frequency}`` dict for ``term``, or -1 if it is not indexed."""
	        return self.posting_list.get(term, -1)

	    def _listToString(self, arg):
	        """Return the query ``arg`` as a list of words.

	        Despite its name this turns a string into a list: a string is split
	        on whitespace, and any other iterable is copied into a list and
	        assumed to already hold one word per item.
	        """
	        try:
	            string_types = basestring  # Python 2: covers str and unicode
	        except NameError:
	            string_types = str  # Python 3 has no basestring
	        if isinstance(arg, string_types):
	            return arg.split()
	        return list(arg)

	    def __qTermFrequency(self, term, bWords):
	        """Return how many entries of ``bWords`` are equal to ``term``."""
	        return sum(1 for word in bWords if word == term)

	    def _docListWeights(self):
	        """Return the Euclidean length of each document's term-frequency vector.

	        The result is a list indexed by doc id; entry ``d`` is the square
	        root of the sum of ``tf * tf`` over every indexed term that occurs in
	        document ``d``. Documents without any indexed term keep 0.0.
	        """
	        squared_sums = [0.0] * self.ndocs
	        for doc_freqs in self.posting_list.values():
	            for doc_id, tf in doc_freqs.items():
	                squared_sums[doc_id] += tf * tf
	        # The square root is taken exactly once, after all terms have been
	        # accumulated; rooting inside the per-term loop would keep adding
	        # squares onto already-rooted partial sums.
	        return [math.sqrt(total) for total in squared_sums]

	    def compute(self, query, mIDF=0):
	        '''
	        Return a list with one score per document (indexed by doc id).

	        query - string of whitespace-separated words, or an iterable of words
	        mIDF  - when non-zero, used instead of each term's document
	                frequency in the idf ratio
	        dft   - document frequency of a term (number of docs containing it)
	        idf   - inverse document frequency, log10(ndocs / dft)
	        wTQ   - weight of a query term (its idf)
	        wFTD  - weight of a term in a document (tf / document vector length)

	        Query words are stemmed before lookup. Words that are not in the
	        posting list contribute nothing, so a document sharing no term with
	        the query scores 0.0.
	        '''
	        scores = [0.0] * self.ndocs
	        bWords = self._listToString(query)
	        normalizationFactor = self._docListWeights()

	        for qterm in bWords:
	            term = self.pstemmer.stem(qterm)
	            doc_freqs = self.posting_list.get(term)
	            if doc_freqs is None:
	                # Query term does not occur in any document.
	                continue

	            dft = mIDF if mIDF != 0 else len(doc_freqs)
	            wTQ = math.log10(float(self.ndocs) / float(dft))  # idf

	            # cosinescore algorithm: accumulate wTQ * wFTD per document
	            for doc_id, tf in doc_freqs.items():
	                norm = normalizationFactor[doc_id]
	                wFTD = tf / float(norm) if norm != 0 else 0.0
	                scores[doc_id] += wTQ * wFTD
	        return scores

	if __name__ == "__main__":
	    # Demo run: index three toy documents and print one score per document
	    # for a two-word query.
	    docs = ["mallya", "mallya mallya in hawaii", "sunil"]
	    q = "hawaii mallya"
	    cs = CosineScore(docs)
	    # The parenthesised form prints the same list on Python 2 and is also
	    # valid syntax on Python 3.
	    print(cs.compute(q))