Created
October 6, 2011 17:48
-
-
Save marknorgren/1268085 to your computer and use it in GitHub Desktop.
SEIS731 Homework 3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math
import re

# Debug flag: True=print intermediate data structures
DEBUG = True

# hardcoded (for simplicity) tuple of documents to index
documents = ('D1.txt', 'D2.txt', 'D3.txt')

index = {}    # key: (document, term), value: raw term count, later overwritten with the TF-IDF weight
maxterm = {}  # key: document, value: highest count of any single term in that document
docfreq = {}  # key: term, value: number of documents the term appears in
vocab = []    # every lower-cased term occurrence, in reading order (may contain duplicates)
vocabSet = set()  # distinct terms across all documents (filled after indexing)
def add_Term_Weights(the_index, max_terms=None, doc_freqs=None, total_docs=None, debug=None):
    """Replace each raw term count in *the_index* with its rounded TF-IDF weight.

    the_index  -- dict keyed by (document, term) holding raw term counts;
                  mutated in place so each value becomes round(tf * idf, 2).
    max_terms  -- dict document -> highest single-term count in that document
                  (defaults to the module-level maxterm).
    doc_freqs  -- dict term -> number of documents containing the term
                  (defaults to the module-level docfreq).
    total_docs -- total number of documents in the collection
                  (defaults to len(documents)).
    debug      -- print intermediate values (defaults to the module-level DEBUG).
    """
    if max_terms is None:
        max_terms = maxterm
    if doc_freqs is None:
        doc_freqs = docfreq
    if total_docs is None:
        total_docs = len(documents)
    if debug is None:
        debug = DEBUG
    # BUGFIX: the original iterated the global `index` instead of the
    # `the_index` parameter it was given.
    for (document, term) in sorted(the_index.keys()):
        # tf = (termCountInADocument / largestTermCountInADocument)
        # BUGFIX: the original normalized by the largest term count across
        # *all* documents; standard TF normalizes by this document's own max.
        largest_term_count_in_a_document = max_terms[document]
        term_frequency = float(the_index[(document, term)]) / float(largest_term_count_in_a_document)
        if debug:
            print('TF====doc:%s, term:%s, docfreq:%s, largestTermCountInADoc:%d, termFreq:%f' %
                  (document, term, the_index[(document, term)], largest_term_count_in_a_document, term_frequency))
        # idf = log base 2[(totalNumberofDocs)/(numberOfDocumentsContainingTerm)]
        inverse_document_frequency = math.log(float(total_docs) / float(doc_freqs[term]), 2)
        if debug:
            print('IDF====total#ofDocs:%d, term:%s, docfreq:%s, inverseDocumentFreq:%f' %
                  (total_docs, term, the_index[(document, term)], inverse_document_frequency))
        term_weight = round(term_frequency * inverse_document_frequency, 2)
        if debug:
            print('term:%s ---- TERM_WEIGHT=%f' % (term, term_weight))
            print('\n')
        the_index[document, term] = term_weight
def docFreqCount(docs=None, terms=None, the_index=None, freq=None):
    """Count, for every term, how many documents contain it.

    docs      -- iterable of document names (defaults to module-level documents).
    terms     -- iterable of distinct terms (defaults to module-level vocabSet).
    the_index -- dict keyed by (document, term) (defaults to module-level index).
    freq      -- dict to accumulate term -> document-count into
                 (defaults to module-level docfreq).
    Returns the freq dict for convenience.

    Fixes over the original: removed the unused counter variable and the
    leftover debugging print, replaced the deprecated dict.has_key idiom.
    """
    if docs is None:
        docs = documents
    if terms is None:
        terms = vocabSet
    if the_index is None:
        the_index = index
    if freq is None:
        freq = docfreq
    for doc in docs:
        for term in terms:
            if (doc, term) in the_index:
                key = term.lower()
                freq[key] = freq.get(key, 0) + 1
    return freq
# Build the raw term-count index, per-document maximum term counts, and the
# vocabulary by tokenizing each document on non-word characters.
for document in documents:
    maxterm.setdefault(document, 0)
    # `with` closes the file handle even on error (the original leaked it)
    with open(document, 'r') as doc_file:
        for line in doc_file:
            for term in re.split(r'\W+', line):
                if term == '':  # Ignore null results from split at start/end of line
                    continue
                token = term.lower()
                vocab.append(token)
                index[(document, token)] = index.get((document, token), 0) + 1
                # check if this is the highest term count found in this document
                if index[(document, token)] > maxterm[document]:
                    maxterm[document] = index[(document, token)]

vocabSet = set(vocab)
print(vocabSet)
# Compute per-term document frequencies, then turn raw counts into TF-IDF
# weights and print the results.
docFreqCount()
print(docfreq)

if DEBUG:
    print('\nRaw term count:')
    for (document, term) in sorted(index.keys()):
        print(document, term, index[(document, term)])

add_Term_Weights(index)

if DEBUG:
    print('\nMaxterm Dict:')
    print(maxterm)
    print('\nDocFreq Dict:')
    print(docfreq)
    print(max(maxterm))
    # get maxterm in all docs
    print(maxterm[max(maxterm, key=maxterm.get)])

print('\nTerm Weights:\n')
for (document, term) in sorted(index.keys()):
    print(document, term, index[(document, term)])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment