Skip to content

Instantly share code, notes, and snippets.

@jelsas
Created September 1, 2010 19:18
Show Gist options
  • Save jelsas/561196 to your computer and use it in GitHub Desktop.
Save jelsas/561196 to your computer and use it in GitHub Desktop.
Functions to build full dependence model queries for Indri http://lemurproject.org/indri
'''
Functions to build full dependence model queries for Indri http://lemurproject.org/indri.
See "A Markov Random Field Model for Term Dependencies" by Metzler & Croft
http://ciir.cs.umass.edu/pubfiles/ir-387.pdf
'''
import re
# Matches runs of one or more non-word characters (regex \W+).
# build_indri_query_dm splits the raw query text on this pattern to get tokens.
nonword_chars = re.compile(r'\W+')
def powerset(l):
    '''Returns every non-empty subset of the given sequence, as a list of lists.

    Subsets are grown incrementally: each item of the input is appended to a
    copy of every subset collected so far, so items inside a subset keep the
    order they had in the input.

    For example:
    >>> powerset([])
    []
    >>> powerset('123')
    [['1'], ['2'], ['1', '2'], ['3'], ['1', '3'], ['2', '3'], ['1', '2', '3']]
    >>> powerset([1, 2, 3, 4, 5])
    [[1], [2], [1, 2], [3], [1, 3], [2, 3], [1, 2, 3], [4], [1, 4], [2, 4], [1, 2, 4], [3, 4], [1, 3, 4], [2, 3, 4], [1, 2, 3, 4], [5], [1, 5], [2, 5], [1, 2, 5], [3, 5], [1, 3, 5], [2, 3, 5], [1, 2, 3, 5], [4, 5], [1, 4, 5], [2, 4, 5], [1, 2, 4, 5], [3, 4, 5], [1, 3, 4, 5], [2, 3, 4, 5], [1, 2, 3, 4, 5]]
    '''
    # Seed with the empty set so the first item has something to extend.
    subsets = [[]]
    for item in l:
        # The comprehension is fully built before extend() mutates the list,
        # so only the subsets that existed before this item are extended.
        subsets.extend([existing + [item] for existing in subsets])
    # Drop the leading empty set.
    return subsets[1:]
def all_adjacent(l):
    '''Computes the runs of adjacent items (contiguous slices) of the given
    sequence.

    Slices are grouped by start index and, within a group, ordered by
    increasing length. Every start index except the last one is used, so the
    single-element slice holding only the final item is NOT part of the
    result (see the examples). Each slice is produced by slicing the input,
    so a string input yields strings and a list input yields lists.

    For example:
    >>> all_adjacent([])
    []
    >>> all_adjacent('123')
    ['1', '12', '123', '2', '23']
    >>> all_adjacent([1,2,3,4,5])
    [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4], [1, 2, 3, 4, 5], [2], [2, 3], [2, 3, 4], [2, 3, 4, 5], [3], [3, 4], [3, 4, 5], [4], [4, 5]]
    '''
    size = len(l)
    # range (not the Python-2-only xrange) keeps this working on Python 2 and 3.
    # Building the list in one comprehension avoids re-copying the accumulator
    # on every step, which the former `p = p + [...]` did.
    return [l[start:stop + 1]
            for start in range(size - 1)
            for stop in range(start, size)]
def _add_field_to_q_list(l, field):
if field: return ['%s.(%s)' % (q, field) for q in l]
else: return l
def build_indri_query_dm(query, field=None):
    '''Builds a dependence model query from the given text string, optionally
    restricting every query component to the provided field.

    The query text is split on runs of non-word characters. Depending on the
    number of resulting tokens the function returns:
      * no tokens       -> None
      * one token       -> the token itself (with the field suffix, if any)
      * more than five  -> a plain #combine over the tokens, without any
                           window components
      * two to five     -> a #weight of three #combine parts: the tokens
                           (0.8), ordered #1 windows over adjacent tokens
                           (0.1), and unordered #uwN windows over token
                           subsets (0.1), where N = 4 * (subset size - 1)

    For example:
    >>> build_indri_query_dm('information retrieval systems')
    '#weight( 0.8 #combine( information retrieval systems ) 0.1 #combine( #1( information retrieval ) #1( information retrieval systems ) #1( retrieval systems ) ) 0.1 #combine( #uw4( information retrieval ) #uw4( information systems ) #uw4( retrieval systems ) #uw8( information retrieval systems ) ) )'
    >>> build_indri_query_dm('information retrieval systems', 'title')
    '#weight( 0.8 #combine( information.(title) retrieval.(title) systems.(title) ) 0.1 #combine( #1( information retrieval ).(title) #1( information retrieval systems ).(title) #1( retrieval systems ).(title) ) 0.1 #combine( #uw4( information retrieval ).(title) #uw4( information systems ).(title) #uw4( retrieval systems ).(title) #uw8( information retrieval systems ).(title) ) )'
    '''
    # Splitting can leave empty strings at the edges; discard them.
    terms = [piece for piece in nonword_chars.split(query) if piece]
    term_count = len(terms)
    if term_count == 0:
        return None

    fielded_terms = _add_field_to_q_list(terms, field)
    if term_count == 1:
        return fielded_terms[0]

    unigram_part = '#combine( %s )' % ' '.join(fielded_terms)
    if term_count > 5:
        # Long queries get the plain term query only, with no window parts.
        return unigram_part

    # Ordered windows: exact phrases over runs of two or more adjacent terms.
    ordered = []
    for span in all_adjacent(terms):
        if len(span) > 1:
            ordered.append('#1( %s )' % ' '.join(span))

    # Unordered windows: every subset of two or more terms, with a window
    # width of 4 per term beyond the first.
    unordered = []
    for subset in powerset(terms):
        if len(subset) > 1:
            width = (len(subset) - 1) * 4
            unordered.append('#uw%d( %s )' % (width, ' '.join(subset)))

    ordered_part = '#combine( %s )' % ' '.join(
        _add_field_to_q_list(ordered, field))
    unordered_part = '#combine( %s )' % ' '.join(
        _add_field_to_q_list(unordered, field))
    return '#weight( 0.8 %s 0.1 %s 0.1 %s )' % (
        unigram_part, ordered_part, unordered_part)
if __name__ == "__main__":
    # Command-line use: treat each argument as one query string and print the
    # dependence model query built for it, one per line.
    import sys
    for q in sys.argv[1:]:
        # Parenthesised single-argument print behaves the same as a Python 2
        # print statement and as the Python 3 print function, so the module
        # parses under both interpreters.
        print(build_indri_query_dm(q))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment