# jelsas/indri_dependence_model_query.py

## indri_dependence_model_query.py
'''
Functions to build full dependence model queries for Indri http://lemurproject.org/indri.
See "A Markov Random Field Model for Term Dependencies" by Metzler & Croft
http://ciir.cs.umass.edu/pubfiles/ir-387.pdf
'''
import re

# One or more consecutive non-word characters (anything outside \w); used as
# the separator when splitting raw query text into tokens.
nonword_chars = re.compile(r'\W+')

def powerset(l):
  '''Returns every non-empty subset of the items of ``l`` as a list of lists.

  Subsets come out in construction order: each item, taken in input order,
  is appended to a copy of every subset collected so far, so the item on its
  own is emitted first, followed by its combinations with earlier items.
  Items inside a subset keep their input order.  ``l`` may be any iterable;
  the result holds 2**n - 1 lists for n input items.

  For example:

  >>> powerset([])
  []
  >>> powerset('123')
  [['1'], ['2'], ['1', '2'], ['3'], ['1', '3'], ['2', '3'], ['1', '2', '3']]
  >>> powerset([1, 2, 3, 4, 5])
  [[1], [2], [1, 2], [3], [1, 3], [2, 3], [1, 2, 3], [4], [1, 4], [2, 4], [1, 2, 4], [3, 4], [1, 3, 4], [2, 3, 4], [1, 2, 3, 4], [5], [1, 5], [2, 5], [1, 2, 5], [3, 5], [1, 3, 5], [2, 3, 5], [1, 2, 3, 5], [4, 5], [1, 4, 5], [2, 4, 5], [1, 2, 4, 5], [3, 4, 5], [1, 3, 4, 5], [2, 3, 4, 5], [1, 2, 3, 4, 5]]
  '''
  # Seed with the empty set so the first item produces [item].
  subsets = [[]]
  for item in l:
    # The comprehension is fully built before extend() runs, so the list is
    # never read while it is growing.
    subsets.extend([subset + [item] for subset in subsets])
  # The seed (empty set) is not part of the result.
  return subsets[1:]

def all_adjacent(l):
  '''Returns the contiguous slices of ``l`` that start before its last item.

  Slices are grouped by start position (left to right) and, within a group,
  ordered from shortest to longest.  Each entry is a slice of ``l``, so its
  type follows the input: a string yields strings, a list yields lists.

  Note that the start position stops one short of the end of ``l``, so the
  final item never appears on its own (no '3' / [5] in the examples below),
  and an input with fewer than two items yields an empty list.

  For example:

  >>> all_adjacent([])
  []
  >>> all_adjacent('123')
  ['1', '12', '123', '2', '23']
  >>> all_adjacent([1,2,3,4,5])
  [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4], [1, 2, 3, 4, 5], [2], [2, 3], [2, 3, 4], [2, 3, 4, 5], [3], [3, 4], [3, 4, 5], [4], [4, 5]]
  '''
  size = len(l)
  runs = []
  # range() is available on both Python 2 and Python 3, unlike xrange().
  for start in range(size - 1):
    for end in range(start + 1, size + 1):
      # append() avoids re-copying the whole result list for every slice.
      runs.append(l[start:end])
  return runs

def _add_field_to_q_list(l, field):
  '''Appends the ``.(field)`` suffix to every query expression in ``l``.

  When ``field`` is falsy (None, empty string) the input list itself is
  returned unchanged; otherwise a new list of suffixed strings is returned.
  '''
  if not field:
    return l
  return ['%s.(%s)' % (expr, field) for expr in l]

def build_indri_query_dm(query, field=None):
  '''Builds a dependence model query from the given text string, optionally
  restricting every clause to the provided field.

  The text is split on runs of non-word characters.  Depending on the number
  of tokens the result is:

    * no tokens: None;
    * one token: the bare token (with the field suffix when a field is given);
    * more than five tokens: a plain #combine of the tokens, without any
      window clauses;
    * otherwise: a #weight of three #combine clauses -- the single tokens
      (weight 0.8), exact-phrase #1 windows over every adjacent run of two or
      more tokens (0.1), and unordered #uwN windows over every subset of two
      or more tokens (0.1), where N is 4 * (subset size - 1).

  For example:

  >>> build_indri_query_dm('information retrieval systems')
  '#weight( 0.8 #combine( information retrieval systems ) 0.1 #combine( #1( information retrieval ) #1( information retrieval systems ) #1( retrieval systems ) ) 0.1 #combine( #uw4( information retrieval ) #uw4( information systems ) #uw4( retrieval systems ) #uw8( information retrieval systems ) ) )'
  >>> build_indri_query_dm('information retrieval systems', 'title')
  '#weight( 0.8 #combine( information.(title) retrieval.(title) systems.(title) ) 0.1 #combine( #1( information retrieval ).(title) #1( information retrieval systems ).(title) #1( retrieval systems ).(title) ) 0.1 #combine( #uw4( information retrieval ).(title) #uw4( information systems ).(title) #uw4( retrieval systems ).(title) #uw8( information retrieval systems ).(title) ) )'
  '''
  terms = [piece for piece in nonword_chars.split(query) if piece]
  term_count = len(terms)

  if term_count == 0:
    return None
  if term_count == 1:
    only = terms[0]
    return '%s.(%s)' % (only, field) if field else only

  # The unigram clause is shared by the long-query fallback and the full model.
  unigram_clause = '#combine( %s )' % ' '.join(
      _add_field_to_q_list(terms, field))
  if term_count > 5:
    return unigram_clause

  # Exact-phrase windows: adjacent runs of at least two terms.
  ordered = []
  for run in all_adjacent(terms):
    if len(run) > 1:
      ordered.append('#1( %s )' % ' '.join(run))

  # Unordered windows: any subset of at least two terms; the window width
  # grows by 4 for each term beyond the first.
  unordered = []
  for subset in powerset(terms):
    if len(subset) > 1:
      width = (len(subset) - 1) * 4
      unordered.append('#uw%d( %s )' % (width, ' '.join(subset)))

  ordered_clause = '#combine( %s )' % ' '.join(
      _add_field_to_q_list(ordered, field))
  unordered_clause = '#combine( %s )' % ' '.join(
      _add_field_to_q_list(unordered, field))
  return '#weight( 0.8 %s 0.1 %s 0.1 %s )' % (
      unigram_clause, ordered_clause, unordered_clause)


if __name__ == "__main__":
  # Command-line use: every argument is treated as a separate query string
  # and its generated Indri query is printed on its own line.
  import sys
  for q in sys.argv[1:]:
    # Single-argument call form behaves the same on Python 2 and Python 3;
    # the bare print statement is a syntax error on Python 3.
    print(build_indri_query_dm(q))
	'''
	Functions to build full dependence model queries for Indri http://lemurproject.org/indri.
	See "A Markov Random Field Model for Term Dependencies" by Metzler & Croft
	http://ciir.cs.umass.edu/pubfiles/ir-387.pdf
	'''
	import re

	# One or more consecutive non-word characters (anything outside \w); used as
	# the separator when splitting raw query text into tokens.
	nonword_chars = re.compile(r'\W+')

	def powerset(l):
		'''Computes the power set of the given list, excluding the empty set.

		The result is a list of lists.  Each input item, in order, is appended
		to a copy of every subset built so far, which fixes the output order
		shown below.

		For example:

		>>> powerset([])
		[]
		>>> powerset('123')
		[['1'], ['2'], ['1', '2'], ['3'], ['1', '3'], ['2', '3'], ['1', '2', '3']]
		>>> powerset([1, 2, 3, 4, 5])
		[[1], [2], [1, 2], [3], [1, 3], [2, 3], [1, 2, 3], [4], [1, 4], [2, 4], [1, 2, 4], [3, 4], [1, 3, 4], [2, 3, 4], [1, 2, 3, 4], [5], [1, 5], [2, 5], [1, 2, 5], [3, 5], [1, 3, 5], [2, 3, 5], [1, 2, 3, 5], [4, 5], [1, 4, 5], [2, 4, 5], [1, 2, 4, 5], [3, 4, 5], [1, 3, 4, 5], [2, 3, 4, 5], [1, 2, 3, 4, 5]]
		'''
		# Start from the empty set so the first item yields [item].
		p = [[]]
		for t in l:
			# Pair the new item with every subset collected so far.
			to_add = [x + [t] for x in p]
			p = p + to_add
		return p[1:] # remove the first (empty) set

	def all_adjacent(l):
		'''Computes all adjacent items of the given list.

		Returns the contiguous slices of ``l`` grouped by start position and
		ordered from shortest to longest within each group.  The start
		position stops one short of the end, so the last item never appears
		on its own and inputs with fewer than two items give an empty list.

		For example:

		>>> all_adjacent([])
		[]
		>>> all_adjacent('123')
		['1', '12', '123', '2', '23']
		>>> all_adjacent([1,2,3,4,5])
		[[1], [1, 2], [1, 2, 3], [1, 2, 3, 4], [1, 2, 3, 4, 5], [2], [2, 3], [2, 3, 4], [2, 3, 4, 5], [3], [3, 4], [3, 4, 5], [4], [4, 5]]
		'''
		p = []
		# range() is available on both Python 2 and Python 3, unlike xrange().
		for i in range(len(l) - 1):
			for j in range(i, len(l)):
				# append() avoids re-copying the result list on every step.
				p.append(l[i:j + 1])
		return p

	def _add_field_to_q_list(l, field):
		'''Suffixes each query expression in ``l`` with ``.(field)``.

		Returns ``l`` itself, unchanged, when ``field`` is falsy.
		'''
		if field:
			return ['%s.(%s)' % (q, field) for q in l]
		else:
			return l

	def build_indri_query_dm(query, field=None):
		'''Builds a dependence model query from the given text string, optionally
		adding field context for the provided field.

		The text is split on runs of non-word characters.  No tokens gives
		None, a single token is returned as is (with the field suffix when a
		field is given), and more than five tokens gives a plain #combine of
		the tokens.  Otherwise the result is a #weight of the single tokens
		(0.8), #1 windows over adjacent runs of two or more tokens (0.1) and
		#uwN windows over subsets of two or more tokens (0.1), with
		N = 4 * (subset size - 1).

		For example:

		>>> build_indri_query_dm('information retrieval systems')
		'#weight( 0.8 #combine( information retrieval systems ) 0.1 #combine( #1( information retrieval ) #1( information retrieval systems ) #1( retrieval systems ) ) 0.1 #combine( #uw4( information retrieval ) #uw4( information systems ) #uw4( retrieval systems ) #uw8( information retrieval systems ) ) )'
		>>> build_indri_query_dm('information retrieval systems', 'title')
		'#weight( 0.8 #combine( information.(title) retrieval.(title) systems.(title) ) 0.1 #combine( #1( information retrieval ).(title) #1( information retrieval systems ).(title) #1( retrieval systems ).(title) ) 0.1 #combine( #uw4( information retrieval ).(title) #uw4( information systems ).(title) #uw4( retrieval systems ).(title) #uw8( information retrieval systems ).(title) ) )'
		'''
		q_tokens = [s for s in nonword_chars.split(query) if len(s) > 0]
		if len(q_tokens) == 0:
			return
		elif len(q_tokens) == 1:
			if field:
				return '%s.(%s)' % (q_tokens[0], field)
			else:
				return q_tokens[0]
		elif len(q_tokens) > 5:
			# Long queries get only the plain term clause, no window clauses.
			return '#combine( %s )' % ' '.join(_add_field_to_q_list(q_tokens, field))
		# Exact-phrase windows over adjacent runs of at least two tokens.
		ordered_windows = ['#1( %s )' % ' '.join(s)
							for s in all_adjacent(q_tokens) if len(s) > 1]
		# Unordered windows over every subset of at least two tokens.
		unordered_windows = ['#uw%d( %s )' % ((len(s)-1)*4, ' '.join(s))
							for s in powerset(q_tokens) if len(s) > 1]
		token_q = '#combine( %s )' % ' '.join(_add_field_to_q_list(q_tokens, field))
		od_q = '#combine( %s )' % \
			' '.join(_add_field_to_q_list(ordered_windows, field))
		uw_q = '#combine( %s )' % \
			' '.join(_add_field_to_q_list(unordered_windows, field))
		return '#weight( 0.8 %s 0.1 %s 0.1 %s )' % (token_q, od_q, uw_q)


	if __name__ == "__main__":
		# Command-line use: print the generated Indri query for each argument.
		import sys
		for q in sys.argv[1:]:
			# Single-argument call form works on both Python 2 and Python 3.
			print(build_indri_query_dm(q))