gibrown/bulk_test_queries.py

## bulk_test_queries.py
#!/bin/python

import sys
import org_search as org
import pprint
import csv

# Configuration
lang = 'en_US'
min_count = 1000

# Two positional arguments are required: the query file to read and the
# output file to write.
if len(sys.argv) < 3:
    sys.exit('need to specify query file and output file')

query_file, out_file = sys.argv[1], sys.argv[2]

# Header row of the output file; calc_search_details() returns its values
# in this same order.
out_fields = [
    'query', 'count', 'total_results',
    'install_cnt_4', 'sup_thr_cnt_4', 'sup_res_cnt_4',
    'install_ratio_4', 'resolve_ratio_4',
    'install_cnt_14', 'sup_thr_cnt_14', 'sup_res_cnt_14',
    'results',
]

# Pretty-printer kept around for ad-hoc inspection of search responses.
pp = pprint.PrettyPrinter(indent=2)

def get_val( v ):
    """Return the first element when ``v`` is a list, otherwise ``v`` itself.

    calc_search_details() passes each value of a hit's ``fields`` mapping
    through this helper, so a field works whether it arrives as a bare
    value or wrapped in a list.

    Raises:
        IndexError: if ``v`` is an empty list.
    """
    # Indentation uses spaces only: the original mixed tabs and spaces,
    # which Python 3 rejects with a TabError.
    if isinstance( v, list ):
        return v[0]
    return v

def calc_search_details( r, query, q_count ):
    """Summarise one search response as an output row.

    Args:
        r: search response; read as ``r['total']`` and ``r['hits']``. Each
            hit must carry a ``fields`` mapping with ``active_installs``,
            ``support_threads``, ``support_threads_resolved`` and ``slug``
            (each either a bare value or a list, see get_val()).
        query: the query text that produced ``r``.
        q_count: count associated with the query; used as the denominator
            of the two ratio columns.

    Returns:
        A list in ``out_fields`` order: query, count, total results, the
        install / support-thread / resolved-thread sums over the first 4
        hits, the two ratios (sum over the first 4 hits divided by
        ``q_count``), the same three sums over the first 14 hits, and the
        hit slugs joined into one string.

    Raises:
        KeyError: if the response or a hit lacks one of the keys above.
    """
    install_cnt_4 = sup_thr_cnt_4 = sup_res_cnt_4 = 0
    install_cnt_14 = sup_thr_cnt_14 = sup_res_cnt_14 = 0
    slugs = []

    for i, hit in enumerate( r['hits'] ):
        fields = hit['fields']
        if i < 14:
            installs = get_val( fields['active_installs'] )
            threads = get_val( fields['support_threads'] )
            resolved = get_val( fields['support_threads_resolved'] )
            install_cnt_14 += installs
            sup_thr_cnt_14 += threads
            sup_res_cnt_14 += resolved
            # The first 4 hits also count towards the top-4 totals.
            if i < 4:
                install_cnt_4 += installs
                sup_thr_cnt_4 += threads
                sup_res_cnt_4 += resolved
        slugs.append( get_val( fields['slug'] ) )

    # Every slug is prefixed with a comma (so the string starts with one);
    # kept as-is so the output format does not change.
    results = ''.join( ',' + slug for slug in slugs )

    # A zero count would raise ZeroDivisionError; report a 0.0 ratio instead.
    if q_count:
        install_ratio_4 = float( install_cnt_4 ) / q_count
        resolve_ratio_4 = float( sup_res_cnt_4 ) / q_count
    else:
        install_ratio_4 = 0.0
        resolve_ratio_4 = 0.0

    return [
        query,
        q_count,
        r['total'],
        install_cnt_4,
        sup_thr_cnt_4,
        sup_res_cnt_4,
        install_ratio_4,
        resolve_ratio_4,
        install_cnt_14,
        sup_thr_cnt_14,
        sup_res_cnt_14,
        results,
    ]

# Run every query from the input file through the search and write one
# tab-separated summary row per query.
# The output file is managed by a ``with`` block so it is flushed and closed
# even when a search request or a row computation raises part-way through.
# NOTE(review): the binary modes ('wb'/'rb') suit Python 2's csv module;
# Python 3 would need text mode with newline='' instead.
with open(out_file, 'wb') as outfile:
    datawriter = csv.writer(outfile, delimiter="\t")
    datawriter.writerow(out_fields)

    j = 0
    with open(query_file, 'rb') as csvfile:
        # Input columns: count, query text, language (tab separated,
        # backslash-escaped, no quote doubling).
        rdr = csv.DictReader(csvfile, fieldnames=['cnt', 'query', 'lang'], delimiter="\t", quotechar='"', escapechar='\\', doublequote=False)
        for row in rdr:
            # Progress indicator: index of the row being processed.
            print(j)
            # To evaluate the experimental query instead, call
            # org.plugin_search_experiment( row['query'] ) here.
            r = org.plugin_search_current( row['query'] )
            d = calc_search_details( r, row['query'], int(row['cnt']) )
            datawriter.writerow(d)
            j += 1

#r = org.plugin_search( 'social' )
#pp = pprint.PrettyPrinter(indent=2)
#pp.pprint(r)

## org_search.py
#!/bin/python

import requests

#I've been working on some sketchy connections...
# Assigning requests.adapters.DEFAULT_RETRIES after import does not make the
# module-level requests.post() helper retry: HTTPAdapter bound its default
# when it was defined. The retry count is therefore configured explicitly on
# the transport adapters of a shared session, which all queries go through.
_MAX_RETRIES = 2
_session = requests.Session()
_retrying_adapter = requests.adapters.HTTPAdapter( max_retries = _MAX_RETRIES )
_session.mount( 'http://', _retrying_adapter )
_session.mount( 'https://', _retrying_adapter )

def query_es( query ):
  """Send ``query`` (a JSON request body) to the search backend in use.

  Currently delegates to query_remote_es(); switch the call to
  query_local_es() to target a local Elasticsearch instead.
  """
  return query_remote_es( query )

def query_local_es( query ):
  """POST ``query`` to the local wp-plugins index and return its ``hits``.

  Raises:
    requests.HTTPError: if the server answers with an error status.
  """
  r = _session.post("http://localhost:9200/wp-plugins/post/_search", data = query)
  r.raise_for_status()
  return r.json()['hits']

def query_remote_es( query ):
  """POST ``query`` to the public WordPress.com search endpoint and return its ``results``.

  Raises:
    requests.HTTPError: if the server answers with an error status.
  """
  r = _session.post("https://public-api.wordpress.com/rest/v1/sites/108986944/search", data = query)
  r.raise_for_status()
  return r.json()['results']

#other stuff to try
# query expansion
# boosting resolved threads
# boosting install count more
# Look at support thread resolved percentage.
# Look at contributor install count
# query expansion by running a sig terms on that query, and then running a second query with expanded terms

#this tries:
# - boosting installs more
# - boosting resolved threads more
# - max boost (tried at 1.3, but wow, bad results, maybe try much higher, look at scores)
# - increasing the penalty for no updates
# - decreasing the penalty for not tested with the latest version
def plugin_search_experiment( text ):
  """Run the experimental plugin search for ``text`` via query_es().

  The request asks for the first 14 hits and the ``slug``,
  ``active_installs``, ``support_threads`` and ``support_threads_resolved``
  fields of each.

  NOTE(review): as written, the request body is identical to the one in
  plugin_search_current() apart from a trailing newline - the experimental
  tweaks have to be edited into the template below.

  Args:
    text: the user's query; it is JSON-escaped before being placed in the
      template, so quotes and backslashes cannot break the request body.

  Returns:
    Whatever query_es() returns for the request.
  """
  import json  # local import keeps this fix self-contained

  # '%s' keeps the original stringification of non-string input;
  # json.dumps() then yields a quoted JSON string, whose surrounding quotes
  # are dropped because the template already supplies them.
  escaped = json.dumps( '%s' % (text,) )[1:-1]

  return query_es('''{
  "size": 14,
  "from": 0,
  "fields": [
    "slug","active_installs","support_threads","support_threads_resolved"
  ],
  "query": {
    "filtered": {
      "query": {
        "function_score": {
          "query": {
            "bool": {
              "must": {
                "multi_match": {
                  "query": "%(query_text)s",
                  "fields": "all_content_en",
                  "boost": 0.1,
                  "operator": "and"
                }
              },
              "should": [
                {
                  "multi_match": {
                    "query": "%(query_text)s",
                    "fields": [
                      "title_en",
                      "excerpt_en",
                      "description_en",
                      "taxonomy.plugin_tags.name"
                    ],
                    "type": "phrase",
                    "boost": 2
                  }
                },
                {
                  "multi_match": {
                    "query": "%(query_text)s",
                    "fields": [
                      "title_en.ngram"
                    ],
                    "type": "phrase",
                    "boost": 0.2
                  }
                },
                {
                  "multi_match": {
                    "query": "%(query_text)s",
                    "fields": [
                      "title_en",
                      "slug_text"
                    ],
                    "type": "best_fields",
                    "boost": 2
                  }
                },
                {
                  "multi_match": {
                    "query": "%(query_text)s",
                    "fields": [
                      "excerpt_en",
                      "description_en",
                      "taxonomy.plugin_tags.name"
                    ],
                    "type": "best_fields",
                    "boost": 2
                  }
                },
                {
                  "multi_match": {
                    "query": "%(query_text)s",
                    "fields": [
                      "author",
                      "contributors"
                    ],
                    "type": "best_fields",
                    "boost": 2
                  }
                }
              ]
            }
          },
          "functions": [
            {
              "exp": {
                "plugin_modified": {
                  "origin": "2017-01-20",
                  "offset": "180d",
                  "scale": "360d",
                  "decay": 0.5
                }
              }
            },
            {
              "exp": {
                "tested": {
                  "origin": "4.7",
                  "offset": 0.1,
                  "scale": 0.4,
                  "decay": 0.6
                }
              }
            },
            {
              "field_value_factor": {
                "field": "active_installs",
                "factor": 0.375,
                "modifier": "log2p",
                "missing": 1
              }
            },
            {
              "exp": {
                "active_installs": {
                  "origin": 1000000,
                  "offset": 0,
                  "scale": 900000,
                  "decay": 0.75
                }
              }
            },
            {
              "field_value_factor": {
                "field": "support_threads_resolved",
                "factor": 0.25,
                "modifier": "log2p",
                "missing": 0.5
              }
            },
            {
              "field_value_factor": {
                "field": "rating",
                "factor": 0.25,
                "modifier": "sqrt",
                "missing": 2.5
              }
            }
          ],
          "boost_mode": "multiply"
        }
      }
    }
  },
  "sort": [
    {
      "_score": {
        "order": "desc"
      }
    }
  ],
  "filter": {
    "and": [
      {
        "term": {
          "disabled": {
            "value": false
          }
        }
      }
    ]
  }
}
''' % {'query_text': escaped})

def plugin_search_current( text ):
  """Run the current (baseline) plugin search for ``text`` via query_es().

  The request asks for the first 14 hits and the ``slug``,
  ``active_installs``, ``support_threads`` and ``support_threads_resolved``
  fields of each.

  Args:
    text: the user's query; it is JSON-escaped before being placed in the
      template, so quotes and backslashes cannot break the request body.

  Returns:
    Whatever query_es() returns for the request.
  """
  import json  # local import keeps this fix self-contained

  # '%s' keeps the original stringification of non-string input;
  # json.dumps() then yields a quoted JSON string, whose surrounding quotes
  # are dropped because the template already supplies them.
  escaped = json.dumps( '%s' % (text,) )[1:-1]

  return query_es('''{
  "size": 14,
  "from": 0,
  "fields": [
    "slug","active_installs","support_threads","support_threads_resolved"
  ],
  "query": {
    "filtered": {
      "query": {
        "function_score": {
          "query": {
            "bool": {
              "must": {
                "multi_match": {
                  "query": "%(query_text)s",
                  "fields": "all_content_en",
                  "boost": 0.1,
                  "operator": "and"
                }
              },
              "should": [
                {
                  "multi_match": {
                    "query": "%(query_text)s",
                    "fields": [
                      "title_en",
                      "excerpt_en",
                      "description_en",
                      "taxonomy.plugin_tags.name"
                    ],
                    "type": "phrase",
                    "boost": 2
                  }
                },
                {
                  "multi_match": {
                    "query": "%(query_text)s",
                    "fields": [
                      "title_en.ngram"
                    ],
                    "type": "phrase",
                    "boost": 0.2
                  }
                },
                {
                  "multi_match": {
                    "query": "%(query_text)s",
                    "fields": [
                      "title_en",
                      "slug_text"
                    ],
                    "type": "best_fields",
                    "boost": 2
                  }
                },
                {
                  "multi_match": {
                    "query": "%(query_text)s",
                    "fields": [
                      "excerpt_en",
                      "description_en",
                      "taxonomy.plugin_tags.name"
                    ],
                    "type": "best_fields",
                    "boost": 2
                  }
                },
                {
                  "multi_match": {
                    "query": "%(query_text)s",
                    "fields": [
                      "author",
                      "contributors"
                    ],
                    "type": "best_fields",
                    "boost": 2
                  }
                }
              ]
            }
          },
          "functions": [
            {
              "exp": {
                "plugin_modified": {
                  "origin": "2017-01-20",
                  "offset": "180d",
                  "scale": "360d",
                  "decay": 0.5
                }
              }
            },
            {
              "exp": {
                "tested": {
                  "origin": "4.7",
                  "offset": 0.1,
                  "scale": 0.4,
                  "decay": 0.6
                }
              }
            },
            {
              "field_value_factor": {
                "field": "active_installs",
                "factor": 0.375,
                "modifier": "log2p",
                "missing": 1
              }
            },
            {
              "exp": {
                "active_installs": {
                  "origin": 1000000,
                  "offset": 0,
                  "scale": 900000,
                  "decay": 0.75
                }
              }
            },
            {
              "field_value_factor": {
                "field": "support_threads_resolved",
                "factor": 0.25,
                "modifier": "log2p",
                "missing": 0.5
              }
            },
            {
              "field_value_factor": {
                "field": "rating",
                "factor": 0.25,
                "modifier": "sqrt",
                "missing": 2.5
              }
            }
          ],
          "boost_mode": "multiply"
        }
      }
    }
  },
  "sort": [
    {
      "_score": {
        "order": "desc"
      }
    }
  ],
  "filter": {
    "and": [
      {
        "term": {
          "disabled": {
            "value": false
          }
        }
      }
    ]
  }
}''' % {'query_text': escaped})

## queries.csv
30615	backup	en_US
	#/bin/python

	import sys
	import org_search as org
	import pprint
	import csv

	#Configuration
	lang='en_US'
	min_count=1000

	if ( len(sys.argv) < 3 ):
	sys.exit( 'need to specify query file and output file' )

	query_file=sys.argv[1]
	out_file=sys.argv[2]

	out_fields = [
	'query',
	'count',
	'total_results',
	'install_cnt_4',
	'sup_thr_cnt_4',
	'sup_res_cnt_4',
	'install_ratio_4',
	'resolve_ratio_4',
	'install_cnt_14',
	'sup_thr_cnt_14',
	'sup_res_cnt_14',
	'results',
	]

	pp = pprint.PrettyPrinter(indent=2)

	def get_val( v ):
	if ( isinstance( v, list ) ):
	return v[0]
	else:
	return v

	def calc_search_details( r, query, q_count ):
	data = {}

	data['query'] = query
	data['count'] = q_count
	data['total_results'] = r['total']
	data['install_cnt_4'] = 0
	data['sup_thr_cnt_4'] = 0
	data['sup_res_cnt_4'] = 0
	data['install_cnt_14'] = 0
	data['sup_thr_cnt_14'] = 0
	data['sup_res_cnt_14'] = 0
	data['results'] = ''
	i = 0
	for hit in r['hits']:
	if ( i < 4 ):
	data['install_cnt_4'] += get_val( hit['fields']['active_installs'] )
	data['sup_thr_cnt_4'] += get_val( hit['fields']['support_threads'] )
	data['sup_res_cnt_4'] += get_val( hit['fields']['support_threads_resolved'] )
	if ( i < 14 ):
	data['install_cnt_14'] += get_val( hit['fields']['active_installs'] )
	data['sup_thr_cnt_14'] += get_val( hit['fields']['support_threads'] )
	data['sup_res_cnt_14'] += get_val( hit['fields']['support_threads_resolved'] )
	data['results'] += ',' + get_val( hit['fields']['slug'] )
	i+=1

	d = [
	data['query'],
	data['count'],
	data['total_results'],
	data['install_cnt_4'],
	data['sup_thr_cnt_4'],
	data['sup_res_cnt_4'],
	float(data['install_cnt_4'])/data['count'], #'install_ratio_4',
	float(data['sup_res_cnt_4'])/data['count'], #'resolve_ratio_4',
	data['install_cnt_14'],
	data['sup_thr_cnt_14'],
	data['sup_res_cnt_14'],
	data['results'],
	]
	return d

	outfile = open(out_file, 'wb')
	datawriter = csv.writer(outfile, delimiter="\t")

	datawriter.writerow(out_fields)
	j=0
	with open(query_file, 'rb' ) as csvfile:
	rdr = csv.DictReader(csvfile, fieldnames=['cnt', 'query', 'lang'], delimiter="\t", quotechar='"', escapechar='\\', doublequote=False)
	for row in rdr:
	print j
	#r = org.plugin_search_experiment( row['query'] )
	r = org.plugin_search_current( row['query'] )
	d = calc_search_details( r, row['query'], int(row['cnt']) )
	datawriter.writerow(d)
	j += 1

	outfile.close()

	#r = org.plugin_search( 'social' )
	#pp = pprint.PrettyPrinter(indent=2)
	#pp.pprint(r)
	#!/bin/python

	import requests

	#I've been working on some sketchy connections...
	requests.adapters.DEFAULT_RETRIES = 2

	def query_es( query ):
	return query_remote_es( query )

	def query_local_es( query ):
	r = requests.post("http://localhost:9200/wp-plugins/post/_search", data = query)
	r.raise_for_status()
	return r.json()['hits']

	def query_remote_es( query ):
	r = requests.post("https://public-api.wordpress.com/rest/v1/sites/108986944/search", data = query)
	r.raise_for_status()
	return r.json()['results']

	#other stuff to try
	# query expansion
	# boosting resolved threads
	# boosting install count more
	# Look at support thread resolved percentage.
	# Look at contributor install count
	# query expansion by running a sig terms on that query, and then running a second query with expanded terms

	#this tries:
	# - boosting installs more
	# - boosting resolved threads more
	# - max boost (tried at 1.3, but wow, bad results, maybe try much higher, look at scores)
	# - increasing the penalty for no updates
	# - decreasing the penalty for not tested with the latest version
	def plugin_search_experiment( text ):
	return query_es('''{
	"size": 14,
	"from": 0,
	"fields": [
	"slug","active_installs","support_threads","support_threads_resolved"
	],
	"query": {
	"filtered": {
	"query": {
	"function_score": {
	"query": {
	"bool": {
	"must": {
	"multi_match": {
	"query": "%(query_text)s",
	"fields": "all_content_en",
	"boost": 0.1,
	"operator": "and"
	}
	},
	"should": [
	{
	"multi_match": {
	"query": "%(query_text)s",
	"fields": [
	"title_en",
	"excerpt_en",
	"description_en",
	"taxonomy.plugin_tags.name"
	],
	"type": "phrase",
	"boost": 2
	}
	},
	{
	"multi_match": {
	"query": "%(query_text)s",
	"fields": [
	"title_en.ngram"
	],
	"type": "phrase",
	"boost": 0.2
	}
	},
	{
	"multi_match": {
	"query": "%(query_text)s",
	"fields": [
	"title_en",
	"slug_text"
	],
	"type": "best_fields",
	"boost": 2
	}
	},
	{
	"multi_match": {
	"query": "%(query_text)s",
	"fields": [
	"excerpt_en",
	"description_en",
	"taxonomy.plugin_tags.name"
	],
	"type": "best_fields",
	"boost": 2
	}
	},
	{
	"multi_match": {
	"query": "%(query_text)s",
	"fields": [
	"author",
	"contributors"
	],
	"type": "best_fields",
	"boost": 2
	}
	}
	]
	}
	},
	"functions": [
	{
	"exp": {
	"plugin_modified": {
	"origin": "2017-01-20",
	"offset": "180d",
	"scale": "360d",
	"decay": 0.5
	}
	}
	},
	{
	"exp": {
	"tested": {
	"origin": "4.7",
	"offset": 0.1,
	"scale": 0.4,
	"decay": 0.6
	}
	}
	},
	{
	"field_value_factor": {
	"field": "active_installs",
	"factor": 0.375,
	"modifier": "log2p",
	"missing": 1
	}
	},
	{
	"exp": {
	"active_installs": {
	"origin": 1000000,
	"offset": 0,
	"scale": 900000,
	"decay": 0.75
	}
	}
	},
	{
	"field_value_factor": {
	"field": "support_threads_resolved",
	"factor": 0.25,
	"modifier": "log2p",
	"missing": 0.5
	}
	},
	{
	"field_value_factor": {
	"field": "rating",
	"factor": 0.25,
	"modifier": "sqrt",
	"missing": 2.5
	}
	}
	],
	"boost_mode": "multiply"
	}
	}
	}
	},
	"sort": [
	{
	"_score": {
	"order": "desc"
	}
	}
	],
	"filter": {
	"and": [
	{
	"term": {
	"disabled": {
	"value": false
	}
	}
	}
	]
	}
	}
	''' % {'query_text': text})

	def plugin_search_current( text ):
	return query_es('''{
	"size": 14,
	"from": 0,
	"fields": [
	"slug","active_installs","support_threads","support_threads_resolved"
	],
	"query": {
	"filtered": {
	"query": {
	"function_score": {
	"query": {
	"bool": {
	"must": {
	"multi_match": {
	"query": "%(query_text)s",
	"fields": "all_content_en",
	"boost": 0.1,
	"operator": "and"
	}
	},
	"should": [
	{
	"multi_match": {
	"query": "%(query_text)s",
	"fields": [
	"title_en",
	"excerpt_en",
	"description_en",
	"taxonomy.plugin_tags.name"
	],
	"type": "phrase",
	"boost": 2
	}
	},
	{
	"multi_match": {
	"query": "%(query_text)s",
	"fields": [
	"title_en.ngram"
	],
	"type": "phrase",
	"boost": 0.2
	}
	},
	{
	"multi_match": {
	"query": "%(query_text)s",
	"fields": [
	"title_en",
	"slug_text"
	],
	"type": "best_fields",
	"boost": 2
	}
	},
	{
	"multi_match": {
	"query": "%(query_text)s",
	"fields": [
	"excerpt_en",
	"description_en",
	"taxonomy.plugin_tags.name"
	],
	"type": "best_fields",
	"boost": 2
	}
	},
	{
	"multi_match": {
	"query": "%(query_text)s",
	"fields": [
	"author",
	"contributors"
	],
	"type": "best_fields",
	"boost": 2
	}
	}
	]
	}
	},
	"functions": [
	{
	"exp": {
	"plugin_modified": {
	"origin": "2017-01-20",
	"offset": "180d",
	"scale": "360d",
	"decay": 0.5
	}
	}
	},
	{
	"exp": {
	"tested": {
	"origin": "4.7",
	"offset": 0.1,
	"scale": 0.4,
	"decay": 0.6
	}
	}
	},
	{
	"field_value_factor": {
	"field": "active_installs",
	"factor": 0.375,
	"modifier": "log2p",
	"missing": 1
	}
	},
	{
	"exp": {
	"active_installs": {
	"origin": 1000000,
	"offset": 0,
	"scale": 900000,
	"decay": 0.75
	}
	}
	},
	{
	"field_value_factor": {
	"field": "support_threads_resolved",
	"factor": 0.25,
	"modifier": "log2p",
	"missing": 0.5
	}
	},
	{
	"field_value_factor": {
	"field": "rating",
	"factor": 0.25,
	"modifier": "sqrt",
	"missing": 2.5
	}
	}
	],
	"boost_mode": "multiply"
	}
	}
	}
	},
	"sort": [
	{
	"_score": {
	"order": "desc"
	}
	}
	],
	"filter": {
	"and": [
	{
	"term": {
	"disabled": {
	"value": false
	}
	}
	}
	]
	}
	}''' % {'query_text': text})