Skip to content

Instantly share code, notes, and snippets.

@gibrown
Created April 10, 2017 17:01
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gibrown/9b54444cb23fb61f4e6513a45163e98c to your computer and use it in GitHub Desktop.
Save gibrown/9b54444cb23fb61f4e6513a45163e98c to your computer and use it in GitHub Desktop.
Querying wp.org plugin search
#!/bin/python
import sys
import org_search as org
import pprint
import csv
# Configuration
lang = 'en_US'
min_count = 1000

# Two positional arguments are required: the query file, then the output file.
if len( sys.argv ) < 3:
    sys.exit( 'need to specify query file and output file' )
query_file, out_file = sys.argv[1:3]

# Header row of the output file; calc_search_details() returns its values
# in this same order.
out_fields = [
    'query', 'count', 'total_results',
    'install_cnt_4', 'sup_thr_cnt_4', 'sup_res_cnt_4',
    'install_ratio_4', 'resolve_ratio_4',
    'install_cnt_14', 'sup_thr_cnt_14', 'sup_res_cnt_14',
    'results',
]

# Pretty-printer kept around for ad-hoc inspection of responses.
pp = pprint.PrettyPrinter( indent=2 )
def get_val( v ):
    """Return the first element when ``v`` is a list, otherwise ``v`` itself."""
    return v[0] if isinstance( v, list ) else v
def calc_search_details( r, query, q_count ):
    """Summarise one search response as a list of values in ``out_fields`` order.

    Parameters:
        r: search response; ``r['total']`` and ``r['hits']`` are read, and each
            hit must carry ``hit['fields']`` with ``active_installs``,
            ``support_threads``, ``support_threads_resolved`` and ``slug``
            (each either a scalar or a list whose first element is used).
        query: the query text, copied into the row unchanged.
        q_count: the count associated with the query; used as the divisor of
            both ratio columns.

    Returns:
        [query, count, total_results, install_cnt_4, sup_thr_cnt_4,
        sup_res_cnt_4, install_ratio_4, resolve_ratio_4, install_cnt_14,
        sup_thr_cnt_14, sup_res_cnt_14, results].

    Raises:
        KeyError: if the response or a hit lacks one of the keys above.
    """
    metrics = ( 'active_installs', 'support_threads', 'support_threads_resolved' )
    top_4 = dict.fromkeys( metrics, 0 )
    top_14 = dict.fromkeys( metrics, 0 )
    slugs = []

    for rank, hit in enumerate( r['hits'] ):
        if rank >= 14:
            # Only the first 14 hits contribute to any column.
            break
        fields = hit['fields']
        for name in metrics:
            value = get_val( fields[name] )
            top_14[name] += value
            if rank < 4:
                top_4[name] += value
        slugs.append( get_val( fields['slug'] ) )

    # Every slug is prefixed with ',' (so the column starts with a comma);
    # kept as-is so existing consumers of the output keep working.
    results = ''.join( ',' + slug for slug in slugs )

    # Both ratios are "top-4 total per query count". A zero count would
    # otherwise raise ZeroDivisionError and abort the whole run.
    if q_count:
        install_ratio_4 = float( top_4['active_installs'] ) / q_count
        resolve_ratio_4 = float( top_4['support_threads_resolved'] ) / q_count
    else:
        install_ratio_4 = 0.0
        resolve_ratio_4 = 0.0

    return [
        query,
        q_count,
        r['total'],
        top_4['active_installs'],
        top_4['support_threads'],
        top_4['support_threads_resolved'],
        install_ratio_4,
        resolve_ratio_4,
        top_14['active_installs'],
        top_14['support_threads'],
        top_14['support_threads_resolved'],
        results,
    ]
# Run every query from the query file through the search and write one
# summary row per query to the tab-separated output file.

# The csv module wants binary-mode files on Python 2, but text-mode files
# opened with newline='' on Python 3.
if sys.version_info[0] < 3:
    read_mode, write_mode, open_kwargs = 'rb', 'wb', {}
else:
    read_mode, write_mode, open_kwargs = 'r', 'w', {'newline': ''}

# `with` closes both files even if a search request or a malformed row raises.
with open( out_file, write_mode, **open_kwargs ) as outfile:
    datawriter = csv.writer( outfile, delimiter="\t" )
    datawriter.writerow( out_fields )
    with open( query_file, read_mode, **open_kwargs ) as csvfile:
        rdr = csv.DictReader( csvfile, fieldnames=['cnt', 'query', 'lang'], delimiter="\t", quotechar='"', escapechar='\\', doublequote=False )
        for j, row in enumerate( rdr ):
            # Progress indicator: zero-based index of the row being processed.
            print( j )
            #r = org.plugin_search_experiment( row['query'] )
            r = org.plugin_search_current( row['query'] )
            d = calc_search_details( r, row['query'], int( row['cnt'] ) )
            datawriter.writerow( d )
#r = org.plugin_search( 'social' )
#pp = pprint.PrettyPrinter(indent=2)
#pp.pprint(r)
#!/bin/python
import requests
#I've been working on some sketchy connections...
# Intent: have requests retry a failed connection up to two times.
# NOTE(review): HTTPAdapter takes its max_retries default from this constant
# when requests is imported, so reassigning it afterwards may have no effect --
# confirm; mounting HTTPAdapter(max_retries=2) on a Session is the documented way.
requests.adapters.DEFAULT_RETRIES = 2
def query_es( query ):
    """Send ``query`` to the search backend in use and return its result.

    The backend is currently the remote endpoint; point ``backend`` at
    query_local_es to run against a local index instead.
    """
    backend = query_remote_es
    return backend( query )
def query_local_es( query, timeout=30 ):
    """POST ``query`` to the local wp-plugins index and return its ``hits`` object.

    Parameters:
        query: request body (a JSON string) sent as-is.
        timeout: seconds to wait for the server before giving up; without a
            timeout requests can block forever on a stalled connection.

    Returns:
        The ``hits`` member of the decoded JSON response.

    Raises:
        requests.exceptions.HTTPError: on a 4xx/5xx response.
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    r = requests.post( "http://localhost:9200/wp-plugins/post/_search", data=query, timeout=timeout )
    r.raise_for_status()
    return r.json()['hits']
def query_remote_es( query, timeout=30 ):
    """POST ``query`` to the WordPress.com search endpoint and return its ``results``.

    Parameters:
        query: request body (a JSON string) sent as-is.
        timeout: seconds to wait for the server before giving up; without a
            timeout requests can block forever on a stalled connection.

    Returns:
        The ``results`` member of the decoded JSON response.

    Raises:
        requests.exceptions.HTTPError: on a 4xx/5xx response.
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    r = requests.post( "https://public-api.wordpress.com/rest/v1/sites/108986944/search", data=query, timeout=timeout )
    r.raise_for_status()
    return r.json()['results']
#other stuff to try
# query expansion
# boosting resolved threads
# boosting install count more
# Look at support thread resolved percentage.
# Look at contributor install count
# query expansion by running a sig terms on that query, and then running a second query with expanded terms
#this tries:
# - boosting installs more
# - boosting resolved threads more
# - max boost (tried at 1.3, but wow, bad results, maybe try much higher, look at scores)
# - increasing the penalty for no updates
# - decreasing the penalty for not tested with the latest version
def plugin_search_experiment( text ):
    """Run the experimental plugin search for ``text`` and return query_es()'s result.

    Requests the top 14 hits with the slug, active_installs, support_threads
    and support_threads_resolved fields, scored by a function_score query.

    NOTE(review): in this file the request body is the same as the one in
    plugin_search_current; tune the values here when running an experiment.

    Parameters:
        text: the user's search text. It is JSON-escaped before being placed
            in the request, so quotes and backslashes in a query cannot break
            or alter the JSON body.
    """
    import json

    # json.dumps() yields a quoted JSON string; drop the surrounding quotes
    # because the template already supplies them around each placeholder.
    escaped_text = json.dumps( text )[1:-1]
    return query_es('''{
  "size": 14,
  "from": 0,
  "fields": [
    "slug","active_installs","support_threads","support_threads_resolved"
  ],
  "query": {
    "filtered": {
      "query": {
        "function_score": {
          "query": {
            "bool": {
              "must": {
                "multi_match": {
                  "query": "%(query_text)s",
                  "fields": "all_content_en",
                  "boost": 0.1,
                  "operator": "and"
                }
              },
              "should": [
                {
                  "multi_match": {
                    "query": "%(query_text)s",
                    "fields": [
                      "title_en",
                      "excerpt_en",
                      "description_en",
                      "taxonomy.plugin_tags.name"
                    ],
                    "type": "phrase",
                    "boost": 2
                  }
                },
                {
                  "multi_match": {
                    "query": "%(query_text)s",
                    "fields": [
                      "title_en.ngram"
                    ],
                    "type": "phrase",
                    "boost": 0.2
                  }
                },
                {
                  "multi_match": {
                    "query": "%(query_text)s",
                    "fields": [
                      "title_en",
                      "slug_text"
                    ],
                    "type": "best_fields",
                    "boost": 2
                  }
                },
                {
                  "multi_match": {
                    "query": "%(query_text)s",
                    "fields": [
                      "excerpt_en",
                      "description_en",
                      "taxonomy.plugin_tags.name"
                    ],
                    "type": "best_fields",
                    "boost": 2
                  }
                },
                {
                  "multi_match": {
                    "query": "%(query_text)s",
                    "fields": [
                      "author",
                      "contributors"
                    ],
                    "type": "best_fields",
                    "boost": 2
                  }
                }
              ]
            }
          },
          "functions": [
            {
              "exp": {
                "plugin_modified": {
                  "origin": "2017-01-20",
                  "offset": "180d",
                  "scale": "360d",
                  "decay": 0.5
                }
              }
            },
            {
              "exp": {
                "tested": {
                  "origin": "4.7",
                  "offset": 0.1,
                  "scale": 0.4,
                  "decay": 0.6
                }
              }
            },
            {
              "field_value_factor": {
                "field": "active_installs",
                "factor": 0.375,
                "modifier": "log2p",
                "missing": 1
              }
            },
            {
              "exp": {
                "active_installs": {
                  "origin": 1000000,
                  "offset": 0,
                  "scale": 900000,
                  "decay": 0.75
                }
              }
            },
            {
              "field_value_factor": {
                "field": "support_threads_resolved",
                "factor": 0.25,
                "modifier": "log2p",
                "missing": 0.5
              }
            },
            {
              "field_value_factor": {
                "field": "rating",
                "factor": 0.25,
                "modifier": "sqrt",
                "missing": 2.5
              }
            }
          ],
          "boost_mode": "multiply"
        }
      }
    }
  },
  "sort": [
    {
      "_score": {
        "order": "desc"
      }
    }
  ],
  "filter": {
    "and": [
      {
        "term": {
          "disabled": {
            "value": false
          }
        }
      }
    ]
  }
}
''' % {'query_text': escaped_text})
def plugin_search_current( text ):
    """Run the current plugin search for ``text`` and return query_es()'s result.

    Requests the top 14 hits with the slug, active_installs, support_threads
    and support_threads_resolved fields, scored by a function_score query.

    Parameters:
        text: the user's search text. It is JSON-escaped before being placed
            in the request, so quotes and backslashes in a query cannot break
            or alter the JSON body.
    """
    import json

    # json.dumps() yields a quoted JSON string; drop the surrounding quotes
    # because the template already supplies them around each placeholder.
    escaped_text = json.dumps( text )[1:-1]
    return query_es('''{
  "size": 14,
  "from": 0,
  "fields": [
    "slug","active_installs","support_threads","support_threads_resolved"
  ],
  "query": {
    "filtered": {
      "query": {
        "function_score": {
          "query": {
            "bool": {
              "must": {
                "multi_match": {
                  "query": "%(query_text)s",
                  "fields": "all_content_en",
                  "boost": 0.1,
                  "operator": "and"
                }
              },
              "should": [
                {
                  "multi_match": {
                    "query": "%(query_text)s",
                    "fields": [
                      "title_en",
                      "excerpt_en",
                      "description_en",
                      "taxonomy.plugin_tags.name"
                    ],
                    "type": "phrase",
                    "boost": 2
                  }
                },
                {
                  "multi_match": {
                    "query": "%(query_text)s",
                    "fields": [
                      "title_en.ngram"
                    ],
                    "type": "phrase",
                    "boost": 0.2
                  }
                },
                {
                  "multi_match": {
                    "query": "%(query_text)s",
                    "fields": [
                      "title_en",
                      "slug_text"
                    ],
                    "type": "best_fields",
                    "boost": 2
                  }
                },
                {
                  "multi_match": {
                    "query": "%(query_text)s",
                    "fields": [
                      "excerpt_en",
                      "description_en",
                      "taxonomy.plugin_tags.name"
                    ],
                    "type": "best_fields",
                    "boost": 2
                  }
                },
                {
                  "multi_match": {
                    "query": "%(query_text)s",
                    "fields": [
                      "author",
                      "contributors"
                    ],
                    "type": "best_fields",
                    "boost": 2
                  }
                }
              ]
            }
          },
          "functions": [
            {
              "exp": {
                "plugin_modified": {
                  "origin": "2017-01-20",
                  "offset": "180d",
                  "scale": "360d",
                  "decay": 0.5
                }
              }
            },
            {
              "exp": {
                "tested": {
                  "origin": "4.7",
                  "offset": 0.1,
                  "scale": 0.4,
                  "decay": 0.6
                }
              }
            },
            {
              "field_value_factor": {
                "field": "active_installs",
                "factor": 0.375,
                "modifier": "log2p",
                "missing": 1
              }
            },
            {
              "exp": {
                "active_installs": {
                  "origin": 1000000,
                  "offset": 0,
                  "scale": 900000,
                  "decay": 0.75
                }
              }
            },
            {
              "field_value_factor": {
                "field": "support_threads_resolved",
                "factor": 0.25,
                "modifier": "log2p",
                "missing": 0.5
              }
            },
            {
              "field_value_factor": {
                "field": "rating",
                "factor": 0.25,
                "modifier": "sqrt",
                "missing": 2.5
              }
            }
          ],
          "boost_mode": "multiply"
        }
      }
    }
  },
  "sort": [
    {
      "_score": {
        "order": "desc"
      }
    }
  ],
  "filter": {
    "and": [
      {
        "term": {
          "disabled": {
            "value": false
          }
        }
      }
    ]
  }
}''' % {'query_text': escaped_text})
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
30615 backup en_US
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment