Created
April 10, 2017 17:01
-
-
Save gibrown/9b54444cb23fb61f4e6513a45163e98c to your computer and use it in GitHub Desktop.
Querying wp.org plugin search
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/python
import sys | |
import org_search as org | |
import pprint | |
import csv | |
# Configuration
lang = 'en_US'
min_count = 1000

# Usage: <script> <query file> <output file>; both arguments are required.
if len(sys.argv) < 3:
    sys.exit( 'need to specify query file and output file' )
query_file, out_file = sys.argv[1:3]

# Header row of the output file. calc_search_details() returns its values
# in exactly this order, so the two must be kept in sync.
out_fields = [
    'query', 'count', 'total_results',
    'install_cnt_4', 'sup_thr_cnt_4', 'sup_res_cnt_4',
    'install_ratio_4', 'resolve_ratio_4',
    'install_cnt_14', 'sup_thr_cnt_14', 'sup_res_cnt_14',
    'results',
]

# Pretty-printer kept around for dumping raw responses while debugging.
pp = pprint.PrettyPrinter(indent=2)
def get_val( v ):
    """Return the first element when v is a list, otherwise v itself.

    Only real ``list`` instances are unwrapped; tuples and other
    containers are handed back unchanged.
    """
    return v[0] if isinstance( v, list ) else v
def calc_search_details( r, query, q_count ):
    """Summarise one search response as a flat row for the output file.

    Parameters:
        r: search response. Must provide ``r['total']`` and ``r['hits']``;
            every hit needs ``hit['fields']`` with the keys
            'active_installs', 'support_threads',
            'support_threads_resolved' and 'slug'. Field values may be
            bare values or lists (see get_val()).
        query: the search text, copied into the row unchanged.
        q_count: count associated with the query; used as the denominator
            of the two ratio columns.

    Returns:
        A list ordered like ``out_fields``: query, count, total_results,
        install_cnt_4, sup_thr_cnt_4, sup_res_cnt_4, install_ratio_4,
        resolve_ratio_4, install_cnt_14, sup_thr_cnt_14, sup_res_cnt_14,
        results.

    Raises:
        KeyError: if the response or a hit lacks one of the keys above.
    """
    install_cnt_4 = sup_thr_cnt_4 = sup_res_cnt_4 = 0
    install_cnt_14 = sup_thr_cnt_14 = sup_res_cnt_14 = 0
    slugs = []

    for i, hit in enumerate( r['hits'] ):
        # Nothing past the 14th hit contributes to any column.
        # NOTE(review): assumes the slug list shares the top-14 window with
        # the *_14 counters; the searches request 14 hits, so this only
        # matters if that size is ever raised.
        if i >= 14:
            break

        fields = hit['fields']
        installs = get_val( fields['active_installs'] )
        threads = get_val( fields['support_threads'] )
        resolved = get_val( fields['support_threads_resolved'] )

        if i < 4:
            install_cnt_4 += installs
            sup_thr_cnt_4 += threads
            sup_res_cnt_4 += resolved

        install_cnt_14 += installs
        sup_thr_cnt_14 += threads
        sup_res_cnt_14 += resolved
        slugs.append( get_val( fields['slug'] ) )

    # A zero count used to raise ZeroDivisionError and abort the whole run;
    # report both ratios as 0.0 instead.
    if q_count:
        install_ratio_4 = float( install_cnt_4 ) / q_count
        resolve_ratio_4 = float( sup_res_cnt_4 ) / q_count
    else:
        install_ratio_4 = resolve_ratio_4 = 0.0

    # Every slug is prefixed with ',' (so the column starts with a comma),
    # which keeps the format of previously generated output files.
    results = ''.join( [ ',' + slug for slug in slugs ] )

    return [
        query,
        q_count,
        r['total'],
        install_cnt_4,
        sup_thr_cnt_4,
        sup_res_cnt_4,
        install_ratio_4,
        resolve_ratio_4,
        install_cnt_14,
        sup_thr_cnt_14,
        sup_res_cnt_14,
        results,
    ]
# Run every query from the input file through the search API and write one
# tab-separated summary row per query.
#
# Both files are managed by `with`, so the output is flushed and closed even
# when a request fails or an input row is malformed part-way through the run.
# The binary modes are what the Python 2 csv module expects.
with open(out_file, 'wb') as outfile:
    datawriter = csv.writer(outfile, delimiter="\t")
    datawriter.writerow(out_fields)

    with open(query_file, 'rb') as csvfile:
        # Input columns: count, query text, language (tab separated, no header row).
        rdr = csv.DictReader(
            csvfile,
            fieldnames=['cnt', 'query', 'lang'],
            delimiter="\t",
            quotechar='"',
            escapechar='\\',
            doublequote=False,
        )
        for j, row in enumerate(rdr):
            # Progress indicator: index of the query being processed.
            print(j)
            # Swap in org.plugin_search_experiment( row['query'] ) to score
            # the experimental ranking instead of the current one.
            r = org.plugin_search_current( row['query'] )
            d = calc_search_details( r, row['query'], int(row['cnt']) )
            datawriter.writerow(d)
#r = org.plugin_search( 'social' ) | |
#pp = pprint.PrettyPrinter(indent=2) | |
#pp.pprint(r) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/python | |
import requests | |
#I've been working on some sketchy connections...
# Retries must be configured on an HTTPAdapter mounted on a Session.
# Assigning requests.adapters.DEFAULT_RETRIES after import does not reach the
# adapters that requests.post() creates, so that assignment never enabled any
# retries and has been dropped. Per the requests documentation, max_retries
# covers failed DNS lookups, socket connections and connection timeouts only,
# never requests whose data already reached the server.
MAX_RETRIES = 2

_session = requests.Session()
for _scheme in ( 'http://', 'https://' ):
    _session.mount( _scheme, requests.adapters.HTTPAdapter( max_retries=MAX_RETRIES ) )


def query_es( query ):
    """Send a search request body to the default backend (the remote API).

    Parameters:
        query: request body, passed through unchanged.

    Returns:
        Whatever query_remote_es() returns.
    """
    return query_remote_es( query )


def query_local_es( query ):
    """POST query to the Elasticsearch index on localhost:9200.

    Returns:
        The 'hits' member of the decoded JSON response.

    Raises:
        requests.HTTPError: on a 4xx/5xx response (via raise_for_status()).
    """
    r = _session.post("http://localhost:9200/wp-plugins/post/_search", data = query)
    r.raise_for_status()
    return r.json()['hits']


def query_remote_es( query ):
    """POST query to the public-api.wordpress.com search endpoint.

    Returns:
        The 'results' member of the decoded JSON response.

    Raises:
        requests.HTTPError: on a 4xx/5xx response (via raise_for_status()).
    """
    r = _session.post("https://public-api.wordpress.com/rest/v1/sites/108986944/search", data = query)
    r.raise_for_status()
    return r.json()['results']
# Other things to try:
#  - query expansion
#  - boosting resolved threads
#  - boosting install count more
#  - looking at the support-thread resolved percentage
#  - looking at the contributor install count
#  - query expansion by running a significant-terms ("sig terms") aggregation
#    on the query, then running a second query with the expanded terms
#
# The experiment below tries:
#  - boosting installs more
#  - boosting resolved threads more
#  - max boost (tried at 1.3, but wow, bad results; maybe try much higher and
#    look at the scores)
#  - increasing the penalty for no updates
#  - decreasing the penalty for not being tested with the latest version
def plugin_search_experiment( text ):
    """Run the experimental plugin-search query for text.

    Builds the JSON request body below (14 hits, returning the slug,
    active_installs, support_threads and support_threads_resolved fields)
    and sends it through query_es().

    Parameters:
        text: the user's search text.

    Returns:
        Whatever query_es() returns for the request.
    """
    import json

    # json.dumps() produces a complete, escaped JSON string literal (quotes
    # included). The text used to be pasted raw between quotes, so a query
    # containing '"' or '\\' yielded invalid or altered JSON.
    return query_es('''{
  "size": 14,
  "from": 0,
  "fields": [
    "slug","active_installs","support_threads","support_threads_resolved"
  ],
  "query": {
    "filtered": {
      "query": {
        "function_score": {
          "query": {
            "bool": {
              "must": {
                "multi_match": {
                  "query": %(query_text)s,
                  "fields": "all_content_en",
                  "boost": 0.1,
                  "operator": "and"
                }
              },
              "should": [
                {
                  "multi_match": {
                    "query": %(query_text)s,
                    "fields": [
                      "title_en",
                      "excerpt_en",
                      "description_en",
                      "taxonomy.plugin_tags.name"
                    ],
                    "type": "phrase",
                    "boost": 2
                  }
                },
                {
                  "multi_match": {
                    "query": %(query_text)s,
                    "fields": [
                      "title_en.ngram"
                    ],
                    "type": "phrase",
                    "boost": 0.2
                  }
                },
                {
                  "multi_match": {
                    "query": %(query_text)s,
                    "fields": [
                      "title_en",
                      "slug_text"
                    ],
                    "type": "best_fields",
                    "boost": 2
                  }
                },
                {
                  "multi_match": {
                    "query": %(query_text)s,
                    "fields": [
                      "excerpt_en",
                      "description_en",
                      "taxonomy.plugin_tags.name"
                    ],
                    "type": "best_fields",
                    "boost": 2
                  }
                },
                {
                  "multi_match": {
                    "query": %(query_text)s,
                    "fields": [
                      "author",
                      "contributors"
                    ],
                    "type": "best_fields",
                    "boost": 2
                  }
                }
              ]
            }
          },
          "functions": [
            {
              "exp": {
                "plugin_modified": {
                  "origin": "2017-01-20",
                  "offset": "180d",
                  "scale": "360d",
                  "decay": 0.5
                }
              }
            },
            {
              "exp": {
                "tested": {
                  "origin": "4.7",
                  "offset": 0.1,
                  "scale": 0.4,
                  "decay": 0.6
                }
              }
            },
            {
              "field_value_factor": {
                "field": "active_installs",
                "factor": 0.375,
                "modifier": "log2p",
                "missing": 1
              }
            },
            {
              "exp": {
                "active_installs": {
                  "origin": 1000000,
                  "offset": 0,
                  "scale": 900000,
                  "decay": 0.75
                }
              }
            },
            {
              "field_value_factor": {
                "field": "support_threads_resolved",
                "factor": 0.25,
                "modifier": "log2p",
                "missing": 0.5
              }
            },
            {
              "field_value_factor": {
                "field": "rating",
                "factor": 0.25,
                "modifier": "sqrt",
                "missing": 2.5
              }
            }
          ],
          "boost_mode": "multiply"
        }
      }
    }
  },
  "sort": [
    {
      "_score": {
        "order": "desc"
      }
    }
  ],
  "filter": {
    "and": [
      {
        "term": {
          "disabled": {
            "value": false
          }
        }
      }
    ]
  }
}
''' % {'query_text': json.dumps( text )})
def plugin_search_current( text ):
    """Run the current plugin-search query for text.

    Builds the JSON request body below (14 hits, returning the slug,
    active_installs, support_threads and support_threads_resolved fields)
    and sends it through query_es().

    Parameters:
        text: the user's search text.

    Returns:
        Whatever query_es() returns for the request.
    """
    import json

    # json.dumps() produces a complete, escaped JSON string literal (quotes
    # included). The text used to be pasted raw between quotes, so a query
    # containing '"' or '\\' yielded invalid or altered JSON.
    return query_es('''{
  "size": 14,
  "from": 0,
  "fields": [
    "slug","active_installs","support_threads","support_threads_resolved"
  ],
  "query": {
    "filtered": {
      "query": {
        "function_score": {
          "query": {
            "bool": {
              "must": {
                "multi_match": {
                  "query": %(query_text)s,
                  "fields": "all_content_en",
                  "boost": 0.1,
                  "operator": "and"
                }
              },
              "should": [
                {
                  "multi_match": {
                    "query": %(query_text)s,
                    "fields": [
                      "title_en",
                      "excerpt_en",
                      "description_en",
                      "taxonomy.plugin_tags.name"
                    ],
                    "type": "phrase",
                    "boost": 2
                  }
                },
                {
                  "multi_match": {
                    "query": %(query_text)s,
                    "fields": [
                      "title_en.ngram"
                    ],
                    "type": "phrase",
                    "boost": 0.2
                  }
                },
                {
                  "multi_match": {
                    "query": %(query_text)s,
                    "fields": [
                      "title_en",
                      "slug_text"
                    ],
                    "type": "best_fields",
                    "boost": 2
                  }
                },
                {
                  "multi_match": {
                    "query": %(query_text)s,
                    "fields": [
                      "excerpt_en",
                      "description_en",
                      "taxonomy.plugin_tags.name"
                    ],
                    "type": "best_fields",
                    "boost": 2
                  }
                },
                {
                  "multi_match": {
                    "query": %(query_text)s,
                    "fields": [
                      "author",
                      "contributors"
                    ],
                    "type": "best_fields",
                    "boost": 2
                  }
                }
              ]
            }
          },
          "functions": [
            {
              "exp": {
                "plugin_modified": {
                  "origin": "2017-01-20",
                  "offset": "180d",
                  "scale": "360d",
                  "decay": 0.5
                }
              }
            },
            {
              "exp": {
                "tested": {
                  "origin": "4.7",
                  "offset": 0.1,
                  "scale": 0.4,
                  "decay": 0.6
                }
              }
            },
            {
              "field_value_factor": {
                "field": "active_installs",
                "factor": 0.375,
                "modifier": "log2p",
                "missing": 1
              }
            },
            {
              "exp": {
                "active_installs": {
                  "origin": 1000000,
                  "offset": 0,
                  "scale": 900000,
                  "decay": 0.75
                }
              }
            },
            {
              "field_value_factor": {
                "field": "support_threads_resolved",
                "factor": 0.25,
                "modifier": "log2p",
                "missing": 0.5
              }
            },
            {
              "field_value_factor": {
                "field": "rating",
                "factor": 0.25,
                "modifier": "sqrt",
                "missing": 2.5
              }
            }
          ],
          "boost_mode": "multiply"
        }
      }
    }
  },
  "sort": [
    {
      "_score": {
        "order": "desc"
      }
    }
  ],
  "filter": {
    "and": [
      {
        "term": {
          "disabled": {
            "value": false
          }
        }
      }
    ]
  }
}''' % {'query_text': json.dumps( text )})
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
30615 backup en_US |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment