Skip to content

Instantly share code, notes, and snippets.

@Tattoo
Last active August 29, 2015 14:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Tattoo/6c5ba0b8874aa94e5c1c to your computer and use it in GitHub Desktop.
Save Tattoo/6c5ba0b8874aa94e5c1c to your computer and use it in GitHub Desktop.
keywords.py
'''
The Keywords table must be the last table in each suite and resource file,
because the grep command below captures everything that follows the table
header. Cleverer grepping (or `sed` etc.) is also fine, as long as its output
has the same format as the output of the grep command below.
$ grep -irn -A 5000 '* Keywords *' path/to/tests > my_file
$ python keywords.py my_file [how_many_scores]
or one-liner:
$ grep -irn -A 5000 '* Keywords *' path/to/tests | python keywords.py - [how_many_scores]
`how_many_scores` is either a number (how many of the highest scores to print)
or a range written like a Python slice:
$ python keywords.py my_file 1 # prints matching keywords with highest
# score
$ python keywords.py my_file 1:3 # prints matching keywords with second
# and third highest scores
$ python keywords.py my_file 2:5:2 # prints matching keywords with third
# and fifth highest scores, skipping
# fourth highest
'''
import re
import sys
from collections import namedtuple
try:
    from fuzzywuzzy import fuzz
except ImportError:
    # fuzzywuzzy is a third-party package; exit with a hint instead of a
    # traceback. The parenthesised single-argument form prints the same text
    # on Python 2 and Python 3.
    print('Please install fuzzywuzzy https://github.com/seatgeek/fuzzywuzzy')
    sys.exit(1)
def extract(content):
    """Yield ``(filename, lineno, text)`` for each matching input line.

    ``content`` is an iterable of text lines. A line matches when it has the
    form ``<filename>-<lineno>-<text>``, where ``<lineno>`` is a run of
    digits; all three parts are yielded as strings. Lines that do not have
    this form are silently skipped.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    pattern = re.compile(r'^(.*?)-(\d+)-(.*)$')
    for line in content:
        # Drop the line terminator, including the '\r' of CRLF input, which
        # '.' would otherwise capture as part of the text.
        m = pattern.match(line.strip('\r\n'))
        if m:
            yield m.groups()
def extract_kw_information(content):
    """Collect keyword names and their locations from the input lines.

    ``content`` is passed to ``extract()``. For every extracted line whose
    text is non-empty and starts neither with ``#`` nor with whitespace, a
    record is kept.

    Returns a ``(data, all_keywords)`` pair of parallel lists: ``data`` holds
    namedtuples with fields ``kw_name``, ``filename`` and ``lineno``;
    ``all_keywords`` holds just the keyword names in the same order.
    """
    record = namedtuple('data', ['kw_name', 'filename', 'lineno'])
    # Comment lines and indented lines are not keyword names. Tabs are
    # skipped as well as spaces, so tab-indented lines are not mistaken for
    # names. NOTE(review): assumes keyword names always start at column 0.
    skipped_prefixes = ('#', ' ', '\t')
    data = []
    all_keywords = []
    for filename, lineno, kw_name in extract(content):
        if not kw_name or kw_name.startswith(skipped_prefixes):
            continue
        data.append(record(kw_name, filename, lineno))
        all_keywords.append(kw_name)
    return data, all_keywords
def calculate_scores(keywords, data):
    """Find, for every keyword, the other keyword(s) it resembles most.

    ``keywords`` is a list of keyword names and ``data`` a parallel list of
    records (same length, same order). Each keyword is compared against every
    other one with ``fuzz.token_sort_ratio``; only the comparison(s) reaching
    that keyword's highest score are kept.

    Returns a dict mapping a score to a list of
    ``{'kws': (record, other_record), 'score': score}`` dicts. A keyword that
    has nothing to be compared with (fewer than two keywords in total)
    contributes no entry.
    """
    scores = {}
    for index, kw_name in enumerate(keywords):
        best_score = -1
        best_records = []
        for another_index, another_kw_name in enumerate(keywords):
            if another_index == index:
                continue
            new_score = fuzz.token_sort_ratio(kw_name, another_kw_name)
            if new_score < best_score:
                continue
            record = {'kws': (data[index], data[another_index]),
                      'score': new_score}
            if new_score > best_score:
                best_score = new_score
                best_records = [record]
            else:
                # Tie with the current best: report this match too.
                best_records.append(record)
        # With a single keyword the inner loop compares nothing; storing an
        # empty result under the placeholder score -1 would be meaningless.
        if best_records:
            scores.setdefault(best_score, []).extend(best_records)
    return scores
def print_out(scores, number_range):
    """Print the keyword pairs belonging to the selected scores.

    ``scores`` maps a score to a list of
    ``{'kws': (record, other_record), 'score': score}`` dicts, where each
    record is a ``(kw_name, filename, lineno)`` namedtuple.

    ``number_range`` is a sequence of slice arguments applied to the scores
    sorted from highest to lowest: a single item ``n`` selects the ``n``
    highest scores, otherwise the items are passed to ``slice()`` as they are.
    Within one score, pairs are printed ordered by the first keyword's name.
    """
    if len(number_range) == 1:
        selection = slice(0, number_range[0])
    else:
        selection = slice(*number_range)
    for key in sorted(scores, reverse=True)[selection]:
        # key= works on both Python 2 and Python 3 (a comparison function
        # argument and cmp() exist only on Python 2); the sort is stable, so
        # the resulting order is the same. A missing (None) record list is
        # treated as empty.
        records = sorted(scores[key] or (),
                         key=lambda rec: rec['kws'][0].kw_name)
        for record in records:
            kw1, kw2 = record['kws']
            print('Keyword "%s"\nfrom %s:%s seems similar to:\n"%s"\nfrom '
                  '%s:%s with score %d\n' % (kw1 + kw2 + (record['score'],)))
def parse_how_many(number_or_range):
    """Turn the ``how_many_scores`` argument into slice arguments.

    ``number_or_range`` is an int, a numeric string such as ``'3'``, or a
    colon-separated range such as ``'1:3'`` or ``'2:5:2'``. Fields of a range
    may be left empty (``':3'``, ``'2:'``, ``'::2'``); an empty field becomes
    ``None``, as in a Python slice.

    Returns a one-element tuple for a plain number, otherwise a list with one
    entry per range field.

    Raises ``ValueError`` when the value is neither a number nor a valid
    range, or when a range has more than three fields.
    """
    try:
        return (int(number_or_range),)
    except ValueError:
        if ':' not in number_or_range:
            raise
    parts = number_or_range.split(':')
    if len(parts) > 3:
        # slice() accepts at most start, stop and step.
        raise ValueError('expected at most start:stop:step, got %r'
                         % (number_or_range,))
    return [int(part) if part.strip() else None for part in parts]
def main(path, how_many=10, *ignored):
    """Read the input, score the keywords and print the selected results.

    ``path`` names the file to read, or ``'-'`` to read standard input.
    ``how_many`` is handed to ``parse_how_many()`` to choose which scores are
    printed. Any further positional arguments are accepted and ignored.
    """
    # Parsed before any input is read, so a malformed value fails first.
    selection = parse_how_many(how_many)
    if path != '-':
        with open(path, 'r') as source:
            lines = source.readlines()
    else:
        lines = sys.stdin.readlines()
    records, names = extract_kw_information(lines)
    print_out(calculate_scores(names, records), selection)
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    if not cli_args:
        # main() requires the input path; show the usage notes from the
        # module docstring instead of failing with a TypeError traceback.
        # sys.exit() with a string writes it to stderr and exits with 1.
        sys.exit(__doc__ or
                 'usage: python keywords.py my_file [how_many_scores]')
    main(*cli_args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment