Last active
August 29, 2015 14:22
-
-
Save Tattoo/6c5ba0b8874aa94e5c1c to your computer and use it in GitHub Desktop.
keywords.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
You need to have the Keywords table as the last table inside the suite and
resource files, or use more clever grepping than presented below. `sed` etc. is
also fine, as long as its output is the same as that of the grep command below.

    $ grep -irn -A 5000 '* Keywords *' path/to/tests > my_file
    $ python keywords.py my_file [how_many_scores]

or as a one-liner:

    $ grep -irn -A 5000 '* Keywords *' path/to/tests | python keywords.py - [how_many_scores]

`how_many_scores` supports Python's list slicing syntax:

    $ python keywords.py my_file 1      # prints matching keywords with highest
                                        # score
    $ python keywords.py my_file 1:3    # prints matching keywords with second
                                        # and third highest scores
    $ python keywords.py my_file 2:5:2  # prints matching keywords with third
                                        # and fifth highest scores, skipping
                                        # fourth highest
'''
import re | |
import sys | |
from collections import namedtuple | |
# fuzzywuzzy is the only third-party dependency; without it the script cannot
# score anything, so bail out with an installation hint.
try:
    from fuzzywuzzy import fuzz
except ImportError:
    # A parenthesised single-argument print behaves the same as a Python 2
    # statement and as a Python 3 function call.
    print('Please install fuzzywuzzy https://github.com/seatgeek/fuzzywuzzy')
    sys.exit(1)
def extract(content):
    '''Yield ``(prefix, digits, rest)`` string tuples parsed from *content*.

    Each item of *content* is one line of text. Only lines shaped like
    ``<prefix>-<digits>-<rest>`` are yielded; every other line is silently
    skipped. The prefix is matched non-greedily, so it ends at the first
    ``-<digits>-`` occurrence in the line.

    NOTE(review): this is presumably the shape of the context lines printed
    by the ``grep -n -A`` command from the module docstring (file name, line
    number, line text) -- confirm against real grep output.
    '''
    # Raw string: '\d' in a plain literal is an invalid escape sequence that
    # newer Python versions warn about.
    pattern = re.compile(r'^(.*?)-(\d+)-(.*)$')
    for line in content:
        # Drop the line terminator, including the '\r' of CRLF input, which
        # would otherwise end up at the end of the last captured group.
        match = pattern.match(line.rstrip('\r\n'))
        if match:
            yield match.groups()
def extract_kw_information(content):
    '''Collect keyword records and their names from *content*.

    Runs ``extract`` over *content* and keeps every entry whose text part is
    non-empty and starts with neither ``#`` nor a space.

    Returns a ``(records, names)`` pair: ``records`` is a list of named
    tuples with the fields ``kw_name``, ``filename`` and ``lineno``;
    ``names`` is the list of their ``kw_name`` values in the same order.
    '''
    Record = namedtuple('data', ['kw_name', 'filename', 'lineno'])
    records = [
        Record(name, source, number)
        for source, number, name in extract(content)
        if name and not name.startswith('#') and not name.startswith(' ')
    ]
    names = [entry.kw_name for entry in records]
    return records, names
def calculate_scores(keywords, data):
    '''Group every keyword's best fuzzy matches by their score.

    *keywords* and *data* are parallel sequences: ``data[i]`` is the record
    belonging to ``keywords[i]``. Each keyword is compared against every
    other keyword with ``fuzz.token_sort_ratio``, and only the pairing(s)
    with the highest score for that keyword are kept.

    Returns a dict mapping a score to a list of
    ``{'kws': (record, other_record), 'score': score}`` dicts. A keyword
    with nothing to compare against (a single keyword, or empty input)
    contributes no entry, so the result may be empty.
    '''
    scores = {}
    for index, kw_name in enumerate(keywords):
        # -1 is a sentinel below any real score.
        # NOTE(review): assumes token_sort_ratio never returns a negative
        # value -- confirm against the fuzzywuzzy documentation.
        best_score = -1
        best_records = []
        for another_index, another_kw_name in enumerate(keywords):
            if another_index == index:
                continue
            new_score = fuzz.token_sort_ratio(kw_name, another_kw_name)
            if new_score < best_score:
                # Cannot be a best match; skip building a record for it.
                continue
            record = {'kws': (data[index], data[another_index]),
                      'score': new_score}
            if new_score > best_score:
                best_score = new_score
                best_records = [record]
            else:
                best_records.append(record)
        # Nothing was compared when there is no other keyword; do not store
        # a placeholder entry that the printing step cannot handle.
        if best_records:
            scores.setdefault(best_score, []).extend(best_records)
    return scores
def print_out(scores, number_range):
    '''Print the keyword pairs belonging to the selected scores.

    *scores* maps a score to a list of ``{'kws': (kw1, kw2), 'score': n}``
    dicts, where ``kw1`` and ``kw2`` are ``(kw_name, filename, lineno)``
    named tuples. *number_range* is a sequence of one to three integers: a
    single value ``n`` selects the ``n`` highest scores, otherwise the
    values are used as ``slice`` arguments over the scores sorted from
    highest to lowest.

    Within one score the pairs are printed ordered by the name of the first
    keyword.
    '''
    if len(number_range) == 1:
        selection = slice(0, number_range[0])
    else:
        selection = slice(*number_range)

    def first_kw_name(record):
        return record['kws'][0].kw_name

    for key in sorted(scores, reverse=True)[selection]:
        # key= replaces the Python-2-only cmp function; both sorts are
        # stable, so the resulting order is the same.
        for record in sorted(scores[key], key=first_kw_name):
            kw1, kw2 = record['kws']
            # Concatenating the two named tuples and the score yields the
            # seven values expected by the format string.
            print('Keyword "%s"\nfrom %s:%s seems similar to:\n"%s"\nfrom '
                  '%s:%s with score %d\n' % (kw1 + kw2 + (record['score'],)))
def parse_how_many(number_or_range):
    '''Parse the ``how_many_scores`` argument into slice components.

    A plain integer (or integer string) ``n`` gives the 1-tuple ``(n,)``.
    A colon-separated string such as ``'1:3'`` or ``'2:5:2'`` gives a list
    of its fields as integers. Empty fields, as in ``'2:'`` or ``':3'``,
    become ``None`` so that open-ended slices work.

    Raises ``ValueError`` if the value is neither an integer nor a
    colon-separated range of at most three integer-or-empty fields.
    '''
    try:
        return (int(number_or_range),)
    except ValueError:
        if ':' not in number_or_range:
            raise
    parts = number_or_range.split(':')
    if len(parts) > 3:
        # slice() takes at most start, stop and step.
        raise ValueError('too many fields in range: %r' % (number_or_range,))
    return [int(part) if part.strip() else None for part in parts]
def main(path, how_many=10, *ignored):
    '''Read grep output, score the keywords found in it and print the result.

    *path* is the file to read; ``'-'`` means standard input. *how_many* is
    handed to ``parse_how_many`` to select which scores get printed. Any
    further positional arguments are accepted and ignored.
    '''
    selection = parse_how_many(how_many)
    if path == '-':
        lines = sys.stdin.readlines()
    else:
        with open(path, 'r') as source:
            lines = source.readlines()
    records, names = extract_kw_information(lines)
    print_out(calculate_scores(names, records), selection)
if __name__ == '__main__':
    # main() requires at least the input path; show a usage line instead of
    # a TypeError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.stderr.write(
            'Usage: python keywords.py <my_file|-> [how_many_scores]\n')
        sys.exit(2)
    main(*sys.argv[1:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment