# Tattoo/keywords.py

## keywords.py
'''
The Keywords table must be the last table in each suite and resource file,
unless you use more clever grepping than presented below. `sed` etc. is also
fine as long as its output is the same as that of the grep command below.

$ grep -irn -A 5000 '* Keywords *' path/to/tests > my_file
$ python keywords.py my_file [how_many_scores]

or one-liner:
$ grep -irn -A 5000 '* Keywords *' path/to/tests | python keywords.py - [how_many_scores]

`how_many_scores` supports Python's slice notation (a single number N selects
the N highest scores):
$ python keywords.py my_file 1        # prints matching keywords with highest
                                      # score

$ python keywords.py my_file 1:3      # prints matching keywords with second
                                      # and third highest scores

$ python keywords.py my_file 2:5:2    # prints matching keywords with third
                                      # and fifth highest scores, skipping
                                      # fourth highest
'''
import re
import sys

from collections import namedtuple

try:
    from fuzzywuzzy import fuzz
except ImportError:
    # fuzzywuzzy is a hard requirement: without it no score can be computed.
    # The call form with a single parenthesized string prints the same text
    # on Python 2 and, unlike the print statement, also parses on Python 3.
    print('Please install fuzzywuzzy https://github.com/seatgeek/fuzzywuzzy')
    sys.exit(1)


def extract(content):
    '''Yield (filename, lineno, text) string tuples parsed from `content`.

    `content` is an iterable of lines. Only lines of the form
    ``<filename>-<digits>-<text>`` are parsed (presumably the context lines
    of the `grep -n -A` output described in the usage text of this file);
    every other line is silently skipped.

    The filename part is matched non-greedily, so it ends at the first
    ``-<digits>-`` sequence found on the line.
    '''
    # Raw string: `\d` must reach the regex engine untouched instead of
    # relying on Python passing unknown string escapes through.
    pattern = re.compile(r'^(.*?)-(\d+)-(.*)$')
    for line in content:
        # Drop the line terminator. '\r' is stripped as well so that input
        # with Windows line endings leaves no stray '\r' on the text part.
        m = pattern.match(line.strip('\r\n'))
        if m:
            yield m.groups()

def extract_kw_information(content):
    '''Collect keyword-name candidates from the given lines.

    Every line parsed by `extract` counts as a keyword name unless its text
    part is empty, starts with '#' or starts with a space.

    Returns a `(data, all_keywords)` pair: `data` is a list of namedtuples
    with the fields `kw_name`, `filename` and `lineno`; `all_keywords`
    holds just the names, in the same order.
    '''
    entry = namedtuple('data', ['kw_name', 'filename', 'lineno'])
    data = []
    for source_file, line_number, name in extract(content):
        # Keep only non-empty text that is neither a comment line nor a
        # space-indented line.
        if name and not name.startswith(('#', ' ')):
            data.append(entry(name, source_file, line_number))
    all_keywords = [item.kw_name for item in data]
    return data, all_keywords

def calculate_scores(keywords, data):
    '''Group every keyword's closest matches by similarity score.

    `keywords` is a list of keyword names and `data` a parallel list of
    records for them (same order, same length). Each keyword is compared
    with all the others using `fuzz.token_sort_ratio`, and only the pair(s)
    reaching that keyword's highest score are kept.

    Returns a dict mapping a score to a list of
    `{'kws': (record, other_record), 'score': score}` dicts. A keyword that
    has nothing to be compared with contributes no entry, so the result is
    empty for zero or one keyword.
    '''
    scores = {}
    for index, kw_name in enumerate(keywords):
        best_score = -1
        best_records = []
        for another_index, another_kw_name in enumerate(keywords):
            if another_index == index:
                continue
            new_score = fuzz.token_sort_ratio(kw_name, another_kw_name)
            if new_score < best_score:
                continue
            record = {'kws': (data[index], data[another_index]),
                      'score': new_score}
            if new_score > best_score:
                # A strictly better match discards the earlier candidates.
                best_score = new_score
                best_records = [record]
            else:
                best_records.append(record)
        # Nothing was compared when this is the only keyword; storing a
        # placeholder entry here would break the consumers of the result.
        if best_records:
            scores.setdefault(best_score, []).extend(best_records)
    return scores

def print_out(scores, number_range):
    '''Print the keyword pairs of the selected scores, best score first.

    `scores` maps a score to a list of `{'kws': (record, record),
    'score': score}` dicts, where the records are namedtuples whose first
    field is `kw_name`.

    `number_range` is a sequence of one to three slice arguments that is
    applied to the scores sorted in descending order: a single number N
    selects the N highest scores, otherwise the values are passed to
    `slice` unchanged.

    Within one score the pairs are printed ordered by the name of the
    first keyword of each pair.
    '''
    if len(number_range) == 1:
        selection = slice(0, number_range[0])
    else:
        selection = slice(*number_range)
    for key in sorted(scores, reverse=True)[selection]:
        # Sort with a key function: the positional cmp argument of sorted()
        # exists only in Python 2, while `key` yields the same order on
        # both Python 2 and Python 3.
        for record in sorted(scores[key],
                             key=lambda rec: rec['kws'][0].kw_name):
            kw1, kw2 = record['kws']
            # The two records concatenate into one flat tuple that fills
            # the name/file/line placeholders of both keywords.
            print('Keyword "%s"\nfrom %s:%s seems similar to:\n"%s"\nfrom '
                  '%s:%s with score %d\n' % (kw1 + kw2 + (record['score'],)))

def parse_how_many(number_or_range):
    '''Turn the `how_many_scores` argument into slice arguments.

    A plain number (int or numeric string) gives a one-element tuple
    `(n,)`. A colon-separated value such as '1:3' or '2:5:2' gives a list
    of its parts as ints; an empty part, as in '2:' or ':3', becomes None
    so that open-ended slices work like they do in Python itself.

    Raises ValueError when the value is neither a number nor a
    colon-separated range, when a part is not an integer, or when there
    are more than three parts.
    '''
    try:
        return (int(number_or_range),)
    except ValueError:
        if ':' not in number_or_range:
            raise
    parts = number_or_range.split(':')
    if len(parts) > 3:
        # slice() takes at most three arguments; fail here with a clear
        # message instead of a TypeError when the slice is built.
        raise ValueError('too many parts in range: %r' % (number_or_range,))
    return [int(part) if part.strip() else None for part in parts]

def main(path, how_many=10, *ignored):
    '''Read grep output, score the keyword names and print the best matches.

    `path` names the file to read; '-' means standard input. `how_many` is
    a number or a colon-separated range handed to `parse_how_many`. Any
    further positional arguments are accepted and ignored.
    '''
    number_range = parse_how_many(how_many)
    if path != '-':
        with open(path, 'r') as handle:
            lines = handle.readlines()
    else:
        lines = sys.stdin.readlines()
    records, names = extract_kw_information(lines)
    print_out(calculate_scores(names, records), number_range)


if __name__ == '__main__':
    # Hand the command-line arguments (without the script name) to main().
    cli_args = sys.argv[1:]
    main(*cli_args)
	'''
	The Keywords table must be the last table in each suite and resource file,
	unless you use more clever grepping than presented below. `sed` etc. is also
	fine as long as its output is the same as that of the grep command below.

	$ grep -irn -A 5000 '* Keywords *' path/to/tests > my_file
	$ python keywords.py my_file [how_many_scores]

	or one-liner:
	$ grep -irn -A 5000 '* Keywords *' path/to/tests | python keywords.py - [how_many_scores]

	`how_many_scores` supports Python's slice notation (a single number N selects
	the N highest scores):
	$ python keywords.py my_file 1 # prints matching keywords with highest
	# score

	$ python keywords.py my_file 1:3 # prints matching keywords with second
	# and third highest scores

	$ python keywords.py my_file 2:5:2 # prints matching keywords with third
	# and fifth highest scores, skipping
	# fourth highest
	'''
	import re
	import sys

	from collections import namedtuple

	try:
	    from fuzzywuzzy import fuzz
	except ImportError:
	    # fuzzywuzzy is a hard requirement: without it no score can be
	    # computed. The call form with a single parenthesized string prints
	    # the same text on Python 2 and also parses on Python 3.
	    print('Please install fuzzywuzzy https://github.com/seatgeek/fuzzywuzzy')
	    sys.exit(1)


	def extract(content):
	    '''Yield (filename, lineno, text) string tuples parsed from `content`.

	    `content` is an iterable of lines. Only lines of the form
	    ``<filename>-<digits>-<text>`` are parsed (presumably the context
	    lines of the `grep -n -A` output described in the usage text of this
	    file); every other line is silently skipped.

	    The filename part is matched non-greedily, so it ends at the first
	    ``-<digits>-`` sequence found on the line.
	    '''
	    # The filename and text groups need their `*` quantifiers to match
	    # more than one character. Raw string so that `\d` reaches the regex
	    # engine untouched.
	    pattern = re.compile(r'^(.*?)-(\d+)-(.*)$')
	    for line in content:
	        # Drop the line terminator. '\r' is stripped as well so that
	        # input with Windows line endings leaves no stray '\r' behind.
	        m = pattern.match(line.strip('\r\n'))
	        if m:
	            yield m.groups()

	def extract_kw_information(content):
	    '''Collect keyword-name candidates from the given lines.

	    Every line parsed by `extract` counts as a keyword name unless its
	    text part is empty, starts with '#' or starts with a space.

	    Returns a `(data, all_keywords)` pair: `data` is a list of
	    namedtuples with the fields `kw_name`, `filename` and `lineno`;
	    `all_keywords` holds just the names, in the same order.
	    '''
	    record = namedtuple('data', ['kw_name', 'filename', 'lineno'])
	    data = []
	    all_keywords = []
	    for (filename, lineno, kw_name) in extract(content):
	        # Skip empty text, comment lines and space-indented lines.
	        if not kw_name or kw_name.startswith(('#', ' ')):
	            continue
	        data.append(record(kw_name, filename, lineno))
	        all_keywords.append(kw_name)
	    return data, all_keywords

	def calculate_scores(keywords, data):
	    '''Group every keyword's closest matches by similarity score.

	    `keywords` is a list of keyword names and `data` a parallel list of
	    records for them (same order, same length). Each keyword is compared
	    with all the others using `fuzz.token_sort_ratio`, and only the
	    pair(s) reaching that keyword's highest score are kept.

	    Returns a dict mapping a score to a list of
	    `{'kws': (record, other_record), 'score': score}` dicts. A keyword
	    that has nothing to be compared with contributes no entry, so the
	    result is empty for zero or one keyword.
	    '''
	    scores = {}
	    for index, kw_name in enumerate(keywords):
	        best_score = -1
	        best_records = []
	        for another_index, another_kw_name in enumerate(keywords):
	            if another_index == index:
	                continue
	            new_score = fuzz.token_sort_ratio(kw_name, another_kw_name)
	            if new_score < best_score:
	                continue
	            record = {'kws': (data[index], data[another_index]),
	                      'score': new_score}
	            if new_score > best_score:
	                # A strictly better match discards earlier candidates.
	                best_score = new_score
	                best_records = [record]
	            else:
	                best_records.append(record)
	        # Nothing was compared when this is the only keyword; storing a
	        # placeholder entry here would break consumers of the result.
	        if best_records:
	            scores.setdefault(best_score, []).extend(best_records)
	    return scores

	def print_out(scores, number_range):
	    '''Print the keyword pairs of the selected scores, best score first.

	    `scores` maps a score to a list of `{'kws': (record, record),
	    'score': score}` dicts, where the records are namedtuples whose
	    first field is `kw_name`.

	    `number_range` is a sequence of one to three slice arguments that is
	    applied to the scores sorted in descending order: a single number N
	    selects the N highest scores, otherwise the values are passed to
	    `slice` unchanged.

	    Within one score the pairs are printed ordered by the name of the
	    first keyword of each pair.
	    '''
	    if len(number_range) == 1:
	        selection = slice(0, number_range[0])
	    else:
	        selection = slice(*number_range)
	    for key in sorted(scores, reverse=True)[selection]:
	        # Sort with a key function: the positional cmp argument of
	        # sorted() exists only in Python 2, while `key` yields the same
	        # order on both Python 2 and Python 3.
	        for record in sorted(scores[key],
	                             key=lambda rec: rec['kws'][0].kw_name):
	            kw1, kw2 = record['kws']
	            # The two records concatenate into one flat tuple that fills
	            # the name/file/line placeholders of both keywords.
	            print('Keyword "%s"\nfrom %s:%s seems similar to:\n"%s"\nfrom '
	                  '%s:%s with score %d\n' % (kw1 + kw2 + (record['score'],)))

	def parse_how_many(number_or_range):
	    '''Turn the `how_many_scores` argument into slice arguments.

	    A plain number (int or numeric string) gives a one-element tuple
	    `(n,)`. A colon-separated value such as '1:3' or '2:5:2' gives a
	    list of its parts as ints; an empty part, as in '2:' or ':3',
	    becomes None so that open-ended slices work like in Python itself.

	    Raises ValueError when the value is neither a number nor a
	    colon-separated range, when a part is not an integer, or when there
	    are more than three parts.
	    '''
	    try:
	        return (int(number_or_range),)
	    except ValueError:
	        if ':' not in number_or_range:
	            raise
	    parts = number_or_range.split(':')
	    if len(parts) > 3:
	        # slice() takes at most three arguments; fail here with a clear
	        # message instead of a TypeError when the slice is built.
	        raise ValueError('too many parts in range: %r' % (number_or_range,))
	    return [int(part) if part.strip() else None for part in parts]

	def main(path, how_many=10, *ignored):
	    '''Read grep output, score the keyword names and print the matches.

	    `path` names the file to read; '-' means standard input. `how_many`
	    is a number or a colon-separated range handed to `parse_how_many`.
	    Any further positional arguments are accepted and ignored.
	    '''
	    how_many = parse_how_many(how_many)
	    if path == '-':
	        content = sys.stdin.readlines()
	    else:
	        with open(path, 'r') as f:
	            content = f.readlines()
	    data, all_keywords = extract_kw_information(content)
	    scores = calculate_scores(all_keywords, data)
	    print_out(scores, how_many)


	if __name__ == '__main__':
	    # Hand the command-line arguments (without the script name) to main().
	    main(*sys.argv[1:])