veegee/rake.py

## rake.py
"""
Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm

as described in:
Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
Automatic keyword extraction from individual documents.
In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory.unknown: John Wiley and Sons, Ltd.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import string
from collections import defaultdict
from functools import reduce
from itertools import takewhile, chain
from typing import List, Callable, TypeVar, Dict, Tuple

import nltk
from tabulate import tabulate

T = TypeVar('T')

#: :type: Set[str]
STOPWORDS = set(nltk.corpus.stopwords.words())

#: :type: int
TOP_FRACTION = 1  # consider top third candidate keywords by score


def is_punct(word: str) -> bool:
    return len(word) == 1 and word in string.punctuation


def is_numeric(word: str) -> bool:
    try:
        float(word) if '.' in word else int(word)
        return True
    except ValueError:
        return False


def group_list(predicate: Callable[[T], bool], lst: List[T]) -> List[List[T]]:
    """Group a list by consecutive elements matching the predicate
    """
    if len(lst) == 0:
        return []
    else:
        segment = list(takewhile(predicate, lst))
        if len(segment):
            return [segment] + group_list(predicate, lst[len(segment) + 1:])
        else:
            return group_list(predicate, lst[1:])


def _generate_candidate_keywords(sentences: List[str]) -> List[List[str]]:
    def process_sentence(sent: str) -> List[str]:
        words = list(map(lambda x: "|" if x in STOPWORDS else x, nltk.word_tokenize(sent.lower())))
        return group_list(lambda w: not (w == '|' or is_punct(w)), words)

    return list(chain(*map(process_sentence, sentences)))


def _calculate_word_scores(phrase_list: List[List[str]]) -> Dict[str, float]:
    all_words = list(chain(*phrase_list))
    word_freq = {word: all_words.count(word) for word in set(all_words)}

    word_degree = defaultdict(int)

    for phrase in phrase_list:
        degree = len(list(filter(lambda x: not is_numeric(x), phrase))) - 1
        for word in phrase:
            word_degree[word] += degree  # other words

    for word in word_freq.keys():
        word_degree[word] += word_freq[word]  # itself

    # word score = degree(w) / freq(w)
    word_scores = {word: word_degree[word] / word_freq[word] for word in word_freq.keys()}

    return word_scores


def _calculate_phrase_scores(phrase_list: List[List[str]], word_scores: Dict[str, float]) -> Dict[str, float]:
    def phrase_score(phrase_words: List[str]) -> int:
        return reduce(lambda acc, x: acc + word_scores[x], phrase_words, 0)

    phrase_scores = {' '.join(pw): phrase_score(pw) for pw in phrase_list}
    return phrase_scores


def rake(text: str) -> List[Tuple[str, float]]:
    sentences = nltk.sent_tokenize(text)
    phrase_list = _generate_candidate_keywords(sentences)
    word_scores = _calculate_word_scores(phrase_list)
    phrase_scores = _calculate_phrase_scores(phrase_list, word_scores)
    sorted_phrase_scores = sorted(phrase_scores.items(), key=lambda x: x[1], reverse=True)
    n_phrases = len(sorted_phrase_scores)
    return sorted_phrase_scores[0:int(n_phrases / TOP_FRACTION)]


if __name__ == '__main__':
    doc1 = '''
        Compatibility of systems of linear constraints over the set of natural
        numbers. Criteria of compatibility of a system of linear Diophantine
        equations, strict inequations, and nonstrict inequations are considered.
        Upper bounds for components of a minimal set of solutions and algorithms of
        construction of minimal generating sets of solutions for all types of
        systems are given. These criteria and the corresponding algorithms for
        constructing a minimal supporting set of solutions can be used in solving
        all the considered types of systems and systems of mixed types.'''

    keywords = rake(doc1)
    out = map(lambda t: (t[0], round(t[1], 2)), keywords)
    print(tabulate(out, tablefmt='psql'))
	"""
	Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm

	as described in:
	Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
	Automatic keyword extraction from individual documents.
	In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory.unknown: John Wiley and Sons, Ltd.

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program. If not, see <http://www.gnu.org/licenses/>.
	"""

	import string
	from collections import defaultdict
	from functools import reduce
	from itertools import takewhile, chain
	from typing import List, Callable, TypeVar, Dict, Tuple

	import nltk
	from tabulate import tabulate

	T = TypeVar('T')

	#: :type: Set[str]
	STOPWORDS = set(nltk.corpus.stopwords.words())

	#: :type: int
	TOP_FRACTION = 1 # consider top third candidate keywords by score


	def is_punct(word: str) -> bool:
	return len(word) == 1 and word in string.punctuation


	def is_numeric(word: str) -> bool:
	try:
	float(word) if '.' in word else int(word)
	return True
	except ValueError:
	return False


	def group_list(predicate: Callable[[T], bool], lst: List[T]) -> List[List[T]]:
	"""Group a list by consecutive elements matching the predicate
	"""
	if len(lst) == 0:
	return []
	else:
	segment = list(takewhile(predicate, lst))
	if len(segment):
	return [segment] + group_list(predicate, lst[len(segment) + 1:])
	else:
	return group_list(predicate, lst[1:])


	def _generate_candidate_keywords(sentences: List[str]) -> List[List[str]]:
	def process_sentence(sent: str) -> List[str]:
	words = list(map(lambda x: "\|" if x in STOPWORDS else x, nltk.word_tokenize(sent.lower())))
	return group_list(lambda w: not (w == '\|' or is_punct(w)), words)

	return list(chain(*map(process_sentence, sentences)))


	def _calculate_word_scores(phrase_list: List[List[str]]) -> Dict[str, float]:
	all_words = list(chain(*phrase_list))
	word_freq = {word: all_words.count(word) for word in set(all_words)}

	word_degree = defaultdict(int)

	for phrase in phrase_list:
	degree = len(list(filter(lambda x: not is_numeric(x), phrase))) - 1
	for word in phrase:
	word_degree[word] += degree # other words

	for word in word_freq.keys():
	word_degree[word] += word_freq[word] # itself

	# word score = degree(w) / freq(w)
	word_scores = {word: word_degree[word] / word_freq[word] for word in word_freq.keys()}

	return word_scores


	def _calculate_phrase_scores(phrase_list: List[List[str]], word_scores: Dict[str, float]) -> Dict[str, float]:
	def phrase_score(phrase_words: List[str]) -> int:
	return reduce(lambda acc, x: acc + word_scores[x], phrase_words, 0)

	phrase_scores = {' '.join(pw): phrase_score(pw) for pw in phrase_list}
	return phrase_scores


	def rake(text: str) -> List[Tuple[str, float]]:
	sentences = nltk.sent_tokenize(text)
	phrase_list = _generate_candidate_keywords(sentences)
	word_scores = _calculate_word_scores(phrase_list)
	phrase_scores = _calculate_phrase_scores(phrase_list, word_scores)
	sorted_phrase_scores = sorted(phrase_scores.items(), key=lambda x: x[1], reverse=True)
	n_phrases = len(sorted_phrase_scores)
	return sorted_phrase_scores[0:int(n_phrases / TOP_FRACTION)]


	if __name__ == '__main__':
	doc1 = '''
	Compatibility of systems of linear constraints over the set of natural
	numbers. Criteria of compatibility of a system of linear Diophantine
	equations, strict inequations, and nonstrict inequations are considered.
	Upper bounds for components of a minimal set of solutions and algorithms of
	construction of minimal generating sets of solutions for all types of
	systems are given. These criteria and the corresponding algorithms for
	constructing a minimal supporting set of solutions can be used in solving
	all the considered types of systems and systems of mixed types.'''

	keywords = rake(doc1)
	out = map(lambda t: (t[0], round(t[1], 2)), keywords)
	print(tabulate(out, tablefmt='psql'))