Skip to content

Instantly share code, notes, and snippets.

@veegee
Created May 29, 2016 16:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save veegee/b7f4dc4af5343000f8cb94197c1b0c92 to your computer and use it in GitHub Desktop.
Save veegee/b7f4dc4af5343000f8cb94197c1b0c92 to your computer and use it in GitHub Desktop.
RAKE algorithm implemented in Python 3.5+
"""
Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm
as described in:
Rose, S., D. Engel, N. Cramer, and W. Cowley (2010).
Automatic keyword extraction from individual documents.
In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory.unknown: John Wiley and Sons, Ltd.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import string
from collections import defaultdict
from functools import reduce
from itertools import takewhile, chain
from typing import List, Callable, TypeVar, Dict, Tuple
import nltk
from tabulate import tabulate
T = TypeVar('T')
#: :type: Set[str]
STOPWORDS = set(nltk.corpus.stopwords.words())
#: :type: int
TOP_FRACTION = 1 # consider top third candidate keywords by score
def is_punct(word: str) -> bool:
return len(word) == 1 and word in string.punctuation
def is_numeric(word: str) -> bool:
try:
float(word) if '.' in word else int(word)
return True
except ValueError:
return False
def group_list(predicate: Callable[[T], bool], lst: List[T]) -> List[List[T]]:
"""Group a list by consecutive elements matching the predicate
"""
if len(lst) == 0:
return []
else:
segment = list(takewhile(predicate, lst))
if len(segment):
return [segment] + group_list(predicate, lst[len(segment) + 1:])
else:
return group_list(predicate, lst[1:])
def _generate_candidate_keywords(sentences: List[str]) -> List[List[str]]:
def process_sentence(sent: str) -> List[str]:
words = list(map(lambda x: "|" if x in STOPWORDS else x, nltk.word_tokenize(sent.lower())))
return group_list(lambda w: not (w == '|' or is_punct(w)), words)
return list(chain(*map(process_sentence, sentences)))
def _calculate_word_scores(phrase_list: List[List[str]]) -> Dict[str, float]:
all_words = list(chain(*phrase_list))
word_freq = {word: all_words.count(word) for word in set(all_words)}
word_degree = defaultdict(int)
for phrase in phrase_list:
degree = len(list(filter(lambda x: not is_numeric(x), phrase))) - 1
for word in phrase:
word_degree[word] += degree # other words
for word in word_freq.keys():
word_degree[word] += word_freq[word] # itself
# word score = degree(w) / freq(w)
word_scores = {word: word_degree[word] / word_freq[word] for word in word_freq.keys()}
return word_scores
def _calculate_phrase_scores(phrase_list: List[List[str]], word_scores: Dict[str, float]) -> Dict[str, float]:
def phrase_score(phrase_words: List[str]) -> int:
return reduce(lambda acc, x: acc + word_scores[x], phrase_words, 0)
phrase_scores = {' '.join(pw): phrase_score(pw) for pw in phrase_list}
return phrase_scores
def rake(text: str) -> List[Tuple[str, float]]:
sentences = nltk.sent_tokenize(text)
phrase_list = _generate_candidate_keywords(sentences)
word_scores = _calculate_word_scores(phrase_list)
phrase_scores = _calculate_phrase_scores(phrase_list, word_scores)
sorted_phrase_scores = sorted(phrase_scores.items(), key=lambda x: x[1], reverse=True)
n_phrases = len(sorted_phrase_scores)
return sorted_phrase_scores[0:int(n_phrases / TOP_FRACTION)]
if __name__ == '__main__':
doc1 = '''
Compatibility of systems of linear constraints over the set of natural
numbers. Criteria of compatibility of a system of linear Diophantine
equations, strict inequations, and nonstrict inequations are considered.
Upper bounds for components of a minimal set of solutions and algorithms of
construction of minimal generating sets of solutions for all types of
systems are given. These criteria and the corresponding algorithms for
constructing a minimal supporting set of solutions can be used in solving
all the considered types of systems and systems of mixed types.'''
keywords = rake(doc1)
out = map(lambda t: (t[0], round(t[1], 2)), keywords)
print(tabulate(out, tablefmt='psql'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment