Skip to content

Instantly share code, notes, and snippets.

@wannaphong
Last active March 12, 2018 07:39
Show Gist options
  • Save wannaphong/857d564c6dbc549f29f698564f2a07f8 to your computer and use it in GitHub Desktop.
Save wannaphong/857d564c6dbc549f29f698564f2a07f8 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""Implementation of Rapid Automatic Keyword Extraction algorithm.
As described in the paper `Automatic keyword extraction from individual
documents` by Stuart Rose, Dave Engel, Nick Cramer and Wendy Cowley.
Thai language by Mr.Wannaphong Phatthiyaphaibun <wannaphong@kkumail.com>
"""
import string
from collections import defaultdict, Counter
from itertools import chain, groupby, product
import pythainlp
class Rake(object):
def __init__(self, stopwords=None, punctuations=None, language='thai'):
"""Constructor.
:param stopwords: List of Words to be ignored for keyword extraction.
:param punctuations: Punctuations to be ignored for keyword extraction.
:param language: Language to be used for stopwords
"""
# If stopwords not provided we use language stopwords by default.
self.stopwords = stopwords
if self.stopwords is None:
self.stopwords = pythainlp.corpus.stopwords.words(language)
# If punctuations are not provided we ignore all punctuation symbols.
self.punctuations = punctuations
if self.punctuations is None:
self.punctuations = string.punctuation
# All things which act as sentence breaks during keyword extraction.
self.to_ignore = set(chain(self.stopwords, self.punctuations))
# Stuff to be extracted from the provided text.
self.frequency_dist = None
self.degree = None
self.rank_list = None
self.ranked_phrases = None
def extract_keywords_from_text(self, text):
"""Method to extract keywords from the text provided.
:param text: Text to extract keywords from, provided as a string.
"""
sentences = pythainlp.tokenize.sent_tokenize(text)
self.extract_keywords_from_sentences(sentences)
def extract_keywords_from_sentences(self, sentences):
"""Method to extract keywords from the list of sentences provided.
:param sentences: Text to extraxt keywords from, provided as a list
of strings, where each string is a sentence.
"""
phrase_list = self._generate_phrases(sentences)
self._build_frequency_dist(phrase_list)
self._build_word_co_occurance_graph(phrase_list)
self._build_ranklist(phrase_list)
def get_ranked_phrases(self):
"""Method to fetch ranked keyword strings.
:return: List of strings where each string represents an extracted
keyword string.
"""
return self.ranked_phrases
def get_ranked_phrases_with_scores(self):
"""Method to fetch ranked keyword strings along with their scores.
:return: List of tuples where each tuple is formed of an extracted
keyword string and its score. Ex: (5.68, 'Four Scoures')
"""
return self.rank_list
def get_word_frequency_distribution(self):
"""Method to fetch the word frequency distribution in the given text.
:return: Dictionary (defaultdict) of the format `word -> frequency`.
"""
return self.frequency_dist
def get_word_degrees(self):
"""Method to fetch the degree of words in the given text. Degree can be
defined as sum of co-occurances of the word with other words in the
given text.
:return: Dictionary (defaultdict) of the format `word -> degree`.
"""
return self.degree
def _build_frequency_dist(self, phrase_list):
"""Builds frequency distribution of the words in the given body of text.
:param phrase_list: List of List of strings where each sublist is a
collection of words which form a contender phrase.
"""
self.frequency_dist = Counter(chain.from_iterable(phrase_list))
def _build_word_co_occurance_graph(self, phrase_list):
"""Builds the co-occurance graph of words in the given body of text to
compute degree of each word.
:param phrase_list: List of List of strings where each sublist is a
collection of words which form a contender phrase.
"""
co_occurance_graph = defaultdict(lambda: defaultdict(lambda: 0))
for phrase in phrase_list:
# For each phrase in the phrase list, count co-occurances of the
# word with other words in the phrase.
#
# Note: Keep the co-occurances graph as is, to help facilitate its
# use in other creative ways if required later.
for (word, coword) in product(phrase, phrase):
co_occurance_graph[word][coword] += 1
self.degree = defaultdict(lambda: 0)
for key in co_occurance_graph:
self.degree[key] = sum(co_occurance_graph[key].values())
def _build_ranklist(self, phrase_list):
"""Method to rank each contender phrase using the formula
phrase_score = sum of scores of words in the phrase.
word_score = d(w)/f(w) where d is degree and f is frequency.
:param phrase_list: List of List of strings where each sublist is a
collection of words which form a contender phrase.
"""
self.rank_list = []
for phrase in phrase_list:
rank = 0.0
for word in phrase:
rank += 1.0 * self.degree[word] / self.frequency_dist[word]
self.rank_list.append((rank, ' '.join(phrase)))
self.rank_list.sort(reverse=True)
self.ranked_phrases = [ph[1] for ph in self.rank_list]
def _generate_phrases(self, sentences):
"""Method to generate contender phrases given the sentences of the text
document.
:param sentences: List of strings where each string represents a
sentence which forms the text.
:return: Set of string tuples where each tuple is a collection
of words forming a contender phrase.
"""
phrase_list = set()
# Create contender phrases from sentences.
for sentence in sentences:
word_list = [word.lower() for word in pythainlp.word_tokenize(sentence)]
phrase_list.update(self._get_phrase_list_from_words(word_list))
return phrase_list
def _get_phrase_list_from_words(self, word_list):
"""Method to create contender phrases from the list of words that form
a sentence by dropping stopwords and punctuations and grouping the left
words into phrases. Ex:
Sentence: Red apples, are good in flavour.
List of words: ['red', 'apples', ",", 'are', 'good', 'in', 'flavour']
List after dropping punctuations and stopwords.
List of words: ['red', 'apples', *, *, good, *, 'flavour']
List of phrases: [('red', 'apples'), ('good',), ('flavour',)]
:param word_list: List of words which form a sentence when joined in
the same order.
:return: List of contender phrases that are formed after dropping
stopwords and punctuations.
"""
groups = groupby(word_list, lambda x: x not in self.to_ignore)
return [tuple(group[1]) for group in groups if group[0]]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment