@Stringsaeed
Created February 12, 2019 15:51
This is a naive implementation of the TextRank algorithm to summarize some text. I'm just looking for edits.
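Before running it, a few resources need to be in place: the NLTK corpora used below, plus the pretrained GloVe vectors (glove.6B.100d.txt from the Stanford GloVe release, assumed here to sit in the working directory). A minimal one-time setup sketch:

import nltk

# One-time downloads for the resources this script relies on:
nltk.download('stopwords')                   # stop-word list
nltk.download('wordnet')                     # lemmatizer data
nltk.download('averaged_perceptron_tagger')  # model behind pos_tag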
import re
import string

import networkx as nx
import numpy as np
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob.wordnet import NOUN, VERB, ADJ, ADV
# A "word" is a word character followed by three or more word characters or apostrophes.
_WORD_PAT = r"\w[\w']{3,}"
# Split sentences at whitespace after '.' or '?', skipping abbreviations like "e.g." or "Mr.".
_SENT_PAT = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s"
_nw_line = r'[\n]+'  # runs of newlines (defined but currently unused)
class TextCleaner:
    """Cleans raw text: expands contractions, strips hashtags, punctuation,
    and stop words, and lemmatizes words by part of speech."""

    def __init__(self, input_sent):
        self.stop_words = set(stopwords.words("english"))
        self.punctuations = set(string.punctuation)
        # Map Penn Treebank tags to the WordNet POS constants the lemmatizer expects.
        self.pos_tags = {
            NOUN: ['NN', 'NNS', 'NNP', 'NNPS', 'PRP', 'PRP$', 'WP', 'WP$'],
            VERB: ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
            ADJ: ['JJ', 'JJR', 'JJS'],
            ADV: ['RB', 'RBR', 'RBS', 'WRB'],
        }
        self.input_sent = input_sent

    def _remove_stop_words(self, words):
        return [w for w in words if w not in self.stop_words]

    def _remove_regex(self):
        """Lowercase the text, expand common contractions, and strip
        hashtags and punctuation."""
        self.input_sent = self.input_sent.lower()
        contractions = [
            (r"i'm", "i am"), (r"he's", "he is"), (r"she's", "she is"),
            (r"that's", "that is"), (r"what's", "what is"),
            (r"where's", "where is"), (r"'ll", " will"), (r"'ve", " have"),
            (r"'re", " are"), (r"'d", " would"), (r"won't", "will not"),
            (r"can't", "cannot"), (r"don't", "do not"),
        ]
        for pattern, replacement in contractions:
            self.input_sent = re.sub(pattern, replacement, self.input_sent)
        # Strip hashtags in a single pass; re-substituting each match text, as
        # before, was redundant and repeated work per hashtag.
        self.input_sent = re.sub(r"#\w*", "", self.input_sent)
        self.input_sent = "".join(
            ch for ch in self.input_sent if ch not in self.punctuations
        )
    def _tokenize(self):
        return re.findall(_WORD_PAT, self.input_sent)

    def _process_content_for_pos(self, words):
        """POS-tag the words and map each Treebank tag to a WordNet POS
        constant, defaulting to NOUN for tags outside the mapping."""
        tagged_words = pos_tag(words)
        pos_words = []
        for word, tag in tagged_words:
            for pos, tags in self.pos_tags.items():
                if tag in tags:
                    pos_words.append((word, pos))
                    break
            else:
                pos_words.append((word, NOUN))
        return pos_words

    def _remove_noise(self):
        self._remove_regex()
        words = self._tokenize()
        return self._remove_stop_words(words)

    def _normalize_text(self, words):
        lem = WordNetLemmatizer()
        pos_words = self._process_content_for_pos(words)
        return [lem.lemmatize(w, pos=p) for w, p in pos_words]

    def sent_tokenize(self):
        return re.split(_SENT_PAT, self.input_sent)

    def clean_up(self):
        cleaned_words = self._remove_noise()
        return self._normalize_text(cleaned_words)
def to_text(it):
    return " ".join(it)
def read_word_embedding():
    """Load the 100-d GloVe vectors into a dict of word -> np.ndarray."""
    we = {}
    with open('glove.6B.100d.txt', 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            we[values[0]] = np.asarray(values[1:], dtype='float32')
    return we
def textrank(text):
    """Summarize text with TextRank: embed each sentence as the average of
    its GloVe word vectors, build a cosine-similarity graph over the
    sentences, and rank them with PageRank."""
    sents = TextCleaner(text).sent_tokenize()
    clean_sentences = [to_text(TextCleaner(sent).clean_up()) for sent in sents]
    we = read_word_embedding()
    sentence_vectors = []
    for sent in clean_sentences:
        words = sent.split()
        if words:
            # Mean of the word vectors; the 0.001 guards against division by zero.
            v = sum(we.get(w, np.zeros((100,))) for w in words) / (len(words) + 0.001)
        else:
            v = np.zeros((100,))
        sentence_vectors.append(v)
    # Pairwise cosine similarity between sentence vectors (diagonal left at 0).
    sim_mat = np.zeros((len(sents), len(sents)))
    for i in range(len(sents)):
        for j in range(len(sents)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(
                    sentence_vectors[i].reshape(1, 100),
                    sentence_vectors[j].reshape(1, 100),
                )[0, 0]
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(sents)), reverse=True
    )
    # Print up to the ten highest-ranked sentences (fewer if the text is short).
    for i in range(min(10, len(ranked_sentences))):
        print(ranked_sentences[i][1])
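A quick usage sketch (the sample text here is made up; any multi-sentence English string works):

if __name__ == '__main__':
    sample = (
        "TextRank is a graph-based ranking algorithm for text. "
        "It was inspired by PageRank. "
        "Here it ranks sentences by how similar each one is to the rest of the text. "
        "The top-ranked sentences become the summary."
    )
    textrank(sample)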