Attempt to classify top sentences in the Wikipedia article on New York
# Authored by Konrad Kollnig
# Oxford, 20 April 2019
import wikipedia
import numpy as np
from nltk.tokenize import sent_tokenize  # requires the NLTK 'punkt' data (nltk.download('punkt'))
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Fetch the full text of the Wikipedia article and split it into sentences
data = wikipedia.page("New York City").content
allSentences = sent_tokenize(data)

# Keep only sentences longer than half the mean sentence length
lengths = [len(s) for s in allSentences]
m = np.array(lengths).mean()
longSentences = [s for s in allSentences if len(s) > m / 2]
sentences = longSentences
# Count the matching tokens in each sentence
cv = CountVectorizer(stop_words='english', token_pattern=r"(?u)\b[A-Za-z\-]{4,}\b")  # can also use \w
cv_fit = cv.fit(sentences)
counts = cv_fit.transform(longSentences).toarray().sum(axis=1)
# Weight each sentence by its total TF-IDF score, relative to the other sentences
tf = TfidfVectorizer(stop_words='english', token_pattern=r"(?u)\b[A-Za-z\-]{4,}\b")  # can also use \w
tf_fit = tf.fit(sentences)
weights = np.asarray(tf_fit.transform(longSentences).sum(axis=1).squeeze()).flatten()

# Score each sentence by its average TF-IDF weight per token
results = weights / counts
# Pick the 20 highest-scoring sentences and restore their original document order
topPositions = np.sort(np.flip(np.argsort(results))[0:20])
topSentences = [longSentences[p] for p in topPositions]
topLengths = [len(s) for s in topSentences]
print(topSentences)
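# Optional: a minimal sketch (using only the variables defined above) for
# inspecting each selected sentence alongside its average TF-IDF score
for p in topPositions:
    print(f"{results[p]:.3f}\t{longSentences[p]}")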