Skip to content

Instantly share code, notes, and snippets.

@haroonrasheed333
Created September 9, 2013 04:54
Show Gist options
  • Save haroonrasheed333/6491624 to your computer and use it in GitHub Desktop.
Save haroonrasheed333/6491624 to your computer and use it in GitHub Desktop.
from __future__ import division
from nltk.book import *
# Function to calculate the lexical diversity of a text
def lexical_diversity(text):
    """Return the average number of times each distinct token is used.

    Computed as ``len(text) / len(set(text))``: the total token count
    divided by the number of distinct tokens.

    Args:
        text: A sized iterable of hashable tokens (for example a list of
            words).

    Returns:
        The ratio as a float, or ``0.0`` when ``text`` has no tokens
        (instead of raising ``ZeroDivisionError``).
    """
    vocabulary = set(text)
    if not vocabulary:
        # No tokens at all: there is nothing to divide by.
        return 0.0
    return len(text) / len(vocabulary)
def percentage(count, total):
    """Return ``count`` expressed as a percentage of ``total``.

    Args:
        count: The number of matching items.
        total: The size of the whole population.

    Returns:
        ``100 * count / total``, or ``0.0`` when ``total`` is zero
        (instead of raising ``ZeroDivisionError``).
    """
    if not total:
        # An empty population has no meaningful share; avoid dividing by zero.
        return 0.0
    return 100 * count / total
def text_stats(text, word):
    """Print summary statistics for ``word`` within ``text``.

    Prints the text's name, the total number of words, the vocabulary
    size, how often ``word`` occurs (as a count and as a percentage), and
    the concordance of ``word``; finally plots the dispersion of ``word``
    across the text.

    Args:
        text: A text object exposing ``name``, ``count``, ``concordance``
            and ``dispersion_plot`` and supporting ``len()`` and iteration
            (presumably an ``nltk.text.Text`` from ``nltk.book``).
        word: The word to report on.
    """
    num_words = len(text)
    vocab_size = len(set(text))
    count_word = text.count(word)
    percent_word = percentage(count_word, num_words)

    print("Name of Text: " + text.name)
    print("Number of words in text: " + str(num_words))
    print("The size of vocabulary: " + str(vocab_size))
    print("Number of occurrence of the word " + str(word) + " in the text: " + str(count_word))
    print("Percentage of occurrence of the word " + str(word) + " in the text: " + str(percent_word))
    print("The concordance of the word " + str(word))
    # concordance() writes its own output and returns None, so it must not be
    # wrapped in print() -- doing so emitted a stray "None" line.
    text.concordance(word)
    text.dispersion_plot([word])
def main():
    """Run the section 1.1-1.3 walkthrough and print each result.

    Covers arithmetic expressions, concordance / similar-word / common-context
    lookups, dispersion plots, word counts and percentages, lexical diversity,
    basic list operations, and frequency distributions.

    Relies on ``text1``..``text5``, ``sent1``, ``sent3`` and ``FreqDist``
    being in scope (expected to come from the module-level
    ``from nltk.book import *``). Produces console output and plots; returns
    nothing.
    """
    # 1.1 Expressions
    print("SECTION 1.1")
    print("Expressions")
    # Function-call form of print with a single argument works on both
    # Python 2 and Python 3; the bare print statement is a SyntaxError on 3.
    print(1 + 5 * 2 - 3)
    print(2 * 3 + 5 - 2 + (4 / 2))

    # 1.1 Concordance - Occurrence of a word in the text along with context.
    # concordance(), similar() and common_contexts() print their own output
    # and return None, so they are called directly rather than inside print().
    print("Concordance of a word in text")
    print("Concordance of the word young in text1 - " + text1.name)
    text1.concordance("young")
    print("Concordance of the word affection in text2 - " + text2.name)
    text2.concordance("affection")
    print("Concordance of the word lived in text3 - " + text3.name)
    text3.concordance("lived")
    # Concordance for the word "love", limited to 5 lines of width 50 each
    text1.concordance("love", width=50, lines=5)

    # 1.1 Words that occur in the same range of contexts as "extremely" in text2
    print("Words in same range of contexts as extremely in text2 - " + text2.name)
    text2.similar("extremely")

    # 1.1 Common contexts shared by the words "extremely" and "so" in text2
    print("Common contexts shared by words extremely and so in text2 - " + text2.name)
    text2.common_contexts(["extremely", "so"])

    # Lexical dispersion plot of the given words in text
    text2.dispersion_plot(['very', 'exceedingly', 'heartily', 'remarkably', 'monstrous', 'so'])
    text4.dispersion_plot(['liberty', 'constitution'])

    # Number and percentage of occurrence of the word 'lol' in text5
    count_lol = text5.count('lol')
    print("Number of occurrences of the word lol in text5: " + str(count_lol))
    # Reuse the count and the shared helper instead of recounting inline.
    percent_lol = percentage(count_lol, len(text5))
    print("Percentage of occurrence of the word lol in text5: " + str(percent_lol))

    # Lexical diversity of text3 and text5
    print("Lexical Diversity of text3: " + str(lexical_diversity(text3)))
    print("Lexical Diversity of text5: " + str(lexical_diversity(text5)))

    # Percentage of occurrence of the word 'a' in text4
    percent_a = percentage(text4.count('a'), len(text4))
    print("Percentage of occurrence of the word a in text4: " + str(percent_a))

    # 1.2 Lists
    # Define a list ex1
    ex1 = ['Courage', 'is', 'not', 'the', 'absence', 'of', 'fear', 'but', 'rather', 'the', 'judgement', 'that', 'something', 'else', 'is', 'more', 'important', 'than', 'fear']
    # Print the list, its sorted form and its length
    print("New List ex1: " + str(ex1))
    print("Sorted list of ex1: " + str(sorted(ex1)))
    print("Length of ex1: " + str(len(ex1)))
    # Print the vocabulary size of ex1
    print("Number of vocabulary items in ex1: " + str(len(set(ex1))))

    # Concatenate two lists
    print("Concatenating lists")
    print(['Courage', 'is', 'not', 'the', 'absence', 'of', 'fear', 'but', 'rather'] + ['the', 'judgement', 'that', 'something', 'else', 'is', 'more', 'important', 'than', 'fear'])
    print(sent1 + sent3)

    ex2 = ['What', 'Does', 'Not', 'Kill', 'Me', 'Makes', 'Me', 'Stronger']
    print("New list ex2: " + str(ex2))
    # Print the index of word 'Kill' in ex2
    print("Index of the word Kill in ex2: " + str(ex2.index('Kill')))
    # Print the item in index 3. (index starts from 0)
    print("Item in index 3 of ex2: " + str(ex2[3]))

    # Access sublists using slicing
    print("Access sublists using slicing: ")
    print(ex2[1:3])
    print(ex2[4:7])
    print(ex2[:3])
    print(ex2[3:])

    # Replace items in an index with another
    ex2[4] = 'you'
    ex2[6] = 'you'
    print("Replace some items in ex2: ")
    print(ex2)

    # Replace a sublist (slice) with another
    print("Replacing a sublist(slice) in ex2")
    ex2[1:3] = ['Doesnt']
    print(ex2)

    # 1.3
    # Frequency Distribution - frequency of each vocabulary item in the text
    fdist = FreqDist(text2)
    # Store the vocabulary items of text2 in variable vocab.
    # On Python 3 keys() is a view that cannot be sliced, so materialize it.
    vocab = list(fdist.keys())
    # Print the first 30 vocabulary items
    print("First 30 vocabulary items of text2 - " + text2.name)
    print(vocab[:30])
    # Print the number of occurrences of word 'weakness'
    print("Number of occurrences of the word weakness in text2: " + str(fdist['weakness']))
    # Cumulative plot of the 50 highest-frequency words in the text
    fdist.plot(50, cumulative=True)

    # Print a sorted list of all the words longer than 14 characters in text2
    V = set(text2)
    long_words = [word for word in V if len(word) > 14]
    print("Sorted list of all words greater than length 14 in text2 - " + text2.name)
    print(sorted(long_words))

    print("Function to print text stats: ")
    text_stats(text5, 'hello')
# Run the walkthrough only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment