Created
September 9, 2013 04:54
-
-
Save haroonrasheed333/6491624 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
from nltk.book import * | |
def lexical_diversity(text):
    """Return the lexical diversity of ``text``.

    The value is the total number of tokens divided by the number of
    distinct tokens, i.e. the average number of times each vocabulary
    item is used.

    text -- any sized iterable of hashable tokens (a list of words, an
            nltk.book text object, ...).

    Raises ZeroDivisionError when ``text`` is empty, because an empty
    text has no vocabulary to divide by.
    """
    total_tokens = len(text)
    distinct_tokens = len(set(text))
    return total_tokens / distinct_tokens
def percentage(count, total):
    """Return ``count`` expressed as a percentage of ``total``.

    count -- number of occurrences of interest.
    total -- size of the whole population; must be non-zero.

    Raises ZeroDivisionError when ``total`` is 0.
    """
    return 100 * count / total
def text_stats(text, word):
    """Print summary statistics about ``word`` in ``text`` and plot it.

    Takes a text object (from nltk.book) and a word, and prints the
    text's name, the total number of words in the text, the size of the
    vocabulary, how often the word occurs in the text (as a count and as
    a percentage) and the concordance for the word; finally it plots the
    dispersion of the word in the text.

    text -- text object exposing ``name``, ``count``, ``concordance`` and
            ``dispersion_plot`` and supporting ``len()`` and iteration.
    word -- the token to report on.

    Returns None; all results are printed or plotted.
    """
    text_name = text.name
    num_words = len(text)
    vocab_size = len(set(text))
    count_word = text.count(word)
    percent_word = percentage(count_word, num_words)

    print("Name of Text: " + text_name)
    print("Number of words in text: " + str(num_words))
    print("The size of vocabulary: " + str(vocab_size))
    print("Number of occurrence of the word " + str(word) + " in the text: " + str(count_word))
    print("Percentage of occurrence of the word " + str(word) + " in the text: " + str(percent_word))
    print("The concordance of the word " + str(word))
    # concordance() prints its own output and returns None (per the NLTK
    # Text API), so wrapping it in print() only emitted a stray "None".
    text.concordance(word)
    text.dispersion_plot([word])
def main():
    """Run the NLTK book chapter 1 exercises (sections 1.1 to 1.3).

    Works through basic expressions, concordance / similar-word /
    common-context searches, dispersion plots, word counts and
    percentages, lexical diversity, list manipulation and frequency
    distributions, printing each result. Ends by calling text_stats()
    on text5.

    Relies on the names provided by ``from nltk.book import *``
    (text1..text5, sent1, sent3, FreqDist). Returns None.
    """
    # 1.1 Expressions
    print("SECTION 1.1")
    print("Expressions")
    # print(...) with a single argument behaves the same as the former
    # Python 2 print statement and is also valid Python 3.
    print(1 + 5 * 2 - 3)
    print(2 * 3 + 5 - 2 + (4 / 2))

    # 1.1 Concordance - Occurrence of a word in the text along with context.
    # concordance(), similar() and common_contexts() print their own output
    # and return None (per the NLTK Text API), so they are called directly
    # instead of being wrapped in print(), which only added "None" lines.
    print("Concordance of a word in text")
    print("Concordance of the word young in text1 - " + text1.name)
    text1.concordance("young")
    print("Concordance of the word affection in text2 - " + text2.name)
    text2.concordance("affection")
    print("Concordance of the word lived in text3 - " + text3.name)
    text3.concordance("lived")

    # Concordance for the word "love", limited to 5 lines of width 50
    text1.concordance("love", width=50, lines=5)

    # 1.1 Print all the words that occur in the same range of contexts as
    # the given word "extremely" in text2
    print("Words in same range of contexts as extremely in text2 - " + text2.name)
    text2.similar("extremely")

    # 1.1 Print the common contexts shared by the given words "extremely"
    # and "so" in text2
    print("Common contexts shared by words extremely and so in text2 - " + text2.name)
    text2.common_contexts(["extremely", "so"])

    # Lexical dispersion plot of the given words in text
    text2.dispersion_plot(['very', 'exceedingly', 'heartily', 'remarkably', 'monstrous', 'so'])
    text4.dispersion_plot(['liberty', 'constitution'])

    # Number and percentage of occurrence of the word 'lol' in text5.
    # The count is computed once and fed to the shared percentage() helper.
    count_lol = text5.count('lol')
    print("Number of occurrences of the word lol in text5: " + str(count_lol))
    percent_lol = percentage(count_lol, len(text5))
    print("Percentage of occurrence of the word lol in text5: " + str(percent_lol))

    # Lexical diversity of text3 and text5
    print("Lexical Diversity of text3: " + str(lexical_diversity(text3)))
    print("Lexical Diversity of text5: " + str(lexical_diversity(text5)))

    # Percentage of occurrence of the word 'a' in text4
    percent_a = percentage(text4.count('a'), len(text4))
    print("Percentage of occurrence of the word a in text4: " + str(percent_a))

    # 1.2 Lists
    # Define a list ex1
    ex1 = ['Courage', 'is', 'not', 'the', 'absence', 'of', 'fear', 'but', 'rather', 'the', 'judgement', 'that', 'something', 'else', 'is', 'more', 'important', 'than', 'fear']

    # Print the list, its sorted form and its length
    print("New List ex1: " + str(ex1))
    print("Sorted list of ex1: " + str(sorted(ex1)))
    print("Length of ex1: " + str(len(ex1)))

    # Print the vocabulary size of ex1
    print("Number of vocabulary items in ex1: " + str(len(set(ex1))))

    # Concatenate two lists
    print("Concatenating lists")
    print(['Courage', 'is', 'not', 'the', 'absence', 'of', 'fear', 'but', 'rather'] + ['the', 'judgement', 'that', 'something', 'else', 'is', 'more', 'important', 'than', 'fear'])
    print(sent1 + sent3)

    ex2 = ['What', 'Does', 'Not', 'Kill', 'Me', 'Makes', 'Me', 'Stronger']
    print("New list ex2: " + str(ex2))

    # Print the index of word 'Kill' in ex2
    print("Index of the word Kill in ex2: " + str(ex2.index('Kill')))

    # Print the item in index 3. (index starts from 0)
    print("Item in index 3 of ex2: " + str(ex2[3]))

    # Access sublists using slicing
    print("Access sublists using slicing: ")
    print(ex2[1:3])
    print(ex2[4:7])
    print(ex2[:3])
    print(ex2[3:])

    # Replace items in an index with another
    ex2[4] = 'you'
    ex2[6] = 'you'
    print("Replace some items in ex2: ")
    print(ex2)

    # Replace a sublist (slice) with another; the two-item slice is
    # replaced by a single item, so the list shrinks by one.
    print("Replacing a sublist(slice) in ex2")
    ex2[1:3] = ['Doesnt']
    print(ex2)

    # 1.3
    # Frequency Distribution - frequency of each vocabulary item in the text
    fdist = FreqDist(text2)

    # Store the vocabulary items of text2 in variable vocab. list() keeps
    # the slice below working when keys() returns a non-subscriptable view.
    vocab = list(fdist.keys())

    # Print the first 30 vocabulary items
    print("First 30 vocabulary items of text2 - " + text2.name)
    print(vocab[:30])

    # Print the number of occurrences of word 'weakness'
    print("Number of occurrences of the word weakness in text2: " + str(fdist['weakness']))

    # Cumulative frequency plot limited to 50 samples
    fdist.plot(50, cumulative=True)

    # Print a sorted list of all the words greater than length 14 in text2
    V = set(text2)
    long_words = [word for word in V if len(word) > 14]
    print("Sorted list of all words greater than length 14 in text2 - " + text2.name)
    print(sorted(long_words))

    print("Function to print text stats: ")
    text_stats(text5, 'hello')
# Run the walkthrough only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment