M.Tech IR system
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 23 14:26:40 2020
@author: kets
"""
####################################################################################
# HTML PARSER
####################################################################################
import requests
import re
from bs4 import BeautifulSoup
class HtmlParser:
    def __init__(self, path):
        self.path = path

    def parse(self):
        print('Parsing corpus...')
        soup = BeautifulSoup(open(self.path, encoding="utf8"), "html.parser")
        text = soup.get_text()
        text = re.sub(r'\[[0-9]*\]', ' ', text)  # drop citation markers like [12]
        text = re.sub(r'\s+', ' ', text)         # collapse runs of whitespace
        text = text.lower()
        text = re.sub(r'\W', ' ', text)          # replace non-word characters with spaces
        text = re.sub(r'\s+', ' ', text)
        return text

    def print(self):
        print(self.parse())
####################################################################################
# NGRAM GENERATION
####################################################################################
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
# nltk.download('punkt')  # uncomment on first run; word_tokenize needs the punkt data
class Ngram:
    def __init__(self, text):
        self.tokenize_text = self.tokenize(text)

    def extract_ngrams(self, num):
        n_grams = ngrams(self.tokenize_text, num)
        return [' '.join(grams) for grams in n_grams]

    def extract_ngrams_with_stemming(self, num):
        # Stem each token with the Porter stemmer before forming ngrams.
        ps = PorterStemmer()
        self.stem_tokenize_text = [ps.stem(txt) for txt in self.tokenize_text]
        n_grams = ngrams(self.stem_tokenize_text, num)
        return [' '.join(grams) for grams in n_grams]

    def tokenize(self, text):
        print("Tokenizing words...")
        return word_tokenize(text)
####################################################################################
# PLOT GRAPH AND ANALYSIS
####################################################################################
import matplotlib
import matplotlib.pyplot as plt
import os
import pandas as pd
class Analysis:
    def __init__(self, fd):
        self.fd = fd

    def plot_logarithmic_distribution(self):
        # Dump (word, rank, frequency) rows to CSV, then plot frequency
        # against rank on a log-log scale.
        current_path = os.getcwd()
        row_list = []
        rank = 0
        for word, count in self.fd.most_common():
            rank += 1
            row_list.append({'Word': word, 'Rank': rank, 'Frequency': count})
        df = pd.DataFrame(row_list, columns=['Word', 'Rank', 'Frequency'])
        df.to_csv(os.path.join(current_path, 'zipflaw.csv'), index=False, encoding='utf-8')
        csv_data = pd.read_csv(os.path.join(current_path, 'zipflaw.csv'), encoding='utf-8')
        plt.loglog(csv_data['Rank'], csv_data['Frequency'])
        plt.xlabel('Rank of ngrams', fontsize=14, fontweight='bold')
        plt.ylabel('Frequency of ngrams', fontsize=14, fontweight='bold')
        plt.title('Ngram frequency against rank (log-log)')
        plt.grid(True)
        plt.show()

    def top_20_words(self):
        for word, count in self.fd.most_common(20):
            print(f' \t{word} {count}')

    def plot_top_20_word(self):
        self.fd.plot(20, title="Top 20 ngram distribution by rank (Zipf's law check)")

    def for_90_percent_coverage(self):
        # Count how many of the most frequent ngrams are needed to cover
        # 90% of all ngram occurrences in the corpus.
        covered = 0
        counter = 0
        expected_sum = self.ninety_percent_ngram()
        for _, count in self.fd.most_common():
            if expected_sum <= covered:
                break
            covered += count
            counter += 1
        return counter

    def ninety_percent_ngram(self):
        return round(self.fd.N() * 90 / 100)

    def total_ngram(self):
        # Total ngram occurrences (tokens).
        return self.fd.N()

    def unique_ngram(self):
        # Number of distinct ngrams (types).
        return self.fd.B()
####################################################################################
# ASSIGNMENT ANSWERS
####################################################################################
file = HtmlParser("../download/AB/wiki_00")
# file = HtmlParser("../download/sample")
text = file.parse()
ngram = Ngram(text)
####################################################################################
# FOR UNIGRAM
####################################################################################
print(f'\nQ1. Unigram analysis:\n ')
print(f' (a) Mention the total unique unigrams present in the corpus.')
unigrams = ngram.extract_ngrams(1)
ud = FreqDist(unigrams)
analysis = Analysis(ud)
print(f' Answer => {analysis.unique_ngram()}')
print(f' \n\tDescription =>')
print(f' \t1. Total number of ngrams: {analysis.total_ngram()}')
print(f' \t2. Top 20 words')
analysis.top_20_words()
print(f'\n (b) Plot the distribution of the unigram frequencies.')
analysis.plot_logarithmic_distribution()
analysis.plot_top_20_word()
print(f' \n (c) How many of the most frequent unigrams are required to cover 90% of the complete corpus?')
counter = analysis.for_90_percent_coverage()
print(f' Answer => {counter}')
print(f' \tDescription => ')
print(f' \tTotal number of ngrams: {analysis.total_ngram()}')
print(f' \t90% of total number of ngrams: {analysis.ninety_percent_ngram()}')
####################################################################################
# FOR BIGRAM
####################################################################################
print(f'\nQ2. Bigram analysis:\n ')
print(f' (a) Mention the total unique bigrams present in the corpus.')
bigrams = ngram.extract_ngrams(2)
bd = FreqDist(bigrams)
analysis = Analysis(bd)
print(f' Answer => {analysis.unique_ngram()}')
print(f' \n\tDescription =>')
print(f' \t1. Total number of ngrams: {analysis.total_ngram()}')
print(f' \t2. Top 20 words')
analysis.top_20_words()
print(f'\n (b) Plot the distribution of the bigram frequencies.')
analysis.plot_logarithmic_distribution()
analysis.plot_top_20_word()
print(f' \n (c) How many of the most frequent bigrams are required to cover 90% of the complete corpus?')
counter = analysis.for_90_percent_coverage()
print(f' Answer => {counter}')
print(f' \tDescription => ')
print(f' \tTotal number of ngrams: {analysis.total_ngram()}')
print(f' \t90% of total number of ngrams: {analysis.ninety_percent_ngram()}')
####################################################################################
# FOR TRIGRAM
####################################################################################
print(f'\nQ3. Trigram analysis:\n ')
print(f' (a) Mention the total unique trigrams present in the corpus.')
trigrams = ngram.extract_ngrams(3)
td = FreqDist(trigrams)
analysis = Analysis(td)
print(f' Answer => {analysis.unique_ngram()}')
print(f' \n\tDescription =>')
print(f' \t1. Total number of ngrams: {analysis.total_ngram()}')
print(f' \t2. Top 20 words')
analysis.top_20_words()
print(f'\n (b) Plot the distribution of the trigram frequencies.')
analysis.plot_logarithmic_distribution()
analysis.plot_top_20_word()
print(f' \n (c) How many of the most frequent trigrams are required to cover 90% of the complete corpus?')
counter = analysis.for_90_percent_coverage()
print(f' Answer => {counter}')
print(f' \tDescription => ')
print(f' \tTotal number of ngrams: {analysis.total_ngram()}')
print(f' \t90% of total number of ngrams: {analysis.ninety_percent_ngram()}')
####################################################################################
# AFTER PERFORMING STEMMING ON TOKEN
####################################################################################
print('Q4. Repeat Q1, Q2, and Q3 after performing the stemming process on the tokens.')
####################################################################################
# FOR UNIGRAM WITH STEMMING
####################################################################################
print(f'\nQ4->Q1. Unigram analysis:\n ')
print(f' (a) Mention the total unique unigrams present in the corpus.')
unigrams = ngram.extract_ngrams_with_stemming(1)
ud = FreqDist(unigrams)
analysis = Analysis(ud)
print(f' Answer => {analysis.unique_ngram()}')
print(f' \n\tDescription =>')
print(f' \t1. Total number of ngrams: {analysis.total_ngram()}')
print(f' \t2. Top 20 words')
analysis.top_20_words()
print(f'\n (b) Plot the distribution of the unigram frequencies.')
analysis.plot_logarithmic_distribution()
analysis.plot_top_20_word()
print(f' \n (c) How many of the most frequent unigrams are required to cover 90% of the complete corpus?')
counter = analysis.for_90_percent_coverage()
print(f' Answer => {counter}')
print(f' \tDescription => ')
print(f' \tTotal number of ngrams: {analysis.total_ngram()}')
print(f' \t90% of total number of ngrams: {analysis.ninety_percent_ngram()}')
####################################################################################
# FOR BIGRAM WITH STEMMING
####################################################################################
print(f'\nQ4->Q2. Bigram analysis:\n ')
print(f' (a) Mention the total unique bigrams present in the corpus.')
bigrams = ngram.extract_ngrams_with_stemming(2)
bd = FreqDist(bigrams)
analysis = Analysis(bd)
print(f' Answer => {analysis.unique_ngram()}')
print(f' \n\tDescription =>')
print(f' \t1. Total number of ngrams: {analysis.total_ngram()}')
print(f' \t2. Top 20 words')
analysis.top_20_words()
print(f'\n (b) Plot the distribution of the bigram frequencies.')
analysis.plot_logarithmic_distribution()
analysis.plot_top_20_word()
print(f' \n (c) How many of the most frequent bigrams are required to cover 90% of the complete corpus?')
counter = analysis.for_90_percent_coverage()
print(f' Answer => {counter}')
print(f' \tDescription => ')
print(f' \tTotal number of ngrams: {analysis.total_ngram()}')
print(f' \t90% of total number of ngrams: {analysis.ninety_percent_ngram()}')
####################################################################################
# FOR TRIGRAM WITH STEMMING
####################################################################################
print(f'\nQ4->Q3. Trigram analysis:\n ')
print(f' (a) Mention the total unique trigrams present in the corpus.')
trigrams = ngram.extract_ngrams_with_stemming(3)
td = FreqDist(trigrams)
analysis = Analysis(td)
print(f' Answer => {analysis.unique_ngram()}')
print(f' \n\tDescription =>')
print(f' \t1. Total number of ngrams: {analysis.total_ngram()}')
print(f' \t2. Top 20 words')
analysis.top_20_words()
print(f'\n (b) Plot the distribution of the trigram frequencies.')
analysis.plot_logarithmic_distribution()
analysis.plot_top_20_word()
print(f' \n (c) How many of the most frequent trigrams are required to cover 90% of the complete corpus?')
counter = analysis.for_90_percent_coverage()
print(f' Answer => {counter}')
print(f' \tDescription => ')
print(f' \tTotal number of ngrams: {analysis.total_ngram()}')
print(f' \t90% of total number of ngrams: {analysis.ninety_percent_ngram()}')
####################################################################################
# ZIPF'S LAW CLARIFICATION
####################################################################################
print(f"\nQ5. Briefly summarize and discuss the frequency distributions obtained in Q1 to Q4. Do these distributions approximately follow Zipf's law?")
zipf_answer = """
Answer=>
When words are ranked according to their frequencies and when the frequency is plotted against the rank,the result
is a logarithmic curve. Or as we plat graph on a log scale, the result is a straight line.
This confirm that it follows zipf's law approximatly.
"""
print(zipf_answer)
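# A quick numeric check of Zipf's law (a minimal sketch, not part of the original
# assignment output): Zipf predicts frequency proportional to 1/rank, i.e. a
# log-log slope near -1. This fits a line to the most recently built unigram
# FreqDist `ud` (the stemmed unigrams from Q4 at this point in the script).
import numpy as np
freqs = np.array(sorted(ud.values(), reverse=True), dtype=float)
ranks = np.arange(1, len(freqs) + 1)
slope, intercept = np.polyfit(np.log(ranks), np.log(freqs), 1)
print(f'Estimated log-log slope for unigrams: {slope:.2f} (Zipf predicts roughly -1)')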
####################################################################################
# LIBRARY AND TOOLS USED
####################################################################################
print('Q6. What library you used for tokenization and stemming? What were the underlying algorithms used by the library for these tasks?')
lib_answer ="""
NLTK library is used for tokenization and stemming.
For tokenization, it uses punkt word segmentation algorithm
For stemming, it uses Porter algorithm
"""
print(lib_answer)
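# A small standalone illustration of the two NLTK calls used above (the sample
# sentence is an assumed example, not drawn from the corpus):
sample = "The cats are running quickly"
tokens = word_tokenize(sample)        # Punkt sentence split + Treebank word tokenizer
ps = PorterStemmer()
print(tokens)                         # ['The', 'cats', 'are', 'running', 'quickly']
print([ps.stem(t) for t in tokens])   # ['the', 'cat', 'are', 'run', 'quickli']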
####################################################################################
# EXAMPLE OF LIMITATION OF TOKENIZATION
####################################################################################
print('Q7. Report three examples based on your observation, where the tool used for tokenization did not tokenize the character sequence properly. ')
tokenization_problems="""
1. Tokenization consider "'s" as separate word
eg. In the document word switzerland's is tokenize as ['switzerland', 's']
2. / is taken wrongly for 10/01
eg. <a href="Q%20%28magazine%29">"Q" magazine</a> (10/01, p. 152) - 4 stars out of 5 - "This remains an excellent starting point for this most curious band".
is tokenize as ['q', 'magazine', '10', '01', 'p', '152', '4', 'stars', 'out', 'of', '5', 'this', 'remains', 'an', 'excellent', 'starting', 'point', 'for', 'this', 'most', 'curious', 'band']
3. Tokenization consider , as separator for number
Number 12,500 BC is tokenize as ['12', '500', 'bc']
4. Hypen in word is taken wrongly
eg. word co-operation is tokenize as ['co', 'operation']
In an op-ed that he wrote for the <a href="The%20Economist">"Economist"</a> in 2003, ElBaradei outlined his idea for the future of the <a href="nuclear%20fuel%20cycle">nuclear fuel cycle</a>. His suggestion was to "limit the processing of weapon-usable material in civilian nuclear programs, as well as the production of new material, by agreeing to restrict these operations exclusively to facilities under multinational control." Also, "nuclear-energy systems should be deployed that, by design, avoid the use of materials that may be applied directly to making nuclear weapons." He concluded by saying that "considerable advantages would be gained from international co-operation in these stages of the nuclear-fuel cycle. These initiatives would not simply add more non-proliferation controls, to limit access to weapon-usable nuclear material; they would also provide access to the benefits of nuclear technology for more people in more countries."
"""