M.Tech IR system
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 23 14:26:40 2020
@author: kets
"""
####################################################################################
# HTML PARSER
####################################################################################
import requests
import re
from bs4 import BeautifulSoup
class HtmlParser:
    def __init__(self, path):
        self.path = path

    def parse(self):
        print('Parsing corpus...')
        soup = BeautifulSoup(open(self.path, encoding="utf8"), "html.parser")
        text = soup.get_text()
        text = re.sub(r'\[[0-9]*\]', ' ', text)  # drop citation markers like [12]
        text = re.sub(r'\s+', ' ', text)         # collapse runs of whitespace
        text = text.lower()
        text = re.sub(r'\W', ' ', text)          # replace non-word characters with spaces
        text = re.sub(r'\s+', ' ', text)
        return text

    def print(self):
        print(self.parse())
####################################################################################
# NGRAM GENERATION
####################################################################################
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
# nltk.download('punkt')  # uncomment on first run; word_tokenize needs the punkt data
class Ngram:
    def __init__(self, text):
        self.tokenize_text = self.tokenize(text)

    def extract_ngrams(self, num):
        n_grams = ngrams(self.tokenize_text, num)
        return [' '.join(grams) for grams in n_grams]

    def extract_ngrams_with_stemming(self, num):
        # Stem each token with the Porter stemmer before forming ngrams.
        ps = PorterStemmer()
        self.stem_tokenize_text = [ps.stem(txt) for txt in self.tokenize_text]
        n_grams = ngrams(self.stem_tokenize_text, num)
        return [' '.join(grams) for grams in n_grams]

    def tokenize(self, text):
        print("Tokenizing words...")
        return word_tokenize(text)
####################################################################################
# PLOT GRAPH AND ANALYSIS
####################################################################################
import matplotlib
import matplotlib.pyplot as plt
import os
import pandas as pd
class Analysis:
    def __init__(self, fd):
        self.fd = fd

    def plot_logarithmic_distribution(self):
        # Dump (word, rank, frequency) rows to CSV, then plot frequency
        # against rank on a log-log scale.
        current_path = os.getcwd()
        row_list = []
        rank = 0
        for word, count in self.fd.most_common():
            rank += 1
            row_list.append({'Word': word, 'Rank': rank, 'Frequency': count})
        df = pd.DataFrame(row_list, columns=['Word', 'Rank', 'Frequency'])
        df.to_csv(os.path.join(current_path, 'zipflaw.csv'), index=False, encoding='utf-8')
        csv_data = pd.read_csv(os.path.join(current_path, 'zipflaw.csv'), encoding='utf-8')
        plt.loglog(csv_data['Rank'], csv_data['Frequency'])
        plt.xlabel('Rank of ngrams', fontsize=14, fontweight='bold')
        plt.ylabel('Frequency of ngrams', fontsize=14, fontweight='bold')
        plt.title('Ngram frequency against rank (log-log)')
        plt.grid(True)
        plt.show()

    def top_20_words(self):
        for word, count in self.fd.most_common(20):
            print(f' \t{word} {count}')

    def plot_top_20_word(self):
        self.fd.plot(20, title="Top 20 ngram distribution by rank (Zipf's law check)")

    def for_90_percent_coverage(self):
        # Count how many of the most frequent ngrams are needed to cover
        # 90% of all ngram occurrences in the corpus.
        covered = 0
        counter = 0
        expected_sum = self.ninety_percent_ngram()
        for _, count in self.fd.most_common():
            if expected_sum <= covered:
                break
            covered += count
            counter += 1
        return counter

    def ninety_percent_ngram(self):
        return round(self.fd.N() * 90 / 100)

    def total_ngram(self):
        # Total ngram occurrences (tokens).
        return self.fd.N()

    def unique_ngram(self):
        # Number of distinct ngrams (types).
        return self.fd.B()
####################################################################################
# ASSIGNMENT ANSWERS
####################################################################################
file = HtmlParser("../download/AB/wiki_00")
# file = HtmlParser("../download/sample")
text = file.parse()
ngram = Ngram(text)
####################################################################################
# FOR UNIGRAM
####################################################################################
print(f'\nQ1. Unigram analysis:\n ')
print(f' (a) Mention the total unique unigrams present in the corpus.')
unigrams = ngram.extract_ngrams(1)
ud = FreqDist(unigrams)
analysis = Analysis(ud)
print(f' Answer => {analysis.unique_ngram()}')
print(f' \n\tDescription =>')
print(f' \t1. Total number of ngrams: {analysis.total_ngram()}')
print(f' \t2. Top 20 words')
analysis.top_20_words()
print(f'\n (b) Plot the distribution of the unigram frequencies.')
analysis.plot_logarithmic_distribution()
analysis.plot_top_20_word()
print(f' \n (c) How many of the most frequent unigrams are required to cover 90% of the complete corpus?')
counter = analysis.for_90_percent_coverage()
print(f' Answer => {counter}')
print(f' \tDescription => ')
print(f' \tTotal number of ngrams: {analysis.total_ngram()}')
print(f' \t90% of total number of ngrams: {analysis.ninety_percent_ngram()}')
####################################################################################
# FOR BIGRAM
####################################################################################
print(f'\nQ2. Bigram analysis:\n ')
print(f' (a) Mention the total unique bigrams present in the corpus.')
bigrams = ngram.extract_ngrams(2)
bd = FreqDist(bigrams)
analysis = Analysis(bd)
print(f' Answer => {analysis.unique_ngram()}')
print(f' \n\tDescription =>')
print(f' \t1. Total number of ngrams: {analysis.total_ngram()}')
print(f' \t2. Top 20 words')
analysis.top_20_words()
print(f'\n (b) Plot the distribution of the bigram frequencies.')
analysis.plot_logarithmic_distribution()
analysis.plot_top_20_word()
print(f' \n (c) How many of the most frequent bigrams are required to cover 90% of the complete corpus?')
counter = analysis.for_90_percent_coverage()
print(f' Answer => {counter}')
print(f' \tDescription => ')
print(f' \tTotal number of ngrams: {analysis.total_ngram()}')
print(f' \t90% of total number of ngrams: {analysis.ninety_percent_ngram()}')
####################################################################################
# FOR TRIGRAM
####################################################################################
print(f'\nQ3. Trigram analysis:\n ')
print(f' (a) Mention the total unique trigrams present in the corpus.')
trigrams = ngram.extract_ngrams(3)
td = FreqDist(trigrams)
analysis = Analysis(td)
print(f' Answer => {analysis.unique_ngram()}')
print(f' \n\tDescription =>')
print(f' \t1. Total number of ngrams: {analysis.total_ngram()}')
print(f' \t2. Top 20 words')
analysis.top_20_words()
print(f'\n (b) Plot the distribution of the trigram frequencies.')
analysis.plot_logarithmic_distribution()
analysis.plot_top_20_word()
print(f' \n (c) How many of the most frequent trigrams are required to cover 90% of the complete corpus?')
counter = analysis.for_90_percent_coverage()
print(f' Answer => {counter}')
print(f' \tDescription => ')
print(f' \tTotal number of ngrams: {analysis.total_ngram()}')
print(f' \t90% of total number of ngrams: {analysis.ninety_percent_ngram()}')
####################################################################################
# AFTER PERFORMING STEMMING ON TOKEN
####################################################################################
print('Q4. Repeat Q1, Q2, and Q3 after performing the stemming process on the tokens.')
####################################################################################
# FOR UNIGRAM WITH STEMMING
####################################################################################
print(f'\nQ4->Q1. Unigram analysis:\n ')
print(f' (a) Mention the total unique unigrams present in the corpus.')
unigrams = ngram.extract_ngrams_with_stemming(1)
ud = FreqDist(unigrams)
analysis = Analysis(ud)
print(f' Answer => {analysis.unique_ngram()}')
print(f' \n\tDescription =>')
print(f' \t1. Total number of ngrams: {analysis.total_ngram()}')
print(f' \t2. Top 20 words')
analysis.top_20_words()
print(f'\n (b) Plot the distribution of the unigram frequencies.')
analysis.plot_logarithmic_distribution()
analysis.plot_top_20_word()
print(f' \n (c) How many of the most frequent unigrams are required to cover 90% of the complete corpus?')
counter = analysis.for_90_percent_coverage()
print(f' Answer => {counter}')
print(f' \tDescription => ')
print(f' \tTotal number of ngrams: {analysis.total_ngram()}')
print(f' \t90% of total number of ngrams: {analysis.ninety_percent_ngram()}')
####################################################################################
# FOR BIGRAM WITH STEMMING
####################################################################################
print(f'\nQ4->Q2. Bigram analysis:\n ')
print(f' (a) Mention the total unique bigrams present in the corpus.')
bigrams = ngram.extract_ngrams_with_stemming(2)
bd = FreqDist(bigrams)
analysis = Analysis(bd)
print(f' Answer => {analysis.unique_ngram()}')
print(f' \n\tDescription =>')
print(f' \t1. Total number of ngrams: {analysis.total_ngram()}')
print(f' \t2. Top 20 words')
analysis.top_20_words()
print(f'\n (b) Plot the distribution of the bigram frequencies.')
analysis.plot_logarithmic_distribution()
analysis.plot_top_20_word()
print(f' \n (c) How many of the most frequent bigrams are required to cover 90% of the complete corpus?')
counter = analysis.for_90_percent_coverage()
print(f' Answer => {counter}')
print(f' \tDescription => ')
print(f' \tTotal number of ngrams: {analysis.total_ngram()}')
print(f' \t90% of total number of ngrams: {analysis.ninety_percent_ngram()}')
####################################################################################
# FOR TRIGRAM WITH STEMMING
####################################################################################
print(f'\nQ4->Q3. Trigram analysis:\n ')
print(f' (a) Mention the total unique trigrams present in the corpus.')
trigrams = ngram.extract_ngrams_with_stemming(3)
td = FreqDist(trigrams)
analysis = Analysis(td)
print(f' Answer => {analysis.unique_ngram()}')
print(f' \n\tDescription =>')
print(f' \t1. Total number of ngrams: {analysis.total_ngram()}')
print(f' \t2. Top 20 words')
analysis.top_20_words()
print(f'\n (b) Plot the distribution of the trigram frequencies.')
analysis.plot_logarithmic_distribution()
analysis.plot_top_20_word()
print(f' \n (c) How many of the most frequent trigrams are required to cover 90% of the complete corpus?')
counter = analysis.for_90_percent_coverage()
print(f' Answer => {counter}')
print(f' \tDescription => ')
print(f' \tTotal number of ngrams: {analysis.total_ngram()}')
print(f' \t90% of total number of ngrams: {analysis.ninety_percent_ngram()}')
####################################################################################
# ZIPF'S LAW CLARIFICATION
####################################################################################
print(f"\nQ5. Briefly summarize and discuss the frequency distributions obtained in Q1 to Q4. Do these distributions approximately follow Zipf's law?")
zipf_answer = """
Answer=>
When words are ranked according to their frequencies and when the frequency is plotted against the rank,the result
is a logarithmic curve. Or as we plat graph on a log scale, the result is a straight line.
This confirm that it follows zipf's law approximatly.
"""
print(zipf_answer)
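# A quick numeric check of Zipf's law (a minimal sketch, not part of the original
# assignment output): Zipf predicts frequency proportional to 1/rank, i.e. a
# log-log slope near -1. This fits a line to the most recently built unigram
# FreqDist `ud` (the stemmed unigrams from Q4 at this point in the script).
import numpy as np
freqs = np.array(sorted(ud.values(), reverse=True), dtype=float)
ranks = np.arange(1, len(freqs) + 1)
slope, intercept = np.polyfit(np.log(ranks), np.log(freqs), 1)
print(f'Estimated log-log slope for unigrams: {slope:.2f} (Zipf predicts roughly -1)')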
####################################################################################
# LIBRARY AND TOOLS USED
####################################################################################
print('Q6. What library you used for tokenization and stemming? What were the underlying algorithms used by the library for these tasks?')
lib_answer ="""
NLTK library is used for tokenization and stemming.
For tokenization, it uses punkt word segmentation algorithm
For stemming, it uses Porter algorithm
"""
print(lib_answer)
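# A small standalone illustration of the two NLTK calls used above (the sample
# sentence is an assumed example, not drawn from the corpus):
sample = "The cats are running quickly"
tokens = word_tokenize(sample)        # Punkt sentence split + Treebank word tokenizer
ps = PorterStemmer()
print(tokens)                         # ['The', 'cats', 'are', 'running', 'quickly']
print([ps.stem(t) for t in tokens])   # ['the', 'cat', 'are', 'run', 'quickli']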
####################################################################################
# EXAMPLE OF LIMITATION OF TOKENIZATION
####################################################################################
print('Q7. Report three examples based on your observation, where the tool used for tokenization did not tokenize the character sequence properly. ')
tokenization_problems="""
1. Tokenization consider "'s" as separate word
eg. In the document word switzerland's is tokenize as ['switzerland', 's']
2. / is taken wrongly for 10/01
eg. <a href="Q%20%28magazine%29">"Q" magazine</a> (10/01, p. 152) - 4 stars out of 5 - "This remains an excellent starting point for this most curious band".
is tokenize as ['q', 'magazine', '10', '01', 'p', '152', '4', 'stars', 'out', 'of', '5', 'this', 'remains', 'an', 'excellent', 'starting', 'point', 'for', 'this', 'most', 'curious', 'band']
3. Tokenization consider , as separator for number
Number 12,500 BC is tokenize as ['12', '500', 'bc']
4. Hypen in word is taken wrongly
eg. word co-operation is tokenize as ['co', 'operation']
In an op-ed that he wrote for the <a href="The%20Economist">"Economist"</a> in 2003, ElBaradei outlined his idea for the future of the <a href="nuclear%20fuel%20cycle">nuclear fuel cycle</a>. His suggestion was to "limit the processing of weapon-usable material in civilian nuclear programs, as well as the production of new material, by agreeing to restrict these operations exclusively to facilities under multinational control." Also, "nuclear-energy systems should be deployed that, by design, avoid the use of materials that may be applied directly to making nuclear weapons." He concluded by saying that "considerable advantages would be gained from international co-operation in these stages of the nuclear-fuel cycle. These initiatives would not simply add more non-proliferation controls, to limit access to weapon-usable nuclear material; they would also provide access to the benefits of nuclear technology for more people in more countries."
"""