Skip to content

Instantly share code, notes, and snippets.

@raj-nandu
Last active May 8, 2018 16:20
Show Gist options
  • Save raj-nandu/22492e9800f2adfdd158562ee576df36 to your computer and use it in GitHub Desktop.
Save raj-nandu/22492e9800f2adfdd158562ee576df36 to your computer and use it in GitHub Desktop.
UnFound Assignment Gist
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pickle
wiki = "https://wogma.com/movies/basic/"
page = urlopen(wiki)
soup = BeautifulSoup(page,'lxml')
linkw = "https://wogma.com/"
all_links = soup.find_all("a")
print("Total Number of reviews = " + str(len(all_links)))
count = 1
movie_names = []
movie_tags = []
reviews = []
reviews_with_tags = []
for link in all_links:
if count % 10 == 0:
with open("dumped_data.pkl", 'wb') as data:
dumped_data = (movie_names, reviews, movie_tags, reviews_with_tags)
pickle.dump(dumped_data, data)
# if count >=100:
# break
if link.string == 'wogma review':
try:
review_link = linkw + link.get("href")
review = urlopen(review_link)
review_soup = BeautifulSoup(review, 'lxml')
output = ""
r = review_soup.find('div', class_='wogma-review')
paras = r.find_all('p')
for p in paras:
output += p.text
# print(output)
# tagging reviews
t = []
r = review_soup.find('div', {"id": 'parental_guidance'})
# print(r.text)
if r is not None:
tags = r.find_all('li')
print(str(tags))
# violence
if 'None' not in tags[0].text.split():
t.append('violence')
else:
t.append('non_violent')
# clean
if 'Clean' not in tags[1].text.split():
t.append('clean_language')
else:
t.append("swear_words")
if 'None' not in tags[2].text.split():
t.append('sexual_content')
else:
t.append('no_sexual_content')
# print(t)
# genre
div_genre = review_soup.find_all('div', class_='coloring')
temp = div_genre[-1].text
temp = temp.replace(',', '').lower()
temp = temp.strip('Genres:')
temp = temp.split()
temp = temp[1:]
# print(temp)
movie_tags.append(t+temp)
reviews.append(output)
movie_name = review_soup.find('h3', class_='title').text.lower()
print(movie_name)
movie_names.append(movie_name)
for i in t+temp:
output += " " + i
reviews_with_tags.append(output)
print("Movie Number = " + str(count))
count += 1
except:
print("Exception Occured")
dumped_data = (movie_names, reviews, movie_tags, reviews_with_tags)
with open("dumped_data.pkl", 'wb') as data:
pickle.dump(dumped_data, data)
print(str(movie_names))
print(str(reviews))
print(str(movie_tags))
print(str(reviews_with_tags))
# importing all the dependencies
import gensim
from gensim import corpora
from nltk.corpus import stopwords
import string
import pickle
from nltk.stem.wordnet import WordNetLemmatizer
#cleaning the dataset
with open("dumped_data.pkl", 'rb') as d:
data = pickle.load(d)
reviews = data[-1]
print(str(len(reviews)))
stopwords_set = set(stopwords.words('english'))
print(stopwords_set)
punctuations = set(string.punctuation)
print(punctuations)
lemmatizer = WordNetLemmatizer()
def clean_review(review):
# Removing stopwords
s = " ".join([word for word in review.lower().split() if word not in stopwords_set])
# Removing punctuations
p = ''.join(letter for letter in s if letter not in punctuations)
# Lemmatizing words eg. loves -> love
out = " ".join(lemmatizer.lemmatize(word) for word in p.split())
return out
cleaned_reviews = [clean_review(review).split() for review in reviews]
# length of cleaned reviews
len(cleaned_reviews)
# Creating a dictionary
dictionary = corpora.Dictionary(cleaned_reviews)
inp = [dictionary.doc2bow(review) for review in cleaned_reviews]
# Creating lda model
lda = gensim.models.ldamodel.LdaModel
ldamodel = lda(inp, num_topics=50, id2word=dictionary, passes=100)
a = ldamodel.print_topics(num_topics=15, num_words=50)
print(a)
#loading topics and inference
topic_dict = {}
topics = open('topics.txt',"r")
for line in topics:
a, b = line.split()
topic_dict[int(a)] = b
topics.close()
print(topic_dict)
test = "So, Airlift has one of Akshay Kumar's best performances - along with other sterling acts. Rich Indian-origin businessman Ranjit Katyal (Akshay) loves profits and dislikes the idea of India. But Ranjit's golden life in the desert crumbles when Iraqi forces invade Kuwait. Suddenly, people are savagely shot in the streets, houses looted, buildings blown up, tanks taking over, choppers hovering maliciously amidst minarets. Airlift features Akshay at his best - based on real-life characters, there's little khiladi-wala swag in Akshay's performance and more mature control. As Ranjit, who goes from protecting his kin to his countrymen, Akshay does a polished, restrained, powerful job. Nimrat conveys brittle edginess - she asks Ranjit, Ye Indian-Indian khelna, kya drama ho raha hai? - but grows into a woman who loves her husband's humanitarian heart. Certain cameos are outstanding. Prakash Belawadi brings alive surly, suspicious George, Kumud Mishra deeply impresses as a quietly determined MEA babu and Inaamulhuq oozes smooth menace as he quotes 'Amytabh Bachchan' to Ranjit - before showing him his business partner, hanging from a crane outside a palace full of bloodied marble and broken glass. Airlift's scale is impressive and editing (Hemanti Sarkar) deft. Some sequences - Iraqi soldiers brutally molesting an Indian girl, looting even onions, cheerily singing 'Ek Do Teen', dragging out a young mother, so the boys can have some fun with her - are intense. But the movie could've increased this intensity, the horror and taut, time-ticking dread that typify unforgettable siege/rescue films like Hotel Rwanda (2004), where you vividly felt humanity running out each second. Airlift depicts desperation but with more sound and light than darkness, more broad strokes than fine detail. Yet, Airlift works because it conveys a time when armies will attack civilians - you're struck by how IS was born from the Iraqi army's core - and raises Bollywood's generic bar. Plus, it movingly celebrates the most beautiful flag in the world."
test = clean_review(test).split()
#print(test)
test = dictionary.doc2bow(test)
a = list(sorted(ldamodel[test], key=lambda x: x[1]))
# Least related topic to the test review is the first element of the sorted list a
print("The words associated with least related topic to the test review are")
print(ldamodel.print_topic(a[0][0]))
if a[0][0] in topic_dict:
print(topic_dict[a[0][0]])
else:
print("Unknown topic")
# Most probable topic related to the test review is the last element of the sorted list a
print("The words associated with most probable topic related to the test review are")
print(ldamodel.print_topic(a[-1][0]))
if a[-1][0] in topic_dict:
print(topic_dict[a[-1][0]])
else:
print("Unknown topic")
48 marathi
33 musical
8 relationship
27 comedy
42 film
43 hindi
45 sexualcontent
13 violence
3 violent
44 animation
5 cleanlanguage
11 superheroes
24 action
22 hero
4 suspense
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment