Last active
April 6, 2022 03:23
-
-
Save yssharma/8269000 to your computer and use it in GitHub Desktop.
Python script that fetches movie ratings from IMDb (via the OMDb API) and saves them to a CSV file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import contextlib
import csv
import errno
import io
import json
import operator
import os
import re
import subprocess
import sys
import urllib2
def getMovieList(moviedir_path, movielist_path):
    """Write a recursive ``ls -R`` listing of a directory to a file.

    Args:
        moviedir_path: Directory whose contents are listed recursively.
        movielist_path: File that receives the raw listing; it is
            overwritten if it already exists.

    Raises:
        OSError: If the ``ls`` executable cannot be started or the output
            file cannot be opened.
    """
    print('INFO: Creating Movie list.')
    # Use the function's own argument: the previous body read a module-level
    # global with a slightly different spelling and therefore only worked
    # when that global happened to be defined by the calling script.
    listing, _ = subprocess.Popen(
        ["ls", "-R", moviedir_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE).communicate()
    # The pipe yields bytes, so write them back in binary mode; anything ls
    # printed on stderr is captured and intentionally discarded, as before.
    with open(movielist_path, "wb") as logfile:
        logfile.write(listing)
def cleanMovieList(movielist_path, cleanmovielist_path, scanonly_videofile_name='N'):
    """Reduce raw listing lines to bare movie names, one per output line.

    Each input line is lower-cased; bracketed ``[...]`` and parenthesised
    ``(...)`` groups and every non-word character are replaced by spaces.
    The remaining tokens are then filtered:

    * a 4-character token containing a digit ends the name (everything
      after it is dropped);
    * release tags, file extensions and tokens of 13+ characters are
      skipped.

    Args:
        movielist_path: File with one raw listing entry per line.
        cleanmovielist_path: File the cleaned names are appended to.
        scanonly_videofile_name: ``'Y'`` to keep only lines whose last
            token is a known media extension; any other value keeps all
            non-blank lines.
    """
    print('INFO: Cleaning Movie names.')
    junkwords = frozenset([
        'dvdrip', 'x264', 'txt', 'srt',
        'brrip', '480p', 'aac', 'mgb', 'bluray',
        'ts', 'cam', 'camrip', 'dts', 's4a', 'subs',
        'hdtvrip', 'hdrip',
    ])
    allowedfileformats = frozenset([
        "3g2", "3gp", "asf", "asx", "avi", "flv", "m4v",
        "mov", "mp4", "mpg", "rm", "srt", "swf", "vob",
        "wmv", "mkv", "xvid", "divx",
    ])
    # Compiled once instead of on every line.
    noise = re.compile(r'\[.*?\]|\(.*?\)|\W')
    has_number = re.compile(r'\d+(\.\d+)?')
    videos_only = scanonly_videofile_name == 'Y'

    # "with" guarantees both handles are closed even if a line fails.
    with open(movielist_path, 'r') as infile, \
            open(cleanmovielist_path, "a") as outfile:
        for line in infile:
            tokens = noise.sub(' ', line.lower()).split()
            if not tokens:
                continue
            if videos_only and tokens[-1] not in allowedfileformats:
                continue

            kept = []
            for word in tokens:
                # Treated as the release year: the title ends here.
                if len(word) == 4 and has_number.search(word):
                    break
                if word in junkwords or word in allowedfileformats:
                    continue
                if len(word) >= 13:
                    continue
                kept.append(word)
            # A line whose first token is year-like yields an empty line,
            # matching the previous behaviour.
            outfile.write(' '.join(kept) + '\n')
def _fetch_omdb_record(title):
    """Ask OMDb for *title* and return the decoded JSON object as a dict."""
    try:
        from urllib2 import quote, urlopen  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3
        from urllib.request import urlopen
    finalurl = 'http://www.omdbapi.com/?t=' + quote(title.encode('utf-8'))
    # closing() releases the connection on both Python 2 and 3.
    with contextlib.closing(urlopen(finalurl)) as response:
        return json.loads(response.read().decode('utf-8'))


def _csv_line(fields):
    """Join *fields* into one CSV line, quoting fields that require it."""
    cells = []
    for field in fields:
        if any(ch in field for ch in u',"\r\n'):
            field = u'"' + field.replace(u'"', u'""') + u'"'
        cells.append(field)
    return u','.join(cells) + u'\n'


def _append_lines(path, lines):
    """Append *lines* to *path* as UTF-8 and empty the buffer in place."""
    with io.open(path, 'a', encoding='utf-8') as handle:
        handle.writelines(lines)
    del lines[:]


def getRatings(cleanmovielist_path, movierating_path,
               movierating_notfound_path, lookup=None):
    """Look up every cleaned movie name and record its rating.

    Names that resolve are appended to ``movierating_path`` as
    ``Title,Year,imdbRating`` rows (one row per distinct ``imdbID``);
    names that do not resolve are appended to
    ``movierating_notfound_path`` as ``name,N/A,Not Found``. Both files are
    created even when nothing is written to them. Progress dots are
    printed to stdout, with a running total every 20 names.

    Args:
        cleanmovielist_path: File with one movie name per line; blank
            lines are ignored.
        movierating_path: CSV file that found ratings are appended to.
        movierating_notfound_path: CSV file that unresolved names are
            appended to.
        lookup: Optional callable taking a title and returning a dict with
            the keys ``Title``, ``Year``, ``imdbRating`` and ``imdbID`` (a
            dict without a truthy ``Title`` means "not found"). Defaults
            to an HTTP request against omdbapi.com.
            NOTE(review): OMDb currently appears to require an ``apikey``
            query parameter; confirm, and pass a custom ``lookup`` if so.

    Raises:
        Whatever ``lookup`` raises (for the default: network and JSON
        decoding errors). Rows buffered so far are still written out.
    """
    print('INFO: Fetching ratings.')
    if lookup is None:
        lookup = _fetch_omdb_record

    batch_size = 20
    seen_ids = set()
    found = []
    missing = []
    count = 0
    try:
        with io.open(cleanmovielist_path, 'r', encoding='utf-8') as infile:
            for raw in infile:
                title = raw.strip()
                if not title:
                    continue

                data = lookup(title)
                if data.get('Title'):
                    imdb_id = data.get('imdbID')
                    if imdb_id is not None:
                        # Different spellings can resolve to the same
                        # movie; keep only the first hit.
                        if imdb_id in seen_ids:
                            continue
                        seen_ids.add(imdb_id)
                    found.append(_csv_line([
                        data['Title'],
                        data.get('Year', u'N/A'),
                        data.get('imdbRating', u'N/A'),
                    ]))
                else:
                    missing.append(_csv_line([title, u'N/A', u'Not Found']))

                # Flush in batches so a long run keeps memory bounded.
                if len(found) >= batch_size:
                    _append_lines(movierating_path, found)
                if len(missing) >= batch_size:
                    _append_lines(movierating_notfound_path, missing)

                count += 1
                sys.stdout.write('. ')
                if count % batch_size == 0:
                    sys.stdout.write('%d Names Scanned\n' % count)
    finally:
        # Always write what is buffered (and create both files), even if a
        # lookup failed part-way through.
        _append_lines(movierating_path, found)
        _append_lines(movierating_notfound_path, missing)
    print('\n')
def _rating_sort_key(row):
    """Return the numeric rating in column 3, or -inf when unavailable."""
    try:
        return float(row[2])
    except (IndexError, ValueError):
        # Short rows and non-numeric ratings such as "N/A" sort last.
        return float('-inf')


def sortlist(movierating_path, movierating_path2):
    """Copy a ratings CSV to a second file, ordered best rating first.

    Rows are ordered by the numeric value of their third column, highest
    first; rows whose rating is missing or not a number go to the end.
    Rows with equal ratings keep their original relative order. Blank
    rows are dropped.

    Args:
        movierating_path: CSV file to read (``Title,Year,Rating`` rows).
        movierating_path2: CSV file the sorted rows are appended to.
    """
    print('INFO: Sorting Data.')
    with open(movierating_path) as source:
        rows = [row for row in csv.reader(source, delimiter=",") if row]
    # Compare as numbers: as plain strings "9.3" would outrank "10.0" and
    # "N/A" would outrank every real rating.
    rows.sort(key=_rating_sort_key, reverse=True)
    with open(movierating_path2, "a") as outpath:
        csv.writer(outpath).writerows(rows)
def deletefiles(files):
    """Remove each listed file, silently skipping ones that do not exist.

    Args:
        files: Iterable of file paths to delete.

    Raises:
        OSError: If a path exists but cannot be removed (for example a
            directory or a permission problem).
    """
    print('INFO: Removing temp files.')
    for filename in files:
        # Try the removal directly instead of checking existence first, so
        # a file vanishing between check and removal cannot cause a crash.
        try:
            os.remove(filename)
        except OSError as exc:
            if exc.errno != errno.ENOENT:
                raise
if __name__ == '__main__':
    # Directory that is scanned for movies.
    moviesdir_path = '/media/yash/fb32bb89-17c2-4d8a-8c91-2f7910434834/English'

    # Intermediate files, removed again once the run has finished.
    movielist_path = '/home/yash/Desktop/Movie_List.txt'
    cleanmovielist_path = '/home/yash/Desktop/CleanMovie_List.txt'
    movierating_path = '/home/yash/Desktop/MovieRatings_temp.csv'

    # Final results.
    movierating_path_sorted = '/home/yash/Desktop/MovieRatings.csv'
    movierating_notfound_path = '/home/yash/Desktop/MovieRatings_NotFound.csv'

    # Flag to scan only video files ("Y") or all files in dir ("N").
    scanonly_videofile_name = "N"

    # Clear leftovers from an earlier run before any step writes output.
    previous_outputs = [
        movielist_path,
        cleanmovielist_path,
        movierating_path,
        movierating_notfound_path,
        movierating_path_sorted,
    ]
    deletefiles(previous_outputs)

    # Pipeline: list -> clean -> rate -> sort.
    getMovieList(moviesdir_path, movielist_path)
    cleanMovieList(movielist_path, cleanmovielist_path, scanonly_videofile_name)
    getRatings(cleanmovielist_path, movierating_path, movierating_notfound_path)
    sortlist(movierating_path, movierating_path_sorted)

    # Keep only the sorted ratings and the not-found report.
    intermediate_files = [movielist_path, cleanmovielist_path, movierating_path]
    deletefiles(intermediate_files)

    print('Over N Out !!')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment