Created
December 29, 2012 18:59
-
-
Save Reboare/4408690 to your computer and use it in GitHub Desktop.
Infer a movie's attributes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
Infer a movie's attributes by extracting data from the movie filename
@author: Josiah | |
''' | |
import re, datetime | |
from sys import argv, stdout | |
import logging | |
import os | |
# Module-level logger; every helper below reports what it matched through it.
logger = logging.getLogger(__name__)

# Lookup tables used for case-insensitive substring matching against a
# movie filename. Order matters: the first entry that occurs wins, so more
# specific names (e.g. "DVDRip", "HDDvd") are listed before "DVD".
VIDEO_CODECS = (
    "xvid",
    "x264",
    "h264",
    "divx",
)

RESOLUTIONS = (
    "720p",
    "1080p",
    "576p",
)

MEDIAS = (
    "DVDRip",
    "BRRip",
    "BluRay",
    "HDDvd",
    "HDTv",
    "BDRip",
    "VHS",
    "Screener",
    "R5",
    "DVD",
)

GROUPS = (
    "ESiR",
    "REFiNED",
    "SEPTiC",
    "HDChina",
    "HiDt",
    "WiKi",
    "HaB",
    "FraMeSToR",
    "LooKMaNe",
    "AMIABLE",
    "XTSF",
    "NODLABS",
    "iNFAMOUS",
    "BLiND",
    "CHD",
    "DIMENSION",
    "CiNEFiLE",
    "EbP",
    "SiNNERS",
    "SPARKS",
)

# NOTE(review): the third entry is an empty string, which is a substring of
# every filename -- confirm what it was meant to be before searching this
# tuple.
ATTRIBUTES = (
    "EC",
    "LIMITED",
    "",
    "NTSC",
    "Anniversary",
    "Edition",
    "Extended",
)
def _search(filename, list_search):
    """search a filename for occurrences of items contained in a given list

    The comparison is a case-insensitive substring test. Candidates are
    tried in the order given, so the result is the first entry of
    list_search that occurs anywhere in the filename (not the one that
    occurs earliest in the filename).

    Arguments:
    filename -- the movie filename
    list_search -- the sequence of candidate strings to look for
    Return Values:
    string -- the first candidate found, spelled as in list_search,
              or None when no candidate occurs"""
    lowered = filename.lower()  # lower-case once, not once per candidate
    # Empty candidates are skipped: "" is a substring of every string and
    # would otherwise always be reported as a match.
    match = next((value for value in list_search
                  if value and value.lower() in lowered), None)
    logger.debug("Attribute from %s matched %s", filename,
                 match if match is not None else "nothing")
    return match
def _source(filename):
    """look up the release medium (e.g. DVDRip, BluRay) named in a filename

    Arguments:
    filename -- the movie filename
    Return Values:
    string -- the matching entry of MEDIAS, or None when none occurs"""
    medium = _search(filename, MEDIAS)
    return medium
def _resolution(filename):
    """look up the video resolution (e.g. 720p, 1080p) named in a filename

    Arguments:
    filename -- the movie filename
    Return Values:
    string -- the matching entry of RESOLUTIONS, or None when none occurs"""
    resolution = _search(filename, RESOLUTIONS)
    return resolution
def _codec(filename):
    """look up the video codec (e.g. xvid, x264) named in a filename

    Arguments:
    filename -- the movie filename
    Return Values:
    string -- the matching entry of VIDEO_CODECS, or None when none occurs"""
    codec = _search(filename, VIDEO_CODECS)
    return codec
def _group(filename):
    """look up the release group (e.g. ESiR, SPARKS) named in a filename

    Arguments:
    filename -- the movie filename
    Return Values:
    string -- the matching entry of GROUPS, or None when none occurs"""
    group = _search(filename, GROUPS)
    return group
def _year(filename):
    """returns any date information in a movie filename

    Only standalone four-digit numbers are considered; the tail of a
    longer digit run (e.g. the "2012" in "102012") is not a year. When
    several candidates are present the last one is used. A candidate at
    the very start of the filename is rejected, since there it is taken
    to be part of the title rather than a release year.

    Arguments:
    filename -- the movie filename
    Return Values:
    int -- year in the range 1900 to current year + 1,
           or None when no usable year is present
    """
    max_year = datetime.datetime.now().year + 1
    # Lookbehind and lookahead together require exactly four digits.
    candidates = [found for found
                  in re.finditer(r"(?<![0-9])[0-9]{4}(?![0-9])", filename)
                  if 1900 <= int(found.group(0)) <= max_year]
    if not candidates:
        return None
    last = candidates[-1]
    if last.start() == 0:
        # The only usable number opens the filename: treat it as the title.
        return None
    result = int(last.group(0))
    logger.debug("Identified %s as being released in %i", filename, result)
    return result
def _multi_part(filename):
    """parses a given filename and extracts part number or disc number
    where appropriate

    A part marker is "cd" followed by optional spaces and one digit
    (e.g. "CD1"); a disc marker is "d" -- not preceded by "c" -- followed
    by optional spaces and one digit (e.g. "D 2"). Both are matched in any
    letter case and returned exactly as they appear in the filename.

    Arguments:
    filename -- the movie filename
    Return Values:
    tuple -- (disc, part), each the matched text or None when absent
    """
    # These need serious improvement: neither pattern is anchored to a word
    # boundary, so a "d<digit>" inside another token is also reported.
    part = r"[cC]+[dD]+[ ]*[0-9]{1}"
    # "[ ]*" allows optional spaces before the digit. The former "[]*[0-9]"
    # was parsed as a single character class containing "]", "*", "[" and
    # the digits, so "D 2" was missed and "d]" was accepted.
    disk = r"(?<![cC])[dD]+[ ]*[0-9]{1}"
    disc_match = re.search(disk, filename)
    part_match = re.search(part, filename)
    disc_num = disc_match.group(0) if disc_match is not None else None
    part_num = part_match.group(0) if part_match is not None else None
    logger.debug("Identified %s as having disc number %s and part number %s",
                 filename, disc_num, part_num)
    return disc_num, part_num
def infer(filename):
    """break a given movie file name into parts

    Every attribute is detected independently. The title is then the first
    non-empty stretch of the filename that precedes a detected attribute,
    with ".", "-", "_" and spaces turned into single separators and
    surrounding brackets/punctuation stripped. Attribute positions are
    located case-insensitively, matching how the attributes are detected.

    Arguments:
    filename -- the movie title
    Return Values:
    dictionary -- title : the movie title
    codec : the codec the file is encoded with
    resolution : the given resolution e.g. 720p, 1080p
    source : the movie source e.g. bluray, dvd
    year : the movie release date
    group : the encoder of the file
    disc : the disc number for a multi-disc movie
    part : the part number for a multi-part movie
    Attributes that could not be detected are None.
    """
    values = {"title": None,
              "codec": _codec(filename),
              "resolution": _resolution(filename),
              "source": _source(filename),
              "year": _year(filename),
              "group": _group(filename),
              "disc": None,
              "part": None}
    values['disc'], values['part'] = _multi_part(filename)

    def _tidy(fragment):
        # Normalise separators to spaces and trim leftover punctuation.
        return " ".join(re.split(r"[.\- _]", fragment)).strip("() ./")

    # Locate where each detected attribute sits in the filename. The search
    # ignores case because the attribute tables store a canonical spelling
    # ("BluRay") that may differ from the filename's ("bluray").
    spans = []
    for key, value in values.items():
        if key == "title" or value is None:
            continue
        found = re.search(re.escape(str(value)), filename, re.IGNORECASE)
        if found is not None:
            spans.append(found.span())
    # Sorting makes the result independent of dictionary iteration order.
    spans.sort()

    # Walk the attributes left to right and keep the first stretch of text
    # in front of one of them that still has content after tidying; this
    # skips attributes that open the filename.
    position = 0
    title = ""
    for start, end in spans:
        title = _tidy(filename[position:start])
        if title:
            break
        position = max(position, end)
    else:
        # No attribute had text in front of it: use whatever remains.
        title = _tidy(filename[position:])

    values['title'] = title
    logger.info("Movie title identified as %s", title)
    return values
def inferf(name, formatting = ("%s (%d)", "title, year")):
    """Return a nicely formatted string form of infer, as "<title> (<year>)".

    Hopefully at some point implement proper format arguments. For now the
    formatting parameter is accepted only to keep the signature stable and
    is not applied; the output layout is fixed.

    Arguments:
    name -- the movie filename, without directory or extension
    formatting -- reserved: a (format string, comma separated field names)
                  pair; currently ignored
    Return Values:
    string -- the inferred title followed by the inferred year in
              parentheses (the year is rendered as "None" when unknown)
    """
    inferred = infer(name)
    # The former locals() round trip only built a value that was never
    # used, and writing to locals() inside a function is not dependable
    # across Python versions, so it has been dropped.
    return "{0} ({1})".format(inferred["title"], inferred["year"])
def splittriple(filepath):
    """Break a filepath into its directory, base name and extension.

    Arguments:
    filepath -- the path to split
    Return Values:
    tuple -- (path, name, ext); ext keeps its leading dot and is empty
             when the file has no extension
    """
    directory, basename = os.path.split(filepath)
    return (directory,) + os.path.splitext(basename)
def rename_file(filepath):
    """Rename a movie file in place to its inferred "<title> (<year>)" name.

    The file stays in its directory and keeps its extension; only the base
    name is replaced by the result of inferf.

    Arguments:
    filepath -- path of the file to rename
    Raises:
    OSError -- propagated from os.rename when the rename fails
    """
    # splittriple returns an immutable tuple, so unpack it instead of
    # assigning into it.
    path, name, ext = splittriple(filepath)
    # os.path.join restores the separator between directory and file name
    # that a plain string concatenation would lose.
    destination = os.path.join(path, inferf(name) + ext)
    os.rename(filepath, destination)
def main():
    """Infer the attributes of the filename given as the first command line
    argument and print each detected one as "<attribute> : <value>".

    Attributes that could not be detected are omitted from the output.
    """
    inferred = infer(argv[1])
    for attribute in ["title", "year", "codec", "resolution",
                      "source", "disc", "part", "group"]:
        if inferred[attribute] is not None:
            # Single-argument call form: prints the same text whether
            # print is a statement (Python 2) or a function (Python 3).
            print("%s : %s" % (attribute, inferred[attribute]))
if __name__ == "__main__":
    # Script mode: print the inferred "<title> (<year>)" name for every
    # .mkv file in the hard-coded movie directory.
    import glob
    # Raw string so the backslashes of the Windows path are taken
    # literally instead of being read as (invalid) escape sequences.
    for movie in glob.glob(r"G:\Videos\Movies\*.mkv"):
        path, name, ext = splittriple(movie)
        # Single-argument call form works as both the Python 2 statement
        # and the Python 3 function.
        print(inferf(name))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment