Created
April 24, 2014 21:16
-
-
Save djds23/11269778 to your computer and use it in GitHub Desktop.
Instead of gathering videos by hand, this script takes a CSV of video descriptions and a JSON dump of the video database, then downloads the videos whose titles match.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import sets | |
import json | |
import string | |
import urllib | |
import operator | |
from datetime import datetime | |
# csv from client with celeb name and description
with open('master.csv', 'rb') as f:
    noteable = list(csv.reader(f.read().splitlines()))

# json file of video database
with open('data.json', 'rb') as j:
    json_obj = json.loads(j.read())
def prepare_text(text):
    '''Strip punctuation and filler words from text; return a list of lowercase keywords.'''
    # Drop punctuation one character at a time. This is portable across
    # Python 2 and 3, unlike the original string.maketrans("", "") +
    # two-argument str.translate, which only exists in Python 2.
    cleaned = ''.join(ch for ch in text if ch not in string.punctuation)
    keyword_list = [word.lower() for word in cleaned.split()]
    # Set gives O(1) membership tests; the original list also held 'and' twice.
    stop_words = set(['of', 'and', 'then', 'is', 'the', 'in', 'my', 'from', 'to'])
    return [w for w in keyword_list if w not in stop_words]
def search(json_obj, noteable):
    '''Match noteable people against video titles; download and report the matches.

    json_obj: parsed database dict containing an 'objects' list of video dicts.
    noteable: csv rows where row[1] is the name and row[3] the description.
    '''
    noteable = noteable[1:]  # remove the column headers from csv
    videos = json_obj['objects']  # access array of video objects
    match_list = []
    for person in noteable:
        # Name & Description field respectively; build the set once per person
        # instead of once per video in the inner loop.
        keywords = set(prepare_text(person[1] + ' ' + person[3]))
        # Iterate over a snapshot of the list: the original removed items from
        # the very list it was looping over, which makes Python skip the
        # element immediately following every match.
        for video in list(videos):
            title = prepare_text(video['title'].encode('ascii', 'ignore'))
            match_number = len(keywords.intersection(title))  # number of matching words
            if match_number >= 3:  # only keep videos that match 3 words or more
                video.update({'match number': match_number, 'person': person})
                match_list.append(video)
                videos.remove(video)  # each video may match at most one person
    sorted_match_list = sorted(match_list, key=operator.itemgetter('match number'), reverse=True)
    return build_download_links(sorted_match_list)
def build_download_links(sorted_match_list): | |
'''build a download link for the video''' | |
base_url = 'http://internal_server.video/archive/' | |
for video in sorted_match_list: | |
filename = video['source'].encode('ascii', 'ignore') | |
mezz_name = filename[:-4]+'-mezzanine.mp4' #grab the mezzanine not the source | |
url = base_url + video['content_dir'].encode('ascii', 'ignore') + '/' + mezz_name | |
print 'Downloading:', url | |
urllib.urlretrieve(url, mezz_name) | |
return final_report(sorted_match_list) | |
def final_report(sorted_match_list):
    '''Write the matched video records to a timestamped text file.

    Returns the filename written so callers can report or inspect it
    (previously returned None; existing callers ignored the value).
    '''
    # strftime gives a filesystem-safe timestamp: str(datetime.now()) embeds
    # spaces and colons, which are invalid in filenames on Windows.
    filename = 'refresh_' + datetime.now().strftime('%Y%m%d-%H%M%S') + '.txt'
    # Text mode instead of 'wb' (we write str, not bytes); the with-block
    # closes the file, so the original's explicit f.close() was redundant.
    with open(filename, 'w') as f:
        f.write(str(sorted_match_list))
    return filename
if __name__ == '__main__':
    # Run the full pipeline using the csv rows and database loaded at import time.
    search(json_obj, noteable)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment