Created
April 24, 2014 21:16
-
-
Save djds23/11269778 to your computer and use it in GitHub Desktop.
Instead of gathering videos by hand, this script takes a CSV of video descriptions and a JSON dump of the video database, then downloads the videos whose titles match.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import sets | |
import json | |
import string | |
import urllib | |
import operator | |
from datetime import datetime | |
# csv from client with celeb name and description
with open('master.csv', 'rb') as f:
    noteable = list(csv.reader(f.read().splitlines()))

# json file of video database
with open('data.json', 'rb') as j:
    json_obj = json.loads(j.read())
def prepare_text(text):
    '''Strip punctuation and filler words from text; return a list of lowercase keywords.'''
    # Drop punctuation one character at a time. This is portable across
    # Python 2 and 3, unlike the original string.maketrans("", "") +
    # two-argument str.translate, which only exists in Python 2.
    cleaned = ''.join(ch for ch in text if ch not in string.punctuation)
    keyword_list = [word.lower() for word in cleaned.split()]
    # Set gives O(1) membership tests; the original list also held 'and' twice.
    stop_words = set(['of', 'and', 'then', 'is', 'the', 'in', 'my', 'from', 'to'])
    return [w for w in keyword_list if w not in stop_words]
def search(json_obj, noteable):
    '''Match noteable people against video titles; download and report the matches.

    json_obj: parsed database dict containing an 'objects' list of video dicts.
    noteable: csv rows where row[1] is the name and row[3] the description.
    '''
    noteable = noteable[1:]  # remove the column headers from csv
    videos = json_obj['objects']  # access array of video objects
    match_list = []
    for person in noteable:
        # Name & Description field respectively; build the set once per person
        # instead of once per video in the inner loop.
        keywords = set(prepare_text(person[1] + ' ' + person[3]))
        # Iterate over a snapshot of the list: the original removed items from
        # the very list it was looping over, which makes Python skip the
        # element immediately following every match.
        for video in list(videos):
            title = prepare_text(video['title'].encode('ascii', 'ignore'))
            match_number = len(keywords.intersection(title))  # number of matching words
            if match_number >= 3:  # only keep videos that match 3 words or more
                video.update({'match number': match_number, 'person': person})
                match_list.append(video)
                videos.remove(video)  # each video may match at most one person
    sorted_match_list = sorted(match_list, key=operator.itemgetter('match number'), reverse=True)
    return build_download_links(sorted_match_list)
def build_download_links(sorted_match_list): | |
'''build a download link for the video''' | |
base_url = 'http://internal_server.video/archive/' | |
for video in sorted_match_list: | |
filename = video['source'].encode('ascii', 'ignore') | |
mezz_name = filename[:-4]+'-mezzanine.mp4' #grab the mezzanine not the source | |
url = base_url + video['content_dir'].encode('ascii', 'ignore') + '/' + mezz_name | |
print 'Downloading:', url | |
urllib.urlretrieve(url, mezz_name) | |
return final_report(sorted_match_list) | |
def final_report(sorted_match_list):
    '''Write the matched video records to a timestamped text file.

    Returns the filename written so callers can report or inspect it
    (previously returned None; existing callers ignored the value).
    '''
    # strftime gives a filesystem-safe timestamp: str(datetime.now()) embeds
    # spaces and colons, which are invalid in filenames on Windows.
    filename = 'refresh_' + datetime.now().strftime('%Y%m%d-%H%M%S') + '.txt'
    # Text mode instead of 'wb' (we write str, not bytes); the with-block
    # closes the file, so the original's explicit f.close() was redundant.
    with open(filename, 'w') as f:
        f.write(str(sorted_match_list))
    return filename
if __name__ == '__main__':
    # Run the full pipeline using the csv rows and database loaded at import time.
    search(json_obj, noteable)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment