Skip to content

Instantly share code, notes, and snippets.

@djds23
Created April 24, 2014 21:16
Show Gist options
  • Save djds23/11269778 to your computer and use it in GitHub Desktop.
Save djds23/11269778 to your computer and use it in GitHub Desktop.
Instead of gathering videos by hand, this script will take a csv with video descriptions, and a json file of the database, then download the videos that match.
import csv
import sets
import json
import string
import urllib
import operator
from datetime import datetime
# --- Script inputs, loaded at import time ---
# master.csv: client-supplied CSV; each row carries a celebrity name and a
# description (columns 1 and 3 are read by search()).
with open('master.csv', 'rb') as f:
    # csv.reader accepts the file object directly; no need to read the whole
    # file and splitlines() it first.
    noteable = list(csv.reader(f))

# data.json: JSON dump of the video database; expected to contain an
# 'objects' array of video dicts (see search()).
with open('data.json', 'rb') as j:
    # json.load reads and parses in one step.
    json_obj = json.load(j)
def prepare_text(text):
    '''Turn free-form text into a list of lower-cased keywords.

    Punctuation is stripped and common filler (stop) words are dropped so
    that only meaningful words take part in the title matching.

    :param text: str of free-form text (e.g. name + description).
    :returns: list of lower-cased keyword strings, original order preserved.
    '''
    # One-pass punctuation strip; replaces the Python-2-only
    # string.maketrans / two-argument str.translate combination.
    cleaned = ''.join(ch for ch in text if ch not in string.punctuation)
    keywords = [word.lower() for word in cleaned.split()]
    # Filler words that would produce spurious matches. A set gives O(1)
    # membership tests; the original list also contained 'and' twice.
    stop_words = {'of', 'and', 'then', 'is', 'the', 'in', 'my', 'from', 'to'}
    return [w for w in keywords if w not in stop_words]
def search(json_obj, noteable):
    '''Match each noteable person against the video titles and download hits.

    :param json_obj: parsed JSON dict with an 'objects' array of video dicts
                     (each having at least a 'title' key).
    :param noteable: CSV rows (including header); row[1] is the person's
                     name and row[3] the description.
    :returns: result of build_download_links() on the matches, best first.
    '''
    noteable = noteable[1:]        # drop the CSV column-header row
    videos = json_obj['objects']   # array of video objects
    match_list = []
    for person in noteable:
        # Name & Description fields respectively.
        keywords = prepare_text(person[1] + ' ' + person[3])
        # BUG FIX: the original removed items from the very list it was
        # iterating, which silently skips the element following every match.
        # Iterate a snapshot instead; removals still hit the shared list.
        for video in list(videos):
            title = video['title'].encode('ascii', 'ignore')
            title = prepare_text(title)
            # Number of keywords the title shares with this person.
            match_number = len(set(keywords).intersection(title))
            if match_number >= 3:  # only keep videos matching 3+ words
                video.update({'match number': match_number, 'person': person})
                match_list.append(video)
                videos.remove(video)  # a video can match only one person
    sorted_match_list = sorted(match_list,
                               key=operator.itemgetter('match number'),
                               reverse=True)
    return build_download_links(sorted_match_list)
def build_download_links(sorted_match_list):
'''build a download link for the video'''
base_url = 'http://internal_server.video/archive/'
for video in sorted_match_list:
filename = video['source'].encode('ascii', 'ignore')
mezz_name = filename[:-4]+'-mezzanine.mp4' #grab the mezzanine not the source
url = base_url + video['content_dir'].encode('ascii', 'ignore') + '/' + mezz_name
print 'Downloading:', url
urllib.urlretrieve(url, mezz_name)
return final_report(sorted_match_list)
def final_report(sorted_match_list):
    '''Write the matched videos to a timestamped text file.

    :param sorted_match_list: matched video dicts, best match first.
    '''
    # str(datetime.now()) embeds spaces and colons, and colons are illegal
    # in Windows filenames -- use an explicit, filesystem-safe strftime stamp.
    stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    filename = 'refresh_' + stamp + '.txt'
    # 'w' (text) mode, since we write a str. The with-block closes the file;
    # the original's explicit f.close() inside it was redundant.
    with open(filename, 'w') as f:
        f.write(str(sorted_match_list))
if __name__=='__main__':
    # Run the full pipeline on the files loaded at import time:
    # match people to videos, download the matches, write the report.
    search(json_obj, noteable)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment