@alxrz
Last active August 21, 2016 18:57
-q "query" - Search term
-p 1 - <optional> number of pages to scrape (50 results per page)
-r playlist,channel - <optional> recursively extract videos from channels or playlists, default is off (be careful some channels / playlists may have thousands of videos, api is slow to process all)
--headers true - print with headers or not (false value)
Examples:
to print to the console:
python youtube_search.py -q "async await" -p 1 -r playlist --headers false
to write to a file instead (CSV works best; the file doesn't have to exist), add a redirect at the end:
python youtube_search.py -q "async await" -p 1 -r playlist --headers false > some_file.csv
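To load the resulting CSV back into Python later, a minimal sketch (it assumes the some_file.csv name from the example above and, like the script, targets Python 2; note that with -r the file also contains the blank lines and "Playlist: ..." marker lines the script prints between sections):

import csv

with open("some_file.csv", "rb") as f:  # "rb" is the recommended mode for the csv module on Python 2
    for row in csv.reader(f):
        print row  # a plain search row is [type, title, id]; playlist dumps add channel title and channel id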
Running from the command line (Windows):
Win + R > type cmd and press Enter > use "cd folder_name" to navigate to the folder containing the script
Requirements:
install Python (version <3.0; the script uses Python 2 syntax and hasn't been tested with higher versions)
install pip, the Python package manager (http://stackoverflow.com/questions/4750806/how-do-i-install-pip-on-windows)
install the Google API client library, which provides the apiclient module the script imports: "pip install --upgrade google-api-python-client"
if you get an error about argparse, also run: "pip install --upgrade argparse"
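To check that the client library and your API key work before running the full script, here is a minimal sketch (YOUR_API_KEY is a placeholder for your own key; it issues a single one-result search and prints the first title):

from apiclient.discovery import build

youtube = build("youtube", "v3", developerKey="YOUR_API_KEY")
response = youtube.search().list(q="test", part="id,snippet", maxResults=1).execute()
print response["items"][0]["snippet"]["title"]  # any title printed means the key and library are set up

The full script (youtube_search.py) follows: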
from apiclient.discovery import build
from apiclient.errors import HttpError
import argparse
from math import ceil
import csv
import sys
# Python 2 hack: default the encoding to utf-8 so unicode titles can be written to stdout / the CSV
reload(sys)
sys.setdefaultencoding('utf-8')
DEVELOPER_KEY = "AIzaSyCM2TfAq7A7u8bGiNtpBquVJR1QZtyrmNQ"
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                developerKey=DEVELOPER_KEY)

def youtube_search(options):
    videos = []
    page = 0
    search_request = youtube.search().list(
        q=options.query,
        part="id,snippet",
        maxResults=50
    )
    # maps a result's kind to the key that holds its id in the response
    type_mapping = {
        "youtube#video": "videoId",
        "youtube#channel": "channelId",
        "youtube#playlist": "playlistId"
    }
    channels = []
    playlists = []
    playlist_titles = []
    while search_request and page < options.pages:
        search_response = search_request.execute()
        for sr in search_response.get("items", []):
            kind = sr["id"]["kind"]
            rid = sr["id"][type_mapping[kind]]
            videos.append([kind, sr["snippet"]["title"], rid])
            # remember playlists / channels so they can be expanded later with -r
            if kind == "youtube#playlist":
                playlists.append(rid)
                playlist_titles.append(sr["snippet"]["title"])
            elif kind == "youtube#channel":
                channels.append(rid)
        search_request = youtube.search().list_next(search_request, search_response)
        page += 1
    wr = csv.writer(sys.stdout, delimiter=',', lineterminator='\n')
    headers = ['type', 'title', 'id']
    if options.headers:
        if options.recursive:
            headers.extend(['channel title', 'channel id'])
        wr.writerow(headers)
    wr.writerows(videos)
    if options.recursive and 'channel' in options.recursive:
        print_videos_from_channels(channels)
    if options.recursive and 'playlist' in options.recursive:
        print_videos_from_playlists(playlists, playlist_titles)

### gets all playlists from a channel and prints out ###
# returns nothing
# could be faster by using search with channelId param and type youtube#video
def print_videos_from_channels(channels=[]):
    for channel in channels:
        print '\n', 'Channel Id (%s)' % channel, '\n'
        playlists = get_playlists_from_channel(channel)
        playlist_ids = [v[0] for v in playlists]
        titles = [v[1] for v in playlists]
        print_videos_from_playlists(playlist_ids, titles)

### get list of playlists from a channel ###
# Returns [id, title] pairs
def get_playlists_from_channel(channel):
    search_request = youtube.playlists().list(
        channelId=channel,
        part="id,snippet",
        maxResults=50
    )
    playlists = []
    while search_request:
        search_results = search_request.execute()
        for sr in search_results.get("items", []):
            if sr["kind"] == "youtube#playlist":
                playlists.append([sr['id'], sr['snippet']['title']])
        # page with playlists().list_next since the request was created by playlists().list
        search_request = youtube.playlists().list_next(search_request, search_results)
    return playlists

##### print videos from all playlists #####
def print_videos_from_playlists(playlist_ids, titles=None):
    # only use the titles if one was supplied for every playlist
    with_titles = titles is not None and len(titles) == len(playlist_ids)
    wr = csv.writer(sys.stdout, delimiter=',', lineterminator='\n')
    for i in range(0, len(playlist_ids)):
        title = titles[i] if with_titles else ""
        playlist = playlist_ids[i]
        videos = []
        print '\n', "Playlist: %s (%s)" % (title, playlist), '\n'
        playlist_videos = get_videos_from_playlist(playlist)
        # add the owning channel's title and id to each video row
        for vid in playlist_videos:
            videos.append([
                "youtube#video", vid["snippet"]["title"], vid["snippet"]["resourceId"]["videoId"],
                vid["snippet"]["channelTitle"], vid["snippet"]["channelId"]
            ])
        wr.writerows(videos)

##### get videos from a playlist #####
## returns item object
def get_videos_from_playlist(playlist):
    params = dict(
        part="id,snippet",
        playlistId=playlist,
        maxResults=50
    )
    videos = []
    max_pages = 4  # max 200 videos per playlist
    page = 0
    search_request = youtube.playlistItems().list(**params)
    while search_request and page < max_pages:
        search_results = search_request.execute()
        for sr in search_results.get("items", []):
            if sr["kind"] == "youtube#playlistItem" and sr["snippet"]["resourceId"]["kind"] == "youtube#video":
                videos.append(sr)
        search_request = youtube.playlistItems().list_next(search_request, search_results)
        page += 1
    return videos

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-q", "--query", help="Search term")
    parser.add_argument("-p", "--pages", help="Optional: Number of returned pages (50 per page)", type=int, default=1)
    parser.add_argument("-r", "--recursive", help="Optional: Recursively extract videos from playlist|channel, use comma. Default is off", default=None)
    parser.add_argument("--headers", help="Print out with headers true|false")
    args = parser.parse_args()
    args.headers = False if args.headers == 'false' else True
    # print print_videos_from_channels(["UCCgsgo7EmZHJ2tmSLROXspQ"])
    # print print_videos_from_playlists([""])
    try:
        youtube_search(args)
    except HttpError, e:
        print "An HTTP error %d occurred:\n%s" % (e.resp.status, e.content)