zhangtemplar/instagram.py

## instagram.py
"""
This script crawls the videos from Instagram for a given URL.
"""
import os
import random
import subprocess
import time
from sets import Set

from selenium import webdriver


def listVideos(driver, scrollPause=10, scrollStep=1000):
    # type: (webdriver, float, int) -> set
    """
    Collect the links to each individual video by scanning the opened page.

    Instagram loads more entries dynamically as the page is scrolled, so the page is scrolled down
    and scanned again until a scan yields no link that has not been seen before.

    :param driver: a webdriver that has already opened the listing page
    :param scrollPause: seconds to wait after each scroll before scanning again
    :param scrollStep: number of pixels to scroll down between two scans
    :return: the set of unique links to the individual video pages
    """
    # The builtin set replaces the deprecated sets.Set; callers only iterate it and take its length.
    results = set()
    while True:
        newVideoFound = False
        for video in driver.find_elements_by_xpath("//div[@class='_mck9w _gvoze _tn0ps']/a"):
            videoLink = video.get_attribute('href')
            # An anchor without an href cannot be opened later, and a known link is not news.
            if not videoLink or videoLink in results:
                continue
            print('find video {}'.format(videoLink))
            results.add(videoLink)
            newVideoFound = True
        if not newVideoFound:
            # A whole scan without a new link is taken as the end of the listing.
            break
        driver.execute_script("window.scrollBy(0,{})".format(scrollStep))
        # Give the page time to load the next batch before scanning again.
        time.sleep(scrollPause)
    return results


def getVideo(driver, videoLink):
    # type: (webdriver, str) -> None
    """
    Download the video embedded in the page at videoLink into the current directory.

    The page source is searched for the first ``"video_url":"..."`` entry and the quoted value is
    handed to ``wget``. A page without such an entry (or with an empty one) is skipped.

    :param driver: a webdriver used to load the page
    :param videoLink: the link to a page containing the video
    :return: None
    """
    marker = '"video_url":"'
    driver.get(videoLink)
    # Read the source once so that every offset below refers to the same snapshot of the page.
    pageSource = driver.page_source
    markerAt = pageSource.find(marker)
    start = markerAt + len(marker)
    end = pageSource.find('"', start) if markerAt >= 0 else -1
    if end <= start:
        # Marker missing, closing quote missing, or empty value: there is nothing to download.
        print('no video url found in {}'.format(videoLink))
        return
    sourceLink = pageSource[start:end]
    try:
        # An argument list bypasses the shell, so characters such as '&' or ';' in the URL are
        # passed to wget verbatim instead of being interpreted as shell syntax.
        subprocess.call(["wget", "--no-check-certificate", sourceLink])
    except OSError as error:
        # Raised when wget cannot be started at all (e.g. it is not installed).
        print('cannot run wget: {}'.format(error))

def checkVideoDurationSeconds():
    # type: () -> None
    """
    Print the name of every mp4 file in the current directory and ask ffprobe for its duration.

    For each file the name is printed first; ``ffprobe`` is then run with
    ``-show_entries format=duration`` so that it reports the duration itself.

    :return: None
    """
    # os.curdir is the constant string '.', not a function, so it must not be called.
    for video in sorted(os.listdir(os.curdir)):
        if not video.endswith(".mp4"):
            continue
        print(video)
        command = [
            "ffprobe", "-i", os.path.join(os.curdir, video),
            "-show_entries", "format=duration",
            "-v", "quiet",
            "-of", "csv=p=0",
        ]
        try:
            # An argument list bypasses the shell, so file names with spaces or quotes are safe.
            subprocess.call(command)
        except OSError as error:
            # Raised when ffprobe cannot be started; every later file would fail the same way.
            print('cannot run ffprobe: {}'.format(error))
            return


def main(sourceLink='https://www.instagram.com/explore/tags/hyperlapse/',
         driverPath="/Users/zhangqiang/chromedriver"):
    # type: (str, str) -> None
    """
    Download every video found on a listing page, then print the duration of each download.

    <p>Please provide the link to the page that contains a collection of videos. Also specify the path of your
    webdriver. The code has been tested with Chromedriver.

    :param sourceLink: the link to the page that lists the videos
    :param driverPath: the path of the chromedriver executable
    :return: None
    """
    driver = webdriver.Chrome(driverPath)
    try:
        driver.get(sourceLink)
        videoLinks = listVideos(driver)
        print('get {} number of videos'.format(len(videoLinks)))
        for videoLink in videoLinks:
            print('download {}'.format(videoLink))
            getVideo(driver, videoLink)
            # Random 10-30 second pause between downloads (presumably to avoid hammering the site).
            time.sleep(random.randint(10, 30))
    finally:
        # quit() ends the whole driver session, and the finally clause makes sure the browser
        # does not stay open when any step above raises.
        driver.quit()
    checkVideoDurationSeconds()


# Run the crawler only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
	"""
	This script crawls the videos from Instagram for a given URL.
	"""
	from selenium import webdriver
	import os
	import time
	import random
	from sets import Set


	def listVideos(driver, scrollPause=10, scrollStep=1000):
	    # type: (webdriver, float, int) -> set
	    """
	    Collect the links to each individual video by scanning the opened page.

	    Instagram loads more entries dynamically as the page is scrolled, so the page is scrolled down
	    and scanned again until a scan yields no link that has not been seen before.

	    :param driver: a webdriver that has already opened the listing page
	    :param scrollPause: seconds to wait after each scroll before scanning again
	    :param scrollStep: number of pixels to scroll down between two scans
	    :return: the set of unique links to the individual video pages
	    """
	    # The builtin set replaces the deprecated sets.Set; callers only iterate it and take its length.
	    results = set()
	    while True:
	        newVideoFound = False
	        for video in driver.find_elements_by_xpath("//div[@class='_mck9w _gvoze _tn0ps']/a"):
	            videoLink = video.get_attribute('href')
	            # An anchor without an href cannot be opened later, and a known link is not news.
	            if not videoLink or videoLink in results:
	                continue
	            print('find video {}'.format(videoLink))
	            results.add(videoLink)
	            newVideoFound = True
	        if not newVideoFound:
	            # A whole scan without a new link is taken as the end of the listing.
	            break
	        driver.execute_script("window.scrollBy(0,{})".format(scrollStep))
	        # Give the page time to load the next batch before scanning again.
	        time.sleep(scrollPause)
	    return results


	def getVideo(driver, videoLink):
	    # type: (webdriver, str) -> None
	    """
	    Download the video embedded in the page at videoLink into the current directory.

	    The page source is searched for the first ``"video_url":"..."`` entry and the quoted value is
	    handed to ``wget``. A page without such an entry (or with an empty one) is skipped.

	    :param driver: a webdriver used to load the page
	    :param videoLink: the link to a page containing the video
	    :return: None
	    """
	    marker = '"video_url":"'
	    driver.get(videoLink)
	    # Read the source once so that every offset below refers to the same snapshot of the page.
	    pageSource = driver.page_source
	    markerAt = pageSource.find(marker)
	    start = markerAt + len(marker)
	    end = pageSource.find('"', start) if markerAt >= 0 else -1
	    if end <= start:
	        # Marker missing, closing quote missing, or empty value: there is nothing to download.
	        print('no video url found in {}'.format(videoLink))
	        return
	    sourceLink = pageSource[start:end]
	    try:
	        # An argument list bypasses the shell, so characters such as '&' or ';' in the URL are
	        # passed to wget verbatim instead of being interpreted as shell syntax.
	        subprocess.call(["wget", "--no-check-certificate", sourceLink])
	    except OSError as error:
	        # Raised when wget cannot be started at all (e.g. it is not installed).
	        print('cannot run wget: {}'.format(error))

	def checkVideoDurationSeconds():
	    # type: () -> None
	    """
	    Print the name of every mp4 file in the current directory and ask ffprobe for its duration.

	    For each file the name is printed first; ``ffprobe`` is then run with
	    ``-show_entries format=duration`` so that it reports the duration itself.

	    :return: None
	    """
	    # os.curdir is the constant string '.', not a function, so it must not be called.
	    for video in sorted(os.listdir(os.curdir)):
	        if not video.endswith(".mp4"):
	            continue
	        print(video)
	        command = [
	            "ffprobe", "-i", os.path.join(os.curdir, video),
	            "-show_entries", "format=duration",
	            "-v", "quiet",
	            "-of", "csv=p=0",
	        ]
	        try:
	            # An argument list bypasses the shell, so file names with spaces or quotes are safe.
	            subprocess.call(command)
	        except OSError as error:
	            # Raised when ffprobe cannot be started; every later file would fail the same way.
	            print('cannot run ffprobe: {}'.format(error))
	            return


	def main(sourceLink='https://www.instagram.com/explore/tags/hyperlapse/',
	         driverPath="/Users/zhangqiang/chromedriver"):
	    # type: (str, str) -> None
	    """
	    Download every video found on a listing page, then print the duration of each download.

	    <p>Please provide the link to the page that contains a collection of videos. Also specify the path of your
	    webdriver. The code has been tested with Chromedriver.

	    :param sourceLink: the link to the page that lists the videos
	    :param driverPath: the path of the chromedriver executable
	    :return: None
	    """
	    driver = webdriver.Chrome(driverPath)
	    try:
	        driver.get(sourceLink)
	        videoLinks = listVideos(driver)
	        print('get {} number of videos'.format(len(videoLinks)))
	        for videoLink in videoLinks:
	            print('download {}'.format(videoLink))
	            getVideo(driver, videoLink)
	            # Random 10-30 second pause between downloads (presumably to avoid hammering the site).
	            time.sleep(random.randint(10, 30))
	    finally:
	        # quit() ends the whole driver session, and the finally clause makes sure the browser
	        # does not stay open when any step above raises.
	        driver.quit()
	    checkVideoDurationSeconds()


	# Run the crawler only when this file is executed as a script, not when it is imported.
	if __name__ == "__main__":
	    main()