Skip to content

Instantly share code, notes, and snippets.

@zhangtemplar
Created February 8, 2018 18:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zhangtemplar/81bea3bdd8ac24d193c47adda156d9c3 to your computer and use it in GitHub Desktop.
Save zhangtemplar/81bea3bdd8ac24d193c47adda156d9c3 to your computer and use it in GitHub Desktop.
Download video from instagram given a hashtag
"""
This script crawls the videos from Instagram for a given URL.
"""
import json
import os
import random
import re
import subprocess
import time
from sets import Set

from selenium import webdriver
def listVideos(driver, scrollPixels=1000, scrollPauseSeconds=10):
    # type: (webdriver, int, float) -> set
    """
    Collect the links to each individual video by scanning the webpage.

    The page is scanned, scrolled down, and scanned again until a full scan
    yields no link that has not been seen before.

    :param driver: a webdriver that has already opened the listing url
    :param scrollPixels: how many pixels to scroll down after each scan
    :param scrollPauseSeconds: how long to wait after scrolling before the
        next scan, giving the page time to load more entries
    :return: the set of unique links to each individual video
    """
    results = set()
    while True:
        newVideoFound = False
        videos = driver.find_elements_by_xpath("//div[@class='_mck9w _gvoze _tn0ps']/a")
        for video in videos:
            videoLink = video.get_attribute('href')
            if not videoLink:
                # An anchor without an href cannot be opened later; skip it.
                continue
            print('find video %s' % videoLink)
            if videoLink not in results:
                results.add(videoLink)
                newVideoFound = True
        if not newVideoFound:
            # Nothing new appeared since the last scroll: assume the end.
            break
        # Instagram loads more videos dynamically, so scroll down the page
        # to make further entries appear.
        driver.execute_script("window.scrollBy(0,{})".format(scrollPixels))
        time.sleep(scrollPauseSeconds)
    return results
def _extractVideoUrl(pageSource):
    """Return the decoded "video_url" value found in pageSource, or None."""
    match = re.search(r'"video_url":"((?:[^"\\]|\\.)*)"', pageSource)
    if match is None:
        return None
    try:
        # The value is a JSON string literal; decoding it turns escapes such
        # as \u0026 or \/ back into the real URL characters.
        videoUrl = json.loads('"' + match.group(1) + '"')
    except ValueError:
        return None
    return videoUrl or None


def getVideo(driver, videoLink):
    # type: (webdriver, str) -> None
    """
    Download the video included in the page pointed by videoLink to current directory.

    The page is opened with the webdriver, the first "video_url" entry in its
    source is extracted, and the file is fetched with wget. If the page has no
    usable video url, or wget cannot be run, a message is printed and nothing
    is downloaded.

    :param driver: a webdriver
    :param videoLink: the link points to a page containing the video
    :return: None
    """
    driver.get(videoLink)
    # Read the page source once; every access asks the browser again.
    sourceLink = _extractVideoUrl(driver.page_source)
    if sourceLink is None:
        print('no video url found in %s' % videoLink)
        return
    # The url comes from page content, so pass it as a separate argument
    # instead of interpolating it into a shell command line.
    command = ["wget", "--no-check-certificate", sourceLink]
    try:
        status = subprocess.call(command)
    except OSError as error:
        print('failed to run wget: %s' % error)
        return
    if status != 0:
        print('wget exited with status %d for %s' % (status, sourceLink))
def checkVideoDurationSeconds():
    # type: () -> None
    """
    Print the video name and its duration in seconds.

    Every file in the current directory whose name ends with ".mp4" is
    passed to ffprobe; the name is printed first, then the duration that
    ffprobe reports. If ffprobe cannot be started, a message is printed and
    the scan stops.

    :return: None
    """
    # os.curdir is a plain string constant ("."), not a function.
    for video in sorted(os.listdir(os.curdir)):
        if not video.endswith(".mp4"):
            continue
        print(video)
        # Use an argument list so file names with spaces or shell
        # metacharacters are handed to ffprobe unchanged.
        command = [
            "ffprobe",
            "-i", os.path.join(os.curdir, video),
            "-show_entries", "format=duration",
            "-v", "quiet",
            "-of", "csv=p=0",
        ]
        try:
            process = subprocess.Popen(command, stdout=subprocess.PIPE)
        except OSError as error:
            print('failed to run ffprobe: %s' % error)
            return
        # Capture the output and print it ourselves so the duration always
        # follows the file name, even when stdout is buffered.
        output, _ = process.communicate()
        print(output.decode('utf-8', 'replace').strip())
def main(sourceLink='https://www.instagram.com/explore/tags/hyperlapse/',
         driverPath="/Users/zhangqiang/chromedriver"):
    # type: (str, str) -> None
    """
    The main function to download the video.

    Opens the page that contains a collection of videos, gathers the link of
    every video on it, downloads them one by one with a random pause between
    downloads, and finally prints the duration of each downloaded video.

    The code has been tested with Chromedriver.

    :param sourceLink: the link to the page that contains a collection of videos
    :param driverPath: the path of the webdriver executable
    :return: None
    """
    driver = webdriver.Chrome(driverPath)
    try:
        driver.get(sourceLink)
        videoLinks = listVideos(driver)
        print('get %d number of videos' % len(videoLinks))
        for videoLink in videoLinks:
            print('download %s' % videoLink)
            getVideo(driver, videoLink)
            # Random pause between downloads.
            time.sleep(random.randint(10, 30))
    finally:
        # Always end the browser session, even when a step above fails.
        driver.quit()
    checkVideoDurationSeconds()
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment