Created
February 8, 2018 18:39
-
-
Save zhangtemplar/81bea3bdd8ac24d193c47adda156d9c3 to your computer and use it in GitHub Desktop.
Download videos from Instagram for a given hashtag
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This script crawls the videos from an Instagram page given its URL.
""" | |
from selenium import webdriver | |
import os | |
import time | |
import random | |
from sets import Set | |
def listVideos(driver, linkXpath="//div[@class='_mck9w _gvoze _tn0ps']/a", scrollPauseSeconds=10):
    # type: (webdriver.Chrome, str, float) -> set
    """
    Get the links to each individual video by scanning the webpage.

    The page is scanned repeatedly: after every pass that discovered at least one
    new link the page is scrolled down and scanned again; the first pass that
    discovers nothing new ends the search.

    :param driver: a webdriver that has already opened the url
    :param linkXpath: XPath selecting the ``<a>`` elements whose ``href`` points to a
        video page; the default is the selector this script was written against
    :param scrollPauseSeconds: seconds to wait after each scroll before scanning again
    :return: the set of unique links to each individual video
    """
    # The builtin set replaces sets.Set, which is deprecated and absent from Python 3.
    results = set()
    while True:
        newVideoFound = False
        # find_elements("xpath", ...) is the generic locator call; unlike
        # find_elements_by_xpath it is also available in current Selenium releases.
        for video in driver.find_elements("xpath", linkXpath):
            videoLink = video.get_attribute('href')
            # Skip anchors without a usable href and links that were already collected.
            if not videoLink or videoLink in results:
                continue
            print('find video {}'.format(videoLink))
            results.add(videoLink)
            newVideoFound = True
        if not newVideoFound:
            break
        # As Instagram loads more videos dynamically, we need to scroll down the page to find more videos.
        driver.execute_script("window.scrollBy(0,1000)")
        time.sleep(scrollPauseSeconds)
    return results
def getVideo(driver, videoLink):
    # type: (webdriver.Chrome, str) -> None
    """
    Download the video included in the page pointed to by videoLink to the current directory.

    The video address is taken from the first ``"video_url":"..."`` entry found in
    the page source and handed to ``wget``. Pages without such an entry are skipped
    with a message instead of invoking ``wget`` on a meaningless string.

    :param driver: a webdriver
    :param videoLink: the link to a page containing the video
    :return: None
    """
    # Imported locally so this function does not depend on the file-level import list.
    import json
    import subprocess

    marker = '"video_url":"'
    driver.get(videoLink)
    # Read the page source once: every access asks the browser for the document again.
    pageSource = driver.page_source

    markerAt = pageSource.find(marker)
    if markerAt < 0:
        print('no video found in {}'.format(videoLink))
        return
    start = markerAt + len(marker)
    end = pageSource.find('"', start)
    if end <= start:
        # Either the closing quote is missing or the value is empty.
        print('no video found in {}'.format(videoLink))
        return

    rawLink = pageSource[start:end]
    try:
        # The value is a JSON string literal, so escapes such as \u0026 must be decoded.
        sourceLink = json.loads('"' + rawLink + '"')
    except ValueError:
        sourceLink = rawLink

    try:
        # An argument list (no shell) keeps characters like '&' in the URL from being
        # interpreted by a shell.
        subprocess.call(["wget", "--no-check-certificate", sourceLink])
    except OSError as error:
        # wget is not installed or cannot be executed; report and carry on.
        print('failed to run wget: {}'.format(error))
def checkVideoDurationSeconds():
    # type: () -> None
    """
    Print the name of every ``.mp4`` file in the current directory, each followed by
    its duration in seconds as reported by ``ffprobe``.

    :return: None
    """
    # Imported locally so this function does not depend on the file-level import list.
    import subprocess

    # os.curdir is a string constant ('.'), not a function; calling it raises TypeError.
    for video in os.listdir(os.curdir):
        if not video.endswith(".mp4"):
            continue
        print(video)
        try:
            # An argument list (no shell) keeps spaces or shell metacharacters in the
            # file name from breaking the command; joining with os.curdir also stops a
            # name starting with '-' from being read as an option.
            subprocess.call([
                "ffprobe",
                "-i", os.path.join(os.curdir, video),
                "-show_entries", "format=duration",
                "-v", "quiet",
                "-of", "csv=p=0",
            ])
        except OSError as error:
            # ffprobe is not installed or cannot be executed; later files would fail the same way.
            print('failed to run ffprobe: {}'.format(error))
            return
def main(sourceLink='https://www.instagram.com/explore/tags/hyperlapse/',
         driverPath="/Users/zhangqiang/chromedriver"):
    # type: (str, str) -> None
    """
    The main function to download the videos.

    Opens sourceLink in Chrome, collects the video page links with listVideos,
    downloads each one with getVideo (pausing a random 10-30 seconds between
    downloads) and finally prints the duration of the downloaded files.

    The code has been tested with Chromedriver.

    :param sourceLink: the link to the page that contains a collection of videos
    :param driverPath: the path of your webdriver executable
    :return: None
    """
    driver = webdriver.Chrome(driverPath)
    try:
        driver.get(sourceLink)
        videoLinks = listVideos(driver)
        print('get {} number of videos'.format(len(videoLinks)))
        for videoLink in videoLinks:
            print('download {}'.format(videoLink))
            getVideo(driver, videoLink)
            # Random pause between downloads so the requests are spread out.
            time.sleep(random.randint(10, 30))
    finally:
        # quit() ends the whole session, including the chromedriver process, and runs
        # even when crawling fails part-way; close() would only close the window.
        driver.quit()
    checkVideoDurationSeconds()
# Run the crawler only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment