Created
October 27, 2018 16:36
-
-
Save pbindustries/d59235cfb0f98370f53ceb5ba69266b7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium import webdriver | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as ec | |
from selenium.webdriver.common.by import By | |
from bs4 import BeautifulSoup | |
from urllib.request import urlretrieve | |
import os | |
import tkinter.filedialog as tkFileDialog | |
import tkinter as Tkinter | |
import tkinter.constants as Tkconstants | |
import time | |
def videoscrape():
    """Download preview videos from Shutterstock video search results.

    Opens a Chrome window, walks search result pages 1..``searchPage`` for
    ``searchTerm``, clicks each preview tile in turn, reads the ``src`` of
    the video player's second ``<source>`` element and saves that file
    into ``scrape_directory``.

    ``searchPage``, ``searchTerm`` and ``scrape_directory`` are read from
    module scope; they are not defined inside this function.

    Returns:
        None. Any error is printed rather than raised, and the browser
        session is always shut down before returning.
    """
    tile_xpath = "//div[@data-automation='VideoGrid_video_videoClipPreview_{}']"
    player_xpath = "//video[@data-automation='VideoPlayer_video_video']"

    try:
        driver = webdriver.Chrome()
    except Exception as e:
        # Nothing to clean up if the browser never started.
        print(e)
        return

    try:
        driver.maximize_window()
        for i in range(1, searchPage + 1):
            url = "https://www.shutterstock.com/video/search/" + searchTerm + "?page=" + str(i)
            driver.get(url)
            print("Page " + str(i))
            # The loop visits tile indices 0..49 on every page (hard-coded).
            for j in range(0, 50):
                # Poll until tile j is present in the results grid.
                while True:
                    container = driver.find_elements(By.XPATH, tile_xpath.format(j))
                    if container:
                        break
                    # Tile j and tile j + 1 are both missing on the last
                    # requested page: treat that as the end of the results.
                    if not driver.find_elements(By.XPATH, tile_xpath.format(j + 1)) and i == searchPage:
                        return
                    # NOTE(review): on a non-final page that has fewer than
                    # 50 tiles this keeps reloading forever -- confirm
                    # whether a retry limit is wanted.
                    time.sleep(10)
                    driver.get(url)
                container[0].click()

                # Poll until the player element shows up in the page HTML.
                while True:
                    WebDriverWait(driver, 60).until(
                        ec.visibility_of_element_located((By.XPATH, player_xpath))
                    )
                    video_url = driver.current_url
                    data = driver.execute_script("return document.documentElement.outerHTML")
                    scraper = BeautifulSoup(data, "lxml")
                    video_container = scraper.find_all("video", {"data-automation": "VideoPlayer_video_video"})
                    if video_container:
                        break
                    time.sleep(10)
                    driver.get(video_url)

                video_array = video_container[0].find_all("source")
                # The second <source> is the one downloaded; presumably the
                # preferred encoding -- verify against the live page markup.
                video_src = video_array[1].get("src")
                name = video_src.rsplit("/", 1)[-1]
                try:
                    urlretrieve(video_src, os.path.join(scrape_directory, os.path.basename(video_src)))
                    print("Scraped " + name)
                except Exception as e:
                    # A failed download is reported and the scrape continues.
                    print(e)
                # Go back to the results grid before looking for the next tile.
                driver.get(url)
    except Exception as e:
        print(e)
    finally:
        # Always end the browser session: the original only closed the
        # window on the early-return path and leaked it otherwise.
        driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment