Last active
June 20, 2016 06:58
-
-
Save theely/7ab9eb5a7e4b17c20a170d353324d849 to your computer and use it in GitHub Desktop.
Docker dynamic crawling proof of concept
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Base image: CentOS 7, pinned to the 7.2.1511 point-release tag rather than a
# floating tag so that rebuilds start from the same OS baseline.
FROM centos:centos7.2.1511
# Install Google Chrome, Java 8 (JDK), Xvfb, unzip and wget
# ---------------------------------------------------------
# Step 1: register the Google Chrome yum repository. The dollar sign of
# $basearch is escaped in the echo string so that the literal text "$basearch"
# is written to the repo file (yum substitutes it, not the build shell).
# gpgcheck=1 makes yum verify package signatures against the Google key.
RUN echo -e "[google-chrome]\n\
name=google-chrome\n\
baseurl=https://dl.google.com/linux/chrome/rpm/stable/\$basearch\n\
enabled=1\n\
gpgcheck=1\n\
gpgkey=https://dl-ssl.google.com/linux/linux_signing_key.pub" > /etc/yum.repos.d/google-chrome.repo

# Step 2: install the runtime and build dependencies, then drop the yum cache
# in the same layer so the downloaded metadata and RPMs do not stay in the image.
RUN yum install -y \
        google-chrome-stable.x86_64 \
        java-1.8.0-openjdk-devel.x86_64 \
        unzip \
        wget \
        xorg-x11-server-Xvfb \
    && yum clean all
# Download Selenium (Java bindings 2.53.0) and the Google Chrome driver (2.22)
# ---------------------------------------------------------------------------
# Both archives are unpacked into the current build directory: the Selenium zip
# yields the selenium-2.53.0/ tree referenced by the later javac/java
# classpaths, and the chromedriver zip yields the ./chromedriver binary.
# Each archive is fetched over HTTPS and deleted in the same layer that
# extracted it, so the zip files do not remain in the image.
RUN wget https://selenium-release.storage.googleapis.com/2.53/selenium-java-2.53.0.zip \
    && unzip selenium-java-2.53.0.zip \
    && rm -f selenium-java-2.53.0.zip
RUN wget https://chromedriver.storage.googleapis.com/2.22/chromedriver_linux64.zip \
    && unzip chromedriver_linux64.zip \
    && rm -f chromedriver_linux64.zip
# Compile a small Java application that drives Chrome through Selenium
# --------------------------------------------------------------------
# The Java source is generated with echo -e (each "\n" becomes a newline and
# each \" a literal double quote), written to DynamicCrawler.java and compiled.
#
# DynamicCrawler <url>:
#   - exits with status 1 and a usage message when no (or an empty) URL is given,
#     before any browser is started;
#   - otherwise opens the URL in Chrome, waits up to 30 seconds for
#     document.readyState to become "complete", and prints the resulting page
#     source to stdout;
#   - always quits the driver in a finally block.
#
# The classpath is quoted so the trailing /* reaches javac verbatim (javac
# expands the wildcard itself) instead of depending on the shell glob not matching.
RUN echo -e "\
import org.openqa.selenium.*;\n\
import org.openqa.selenium.chrome.*;\n\
import org.openqa.selenium.support.ui.*;\n\
public class DynamicCrawler{\n\
public static void main(String[] args){\n\
if (args.length == 0 || args[0].isEmpty()) {\n\
System.err.println(\"Usage: DynamicCrawler <url>\");\n\
System.exit(1);\n\
}\n\
ChromeDriver driver = new ChromeDriver();\n\
try {\n\
driver.navigate().to(args[0]);\n\
Wait<WebDriver> wait = new WebDriverWait(driver, 30);\n\
wait.until(_driver -> String.valueOf(((JavascriptExecutor) _driver).executeScript(\"return document.readyState\")).equals(\"complete\"));\n\
System.out.println(driver.getPageSource());\n\
} finally {driver.quit();}\n\
}\n\
}" > DynamicCrawler.java \
    && javac -cp "selenium-2.53.0/selenium-java-2.53.0.jar:selenium-2.53.0/libs/*" DynamicCrawler.java
# Start-up script that launches Xvfb and then the Java application
# ----------------------------------------------------------------
# DISPLAY is set image-wide so that both Xvfb (started by the script) and
# Chrome (started by the Java app) use the same X display number.
ENV DISPLAY=:10

# /start.sh is generated with echo -e; \$ and \" are escaped so the variables
# are expanded when the script runs, not while the image is built.
#   1. Xvfb is started in the background on $DISPLAY with a 1366x768x24 screen.
#   2. DynamicCrawler is run with the first script argument as the URL. The
#      argument is quoted so URLs containing ?, * or [ are passed through
#      unchanged instead of undergoing word splitting or glob expansion.
# NOTE(review): nothing waits for Xvfb to finish starting before Java launches
# Chrome; add a readiness check if start-up races are observed.
RUN echo -e "#!/bin/bash\n\
/usr/bin/Xvfb \"\$DISPLAY\" -screen 0 1366x768x24 -ac &\n\
java -cp \"selenium-2.53.0/selenium-java-2.53.0.jar:selenium-2.53.0/libs/*:.\" -Dwebdriver.chrome.driver=./chromedriver DynamicCrawler \"\$1\"" > /start.sh
# Entrypoint and default arguments
# --------------------------------
# The container always runs /start.sh through bash; the first argument given
# to "docker run <image> <url>" replaces CMD and is the web page to crawl.
# The default CMD is a single empty string, i.e. no URL is supplied unless the
# caller provides one. Both instructions use the exec (JSON array) form.
ENTRYPOINT ["/bin/bash", "/start.sh"]
CMD [""]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment