Skip to content

Instantly share code, notes, and snippets.

@OsmanMutlu
Created March 14, 2019 06:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save OsmanMutlu/2fdf602d5226f3f56eebf930cbf28abe to your computer and use it in GitHub Desktop.
Save OsmanMutlu/2fdf602d5226f3f56eebf930cbf28abe to your computer and use it in GitHub Desktop.
import codecs
import json
import os
import re

import lxml.html
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Visit every URL listed in urls.jl with a headless PhantomJS browser and save
# each page's rendered HTML source into the news/ directory, one file per URL.

# The driver executable is launched without a shell, so "~" is not expanded
# automatically; resolve it to the real home directory here.
phantomjs_path = os.path.expanduser("~/phantomjs-2.1.1-linux-x86_64/bin/phantomjs")
output_dir = "news"
delay = 3  # seconds to wait for the initially loaded <html> element to go stale

with codecs.open("urls.jl", "r", "utf-8") as f:
    links = f.readlines()

# Make sure the output directory exists before the first page is written.
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

browser = webdriver.PhantomJS(phantomjs_path)
try:
    browser.set_page_load_timeout(30)
    for url in links:
        # Strip line endings, spaces and double quotes surrounding the URL.
        url = re.sub(r'\n|\r| |"', r'', url)
        if not url:
            # Blank line in the input file: nothing to fetch.
            continue
        try:
            browser.get(url)
        except TimeoutException:
            print("Couln't load page : " + url)
            continue
        try:
            # Wait until the <html> element found right after loading is
            # detached from the DOM (presumably to let redirects or script
            # re-renders finish -- confirm intent). A timeout is tolerated:
            # the page is saved either way.
            WebDriverWait(browser, delay).until(
                EC.staleness_of(browser.find_element_by_tag_name("html"))
            )
            print("Page is ready!")
        except TimeoutException:
            print("Loading took too much time!")
        # "/" and ":" are replaced so the URL can be used as a file name.
        filename = re.sub(r"\/|:", r"_", url)
        with codecs.open(os.path.join(output_dir, filename), "w", "utf-8") as g:
            g.write(browser.page_source)
        print("Finished : " + url)
finally:
    # Always shut the PhantomJS process down, even if the crawl aborts.
    browser.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment