@patdavid
Created March 21, 2017 22:15
Scrape the digiKam website's news content for migration to a new system.
#!/usr/bin/python
# Scrape the digiKam news pages with Selenium/PhantomJS and write each post
# out as a Markdown file with front matter, for migration to a new system.
from __future__ import unicode_literals
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import datetime
import os
import re
import sys

# Python 2: restore setdefaultencoding so the unicode/str mixing below works
reload(sys)
sys.setdefaultencoding('utf-8')

# Set up our regex matches
yyyymmdd = re.compile(r'\d\d\d\d-\d\d-\d\d')                # YYYY-MM-DD
yyyymmddtime = re.compile(r'\d\d\d\d-\d\d-\d\d \d\d:\d\d')  # YYYY-MM-DD HH:MM
_submitter = re.compile(r'by (.+?) on')                     # author name in the byline


def truncate_string(mystring, numwords):
    """Return the first numwords words of mystring."""
    return ' '.join(mystring.split()[:numwords])


print("#### BEGIN")
print("#### %s" % datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# One driver for the paginated news index, one for the individual articles
driver = webdriver.PhantomJS(executable_path='/usr/local/bin/phantomjs')
apage = webdriver.PhantomJS(executable_path='/usr/local/bin/phantomjs')

baseurl = "http://www.digikam.org/news?page="

for page in range(0, 36):
#for page in range(0, 1):
    url = baseurl + "%d" % page
    #print(url)
    driver.get(url)

    # Links to the individual news posts on this index page
    #entries = driver.find_elements_by_xpath("//div[contains(@itemprop, 'articleBody')]//table")
    entries = driver.find_elements_by_xpath("//div[contains(@class, 'node')]/h2[contains(@class, 'title')]/a")

    for entry in entries:
        print("=====================")
        href = entry.get_attribute('href')

        # Fetch the article and pull out its title, byline, and body
        apage.get(href)
        title = apage.find_element_by_xpath("//div[contains(@id, 'main')]/h1[contains(@class, 'title')]")
        submitted = apage.find_element_by_xpath("//div[contains(@class, 'node')]/span[contains(@class, 'submitted')]")
        content = apage.find_element_by_xpath("//div[contains(@class, 'node')]/div[contains(@class, 'content')]")
        print(title.text)

        # Post date from the byline, with sentinel fallbacks if it can't be parsed
        try:
            shortdate = yyyymmdd.search(submitted.text).group()
        except AttributeError:
            shortdate = "2000-01-01"

        try:
            longdate = yyyymmddtime.search(submitted.text).group()  # e.g. 2017-03-14 09:24
        except AttributeError:
            longdate = "2000-01-01 00:00"

        _date = time.strptime(longdate, "%Y-%m-%d %H:%M")
        _isodate = time.strftime('%Y-%m-%dT%H:%M:%SZ', _date)
        print(_isodate)

        # Author from the byline ("by <name> on <date>")
        try:
            submitter = _submitter.search(submitted.text).group(1)
        except AttributeError:
            submitter = "Nobody"
        print(submitter)

        # Filesystem-safe filename: YYYY-MM-DD_Title.md
        _title = re.sub(r'\.{2,}', '', title.text)         # Remove ellipses
        _title = re.sub(r'[^a-zA-Z0-9-\.]', '_', _title)   # Replace anything else unsafe
        filename = shortdate + '_' + _title + '.md'
        print(filename)

        # Short description: first 25 words, with double quotes escaped for the front matter
        _description = truncate_string(content.text, 25)
        _description = re.sub(r'"', r'\"', _description)
        print(_description)

        if not os.path.exists(filename):
            with open(filename, 'w') as afile:
                afile.write('---\n')
                afile.write('date: "' + _isodate + '"\n')
                afile.write('title: "' + title.text.encode('utf-8') + '"\n')
                afile.write('author: "' + submitter.encode('utf-8') + '"\n')
                afile.write('description: "' + _description + '"\n')
                afile.write('taxonomies: "news"\n')
                afile.write('orig_url: "' + href + '"\n')
                afile.write('\n---\n\n')
                afile.write(content.get_attribute('innerHTML').encode('utf-8'))

# Shut down the two PhantomJS instances
driver.quit()
apage.quit()
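For reference, each post ends up in a file named <shortdate>_<sanitized title>.md whose front matter mirrors the write calls above. With hypothetical placeholder values (the URL, title, and description text below are made up; the date reuses the example from the comment in the code; "Nobody" is the script's fallback author), a generated file would start roughly like this:

    ---
    date: "2017-03-14T09:24:00Z"
    title: "Example digiKam news post"
    author: "Nobody"
    description: "First 25 words of the post body ..."
    taxonomies: "news"
    orig_url: "http://www.digikam.org/news/example-post"

    ---

    <p>...raw innerHTML of the article body...</p>

The blank line and second "---" come from the final write('\n---\n\n'), after which the article body's raw innerHTML is appended.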