@patdavid
Created March 21, 2017 22:15
Scrape the digiKam website's news content for migration to a new system.
#!/usr/bin/python
# Scrape the digiKam news pages with Selenium/PhantomJS and write each post
# out as a Markdown file with front matter, for migration to a new system.
from __future__ import unicode_literals
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import datetime
import os
import re
import sys

# Python 2: restore setdefaultencoding so the unicode/str mixing below works
reload(sys)
sys.setdefaultencoding('utf-8')

# Set up our regex matches
yyyymmdd = re.compile(r'\d\d\d\d-\d\d-\d\d')                # YYYY-MM-DD
yyyymmddtime = re.compile(r'\d\d\d\d-\d\d-\d\d \d\d:\d\d')  # YYYY-MM-DD HH:MM
_submitter = re.compile(r'by (.+?) on')                     # author name in the byline


def truncate_string(mystring, numwords):
    """Return the first numwords words of mystring."""
    return ' '.join(mystring.split()[:numwords])


print("#### BEGIN")
print("#### %s" % datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# One driver for the paginated news index, one for the individual articles
driver = webdriver.PhantomJS(executable_path='/usr/local/bin/phantomjs')
apage = webdriver.PhantomJS(executable_path='/usr/local/bin/phantomjs')

baseurl = "http://www.digikam.org/news?page="

for page in range(0, 36):
#for page in range(0, 1):
    url = baseurl + "%d" % page
    #print(url)
    driver.get(url)

    # Links to the individual news posts on this index page
    #entries = driver.find_elements_by_xpath("//div[contains(@itemprop, 'articleBody')]//table")
    entries = driver.find_elements_by_xpath("//div[contains(@class, 'node')]/h2[contains(@class, 'title')]/a")

    for entry in entries:
        print("=====================")
        href = entry.get_attribute('href')

        # Fetch the article and pull out its title, byline, and body
        apage.get(href)
        title = apage.find_element_by_xpath("//div[contains(@id, 'main')]/h1[contains(@class, 'title')]")
        submitted = apage.find_element_by_xpath("//div[contains(@class, 'node')]/span[contains(@class, 'submitted')]")
        content = apage.find_element_by_xpath("//div[contains(@class, 'node')]/div[contains(@class, 'content')]")
        print(title.text)

        # Post date from the byline, with sentinel fallbacks if it can't be parsed
        try:
            shortdate = yyyymmdd.search(submitted.text).group()
        except AttributeError:
            shortdate = "2000-01-01"

        try:
            longdate = yyyymmddtime.search(submitted.text).group()  # e.g. 2017-03-14 09:24
        except AttributeError:
            longdate = "2000-01-01 00:00"

        _date = time.strptime(longdate, "%Y-%m-%d %H:%M")
        _isodate = time.strftime('%Y-%m-%dT%H:%M:%SZ', _date)
        print(_isodate)

        # Author from the byline ("by <name> on <date>")
        try:
            submitter = _submitter.search(submitted.text).group(1)
        except AttributeError:
            submitter = "Nobody"
        print(submitter)

        # Filesystem-safe filename: YYYY-MM-DD_Title.md
        _title = re.sub(r'\.{2,}', '', title.text)         # Remove ellipses
        _title = re.sub(r'[^a-zA-Z0-9-\.]', '_', _title)   # Replace anything else unsafe
        filename = shortdate + '_' + _title + '.md'
        print(filename)

        # Short description: first 25 words, with double quotes escaped for the front matter
        _description = truncate_string(content.text, 25)
        _description = re.sub(r'"', r'\"', _description)
        print(_description)

        if not os.path.exists(filename):
            with open(filename, 'w') as afile:
                afile.write('---\n')
                afile.write('date: "' + _isodate + '"\n')
                afile.write('title: "' + title.text.encode('utf-8') + '"\n')
                afile.write('author: "' + submitter.encode('utf-8') + '"\n')
                afile.write('description: "' + _description + '"\n')
                afile.write('taxonomies: "news"\n')
                afile.write('orig_url: "' + href + '"\n')
                afile.write('\n---\n\n')
                afile.write(content.get_attribute('innerHTML').encode('utf-8'))

# Shut down the two PhantomJS instances
driver.quit()
apage.quit()
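For reference, each post ends up in a file named <shortdate>_<sanitized title>.md whose front matter mirrors the write calls above. With hypothetical placeholder values (the URL, title, and description text below are made up; the date reuses the example from the comment in the code; "Nobody" is the script's fallback author), a generated file would start roughly like this:

    ---
    date: "2017-03-14T09:24:00Z"
    title: "Example digiKam news post"
    author: "Nobody"
    description: "First 25 words of the post body ..."
    taxonomies: "news"
    orig_url: "http://www.digikam.org/news/example-post"

    ---

    <p>...raw innerHTML of the article body...</p>

The blank line and second "---" come from the final write('\n---\n\n'), after which the article body's raw innerHTML is appended.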