jirkapenzes/RoumenBackup.py

## demo.sh
# Abort on the first failing command, so the backup is not started
# when installing the dependency fails.
set -e

# Install dependencies
python3.4 -m pip install htmldom

# Start the script
python3.4 RoumenBackup.py

## RoumenBackup.py
import http.cookiejar
import os
import urllib.parse
import urllib.request
import time

from htmldom import htmldom
from enum import Enum

# Name of the folder, created inside the chosen destination, that holds the backup
application = 'RBackup'
# Author of this module
__author__ = 'jpenzes'


class RoumenImage:
    """One liked image: its category, its file name and its direct url."""

    def __init__(self, category, name):
        """Remembers category and name; the url starts out as an empty string."""
        self.url = ''
        self.name = name
        self.category = category


class States(Enum):
    """Outcome of processing one image, as reported by Roumen.logImageProgress"""
    # the image was downloaded and stored during this run
    DOWNLOADED = 'DOWNLOADED'
    # the target file was already on disk, so nothing was downloaded
    FILE_EXIST = 'FILE_EXIST'
    # processing the image raised an exception
    EXCEPTION = 'EXCEPTION'


class Roumen:
    """Backs up all images which a user liked at Roumen.cz.

    Usage: ``Roumen(userName, password).backupImages(destination)``.
    """

    # Roumen URL patterns
    AUTHENTICATION_URL = 'http://www.rouming.cz/roumingUserLogin.php'
    BASE_URL = 'http://kecy.roumen.cz/'
    TERMS_URL = 'http://maso.roumen.cz/'
    USER_LIKES_URL = 'http://www.rouming.cz/roumingUserLikes.php'
    USER_LIKE_PAGE_PATTERN = 'http://kecy.roumen.cz/roumingUserLikes.php?&page={0}'
    IMAGE_DETAIL_ROUMING_URL_PATTERN = 'http://www.rouming.cz/roumingShow.php?file={0}'
    IMAGE_DETAIL_MASO_URL_PATTERN = 'http://www.roumenovomaso.cz/masoShow.php?agree=on&file={0}'

    def __init__(self, userName, password, sleepTime=0, isFullLogging=True):
        """Stores user settings.

        :param userName: account name, sent as the ``name`` field of the login form
        :param password: account password, sent as the ``password`` field
        :param sleepTime: seconds to sleep after every downloaded image
        :param isFullLogging: if False, images already present on disk are not logged
        """
        self.payload = {
            'action': 'login',
            'name': userName,
            'password': password,
            'persistent': 'true'
        }
        self.sleepTime = sleepTime
        self.isFullLogging = isFullLogging
        # Filled in by setDestination(); defined here so the attribute always exists.
        self.destination = None

    def backupImages(self, destination):
        """Backs up all images which the user liked at Roumen.cz.

        :param destination: directory under which the backup folder is created
        """
        self.setDestination(destination)
        self.authorize()
        self.analyzePage()
        print('\nRoumen images backup has completed successfully')

    def analyzePage(self):
        """Analyzes the first likes page and processes every likes page it links to."""
        html = self.getHtmlDom(self.USER_LIKES_URL)
        pageLinks = self.parsePageLinks(html)
        self.processPageLinks(pageLinks)

    def setDestination(self, destination):
        """Sets the target destination of the files."""
        self.destination = destination

    def parsePageLinks(self, html):
        """Parses the first user likes page and returns links to all likes pages.

        :param html: dom of the first likes page
        :return: list of page urls, numbered from 1 up to the highest page
                 number found on the paging buttons (at least one page)
        """
        maxPageNumber = 1
        for button in self.parseRoumenContents(html).find('span.roumingButton'):
            try:
                maxPageNumber = max(maxPageNumber, int(button.find('a').text()))
            except ValueError:
                # buttons whose label is not a number are not page numbers
                continue

        return [self.USER_LIKE_PAGE_PATTERN.format(pageNumber)
                for pageNumber in range(1, maxPageNumber + 1)]

    def parseRoumenContents(self, dom):
        """Returns the html element with the main content (list of images)."""
        return dom.find('div.roumingList')

    def processPageLinks(self, pageLinks):
        """Processes all page links."""
        for pageLink in pageLinks:
            self.processPageLink(pageLink)

    def processPageLink(self, pageLink):
        """Processes one likes page.

        The page goes through getHtmlDom() so that it is decoded to text;
        parsing ``str()`` of the raw bytes would parse the ``b'...'`` repr,
        in which every non-ASCII character of an image name is an escape.
        """
        self.processImages(self.getHtmlDom(pageLink))

    def processImages(self, htmlDom):
        """Processes all images found in the table of a likes page.

        A failure on one image is logged as EXCEPTION and the loop continues.
        """
        tableRows = self.parseRoumenContents(htmlDom).find("tr")

        # skips first row (it is table header)
        for row in tableRows[1:]:
            cells = list(row.find('td'))
            if len(cells) < 2:
                # not an image row; indexing it would abort the whole backup
                continue

            category = cells[0].text().strip()
            name = cells[1].text().strip()

            try:
                state = self._backupImage(RoumenImage(category, name))
            except Exception:
                # Exception rather than BaseException: Ctrl-C and SystemExit
                # must still be able to stop the run.
                state = States.EXCEPTION

            self.logImageProgress(state, name)

    def _backupImage(self, roumenImage):
        """Downloads one image unless its file already exists; returns the state."""
        if os.path.isfile(self.getImageFilePath(roumenImage)):
            return States.FILE_EXIST

        roumenImage.url = self.resolvedImageUrl(roumenImage.category, roumenImage.name)
        if roumenImage.url is None:
            # the detail page contained no matching image
            return States.EXCEPTION

        self.downloadImage(roumenImage)
        self.sleepThread()
        return States.DOWNLOADED

    def logImageProgress(self, state, imageName):
        """Logs the image state (downloaded, exception, file_exist)."""
        if self.isFullLogging or state != States.FILE_EXIST:
            print('{0}:\t {1}'.format(state, imageName))

    def authorize(self):
        """Logs the user into the user section at Roumen.cz.

        Installs the cookie-aware opener globally, so that every later
        urllib.request call (including urlretrieve) shares the session.

        :return: the response of the login request
        """
        # NOTE(review): AUTHENTICATION_URL is plain http, so the credentials
        # are sent unencrypted - confirm whether the site offers https.
        urllib.request.install_opener(self.prepareOpener())
        formData = urllib.parse.urlencode(self.payload).encode()
        request = urllib.request.Request(self.AUTHENTICATION_URL, formData)
        return self.openUrl(request)

    def prepareOpener(self):
        """Prepares a request opener which keeps cookies and sets the User-agent."""
        cookieJar = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookieJar))
        # urllib applies an addheaders entry only if the request does not carry
        # that header yet, so a User-agent appended after the default one would
        # never be sent. Replace the default entry instead of appending.
        otherHeaders = [header for header in opener.addheaders
                        if header[0].lower() != 'user-agent']
        opener.addheaders = otherHeaders + [('User-agent', 'RedditTesting')]
        return opener

    def openUrl(self, url):
        """Opens the given url (or prepared request)."""
        return urllib.request.urlopen(url)

    def getHtmlDom(self, url):
        """Makes the html dom model from the response of the url.

        The body is decoded with the charset declared by the response
        (utf-8 when none is declared or the declared one is unknown);
        undecodable bytes are replaced instead of raising.
        """
        with self.openUrl(url) as response:
            charset = response.headers.get_content_charset() or 'utf-8'
            body = response.read()

        try:
            html = body.decode(charset, errors='replace')
        except LookupError:
            # the server declared a codec Python does not know
            html = body.decode('utf-8', errors='replace')
        return htmldom.HtmlDom().createDom(html)

    def resolvedImageUrl(self, category, imageName):
        """Resolves the direct image url from the detail page (rouming, maso).

        :return: the first image ``src`` containing imageName, or None
        """
        detailUrl = self.makeImageDetailUrl(category, imageName)
        html = self.getHtmlDom(detailUrl)

        # finds all image elements at page
        for imageElement in html.find('tr td a img'):
            source = imageElement.first().attr('src')

            # if will detect image name in url
            if isinstance(source, str) and imageName in source:
                return source

        return None

    def sleepThread(self):
        """Sleeps for sleepTime seconds (too many requests to Roumen.cz can get blocked)."""
        time.sleep(self.sleepTime)

    def makeImageDetailUrl(self, category, imageName):
        """Returns the url of the image detail page (e.g. with comments, information, ads, ...).

        The image name is percent-encoded, so that spaces, ``&`` and other
        reserved characters do not break the query string.
        """
        if category == 'Maso':
            urlPattern = self.IMAGE_DETAIL_MASO_URL_PATTERN
        else:
            urlPattern = self.IMAGE_DETAIL_ROUMING_URL_PATTERN
        return urlPattern.format(urllib.parse.quote(imageName, safe=''))

    def downloadImage(self, roumenImage: 'RoumenImage'):
        """Downloads the image from its url and stores it in the file system.

        The data is written to a ``.part`` file first and moved into place
        only after the download finished; an interrupted download therefore
        never leaves a truncated file that a later run would skip as done.
        """
        filePath = self.getImageFilePath(roumenImage)
        # exist_ok avoids the race between checking and creating the folder
        os.makedirs(os.path.dirname(filePath), exist_ok=True)

        partPath = filePath + '.part'
        try:
            urllib.request.urlretrieve(roumenImage.url, partPath)
            os.replace(partPath, filePath)
        finally:
            # only present here when the download or the move failed
            if os.path.exists(partPath):
                os.remove(partPath)

    def getImageFilePath(self, roumenImage: 'RoumenImage'):
        """Makes the full file system path of the image."""
        return os.path.join(self.destination, application, roumenImage.category, roumenImage.name)


# If you have liked a lot of images (e.g. 750+), use the constructor below
# instead: too many requests in a row can get blocked by Roumen.cz.
# The third argument sets the sleep time (in seconds) after each downloaded image.
#roumen = Roumen('<USER_NAME>', '<PASSWORD>', 1)

# Run the backup only when the file is executed as a script, not when it is imported.
if __name__ == '__main__':
    roumen = Roumen('<USER_NAME>', '<PASSWORD>')
    roumen.backupImages('/Users/jpenzes/Desktop')
	# Abort on the first failing command, so the backup is not started
	# when installing the dependency fails.
	set -e

	# Install dependencies
	python3.4 -m pip install htmldom

	# Start the script
	python3.4 RoumenBackup.py
	import http.cookiejar
	import os
	import urllib.parse
	import urllib.request
	import time

	from htmldom import htmldom
	from enum import Enum

	# Name of the folder, created inside the chosen destination, that holds the backup
	application = 'RBackup'
	# Author of this module
	__author__ = 'jpenzes'


	class RoumenImage:
	    """One liked image: its category, its file name and its direct url."""

	    def __init__(self, category, name):
	        """Remembers category and name; the url starts out as an empty string."""
	        self.category = category
	        self.name = name
	        self.url = ''


	class States(Enum):
	    """Outcome of processing one image, as reported by Roumen.logImageProgress"""
	    # the image was downloaded and stored during this run
	    DOWNLOADED = 'DOWNLOADED'
	    # the target file was already on disk, so nothing was downloaded
	    FILE_EXIST = 'FILE_EXIST'
	    # processing the image raised an exception
	    EXCEPTION = 'EXCEPTION'


	class Roumen:
	    """Backs up all images which a user liked at Roumen.cz.

	    Usage: ``Roumen(userName, password).backupImages(destination)``.
	    """

	    # Roumen URL patterns
	    AUTHENTICATION_URL = 'http://www.rouming.cz/roumingUserLogin.php'
	    BASE_URL = 'http://kecy.roumen.cz/'
	    TERMS_URL = 'http://maso.roumen.cz/'
	    USER_LIKES_URL = 'http://www.rouming.cz/roumingUserLikes.php'
	    USER_LIKE_PAGE_PATTERN = 'http://kecy.roumen.cz/roumingUserLikes.php?&page={0}'
	    IMAGE_DETAIL_ROUMING_URL_PATTERN = 'http://www.rouming.cz/roumingShow.php?file={0}'
	    IMAGE_DETAIL_MASO_URL_PATTERN = 'http://www.roumenovomaso.cz/masoShow.php?agree=on&file={0}'

	    def __init__(self, userName, password, sleepTime=0, isFullLogging=True):
	        """Stores user settings.

	        :param userName: account name, sent as the ``name`` field of the login form
	        :param password: account password, sent as the ``password`` field
	        :param sleepTime: seconds to sleep after every downloaded image
	        :param isFullLogging: if False, images already present on disk are not logged
	        """
	        self.payload = {
	            'action': 'login',
	            'name': userName,
	            'password': password,
	            'persistent': 'true'
	        }
	        self.sleepTime = sleepTime
	        self.isFullLogging = isFullLogging
	        # Filled in by setDestination(); defined here so the attribute always exists.
	        self.destination = None

	    def backupImages(self, destination):
	        """Backs up all images which the user liked at Roumen.cz.

	        :param destination: directory under which the backup folder is created
	        """
	        self.setDestination(destination)
	        self.authorize()
	        self.analyzePage()
	        print('\nRoumen images backup has completed successfully')

	    def analyzePage(self):
	        """Analyzes the first likes page and processes every likes page it links to."""
	        html = self.getHtmlDom(self.USER_LIKES_URL)
	        pageLinks = self.parsePageLinks(html)
	        self.processPageLinks(pageLinks)

	    def setDestination(self, destination):
	        """Sets the target destination of the files."""
	        self.destination = destination

	    def parsePageLinks(self, html):
	        """Parses the first user likes page and returns links to all likes pages.

	        :param html: dom of the first likes page
	        :return: list of page urls, numbered from 1 up to the highest page
	                 number found on the paging buttons (at least one page)
	        """
	        maxPageNumber = 1
	        for button in self.parseRoumenContents(html).find('span.roumingButton'):
	            try:
	                maxPageNumber = max(maxPageNumber, int(button.find('a').text()))
	            except ValueError:
	                # buttons whose label is not a number are not page numbers
	                continue

	        return [self.USER_LIKE_PAGE_PATTERN.format(pageNumber)
	                for pageNumber in range(1, maxPageNumber + 1)]

	    def parseRoumenContents(self, dom):
	        """Returns the html element with the main content (list of images)."""
	        return dom.find('div.roumingList')

	    def processPageLinks(self, pageLinks):
	        """Processes all page links."""
	        for pageLink in pageLinks:
	            self.processPageLink(pageLink)

	    def processPageLink(self, pageLink):
	        """Processes one likes page.

	        The page goes through getHtmlDom() so that it is decoded to text;
	        parsing ``str()`` of the raw bytes would parse the ``b'...'`` repr,
	        in which every non-ASCII character of an image name is an escape.
	        """
	        self.processImages(self.getHtmlDom(pageLink))

	    def processImages(self, htmlDom):
	        """Processes all images found in the table of a likes page.

	        A failure on one image is logged as EXCEPTION and the loop continues.
	        """
	        tableRows = self.parseRoumenContents(htmlDom).find("tr")

	        # skips first row (it is table header)
	        for row in tableRows[1:]:
	            cells = list(row.find('td'))
	            if len(cells) < 2:
	                # not an image row; indexing it would abort the whole backup
	                continue

	            category = cells[0].text().strip()
	            name = cells[1].text().strip()

	            try:
	                state = self._backupImage(RoumenImage(category, name))
	            except Exception:
	                # Exception rather than BaseException: Ctrl-C and SystemExit
	                # must still be able to stop the run.
	                state = States.EXCEPTION

	            self.logImageProgress(state, name)

	    def _backupImage(self, roumenImage):
	        """Downloads one image unless its file already exists; returns the state."""
	        if os.path.isfile(self.getImageFilePath(roumenImage)):
	            return States.FILE_EXIST

	        roumenImage.url = self.resolvedImageUrl(roumenImage.category, roumenImage.name)
	        if roumenImage.url is None:
	            # the detail page contained no matching image
	            return States.EXCEPTION

	        self.downloadImage(roumenImage)
	        self.sleepThread()
	        return States.DOWNLOADED

	    def logImageProgress(self, state, imageName):
	        """Logs the image state (downloaded, exception, file_exist)."""
	        if self.isFullLogging or state != States.FILE_EXIST:
	            print('{0}:\t {1}'.format(state, imageName))

	    def authorize(self):
	        """Logs the user into the user section at Roumen.cz.

	        Installs the cookie-aware opener globally, so that every later
	        urllib.request call (including urlretrieve) shares the session.

	        :return: the response of the login request
	        """
	        # NOTE(review): AUTHENTICATION_URL is plain http, so the credentials
	        # are sent unencrypted - confirm whether the site offers https.
	        urllib.request.install_opener(self.prepareOpener())
	        formData = urllib.parse.urlencode(self.payload).encode()
	        request = urllib.request.Request(self.AUTHENTICATION_URL, formData)
	        return self.openUrl(request)

	    def prepareOpener(self):
	        """Prepares a request opener which keeps cookies and sets the User-agent."""
	        cookieJar = http.cookiejar.CookieJar()
	        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookieJar))
	        # urllib applies an addheaders entry only if the request does not carry
	        # that header yet, so a User-agent appended after the default one would
	        # never be sent. Replace the default entry instead of appending.
	        otherHeaders = [header for header in opener.addheaders
	                        if header[0].lower() != 'user-agent']
	        opener.addheaders = otherHeaders + [('User-agent', 'RedditTesting')]
	        return opener

	    def openUrl(self, url):
	        """Opens the given url (or prepared request)."""
	        return urllib.request.urlopen(url)

	    def getHtmlDom(self, url):
	        """Makes the html dom model from the response of the url.

	        The body is decoded with the charset declared by the response
	        (utf-8 when none is declared or the declared one is unknown);
	        undecodable bytes are replaced instead of raising.
	        """
	        with self.openUrl(url) as response:
	            charset = response.headers.get_content_charset() or 'utf-8'
	            body = response.read()

	        try:
	            html = body.decode(charset, errors='replace')
	        except LookupError:
	            # the server declared a codec Python does not know
	            html = body.decode('utf-8', errors='replace')
	        return htmldom.HtmlDom().createDom(html)

	    def resolvedImageUrl(self, category, imageName):
	        """Resolves the direct image url from the detail page (rouming, maso).

	        :return: the first image ``src`` containing imageName, or None
	        """
	        detailUrl = self.makeImageDetailUrl(category, imageName)
	        html = self.getHtmlDom(detailUrl)

	        # finds all image elements at page
	        for imageElement in html.find('tr td a img'):
	            source = imageElement.first().attr('src')

	            # if will detect image name in url
	            if isinstance(source, str) and imageName in source:
	                return source

	        return None

	    def sleepThread(self):
	        """Sleeps for sleepTime seconds (too many requests to Roumen.cz can get blocked)."""
	        time.sleep(self.sleepTime)

	    def makeImageDetailUrl(self, category, imageName):
	        """Returns the url of the image detail page (e.g. with comments, information, ads, ...).

	        The image name is percent-encoded, so that spaces, ``&`` and other
	        reserved characters do not break the query string.
	        """
	        if category == 'Maso':
	            urlPattern = self.IMAGE_DETAIL_MASO_URL_PATTERN
	        else:
	            urlPattern = self.IMAGE_DETAIL_ROUMING_URL_PATTERN
	        return urlPattern.format(urllib.parse.quote(imageName, safe=''))

	    def downloadImage(self, roumenImage: 'RoumenImage'):
	        """Downloads the image from its url and stores it in the file system.

	        The data is written to a ``.part`` file first and moved into place
	        only after the download finished; an interrupted download therefore
	        never leaves a truncated file that a later run would skip as done.
	        """
	        filePath = self.getImageFilePath(roumenImage)
	        # exist_ok avoids the race between checking and creating the folder
	        os.makedirs(os.path.dirname(filePath), exist_ok=True)

	        partPath = filePath + '.part'
	        try:
	            urllib.request.urlretrieve(roumenImage.url, partPath)
	            os.replace(partPath, filePath)
	        finally:
	            # only present here when the download or the move failed
	            if os.path.exists(partPath):
	                os.remove(partPath)

	    def getImageFilePath(self, roumenImage: 'RoumenImage'):
	        """Makes the full file system path of the image."""
	        return os.path.join(self.destination, application, roumenImage.category, roumenImage.name)


	# If you have liked a lot of images (e.g. 750+), use the constructor below
	# instead: too many requests in a row can get blocked by Roumen.cz.
	# The third argument sets the sleep time (in seconds) after each downloaded image.
	#roumen = Roumen('<USER_NAME>', '<PASSWORD>', 1)

	# Run the backup only when the file is executed as a script, not when it is imported.
	if __name__ == '__main__':
	    roumen = Roumen('<USER_NAME>', '<PASSWORD>')
	    roumen.backupImages('/Users/jpenzes/Desktop')