Skip to content

Instantly share code, notes, and snippets.

@jirkapenzes
Last active August 29, 2015 13:57
Show Gist options
  • Save jirkapenzes/9739287 to your computer and use it in GitHub Desktop.
Save jirkapenzes/9739287 to your computer and use it in GitHub Desktop.
Backup tool for Roumen.cz (backs up all liked images from a user profile)
#install dependencies
python3.4 -m pip install htmldom
#start script
python3.4 RoumenBackup.py
import http.cookiejar
import os
import urllib.parse
import urllib.request
import time
from htmldom import htmldom
from enum import Enum
# Name of the sub-directory, created under the chosen backup destination,
# into which the per-category image folders are written.
application = 'RBackup'
# Author of this script.
__author__ = 'jpenzes'
class RoumenImage:
    """A single liked image, identified by its category and file name."""

    def __init__(self, category, name):
        """Remember the category and name; the URL starts out empty.

        The ``url`` attribute is an empty string until a caller assigns the
        resolved direct image address to it.
        """
        self.url = ''
        self.name = name
        self.category = category
class States(Enum):
    """Possible outcomes of processing one image during a backup run."""

    # The image was fetched and stored during this run.
    DOWNLOADED = 'DOWNLOADED'

    # A file for the image was already present, so nothing was fetched.
    FILE_EXIST = 'FILE_EXIST'

    # Processing the image failed with an error.
    EXCEPTION = 'EXCEPTION'
class Roumen:
    """Backs up every image the given user has liked at Roumen.cz.

    Typical use is ``Roumen(userName, password).backupImages(destination)``:
    the instance logs in, walks all "user likes" pages and stores each listed
    image under ``<destination>/<application>/<category>/<name>``.
    """

    # Roumen URL patterns
    AUTHENTICATION_URL = 'http://www.rouming.cz/roumingUserLogin.php'
    BASE_URL = 'http://kecy.roumen.cz/'
    TERMS_URL = 'http://maso.roumen.cz/'
    USER_LIKES_URL = 'http://www.rouming.cz/roumingUserLikes.php'
    USER_LIKE_PAGE_PATTERN = 'http://kecy.roumen.cz/roumingUserLikes.php?&page={0}'
    IMAGE_DETAIL_ROUMING_URL_PATTERN = 'http://www.rouming.cz/roumingShow.php?file={0}'
    IMAGE_DETAIL_MASO_URL_PATTERN = 'http://www.roumenovomaso.cz/masoShow.php?agree=on&file={0}'

    def __init__(self, userName, password, sleepTime=0, isFullLogging=True):
        """Stores user settings.

        Args:
            userName: login name sent in the authentication form.
            password: password sent in the authentication form.
            sleepTime: seconds to pause after each downloaded image.
            isFullLogging: when false, images whose file already exists are
                not reported.
        """
        self.payload = {
            'action': 'login',
            'name': userName,
            'password': password,
            'persistent': 'true'
        }
        self.sleepTime = sleepTime
        self.isFullLogging = isFullLogging

    def backupImages(self, destination):
        """Backs up all images which the user liked at Roumen.cz.

        Args:
            destination: base directory under which the images are stored.
        """
        self.setDestination(destination)
        self.authorize()
        self.analyzePage()
        print('\nRoumen images backup has completed successfully')

    def analyzePage(self):
        """Analyzes the first page and processes all favorite pages it links to."""
        html = self.getHtmlDom(self.USER_LIKES_URL)
        self.processPageLinks(self.parsePageLinks(html))

    def setDestination(self, destination):
        """Sets the target files destination."""
        self.destination = destination

    def parsePageLinks(self, html):
        """Returns the links to all user likes pages (page 1 included).

        The highest numeric label found among the paging buttons determines
        how many page links are generated; with no numeric button a single
        link to page 1 is returned.
        """
        maxPageNumber = 1
        for button in self.parseRoumenContents(html).find('span.roumingButton'):
            try:
                maxPageNumber = max(maxPageNumber, int(button.find('a').text()))
            except ValueError:
                # buttons without a numeric label are not page numbers
                continue
        return [self.USER_LIKE_PAGE_PATTERN.format(pageNumber)
                for pageNumber in range(1, maxPageNumber + 1)]

    def parseRoumenContents(self, dom):
        """Returns a html element with main content (list of images)."""
        return dom.find('div.roumingList')

    def processPageLinks(self, pageLinks):
        """Processes all page links."""
        for pageLink in pageLinks:
            self.processPageLink(pageLink)

    def processPageLink(self, pageLink):
        """Downloads one likes page and processes the images listed on it."""
        # Reuse getHtmlDom so the body is decoded to text; parsing str(bytes)
        # would feed the "b'...'" repr (with escaped bytes) to the parser.
        self.processImages(self.getHtmlDom(pageLink))

    def processImages(self, htmlDom):
        """Processes all images listed in the table found in the html."""
        tableRows = self.parseRoumenContents(htmlDom).find("tr")
        # skips first row (it is table header)
        for row in tableRows[1:]:
            cells = list(row.find('td'))
            if len(cells) < 2:
                # not an image row (category and name cells are required)
                continue
            category = cells[0].text().strip()
            name = cells[1].text().strip()
            try:
                state = self._backupImage(RoumenImage(category, name))
            except Exception:
                # Best effort: one failing image must not stop the backup.
                # KeyboardInterrupt/SystemExit are deliberately not caught.
                state = States.EXCEPTION
            self.logImageProgress(state, name)

    def _backupImage(self, roumenImage):
        """Stores one image unless its file already exists; returns the state."""
        if os.path.isfile(self.getImageFilePath(roumenImage)):
            return States.FILE_EXIST
        roumenImage.url = self.resolvedImageUrl(roumenImage.category, roumenImage.name)
        if roumenImage.url is None:
            # no matching <img> on the detail page, nothing to download
            return States.EXCEPTION
        self.downloadImage(roumenImage)
        self.sleepThread()
        return States.DOWNLOADED

    def logImageProgress(self, state, imageName):
        """Logs image state (downloaded, exception, file_exist)."""
        if self.isFullLogging or state is not States.FILE_EXIST:
            print('{0}:\t {1}'.format(state, imageName))

    def authorize(self):
        """Logs the user into the user section at Roumen.cz.

        Installs a cookie-aware opener as the process-wide urllib default and
        posts the login form. Returns the (still open) login response.
        """
        urllib.request.install_opener(self.prepareOpener())
        binary_data = urllib.parse.urlencode(self.payload).encode()
        request = urllib.request.Request(self.AUTHENTICATION_URL, binary_data)
        return self.openUrl(request)

    def prepareOpener(self):
        """Prepares a cookie-keeping request opener with a custom user agent."""
        cookieJar = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookieJar))
        # Replace the default header list: appending a second 'User-agent'
        # entry has no effect because urllib only applies the first one.
        opener.addheaders = [('User-agent', 'RedditTesting')]
        return opener

    def openUrl(self, url):
        """Opens the specific url (string or Request) and returns the response."""
        return urllib.request.urlopen(url)

    def getHtmlDom(self, url):
        """Makes the html dom model from the url response."""
        # close the connection as soon as the body has been read
        with self.openUrl(url) as response:
            html = response.read().decode()
        return htmldom.HtmlDom().createDom(html)

    def resolvedImageUrl(self, category, imageName):
        """Resolves the direct image url from the image detail page.

        Returns the ``src`` of the first image element whose url contains
        ``imageName``, or ``None`` when the page has no such element.
        """
        detailUrl = self.makeImageDetailUrl(category, imageName)
        html = self.getHtmlDom(detailUrl)
        # finds all image elements at page
        for imageElement in html.find('tr td a img'):
            source = imageElement.first().attr('src')
            # the wanted image carries its own name in the url
            if imageName in source:
                return source
        return None

    def sleepThread(self):
        """Pauses between image requests (too many requests may get blocked)."""
        if self.sleepTime > 0:
            time.sleep(self.sleepTime)

    def makeImageDetailUrl(self, category, imageName):
        """Returns the url of the image detail page for the given category."""
        if category == 'Maso':
            urlPattern = self.IMAGE_DETAIL_MASO_URL_PATTERN
        else:
            urlPattern = self.IMAGE_DETAIL_ROUMING_URL_PATTERN
        return urlPattern.format(imageName)

    def downloadImage(self, roumenImage: 'RoumenImage'):
        """Downloads the image from its url and stores it in the file system.

        The data is first written to a ``.part`` file and renamed afterwards,
        so an interrupted download never leaves a truncated image that a
        later run would report as already existing.
        """
        target = os.path.join(self.destination, application, roumenImage.category)
        os.makedirs(target, exist_ok=True)
        # makes file path
        filePath = self.getImageFilePath(roumenImage)
        partialPath = filePath + '.part'
        try:
            urllib.request.urlretrieve(roumenImage.url, partialPath)
            os.replace(partialPath, filePath)
        finally:
            # only present here when the download or the rename failed
            if os.path.exists(partialPath):
                os.remove(partialPath)

    def getImageFilePath(self, roumenImage: 'RoumenImage'):
        """Makes the full image file system path."""
        # NOTE(review): category and name come from the scraped page and are
        # joined unescaped -- consider validating them against path traversal.
        return os.path.join(self.destination, application, roumenImage.category, roumenImage.name)
# Run the backup only when this file is executed as a script, so that
# importing the module does not log in and start downloading.
if __name__ == '__main__':
    # If you have liked too many images (e.g. 750+), please use the
    # constructor below instead, because too many requests to Roumen.cz can
    # get blocked. The third argument sets the sleep time (in seconds)
    # before the next image request.
    #roumen = Roumen('<USER_NAME>', '<PASSWORD>', 1)
    roumen = Roumen('<USER_NAME>', '<PASSWORD>')
    roumen.backupImages('/Users/jpenzes/Desktop')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment