Last active
August 29, 2015 13:57
-
-
Save jirkapenzes/9739287 to your computer and use it in GitHub Desktop.
Backup tool for Roumen.cz (backs up all liked images from a user's profile)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# install dependencies
python3.4 -m pip install htmldom
# start script
python3.4 RoumenBackup.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import http.cookiejar | |
import os | |
import urllib.parse | |
import urllib.request | |
import time | |
from htmldom import htmldom | |
from enum import Enum | |
# Author of the script.
__author__ = 'jpenzes'

# Name of the folder, created under the chosen destination, that receives
# the per-category image folders.
application = 'RBackup'
class RoumenImage:
    """Describes one liked image: its category, its file name and its direct url."""

    def __init__(self, category, name):
        """Remembers the category and the name; the url starts out empty."""
        # the direct url is filled in later, once it has been resolved
        self.url = ''
        self.category, self.name = category, name
class States(Enum):
    """Outcome of processing a single image."""

    # the image was fetched and stored
    DOWNLOADED = 'DOWNLOADED'
    # the target file was already present, nothing was fetched
    FILE_EXIST = 'FILE_EXIST'
    # processing of the image raised an error
    EXCEPTION = 'EXCEPTION'
class Roumen:
    """Backs up every image a user has liked at Roumen.cz.

    Typical use: ``Roumen(userName, password).backupImages(destination)``.
    Images are stored as ``<destination>/<application>/<category>/<name>``.
    """

    # Roumen URL patterns
    AUTHENTICATION_URL = 'http://www.rouming.cz/roumingUserLogin.php'
    BASE_URL = 'http://kecy.roumen.cz/'
    TERMS_URL = 'http://maso.roumen.cz/'
    USER_LIKES_URL = 'http://www.rouming.cz/roumingUserLikes.php'
    USER_LIKE_PAGE_PATTERN = 'http://kecy.roumen.cz/roumingUserLikes.php?&page={0}'
    IMAGE_DETAIL_ROUMING_URL_PATTERN = 'http://www.rouming.cz/roumingShow.php?file={0}'
    IMAGE_DETAIL_MASO_URL_PATTERN = 'http://www.roumenovomaso.cz/masoShow.php?agree=on&file={0}'

    def __init__(self, userName, password, sleepTime=0, isFullLogging=True):
        """Stores user settings.

        Args:
            userName: login name sent in the login form.
            password: password sent in the login form.
            sleepTime: seconds to pause after each downloaded image.
            isFullLogging: when False, images whose file already exists
                are not reported.
        """
        self.payload = {
            'action': 'login',
            'name': userName,
            'password': password,
            'persistent': 'true'
        }
        self.sleepTime = sleepTime
        self.isFullLogging = isFullLogging

    def backupImages(self, destination):
        """Backups all images which user liked at Roumen.cz.

        Args:
            destination: folder under which the backup tree is created.
        """
        self.setDestination(destination)
        # Only the cookies set by the login matter; release the connection.
        loginResponse = self.authorize()
        loginResponse.close()
        self.analyzePage()
        print('\nRoumen images backup has completed successfully')

    def analyzePage(self):
        """Analyzes first page and finds all links to other favorite pages"""
        html = self.getHtmlDom(self.USER_LIKES_URL)
        pageLinks = self.parsePageLinks(html)
        self.processPageLinks(pageLinks)

    def setDestination(self, destination):
        """Set a target files destination"""
        self.destination = destination

    def parsePageLinks(self, html):
        """Parses the first user likes page and returns links to all likes pages.

        The highest numeric pager button decides how many pages exist; at
        least one page link is always returned.
        """
        pageLinkButtons = self.parseRoumenContents(html).find('span.roumingButton')
        maxPageNumber = 1
        for button in pageLinkButtons:
            link = button.find('a')
            try:
                pageNumber = int(link.text())
            except ValueError:
                # buttons whose label is not a number are not page links
                continue
            maxPageNumber = max(maxPageNumber, pageNumber)
        return [self.USER_LIKE_PAGE_PATTERN.format(pageNumber)
                for pageNumber in range(1, maxPageNumber + 1)]

    def parseRoumenContents(self, dom):
        """Returns a html element with main content (list of images)"""
        return dom.find('div.roumingList')

    def processPageLinks(self, pageLinks):
        """Processes all page links"""
        for pageLink in pageLinks:
            self.processPageLink(pageLink)

    def processPageLink(self, pageLink):
        """Processes concrete page link"""
        # Go through getHtmlDom so the body is decoded to text; str() of the
        # raw bytes would hand the parser the "b'...'" repr instead.
        self.processImages(self.getHtmlDom(pageLink))

    def processImages(self, htmlDom):
        """Processes all images which it will find in html.

        A failure on one image is logged as States.EXCEPTION and the loop
        moves on to the next image.
        """
        tableRows = self.parseRoumenContents(htmlDom).find("tr")
        # skips first row (it is table header)
        for row in tableRows[1:]:
            cells = list(row.find('td'))
            if len(cells) < 2:
                # not an image row; indexing it would abort the whole backup
                continue
            category = cells[0].text().strip()
            name = cells[1].text().strip()
            try:
                state = self._backupImage(RoumenImage(category, name))
            except Exception:
                # Exception, not BaseException: Ctrl-C and SystemExit must
                # still be able to stop the run.
                state = States.EXCEPTION
            self.logImageProgress(state, name)

    def _backupImage(self, roumenImage):
        """Downloads one image unless its file already exists; returns the state."""
        if os.path.isfile(self.getImageFilePath(roumenImage)):
            return States.FILE_EXIST
        roumenImage.url = self.resolvedImageUrl(roumenImage.category, roumenImage.name)
        if not roumenImage.url:
            raise LookupError('image url not found for ' + roumenImage.name)
        self.downloadImage(roumenImage)
        self.sleepThread()
        return States.DOWNLOADED

    def logImageProgress(self, state, imageName):
        """Logs image state (downloaded, exception, file_exist)"""
        if self.isFullLogging or state != States.FILE_EXIST:
            print('{0}:\t {1}'.format(state, imageName))

    def authorize(self):
        """User authorizes into user section at Roumen.cz.

        Installs a cookie-aware opener globally and posts the login form.

        Returns:
            The response of the login request.
        """
        opener = self.prepareOpener()
        urllib.request.install_opener(opener)
        binary_data = urllib.parse.urlencode(self.payload).encode()
        request = urllib.request.Request(self.AUTHENTICATION_URL, binary_data)
        return self.openUrl(request)

    def prepareOpener(self):
        """Prepares request opener (keeps cookies, sends a User-agent header)"""
        cookieJar = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookieJar))
        opener.addheaders.append(('User-agent', 'RedditTesting'))
        return opener

    def openUrl(self, url):
        """Opens the specific url (a url string or a Request object)"""
        return urllib.request.urlopen(url)

    def getHtmlDom(self, url):
        """Makes the html dom model from url response"""
        with self.openUrl(url) as response:
            rawHtml = response.read()
            charset = response.headers.get_content_charset() or 'utf-8'
        try:
            # errors='replace': one bad byte must not abort the backup
            html = rawHtml.decode(charset, errors='replace')
        except LookupError:
            # the server announced a charset name Python does not know
            html = rawHtml.decode('utf-8', errors='replace')
        return htmldom.HtmlDom().createDom(html)

    def resolvedImageUrl(self, category, imageName):
        """Resolves a image direct url from html by category (rouming, maso).

        Returns:
            The ``src`` of the first image on the detail page that contains
            the image name (plain or percent-encoded), or None if none does.
        """
        detailUrl = self.makeImageDetailUrl(category, imageName)
        html = self.getHtmlDom(detailUrl)
        candidates = (imageName, urllib.parse.quote(imageName))
        # finds all image elements at page
        for imageElement in html.find('tr td a img'):
            src = imageElement.first().attr('src')
            # if will detect image name in url
            if src and any(candidate in src for candidate in candidates):
                return src
        return None

    def sleepThread(self):
        """Sleep main thread to few seconds (too many requests can get blocked by Roumen.cz)"""
        time.sleep(self.sleepTime)

    def makeImageDetailUrl(self, category, imageName):
        """Returns direct url to image detail page (e.g. with comments, information, ads, ...)"""
        urlPattern = self.IMAGE_DETAIL_MASO_URL_PATTERN \
            if category == 'Maso' else self.IMAGE_DETAIL_ROUMING_URL_PATTERN
        # The name goes into the query string, so it has to be percent-encoded
        # (spaces, '&', '+', non-ASCII characters).
        return urlPattern.format(urllib.parse.quote(imageName, safe=''))

    def downloadImage(self, roumenImage: 'RoumenImage'):
        """Downloads image from url and stores into file system"""
        target = os.path.join(self.destination, application, roumenImage.category)
        os.makedirs(target, exist_ok=True)
        # makes file path
        filePath = self.getImageFilePath(roumenImage)
        # Download next to the final name and move into place only when
        # complete, so an interrupted transfer is not later taken for a
        # finished file.
        partialPath = filePath + '.part'
        try:
            urllib.request.urlretrieve(roumenImage.url, partialPath)
            os.replace(partialPath, filePath)
        finally:
            if os.path.exists(partialPath):
                os.remove(partialPath)

    def getImageFilePath(self, roumenImage: 'RoumenImage'):
        """Makes full image file system path"""
        return os.path.join(self.destination, application, roumenImage.category, roumenImage.name)
# If you have liked too many images (e.g. 750+), use the constructor variant
# below instead, because too many requests can get blocked by Roumen.cz.
# The third argument sets the sleep time (in seconds) before the next image
# request:
#roumen = Roumen('<USER_NAME>', '<PASSWORD>', 1)
if __name__ == '__main__':
    # Guarded so that importing this module does not log in and download.
    roumen = Roumen('<USER_NAME>', '<PASSWORD>')
    roumen.backupImages('/Users/jpenzes/Desktop')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment