############################################################################
# This file is used to download all web pages that are bookmarked in Chrome
# to the local folder that runs the program. It works by parsing the
# Chrome bookmark file for URLs and then uses the wget Linux command to
# download the list of URLs. Downloading web pages is useful so that they
# can still be viewed offline.
############################################################################
# Import os for checking if files exist and making system calls
import os
# Import re for using regular expressions to parse the bookmark file for URLs
import re
# Import sys for exiting the program when necessary
import sys

#TODO: find a way to update the bookmark file
#TODO: adapt for use on Windows
#TODO: adapt for use with other browsers
#TODO: streamline the installation process, maybe by packaging Python in an installer
# Get the current working directory
cwd = os.getcwd()
# Get the user's home folder path on Linux
home = os.getenv("HOME")
# File where Chrome automatically stores bookmark URLs and metadata on Linux.
# We will use a regex to extract only the URLs from this file.
# Note there is no file extension for the bookmarks file.
bookmarkFile = home + "/.config/google-chrome/Default/Bookmarks"
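# The Bookmarks file is JSON. A rough sketch of its layout (assumed from
# Chrome's format; folders nest their entries under "children"):
#   { "roots": { "bookmark_bar": { "children": [
#         { "name": "Example", "type": "url", "url": "http://example.com/" },
#         ... ] }, ... } }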
# File for keeping track of all URLs that have been previously downloaded.
# We will check to see if a URL already exists in this file before actually
# writing the URL to our file used for wget.
masterList = cwd + "/bookmarksMaster.txt"
# The file actually used by the wget call. This file will contain only
# the new URLs that are not already contained in the master list. This
# is used to prevent re-downloading all previous URLs we have already
# downloaded. (The --no-clobber option for wget is supposed to prevent this
# from happening, but it doesn't seem to work correctly when the
# --html-extension option is also used with wget.)
newList = cwd + "/bookmarksNew.txt"
# This function is used to rename all files to a format compatible with
# Windows. The reasoning for this is in case someone runs the script on Linux
# and wants the files downloaded to Dropbox so they sync on Windows.
def renameFilesForWindows():
    for dirpath, dirs, files in os.walk(cwd):
        for oldFilename in files:
            # Replace every character that Windows forbids in filenames.
            newFilename = re.sub(r'[<>:"\\?|*]', '_', oldFilename)
            if newFilename != oldFilename:
                os.rename(os.path.join(dirpath, oldFilename),
                          os.path.join(dirpath, newFilename))
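# Example: a page saved as 'search?q=python.html' becomes
# 'search_q=python.html' after the call at the bottom of this script.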
# Make sure the bookmark file exists,
# then open it and store its contents in the data variable.
if os.path.isfile(bookmarkFile):
    with open(bookmarkFile, "r") as fil:
        data = fil.read()
else:
    print("\n'" + bookmarkFile + "' does not exist.")
    print("This is what was specified as your Chrome bookmark file.\n")
    sys.exit()
# Store the URLs in a list of regex-match tuples; index 0 of each tuple
# holds the full URL.
urls = re.findall(r"((https?):((//)|(\\\\))+[\w\d:#@%/;$()~_?\+-=\\\.&]*)", data)
# Store the descriptions of each URL.
desc = re.findall(r'"name": "(.*?)",\n[ ]*"type": "url",', data)
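# Alternative sketch (not called anywhere): since the Bookmarks file is
# JSON, the standard json module can walk it instead of regexes. This
# assumes the "type"/"url"/"name"/"children" layout sketched above.
import json
def extractBookmarks(node, results):
    if node.get("type") == "url":
        results.append((node.get("url"), node.get("name")))
    for child in node.get("children", []):
        extractBookmarks(child, results)
# Usage sketch:
#   pairs = []
#   for root in json.loads(data)["roots"].values():
#       if isinstance(root, dict):
#           extractBookmarks(root, pairs)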
# Make sure the master list bookmark file exists,
# then store its contents in the masterListURLs variable.
if os.path.isfile(masterList):
    with open(masterList, "r") as mlfil:
        masterListURLs = mlfil.read()
else:
    # Create an empty master list for future runs.
    open(masterList, "w").close()
    masterListURLs = ""
# Open file for writing new URLs
wrfile = open(newList, "w")
# Open file for containing each description with its associated URL
descriptionFile = open("BookmarkDownloaderDescriptions.txt", "w")
# Variable for cycling through the description array
i = 0
# For every URL, index 0 gets just the url part of the tuple,
# then write it to the file if it doesn't already exist in the master list.
# Also write the information to the description file.
for line in urls:
    line = line[0]
    if masterListURLs.find(line) < 0:  # find() returns -1 if not found
        wrfile.write(line + "\n")
        with open(masterList, "a") as ml:  # Append new URL to master list
            ml.write(line + "\n")
        print(line)
        # Guard against the two regexes finding different numbers of
        # matches, which would otherwise raise an IndexError here.
        if i < len(desc):
            descriptionFile.write(desc[i] + "\n")
        descriptionFile.write(line + "\n\n")
    i = i + 1
wrfile.close()
descriptionFile.close()
# Use the wget Linux command to download the web pages.
# -i specifies a file to get the URLs from
# -P specifies an output folder (no longer used)
# -p specifies downloading the whole web page (images and all)
# --no-clobber specifies ignoring files that have already been downloaded
# --timeout=10 specifies a 10 second timeout for trying to connect to a page
# --tries=3 specifies the max number of tries for a page before skipping it
# --html-extension saves the files with an .html extension
os.system("wget -i " + newList + " -p --no-clobber --html-extension --timeout=10 --tries=3")
renameFilesForWindows()