Processing Vanilla forum bookmark page link extractor
# -*- coding: utf-8 -*-
"""Processing Vanilla forum bookmark page link extractor
1. Manually save the HTML pages of your bookmarks from the forum before it is decommissioned.
2. Run this script to extract the link data.
"""
import fnmatch
import os
from bs4 import BeautifulSoup


def scrape_bookmarks(filename):
    """Extract personal bookmark data from saved Processing Vanilla forum pages."""
    bookmark_list = []
    # Read the file as bytes; BeautifulSoup sniffs the page encoding itself.
    with open(filename, 'rb') as inputfile:
        filestring = inputfile.read()
    soup = BeautifulSoup(filestring, 'html.parser')
    # Each bookmark title sits in a <div class="Title"> wrapping the discussion link.
    passages = soup.select('div.Title')
    for psg in passages:
        plink = psg.a['href']
        ptitle = psg.get_text()
        bookmark_list.append((ptitle.strip(), plink))
    return bookmark_list
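
# Illustrative usage (the file name and result below are hypothetical):
#
#   scrape_bookmarks('bookmarks_page_1.html')
#   # -> [('Example thread', 'https://forum.processing.org/two/discussion/12345/example-thread')]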


def fpath_to_fnamelist(fpath, fnpattern):
    """
    Filepath to filename list:
    take a directory and pattern, return a list of file paths.
    fnpattern filters results using Unix shell-style wildcards: (*, ?, [abc], [!abc]).
    Uses fnmatch.filter.
    """
    return [os.path.join(dirpath, f)
            for dirpath, _dirnames, files in os.walk(fpath)
            for f in fnmatch.filter(files, fnpattern)]
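
# Illustrative usage: collect every saved .html page under the current
# directory, recursing into subdirectories (the paths shown are hypothetical):
#
#   fpath_to_fnamelist('./', '*.html')
#   # -> ['./bookmarks_page_1.html', './saved/bookmarks_page_2.html']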


def save_bookmarks(list_, filename):
    """Save nested lists of (title, url) bookmark tuples to a tab-separated text file."""
    if not list_:
        raise ValueError('No data to write.')
    if not filename:
        raise ValueError('No filename given.')
    try:
        with open(filename, 'w', encoding='utf-8') as outputfile:
            for item in list_:
                for title, url in item:
                    outputfile.write("{}\t{}\n".format(title, url))
    except OSError:
        print("File not written.")


if __name__ == '__main__':
    # Scrape every saved bookmark page in (and below) the current directory,
    # then write all of the collected bookmarks to one tab-separated file.
    results = []
    fname_list = fpath_to_fnamelist('./', '*.html')
    for fname in fname_list:
        results.append(scrape_bookmarks(fname))
    save_bookmarks(results, 'bookmarks.txt')