Skip to content

Instantly share code, notes, and snippets.

@Svenito
Last active August 29, 2015 14:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Svenito/fce6a9e44eb4984c464f to your computer and use it in GitHub Desktop.
Save Svenito/fce6a9e44eb4984c464f to your computer and use it in GitHub Desktop.
Extracts the main Phrack article and writes it to a file for easier printing. (Or download the tar of the issue, if you are boring. ;) )
#!/bin/env python2.7
from BeautifulSoup import BeautifulSoup
import requests
import sys
import re
import os
import argparse
def validateUrl(phrack_url):
    '''
    Normalize and validate a Phrack article URL.

    Prefixes the URL with http:// when the scheme is missing, then checks
    that it matches the expected format, e.g.
    http://phrack.org/issues/67/9.html

    Returns a (url, issue_number, article_number) tuple of strings.
    Exits the process with status 1 when the URL is not valid.
    '''
    if not phrack_url.startswith('htt'):
        phrack_url = 'http://' + phrack_url
    # NOTE: the groups must be (\d+), not (\d)+ -- a repeated single-char
    # group only keeps the LAST digit, so issue 67 came back as '7'.
    m = re.match(r'^http(s)?://phrack\.org/issues/(\d+)/(\d+)\.htm(l)?',
                 phrack_url)
    if m is None:
        print('%s is not a valid URL.' % phrack_url)
        print('Something like phrack.org/issues/43/42.html would be nice')
        sys.exit(1)
    issue_number = m.group(2)
    article_number = m.group(3)
    return phrack_url, issue_number, article_number
def prep_title_for_filename(title, number):
    '''
    Turn an article title into a filename of the form "NN_Title.txt".

    Strips the ' : ' separator and any slashes, converts spaces to
    underscores, and prefixes the zero-padded article number.
    '''
    cleaned = title.replace(' : ', '').replace(' ', '_').replace('/', '')
    return '%02d_%s.txt' % (int(number), cleaned)
def main(url, autofile, outfile=''):
    '''
    Fetch the requested Phrack article URL, parse it to extract the
    article, and print it to stdout or to a file.

    url      -- the article URL (validated via validateUrl)
    autofile -- when True, derive the output filename from the article
                title and save it under a per-issue directory
                (phrack_<issue>/); overrides outfile
    outfile  -- explicit output filename; falsy means write to stdout

    Returns 0 on success, 1 on any error.
    '''
    phrack_url, issue_number, article_number = validateUrl(url)
    content = requests.get(phrack_url)
    if content.status_code != 200:
        print('Didn`t manage to fetch that page: %d' % content.status_code)
        return 1

    soup = BeautifulSoup(content.text)
    article_title = soup.find('div', {'id': 'article'})
    article_author = article_title.findNext('div').contents

    out_filename = outfile
    if autofile:
        target_dir = os.path.join(os.getcwd(), 'phrack_' + issue_number)
        try:
            if not os.path.exists(target_dir):
                os.mkdir(target_dir)
        # Was a bare "except:", which also swallowed SystemExit and
        # KeyboardInterrupt; filesystem failures raise OSError.
        except OSError:
            print('Unable to create directory.')
            return 1
        title_filename = prep_title_for_filename(
            article_title.contents[1].string, article_number)
        out_filename = os.path.join(target_dir, title_filename)

    # Only remember/replace the real stdout when a file is actually
    # opened.  The original code unconditionally ran sys.stdout.close()
    # and referenced old_stdout at the end, which closed the terminal's
    # stdout and raised NameError when printing to stdout.
    old_stdout = None
    if out_filename:
        try:
            out_fh = open(out_filename, 'w')
        except IOError as e:
            print('Error opening %s %s' % (out_filename, e))
            return 1
        old_stdout = sys.stdout
        sys.stdout = out_fh

    try:
        print('This article was extracted from: %s\n\n' % phrack_url)
        if article_title:
            print(article_title.contents[0].string +
                  article_title.contents[1].string)
        if article_author:
            print(article_author[1].string + article_author[2].string)
        print('')
        article = soup.find('pre')
        print(article.contents[0])
    finally:
        # Restore stdout and close the file only if one was opened.
        if old_stdout is not None:
            sys.stdout.close()
            sys.stdout = old_stdout
    return 0
if __name__ == '__main__':
    # Command-line entry point: parse arguments and hand off to main(),
    # propagating its return value as the process exit status.
    arg_parser = argparse.ArgumentParser(
        description='Get a printable Phrack article')
    arg_parser.add_argument(
        'url',
        help='The URL to the Phrack article. eg: http://phrack.org/issues/67/9.html')
    arg_parser.add_argument(
        '-o', '--outfile',
        help='File to write output to. If omitted writes to stdout')
    arg_parser.add_argument(
        '-a', '--autofile', action='store_true',
        help='Filename is title of article and saved to issue number directory.')
    cli_args = arg_parser.parse_args()
    sys.exit(main(cli_args.url, cli_args.autofile, cli_args.outfile))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment