Skip to content

Instantly share code, notes, and snippets.

@Svenito
Last active August 29, 2015 14:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Svenito/fce6a9e44eb4984c464f to your computer and use it in GitHub Desktop.
Save Svenito/fce6a9e44eb4984c464f to your computer and use it in GitHub Desktop.
Extracts the main Phrack article and writes it to a file for easier printing. (Or download the tar of the issue, if you are boring. ;) )
#!/bin/env python2.7
from BeautifulSoup import BeautifulSoup
import requests
import sys
import re
import os
import argparse
def validateUrl(phrack_url):
    '''
    Normalize and validate a Phrack article URL.

    Prefixes the URL with http:// when the scheme is missing, then checks
    that it matches the expected format, e.g.
    http://phrack.org/issues/67/9.html

    Returns a (url, issue_number, article_number) tuple of strings.
    Exits the process with status 1 when the URL is not valid.
    '''
    if not phrack_url.startswith('htt'):
        phrack_url = 'http://' + phrack_url
    # NOTE: the groups must be (\d+), not (\d)+ -- a repeated single-char
    # group only keeps the LAST digit, so issue 67 came back as '7'.
    m = re.match(r'^http(s)?://phrack\.org/issues/(\d+)/(\d+)\.htm(l)?',
                 phrack_url)
    if m is None:
        print('%s is not a valid URL.' % phrack_url)
        print('Something like phrack.org/issues/43/42.html would be nice')
        sys.exit(1)
    issue_number = m.group(2)
    article_number = m.group(3)
    return phrack_url, issue_number, article_number
def prep_title_for_filename(title, number):
    '''
    Turn an article title into a filename of the form "NN_Title.txt".

    Strips the ' : ' separator and any slashes, converts spaces to
    underscores, and prefixes the zero-padded article number.
    '''
    cleaned = title.replace(' : ', '').replace(' ', '_').replace('/', '')
    return '%02d_%s.txt' % (int(number), cleaned)
def main(url, autofile, outfile=''):
    '''
    Fetch the requested Phrack article URL, parse it to extract the
    article, and print it to stdout or to a file.

    url      -- the article URL (validated via validateUrl)
    autofile -- when True, derive the output filename from the article
                title and save it under a per-issue directory
                (phrack_<issue>/); overrides outfile
    outfile  -- explicit output filename; falsy means write to stdout

    Returns 0 on success, 1 on any error.
    '''
    phrack_url, issue_number, article_number = validateUrl(url)
    content = requests.get(phrack_url)
    if content.status_code != 200:
        print('Didn`t manage to fetch that page: %d' % content.status_code)
        return 1

    soup = BeautifulSoup(content.text)
    article_title = soup.find('div', {'id': 'article'})
    article_author = article_title.findNext('div').contents

    out_filename = outfile
    if autofile:
        target_dir = os.path.join(os.getcwd(), 'phrack_' + issue_number)
        try:
            if not os.path.exists(target_dir):
                os.mkdir(target_dir)
        # Was a bare "except:", which also swallowed SystemExit and
        # KeyboardInterrupt; filesystem failures raise OSError.
        except OSError:
            print('Unable to create directory.')
            return 1
        title_filename = prep_title_for_filename(
            article_title.contents[1].string, article_number)
        out_filename = os.path.join(target_dir, title_filename)

    # Only remember/replace the real stdout when a file is actually
    # opened.  The original code unconditionally ran sys.stdout.close()
    # and referenced old_stdout at the end, which closed the terminal's
    # stdout and raised NameError when printing to stdout.
    old_stdout = None
    if out_filename:
        try:
            out_fh = open(out_filename, 'w')
        except IOError as e:
            print('Error opening %s %s' % (out_filename, e))
            return 1
        old_stdout = sys.stdout
        sys.stdout = out_fh

    try:
        print('This article was extracted from: %s\n\n' % phrack_url)
        if article_title:
            print(article_title.contents[0].string +
                  article_title.contents[1].string)
        if article_author:
            print(article_author[1].string + article_author[2].string)
        print('')
        article = soup.find('pre')
        print(article.contents[0])
    finally:
        # Restore stdout and close the file only if one was opened.
        if old_stdout is not None:
            sys.stdout.close()
            sys.stdout = old_stdout
    return 0
if __name__ == '__main__':
    # Command-line entry point: parse arguments and hand off to main(),
    # propagating its return value as the process exit status.
    arg_parser = argparse.ArgumentParser(
        description='Get a printable Phrack article')
    arg_parser.add_argument(
        'url',
        help='The URL to the Phrack article. eg: http://phrack.org/issues/67/9.html')
    arg_parser.add_argument(
        '-o', '--outfile',
        help='File to write output to. If omitted writes to stdout')
    arg_parser.add_argument(
        '-a', '--autofile', action='store_true',
        help='Filename is title of article and saved to issue number directory.')
    cli_args = arg_parser.parse_args()
    sys.exit(main(cli_args.url, cli_args.autofile, cli_args.outfile))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment