mgeeky/PhrackPhiles.py

## PhrackPhiles.py
#!/usr/bin/python

# This script walks a directory of PHRACK e-zine issues and, for each
# issue, downloads the issue's table of contents from phrack.org. The
# titles found there are used to rename every file (phile) in the issue
# directory.
#
# MGeeky, 2012

import os
import urllib
import string
from sys import exit
import re
import HTMLParser


# G L O B A L S

# Root directory holding the Phrack e-zines. RenFiles() expects the issue
# directories below it to contain "phrack<N>" in their name.
g_PhrackDir = r"d:\ebooks\SECURITY\MAGAZINES\PHRACK"

# File name format, filled in by RenFiles() from a mapping. Available tokens:
#       %(issue)d, %(phile)d, %(author)s, %(name)s.
g_NameFmt = "p%(issue)02d-0x%(phile)02x - %(author)s - %(name)s.txt"


###################################

def RenFiles(dir):
    """Rename the philes of one Phrack issue directory after their titles.

    The issue number is read from the last "phrack<N>" (one or two digits)
    found in `dir`. The issue's index page is downloaded from phrack.org and
    parsed with parsePage(); every "<N>.txt" file found under `dir`
    (sub-directories included) is then renamed according to g_NameFmt, using
    the N-th title/author pair listed on that page.

    Problems with the directory name, the download or a single file are
    reported on stdout and skipped. An empty download terminates the script
    through exit(1).
    """
    found = re.match(r".*phrack(\d{1,2})", dir)
    issue = int(found.group(1)) if found else 0
    if issue <= 0:
        print("\t[!] Error while parsing dir name: %s" % dir)
        return

    url = "http://phrack.org/issues/%d/1.html" % issue

    # The index page is the same for every directory walked below, so it is
    # fetched once. urlopen() signals failure by raising IOError.
    try:
        u = urllib.urlopen(url)
        try:
            page = u.read()
        finally:
            u.close()
    except IOError:
        print("\t[!] Cannot download '%s' resource" % url)
        return

    # Check for an empty page before handing it to the parser.
    if not page:
        print("\t[!] Cannot download %d issue!" % issue)
        exit(1)

    (names, authors) = parsePage(page)
    h = HTMLParser.HTMLParser()

    for root, _, files in os.walk(dir):

        print("\n[>] Renaming #%02d issue philes. There are %d of them."
              % (issue, len(files)))

        for f in files:

            if ".tar.gz" in f:
                continue

            m = re.match(r"(\d{1,2})\.txt", f, re.I)
            if not m:
                print("\t[!] Skipping '%s': not a numbered phile of #%d issue"
                      % (f, issue))
                continue

            # File "1.txt" corresponds to the first row of the index page.
            phile = int(m.group(1)) - 1
            if not 0 <= phile < len(names):
                print("\t[!] No title listed for '%s' in #%d issue"
                      % (f, issue))
                continue

            name = names[phile]
            author = authors[phile]
            if not name:
                continue

            # NOTE(review): the zero-based index is what ends up in the file
            # name, so "1.txt" becomes "...-0x00 - ..." - confirm intended.
            new = g_NameFmt % {"issue": issue, "phile": phile,
                               "author": author, "name": name}

            # Decode HTML entities first, then replace every character that
            # is not allowed in a Windows file name.
            new = h.unescape(new)
            new = re.sub(r'[<>:"/\\|?*]', '-', new)

            if f == new:
                continue

            oldf = os.path.join(root, f)
            newf = os.path.join(root, new)

            try:
                os.rename(oldf, newf)
            except OSError as err:
                print("\t[!] Couldn't rename file: '%s' ! (%s)" % (newf, err))
            else:
                print("\t%s -> %s" % (f, new))


###################################

def parsePage(page):
    """Extract article titles and authors from a Phrack issue index page.

    `page` is the HTML text of the index. Every table row of the form
        <tr><td align="left"><a href="...#article">TITLE</a></td>
        <td align="right">AUTHOR</td></tr>
    (written on one line, matched case-insensitively) contributes one entry.

    Returns a tuple (names, authors) of two parallel lists in page order.
    Raises AssertionError when the page contains no such row.
    """
    # Lazy quantifiers keep every match inside a single row, so several rows
    # sharing one line are still reported separately.
    rex = (r'<tr><td align="left"><a href=".+?#article">(.+?)<.a><.td>'
           r'<td align="right">(.+?)<.td><.tr>')

    names = []
    authors = []
    for m in re.finditer(rex, page, re.I | re.M):
        names.append(m.group(1))
        authors.append(m.group(2))

    assert len(names) == len(authors) and len(names) > 0
    return (names, authors)

###################################

if __name__ == '__main__':

    print("\n[+] PHRACK magazine files namer")

    # Only the directories directly below g_PhrackDir are issue directories.
    # RenFiles() walks each of them recursively on its own, so the walk here
    # stops after the top level instead of handing nested directory names
    # (joined onto the wrong parent) to RenFiles().
    for root, dirs, files in os.walk(g_PhrackDir):
        for d in dirs:
            RenFiles(os.path.join(root, d))
        break

    print("\nEnd of script.")
	#!/usr/bin/python

	# This script walks a directory of PHRACK e-zine issues and, for each
	# issue, downloads the issue's table of contents from phrack.org. The
	# titles found there are used to rename every file (phile) in the issue
	# directory.
	#
	# MGeeky, 2012

	import os
	import urllib
	import string
	from sys import exit
	import re
	import HTMLParser


	# G L O B A L S

	# Root directory holding the Phrack e-zines. RenFiles() expects the issue
	# directories below it to contain "phrack<N>" in their name.
	g_PhrackDir = r"d:\ebooks\SECURITY\MAGAZINES\PHRACK"

	# File name format, filled in by RenFiles() from a mapping. Available tokens:
	# %(issue)d, %(phile)d, %(author)s, %(name)s.
	g_NameFmt = "p%(issue)02d-0x%(phile)02x - %(author)s - %(name)s.txt"



	###################################

	def RenFiles(dir):
		"""Rename the philes of one Phrack issue directory after their titles.

		The issue number is read from the last "phrack<N>" (one or two digits)
		found in `dir`. The issue's index page is downloaded from phrack.org and
		parsed with parsePage(); every "<N>.txt" file found under `dir`
		(sub-directories included) is then renamed according to g_NameFmt, using
		the N-th title/author pair listed on that page.

		Problems with the directory name, the download or a single file are
		reported on stdout and skipped. An empty download terminates the script
		through exit(1).
		"""
		found = re.match(r".*phrack(\d{1,2})", dir)
		issue = int(found.group(1)) if found else 0
		if issue <= 0:
			print("\t[!] Error while parsing dir name: %s" % dir)
			return

		url = "http://phrack.org/issues/%d/1.html" % issue

		# The index page is the same for every directory walked below, so it is
		# fetched once. urlopen() signals failure by raising IOError.
		try:
			u = urllib.urlopen(url)
			try:
				page = u.read()
			finally:
				u.close()
		except IOError:
			print("\t[!] Cannot download '%s' resource" % url)
			return

		# Check for an empty page before handing it to the parser.
		if not page:
			print("\t[!] Cannot download %d issue!" % issue)
			exit(1)

		(names, authors) = parsePage(page)
		h = HTMLParser.HTMLParser()

		for root, _, files in os.walk(dir):

			print("\n[>] Renaming #%02d issue philes. There are %d of them."
				% (issue, len(files)))

			for f in files:

				if ".tar.gz" in f:
					continue

				m = re.match(r"(\d{1,2})\.txt", f, re.I)
				if not m:
					print("\t[!] Skipping '%s': not a numbered phile of #%d issue"
						% (f, issue))
					continue

				# File "1.txt" corresponds to the first row of the index page.
				phile = int(m.group(1)) - 1
				if not 0 <= phile < len(names):
					print("\t[!] No title listed for '%s' in #%d issue"
						% (f, issue))
					continue

				name = names[phile]
				author = authors[phile]
				if not name:
					continue

				# NOTE(review): the zero-based index is what ends up in the file
				# name, so "1.txt" becomes "...-0x00 - ..." - confirm intended.
				new = g_NameFmt % {"issue": issue, "phile": phile,
					"author": author, "name": name}

				# Decode HTML entities first, then replace every character that
				# is not allowed in a Windows file name.
				new = h.unescape(new)
				new = re.sub(r'[<>:"/\\|?*]', '-', new)

				if f == new:
					continue

				oldf = os.path.join(root, f)
				newf = os.path.join(root, new)

				try:
					os.rename(oldf, newf)
				except OSError as err:
					print("\t[!] Couldn't rename file: '%s' ! (%s)" % (newf, err))
				else:
					print("\t%s -> %s" % (f, new))


	###################################

	def parsePage(page):
		"""Extract article titles and authors from a Phrack issue index page.

		`page` is the HTML text of the index. Every table row of the form
			<tr><td align="left"><a href="...#article">TITLE</a></td>
			<td align="right">AUTHOR</td></tr>
		(written on one line, matched case-insensitively) contributes one entry.

		Returns a tuple (names, authors) of two parallel lists in page order.
		Raises AssertionError when the page contains no such row.
		"""
		# Lazy quantifiers keep every match inside a single row, so several rows
		# sharing one line are still reported separately.
		rex = (r'<tr><td align="left"><a href=".+?#article">(.+?)<.a><.td>'
			r'<td align="right">(.+?)<.td><.tr>')

		names = []
		authors = []
		for m in re.finditer(rex, page, re.I | re.M):
			names.append(m.group(1))
			authors.append(m.group(2))

		assert len(names) == len(authors) and len(names) > 0
		return (names, authors)

	###################################

	if __name__ == '__main__':

		print("\n[+] PHRACK magazine files namer")

		# Only the directories directly below g_PhrackDir are issue directories.
		# RenFiles() walks each of them recursively on its own, so the walk here
		# stops after the top level instead of handing nested directory names
		# (joined onto the wrong parent) to RenFiles().
		for root, dirs, files in os.walk(g_PhrackDir):
			for d in dirs:
				RenFiles(os.path.join(root, d))
			break

		print("\nEnd of script.")