Created
March 11, 2012 07:49
-
-
Save willwade/2015472 to your computer and use it in GitHub Desktop.
Collection of AT Software - Parse a bunch of URLs for Software to import
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Need unicodecsv seen here http://docs.python.org/library/csv.html?highlight=csv#csv | |
# Date: Jan 2012 | |
# -*- coding: iso-8859-15 -*- | |
from urllib2 import urlopen | |
from lxml import etree | |
import csv | |
from unicodecsv import UnicodeWriter | |
import re | |
# Category pages scraped on this run (a subset of ``aurls``).
urls = {
    "keyboard help": "http://www.adaptech.org/en/downloads/fandi/cat/50",
    "mouse help": "http://www.adaptech.org/en/downloads/fandi/cat/60",
}

# Full map of Adaptech "Free and Inexpensive" category pages, keyed by
# assistive-technology area.
aurls = {
    "writing": "http://www.adaptech.org/en/downloads/fandi/cat/10",
    "visual": "http://www.adaptech.org/en/downloads/fandi/cat/20",
    "VI": "http://www.adaptech.org/en/downloads/fandi/cat/30",
    "dictation": "http://www.adaptech.org/en/downloads/fandi/cat/40",
    "keyboard help": "http://www.adaptech.org/en/downloads/fandi/cat/50",
    "mouse help": "http://www.adaptech.org/en/downloads/fandi/cat/60",
    "misc": "http://www.adaptech.org/en/downloads/fandi/cat/1000",
}
def get_content(url): | |
''' | |
Tries to open the url passed. | |
Returns the content of the page if successful or else None. | |
''' | |
try: | |
content = urlopen(url, timeout=20) | |
return content | |
except: | |
print "Error Opening: ", url | |
return None | |
def get_title(entry):
    '''
    Return the text of the <h3> child of *entry*, or '' when no <h3>
    child exists.  If several <h3> children exist the last one wins,
    matching the original scan order.
    '''
    heading = ""
    for node in entry.getchildren():
        if node.tag == 'h3':
            heading = node.text
    return heading
def get_data(entry, name='Manufacturer:', atag=False):
    '''
    Look through the children of *entry* for a <div> whose text equals
    *name* and pull the value from the element that follows it: the
    sibling's text, or, when *atag* is True, the href of an <a> found
    inside that sibling.  Returns '' when no label matches.  As in the
    original scan, a later match overwrites an earlier one.
    '''
    value = ""
    for node in entry.getchildren():
        if node.tag != 'div' or node.text != name:
            continue
        sibling = node.getnext()
        if not atag:
            value = sibling.text
        else:
            for sub in sibling.getchildren():
                if sub.tag == 'a':
                    value = sub.get('href')
    return value
def rename_type(type):
    '''
    Map an Adaptech licence label onto our categories: anything marked
    "Inexpensive" or "Trial" counts as Commercial, anything marked "Free"
    as Freeware, and everything else is unknown.

    NOTE: the parameter shadows the builtin ``type``; the name is kept for
    interface compatibility.
    '''
    if re.search("Inexpensive|Trial", type):
        return 'Commercial'
    if re.search("Free", type):
        return 'Freeware'
    return 'unknown'
def rename_ostype():
    '''
    Placeholder: split an OS string on slashes into separate components,
    e.g. "Windows 2000 /2003/XP/Vista/7".  Not implemented yet; always
    returns None.
    '''
    return None
def parse_adaptech(tree, type='Unsure'):
    '''
    Parses the adaptech pages.

    Finds every "adaptech-fandi-dbentry" <div> in *tree* and appends one
    CSV row per entry to entrys.csv:
    [category, title, publisher, description, download URL, info URL,
     OS requirements, licence category, interface languages].
    Entries without a title are skipped.
    '''
    entries = tree.xpath(".//div[contains(@class, 'adaptech-fandi-dbentry')]")
    # ``with`` guarantees the CSV file is flushed and closed -- the old code
    # leaked the handle returned by open().
    with open('entrys.csv', 'a+') as csvfile:
        entryWriter = UnicodeWriter(csvfile, delimiter=',', quotechar='"')
        for entry in entries:
            eTitle = get_title(entry)
            if eTitle == '':
                continue
            ePub = get_data(entry, 'Manufacturer:')
            eDetail = get_data(entry, 'Description:')
            eURLDL = get_data(entry, 'Download URL:', True)
            eURLInfo = get_data(entry, 'Website:', True)
            eOS = get_data(entry, 'System Requirements:')
            eCat = rename_type(get_data(entry, 'Type:'))
            eLang = get_data(entry, 'Interface Language(s):')
            entryWriter.writerow([type, eTitle, ePub, eDetail, eURLDL,
                                  eURLInfo, eOS, eCat, eLang])
def main():
    '''Fetch each category page in ``urls`` and append its entries to the CSV.'''
    htmlparser = etree.HTMLParser()  # one parser instance is enough
    for page, url in urls.iteritems():
        content = get_content(url)
        if content is None:
            continue  # fetch failed; already reported by get_content
        tree = etree.parse(content, htmlparser)
        parse_adaptech(tree, page)

if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Contact: Will Wade <will@e-wade.net> | |
# Date: Jan 2012 | |
# needs to add file type & filename to spreadsheet | |
# Do we need to also collect 302 redirects? | |
# -*- coding: iso-8859-15 -*- | |
import httplib
import os
import shutil
from os.path import basename
from urllib2 import Request, urlopen, URLError, HTTPError
from urlparse import urlparse
from urlparse import urlsplit

from unicodecsv import UnicodeWriter
from unicodecsv import UnicodeReader
def make_dirs(path):
    '''
    Create *path* (and any missing parents) if it does not exist, then
    return *path* unchanged so calls can be inlined.

    Uses try/except rather than an exists() pre-check to avoid the race
    where the directory appears between the check and makedirs(); an
    OSError is only re-raised if the path still isn't a directory.
    '''
    try:
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise
    return path
def url2name(url):
    '''Return the final path component of *url* (query string excluded).'''
    parts = urlsplit(url)
    return basename(parts.path)
def download(url, dir='.', save=False, localFileName = None): | |
''' | |
Tries to download the url passed. | |
Returns the content of the page if successful or else None. | |
''' | |
localName = url2name(url) | |
req = Request(url) | |
try: | |
r = urlopen(req) | |
except URLError, e: | |
if hasattr(e, 'reason'): | |
print 'We failed to reach a server.' | |
print 'Reason: ', e.reason | |
elif hasattr(e, 'code'): | |
print 'The server couldn\'t fulfill the request.' | |
print 'Error code: ', e.code | |
return False, '', '', 0 | |
else: | |
# everything is fine | |
fileurl = r.geturl() | |
if r.info().has_key('Content-Length'): | |
file_size = r.info()['Content-Length'] | |
else: | |
file_size = 0 | |
if r.info().has_key('Content-Disposition'): | |
# If the response has Content-Disposition, we take file name from it | |
localName = r.info()['Content-Disposition'].split('filename=')[1] | |
if localName[0] == '"' or localName[0] == "'": | |
localName = localName[1:-1] | |
elif r.url != url: | |
# if we were redirected, the real file name we take from the final URL | |
localName = url2name(r.url) | |
if localFileName: | |
# we can force to save the file as specified name | |
localName = localFileName | |
if save: | |
pth = make_dirs(dir) | |
f = open(pth + '/' + localName, 'wb') | |
try: | |
with open(pth + '/' + localName, 'wb') as f: | |
shutil.copyfileobj(r,f) | |
finally: | |
r.close() | |
return True, fileurl, localName, file_size | |
def pretty_contenttype(rawstring):
    '''
    Classify a raw content-type / file-name string into a short download
    type label ('zip', 'exe', 'msi', 'sit', 'xpi', 'hqx', 'dmg'), or 'BAD'
    when nothing matches.  The first label whose token appears in
    *rawstring* wins, preserving the original elif priority order.
    '''
    # Ordered (label, substrings) table replaces the count()>0 elif chain;
    # matching is substring-based and case-sensitive, exactly as before.
    checks = [
        ('zip', ('zip', 'ZIP')),
        ('exe', ('exe', 'msdos', 'Executables', 'msdownload')),
        ('msi', ('msi',)),
        ('sit', ('sit',)),
        ('xpi', ('xpi',)),
        ('hqx', ('mac-binhex', 'hqx')),
        ('dmg', ('diskimage', 'dmg')),
    ]
    for label, tokens in checks:
        if any(token in rawstring for token in tokens):
            return label
    return 'BAD'
def main(): | |
with open('new.csv', 'wb') as w: | |
writer = UnicodeWriter(w) | |
with open('catsNew01.csv', 'rb') as f: | |
rows = UnicodeReader(f) | |
for row in rows: | |
# Row 5 is the download one | |
if row[5].count('http://') > 0: | |
print row[1] | |
print ' URL listed:' + row[5] | |
success, url, name, size = download(row[5], row[1]) | |
#url, content = get_contenttype(row[5]) | |
if success is True: | |
# write a row | |
print ' URL got:' + url | |
print ' we think this is type:' + pretty_contenttype(name) | |
print ' we think its so big:' + str(size) | |
print ' its name is:' +name | |
row[10] = str(1) | |
row[11] = url | |
row[12] = pretty_contenttype(name) | |
row[13] = name | |
row[14] = str(size) | |
writer.writerow(row) | |
else: | |
# write a row marking it as not includable | |
row[10] = str(0) | |
writer.writerow(row) | |
else: | |
row[10] = str(0) | |
writer.writerow(row) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment