Skip to content

Instantly share code, notes, and snippets.

@willwade
Created March 11, 2012 07:49
Show Gist options
  • Save willwade/2015472 to your computer and use it in GitHub Desktop.
Collection of AT Software - Parse a bunch of URLs for Software to import
#!/usr/bin/env python
# Need unicodecsv seen here http://docs.python.org/library/csv.html?highlight=csv#csv
# Date: Jan 2012
# -*- coding: iso-8859-15 -*-
from urllib2 import urlopen
from lxml import etree
import csv
from unicodecsv import UnicodeWriter
import re
# Category name -> adaptech listing URL; these are the pages main() crawls.
urls = {"keyboard help":"http://www.adaptech.org/en/downloads/fandi/cat/50",
"mouse help":"http://www.adaptech.org/en/downloads/fandi/cat/60"}
# Full set of category pages.  NOTE(review): main() iterates `urls`, not
# `aurls`, so only the two categories above are fetched -- confirm whether
# `aurls` was meant to replace `urls`.
aurls = {"writing":"http://www.adaptech.org/en/downloads/fandi/cat/10",
"visual":"http://www.adaptech.org/en/downloads/fandi/cat/20",
"VI":"http://www.adaptech.org/en/downloads/fandi/cat/30",
"dictation":"http://www.adaptech.org/en/downloads/fandi/cat/40",
"keyboard help":"http://www.adaptech.org/en/downloads/fandi/cat/50",
"mouse help":"http://www.adaptech.org/en/downloads/fandi/cat/60",
"misc":"http://www.adaptech.org/en/downloads/fandi/cat/1000"}
def get_content(url):
'''
Tries to open the url passed.
Returns the content of the page if successful or else None.
'''
try:
content = urlopen(url, timeout=20)
return content
except:
print "Error Opening: ", url
return None
def get_title(entry):
    '''
    Return the text of the <h3> child of entry, or "" when there is none.
    If several <h3> children exist the last one wins (unchanged from the
    original behaviour).
    '''
    title = ""
    # Iterating the element directly yields its direct children; this
    # replaces the deprecated getchildren() call with the supported idiom.
    for child in entry:
        if child.tag == 'h3':
            title = child.text
    return title
def get_data(entry, name='Manufacturer:', atag=False):
    '''
    Return the value that follows a labelled <div> inside entry.

    Scans the direct children of entry for a <div> whose text equals
    name; the value is taken from the sibling element immediately after
    it (lxml getnext()).  With atag=True the value is the href of an <a>
    inside that sibling instead of its text.  Returns "" when the label
    is not found.
    '''
    data = ""
    # Direct iteration replaces the deprecated getchildren() call.
    for child in entry:
        if child.tag == 'div' and child.text == name:
            next_elem = child.getnext()
            if atag:
                for sub in next_elem:
                    if sub.tag == 'a':
                        data = sub.get('href')
            else:
                data = next_elem.text
    return data
def rename_type(type):
    '''
    Remap the adaptech licence label onto our two buckets:
    "Inexpensive"/"Trial" -> Commercial, "Free" -> Freeware,
    anything else -> unknown.
    '''
    if re.search(r"Inexpensive|Trial", type):
        return 'Commercial'
    if re.search(r"Free", type):
        return 'Freeware'
    return 'unknown'
def rename_ostype():
    '''
    Placeholder: meant to split a combined OS string such as
    "Windows 2000 /2003/XP/Vista/7" into separate components.
    Not implemented; always returns None.
    '''
    return None
def parse_adaptech(tree, type='Unsure'):
    '''
    Parses the adaptech pages.

    Extracts every "adaptech-fandi-dbentry" div from the parsed tree and
    appends one CSV row per titled entry to entrys.csv.  `type` is the
    category label written as the first column.
    '''
    entries = tree.xpath(".//div[contains(@class, 'adaptech-fandi-dbentry')]")
    # Context-manage the output file: the original passed open() straight
    # to UnicodeWriter and never closed the handle.
    with open('entrys.csv', 'a+') as csvfile:
        entryWriter = UnicodeWriter(csvfile, delimiter=',', quotechar='"')
        for entry in entries:
            eTitle = get_title(entry)
            if eTitle != '':
                ePub = get_data(entry, 'Manufacturer:')
                eDetail = get_data(entry, 'Description:')
                eURLDL = get_data(entry, 'Download URL:', True)
                eURLInfo = get_data(entry, 'Website:', True)
                eOS = get_data(entry, 'System Requirements:')
                eCat = rename_type(get_data(entry, 'Type:'))
                eLang = get_data(entry, 'Interface Language(s):')
                entryWriter.writerow([type,eTitle,ePub,eDetail,eURLDL,eURLInfo,eOS,eCat,eLang])
def main():
    '''
    Fetch each category page listed in `urls` and feed it through
    parse_adaptech; pages that fail to open are skipped.
    '''
    for page in urls:
        content = get_content(urls[page])
        if content is None:
            continue
        htmlparser = etree.HTMLParser()
        tree = etree.parse(content, htmlparser)
        parse_adaptech(tree, page)


if __name__ == "__main__":
    main()
#!/usr/bin/env python
# Contact: Will Wade <will@e-wade.net>
# Date: Jan 2012
# needs to add file type & filename to spreadsheet
# Do we need to also collect 302 redirects?
# -*- coding: iso-8859-15 -*-
import httplib
import os
import shutil
from os.path import basename
from urllib2 import Request, urlopen, URLError, HTTPError
from urlparse import urlparse
from urlparse import urlsplit

from unicodecsv import UnicodeReader
from unicodecsv import UnicodeWriter
def make_dirs(path):
    '''
    Ensure `path` exists as a directory (creating parents as needed) and
    return it unchanged so callers can use it inline.

    Uses EAFP instead of exists()-then-makedirs(): the original pattern
    raced when another process created the directory between the two
    calls.  Requires `import os` at module level.
    '''
    try:
        os.makedirs(path)
    except OSError:
        # Already exists is fine; any other failure (permissions, a file
        # in the way) is re-raised.
        if not os.path.isdir(path):
            raise
    return path
def url2name(url):
    '''Return the last path component of `url` (may be "" for a bare host).'''
    path = urlsplit(url)[2]
    return basename(path)
def download(url, dir='.', save=False, localFileName = None):
'''
Tries to download the url passed.
Returns the content of the page if successful or else None.
'''
localName = url2name(url)
req = Request(url)
try:
r = urlopen(req)
except URLError, e:
if hasattr(e, 'reason'):
print 'We failed to reach a server.'
print 'Reason: ', e.reason
elif hasattr(e, 'code'):
print 'The server couldn\'t fulfill the request.'
print 'Error code: ', e.code
return False, '', '', 0
else:
# everything is fine
fileurl = r.geturl()
if r.info().has_key('Content-Length'):
file_size = r.info()['Content-Length']
else:
file_size = 0
if r.info().has_key('Content-Disposition'):
# If the response has Content-Disposition, we take file name from it
localName = r.info()['Content-Disposition'].split('filename=')[1]
if localName[0] == '"' or localName[0] == "'":
localName = localName[1:-1]
elif r.url != url:
# if we were redirected, the real file name we take from the final URL
localName = url2name(r.url)
if localFileName:
# we can force to save the file as specified name
localName = localFileName
if save:
pth = make_dirs(dir)
f = open(pth + '/' + localName, 'wb')
try:
with open(pth + '/' + localName, 'wb') as f:
shutil.copyfileobj(r,f)
finally:
r.close()
return True, fileurl, localName, file_size
def pretty_contenttype(rawstring):
    '''
    Map a raw content-type / file-name string onto a short file-type
    label ('zip', 'exe', 'msi', 'sit', 'xpi', 'hqx', 'dmg').
    Returns 'BAD' when no known keyword appears.
    '''
    # Ordered table preserves the original first-match-wins priority;
    # substring tests via `in` replace the unidiomatic count(...) > 0.
    keyword_map = [
        (('zip', 'ZIP'), 'zip'),
        (('exe', 'msdos', 'Executables', 'msdownload'), 'exe'),
        (('msi',), 'msi'),
        (('sit',), 'sit'),
        (('xpi',), 'xpi'),
        (('mac-binhex', 'hqx'), 'hqx'),
        (('diskimage', 'dmg'), 'dmg'),
    ]
    for keywords, label in keyword_map:
        if any(k in rawstring for k in keywords):
            return label
    return 'BAD'
def main():
    '''
    Re-check every download URL in catsNew01.csv and write new.csv.

    For each input row whose column 5 contains an http:// URL the link
    is fetched via download(); on success columns 10-14 are overwritten
    with (include flag "1", final URL, guessed file type, file name,
    size) before the row is copied out.  Rows without a URL, or whose
    fetch failed, get column 10 set to "0".
    '''
    with open('new.csv', 'wb') as w:
        writer = UnicodeWriter(w)
        with open('catsNew01.csv', 'rb') as f:
            rows = UnicodeReader(f)
            for row in rows:
                # Row 5 is the download one
                if row[5].count('http://') > 0:
                    print row[1]
                    print ' URL listed:' + row[5]
                    # NOTE(review): called positionally as (url, dir), so
                    # row[1] becomes the save directory -- but save
                    # defaults to False, so nothing is written to disk;
                    # confirm that is intended.
                    success, url, name, size = download(row[5], row[1])
                    #url, content = get_contenttype(row[5])
                    if success is True:
                        # write a row
                        print ' URL got:' + url
                        print ' we think this is type:' + pretty_contenttype(name)
                        print ' we think its so big:' + str(size)
                        print ' its name is:' +name
                        # Columns 10-14 are overwritten in place; assumes
                        # every input row has at least 15 columns -- TODO
                        # confirm against catsNew01.csv.
                        row[10] = str(1)
                        row[11] = url
                        row[12] = pretty_contenttype(name)
                        row[13] = name
                        row[14] = str(size)
                        writer.writerow(row)
                    else:
                        # write a row marking it as not includable
                        row[10] = str(0)
                        writer.writerow(row)
                else:
                    # No URL at all: copy the row through, flagged 0.
                    row[10] = str(0)
                    writer.writerow(row)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment