Created
March 11, 2012 07:49
-
-
Save willwade/2015472 to your computer and use it in GitHub Desktop.
Collection of AT Software - Parse a bunch of URLs for Software to import
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Need unicodecsv seen here http://docs.python.org/library/csv.html?highlight=csv#csv | |
# Date: Jan 2012 | |
# -*- coding: iso-8859-15 -*- | |
from urllib2 import urlopen | |
from lxml import etree | |
import csv | |
from unicodecsv import UnicodeWriter | |
import re | |
# Category pages scraped on this run (a subset of ``aurls``).
urls = {
    "keyboard help": "http://www.adaptech.org/en/downloads/fandi/cat/50",
    "mouse help": "http://www.adaptech.org/en/downloads/fandi/cat/60",
}

# Full map of Adaptech "Free and Inexpensive" category pages, keyed by
# assistive-technology area.
aurls = {
    "writing": "http://www.adaptech.org/en/downloads/fandi/cat/10",
    "visual": "http://www.adaptech.org/en/downloads/fandi/cat/20",
    "VI": "http://www.adaptech.org/en/downloads/fandi/cat/30",
    "dictation": "http://www.adaptech.org/en/downloads/fandi/cat/40",
    "keyboard help": "http://www.adaptech.org/en/downloads/fandi/cat/50",
    "mouse help": "http://www.adaptech.org/en/downloads/fandi/cat/60",
    "misc": "http://www.adaptech.org/en/downloads/fandi/cat/1000",
}
def get_content(url): | |
''' | |
Tries to open the url passed. | |
Returns the content of the page if successful or else None. | |
''' | |
try: | |
content = urlopen(url, timeout=20) | |
return content | |
except: | |
print "Error Opening: ", url | |
return None | |
def get_title(entry):
    '''
    Return the text of the <h3> child of *entry*, or '' when no <h3>
    child exists.  If several <h3> children exist the last one wins,
    matching the original scan order.
    '''
    heading = ""
    for node in entry.getchildren():
        if node.tag == 'h3':
            heading = node.text
    return heading
def get_data(entry, name='Manufacturer:', atag=False):
    '''
    Look through the children of *entry* for a <div> whose text equals
    *name* and pull the value from the element that follows it: the
    sibling's text, or, when *atag* is True, the href of an <a> found
    inside that sibling.  Returns '' when no label matches.  As in the
    original scan, a later match overwrites an earlier one.
    '''
    value = ""
    for node in entry.getchildren():
        if node.tag != 'div' or node.text != name:
            continue
        sibling = node.getnext()
        if not atag:
            value = sibling.text
        else:
            for sub in sibling.getchildren():
                if sub.tag == 'a':
                    value = sub.get('href')
    return value
def rename_type(type):
    '''
    Map an Adaptech licence label onto our categories: anything marked
    "Inexpensive" or "Trial" counts as Commercial, anything marked "Free"
    as Freeware, and everything else is unknown.

    NOTE: the parameter shadows the builtin ``type``; the name is kept for
    interface compatibility.
    '''
    if re.search("Inexpensive|Trial", type):
        return 'Commercial'
    if re.search("Free", type):
        return 'Freeware'
    return 'unknown'
def rename_ostype():
    '''
    Placeholder: split an OS string on slashes into separate components,
    e.g. "Windows 2000 /2003/XP/Vista/7".  Not implemented yet; always
    returns None.
    '''
    return None
def parse_adaptech(tree, type='Unsure'):
    '''
    Parses the adaptech pages.

    Finds every "adaptech-fandi-dbentry" <div> in *tree* and appends one
    CSV row per entry to entrys.csv:
    [category, title, publisher, description, download URL, info URL,
     OS requirements, licence category, interface languages].
    Entries without a title are skipped.
    '''
    entries = tree.xpath(".//div[contains(@class, 'adaptech-fandi-dbentry')]")
    # ``with`` guarantees the CSV file is flushed and closed -- the old code
    # leaked the handle returned by open().
    with open('entrys.csv', 'a+') as csvfile:
        entryWriter = UnicodeWriter(csvfile, delimiter=',', quotechar='"')
        for entry in entries:
            eTitle = get_title(entry)
            if eTitle == '':
                continue
            ePub = get_data(entry, 'Manufacturer:')
            eDetail = get_data(entry, 'Description:')
            eURLDL = get_data(entry, 'Download URL:', True)
            eURLInfo = get_data(entry, 'Website:', True)
            eOS = get_data(entry, 'System Requirements:')
            eCat = rename_type(get_data(entry, 'Type:'))
            eLang = get_data(entry, 'Interface Language(s):')
            entryWriter.writerow([type, eTitle, ePub, eDetail, eURLDL,
                                  eURLInfo, eOS, eCat, eLang])
def main():
    '''Fetch each category page in ``urls`` and append its entries to the CSV.'''
    htmlparser = etree.HTMLParser()  # one parser instance is enough
    for page, url in urls.iteritems():
        content = get_content(url)
        if content is None:
            continue  # fetch failed; already reported by get_content
        tree = etree.parse(content, htmlparser)
        parse_adaptech(tree, page)

if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Contact: Will Wade <will@e-wade.net> | |
# Date: Jan 2012 | |
# needs to add file type & filename to spreadsheet | |
# Do we need to also collect 302 redirects? | |
# -*- coding: iso-8859-15 -*- | |
import httplib
import os
import shutil
from os.path import basename
from urllib2 import Request, urlopen, URLError, HTTPError
from urlparse import urlparse
from urlparse import urlsplit

from unicodecsv import UnicodeWriter
from unicodecsv import UnicodeReader
def make_dirs(path):
    '''
    Create *path* (and any missing parents) if it does not exist, then
    return *path* unchanged so calls can be inlined.

    Uses try/except rather than an exists() pre-check to avoid the race
    where the directory appears between the check and makedirs(); an
    OSError is only re-raised if the path still isn't a directory.
    '''
    try:
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise
    return path
def url2name(url):
    '''Return the final path component of *url* (query string excluded).'''
    parts = urlsplit(url)
    return basename(parts.path)
def download(url, dir='.', save=False, localFileName = None): | |
''' | |
Tries to download the url passed. | |
Returns the content of the page if successful or else None. | |
''' | |
localName = url2name(url) | |
req = Request(url) | |
try: | |
r = urlopen(req) | |
except URLError, e: | |
if hasattr(e, 'reason'): | |
print 'We failed to reach a server.' | |
print 'Reason: ', e.reason | |
elif hasattr(e, 'code'): | |
print 'The server couldn\'t fulfill the request.' | |
print 'Error code: ', e.code | |
return False, '', '', 0 | |
else: | |
# everything is fine | |
fileurl = r.geturl() | |
if r.info().has_key('Content-Length'): | |
file_size = r.info()['Content-Length'] | |
else: | |
file_size = 0 | |
if r.info().has_key('Content-Disposition'): | |
# If the response has Content-Disposition, we take file name from it | |
localName = r.info()['Content-Disposition'].split('filename=')[1] | |
if localName[0] == '"' or localName[0] == "'": | |
localName = localName[1:-1] | |
elif r.url != url: | |
# if we were redirected, the real file name we take from the final URL | |
localName = url2name(r.url) | |
if localFileName: | |
# we can force to save the file as specified name | |
localName = localFileName | |
if save: | |
pth = make_dirs(dir) | |
f = open(pth + '/' + localName, 'wb') | |
try: | |
with open(pth + '/' + localName, 'wb') as f: | |
shutil.copyfileobj(r,f) | |
finally: | |
r.close() | |
return True, fileurl, localName, file_size | |
def pretty_contenttype(rawstring):
    '''
    Classify a raw content-type / file-name string into a short download
    type label ('zip', 'exe', 'msi', 'sit', 'xpi', 'hqx', 'dmg'), or 'BAD'
    when nothing matches.  The first label whose token appears in
    *rawstring* wins, preserving the original elif priority order.
    '''
    # Ordered (label, substrings) table replaces the count()>0 elif chain;
    # matching is substring-based and case-sensitive, exactly as before.
    checks = [
        ('zip', ('zip', 'ZIP')),
        ('exe', ('exe', 'msdos', 'Executables', 'msdownload')),
        ('msi', ('msi',)),
        ('sit', ('sit',)),
        ('xpi', ('xpi',)),
        ('hqx', ('mac-binhex', 'hqx')),
        ('dmg', ('diskimage', 'dmg')),
    ]
    for label, tokens in checks:
        if any(token in rawstring for token in tokens):
            return label
    return 'BAD'
def main(): | |
with open('new.csv', 'wb') as w: | |
writer = UnicodeWriter(w) | |
with open('catsNew01.csv', 'rb') as f: | |
rows = UnicodeReader(f) | |
for row in rows: | |
# Row 5 is the download one | |
if row[5].count('http://') > 0: | |
print row[1] | |
print ' URL listed:' + row[5] | |
success, url, name, size = download(row[5], row[1]) | |
#url, content = get_contenttype(row[5]) | |
if success is True: | |
# write a row | |
print ' URL got:' + url | |
print ' we think this is type:' + pretty_contenttype(name) | |
print ' we think its so big:' + str(size) | |
print ' its name is:' +name | |
row[10] = str(1) | |
row[11] = url | |
row[12] = pretty_contenttype(name) | |
row[13] = name | |
row[14] = str(size) | |
writer.writerow(row) | |
else: | |
# write a row marking it as not includable | |
row[10] = str(0) | |
writer.writerow(row) | |
else: | |
row[10] = str(0) | |
writer.writerow(row) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment