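# Scrapes product data from safety data sheet CSV exports passed on the
# command line and loads the extracted fields into an Access database
# (table and column names below match the EvansDB.accdb schema).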
import sys
import re
import csv
import pyodbc
# Connection details for the Access database (raw string so the backslashes
# in the Windows path are not treated as escape sequences).
MDB = r'E:\Design Work\Evans Vanodine\Database\EvansDB.accdb'
DRV = '{Microsoft Access Driver (*.mdb, *.accdb)}'
PWD = ''

con = pyodbc.connect('DRIVER={};DBQ={};PWD={}'.format(DRV, MDB, PWD))
cur = con.cursor()
# --- Scraper script starts here --- #
# Single-line fields: group 1 is the field name, group 2 is its value on the
# same line (quotes and padding commas from the CSV export fall outside the capture).
singleLinePatterns = [
    r'(Product name)[,"]+(.*?)"*,{2,}',
    r'(Product number)[,]+(.*?),',
    r'(Internal identification)[,]+(.*?),',
    r'(Identified uses)[,"]+(.*?)"*,{2,}',
    r'(Signal word)[,]+(.*?),',
    r'(Contains)[,"]+(.*?)"*,{2,}'
]
singleLineMatchers = [re.compile(p) for p in singleLinePatterns]
# Multi-line sections: matching one of these headings starts collecting the
# lines that follow it.
multiLinePatterns = [
    r'Physical hazards',
    r'Health hazards',
    r'Environmental hazards',
    r'Hazard statements',
    r'Precautionary statements',
    r'Appearance',
    r'Colour',
    r'Odour(?!less)',
    r'Supplemental label information'
]
multiLineMatchers = [re.compile(p) for p in multiLinePatterns]
# Headings that terminate whichever multi-line section is currently open.
multiLineEndPatterns = [
    r'Classification \(67/548/EEC or 1999/45/EC\)',
    r'Contains,',
    r'2.3. Other hazards,',
    r'pH[,{2,}]',
    r'Melting point'
]
multiLineEndMatchers = [re.compile(p) for p in multiLineEndPatterns]
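# Parsing strategy: single-line matchers capture a key/value pair in one go;
# a multi-line matcher opens a section and the lines that follow are appended
# to it until an end matcher (or SECTION 10) closes it.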
data = {}
currentKey = None

for filename in sys.argv[1:]:
    for line in open(filename, 'r'):
        # Nothing of interest beyond section 10 of the sheet.
        if re.match('SECTION 10', line):
            break
        matched = False
        # An end heading closes the multi-line section currently being collected.
        for m in multiLineEndMatchers:
            if m.match(line):
                currentKey = None
        # A section heading starts a new multi-line field.
        for m in multiLineMatchers:
            match = m.match(line)
            if match:
                matched = True
                currentKey = match.group(0)
                if currentKey not in data:
                    data[currentKey] = []
        if matched: continue
        # Single-line fields are captured as (key, value) pairs directly.
        for m in singleLineMatchers:
            match = m.match(line)
            if match:
                matched = True
                (key, value) = match.groups()
                if key not in data:
                    data[key] = []
                data[key].append(value)
                currentKey = None
        if matched: continue
        # Otherwise the line belongs to whichever multi-line section is open.
        if currentKey is not None:
            data[currentKey].append(line.strip('\n,'))
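# Fields that may be absent from a sheet get placeholder values so the
# database inserts below do not fail on a missing key.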
if 'Signal word' not in data:
data['Signal word'] = [None]
if 'Contains' not in data:
data['Contains'] = [None]
if 'Internal identification' not in data:
data['Internal identification'] = [None]
if 'Identified uses' not in data:
data['Identified uses'] = [None]
if 'Appearance' not in data:
data['Appearance'] = ['????']
if 'Colour' not in data:
data['Colour'] = ['????']
if 'Odour' not in data:
data['Odour'] = ['????']
print "---"
for i, val in data.iteritems():
print i, val
print "---"
# --- Scraper script ends here --- #
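# Write the scraped record to the Access tables: one parent Product row,
# then the related classification, properties and statement rows.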
cur.execute("INSERT INTO [Product] ([Product Number], [Product Name], [Internal ID], [Identified Uses]) VALUES (?, ?, ?, ?)",
data['Product number'][0], data['Product name'][0], data['Internal identification'][0], data['Identified uses'][0])
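# SELECT @@IDENTITY returns the autonumber ID generated for the Product row
# just inserted; the cursor yields a single row, which is unpacked here.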
[fkey] = cur.execute("SELECT @@IDENTITY")
cur.execute("INSERT INTO [Product Classification] ([Product ID], [Signal Word], [Physical Hazards], [Health Hazards], [Contains]) VALUES (?, ?, ?, ?, ?)",
fkey[0], data['Signal word'][0], data['Physical hazards'][0], data['Health hazards'][0], data['Contains'][0])
cur.execute("INSERT INTO [Product Properties] ([Product ID], [Appearance], [Colour], [Odour]) VALUES (?, ?, ?, ?)",
fkey[0], data['Appearance'][0], data['Colour'][0], data['Odour'][0])
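# Hazard and precautionary statements are one-to-many, so each collected
# statement becomes its own row in the child tables.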
if 'Hazard statements' in data:
    for val in data['Hazard statements']:
        cur.execute("INSERT INTO [Product H] ([Product ID], [Hazard Statement]) VALUES (?, ?)", fkey[0], val)
if 'Precautionary statements' in data:
    for val in data['Precautionary statements']:
        cur.execute("INSERT INTO [Product P] ([Product ID], [Precautionary Statement]) VALUES (?, ?)", fkey[0], val)
con.commit()
cur.close()
con.close()
print "COMPLETE"