-
-
Save marcusdoesstuff/17cbd36b88f05b4b6d80 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import re | |
import csv | |
import pyodbc | |
MDB = 'E:\Design Work\Evans Vanodine\Database\EvansDB.accdb' | |
DRV = '{Microsoft Access Driver (*.mdb, *.accdb)}' | |
PWD = '' | |
con = pyodbc.connect('DRIVER={};DBQ={};PWD={}'.format(DRV,MDB,PWD)) | |
cur = con.cursor() | |
# --- Scraper script starts here --- # | |
singleLinePatterns = [ | |
r'(Product name)[,"]+(.*?)"*,{2,}', | |
r'(Product number)[,]+(.*?),', | |
r'(Internal identification)[,]+(.*?),', | |
r'(Identified uses)[,"]+(.*?)"*,{2,}', | |
r'(Signal word)[,]+(.*?),', | |
r'(Contains)[,"]+(.*?)"*,{2,}' | |
] | |
singleLineMatchers = [re.compile(p) for p in singleLinePatterns] | |
multiLinePatterns = [ | |
r'Physical hazards', | |
r'Health hazards', | |
r'Environmental hazards', | |
r'Hazard statements', | |
r'Precautionary statements', | |
r'Appearance', | |
r'Colour', | |
r'Odour(?!less)', | |
r'Supplemental label information' | |
] | |
multiLineMatchers = [re.compile(p) for p in multiLinePatterns] | |
multiLineEndPatterns = [ | |
r'Classification \(67/548/EEC or 1999/45/EC\)', | |
r'Contains,', | |
r'2.3. Other hazards,', | |
r'pH[,{2,}]', | |
r'Melting point' | |
] | |
multiLineEndMatchers = [re.compile(p) for p in multiLineEndPatterns] | |
data = {} | |
currentKey = None | |
for filename in sys.argv[1:]: | |
for line in open(filename, 'r'): | |
if re.match('SECTION 10', line): | |
break | |
matched = False | |
for m in multiLineEndMatchers: | |
if m.match(line): | |
currentKey = None | |
for m in multiLineMatchers: | |
match = m.match(line) | |
if match: | |
matched = True | |
currentKey = match.group(0) | |
if currentKey not in data: | |
data[currentKey] = [] | |
if matched: continue | |
for m in singleLineMatchers: | |
match = m.match(line) | |
if match: | |
matched = True | |
(key,value) = match.groups() | |
if key not in data: | |
data[key] = [] | |
data[key].append(value) | |
currentKey = None | |
if matched: continue | |
if currentKey is not None: | |
data[currentKey].append(line.strip('\n,')) | |
if 'Signal word' not in data: | |
data['Signal word'] = [None] | |
if 'Contains' not in data: | |
data['Contains'] = [None] | |
if 'Internal identification' not in data: | |
data['Internal identification'] = [None] | |
if 'Identified uses' not in data: | |
data['Identified uses'] = [None] | |
if 'Appearance' not in data: | |
data['Appearance'] = ['????'] | |
if 'Colour' not in data: | |
data['Colour'] = ['????'] | |
if 'Odour' not in data: | |
data['Odour'] = ['????'] | |
print "---" | |
for i, val in data.iteritems(): | |
print i, val | |
print "---" | |
# --- Scraper script ends here --- # | |
cur.execute("INSERT INTO [Product] ([Product Number], [Product Name], [Internal ID], [Identified Uses]) VALUES (?, ?, ?, ?)", | |
data['Product number'][0], data['Product name'][0], data['Internal identification'][0], data['Identified uses'][0]) | |
[fkey] = cur.execute("SELECT @@IDENTITY") | |
cur.execute("INSERT INTO [Product Classification] ([Product ID], [Signal Word], [Physical Hazards], [Health Hazards], [Contains]) VALUES (?, ?, ?, ?, ?)", | |
fkey[0], data['Signal word'][0], data['Physical hazards'][0], data['Health hazards'][0], data['Contains'][0]) | |
cur.execute("INSERT INTO [Product Properties] ([Product ID], [Appearance], [Colour], [Odour]) VALUES (?, ?, ?, ?)", | |
fkey[0], data['Appearance'][0], data['Colour'][0], data['Odour'][0]) | |
if 'Hazard statements' in data: | |
for val in data['Hazard statements']: | |
cur.execute("INSERT INTO [Product H] ([Product ID], [Hazard Statement]) VALUES (?, ?)", fkey[0], val) | |
if 'Precautionary statements' in data: | |
for val in data['Precautionary statements']: | |
cur.execute("INSERT INTO [Product P] ([Product ID], [Precautionary Statement]) VALUES (?, ?)", fkey[0], val) | |
con.commit() | |
cur.close() | |
con.close() | |
print "COMPLETE" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment