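# Scrapes product data from safety data sheet CSV exports passed on the
# command line and loads the extracted fields into an Access database
# (table and column names below match the EvansDB.accdb schema).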
import sys
import re
import csv
import pyodbc
# Connection details for the Access database (raw string so the backslashes
# in the Windows path are not treated as escape sequences).
MDB = r'E:\Design Work\Evans Vanodine\Database\EvansDB.accdb'
DRV = '{Microsoft Access Driver (*.mdb, *.accdb)}'
PWD = ''

con = pyodbc.connect('DRIVER={};DBQ={};PWD={}'.format(DRV, MDB, PWD))
cur = con.cursor()
# --- Scraper script starts here --- #
# Single-line fields: group 1 is the field name, group 2 is its value on the
# same line (quotes and padding commas from the CSV export fall outside the capture).
singleLinePatterns = [
    r'(Product name)[,"]+(.*?)"*,{2,}',
    r'(Product number)[,]+(.*?),',
    r'(Internal identification)[,]+(.*?),',
    r'(Identified uses)[,"]+(.*?)"*,{2,}',
    r'(Signal word)[,]+(.*?),',
    r'(Contains)[,"]+(.*?)"*,{2,}'
]
singleLineMatchers = [re.compile(p) for p in singleLinePatterns]
# Multi-line sections: matching one of these headings starts collecting the
# lines that follow it.
multiLinePatterns = [
    r'Physical hazards',
    r'Health hazards',
    r'Environmental hazards',
    r'Hazard statements',
    r'Precautionary statements',
    r'Appearance',
    r'Colour',
    r'Odour(?!less)',
    r'Supplemental label information'
]
multiLineMatchers = [re.compile(p) for p in multiLinePatterns]
# Headings that terminate whichever multi-line section is currently open.
multiLineEndPatterns = [
    r'Classification \(67/548/EEC or 1999/45/EC\)',
    r'Contains,',
    r'2.3. Other hazards,',
    r'pH[,{2,}]',
    r'Melting point'
]
multiLineEndMatchers = [re.compile(p) for p in multiLineEndPatterns]
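# Parsing strategy: single-line matchers capture a key/value pair in one go;
# a multi-line matcher opens a section and the lines that follow are appended
# to it until an end matcher (or SECTION 10) closes it.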
data = {}
currentKey = None

for filename in sys.argv[1:]:
    for line in open(filename, 'r'):
        # Nothing of interest beyond section 10 of the sheet.
        if re.match('SECTION 10', line):
            break
        matched = False
        # An end heading closes the multi-line section currently being collected.
        for m in multiLineEndMatchers:
            if m.match(line):
                currentKey = None
        # A section heading starts a new multi-line field.
        for m in multiLineMatchers:
            match = m.match(line)
            if match:
                matched = True
                currentKey = match.group(0)
                if currentKey not in data:
                    data[currentKey] = []
        if matched: continue
        # Single-line fields are captured as (key, value) pairs directly.
        for m in singleLineMatchers:
            match = m.match(line)
            if match:
                matched = True
                (key, value) = match.groups()
                if key not in data:
                    data[key] = []
                data[key].append(value)
                currentKey = None
        if matched: continue
        # Otherwise the line belongs to whichever multi-line section is open.
        if currentKey is not None:
            data[currentKey].append(line.strip('\n,'))
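# Fields that may be absent from a sheet get placeholder values so the
# database inserts below do not fail on a missing key.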
if 'Signal word' not in data:
data['Signal word'] = [None]
if 'Contains' not in data:
data['Contains'] = [None]
if 'Internal identification' not in data:
data['Internal identification'] = [None]
if 'Identified uses' not in data:
data['Identified uses'] = [None]
if 'Appearance' not in data:
data['Appearance'] = ['????']
if 'Colour' not in data:
data['Colour'] = ['????']
if 'Odour' not in data:
data['Odour'] = ['????']
print "---"
for i, val in data.iteritems():
print i, val
print "---"
# --- Scraper script ends here --- #
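# Write the scraped record to the Access tables: one parent Product row,
# then the related classification, properties and statement rows.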
cur.execute("INSERT INTO [Product] ([Product Number], [Product Name], [Internal ID], [Identified Uses]) VALUES (?, ?, ?, ?)",
data['Product number'][0], data['Product name'][0], data['Internal identification'][0], data['Identified uses'][0])
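# SELECT @@IDENTITY returns the autonumber ID generated for the Product row
# just inserted; the cursor yields a single row, which is unpacked here.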
[fkey] = cur.execute("SELECT @@IDENTITY")
cur.execute("INSERT INTO [Product Classification] ([Product ID], [Signal Word], [Physical Hazards], [Health Hazards], [Contains]) VALUES (?, ?, ?, ?, ?)",
fkey[0], data['Signal word'][0], data['Physical hazards'][0], data['Health hazards'][0], data['Contains'][0])
cur.execute("INSERT INTO [Product Properties] ([Product ID], [Appearance], [Colour], [Odour]) VALUES (?, ?, ?, ?)",
fkey[0], data['Appearance'][0], data['Colour'][0], data['Odour'][0])
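# Hazard and precautionary statements are one-to-many, so each collected
# statement becomes its own row in the child tables.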
if 'Hazard statements' in data:
    for val in data['Hazard statements']:
        cur.execute("INSERT INTO [Product H] ([Product ID], [Hazard Statement]) VALUES (?, ?)", fkey[0], val)
if 'Precautionary statements' in data:
    for val in data['Precautionary statements']:
        cur.execute("INSERT INTO [Product P] ([Product ID], [Precautionary Statement]) VALUES (?, ?)", fkey[0], val)
con.commit()
cur.close()
con.close()
print "COMPLETE"