Skip to content

Instantly share code, notes, and snippets.

@elijahbenizzy
Created July 5, 2024 21:25
Show Gist options
  • Save elijahbenizzy/b6ff03f7c0964524b7bbb7791d0fdc1d to your computer and use it in GitHub Desktop.
Save elijahbenizzy/b6ff03f7c0964524b7bbb7791d0fdc1d to your computer and use it in GitHub Desktop.
# python 3.12
"""
Hamilton demo.
"""
import sys
import pprint
from hamilton import driver
import toyscriptiii as ts
# Build a Hamilton driver from the functions defined in the toy module.
dr = driver.Builder().with_modules(ts).build()

# Draw every function known to the driver; the picture is written to ts.png.
# NOTE(review): 'BR' is not one of the usual Graphviz rank directions
# (TB, LR, BT, RL) -- confirm which orientation was intended.
dr.display_all_functions(
    "ts.png",
    deduplicate_inputs=True,
    keep_dot=True,
    orient='BR',
)

# Names of the graph nodes whose values are requested from the driver.
requested_nodes = [
    'parsed_data',
    'data_with_wikipedia',
    'data_with_company',
    'info_output',
    'commodity_word_counts',
    'colloquial_company_word_counts',
    'info_dict_merged',
    'wikipedia_report',
]

# 'datafile' is the only external input; it is the csv file read by parsed_data.
results = dr.execute(requested_nodes, inputs={'datafile':'data.csv'})

# Show the merged per-mine dictionary and the names of the two report files.
pprint.pprint(results['info_dict_merged'])
print(results['info_output'])
print(results['wikipedia_report'])
The main toy module (imported above as `ts` from `toyscriptiii`), whose functions are the nodes of the Hamilton graph:
# python 3.12
"""
Toy script.
Takes some input from a csv file on big American
mines and looks at Wikipedia text for some extra
context.
"""
import copy
import csv
import pprint
import re
import sys
from urllib import request

from bs4 import BeautifulSoup
def parsed_data(datafile:str) -> dict:
    """
    Get csv data into a dictionary keyed on mine name.

    The first row of the file supplies the column headers.  Every
    later row becomes a dictionary of header -> value and is stored
    under the value found in that row's first column.

    Headers and values have surrounding whitespace stripped.  Quoted
    fields may contain commas.  Blank rows are skipped, and an empty
    file gives an empty dictionary.

    The resulting dictionary is pretty-printed before it is returned.
    """
    retval = {}
    # newline='' lets the csv module handle line endings itself.
    with open(datafile, 'r', newline='') as f:
        # skipinitialspace allows "a, b" style rows, including quoted
        # fields that follow the space after a comma.
        reader = csv.reader(f, skipinitialspace=True)
        headers = [x.strip() for x in next(reader, [])]
        for row in reader:
            vals = [x.strip() for x in row]
            # A blank line (e.g. at the end of the file) would otherwise
            # create a bogus entry keyed on the empty string.
            if not any(vals):
                continue
            retval[vals[0]] = dict(zip(headers, vals))
    pprint.pprint(retval)
    return retval
def data_with_wikipedia(parsed_data:dict) -> dict:
    """
    Connect to wikipedia sites and fill in the page text.

    For every mine in parsed_data the URL stored under
    'wikipedia page' is fetched, the html is parsed with
    BeautifulSoup and its text (newlines removed) is stored
    under the new key 'wikipediatext'.

    Returns a deep copy of parsed_data with that key added;
    the input dictionary is not modified.
    """
    retval = copy.deepcopy(parsed_data)
    for minex in retval:
        # The with-block closes the HTTP response as soon as the
        # body has been read, even if reading raises.
        with request.urlopen(retval[minex]['wikipedia page']) as obj:
            html = obj.read()
        soup = BeautifulSoup(html, 'html.parser')
        print(soup.title)
        # Text from html and strip out newlines.
        newstring = soup.get_text().replace('\n', '')
        retval[minex]['wikipediatext'] = newstring
    return retval
def data_with_company(data_with_wikipedia:dict) -> dict:
    """
    Fetches company ownership for mine out of
    Wikipedia text dump.

    The company name is taken to be the text that follows the
    first '<lower case letter>Company' in 'wikipediatext', up to
    and including the next lower case letter that is directly
    followed by an upper case letter.  If no such boundary
    exists, the rest of the text is used.

    Returns a new dictionary with the company name stored under
    'company' and without the big wikipedia text dump.  A mine
    whose text does not contain the company marker gets no
    'company' key.  The input dictionary is not modified.
    """
    # Wikipedia setup for mine company name.
    COMPANYPAT = r'[a-z]Company'
    # Lower case followed by upper case heuristic.
    ENDCOMPANYPAT = '[a-z][A-Z]'
    retval = copy.deepcopy(data_with_wikipedia)
    companypat = re.compile(COMPANYPAT)
    endcompanypat = re.compile(ENDCOMPANYPAT)
    for minex in retval:
        print(minex)
        text = retval[minex]['wikipediatext']
        match = companypat.search(text)
        if match:
            print('Company match span = ', match.span())
            # Everything after the marker; the name starts here.
            remainder = text[match.end():]
            match2 = endcompanypat.search(remainder)
            if match2:
                print('End Company match span = ', match2.span())
                # + 1 keeps the lower case letter that ends the name.
                company = remainder[:match2.start() + 1]
            else:
                # No lower/upper boundary after the marker: the name
                # runs to the end of the text.
                company = remainder
            retval[minex]['company'] = company
        # Get rid of big text dump in return value.
        retval[minex].pop('wikipediatext')
    return retval
def info_output(data_with_company:dict) -> str:
    """
    Write a two-sentence description of every mine in
    data_with_company to the text file mine_info.txt.

    Each record must supply the keys 'mine', 'commodity',
    'state' and 'company' as strings.

    Returns the name of the file that was written.
    """
    INFOLINEFMT = 'The {mine:s} mine is a big {commodity:s} mine in the State of {state:s} in the US.'
    COMPANYLINEFMT = '\n {company:s} owns the mine.\n\n'
    outfile = 'mine_info.txt'
    with open(outfile, 'w') as handle:
        for record in data_with_company.values():
            # Description first, then the ownership sentence.
            print(INFOLINEFMT.format(**record), file=handle)
            print(COMPANYLINEFMT.format(**record), file=handle)
    return outfile
def commodity_word_counts(data_with_wikipedia:dict, data_with_company:dict) -> dict:
    """
    Return dictionary keyed on mine with counts of
    commodity (e.g., zinc etc.) mentions on Wikipedia
    page (excluding ones in the company name).

    The commodity is matched as a whole word with either an
    upper case or the original first letter.  When the commodity
    also appears in the company name, the number of literal
    occurrences of the company name in the text is subtracted
    from the raw count (never going below zero).  When a mine
    has no company name, the raw count is returned.
    """
    retval = {}
    # This will probably miss some occurrences at mashed together
    # word boundaries.  It is a rough estimate.
    # '\b[Gg]old\b'
    commoditypatfmt = r'\b[{0:s}{1:s}]{2:s}\b'
    for minex in data_with_wikipedia:
        print(minex)
        commodity = data_with_wikipedia[minex]['commodity']
        text = data_with_wikipedia[minex]['wikipediatext']
        # The commodity comes from the data file, so escape it to
        # keep any regex metacharacters literal.
        commoditypat = commoditypatfmt.format(re.escape(commodity[0].upper()),
                                              re.escape(commodity[0]),
                                              re.escape(commodity[1:]))
        print(commoditypat)
        nummatchesraw = len(re.findall(commoditypat, text))
        print('Initial length of commoditymatches is {0:d}.'.format(nummatchesraw))
        # data_with_company leaves 'company' out when no company
        # name could be found in the text.
        company = data_with_company[minex].get('company')
        if not company:
            retval[minex] = nummatchesraw
            continue
        # The company name is plain text scraped from the page, not a
        # regular expression, so count literal occurrences.
        numcompanymatches = text.count(company)
        print('Length of companymatches is {0:d}.'.format(numcompanymatches))
        # Is the commodity name part of the company name?
        print('commoditypat = ', commoditypat)
        print(company)
        commoditymatchcompany = re.search(commoditypat, company)
        if commoditymatchcompany:
            print('commoditymatchcompany.span() = ', commoditymatchcompany.span())
            # A company mention at a mashed word boundary may not be in
            # the raw count, so guard against a negative result.
            retval[minex] = max(nummatchesraw - numcompanymatches, 0)
        else:
            retval[minex] = nummatchesraw
    return retval
def colloquial_company_word_counts(data_with_wikipedia:dict) -> dict:
    """
    Find the number of times the company you associate with
    the property/mine (very subjective) is within the
    text of the mine's wikipedia article.

    The name stored under 'colloquial association' is counted
    as literal text in 'wikipediatext'.  An empty name counts
    as zero occurrences.

    Returns a dictionary keyed on mine with the counts.
    """
    retval = {}
    for minex in data_with_wikipedia:
        colloquial_name = data_with_wikipedia[minex]['colloquial association']
        print(minex)
        # The name is plain text from the data file; counting it
        # literally keeps characters such as '.' or '(' from being
        # read as regular expression syntax.
        if colloquial_name:
            nummatches = data_with_wikipedia[minex]['wikipediatext'].count(colloquial_name)
        else:
            nummatches = 0
        print('{0:d} matches for colloquial association {1:s}.'.format(nummatches, colloquial_name))
        retval[minex] = nummatches
    return retval
def info_dict_merged(data_with_company:dict,
                     commodity_word_counts:dict,
                     colloquial_company_word_counts:dict) -> dict:
    """
    Combine the per-mine records with both word counts.

    Works on a deep copy of data_with_company, so the input
    (which no longer carries the big Wikipedia text dump) is
    left untouched.  Each mine's record gains the keys
    'colloquial association count' and 'commodity word count'.
    """
    merged = copy.deepcopy(data_with_company)
    for mine, record in merged.items():
        record.update({
            'colloquial association count': colloquial_company_word_counts[mine],
            'commodity word count': commodity_word_counts[mine],
        })
    return merged
def wikipedia_report(info_dict_merged:dict) -> str:
    """
    Write the word counts for every mine to the text file
    wikipedia_info.txt as two prose sentences per mine: one for
    the colloquial association, one for the commodity name.

    Returns the name of the file that was written.
    """
    colloqfmt = 'The {0:s} mine has {1:d} occurrences of colloquial association {2:s} in its Wikipedia article text.\n'
    commodfmt = 'The {0:s} mine has {1:d} occurrences of commodity name {2:s} in its Wikipedia article text.\n\n'
    outfile = 'wikipedia_info.txt'
    with open(outfile, 'w') as handle:
        for record in info_dict_merged.values():
            # Sentence about the colloquially associated company.
            colloq_sentence = colloqfmt.format(record['mine'],
                                               record['colloquial association count'],
                                               record['colloquial association'])
            print(colloq_sentence, file=handle)
            # Sentence about the commodity itself.
            commod_sentence = commodfmt.format(record['mine'],
                                               record['commodity word count'],
                                               record['commodity'])
            print(commod_sentence, file=handle)
    return outfile
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment