elijahbenizzy/run.py

## run.py
# python 3.12

"""
Hamilton demo.
"""

import sys

import pprint

from hamilton import driver

import toyscriptiii as ts

# Build a Hamilton driver from the functions defined in toyscriptiii.
dr = driver.Builder().with_modules(ts).build()

# Draw the whole graph to ts.png.
# NOTE(review): Graphviz rank directions are TB, LR, BT and RL; 'BR' is
# not one of them - confirm which orientation was intended.
dr.display_all_functions(
    "ts.png",
    deduplicate_inputs=True,
    keep_dot=True,
    orient='BR',
)

# Every node is requested by name so that each one shows up in results.
results = dr.execute(
    [
        'parsed_data',
        'data_with_wikipedia',
        'data_with_company',
        'info_output',
        'commodity_word_counts',
        'colloquial_company_word_counts',
        'info_dict_merged',
        'wikipedia_report',
    ],
    inputs={'datafile': 'data.csv'},
)

# Pretty-print the merged dictionary, then the two plain results.
pprint.pprint(results['info_dict_merged'])
for report_key in ('info_output', 'wikipedia_report'):
    print(results[report_key])

toyscriptiii.py - the main toy module, whose functions define the nodes of the Hamilton graph:

# python 3.12

"""
Toy script.

Takes some input from a csv file on big American
mines and looks at Wikipedia text for some extra
context.
"""

import copy

import pprint

import sys

from urllib import request

import re

from bs4 import BeautifulSoup

def parsed_data(datafile:str) -> dict:
    """
    Get csv data into a dictionary keyed on mine name.

    The first row supplies the column names. Every later row becomes a
    dictionary of column name -> value and is stored under the value of
    its first column. Surrounding whitespace is stripped from every
    header and value.

    Rows are read with the csv module, so a quoted field may contain a
    comma, and blank lines are skipped instead of creating an entry
    keyed on the empty string. A file without any rows gives an empty
    dictionary.

    datafile is the path of the csv file to read. The dictionary is
    pretty-printed before it is returned.
    """
    # Local import: nothing else in the module needs csv.
    import csv

    retval = {}
    # newline='' is the mode the csv module asks for when reading files.
    with open(datafile, 'r', newline='') as f:
        # skipinitialspace lets 'a, "b, c"' parse as two fields.
        reader = csv.reader(f, skipinitialspace=True)
        headers = [x.strip() for x in next(reader, [])]
        for row in reader:
            vals = [x.strip() for x in row]
            # Blank lines (and rows of nothing but commas) carry no mine.
            if not any(vals):
                continue
            retval[vals[0]] = dict(zip(headers, vals))
    pprint.pprint(retval)
    return retval

def data_with_wikipedia(parsed_data:dict) -> dict:
    """
    Connect to wikipedia sites and fill in
    raw html data.

    For every mine the URL stored under 'wikipedia page' is fetched and
    parsed with BeautifulSoup's html.parser. The page title is printed,
    and the page text with all newlines removed is stored under
    'wikipediatext'.

    Return dictionary. It is a deep copy of parsed_data, so the input
    is left untouched.
    """
    retval = copy.deepcopy(parsed_data)
    for record in retval.values():
        # Close the HTTP response as soon as the body has been read
        # instead of leaving it open until garbage collection.
        with request.urlopen(record['wikipedia page']) as response:
            html = response.read()
        soup = BeautifulSoup(html, 'html.parser')
        print(soup.title)
        # Text from html and strip out newlines.
        record['wikipediatext'] = soup.get_text().replace('\n', '')
    return retval

def data_with_company(data_with_wikipedia:dict) -> dict:
    """
    Fetches company ownership for mine out of
    Wikipedia text dump.

    The name is taken to start right after the first 'Company' that
    directly follows a lower case letter, and to end at the first lower
    case letter that is directly followed by an upper case one. If no
    such boundary follows, the name runs to the end of the text. A mine
    whose text has no such 'Company' marker gets no 'company' entry.

    Returns a new dictionary with the company name
    without the big wikipedia text dump. The input is not modified.
    """
    # Wikipedia setup for mine company name.
    COMPANYPAT = r'[a-z]Company'
    # Lower case followed by upper case heuristic.
    ENDCOMPANYPAT = r'[a-z][A-Z]'
    companypat = re.compile(COMPANYPAT)
    endcompanypat = re.compile(ENDCOMPANYPAT)
    retval = copy.deepcopy(data_with_wikipedia)
    for minex, record in retval.items():
        print(minex)
        # Get rid of big text dump in return value.
        text = record.pop('wikipediatext')
        match = companypat.search(text)
        if not match:
            continue
        print('Company match span = ', match.span())
        # Everything after the 'Company' marker.
        rest = text[match.end():]
        match2 = endcompanypat.search(rest)
        if match2:
            print('End Company match span = ', match2.span())
            # + 1 keeps the lower case letter that ends the name.
            record['company'] = rest[:match2.start() + 1]
        else:
            # No lower/upper boundary left in the text.
            record['company'] = rest
    return retval

def info_output(data_with_company:dict) -> str:
    """
    Write a short description of every mine in the
    data_with_company dictionary to mine_info.txt.

    Each mine gets one sentence with its name, commodity and
    state, followed by an indented sentence naming the owner.

    Returns the name of the file that was written.
    """
    mine_sentence = 'The {mine:s} mine is a big {commodity:s} mine in the State of {state:s} in the US.'
    owner_sentence = '\n    {company:s} owns the mine.\n\n'
    outname = 'mine_info.txt'
    with open(outname, 'w') as report:
        for record in data_with_company.values():
            # Each sentence is followed by a newline.
            for template in (mine_sentence, owner_sentence):
                report.write(template.format(**record) + '\n')
    return outname

def commodity_word_counts(data_with_wikipedia:dict, data_with_company:dict) -> dict:
    """
    Return dictionary keyed on mine with counts of
    commodity (e.g., zinc etc.) mentions on Wikipedia
    page (excluding ones in the company name).

    The commodity is matched as a whole word with either case of its
    first letter. When the commodity word also occurs in the company
    name, one mention is discounted for every occurrence of the company
    name in the page text.

    The company name is counted as a literal substring, not compiled as
    a regular expression, and the commodity is escaped before it goes
    into the pattern. A mine with no 'company' entry in
    data_with_company keeps its raw count.
    """
    retval = {}
    # This will probably miss some occurrences at mashed together
    # word boundaries. It is a rough estimate.
    # '\b[Gg]old\b'
    commoditypatfmt = r'\b[{0:s}{1:s}]{2:s}\b'
    for minex, wikirecord in data_with_wikipedia.items():
        print(minex)
        commodity = wikirecord['commodity']
        text = wikirecord['wikipediatext']
        commoditypat = commoditypatfmt.format(re.escape(commodity[0].upper()),
                                              re.escape(commodity[0]),
                                              re.escape(commodity[1:]))
        print(commoditypat)
        nummatchesraw = len(re.findall(commoditypat, text))
        print('Initial length of commoditymatches is {0:d}.'.format(nummatchesraw))
        company = data_with_company[minex].get('company')
        if company is None:
            # No company was found for this mine; nothing to discount.
            retval[minex] = nummatchesraw
            continue
        # Literal count: characters such as '.' or '(' in a company
        # name must not be read as regular expression syntax.
        numcompanymatches = text.count(company)
        print('Length of companymatches is {0:d}.'.format(numcompanymatches))
        # Is the commodity name part of the company name?
        print('commoditypat = ', commoditypat)
        print(company)
        commoditymatchcompany = re.search(commoditypat, company)
        if commoditymatchcompany:
            print('commoditymatchcompany.span() = ', commoditymatchcompany.span())
            retval[minex] = nummatchesraw - numcompanymatches
        else:
            retval[minex] = nummatchesraw
    return retval

def colloquial_company_word_counts(data_with_wikipedia:dict) -> dict:
    """
    Count, for every mine, how often its 'colloquial association'
    (the company one tends to link with the mine) turns up in the
    text of the mine's wikipedia article.

    Returns a dictionary keyed on mine with the number of matches.
    """
    counts = {}
    for mine_name, record in data_with_wikipedia.items():
        association = record['colloquial association']
        print(mine_name)
        # The association string is handed to re as the pattern itself.
        hits = re.findall(association, record['wikipediatext'])
        total = len(hits)
        print('{0:d} matches for colloquial association {1:s}.'.format(total, association))
        counts[mine_name] = total
    return counts

def info_dict_merged(data_with_company:dict,
                     commodity_word_counts:dict,
                     colloquial_company_word_counts:dict) -> dict:
    """
    Combine the company data with both word counts.

    Works on a deep copy of data_with_company and adds, for every
    mine, the entries 'colloquial association count' and
    'commodity word count'. The inputs are left as they were.
    """
    merged = copy.deepcopy(data_with_company)
    for mine_name, record in merged.items():
        record.update({
            'colloquial association count': colloquial_company_word_counts[mine_name],
            'commodity word count': commodity_word_counts[mine_name],
        })
    return merged

def wikipedia_report(info_dict_merged:dict) -> str:
    """
    Put the word counts for every mine into prose and write
    them to wikipedia_info.txt.

    Returns the name of the file that was written.
    """
    outname = 'wikipedia_info.txt'
    colloq_sentence = 'The {0:s} mine has {1:d} occurrences of colloquial association {2:s} in its Wikipedia article text.\n'
    commod_sentence = 'The {0:s} mine has {1:d} occurrences of commodity name {2:s} in its Wikipedia article text.\n\n'
    with open(outname, 'w') as report:
        for record in info_dict_merged.values():
            colloq_line = colloq_sentence.format(record['mine'],
                                                 record['colloquial association count'],
                                                 record['colloquial association'])
            print(colloq_line, file=report)
            commod_line = commod_sentence.format(record['mine'],
                                                 record['commodity word count'],
                                                 record['commodity'])
            print(commod_line, file=report)
    return outname
	# python 3.12

	"""
	Hamilton demo.
	"""

	import sys

	import pprint

	from hamilton import driver

	import toyscriptiii as ts

	# Build a Hamilton driver from the functions defined in toyscriptiii.
	dr = driver.Builder().with_modules(ts).build()

	# Draw the whole graph to ts.png.
	# NOTE(review): Graphviz rank directions are TB, LR, BT and RL; 'BR' is
	# not one of them - confirm which orientation was intended.
	dr.display_all_functions(
	    "ts.png",
	    deduplicate_inputs=True,
	    keep_dot=True,
	    orient='BR',
	)

	# Every node is requested by name so that each one shows up in results.
	results = dr.execute(
	    [
	        'parsed_data',
	        'data_with_wikipedia',
	        'data_with_company',
	        'info_output',
	        'commodity_word_counts',
	        'colloquial_company_word_counts',
	        'info_dict_merged',
	        'wikipedia_report',
	    ],
	    inputs={'datafile': 'data.csv'},
	)

	# Pretty-print the merged dictionary, then the two plain results.
	pprint.pprint(results['info_dict_merged'])
	for report_key in ('info_output', 'wikipedia_report'):
	    print(results[report_key])

	toyscriptiii.py - the main toy module, whose functions define the nodes of the Hamilton graph:

	# python 3.12

	"""
	Toy script.

	Takes some input from a csv file on big American
	mines and looks at Wikipedia text for some extra
	context.
	"""

	import copy

	import pprint

	import sys

	from urllib import request

	import re

	from bs4 import BeautifulSoup

	def parsed_data(datafile:str) -> dict:
	    """
	    Get csv data into a dictionary keyed on mine name.

	    The first row supplies the column names. Every later row becomes a
	    dictionary of column name -> value and is stored under the value of
	    its first column. Surrounding whitespace is stripped from every
	    header and value.

	    Rows are read with the csv module, so a quoted field may contain a
	    comma, and blank lines are skipped instead of creating an entry
	    keyed on the empty string. A file without any rows gives an empty
	    dictionary.

	    datafile is the path of the csv file to read. The dictionary is
	    pretty-printed before it is returned.
	    """
	    # Local import: nothing else in the module needs csv.
	    import csv

	    retval = {}
	    # newline='' is the mode the csv module asks for when reading files.
	    with open(datafile, 'r', newline='') as f:
	        # skipinitialspace lets 'a, "b, c"' parse as two fields.
	        reader = csv.reader(f, skipinitialspace=True)
	        headers = [x.strip() for x in next(reader, [])]
	        for row in reader:
	            vals = [x.strip() for x in row]
	            # Blank lines (and rows of nothing but commas) carry no mine.
	            if not any(vals):
	                continue
	            retval[vals[0]] = dict(zip(headers, vals))
	    pprint.pprint(retval)
	    return retval

	def data_with_wikipedia(parsed_data:dict) -> dict:
	    """
	    Connect to wikipedia sites and fill in
	    raw html data.

	    For every mine the URL stored under 'wikipedia page' is fetched and
	    parsed with BeautifulSoup's html.parser. The page title is printed,
	    and the page text with all newlines removed is stored under
	    'wikipediatext'.

	    Return dictionary. It is a deep copy of parsed_data, so the input
	    is left untouched.
	    """
	    retval = copy.deepcopy(parsed_data)
	    for record in retval.values():
	        # Close the HTTP response as soon as the body has been read
	        # instead of leaving it open until garbage collection.
	        with request.urlopen(record['wikipedia page']) as response:
	            html = response.read()
	        soup = BeautifulSoup(html, 'html.parser')
	        print(soup.title)
	        # Text from html and strip out newlines.
	        record['wikipediatext'] = soup.get_text().replace('\n', '')
	    return retval

	def data_with_company(data_with_wikipedia:dict) -> dict:
	    """
	    Fetches company ownership for mine out of
	    Wikipedia text dump.

	    The name is taken to start right after the first 'Company' that
	    directly follows a lower case letter, and to end at the first lower
	    case letter that is directly followed by an upper case one. If no
	    such boundary follows, the name runs to the end of the text. A mine
	    whose text has no such 'Company' marker gets no 'company' entry.

	    Returns a new dictionary with the company name
	    without the big wikipedia text dump. The input is not modified.
	    """
	    # Wikipedia setup for mine company name.
	    COMPANYPAT = r'[a-z]Company'
	    # Lower case followed by upper case heuristic.
	    ENDCOMPANYPAT = r'[a-z][A-Z]'
	    companypat = re.compile(COMPANYPAT)
	    endcompanypat = re.compile(ENDCOMPANYPAT)
	    retval = copy.deepcopy(data_with_wikipedia)
	    for minex, record in retval.items():
	        print(minex)
	        # Get rid of big text dump in return value.
	        text = record.pop('wikipediatext')
	        match = companypat.search(text)
	        if not match:
	            continue
	        print('Company match span = ', match.span())
	        # Everything after the 'Company' marker.
	        rest = text[match.end():]
	        match2 = endcompanypat.search(rest)
	        if match2:
	            print('End Company match span = ', match2.span())
	            # + 1 keeps the lower case letter that ends the name.
	            record['company'] = rest[:match2.start() + 1]
	        else:
	            # No lower/upper boundary left in the text.
	            record['company'] = rest
	    return retval

	def info_output(data_with_company:dict) -> str:
	    """
	    Write a short description of every mine in the
	    data_with_company dictionary to mine_info.txt.

	    Each mine gets one sentence with its name, commodity and
	    state, followed by an indented sentence naming the owner.

	    Returns the name of the file that was written.
	    """
	    mine_sentence = 'The {mine:s} mine is a big {commodity:s} mine in the State of {state:s} in the US.'
	    # The owner sentence sits on its own line, indented four spaces.
	    owner_sentence = '\n    {company:s} owns the mine.\n\n'
	    outname = 'mine_info.txt'
	    with open(outname, 'w') as report:
	        for record in data_with_company.values():
	            # Each sentence is followed by a newline.
	            for template in (mine_sentence, owner_sentence):
	                report.write(template.format(**record) + '\n')
	    return outname

	def commodity_word_counts(data_with_wikipedia:dict, data_with_company:dict) -> dict:
	    """
	    Return dictionary keyed on mine with counts of
	    commodity (e.g., zinc etc.) mentions on Wikipedia
	    page (excluding ones in the company name).

	    The commodity is matched as a whole word with either case of its
	    first letter. When the commodity word also occurs in the company
	    name, one mention is discounted for every occurrence of the company
	    name in the page text.

	    The company name is counted as a literal substring, not compiled as
	    a regular expression, and the commodity is escaped before it goes
	    into the pattern. A mine with no 'company' entry in
	    data_with_company keeps its raw count.
	    """
	    retval = {}
	    # This will probably miss some occurrences at mashed together
	    # word boundaries. It is a rough estimate.
	    # '\b[Gg]old\b'
	    commoditypatfmt = r'\b[{0:s}{1:s}]{2:s}\b'
	    for minex, wikirecord in data_with_wikipedia.items():
	        print(minex)
	        commodity = wikirecord['commodity']
	        text = wikirecord['wikipediatext']
	        commoditypat = commoditypatfmt.format(re.escape(commodity[0].upper()),
	                                              re.escape(commodity[0]),
	                                              re.escape(commodity[1:]))
	        print(commoditypat)
	        nummatchesraw = len(re.findall(commoditypat, text))
	        print('Initial length of commoditymatches is {0:d}.'.format(nummatchesraw))
	        company = data_with_company[minex].get('company')
	        if company is None:
	            # No company was found for this mine; nothing to discount.
	            retval[minex] = nummatchesraw
	            continue
	        # Literal count: characters such as '.' or '(' in a company
	        # name must not be read as regular expression syntax.
	        numcompanymatches = text.count(company)
	        print('Length of companymatches is {0:d}.'.format(numcompanymatches))
	        # Is the commodity name part of the company name?
	        print('commoditypat = ', commoditypat)
	        print(company)
	        commoditymatchcompany = re.search(commoditypat, company)
	        if commoditymatchcompany:
	            print('commoditymatchcompany.span() = ', commoditymatchcompany.span())
	            retval[minex] = nummatchesraw - numcompanymatches
	        else:
	            retval[minex] = nummatchesraw
	    return retval

	def colloquial_company_word_counts(data_with_wikipedia:dict) -> dict:
	    """
	    Count, for every mine, how often its 'colloquial association'
	    (the company one tends to link with the mine) turns up in the
	    text of the mine's wikipedia article.

	    Returns a dictionary keyed on mine with the number of matches.
	    """
	    counts = {}
	    for mine_name, record in data_with_wikipedia.items():
	        association = record['colloquial association']
	        print(mine_name)
	        # The association string is handed to re as the pattern itself.
	        hits = re.findall(association, record['wikipediatext'])
	        total = len(hits)
	        print('{0:d} matches for colloquial association {1:s}.'.format(total, association))
	        counts[mine_name] = total
	    return counts

	def info_dict_merged(data_with_company:dict,
	                     commodity_word_counts:dict,
	                     colloquial_company_word_counts:dict) -> dict:
	    """
	    Combine the company data with both word counts.

	    Works on a deep copy of data_with_company and adds, for every
	    mine, the entries 'colloquial association count' and
	    'commodity word count'. The inputs are left as they were.
	    """
	    merged = copy.deepcopy(data_with_company)
	    for mine_name, record in merged.items():
	        record.update({
	            'colloquial association count': colloquial_company_word_counts[mine_name],
	            'commodity word count': commodity_word_counts[mine_name],
	        })
	    return merged

	def wikipedia_report(info_dict_merged:dict) -> str:
	    """
	    Put the word counts for every mine into prose and write
	    them to wikipedia_info.txt.

	    Returns the name of the file that was written.
	    """
	    outname = 'wikipedia_info.txt'
	    colloq_sentence = 'The {0:s} mine has {1:d} occurrences of colloquial association {2:s} in its Wikipedia article text.\n'
	    commod_sentence = 'The {0:s} mine has {1:d} occurrences of commodity name {2:s} in its Wikipedia article text.\n\n'
	    with open(outname, 'w') as report:
	        for record in info_dict_merged.values():
	            colloq_line = colloq_sentence.format(record['mine'],
	                                                 record['colloquial association count'],
	                                                 record['colloquial association'])
	            print(colloq_line, file=report)
	            commod_line = commod_sentence.format(record['mine'],
	                                                 record['commodity word count'],
	                                                 record['commodity'])
	            print(commod_line, file=report)
	    return outname