arc2226/clean.py

## clean.py
import pandas as pd
import re

# I had to read in the release data, and specify the column names beforehand
# so that Pandas would understands how many columns we should see in the data
# and what to call them.

release = pd.read_csv(input,
    usecols= column_names).rename(
        columns=
        {'IDNO': 'idno',
        'Name': 'name',
        'DATE_OF_RELEASE': 'date_of_release',
        'ReleaseType': 'release_type',
        'GENDER': 'gender',
        'RACE': 'race',
        'DOB': 'dob',
        'AGE_AT_RELEASE': 'age_at_release',
        'OFFSET_AGE_AT_RELEASE': 'offset_age_at_release'})

    # The names had strange spacing and punctuation.
    # I had to clean them up to then mege them with the voter data.

    release['name'] = release['name'].astype(str)
    release['name'] = release.name.str.replace(',', '')

    # We had the fullname as a single string with '\s+' characters
    # so I replaced those will regular spaces to then .split on them to get them into three
    # separate columns.

    release['release_fullname_clean'] = release.name.apply(lambda x: re.sub('\s+', ' ', x))

    names = release['release_fullname_clean'].str.split(' ', expand = True)[[0, 1, 2]]
    names.columns = ['release_firstname', 'release_lastname', 'release_middlename']


    # I then merged those new name columns back into my release dataset.

    release = pd.concat([release, names], axis = 1) # merge names df with clean first, middle and last names
                                                    # to release df

    release['date_of_release'] = pd.to_datetime(release.date_of_release) # transform into datetime to match with date in voter file

    # In this case, the ages of release are spread in three different columns.
    # we fill empties in the `offset` with `dob`
    # and then fill age at release with the combination
    # of the previous two columns
    release['age_at_release_clean'] = release.age_at_release.fillna(release.offset_age_at_release.fillna(release.dob)) # the age of release appears in two seaprate columns

    # Lastly, I set the age at release to an integer type.
    release['age_at_release_clean'] = release.age_at_release_clean.astype(int)


    release.to_csv(output, index = False) # output


## match.py
import pandas as pd
import re

  releases = pd.read_csv(input)
  voters = pd.read_csv(input2)

  deduped_voter = voters[~voters.duplicated()]

  df = releases.merge(deduped_voter,
                      left_on = ['release_fullname'],
                      right_on = ['voter_fullname'], how = 'inner')

  df = df[df['dob_clean'] == df['birth_date']]

  df.to_csv(output, index = False)
	import pandas as pd
	import re

	# I had to read in the release data, and specify the column names beforehand
	# so that Pandas would understands how many columns we should see in the data
	# and what to call them.

	release = pd.read_csv(input,
	usecols= column_names).rename(
	columns=
	{'IDNO': 'idno',
	'Name': 'name',
	'DATE_OF_RELEASE': 'date_of_release',
	'ReleaseType': 'release_type',
	'GENDER': 'gender',
	'RACE': 'race',
	'DOB': 'dob',
	'AGE_AT_RELEASE': 'age_at_release',
	'OFFSET_AGE_AT_RELEASE': 'offset_age_at_release'})

	# The names had strange spacing and punctuation.
	# I had to clean them up to then mege them with the voter data.

	release['name'] = release['name'].astype(str)
	release['name'] = release.name.str.replace(',', '')

	# We had the fullname as a single string with '\s+' characters
	# so I replaced those will regular spaces to then .split on them to get them into three
	# separate columns.

	release['release_fullname_clean'] = release.name.apply(lambda x: re.sub('\s+', ' ', x))

	names = release['release_fullname_clean'].str.split(' ', expand = True)[[0, 1, 2]]
	names.columns = ['release_firstname', 'release_lastname', 'release_middlename']


	# I then merged those new name columns back into my release dataset.

	release = pd.concat([release, names], axis = 1) # merge names df with clean first, middle and last names
	# to release df

	release['date_of_release'] = pd.to_datetime(release.date_of_release) # transform into datetime to match with date in voter file

	# In this case, the ages of release are spread in three different columns.
	# we fill empties in the `offset` with `dob`
	# and then fill age at release with the combination
	# of the previous two columns
	release['age_at_release_clean'] = release.age_at_release.fillna(release.offset_age_at_release.fillna(release.dob)) # the age of release appears in two seaprate columns

	# Lastly, I set the age at release to an integer type.
	release['age_at_release_clean'] = release.age_at_release_clean.astype(int)


	release.to_csv(output, index = False) # output
	import pandas as pd
	import re

	releases = pd.read_csv(input)
	voters = pd.read_csv(input2)

	deduped_voter = voters[~voters.duplicated()]

	df = releases.merge(deduped_voter,
	left_on = ['release_fullname'],
	right_on = ['voter_fullname'], how = 'inner')

	df = df[df['dob_clean'] == df['birth_date']]

	df.to_csv(output, index = False)