Skip to content

Instantly share code, notes, and snippets.

@arc2226
Last active June 29, 2021 22:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arc2226/c33a20a97e8a79ef267191db53ea7be5 to your computer and use it in GitHub Desktop.
Save arc2226/c33a20a97e8a79ef267191db53ea7be5 to your computer and use it in GitHub Desktop.
import pandas as pd
import re
# I had to read in the release data, and specify the column names beforehand
# so that Pandas would understands how many columns we should see in the data
# and what to call them.
release = pd.read_csv(input,
usecols= column_names).rename(
columns=
{'IDNO': 'idno',
'Name': 'name',
'DATE_OF_RELEASE': 'date_of_release',
'ReleaseType': 'release_type',
'GENDER': 'gender',
'RACE': 'race',
'DOB': 'dob',
'AGE_AT_RELEASE': 'age_at_release',
'OFFSET_AGE_AT_RELEASE': 'offset_age_at_release'})
# The names had strange spacing and punctuation.
# I had to clean them up to then mege them with the voter data.
release['name'] = release['name'].astype(str)
release['name'] = release.name.str.replace(',', '')
# We had the fullname as a single string with '\s+' characters
# so I replaced those will regular spaces to then .split on them to get them into three
# separate columns.
release['release_fullname_clean'] = release.name.apply(lambda x: re.sub('\s+', ' ', x))
names = release['release_fullname_clean'].str.split(' ', expand = True)[[0, 1, 2]]
names.columns = ['release_firstname', 'release_lastname', 'release_middlename']
# I then merged those new name columns back into my release dataset.
release = pd.concat([release, names], axis = 1) # merge names df with clean first, middle and last names
# to release df
release['date_of_release'] = pd.to_datetime(release.date_of_release) # transform into datetime to match with date in voter file
# In this case, the ages of release are spread in three different columns.
# we fill empties in the `offset` with `dob`
# and then fill age at release with the combination
# of the previous two columns
release['age_at_release_clean'] = release.age_at_release.fillna(release.offset_age_at_release.fillna(release.dob)) # the age of release appears in two seaprate columns
# Lastly, I set the age at release to an integer type.
release['age_at_release_clean'] = release.age_at_release_clean.astype(int)
release.to_csv(output, index = False) # output
import pandas as pd
import re
releases = pd.read_csv(input)
voters = pd.read_csv(input2)
deduped_voter = voters[~voters.duplicated()]
df = releases.merge(deduped_voter,
left_on = ['release_fullname'],
right_on = ['voter_fullname'], how = 'inner')
df = df[df['dob_clean'] == df['birth_date']]
df.to_csv(output, index = False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment