# abehmiel/fix_exhibit_b.py

## fix_exhibit_b.py
# It's really stupid when the gov't releases PDFs of tabular data. So I made a quick, hacky script to
# fix their mistakes for them. (I'm referring to https://t.co/oOyhHNVvjS )

# requirements:
# pandas
# tabula-py

import pandas as pd
from tabula import read_pdf

# Read the PDF -- it comes out all messed up, as only one space-delimited
# column ('user id handle'). read_pdf also defaults to only loading one page
# unless you specify pages='all' or a different int or list.
df = read_pdf("exhibit_b.pdf", pages='all')

# Newer tabula-py releases (2.0+) return a list of DataFrames instead of a
# single DataFrame; stitch them together so the rest of the script works with
# either return type.
# NOTE(review): if the per-page tables come back with different headers,
# passing multiple_tables=False to read_pdf may be needed -- confirm against
# the installed tabula-py version.
if isinstance(df, list):
    df = pd.concat(df, ignore_index=True)

# Fix the columns: split the combined column on whitespace once, then take the
# first token as the user id and the second as the handle. Going through the
# .str accessor (rather than x.split()[i] inside apply) yields NaN for blank
# cells or rows with fewer than two tokens instead of raising.
tokens = df['user id handle'].str.split()
df['user id'] = tokens.str[0]
df['handle'] = tokens.str[1]
df = df.drop('user id handle', axis=1)

# Output to CSV, leaving out the DataFrame index.
df.to_csv('exhibit_b.csv', index=False)
	# It's really stupid when the gov't releases PDFs of tabular data. So I made a quick, hacky script to
	# fix their mistakes for them. (I'm referring to https://t.co/oOyhHNVvjS )

	# requirements:
	# pandas
	# tabula-py

	import pandas as pd
	from tabula import read_pdf

	# Read the PDF -- it comes out all messed up, as only one space-delimited
	# column ('user id handle'). read_pdf also defaults to only loading one page
	# unless you specify pages='all' or a different int or list.
	df = read_pdf("exhibit_b.pdf", pages='all')

	# Newer tabula-py releases (2.0+) return a list of DataFrames instead of a
	# single DataFrame; stitch them together so the rest of the script works
	# with either return type.
	# NOTE(review): if the per-page tables come back with different headers,
	# passing multiple_tables=False to read_pdf may be needed -- confirm
	# against the installed tabula-py version.
	if isinstance(df, list):
		df = pd.concat(df, ignore_index=True)

	# Fix the columns: split the combined column on whitespace once, then take
	# the first token as the user id and the second as the handle. Going
	# through the .str accessor (rather than x.split()[i] inside apply) yields
	# NaN for blank cells or rows with fewer than two tokens instead of raising.
	tokens = df['user id handle'].str.split()
	df['user id'] = tokens.str[0]
	df['handle'] = tokens.str[1]
	df = df.drop('user id handle', axis=1)

	# Output to CSV, leaving out the DataFrame index.
	df.to_csv('exhibit_b.csv', index=False)