Last active
June 29, 2020 11:04
-
-
Save mcgill-a/00f041e90f6b7cead01e9d24ad524b86 to your computer and use it in GitHub Desktop.
Separate fields with multiple companies in the global coal plant tracker data exported from https://alexmcgill.net/research/coal_tracker
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import csv | |
import argparse | |
import re | |
from pathlib import Path | |
def load_data(filename):
    """Read *filename* as a latin-1 encoded CSV and return every row.

    Returns a list of rows, each row being a list of string fields.
    """
    with open(filename, 'r', encoding='latin-1') as handle:
        return list(csv.reader(handle))
def save_data(filename, data):
    """Write *data* (a list of rows) to ``output/<filename>`` as latin-1 CSV.

    Creates the ``output`` directory if it does not exist. The file handle
    is managed by a ``with`` block so it is flushed and closed even on
    error (the original opened it without ever closing it).
    """
    out_dir = Path("output")
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / filename, 'w', newline='', encoding='latin-1') as file:
        csv.writer(file).writerows(data)
def split_sub_categories(data, column):
    """Split rows whose *column* field lists several companies into one row each.

    A field such as ``"Alpha 20%; Beta 80%"`` is detected via the inline
    percentage delimiters (``%;``, ``%,`` or ``%)``), split on those tokens,
    and the original row is replaced by one copy per company (appended at
    the end of *data*). Mutates *data* in place and prints a summary.
    """
    delim_token = "<--__-->"
    # Company separators: a 1-2 digit percentage (optionally with decimals,
    # optionally parenthesised) followed by ; , or . — or a bare ; / ,
    # Compiled once instead of re-parsed by re.sub on every row.
    pattern = re.compile(
        r"(((\d{1,2}\.)?\d{1,2}%(;|,|\.))|\(((\d{1,2}\.)?\d{1,2}%\)(;|,|\.))|[,;])"
    )
    row_count, company_count = 0, 0
    new_rows = []
    remove_indices = []
    for i, row in enumerate(data):
        # Select the sponsor field.
        field = row[column]
        if "%;" not in field and "%," not in field and "%)" not in field:
            continue
        remove_indices.append(i)
        row_count += 1
        # Replace each delimiter token (e.g. 20%; | 20.50%, | (20%).) with a
        # sentinel, then split on the sentinel.
        fragments = pattern.sub(delim_token, field).split(delim_token)
        # Strip each fragment (including a leading ';') and drop any that are
        # empty afterwards — the original kept whitespace-only fragments,
        # producing rows with a blank company field.
        companies = [c.strip().lstrip(";").strip() for c in fragments]
        companies = [c for c in companies if c]
        company_count += len(companies)
        # Create a new row for each company based on the original.
        for company in companies:
            current = row.copy()
            current[column] = company
            new_rows.append(current)
    # Pop from the highest index down so earlier indices stay valid.
    for idx in reversed(remove_indices):
        data.pop(idx)
    # Add the new rows to the original dataset.
    data.extend(new_rows)
    print("Split " + str(row_count) + " rows into " + str(company_count) + " companies.")
def remove_trailing_percent(data, column):
    """Strip a trailing ownership percentage like ``20%`` or ``(20%)``.

    Mutates rows of *data* in place. Also removes the whitespace left in
    front of the percentage (the original left ``"Company A "`` behind).
    """
    # Compiled once instead of re-parsed for every matching row.
    pattern = re.compile(r"(\(\d{1,2}%\)|\d{1,2}%)$")
    for row in data:
        field = row[column]
        if field.endswith(("%", "%)")):
            # rstrip() so "Company A 20%" becomes "Company A", not "Company A ".
            row[column] = pattern.sub('', field).rstrip()
def main():
    """CLI entry point: load a CSV, split multi-company rows, save the result.

    Usage: -i <input.csv> -o <output.csv> -c <column index>
    Exits with status 1 on missing or invalid arguments (the original
    called exit(0), signalling success on an error).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--Input", help="CSV input file")
    parser.add_argument("-o", "--Output", help="CSV output file")
    parser.add_argument("-c", "--Column", help="Number of the column to separate [0..N]")
    args = parser.parse_args()
    if not (args.Input and args.Output and args.Column):
        print("Please enter a valid input, output, and column number.")
        print("Arguments: -i <input.csv> -o <output.csv> -c <6>")
        sys.exit(1)
    try:
        column_number = int(args.Column)
    except ValueError:
        # A non-numeric column was previously an uncaught traceback.
        print("Please enter a valid input, output, and column number.")
        sys.exit(1)
    data = load_data(args.Input)
    split_sub_categories(data, column_number)
    remove_trailing_percent(data, column_number)
    save_data(args.Output, data)


# Guard the entry point so importing this module has no side effects.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment