@mcgill-a
Last active June 29, 2020 11:04
Separate fields that list multiple companies in the Global Coal Plant Tracker data exported from https://alexmcgill.net/research/coal_tracker
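A typical invocation (the script name, filenames, and column index below are placeholders, not part of the gist) looks like:

python split_companies.py -i coal_tracker.csv -o coal_tracker_split.csv -c 6

The result is written to output/<output filename> relative to the working directory, and the column index should point at whichever column holds the sponsor/company field.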
import sys
import csv
import argparse
import re
from pathlib import Path
def load_data(filename):
    data = []
    with open(filename, 'r', encoding='latin-1') as file:
        reader = csv.reader(file)
        for row in reader:
            data.append(row)
    return data
def save_data(filename, data):
    # Write the rows into the output directory, creating it if necessary
    Path("output").mkdir(parents=True, exist_ok=True)
    with open("output/" + filename, 'w', newline='', encoding='latin-1') as file:
        writer = csv.writer(file)
        for row in data:
            writer.writerow(row)
def split_sub_categories(data, column):
    row_count, company_count = 0, 0
    delim_token = "<--__-->"
    new_rows = []
    remove_indices = []
    regex = r"(((\d{1,2}\.)?\d{1,2}%(;|,|\.))|\(((\d{1,2}\.)?\d{1,2}%\)(;|,|\.))|[,;])"
    for i in range(len(data)):
        # Select the sponsor field
        field = data[i][column]
        if "%;" in field or "%," in field or "%)" in field:
            remove_indices.append(i)
            duplicate = data[i]
            row_count += 1
            # Split the companies on tokens: 20%; | 20.50%; | 20%, | 20.50%, | 20%. | 20.50%.
            fields = re.split(delim_token, re.sub(regex, delim_token, field))
            # Remove empty list values
            fields = list(filter(None, fields))
            company_count += len(fields)
            # Create a new row for each company based on the original
            for company in fields:
                current = duplicate.copy()
                current[column] = company.strip()
                # Remove leading ; character
                current[column] = current[column].lstrip(";").strip()
                new_rows.append(current)
    # Remove the rows with multiple companies
    remove_indices.sort(reverse=True)
    for idx in remove_indices:
        data.pop(idx)
    # Add the new rows to the original dataset
    for row in new_rows:
        data.append(row)
    print("Split " + str(row_count) + " rows into " + str(company_count) + " companies.")
def remove_trailing_percent(data, column):
    regex = r"(\(\d{1,2}%\)|\d{1,2}%)$"
    for row in data:
        field = row[column]
        if field.endswith("%") or field.endswith("%)"):
            row[column] = re.sub(regex, '', field).strip()
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--Input", help="CSV input file")
parser.add_argument("-o", "--Output", help="CSV output file")
parser.add_argument("-c", "--Column", help="Number of the column to separate [0..N]")
args = parser.parse_args()
in_filename = ''
out_filename = ''
if args.Input and args.Output and args.Column:
    in_filename = args.Input
    out_filename = args.Output
    column_number = int(args.Column)
else:
    print("Please enter a valid input, output, and column number.")
    print("Arguments: -i <input.csv> -o <output.csv> -c <6>")
    sys.exit(1)
data = load_data(in_filename)
split_sub_categories(data, column_number)
remove_trailing_percent(data, column_number)
save_data(out_filename, data)
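
For reference, this is roughly what the split step does to a single sponsor value. The company names and percentages are made up for illustration, and the snippet simply reuses the script's delimiter token and regular expression:

import re

# Hypothetical sponsor field with two part-owners (illustrative only)
sponsor = "Alpha Power Co 60%; Beta Energy Ltd 40%."

token = "<--__-->"
pattern = r"(((\d{1,2}\.)?\d{1,2}%(;|,|\.))|\(((\d{1,2}\.)?\d{1,2}%\)(;|,|\.))|[,;])"

# Replace each ownership-percentage boundary with the token, split on it, and
# drop empty pieces -- the same approach as split_sub_categories() above
parts = [p.strip() for p in re.split(token, re.sub(pattern, token, sponsor)) if p.strip()]
print(parts)  # ['Alpha Power Co', 'Beta Energy Ltd']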