speedupmate/flatten_additional_attributes.py

## flatten_additional_attributes.py
"""
This script formats a magento product export file, and sets each
additional_attribute as a new column in a target file
"""
import csv
import pandas as pd

# testing files
# source_file = './importexport_605048471de17.csv'
# target_file = './importexport_605048471de177_formatted.csv'

# real files
source_file = './catalog_product_20210316_044037.csv'
target_file = './catalog_product_20210316_044037_formatted.csv'

def get_modified_headers():
    reader = csv.reader(open(source_file, 'r'))
    headers = next(reader)
    # print('headers original')
    # print(headers)

    # get all headers for additional_attributes
    additional_attributes_headers = []
    i = 0
    for row in reader:
        # print(f'processing row: {i}')
        target_index = headers.index("additional_attributes")

        attributes = row[target_index].split(",")
        for attribute in attributes:
            data = attribute.split("=")
            if len(data) != 1:
                if data[0] != 'publish_date':
                    header = data[0]
                    # add attribute as header
                    if header not in additional_attributes_headers:
                        additional_attributes_headers.append(header)

        # iterator
        i += 1

    # print('headers modified')
    headers_modified = headers + additional_attributes_headers
    # print(headers_modified)

    return [headers_modified, additional_attributes_headers]


def run_with_writer():
    """ Main run function """
    reader = csv.reader(open(source_file, 'r'))
    writer = csv.writer(open(target_file, 'w'))

    # get original headers
    headers = next(reader)
    # print('original headers')
    # print(headers)

    # add additional attributes to headers
    modified_headers = get_modified_headers()[0]
    additional_attributes_headers = get_modified_headers()[1]
    # print('modified headers')
    # print(modified_headers)
    # print('additional_attribute headers')
    # print(additional_attributes_headers)

    # write headers to file
    writer.writerow(modified_headers)

    # extract data and populate rows
    i = 0
    for row in reader:
        print(f'processing row: {i}')
        # print('original row')
        # print(row)

        result = [None] * len(additional_attributes_headers)
        # print(result)
        # pre-populate result (to avoid shifting items as we add insert them at the index)
        target_index = headers.index("additional_attributes")

        # print(row[target_index])
        attributes = row[target_index].split(",")
        # print(attributes)

        for attribute in attributes:
            # print(attribute)
            data = attribute.split("=")

            if len(data) != 1:
                if data[0] != 'publish_date':
                    header = data[0]
                    value = data[1]

                    # print('get target index for value')
                    target_index = additional_attributes_headers.index(header)
                    # print(f'insert {value} at index {target_index}')
                    result[target_index] = value


        modified_row = row + result
        # print('modified headers')
        # print(modified_headers)
        # print('modified row')
        # print(modified_row)

        # write data
        # write original row
        # writer.writerow(row)

        # write modified row
        writer.writerow(modified_row)

        # break loop
        # if i == 10:
        #     break

        i += 1


# Main entry point
if __name__ == '__main__':
    run_with_writer()
	"""
	This script formats a magento product export file, and sets each
	additional_attribute as a new column in a target file
	"""
	import csv
	import pandas as pd

	# testing files
	# source_file = './importexport_605048471de17.csv'
	# target_file = './importexport_605048471de177_formatted.csv'

	# real files
	source_file = './catalog_product_20210316_044037.csv'
	target_file = './catalog_product_20210316_044037_formatted.csv'

	def get_modified_headers():
	reader = csv.reader(open(source_file, 'r'))
	headers = next(reader)
	# print('headers original')
	# print(headers)

	# get all headers for additional_attributes
	additional_attributes_headers = []
	i = 0
	for row in reader:
	# print(f'processing row: {i}')
	target_index = headers.index("additional_attributes")

	attributes = row[target_index].split(",")
	for attribute in attributes:
	data = attribute.split("=")
	if len(data) != 1:
	if data[0] != 'publish_date':
	header = data[0]
	# add attribute as header
	if header not in additional_attributes_headers:
	additional_attributes_headers.append(header)

	# iterator
	i += 1

	# print('headers modified')
	headers_modified = headers + additional_attributes_headers
	# print(headers_modified)

	return [headers_modified, additional_attributes_headers]


	def run_with_writer():
	""" Main run function """
	reader = csv.reader(open(source_file, 'r'))
	writer = csv.writer(open(target_file, 'w'))

	# get original headers
	headers = next(reader)
	# print('original headers')
	# print(headers)

	# add additional attributes to headers
	modified_headers = get_modified_headers()[0]
	additional_attributes_headers = get_modified_headers()[1]
	# print('modified headers')
	# print(modified_headers)
	# print('additional_attribute headers')
	# print(additional_attributes_headers)

	# write headers to file
	writer.writerow(modified_headers)

	# extract data and populate rows
	i = 0
	for row in reader:
	print(f'processing row: {i}')
	# print('original row')
	# print(row)

	result = [None] * len(additional_attributes_headers)
	# print(result)
	# pre-populate result (to avoid shifting items as we add insert them at the index)
	target_index = headers.index("additional_attributes")

	# print(row[target_index])
	attributes = row[target_index].split(",")
	# print(attributes)

	for attribute in attributes:
	# print(attribute)
	data = attribute.split("=")

	if len(data) != 1:
	if data[0] != 'publish_date':
	header = data[0]
	value = data[1]

	# print('get target index for value')
	target_index = additional_attributes_headers.index(header)
	# print(f'insert {value} at index {target_index}')
	result[target_index] = value


	modified_row = row + result
	# print('modified headers')
	# print(modified_headers)
	# print('modified row')
	# print(modified_row)

	# write data
	# write original row
	# writer.writerow(row)

	# write modified row
	writer.writerow(modified_row)

	# break loop
	# if i == 10:
	# break

	i += 1


	# Main entry point
	if __name__ == '__main__':
	run_with_writer()