# deepankarb/transpose_csv.py

## transpose_csv.py
# Copyright 2024, Deepankar Bhardwaj
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
import logging
import time

def transpose_csv(input_file, output_prefix="transposed", num_records=500, column_index=0):
  """
  Write one column of a CSV file out as comma-separated lines, split across files.

  The first row of ``input_file`` is treated as the header row. The value at
  ``column_index`` is collected from every following non-blank row, the values
  are split into consecutive groups of at most ``num_records``, and each group
  is written as a single comma-joined line (no trailing newline) to
  ``{output_prefix}_{column_name}_{i}.txt``, where ``column_name`` is the header
  of the selected column and ``i`` counts up from 0.

  Progress and the total elapsed time are logged at INFO level. Logging is set
  up through ``logging.basicConfig`` to write to ``transpose.log``; that call
  has no effect when the root logger already has handlers.

  Args:
      input_file: Path to the input CSV file.
      output_prefix: Prefix for the output files (default: "transposed").
      num_records: Maximum number of values written to each output file; must
          be at least 1 (default: 500).
      column_index: Index of the column to pick from the input file (default: 0).

  Raises:
      ValueError: If ``num_records`` is less than 1, or if the input file is
          empty and therefore has no header row.
      IndexError: If ``column_index`` is out of range for the header row or
          for a non-blank data row.
  """

  # A zero step makes range() fail with an unrelated message and a negative
  # step silently produces no output, so reject both up front.
  if num_records < 1:
    raise ValueError(f"num_records must be at least 1, got {num_records!r}")

  logging.basicConfig(filename='transpose.log', level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

  start_time = time.time()

  # newline='' lets the csv module do its own newline handling, as its
  # documentation requires for files passed to csv.reader.
  with open(input_file, 'r', newline='') as infile:
    reader = csv.reader(infile)
    headers = next(reader, None)
    if headers is None:
      raise ValueError(f"Input file is empty (no header row): {input_file}")
    logging.info("Input file opened: %s", input_file)

    # The header of the selected column becomes part of each output filename.
    column_name = headers[column_index]

    # csv.reader yields an empty list for a blank line; skip those rows
    # instead of failing with IndexError on row[column_index].
    data = [row[column_index] for row in reader if row]

  # The input file is fully read at this point, so it is closed before any
  # output file is created.
  chunked = [data[i:i + num_records] for i in range(0, len(data), num_records)]

  for i, chunk in enumerate(chunked):
    outname = f"{output_prefix}_{column_name}_{i}.txt"
    with open(outname, 'w') as outfile:
      logging.info("Writing to %s", outname)
      outfile.write(",".join(chunk))

  total_time = time.time() - start_time

  logging.info("Transposition completed successfully. Total time: %.2f seconds.", total_time)

# Example usage
if __name__ == "__main__":
  import argparse

  # Command-line entry point: one required input path plus three optional
  # flags whose defaults mirror the defaults of transpose_csv.
  cli = argparse.ArgumentParser(description='Transpose a CSV file')
  cli.add_argument('input_file', help='Path to the input CSV file')
  cli.add_argument(
    '-o', '--output_prefix',
    default="transposed",
    help='Prefix for the output files (default: "transposed")',
  )
  cli.add_argument(
    '-n', '--num_records',
    type=int,
    default=500,
    help='Number of records per output file (default: 500)',
  )
  cli.add_argument(
    '-c', '--column_index',
    type=int,
    default=0,
    help='Index of the column to pick (default: 0)',
  )

  options = cli.parse_args()

  # Forward the parsed values positionally, in the order transpose_csv declares them.
  transpose_csv(
    options.input_file,
    options.output_prefix,
    options.num_records,
    options.column_index,
  )
	# Copyright 2024, Deepankar Bhardwaj
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import csv
	import logging
	import time

	def transpose_csv(input_file, output_prefix="transposed", num_records=500, column_index=0):
		"""
		Write one column of a CSV file out as comma-separated lines, split across files.

		The first row of ``input_file`` is treated as the header row. The value at
		``column_index`` is collected from every following non-blank row, the values
		are split into consecutive groups of at most ``num_records``, and each group
		is written as a single comma-joined line (no trailing newline) to
		``{output_prefix}_{column_name}_{i}.txt``, where ``column_name`` is the header
		of the selected column and ``i`` counts up from 0.

		Progress and the total elapsed time are logged at INFO level. Logging is set
		up through ``logging.basicConfig`` to write to ``transpose.log``; that call
		has no effect when the root logger already has handlers.

		Args:
			input_file: Path to the input CSV file.
			output_prefix: Prefix for the output files (default: "transposed").
			num_records: Maximum number of values written to each output file; must
				be at least 1 (default: 500).
			column_index: Index of the column to pick from the input file (default: 0).

		Raises:
			ValueError: If ``num_records`` is less than 1, or if the input file is
				empty and therefore has no header row.
			IndexError: If ``column_index`` is out of range for the header row or
				for a non-blank data row.
		"""

		# A zero step makes range() fail with an unrelated message and a negative
		# step silently produces no output, so reject both up front.
		if num_records < 1:
			raise ValueError(f"num_records must be at least 1, got {num_records!r}")

		logging.basicConfig(filename='transpose.log', level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

		start_time = time.time()

		# newline='' lets the csv module do its own newline handling, as its
		# documentation requires for files passed to csv.reader.
		with open(input_file, 'r', newline='') as infile:
			reader = csv.reader(infile)
			headers = next(reader, None)
			if headers is None:
				raise ValueError(f"Input file is empty (no header row): {input_file}")
			logging.info("Input file opened: %s", input_file)

			# The header of the selected column becomes part of each output filename.
			column_name = headers[column_index]

			# csv.reader yields an empty list for a blank line; skip those rows
			# instead of failing with IndexError on row[column_index].
			data = [row[column_index] for row in reader if row]

		# The input file is fully read at this point, so it is closed before any
		# output file is created.
		chunked = [data[i:i + num_records] for i in range(0, len(data), num_records)]

		for i, chunk in enumerate(chunked):
			outname = f"{output_prefix}_{column_name}_{i}.txt"
			with open(outname, 'w') as outfile:
				logging.info("Writing to %s", outname)
				outfile.write(",".join(chunk))

		total_time = time.time() - start_time

		logging.info("Transposition completed successfully. Total time: %.2f seconds.", total_time)

	# Example usage
	if __name__ == "__main__":
		import argparse

		# Command-line entry point: one required input path plus three optional
		# flags whose defaults mirror the defaults of transpose_csv.
		cli = argparse.ArgumentParser(description='Transpose a CSV file')
		cli.add_argument('input_file', help='Path to the input CSV file')
		cli.add_argument(
			'-o', '--output_prefix',
			default="transposed",
			help='Prefix for the output files (default: "transposed")',
		)
		cli.add_argument(
			'-n', '--num_records',
			type=int,
			default=500,
			help='Number of records per output file (default: 500)',
		)
		cli.add_argument(
			'-c', '--column_index',
			type=int,
			default=0,
			help='Index of the column to pick (default: 0)',
		)

		options = cli.parse_args()

		# Forward the parsed values positionally, in the order transpose_csv declares them.
		transpose_csv(
			options.input_file,
			options.output_prefix,
			options.num_records,
			options.column_index,
		)