Skip to content

Instantly share code, notes, and snippets.

@muppetjones
Last active August 27, 2018 19:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save muppetjones/a2e186f11eb6e00e4b27c974080179e1 to your computer and use it in GitHub Desktop.
Save muppetjones/a2e186f11eb6e00e4b27c974080179e1 to your computer and use it in GitHub Desktop.
Print the minimum and maximum value width of each column in a file.
#!/usr/bin/env python3
"""Print size of columns.
Usage:
$ python3 colsize.py ./ref_uniprot.csv
id: (1, 6)
ensembl_transcript_id: (0, 15)
feature_type: (4, 35)
feature_desc: (0, 747)
uniprot_id: (6, 10)
start_pos: (1, 5)
end_pos: (1, 5)
@Author: Stephen J. Bush
"""
import argparse
import re
class Column(object):
    """Track width statistics for the values of a single column.

    Each instance records every value added to it, the smallest and largest
    value width seen so far, and the first value that reached each extreme.
    Instances also receive a 1-based creation index, used to build a
    placeholder name (``unnamed_NNN``) when no name was supplied.
    """

    __CNT = 0  # number of Column instances created so far

    def __new__(cls, *args, **kwargs):
        # Count at allocation time so the index is available in __init__.
        cls.__CNT += 1
        return super().__new__(cls)

    def __init__(self, name=None, field=None):
        """Create a column.

        Args:
            name: Column name, or None/empty to use a generated placeholder.
            field: Optional first value. An empty string is a valid value;
                only None means "no value yet".
        """
        self._name = name
        self._i = self.__CNT
        self._fields = []
        # With no values recorded, both widths read as 0 and both extremes
        # as None.
        self._maxwidth = 0
        self._minwidth = 0
        self._longest = None
        self._shortest = None
        if field is not None:
            self.add(field)

    @property
    def name(self):
        """The given name, or ``unnamed_NNN`` built from the creation index."""
        return self._name or 'unnamed_{:03d}'.format(self._i)

    @property
    def maxwidth(self):
        """Width of the longest value added so far (0 if there are none)."""
        return self._maxwidth

    @property
    def minwidth(self):
        """Width of the shortest value added so far (0 if there are none)."""
        return self._minwidth

    @property
    def longest(self):
        """First value that reached the maximum width (None if no values)."""
        return self._longest

    @property
    def shortest(self):
        """First value that reached the minimum width (None if no values)."""
        return self._shortest

    @property
    def n_fields(self):
        """Number of values added to this column."""
        return len(self._fields)

    def add(self, value):
        """Record ``value`` and update the width extremes.

        Args:
            value: The field text; its ``len()`` is used as the width.
        """
        width = len(value)
        if not self._fields:
            # The first value defines both extremes. Handling it explicitly
            # avoids a sentinel minimum that would survive when widths only
            # ever grow.
            self._maxwidth = self._minwidth = width
            self._longest = self._shortest = value
        elif width > self._maxwidth:
            # Strict comparison keeps the first value of a given width.
            self._maxwidth = width
            self._longest = value
        elif width < self._minwidth:
            self._minwidth = width
            self._shortest = value
        self._fields.append(value)

    @classmethod
    def split_row(cls, row, *, rx):
        """Split one line of text into fields.

        Args:
            row: A raw line, optionally ending in a line terminator.
            rx: Compiled regular expression matching the field delimiter.

        Returns:
            A list of field strings with surrounding double quotes removed.
        """
        # Strip only the line terminator: a plain strip() would also eat
        # leading/trailing tab delimiters and shift or drop empty fields.
        return [
            x.strip('"')
            for x in rx.split(row.rstrip('\r\n'))
        ]
def aggregate_columns(path, delim=r'[,\t]', has_header=True):
    """Read a delimited text file and collect its values column by column.

    Args:
        path: Path of the file to read.
        delim: Regular expression matching the field delimiter. A delimiter
            that sits inside a double-quoted field is not split on.
        has_header: When true, the first line supplies the column names.
            Otherwise the first line is treated as data and columns are
            named by their zero-based position.

    Returns:
        A list of Column objects in file order. A row with more fields than
        there are columns appends new, unnamed columns. An empty file yields
        an empty list.
    """
    # The lookahead only accepts a delimiter followed by an even number of
    # double quotes up to the end of the line, i.e. one outside any quotes.
    rx_delim_w_quotes = re.compile(
        delim + r'(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)'
    )
    with open(path, 'r') as fh:
        raw_first_line = fh.readline()
        if not raw_first_line:
            # readline() returns '' only at EOF: nothing to aggregate.
            return []
        first_line = Column.split_row(raw_first_line, rx=rx_delim_w_quotes)
        if has_header:
            columns = [Column(name=x) for x in first_line]
        else:
            columns = [
                Column(name=str(i), field=x)
                for i, x in enumerate(first_line)
            ]
        for line in fh:
            cols = Column.split_row(line, rx=rx_delim_w_quotes)
            for j, col in enumerate(cols):
                if j < len(columns):
                    columns[j].add(col)
                else:
                    # store extra columns
                    columns.append(Column(field=col))
    return columns
def unique_ratios(columns):
    """Compute how many distinct values each column holds.

    Args:
        columns: Iterable of column objects exposing ``name`` and the
            recorded values in ``_fields``.

    Returns:
        A list of ``(name, (ratio, n_unique, n_values))`` tuples sorted from
        the highest to the lowest ratio. One entry is produced per column,
        so columns sharing a name are all reported. A column without values
        gets a ratio of 0.0 instead of raising ZeroDivisionError.
    """
    rows = []
    for column in columns:
        n_values = len(column._fields)
        n_unique = len(set(column._fields))
        ratio = n_unique / n_values if n_values else 0.0
        rows.append((column.name, (ratio, n_unique, n_values)))
    rows.sort(key=lambda row: row[1], reverse=True)
    return rows


def main(argv=None):
    """Parse command-line options and print the requested column reports.

    Args:
        argv: Argument list to parse; None makes argparse use ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('path')
    parser.add_argument('--delim', default=r'[,\t]')
    parser.add_argument(
        '--no-header', dest='has_header', default=True, action='store_false')
    parser.add_argument(
        '--count', dest='do_print_count',
        default=False, action='store_true',
        help='Print number of fields per column',
    )
    parser.add_argument(
        '--longest', dest='do_print_longest',
        default=False, action='store_true',
        help='Print first, longest value of each column',
    )
    parser.add_argument(
        '--shortest', dest='do_print_shortest',
        default=False, action='store_true',
        help='Print first, shortest of each column',
    )
    parser.add_argument(
        '--values', dest='do_print_values',
        default=False, action='store_true',
        help='[not recommended] Print all values.',
    )
    parser.add_argument(
        '--unique', dest='do_print_unique',
        default=False, action='store_true',
        help='Print number of unique values in the column.',
    )
    args = parser.parse_args(argv)

    columns = aggregate_columns(
        args.path, delim=args.delim, has_header=args.has_header)

    # write out column lengths
    for column in columns:
        print('{}: ({}, {})'.format(
            column.name,
            column.minwidth,
            column.maxwidth,
        ))

    if args.do_print_count:
        print('\n-- Count')
        for column in columns:
            print('{}: {}'.format(column.name, column.n_fields))

    if args.do_print_longest:
        print('\n-- Longest')
        for column in columns:
            print('{}: "{}"'.format(column.name, column.longest))

    if args.do_print_shortest:
        print('\n-- Shortest')
        for column in columns:
            print('{}: "{}"'.format(column.name, column.shortest))

    if args.do_print_values:
        print('\n-- Values')
        for column in columns:
            print('{}: {}'.format(column.name, column._fields))

    if args.do_print_unique:
        print('\n-- Unique')
        for name, values in unique_ratios(columns):
            print('{1:6.2%} {0} ({2} of {3})'.format(name, *values))


if __name__ == '__main__':
    main()
# __END__
$ ~/dev/scripts/colsize.py test.txt --count --longest --shortest --values
foo: (1, 3)
bar: (0, 3)
baz: (1, 9)
unnamed_004: (1, 1)
-- Count
foo: 6
bar: 6
baz: 6
unnamed_004: 1
-- Longest
foo: "waa"
bar: "b,c"
baz: "off_topic"
unnamed_004: "z"
-- Shortest
foo: "a"
bar: ""
baz: "d"
unnamed_004: "z"
-- Values
foo: ['1', 'a', 'waa', 'm', '4', '7']
bar: ['2', 'b,c', 'x', 'n', '', '']
baz: ['3', 'd', 'y', 'off_topic', '6', '9']
unnamed_004: ['z']
foo,bar,baz
1,2,3
a,"b,c",d
waa,x,y,z
m n off_topic
4,,6
7 9
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment