Skip to content

Instantly share code, notes, and snippets.

@sbliven
Created August 30, 2019 13:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sbliven/237ccf49c42176a5febfb28dccecb271 to your computer and use it in GitHub Desktop.
Save sbliven/237ccf49c42176a5febfb28dccecb271 to your computer and use it in GitHub Desktop.
Create histograms of line lengths. Used for https://github.com/biopython/biopython/issues/2008
r"""Plot the distribution of line lengths over a set of files.

Writes a histogram and a cumulative histogram of the line lengths as PNG
files, and prints how many lines fall into a few fixed length ranges.

Example:

    find . -regextype egrep -regex './(Bio|Tests)/.*\.py' -type f -exec \
        python linelength.py --hist linelengths.png \
        --cdf linelengthscumulative.png -v '{}' '+'
"""
import sys
import argparse
import logging
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
def line_len(file):
    """Yield the length of every line in *file*, excluding its newline.

    Args:
        file: An iterable of strings, such as a file opened in text mode.

    Yields:
        The number of characters in each line, not counting a single
        trailing newline character when one is present.
    """
    for line in file:
        # The last line of a file may have no trailing newline; subtracting 1
        # unconditionally would under-count it by one character.
        yield len(line) - 1 if line.endswith("\n") else len(line)
def _close_unless_std(handle):
    """Close *handle* unless it is one of the process's standard streams."""
    std_streams = (sys.stdin, sys.stdout, getattr(sys.stdout, "buffer", None))
    if not any(handle is stream for stream in std_streams):
        handle.close()


def _save_current_figure(handle, label):
    """Write the current matplotlib figure to *handle*, release it, and log it."""
    try:
        plt.savefig(handle)
    finally:
        _close_unless_std(handle)
    logging.info(f"Save {label} to {handle.name}")


def main(args=None):
    """Plot line-length distributions for the given files and print bin counts.

    Reads every input file (standard input when none is given), optionally
    writes a histogram (``--hist``) and a cumulative histogram (``--cdf``) of
    the line lengths, and prints the number of lines per length range
    followed by the total number of lines.

    Args:
        args: List of argument strings to parse. ``None`` lets argparse fall
            back to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument("--hist", type=argparse.FileType('wb'),
                        help="PNG output file Histogram")
    parser.add_argument("--cdf", type=argparse.FileType('wb'),
                        help="PNG output file cumulative")
    # The default must be a list: with nargs="*" argparse hands the default
    # back unchanged, and a bare sys.stdin would be iterated line by line as
    # if every line were a file.
    parser.add_argument("files", type=argparse.FileType('r'),
                        default=[sys.stdin],
                        nargs="*",
                        help="Input files")
    parser.add_argument("-v", "--verbose", help="Long messages",
                        dest="verbose", default=False, action="store_true")
    args = parser.parse_args(args)

    logging.basicConfig(format='%(levelname)s: %(message)s',
                        level=logging.DEBUG if args.verbose else logging.WARN)

    # NOTE(review): argparse.FileType opens every input while parsing, so a
    # very long file list can still exhaust file descriptors before this loop
    # runs; here each handle is at least released as soon as it has been read.
    lengths = []
    for file in args.files:
        try:
            lengths.extend(line_len(file))
        finally:
            _close_unless_std(file)

    data = pd.DataFrame({"linelen": lengths})
    logging.info(f"Read data from {len(args.files)} files")
    logging.debug(f"Read {len(data.index)} lines")

    if data.empty:
        # Nothing to plot or bin; release the output handles and report zero
        # lines instead of failing on max() of an empty column.
        for handle in (args.hist, args.cdf):
            if handle is not None:
                _close_unless_std(handle)
        logging.warning("No lines were read; skipping plots and counts")
        print(f"total: {len(data.index)}")
        return

    # One bin per length below 200, plus a catch-all bin for longer lines.
    bins = list(range(200)) + [100000]

    # NOTE(review): sns.distplot is deprecated in recent seaborn releases; it
    # is kept so the generated figures stay the same.
    if args.hist is not None:
        sns.distplot(data, bins=bins)
        plt.xlim(0, 200)
        _save_current_figure(args.hist, "histogram")
        plt.clf()

    if args.cdf is not None:
        sns.distplot(data,
                     bins=bins,
                     hist_kws={"cumulative": True},
                     kde_kws={"cumulative": True},
                     kde=False,
                     norm_hist=True
                     )
        plt.xlim(0, 200)
        _save_current_figure(args.cdf, "cumulative")

    # pd.cut requires strictly increasing edges, so the open-ended last range
    # is only added when some line is actually longer than 120 characters.
    cuts = [-1, 0, 80, 88, 120]
    longest = int(data.linelen.max())
    if longest > cuts[-1]:
        cuts.append(longest)
    c = pd.cut(data.linelen, cuts, right=True)
    # observed=False keeps empty ranges in the table on every pandas version.
    counts = data.groupby(c, observed=False).count()
    print(f"counts:\n{counts}")
    print(f"total: {len(data.index)}")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment