brevans/Bosque_summarize.py

## Bosque_summarize.py
#!/usr/bin/env python
from glob import glob
import pandas as pd
import re

#keep a running list of results
results_list = []

#for each excel file, which we will call xlsx
for xlsx in glob("*.xlsx"):
    #see if we can extract the ID and isolation number with a regular expression
    #regular expression: http://regexr.com/3glnf
    match = re.match(r'(\d+)_isolation(\d+)\.xlsx', xlsx)
    if match:
        #trick to convert the strings into numbers called "list comprehension"
        sample_id, isolation = [int(x) for x in match.groups()]
    else:
        print("File {} doesn't seem to be named like I thought it should be.".format(xlsx))
        continue

    #read in the data from excel into a pandas data frame
    df = pd.read_excel(xlsx)

    #select all rows (:) and columns from mult_steps to noise_dist ('mult_steps':'noise_dist')
    # sum them, then divide by the number of rows, which is the first number in df.shape
    summary = df.loc[:,'mult_steps':'noise_dist'].sum() / df.shape[0]

    #add the sample ID and isolation number into the result
    summary.name = sample_id
    summary['isolation'] = isolation

    #add our results to the list
    results_list.append(summary)

#finally, concatenate our results and save to 'sumary.csv'
results = pd.concat(results_list, axis=1)
results.to_csv('summary.csv')
	#!/usr/bin/env python
	from glob import glob
	import pandas as pd
	import re

	#keep a running list of results
	results_list = []

	#for each excel file, which we will call xlsx
	for xlsx in glob("*.xlsx"):
	#see if we can extract the ID and isolation number with a regular expression
	#regular expression: http://regexr.com/3glnf
	match = re.match(r'(\d+)_isolation(\d+)\.xlsx', xlsx)
	if match:
	#trick to convert the strings into numbers called "list comprehension"
	sample_id, isolation = [int(x) for x in match.groups()]
	else:
	print("File {} doesn't seem to be named like I thought it should be.".format(xlsx))
	continue

	#read in the data from excel into a pandas data frame
	df = pd.read_excel(xlsx)

	#select all rows (:) and columns from mult_steps to noise_dist ('mult_steps':'noise_dist')
	# sum them, then divide by the number of rows, which is the first number in df.shape
	summary = df.loc[:,'mult_steps':'noise_dist'].sum() / df.shape[0]

	#add the sample ID and isolation number into the result
	summary.name = sample_id
	summary['isolation'] = isolation

	#add our results to the list
	results_list.append(summary)

	#finally, concatenate our results and save to 'sumary.csv'
	results = pd.concat(results_list, axis=1)
	results.to_csv('summary.csv')