FilipDominec/labbook_xls_filter.py

## labbook_xls_filter.py
#!/usr/bin/python3
#-*- coding: utf-8 -*-


"""
Parses one or more XLS files, each containing arbitrary number of sheets, and filters all rows for a
pattern as determined by filter_lines=... below. Prints matching names.

Typical invocation
    python3 ./xlsfilter.py *201{6,7,8}.{1,2}.xlsx | vi -

Supposed to help find interesting spectroscopic data on a shared network drive. Can be adapted for
any similar task in the future.
"""

## Import common moduli
import sys, os, time, re
from pathlib import Path
import pandas as pd
import collections as co


matching_names, matching_dates, matching_xls = [], [], []
errors = []
duplicities = []

for arg in sys.argv[1:]:
    #print('loading file {:}', arg);
    sheet_to_df_map = co.OrderedDict()

    xls = pd.ExcelFile(arg)
    skipped_sheet_names = []
    for sheet_name in xls.sheet_names:
        if re.search('\d\d\d\d', sheet_name):
            sheet_to_df_map[sheet_name] = xls.parse(sheet_name)
        else:
            skipped_sheet_names += [sheet_name]
    #print('Note: in file "{:}" parsed  sheets: {:} '.format(arg, sheet_to_df_map.keys()))
    #print('Note: in file "{:}" ignored sheets: {:} '.format(arg, skipped_sheet_names))

    for key,df in list(sheet_to_df_map.items())[::-1]:
        try:
            filter_lines = (df['Filtr (%)']==0.1) & (df['Acqt. (s)']==1) & (df['x']==-25000)
            new_names = list(df[filter_lines]['Name'].values)
            for nn in new_names:
                if nn in matching_names: duplicities.append(nn)
            matching_names += new_names
            matching_dates += [key[:]] * len(new_names)
            matching_xls   += [arg[:]] * len(new_names)
        except KeyError as e:
            errors.append("ERROR: could not parse or filter {:} in {:}:\n\t\t {:}".format(key,arg,e))
            #print(e)
matching_triplets = [n+'\t'+d+'\t'+a for (n,d,a) in zip(matching_names, matching_dates, matching_xls)]
#matching_triplets.sort()
for mp in matching_triplets:
    print(mp)
for err in errors:
    print(err)
if duplicities:
    print('WARNING: found {:} duplicities in above printed names: {:}'.format(len(duplicities),duplicities))
	#!/usr/bin/python3
	#-- coding: utf-8 --


	"""
	Parses one or more XLS files, each containing arbitrary number of sheets, and filters all rows for a
	pattern as determined by filter_lines=... below. Prints matching names.

	Typical invocation
	python3 ./xlsfilter.py *201{6,7,8}.{1,2}.xlsx \| vi -

	Supposed to help find interesting spectroscopic data on a shared network drive. Can be adapted for
	any similar task in the future.
	"""

	## Import common moduli
	import sys, os, time, re
	from pathlib import Path
	import pandas as pd
	import collections as co


	matching_names, matching_dates, matching_xls = [], [], []
	errors = []
	duplicities = []

	for arg in sys.argv[1:]:
	#print('loading file {:}', arg);
	sheet_to_df_map = co.OrderedDict()

	xls = pd.ExcelFile(arg)
	skipped_sheet_names = []
	for sheet_name in xls.sheet_names:
	if re.search('\d\d\d\d', sheet_name):
	sheet_to_df_map[sheet_name] = xls.parse(sheet_name)
	else:
	skipped_sheet_names += [sheet_name]
	#print('Note: in file "{:}" parsed sheets: {:} '.format(arg, sheet_to_df_map.keys()))
	#print('Note: in file "{:}" ignored sheets: {:} '.format(arg, skipped_sheet_names))

	for key,df in list(sheet_to_df_map.items())[::-1]:
	try:
	filter_lines = (df['Filtr (%)']==0.1) & (df['Acqt. (s)']==1) & (df['x']==-25000)
	new_names = list(df[filter_lines]['Name'].values)
	for nn in new_names:
	if nn in matching_names: duplicities.append(nn)
	matching_names += new_names
	matching_dates += [key[:]] * len(new_names)
	matching_xls += [arg[:]] * len(new_names)
	except KeyError as e:
	errors.append("ERROR: could not parse or filter {:} in {:}:\n\t\t {:}".format(key,arg,e))
	#print(e)
	matching_triplets = [n+'\t'+d+'\t'+a for (n,d,a) in zip(matching_names, matching_dates, matching_xls)]
	#matching_triplets.sort()
	for mp in matching_triplets:
	print(mp)
	for err in errors:
	print(err)
	if duplicities:
	print('WARNING: found {:} duplicities in above printed names: {:}'.format(len(duplicities),duplicities))