Skip to content

Instantly share code, notes, and snippets.

@FilipDominec
Created January 1, 2019 22:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save FilipDominec/70173979cfe644139a64191e9276c530 to your computer and use it in GitHub Desktop.
Save FilipDominec/70173979cfe644139a64191e9276c530 to your computer and use it in GitHub Desktop.
Parses one or more XLS files, each containing arbitrary number of sheets, and filters all rows for a pattern
#!/usr/bin/python3
#-*- coding: utf-8 -*-
"""
Parses one or more XLS files, each containing arbitrary number of sheets, and filters all rows for a
pattern as determined by filter_lines=... below. Prints matching names.
Typical invocation
python3 ./xlsfilter.py *201{6,7,8}.{1,2}.xlsx | vi -
Supposed to help find interesting spectroscopic data on a shared network drive. Can be adapted for
any similar task in the future.
"""
## Import common moduli
import sys, os, time, re
from pathlib import Path
import pandas as pd
import collections as co
matching_names, matching_dates, matching_xls = [], [], []
errors = []
duplicities = []
for arg in sys.argv[1:]:
#print('loading file {:}', arg);
sheet_to_df_map = co.OrderedDict()
xls = pd.ExcelFile(arg)
skipped_sheet_names = []
for sheet_name in xls.sheet_names:
if re.search('\d\d\d\d', sheet_name):
sheet_to_df_map[sheet_name] = xls.parse(sheet_name)
else:
skipped_sheet_names += [sheet_name]
#print('Note: in file "{:}" parsed sheets: {:} '.format(arg, sheet_to_df_map.keys()))
#print('Note: in file "{:}" ignored sheets: {:} '.format(arg, skipped_sheet_names))
for key,df in list(sheet_to_df_map.items())[::-1]:
try:
filter_lines = (df['Filtr (%)']==0.1) & (df['Acqt. (s)']==1) & (df['x']==-25000)
new_names = list(df[filter_lines]['Name'].values)
for nn in new_names:
if nn in matching_names: duplicities.append(nn)
matching_names += new_names
matching_dates += [key[:]] * len(new_names)
matching_xls += [arg[:]] * len(new_names)
except KeyError as e:
errors.append("ERROR: could not parse or filter {:} in {:}:\n\t\t {:}".format(key,arg,e))
#print(e)
matching_triplets = [n+'\t'+d+'\t'+a for (n,d,a) in zip(matching_names, matching_dates, matching_xls)]
#matching_triplets.sort()
for mp in matching_triplets:
print(mp)
for err in errors:
print(err)
if duplicities:
print('WARNING: found {:} duplicities in above printed names: {:}'.format(len(duplicities),duplicities))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment