Skip to content

Instantly share code, notes, and snippets.

@jgarciabu
Created October 5, 2018 14:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jgarciabu/e6fec23ed90d6309970d9636866659fb to your computer and use it in GitHub Desktop.
Save jgarciabu/e6fec23ed90d6309970d9636866659fb to your computer and use it in GitHub Desktop.
File scan tool to report file metadata (number of columns, column names, filename, etc)
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 8 13:57:08 2018
@author: jeffrey.garcia
"""
import pandas as pd
import os
import inspect; os.path
from os import listdir
import openpyxl
import xlrd
import fnmatch
filename = inspect.getframeinfo(inspect.currentframe()).filename
path = os.path.dirname(os.path.abspath(filename))
firstfilenamelist = listdir(path)
filenames = []
numberofcolumns = []
columnnames = []
worksheetnames = []
precsvdict = {'Filename':filenames, 'Number of Columns': numberofcolumns, 'Column Details':columnnames, 'Worksheet Names':worksheetnames}
for file in firstfilenamelist:
if file.endswith(".csv"):
f = pd.read_csv(file,engine='python')
filenames.append(file)
numberofcolumns.append(len(f.columns))
i = 0
while fnmatch.fnmatch(str(f.columns.values), '*Unnamed:*'):
i += 1
f = pd.read_csv(file, header=i)
else:
i = 0
columnnames.append('Column Names: ' + str(list(f.columns.values)) + '\n')
worksheetnames.append('')
if file.endswith(".xlsx") or file.endswith(".xls"):
if file.endswith(".xlsx"):
wb = openpyxl.load_workbook(file)
sheets = wb.sheetnames
worksheetnames.append(sheets)
f = pd.read_excel(file)
filenames.append(file)
i = 0
while fnmatch.fnmatch(str(f.columns.values), '*Unnamed:*'):
i += 1
f = pd.read_excel(file, header=i)
pi=i
else:
pi = i
i = 0
worksheetdatastring = ''
if len(sheets) < 2:
for sheet in sheets:
df = pd.read_excel(file,sheet_name=sheet,header=pi)
worksheetdatastring = worksheetdatastring + 'Worksheet Name: ' + sheet + ' \n ' + 'Column Names: ' + str(list(df.columns.values)) + '\n\n'
numberofcolumns.append(len(df.columns))
columnnames.append(worksheetdatastring)
worksheetdatastring = ''
else:
for sheet in sheets:
df = pd.read_excel(file,sheet_name=sheet,header=pi)
worksheetdatastring = worksheetdatastring + 'Worksheet Name: ' + sheet + ' \n ' + 'Number of Columns: ' + str(len(df.columns)) + ' \n ' + 'Column Names: ' + str(list(df.columns.values)) + '\n\n'
#columnnames.append('Worksheet Name: ' + sheet + ' | ' + str(list(f.columns.values)))
numberofcolumns.append('See Column Details for Number of Columns and Column Names')
columnnames.append(worksheetdatastring)
worksheetdatastring = ''
if file.endswith(".xls"):
wb = xlrd.open_workbook(file, on_demand=True)
sheets = wb.sheet_names()
worksheetnames.append(sheets)
f = pd.read_excel(file)
filenames.append(file)
i = 0
while fnmatch.fnmatch(str(f.columns.values), '*Unnamed:*'):
i += 1
f = pd.read_excel(file, header=i)
pi = i
else:
pi = i
i = 0
worksheetdatastring = ''
if len(sheets) < 2:
for sheet in sheets:
df = pd.read_excel(file,sheet_name=sheet, header=pi)
worksheetdatastring = worksheetdatastring + 'Worksheet Name: ' + sheet + ' \n ' + 'Column Names: ' + str(list(df.columns.values)) + '\n\n'
numberofcolumns.append(len(df.columns))
columnnames.append(worksheetdatastring)
worksheetdatastring = ''
else:
for sheet in sheets:
df = pd.read_excel(file,sheet_name=sheet,header=pi)
worksheetdatastring = worksheetdatastring + 'Worksheet Name: ' + sheet + ' \n ' + 'Number of Columns: ' + str(len(df.columns)) + ' \n ' + 'Column Names: ' + str(list(df.columns.values)) + '\n\n'
#columnnames.append('Worksheet Name: ' + sheet + ' | ' + str(list(f.columns.values)))
numberofcolumns.append('See Column Details for Number of Columns and Column Names')
columnnames.append(worksheetdatastring)
worksheetdatastring = ''
else:
continue
outputdf = pd.DataFrame.from_dict(data=precsvdict)
outputdf.to_csv('filelist.csv',header=True, index = False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment