Created
October 5, 2018 14:30
-
-
Save jgarciabu/e6fec23ed90d6309970d9636866659fb to your computer and use it in GitHub Desktop.
File scan tool to report file metadata (number of columns, column names, filename, etc)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Created on Fri Jun 8 13:57:08 2018 | |
@author: jeffrey.garcia | |
""" | |
import pandas as pd | |
import os | |
import inspect; os.path | |
from os import listdir | |
import openpyxl | |
import xlrd | |
import fnmatch | |
filename = inspect.getframeinfo(inspect.currentframe()).filename | |
path = os.path.dirname(os.path.abspath(filename)) | |
firstfilenamelist = listdir(path) | |
filenames = [] | |
numberofcolumns = [] | |
columnnames = [] | |
worksheetnames = [] | |
precsvdict = {'Filename':filenames, 'Number of Columns': numberofcolumns, 'Column Details':columnnames, 'Worksheet Names':worksheetnames} | |
for file in firstfilenamelist: | |
if file.endswith(".csv"): | |
f = pd.read_csv(file,engine='python') | |
filenames.append(file) | |
numberofcolumns.append(len(f.columns)) | |
i = 0 | |
while fnmatch.fnmatch(str(f.columns.values), '*Unnamed:*'): | |
i += 1 | |
f = pd.read_csv(file, header=i) | |
else: | |
i = 0 | |
columnnames.append('Column Names: ' + str(list(f.columns.values)) + '\n') | |
worksheetnames.append('') | |
if file.endswith(".xlsx") or file.endswith(".xls"): | |
if file.endswith(".xlsx"): | |
wb = openpyxl.load_workbook(file) | |
sheets = wb.sheetnames | |
worksheetnames.append(sheets) | |
f = pd.read_excel(file) | |
filenames.append(file) | |
i = 0 | |
while fnmatch.fnmatch(str(f.columns.values), '*Unnamed:*'): | |
i += 1 | |
f = pd.read_excel(file, header=i) | |
pi=i | |
else: | |
pi = i | |
i = 0 | |
worksheetdatastring = '' | |
if len(sheets) < 2: | |
for sheet in sheets: | |
df = pd.read_excel(file,sheet_name=sheet,header=pi) | |
worksheetdatastring = worksheetdatastring + 'Worksheet Name: ' + sheet + ' \n ' + 'Column Names: ' + str(list(df.columns.values)) + '\n\n' | |
numberofcolumns.append(len(df.columns)) | |
columnnames.append(worksheetdatastring) | |
worksheetdatastring = '' | |
else: | |
for sheet in sheets: | |
df = pd.read_excel(file,sheet_name=sheet,header=pi) | |
worksheetdatastring = worksheetdatastring + 'Worksheet Name: ' + sheet + ' \n ' + 'Number of Columns: ' + str(len(df.columns)) + ' \n ' + 'Column Names: ' + str(list(df.columns.values)) + '\n\n' | |
#columnnames.append('Worksheet Name: ' + sheet + ' | ' + str(list(f.columns.values))) | |
numberofcolumns.append('See Column Details for Number of Columns and Column Names') | |
columnnames.append(worksheetdatastring) | |
worksheetdatastring = '' | |
if file.endswith(".xls"): | |
wb = xlrd.open_workbook(file, on_demand=True) | |
sheets = wb.sheet_names() | |
worksheetnames.append(sheets) | |
f = pd.read_excel(file) | |
filenames.append(file) | |
i = 0 | |
while fnmatch.fnmatch(str(f.columns.values), '*Unnamed:*'): | |
i += 1 | |
f = pd.read_excel(file, header=i) | |
pi = i | |
else: | |
pi = i | |
i = 0 | |
worksheetdatastring = '' | |
if len(sheets) < 2: | |
for sheet in sheets: | |
df = pd.read_excel(file,sheet_name=sheet, header=pi) | |
worksheetdatastring = worksheetdatastring + 'Worksheet Name: ' + sheet + ' \n ' + 'Column Names: ' + str(list(df.columns.values)) + '\n\n' | |
numberofcolumns.append(len(df.columns)) | |
columnnames.append(worksheetdatastring) | |
worksheetdatastring = '' | |
else: | |
for sheet in sheets: | |
df = pd.read_excel(file,sheet_name=sheet,header=pi) | |
worksheetdatastring = worksheetdatastring + 'Worksheet Name: ' + sheet + ' \n ' + 'Number of Columns: ' + str(len(df.columns)) + ' \n ' + 'Column Names: ' + str(list(df.columns.values)) + '\n\n' | |
#columnnames.append('Worksheet Name: ' + sheet + ' | ' + str(list(f.columns.values))) | |
numberofcolumns.append('See Column Details for Number of Columns and Column Names') | |
columnnames.append(worksheetdatastring) | |
worksheetdatastring = '' | |
else: | |
continue | |
outputdf = pd.DataFrame.from_dict(data=precsvdict) | |
outputdf.to_csv('filelist.csv',header=True, index = False) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment