# reuf/pandaPDFTableExtractExcelCleanup.py

## pandaPDFTableExtractExcelCleanup.py
#import tabula
#pages=[212,213]
#df = tabula.read_pdf('Knjiga_mrtvih.pdf', pages = '212')[0]

#df.columns = df.columns.str.replace('\r', ' ')
#data = df.dropna()
#print(df)
#data.to_excel('data.xlsx')

# Standard library.
import glob
import os

# Third-party. openpyxl is imported but not referenced directly in the
# visible code.
import openpyxl
import pandas as pd
from tabula import read_pdf

# Change into "./" (the current directory); relative file names are
# therefore resolved against the directory the script was started from.
os.chdir("./")
# for filename in glob.glob("*.xlsx"):
#     print(filename)
#     table = pd.read_excel(filename, skiprows=list(range(1)))
#     table.to_excel(filename, index=False)

# empty data frame for the new output excel file with the merged excel files


# # Excel (.xlsx) files in the path
# file_list = glob.glob("*.xlsx")
# for file in file_list:
#     print(file)
#     table = pd.read_excel(file, header=None)
#     table.set_axis(
#         ["last_name", "first_name", "fathers_name", "date_of_birth", "place_of_birth", "nationality", "status",
#          "formation", "date_of_death", "municipality_of_death"], axis=1, inplace=True)
#     table.to_excel(file, index=False)


# # list of excel files we want to merge.
# # pd.read_excel(file_path) reads the
# # excel data into pandas dataframe.
# excl_list = []
#
# for file in file_list:
#     excl_list.append(pd.read_excel(file))
#
# # concatenate all DataFrames in the list
# # into a single DataFrame, returns new
# # DataFrame.
# excl_merged = pd.concat(excl_list, ignore_index=True)
#
# # exports the dataframe into excel file
# # with specified name.
# excl_merged.to_excel('Srebrenica.xlsx', index=False)


# excl_list = []
#
# for filename in glob.glob("*.xlsx"):
#     print(filename)
#     excl_list.append(pd.read_excel(filename))
#
# excl_merged = pd.DataFrame()
#
# for excl_file in excl_list:
#     # appends the data into the excl_merged
#     # dataframe.
#     excl_merged = excl_merged.append(excl_file, ignore_index=True)
#
# # exports the dataframe into excel file with
# # specified name.
# excl_merged.to_excel('Sarajevo.xlsx', index=False, header=False)
    # table = pd.read_excel(filename)
    # print(len(list(table.columns)))
    # if (len(list(table.columns)) > 10):
    #     print(len(list(table.columns)))
    #     table.drop(table.columns[9], axis=1, inplace=True)
    #     table.to_excel(filename, index=False)
    # if( 'Prezime' in list(table.columns)[0]):
    #     print(filename)
    #     print(list(table.columns)[0])
        # table.to_excel(filename, index=False, header=False)
    # print(str(table.iat[0,0]) + str(table.iat[0,1]))
    # if (table.iat[0,0].contains("Prezime")):
    #     print(filename)
    # table.dropna(how='all', axis=1, inplace=True)
    # table.to_excel(filename, index=False)

# for page in range(499, 495):
#     table = read_pdf("Knjiga_mrtvih.pdf", pages='499', pandas_options={'header': None})[0]
#     table.to_excel("page-"+str(page)+".xlsx", index=False)
	#import tabula
	#pages=[212,213]
	#df = tabula.read_pdf('Knjiga_mrtvih.pdf', pages = '212')[0]

	#df.columns = df.columns.str.replace('\r', ' ')
	#data = df.dropna()
	#print(df)
	#data.to_excel('data.xlsx')

	from tabula import read_pdf
	import pandas as pd
	import openpyxl
	import glob, os
	os.chdir("./")
	# for filename in glob.glob("*.xlsx"):
	# print(filename)
	# table = pd.read_excel(filename, skiprows=list(range(1)))
	# table.to_excel(filename, index=False)

	# empty data frame for the new output excel file with the merged excel files




	# # Excel (.xlsx) files in the path
	# file_list = glob.glob("*.xlsx")
	# for file in file_list:
	# print(file)
	# table = pd.read_excel(file, header=None)
	# table.set_axis(
	# ["last_name", "first_name", "fathers_name", "date_of_birth", "place_of_birth", "nationality", "status",
	# "formation", "date_of_death", "municipality_of_death"], axis=1, inplace=True)
	# table.to_excel(file, index=False)








	# # list of excel files we want to merge.
	# # pd.read_excel(file_path) reads the
	# # excel data into pandas dataframe.
	# excl_list = []
	#
	# for file in file_list:
	# excl_list.append(pd.read_excel(file))
	#
	# # concatenate all DataFrames in the list
	# # into a single DataFrame, returns new
	# # DataFrame.
	# excl_merged = pd.concat(excl_list, ignore_index=True)
	#
	# # exports the dataframe into excel file
	# # with specified name.
	# excl_merged.to_excel('Srebrenica.xlsx', index=False)








	# excl_list = []
	#
	# for filename in glob.glob("*.xlsx"):
	# print(filename)
	# excl_list.append(pd.read_excel(filename))
	#
	# excl_merged = pd.DataFrame()
	#
	# for excl_file in excl_list:
	# # appends the data into the excl_merged
	# # dataframe.
	# excl_merged = excl_merged.append(excl_file, ignore_index=True)
	#
	# # exports the dataframe into excel file with
	# # specified name.
	# excl_merged.to_excel('Sarajevo.xlsx', index=False, header=False)
	# table = pd.read_excel(filename)
	# print(len(list(table.columns)))
	# if (len(list(table.columns)) > 10):
	# print(len(list(table.columns)))
	# table.drop(table.columns[9], axis=1, inplace=True)
	# table.to_excel(filename, index=False)
	# if( 'Prezime' in list(table.columns)[0]):
	# print(filename)
	# print(list(table.columns)[0])
	# table.to_excel(filename, index=False, header=False)
	# print(str(table.iat[0,0]) + str(table.iat[0,1]))
	# if (table.iat[0,0].contains("Prezime")):
	# print(filename)
	# table.dropna(how='all', axis=1, inplace=True)
	# table.to_excel(filename, index=False)

	# for page in range(499, 495):
	# table = read_pdf("Knjiga_mrtvih.pdf", pages='499', pandas_options={'header': None})[0]
	# table.to_excel("page-"+str(page)+".xlsx", index=False)