Skip to content

Instantly share code, notes, and snippets.

@reuf
Created February 17, 2022 20:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save reuf/363bab430d4cdf74aa9054ba4daff579 to your computer and use it in GitHub Desktop.
Save reuf/363bab430d4cdf74aa9054ba4daff579 to your computer and use it in GitHub Desktop.
#import tabula
#pages=[212,213]
#df = tabula.read_pdf('Knjiga_mrtvih.pdf', pages = '212')[0]
#df.columns = df.columns.str.replace('\r', ' ')
#data = df.dropna()
#print(df)
#data.to_excel('data.xlsx')
from tabula import read_pdf
import pandas as pd
import openpyxl
import glob, os
# Set the working directory to "./", i.e. the process's current working
# directory, so this call leaves the directory effectively unchanged.
os.chdir("./")
# for filename in glob.glob("*.xlsx"):
#     print(filename)
#     table = pd.read_excel(filename, skiprows=list(range(1)))
#     table.to_excel(filename, index=False)
# empty data frame for the new output excel file with the merged excel files
# # Excel (.xlsx) files in the current directory
# file_list = glob.glob("*.xlsx")
# for file in file_list:
# print(file)
# table = pd.read_excel(file, header=None)
# table.set_axis(
# ["last_name", "first_name", "fathers_name", "date_of_birth", "place_of_birth", "nationality", "status",
# "formation", "date_of_death", "municipality_of_death"], axis=1, inplace=True)
# table.to_excel(file, index=False)
# # list of excel files we want to merge.
# # pd.read_excel(file_path) reads the
# # excel data into pandas dataframe.
# excl_list = []
#
# for file in file_list:
# excl_list.append(pd.read_excel(file))
#
# # concatenate all DataFrames in the list
# # into a single DataFrame, returns new
# # DataFrame.
# excl_merged = pd.concat(excl_list, ignore_index=True)
#
# # exports the dataframe into excel file
# # with specified name.
# excl_merged.to_excel('Srebrenica.xlsx', index=False)
# excl_list = []
#
# for filename in glob.glob("*.xlsx"):
# print(filename)
# excl_list.append(pd.read_excel(filename))
#
# excl_merged = pd.DataFrame()
#
# for excl_file in excl_list:
# # appends the data into the excl_merged
# # dataframe.
# excl_merged = excl_merged.append(excl_file, ignore_index=True)
#
# # exports the dataframe into excel file with
# # specified name.
# excl_merged.to_excel('Sarajevo.xlsx', index=False, header=False)
# table = pd.read_excel(filename)
# print(len(list(table.columns)))
# if (len(list(table.columns)) > 10):
# print(len(list(table.columns)))
# table.drop(table.columns[9], axis=1, inplace=True)
# table.to_excel(filename, index=False)
# if( 'Prezime' in list(table.columns)[0]):
# print(filename)
# print(list(table.columns)[0])
# table.to_excel(filename, index=False, header=False)
# print(str(table.iat[0,0]) + str(table.iat[0,1]))
# if (table.iat[0,0].contains("Prezime")):
# print(filename)
# table.dropna(how='all', axis=1, inplace=True)
# table.to_excel(filename, index=False)
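# NOTE: range(499, 495) is empty, so this loop would never run as written, and
# pages='499' is hard-coded even though the output file is named after `page`.
# for page in range(499, 495):
#     table = read_pdf("Knjiga_mrtvih.pdf", pages='499', pandas_options={'header': None})[0]
#     table.to_excel("page-"+str(page)+".xlsx", index=False)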
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment