toebR/pd_gdb.py

## pd_gdb.py
import geopandas as gpd
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

#set pandas options so that all column names are diplayed when printes
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

######load gdb with arcpy and convert to pd dataframe#########
input = r'my_project.gdb\my_table'
arr = arcpy.da.TableToNumPyArray(input, '*')

# convert to a Pandas DataFrame
df = pd.DataFrame(arr)

#####################

#load .gdb
gdb = gpd.read_file(r"D:\learn_py\basics\data\_STILLBERG_GIS\_STILLBERG_GIS.gdb")

#plot gdb
gdb.plot()

#first few rows of gdb
gdb.head()

#transform to pd dataframe
df = pd.DataFrame(gdb)
df
#column names
df.columns

#select columns
sel = df[["Code_group", 'Code_description', "Quality_pos____hgt_", "Quality_pos_"]]
sel

#select rows (filter) with "pkt" and return the column Code_group and Code_description, if second statement is empty, it returns the whole df
sel.loc[(sel.Code_group == "pkt") & (sel.Code_description == "aluprofil"), ["Code_group", "Code_description"]]

#unique values in a column (returns array)
a = np.unique(sel[['Code']])
a


#mutate a new column based on values in 2 existing columns (deprec)
sel = sel.assign(test = sel["Quality_pos____hgt_"] - sel["Quality_pos_"])

#conditional column value formatting (TRUE, False can be swapped with strings of my own choice)
sel['ifesle'] = np.where((sel['Code_description'] == 'aluprofil') & (sel["Quality_pos____hgt_"] < 0.25), True, False)
sel

#plot hist of dataframe column
sel.hist(column="Quality_pos____hgt_")

#look into python list comprehension

#grouped summary statistics over multiple columns
summary_sel = sel.groupby(['ifesle', "Code_group"]).describe()

#count entries per categorial variable in column
summary_count = sel.groupby('ifesle')["Code_group"].count()
summary_count = pd.DataFrame(summary_count)

########################## calculate total of row in a new column
summary_count_coltot = summary_count.assign(total = np.sum(summary_count["Code_group"].values)) #add total to dataframe based on values in Code_group #total in a new column
summary_count_coltot #now with this car we can e.g. calcualte rowwise percentages etc.
summary_count_coltot = summary_count_coltot.reset_index() #turn descriptors to column name (reset index)
summary_count_coltot["ifesle"] = summary_count_coltot["ifesle"].astype(str) #turn boolians into strings for plotting
plt.bar(x = summary_count_coltot["ifesle"], height= summary_count_coltot["Code_group"])
############################

#calcualate total of column in a new row
summary_count.loc['Column_Total']= summary_count.sum(numeric_only=True, axis=0)
summary_count

summary_count = summary_count.reset_index() #turn descriptors to column name (reset index)
summary_count["ifesle"] = summary_count["ifesle"].astype(str) #turn ifesle into one columns type (now its a string and 2 boolians)

plt.bar(x = summary_count["ifesle"], height = summary_count["Code_group"]) #codegroups count per total

#for loop of features
bsp = range(100)
rangetolist = [num for num in bsp] #jede nummer im range object wird gepostet (conditions können geadded werden) --> gibt list zurück (nicht mehr eine range))
bsp = [num for num in bsp if num < 50] #[return for iterator in bsp if condition]

sel.shape #shape array def
sel.shape[0] #number of rows in the pandas dataframe

bsp = bsp[::-2]

#to loop over all rows of a pd dataframe e.g. (here base on entries of shape).
for x in range(len(bsp)):
    #print(sel[x,2])
    print(x)

for x in enumerate(bsp): #enumerate "enumerates" iterator entries  -  creates tupples with iterator values
    print(x)

for x,num in enumerate(bsp): #enumerate "enumerates" iterator entries  -  creates tupples with iterator values
    print(x)
    print(num) #adding a ",num" 'unzipps' the tupples into a singular vector (see print(num) result)


#function
def functioname(dfvar):
    result = dfvar.columns
    return result

functioname(sel)
	import geopandas as gpd
	import pandas as pd
	import numpy as np
	from matplotlib import pyplot as plt

	#set pandas options so that all column names are diplayed when printes
	pd.set_option('display.max_columns', None)
	pd.set_option('display.max_rows', None)

	######load gdb with arcpy and convert to pd dataframe#########
	input = r'my_project.gdb\my_table'
	arr = arcpy.da.TableToNumPyArray(input, '*')

	# convert to a Pandas DataFrame
	df = pd.DataFrame(arr)

	#####################

	#load .gdb
	gdb = gpd.read_file(r"D:\learn_py\basics\data\_STILLBERG_GIS\_STILLBERG_GIS.gdb")

	#plot gdb
	gdb.plot()

	#first few rows of gdb
	gdb.head()

	#transform to pd dataframe
	df = pd.DataFrame(gdb)
	df
	#column names
	df.columns

	#select columns
	sel = df[["Code_group", 'Code_description', "Quality_pos____hgt_", "Quality_pos_"]]
	sel

	#select rows (filter) with "pkt" and return the column Code_group and Code_description, if second statement is empty, it returns the whole df
	sel.loc[(sel.Code_group == "pkt") & (sel.Code_description == "aluprofil"), ["Code_group", "Code_description"]]

	#unique values in a column (returns array)
	a = np.unique(sel[['Code']])
	a


	#mutate a new column based on values in 2 existing columns (deprec)
	sel = sel.assign(test = sel["Quality_pos____hgt_"] - sel["Quality_pos_"])

	#conditional column value formatting (TRUE, False can be swapped with strings of my own choice)
	sel['ifesle'] = np.where((sel['Code_description'] == 'aluprofil') & (sel["Quality_pos____hgt_"] < 0.25), True, False)
	sel

	#plot hist of dataframe column
	sel.hist(column="Quality_pos____hgt_")

	#look into python list comprehension

	#grouped summary statistics over multiple columns
	summary_sel = sel.groupby(['ifesle', "Code_group"]).describe()

	#count entries per categorial variable in column
	summary_count = sel.groupby('ifesle')["Code_group"].count()
	summary_count = pd.DataFrame(summary_count)

	########################## calculate total of row in a new column
	summary_count_coltot = summary_count.assign(total = np.sum(summary_count["Code_group"].values)) #add total to dataframe based on values in Code_group #total in a new column
	summary_count_coltot #now with this car we can e.g. calcualte rowwise percentages etc.
	summary_count_coltot = summary_count_coltot.reset_index() #turn descriptors to column name (reset index)
	summary_count_coltot["ifesle"] = summary_count_coltot["ifesle"].astype(str) #turn boolians into strings for plotting
	plt.bar(x = summary_count_coltot["ifesle"], height= summary_count_coltot["Code_group"])
	############################

	#calcualate total of column in a new row
	summary_count.loc['Column_Total']= summary_count.sum(numeric_only=True, axis=0)
	summary_count

	summary_count = summary_count.reset_index() #turn descriptors to column name (reset index)
	summary_count["ifesle"] = summary_count["ifesle"].astype(str) #turn ifesle into one columns type (now its a string and 2 boolians)

	plt.bar(x = summary_count["ifesle"], height = summary_count["Code_group"]) #codegroups count per total

	#for loop of features
	bsp = range(100)
	rangetolist = [num for num in bsp] #jede nummer im range object wird gepostet (conditions können geadded werden) --> gibt list zurück (nicht mehr eine range))
	bsp = [num for num in bsp if num < 50] #[return for iterator in bsp if condition]

	sel.shape #shape array def
	sel.shape[0] #number of rows in the pandas dataframe

	bsp = bsp[::-2]

	#to loop over all rows of a pd dataframe e.g. (here base on entries of shape).
	for x in range(len(bsp)):
	#print(sel[x,2])
	print(x)

	for x in enumerate(bsp): #enumerate "enumerates" iterator entries - creates tupples with iterator values
	print(x)

	for x,num in enumerate(bsp): #enumerate "enumerates" iterator entries - creates tupples with iterator values
	print(x)
	print(num) #adding a ",num" 'unzipps' the tupples into a singular vector (see print(num) result)


	#function
	def functioname(dfvar):
	result = dfvar.columns
	return result

	functioname(sel)