Skip to content

Instantly share code, notes, and snippets.

@toebR
Created October 24, 2022 10:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save toebR/fe1f60e501320260652b822978cfaea8 to your computer and use it in GitHub Desktop.
Save toebR/fe1f60e501320260652b822978cfaea8 to your computer and use it in GitHub Desktop.
data wrangling basics in python pandas
import geopandas as gpd
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
#set pandas options so that all column names are diplayed when printes
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
######load gdb with arcpy and convert to pd dataframe#########
input = r'my_project.gdb\my_table'
arr = arcpy.da.TableToNumPyArray(input, '*')
# convert to a Pandas DataFrame
df = pd.DataFrame(arr)
#####################
#load .gdb
gdb = gpd.read_file(r"D:\learn_py\basics\data\_STILLBERG_GIS\_STILLBERG_GIS.gdb")
#plot gdb
gdb.plot()
#first few rows of gdb
gdb.head()
#transform to pd dataframe
df = pd.DataFrame(gdb)
df
#column names
df.columns
#select columns
sel = df[["Code_group", 'Code_description', "Quality_pos____hgt_", "Quality_pos_"]]
sel
#select rows (filter) with "pkt" and return the column Code_group and Code_description, if second statement is empty, it returns the whole df
sel.loc[(sel.Code_group == "pkt") & (sel.Code_description == "aluprofil"), ["Code_group", "Code_description"]]
#unique values in a column (returns array)
a = np.unique(sel[['Code']])
a
#mutate a new column based on values in 2 existing columns (deprec)
sel = sel.assign(test = sel["Quality_pos____hgt_"] - sel["Quality_pos_"])
#conditional column value formatting (TRUE, False can be swapped with strings of my own choice)
sel['ifesle'] = np.where((sel['Code_description'] == 'aluprofil') & (sel["Quality_pos____hgt_"] < 0.25), True, False)
sel
#plot hist of dataframe column
sel.hist(column="Quality_pos____hgt_")
#look into python list comprehension
#grouped summary statistics over multiple columns
summary_sel = sel.groupby(['ifesle', "Code_group"]).describe()
#count entries per categorial variable in column
summary_count = sel.groupby('ifesle')["Code_group"].count()
summary_count = pd.DataFrame(summary_count)
########################## calculate total of row in a new column
summary_count_coltot = summary_count.assign(total = np.sum(summary_count["Code_group"].values)) #add total to dataframe based on values in Code_group #total in a new column
summary_count_coltot #now with this car we can e.g. calcualte rowwise percentages etc.
summary_count_coltot = summary_count_coltot.reset_index() #turn descriptors to column name (reset index)
summary_count_coltot["ifesle"] = summary_count_coltot["ifesle"].astype(str) #turn boolians into strings for plotting
plt.bar(x = summary_count_coltot["ifesle"], height= summary_count_coltot["Code_group"])
############################
#calcualate total of column in a new row
summary_count.loc['Column_Total']= summary_count.sum(numeric_only=True, axis=0)
summary_count
summary_count = summary_count.reset_index() #turn descriptors to column name (reset index)
summary_count["ifesle"] = summary_count["ifesle"].astype(str) #turn ifesle into one columns type (now its a string and 2 boolians)
plt.bar(x = summary_count["ifesle"], height = summary_count["Code_group"]) #codegroups count per total
#for loop of features
bsp = range(100)
rangetolist = [num for num in bsp] #jede nummer im range object wird gepostet (conditions können geadded werden) --> gibt list zurück (nicht mehr eine range))
bsp = [num for num in bsp if num < 50] #[return for iterator in bsp if condition]
sel.shape #shape array def
sel.shape[0] #number of rows in the pandas dataframe
bsp = bsp[::-2]
#to loop over all rows of a pd dataframe e.g. (here base on entries of shape).
for x in range(len(bsp)):
#print(sel[x,2])
print(x)
for x in enumerate(bsp): #enumerate "enumerates" iterator entries - creates tupples with iterator values
print(x)
for x,num in enumerate(bsp): #enumerate "enumerates" iterator entries - creates tupples with iterator values
print(x)
print(num) #adding a ",num" 'unzipps' the tupples into a singular vector (see print(num) result)
#function
def functioname(dfvar):
result = dfvar.columns
return result
functioname(sel)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment