Created
December 29, 2023 11:16
-
-
Save mikkohei13/5bacbad3c57bdefae29aa0cc3367d09d to your computer and use it in GitHub Desktop.
Convert Luke atlas dataset to FinBIF Data Bank format
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
Converts "Luken aineistokooste 4. lintuatlakseen" tsv file into FinBIF Data Bank secondary data format. | |
Mikko Heikkinen 2023-12-29 | |
''' | |
import pandas as pd | |
# Load the file into a Pandas dataframe using tab as the delimiter. Keep "NA" as a value. | |
file_path = 'luke_atlasdata.txt' | |
df = pd.read_csv(file_path, delimiter='\t', na_values=[], keep_default_na=False) | |
# FIX ERRORS FIRST | |
# Fix incorrect delimiters | |
df['Ruutu'] = df['Ruutu'].str.replace('.', ':') | |
# Fix incorrect species code | |
df['Laji'] = df['Laji'].str.replace('Trihyp', 'Acthyp') | |
# ADD IDENTIFIER | |
# Add a new "ID" column by concatenating "Ruutu", "Laji" and "Vuosi" with a dash as a separator. This should be persistent if dataset is updated. | |
df['ID'] = df['Ruutu'].astype(str) + '-' + df['Laji'] + '-' + df['Vuosi'].astype(str) | |
df['ID'] = df['ID'].str.replace(':', '-') # Replaces coordinate serparator | |
df['ID'] = df['ID'].str.lower() # Species codes to lowercase | |
# DATE CONVERSIONS | |
# Duplicate the 'Vuosi' column, since all data are from one year only | |
df['Loppu@yyyy - Yleinen keruutapahtuma'] = df['Vuosi'] | |
# Fill in begin and end dates | |
df['Alku@dd - Yleinen keruutapahtuma'] = 1 | |
df['Alku@mm - Yleinen keruutapahtuma'] = 1 | |
df['Loppu@dd - Yleinen keruutapahtuma'] = 29 # Date cannot be in the future, so use today's date. | |
df['Loppu@mm - Yleinen keruutapahtuma'] = 12 | |
# OBSERVER CONVERSIONS | |
# Use semicolon as a separator in "Havainnoijat" column | |
df['Havainnoijat'] = df['Havainnoijat'].str.replace(',', ';') | |
# Replace "NA" with "anonymous" as observer name | |
df['Havainnoijat'] = df['Havainnoijat'].replace('NA', 'Anonyymi') | |
# Make observer names hidden | |
df['Havainnoijien nimet ovat julkisia - Yleinen keruutapahtuma'] = 'Ei' | |
# COORDINATE CONVERSIONS | |
# Add a new coodinate system column with a constant value 'ykj' | |
df['Koordinaatit@System - Keruutapahtuma'] = 'ykj' | |
# Split the 'Ruutu' column into N and E columns | |
df[['Koordinaatit@N - Keruutapahtuma', 'Koordinaatit@E - Keruutapahtuma']] = df['Ruutu'].str.split(':', expand=True) | |
# CLEANUP | |
# Rename columns to match Data Bank format | |
df.rename(columns={ | |
'Vuosi': 'Alku@yyyy - Yleinen keruutapahtuma', | |
'Laji': 'Laji - Määritys', | |
'Pesimavarmuusindeksi': 'Pesimävarmuusindeksi - Havainto', | |
'Aineisto': 'Tietojen lähde - Havaintoerä', | |
'Havainnoijat': 'Havainnoijat - Yleinen keruutapahtuma' | |
}, inplace=True) | |
# Remove unneeded columns | |
df.drop(columns=['Lahde'], inplace=True) | |
df.drop(columns=['max_indeksi'], inplace=True) | |
df.drop(columns=['Ruutu'], inplace=True) | |
# Randomize order for test upload | |
#df = df.sample(frac=1).reset_index(drop=True) | |
# Save the dataframe to a CSV file with UTF-8 encoding with BOM | |
csv_file_path = 'output-bom.csv' | |
df.to_csv(csv_file_path, index=False, encoding='utf-8-sig') | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment