Skip to content

Instantly share code, notes, and snippets.

@mikkohei13
Created December 29, 2023 11:16
Show Gist options
  • Save mikkohei13/5bacbad3c57bdefae29aa0cc3367d09d to your computer and use it in GitHub Desktop.
Save mikkohei13/5bacbad3c57bdefae29aa0cc3367d09d to your computer and use it in GitHub Desktop.
Convert Luke atlas dataset to FinBIF Data Bank format
'''
Converts "Luken aineistokooste 4. lintuatlakseen" tsv file into FinBIF Data Bank secondary data format.
Mikko Heikkinen 2023-12-29
'''
import pandas as pd
# Load the file into a Pandas dataframe using tab as the delimiter. Keep "NA" as a value.
file_path = 'luke_atlasdata.txt'
df = pd.read_csv(file_path, delimiter='\t', na_values=[], keep_default_na=False)
# FIX ERRORS FIRST
# Fix incorrect delimiters
df['Ruutu'] = df['Ruutu'].str.replace('.', ':')
# Fix incorrect species code
df['Laji'] = df['Laji'].str.replace('Trihyp', 'Acthyp')
# ADD IDENTIFIER
# Add a new "ID" column by concatenating "Ruutu", "Laji" and "Vuosi" with a dash as a separator. This should be persistent if dataset is updated.
df['ID'] = df['Ruutu'].astype(str) + '-' + df['Laji'] + '-' + df['Vuosi'].astype(str)
df['ID'] = df['ID'].str.replace(':', '-') # Replaces coordinate serparator
df['ID'] = df['ID'].str.lower() # Species codes to lowercase
# DATE CONVERSIONS
# Duplicate the 'Vuosi' column, since all data are from one year only
df['Loppu@yyyy - Yleinen keruutapahtuma'] = df['Vuosi']
# Fill in begin and end dates
df['Alku@dd - Yleinen keruutapahtuma'] = 1
df['Alku@mm - Yleinen keruutapahtuma'] = 1
df['Loppu@dd - Yleinen keruutapahtuma'] = 29 # Date cannot be in the future, so use today's date.
df['Loppu@mm - Yleinen keruutapahtuma'] = 12
# OBSERVER CONVERSIONS
# Use semicolon as a separator in "Havainnoijat" column
df['Havainnoijat'] = df['Havainnoijat'].str.replace(',', ';')
# Replace "NA" with "anonymous" as observer name
df['Havainnoijat'] = df['Havainnoijat'].replace('NA', 'Anonyymi')
# Make observer names hidden
df['Havainnoijien nimet ovat julkisia - Yleinen keruutapahtuma'] = 'Ei'
# COORDINATE CONVERSIONS
# Add a new coodinate system column with a constant value 'ykj'
df['Koordinaatit@System - Keruutapahtuma'] = 'ykj'
# Split the 'Ruutu' column into N and E columns
df[['Koordinaatit@N - Keruutapahtuma', 'Koordinaatit@E - Keruutapahtuma']] = df['Ruutu'].str.split(':', expand=True)
# CLEANUP
# Rename columns to match Data Bank format
df.rename(columns={
'Vuosi': 'Alku@yyyy - Yleinen keruutapahtuma',
'Laji': 'Laji - Määritys',
'Pesimavarmuusindeksi': 'Pesimävarmuusindeksi - Havainto',
'Aineisto': 'Tietojen lähde - Havaintoerä',
'Havainnoijat': 'Havainnoijat - Yleinen keruutapahtuma'
}, inplace=True)
# Remove unneeded columns
df.drop(columns=['Lahde'], inplace=True)
df.drop(columns=['max_indeksi'], inplace=True)
df.drop(columns=['Ruutu'], inplace=True)
# Randomize order for test upload
#df = df.sample(frac=1).reset_index(drop=True)
# Save the dataframe to a CSV file with UTF-8 encoding with BOM
csv_file_path = 'output-bom.csv'
df.to_csv(csv_file_path, index=False, encoding='utf-8-sig')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment