Skip to content

Instantly share code, notes, and snippets.

@mikkohei13
Created October 24, 2023 11:02
Show Gist options
  • Save mikkohei13/514c7a105c45ab3d8d1105df657edc54 to your computer and use it in GitHub Desktop.
Save mikkohei13/514c7a105c45ab3d8d1105df657edc54 to your computer and use it in GitHub Desktop.
Convert eBird GBIF simple archive to multiple DwC files for FinBIF ETL import
import pandas as pd
import numpy as np
import re
# Mikko Heikkinen 2020, 2022, 2023
# Columns to read from the source file. Order is not significant.
# Each column is listed once ('stateProvince' used to be repeated three times).
usecols = [
    'gbifID',
    'catalogNumber',
    'occurrenceID',
    'basisOfRecord',
    'eventDate',
    'locality',
    'stateProvince',
    'countryCode',
    'decimalLatitude',
    'decimalLongitude',
    'recordedBy',
    'individualCount',
    'verbatimScientificName',
    'taxonRank',
    'lastInterpreted',
]

# Source data
#filename = "test.csv" # DEBUG
filename = "ebird-2023.csv"

# Row-index limit at which the conversion loop stops
debugLimit = 10000000 # 10 M
#debugLimit = 1000 # DEBUG

# Number of output files the converted data is split into
numberOfFiles = 25
#numberOfFiles = 1 # DEBUG
# Load only the needed columns from the tab-separated source file
df = pd.read_csv(filename, sep='\t', usecols=usecols)

# Replace NaN values with empty string, so that missing fields can be
# handled as '' in the conversion loop below
df = df.fillna('')
print("Finished loading into dataframe")

# One dict per source row, keyed by column name
data = df.to_dict(orient='records')

dwList = []  # Converted observations, one dict per accepted row
locationDict = dict()  # Cleaned locality name -> number of observations using it
print("Started conversions")
# Must not be changed, so that old data can be updated, and id's stay persistent.
_collectionId = "http://tun.fi/HR.3691"

# Patterns removed from locality names, applied in this order.
# Compiled once here instead of on every row. Regexes made with ChatGPT.
_localityPatterns = tuple(re.compile(pattern) for pattern in (
    r'\([^)]*\)',  # Everything in parenthesis
    r'\d+\.\d+, \d+\.\d+',  # Coordinates separated with comma and space
    r'\d+\.\d+x\d+\.\d+',  # Coordinates with dots, separated with x
    r'\d+\,\d+x\d+\,\d+',  # Coordinates with commas, separated with x
    r'\d{1,2}\.\d{1,2}\.\d{4}\s\d{1,2}\.\d{2}',  # Some datetimes
))
# Currently disabled patterns:
# r'tie \d{1,3}' # Remove house numbers after tie, e.g. "Lintutie 13" -> "Lintutie "
# r'\b\d{1,3}(?=\s*\w*tie\b)' # Remove house numbers before tie, e.g. "13 Lintutie" -> " Lintutie"

# Odd place names and characters that are in at least 100 location names (2023-01-27).
# (old, new) pairs are applied top to bottom; the order is significant because
# earlier replacements change what later ones can match.
_localityReplacements = (
    ("_", " "),
    ("--", ", "),
    (" ,", ","),
    # NOTE(review): as written this replaces a single space with a single
    # space, i.e. does nothing. Presumably meant to collapse double spaces -
    # confirm against the original file before changing.
    (" ", " "),
    ("FI-", ""),
    ("FI ", ""),
    (", FI", ""),
    ("SK ", ""),
    ("VS ", ""),
    ("UM ", ""),
    ("UM, ", ""),
    ("PS ", ""),
    ("P ", ""),
    ("L ", ""),
    ("home yard", ""),
    ("home, yard", ""),
    ("Homeyard", ""),
    (", yard", ""),
    ("Mökki", ""),
    ("SN yard", ""),
    ("Aa My home", ""),
    (" home", ""),
    ("10300, ", ""),
    ("Home to Citymarket and back", ""),
    (" Lt", ", lintutorni"),
    ("11.8.2020 10.37-60,592, 24,432", ""),
    ("22 Apr 2017 19:14", ""),
    ("22.6.2021 21.45-60,35, 27,454", ""),
    ("koiran kanssa", ""),
    ("Finnature Tour", ""),
    ("exact locations not known", ""),
    ("tour de búhos", ""),
    ("-Neljan Tuulen Tupa", "Neljän tuulen tupa"),
    ("\t", ""),
)


def _cleanLocality(locality):
    """Return the locality name with coordinates, datetimes and known noise removed."""
    for pattern in _localityPatterns:
        locality = pattern.sub('', locality)

    # Full matches. Checked before the literal replacements and the final
    # strip, so only names that are exactly "Koti"/"Home" at this point are blanked.
    if locality in ("Koti", "Home"):
        locality = ""

    for old, new in _localityReplacements:
        locality = locality.replace(old, new)

    # locality = locality.lstrip("1234567890") # Remove leading house numbers
    return locality.strip(",.- ")


# Convert each source row to a Darwin Core style dict and collect them in dwList.
for i, ebirdObs in enumerate(data):
    # Check the limit first, so that exactly debugLimit rows are examined and
    # skipped rows cannot bypass the check.
    if i >= debugLimit:
        break

    # Only human observations are converted; everything else is reported and skipped.
    if "HUMAN_OBSERVATION" != ebirdObs['basisOfRecord']:
        print("\nSkipped " + ebirdObs['catalogNumber'] + "\n")
        continue

    locality = _cleanLocality(ebirdObs['locality'])
    # Count how many accepted observations use each cleaned locality name
    locationDict[locality] = locationDict.get(locality, 0) + 1
    # locality = locality.replace("OULU", "")

    individualCount = ebirdObs['individualCount']

    # Key order defines the column order of the output files.
    dwObs = {
        'collectionId': _collectionId,
        # catalogNumber (formerly identifier) is like OBS844370107
        # occurrenceID is like URN:catalog:CLO:EBIRD:OBS844370107
        'occurrenceID': _collectionId + "/" + ebirdObs['catalogNumber'],
        'catalogNumber': ebirdObs['catalogNumber'],
        'otherCatalogNumbers': ebirdObs['occurrenceID'],
        'basisOfRecord': "HumanObservation",
        'eventDate': ebirdObs['eventDate'].replace("T00:00:00", ""),
        'locality': locality,
        'stateProvince': ebirdObs['stateProvince'],
        'country': ebirdObs['countryCode'],
        'decimalLatitude': ebirdObs['decimalLatitude'],
        'decimalLongitude': ebirdObs['decimalLongitude'],
        'recordedBy': ebirdObs['recordedBy'],
        # Missing counts stay as empty string; present ones are written as integers
        'individualCount': "" if "" == individualCount else int(individualCount),
        'scientificName': ebirdObs['verbatimScientificName'],
        'taxonRank': ebirdObs['taxonRank'],
        # These will be saved as facts to FinBIF
        'dynamicProperties': {
            'lastInterpreted': ebirdObs['lastInterpreted'],
            'verbatimScientificName': ebirdObs['verbatimScientificName'],
            'gbifID': ebirdObs['gbifID'],
        },
    }
    dwList.append(dwObs)
# Pretty print
#print(json.dumps(dwList, indent=4))
print("Finished conversions")

dwDataFrame = pd.DataFrame(dwList)
#print(dwDataFrame.to_string())

# Write the converted rows into numberOfFiles tab-separated files of
# near-equal size, named ebird_0.txt, ebird_1.txt, ...
# NOTE(review): the dynamicProperties column holds dicts, which to_csv writes
# using Python's string form rather than JSON - confirm the importer expects that.
baseFilename = "ebird_{id}.txt"

# Split the row positions rather than the frame itself: np.array_split on a
# DataFrame relies on DataFrame.swapaxes, which pandas has deprecated.
rowPositionChunks = np.array_split(np.arange(len(dwDataFrame)), numberOfFiles)
for fileNumber, rowPositions in enumerate(rowPositionChunks):
    outFilename = baseFilename.format(id=fileNumber)
    dwDataFrame.iloc[rowPositions].to_csv(outFilename, sep="\t", index=False)
print("Created files")

print(str(len(locationDict)) + " locality names")
# Append locality names used by at least 10 observations to localities.txt,
# one "<name>_<count>" entry per line.
# NOTE(review): the file is opened in append mode, so entries from earlier
# runs are kept - confirm this is intended.
# Locality names contain non-ASCII characters (e.g. "Mökki"), so the encoding
# is set explicitly instead of relying on the platform default. The with block
# closes the file even if a write fails.
with open('localities.txt', 'a', encoding='utf-8') as result_f:
    result_f.writelines(
        f"{name}_{count}\n"
        for name, count in locationDict.items()
        if count >= 10
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment