Created
October 24, 2023 11:02
-
-
Save mikkohei13/514c7a105c45ab3d8d1105df657edc54 to your computer and use it in GitHub Desktop.
Convert eBird GBIF simple archive to multiple DwC files for FinBIF ETL import
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import re | |
# Mikko Heikkinen 2020, 2022, 2023 | |
# Defines which columns to read from the source file. Order is not significant.
# Each column is listed exactly once.
usecols = [
    'gbifID',
    'catalogNumber',
    'occurrenceID',
    'basisOfRecord',
    'eventDate',
    'locality',
    'stateProvince',
    'countryCode',
    'decimalLatitude',
    'decimalLongitude',
    'recordedBy',
    'individualCount',
    'verbatimScientificName',
    'taxonRank',
    'lastInterpreted',
]
# Source data
#filename = "test.csv" # DEBUG
filename = "ebird-2023.csv"

# Row limit used by the conversion loop
debugLimit = 10000000 # 10 M
#debugLimit = 1000 # DEBUG

# Number of output files the converted data is split into
numberOfFiles = 25
#numberOfFiles = 1 # DEBUG

# Load only the needed columns from the tab-separated source file
df = pd.read_csv(filename, sep='\t', usecols=usecols)

# Replace NaN values with empty string
df = df.fillna('')
print("Finished loading into dataframe")

# One dict per source row, keyed by column name
data = df.to_dict(orient='records')

dwList = []             # converted records, in source order
locationDict = dict()   # cleaned locality name -> number of records using it

print("Started conversions")
# Patterns removed from locality names. Compiled once here instead of on every
# row of the conversion loop. Regexes made with ChatGPT.
_LOCALITY_PATTERNS = (
    re.compile(r'\([^)]*\)'),                                   # Everything in parenthesis
    re.compile(r'\d+\.\d+, \d+\.\d+'),                          # Coordinates separated with comma and space
    re.compile(r'\d+\.\d+x\d+\.\d+'),                           # Coordinates with dots, separated with x
    re.compile(r'\d+\,\d+x\d+\,\d+'),                           # Coordinates with commas, separated with x
    re.compile(r'\d{1,2}\.\d{1,2}\.\d{4}\s\d{1,2}\.\d{2}'),     # Some datetimes
)
# Disabled patterns, kept for reference:
#   r'tie \d{1,3}'                  house numbers after tie, e.g. "Lintutie 13" -> "Lintutie "
#   r'\b\d{1,3}(?=\s*\w*tie\b)'     house numbers before tie, e.g. "13 Lintutie" -> " Lintutie"

# Odd place names and characters that are in at least 100 location names
# (2023-01-27), as (old, new) pairs. They are applied in this order, and the
# order matters because later pairs see the output of earlier ones.
_LOCALITY_REPLACEMENTS = (
    ("_", " "),
    ("--", ", "),
    (" ,", ","),
    ("  ", " "),    # Collapse doubled spaces (the single-space version was a no-op)
    ("FI-", ""),
    ("FI ", ""),
    (", FI", ""),
    ("SK ", ""),
    ("VS ", ""),
    ("UM ", ""),
    ("UM, ", ""),
    ("PS ", ""),
    ("P ", ""),
    ("L ", ""),
    ("home yard", ""),
    ("home, yard", ""),
    ("Homeyard", ""),
    (", yard", ""),
    ("Mökki", ""),
    ("SN yard", ""),
    ("Aa My home", ""),
    (" home", ""),
    ("10300, ", ""),
    ("Home to Citymarket and back", ""),
    (" Lt", ", lintutorni"),
    ("11.8.2020 10.37-60,592, 24,432", ""),
    ("22 Apr 2017 19:14", ""),
    ("22.6.2021 21.45-60,35, 27,454", ""),
    ("koiran kanssa", ""),
    ("Finnature Tour", ""),
    ("exact locations not known", ""),
    ("tour de búhos", ""),
    ("-Neljan Tuulen Tupa", "Neljän tuulen tupa"),
    ("\t", ""),
)


def _clean_locality(locality):
    """Return the locality name with coordinates, datetimes and known junk removed."""
    for pattern in _LOCALITY_PATTERNS:
        locality = pattern.sub('', locality)

    # Full matches
    if locality in ("Koti", "Home"):
        locality = ""

    for old, new in _LOCALITY_REPLACEMENTS:
        locality = locality.replace(old, new)

    # Disabled: locality.lstrip("1234567890") to remove leading house numbers
    return locality.strip(",.- ")


for i, ebirdObs in enumerate(data):
    # Checked before converting, so that at most debugLimit source rows are
    # processed (checking after the append let one extra row through).
    if i >= debugLimit:
        break

    # Only human observations are converted; everything else is reported and skipped.
    if "HUMAN_OBSERVATION" != ebirdObs['basisOfRecord']:
        print("\nSkipped " + ebirdObs['catalogNumber'] + "\n")
        continue

    # Must not be changed, so that old data can be updated, and id's stay persistent.
    collectionId = "http://tun.fi/HR.3691"

    locality = _clean_locality(ebirdObs['locality'])
    locationDict[locality] = locationDict.get(locality, 0) + 1

    # Empty counts stay as empty strings; everything else is written as an integer.
    if "" == ebirdObs['individualCount']:
        individualCount = ""
    else:
        individualCount = int(ebirdObs['individualCount'])

    # Key order here defines the column order of the output files.
    dwObs = {
        'collectionId': collectionId,
        # catalogNumber (formerly identifier) is like OBS844370107
        # occurrenceID is like URN:catalog:CLO:EBIRD:OBS844370107
        'occurrenceID': collectionId + "/" + ebirdObs['catalogNumber'],
        'catalogNumber': ebirdObs['catalogNumber'],
        'otherCatalogNumbers': ebirdObs['occurrenceID'],
        'basisOfRecord': "HumanObservation",
        'eventDate': ebirdObs['eventDate'].replace("T00:00:00", ""),
        'locality': locality,
        'stateProvince': ebirdObs['stateProvince'],
        'country': ebirdObs['countryCode'],
        'decimalLatitude': ebirdObs['decimalLatitude'],
        'decimalLongitude': ebirdObs['decimalLongitude'],
        'recordedBy': ebirdObs['recordedBy'],
        'individualCount': individualCount,
        'scientificName': ebirdObs['verbatimScientificName'],
        'taxonRank': ebirdObs['taxonRank'],
        # These will be saved as facts to FinBIF
        'dynamicProperties': {
            'lastInterpreted': ebirdObs['lastInterpreted'],
            'verbatimScientificName': ebirdObs['verbatimScientificName'],
            'gbifID': ebirdObs['gbifID'],
        },
    }
    dwList.append(dwObs)
# Pretty print for debugging (requires `import json`)
#print(json.dumps(dwList, indent=4))
print("Finished conversions")

dwDataFrame = pd.DataFrame(dwList)
#print(dwDataFrame.to_string())

baseFilename = "ebird_{id}.txt"

# Split the rows into numberOfFiles consecutive chunks of near-equal size: the
# first `filesWithExtraRow` files get one row more than the rest. These are the
# same chunk sizes np.array_split produces, but slicing with iloc avoids routing
# a DataFrame through NumPy (which relies on the deprecated DataFrame.swapaxes).
rowsPerFile, filesWithExtraRow = divmod(len(dwDataFrame), numberOfFiles)
start = 0
for fileIndex in range(numberOfFiles):
    stop = start + rowsPerFile + (1 if fileIndex < filesWithExtraRow else 0)
    dwDataFrame.iloc[start:stop].to_csv(baseFilename.format(id=fileIndex), sep="\t", index=False)
    start = stop

print("Created files")
print(str(len(locationDict)) + " locality names")

# Append every locality name used by at least 10 records, one "name_count" per
# line. The context manager closes the file even if a write fails, and the
# explicit encoding keeps non-ASCII names writable regardless of platform default.
with open('localities.txt', 'a', encoding='utf-8') as result_f:
    result_f.writelines(
        f"{name}_{count}\n"
        for name, count in locationDict.items()
        if count >= 10
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment