Skip to content

Instantly share code, notes, and snippets.

@brey
Created September 2, 2023 13:00
Show Gist options
  • Save brey/44d88a6ef943a7de1c70981afb147328 to your computer and use it in GitHub Desktop.
Save brey/44d88a6ef943a7de1c70981afb147328 to your computer and use it in GitHub Desktop.
remove station duplicates
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "15df7d74-9dd8-40a6-937b-967c22d93f1d",
"metadata": {},
"source": [
"# Remove duplicate stations from tide gauge location datasets"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "e124303c-771d-408f-8662-8d2f67d120a8",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "c71d22d6-724a-49fe-8ec7-486177552d51",
"metadata": {},
"outputs": [],
"source": [
"# Location of the input catalogue CSV; this is the only line to edit when running elsewhere.\n",
"CATALOG_CSV = '/Volumes/B1TB_1/catalog_new.csv'\n",
"s = pd.read_csv(CATALOG_CSV, index_col=[0])  # first CSV column becomes the index"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "2db31dd1-8713-4315-9005-d3ae6b9830f6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Station_Name</th>\n",
" <th>ioc_code</th>\n",
" <th>gloss_id</th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" <th>Country</th>\n",
" <th>connection</th>\n",
" <th>contacts</th>\n",
" <th>dcp_id</th>\n",
" <th>last_observation_level</th>\n",
" <th>...</th>\n",
" <th>start_date_time</th>\n",
" <th>end_date_time</th>\n",
" <th>number_of_years</th>\n",
" <th>time_zone_hours</th>\n",
" <th>datum_information</th>\n",
" <th>instrument</th>\n",
" <th>precision</th>\n",
" <th>null_value</th>\n",
" <th>gauge_type</th>\n",
" <th>overall_record_quality</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>A Coru\\u00f1a 2 tide gauge</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>43.357000</td>\n",
" <td>-8.389000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>A Coru\\u00f1a ACOR1 TG tide gauge</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>43.364400</td>\n",
" <td>-8.398900</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>A Coruña</td>\n",
" <td>acor1</td>\n",
" <td>NaN</td>\n",
" <td>43.364000</td>\n",
" <td>-8.399000</td>\n",
" <td>Spain</td>\n",
" <td>web</td>\n",
" <td>Instituto Geográfico Nacional ( Spain )</td>\n",
" <td>NaN</td>\n",
" <td>4.27</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>A121</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>55.400002</td>\n",
" <td>3.810000</td>\n",
" <td>NLD</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2015-06-28 00:00:00</td>\n",
" <td>2020-10-06 07:40:00</td>\n",
" <td>6.0</td>\n",
" <td>0.0</td>\n",
" <td>MSL</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>No obvious issues</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>A2</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>51.360600</td>\n",
" <td>3.118330</td>\n",
" <td>BEL</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2020-02-28 00:00:00</td>\n",
" <td>2021-01-12 08:55:00</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>MSL</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>No obvious issues</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6096</th>\n",
" <td>\\u00d6lands norra udd</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>57.366100</td>\n",
" <td>17.097200</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6097</th>\n",
" <td>papho</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>34.755100</td>\n",
" <td>32.408798</td>\n",
" <td>CYP</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2014-11-01 00:00:00</td>\n",
" <td>2016-08-26 01:00:00</td>\n",
" <td>3.0</td>\n",
" <td>0.0</td>\n",
" <td>Unknown</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>Possible quality control issues</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6098</th>\n",
" <td>s_Gravendeel</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>51.779151</td>\n",
" <td>4.625854</td>\n",
" <td>NLD</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>1970-10-30 07:00:00</td>\n",
" <td>1983-12-31 22:00:00</td>\n",
" <td>14.0</td>\n",
" <td>0.0</td>\n",
" <td>Normal Amsterdam Level</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>River</td>\n",
" <td>No obvious issues</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6099</th>\n",
" <td>s_Gravendeel_haven</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>51.782206</td>\n",
" <td>4.625803</td>\n",
" <td>NLD</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>1983-12-31 23:00:00</td>\n",
" <td>1992-12-31 22:50:00</td>\n",
" <td>10.0</td>\n",
" <td>0.0</td>\n",
" <td>Normal Amsterdam Level</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>River</td>\n",
" <td>No obvious issues</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6100</th>\n",
" <td>Ålesund</td>\n",
" <td>ales</td>\n",
" <td>NaN</td>\n",
" <td>62.469000</td>\n",
" <td>6.152000</td>\n",
" <td>Norway</td>\n",
" <td>web</td>\n",
" <td>Norwegian Hydrographic Service ( Norway )</td>\n",
" <td>NaN</td>\n",
" <td>2.05</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6101 rows × 71 columns</p>\n",
"</div>"
],
"text/plain": [
" Station_Name ioc_code gloss_id latitude \\\n",
"0 A Coru\\u00f1a 2 tide gauge NaN NaN 43.357000 \n",
"1 A Coru\\u00f1a ACOR1 TG tide gauge NaN NaN 43.364400 \n",
"2 A Coruña acor1 NaN 43.364000 \n",
"3 A121 NaN NaN 55.400002 \n",
"4 A2 NaN NaN 51.360600 \n",
"... ... ... ... ... \n",
"6096 \\u00d6lands norra udd NaN NaN 57.366100 \n",
"6097 papho NaN NaN 34.755100 \n",
"6098 s_Gravendeel NaN NaN 51.779151 \n",
"6099 s_Gravendeel_haven NaN NaN 51.782206 \n",
"6100 Ålesund ales NaN 62.469000 \n",
"\n",
" longitude Country connection contacts \\\n",
"0 -8.389000 NaN NaN NaN \n",
"1 -8.398900 NaN NaN NaN \n",
"2 -8.399000 Spain web Instituto Geográfico Nacional ( Spain ) \n",
"3 3.810000 NLD NaN NaN \n",
"4 3.118330 BEL NaN NaN \n",
"... ... ... ... ... \n",
"6096 17.097200 NaN NaN NaN \n",
"6097 32.408798 CYP NaN NaN \n",
"6098 4.625854 NLD NaN NaN \n",
"6099 4.625803 NLD NaN NaN \n",
"6100 6.152000 Norway web Norwegian Hydrographic Service ( Norway ) \n",
"\n",
" dcp_id last_observation_level ... start_date_time \\\n",
"0 NaN NaN ... NaN \n",
"1 NaN NaN ... NaN \n",
"2 NaN 4.27 ... NaN \n",
"3 NaN NaN ... 2015-06-28 00:00:00 \n",
"4 NaN NaN ... 2020-02-28 00:00:00 \n",
"... ... ... ... ... \n",
"6096 NaN NaN ... NaN \n",
"6097 NaN NaN ... 2014-11-01 00:00:00 \n",
"6098 NaN NaN ... 1970-10-30 07:00:00 \n",
"6099 NaN NaN ... 1983-12-31 23:00:00 \n",
"6100 NaN 2.05 ... NaN \n",
"\n",
" end_date_time number_of_years time_zone_hours \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 2020-10-06 07:40:00 6.0 0.0 \n",
"4 2021-01-12 08:55:00 2.0 0.0 \n",
"... ... ... ... \n",
"6096 NaN NaN NaN \n",
"6097 2016-08-26 01:00:00 3.0 0.0 \n",
"6098 1983-12-31 22:00:00 14.0 0.0 \n",
"6099 1992-12-31 22:50:00 10.0 0.0 \n",
"6100 NaN NaN NaN \n",
"\n",
" datum_information instrument precision null_value gauge_type \\\n",
"0 NaN NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN NaN \n",
"3 MSL Unspecified Unspecified -99.9999 Coastal \n",
"4 MSL Unspecified Unspecified -99.9999 Coastal \n",
"... ... ... ... ... ... \n",
"6096 NaN NaN NaN NaN NaN \n",
"6097 Unknown Unspecified Unspecified -99.9999 Coastal \n",
"6098 Normal Amsterdam Level Unspecified Unspecified -99.9999 River \n",
"6099 Normal Amsterdam Level Unspecified Unspecified -99.9999 River \n",
"6100 NaN NaN NaN NaN NaN \n",
"\n",
" overall_record_quality \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 No obvious issues \n",
"4 No obvious issues \n",
"... ... \n",
"6096 NaN \n",
"6097 Possible quality control issues \n",
"6098 No obvious issues \n",
"6099 No obvious issues \n",
"6100 NaN \n",
"\n",
"[6101 rows x 71 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "c72c5605-dbb1-43d2-a9fe-c385b02e64de",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of rows whose Station_Name repeats an earlier row.\n",
"s['Station_Name'].duplicated().sum()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "b29c926d-df32-4ac1-9db3-b2dcc14002cc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"280"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows that repeat an earlier row's exact (latitude, longitude) pair.\n",
"s.duplicated(subset=['latitude','longitude']).sum()  # non-zero: there are duplicates"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "89f1c829-4b44-4b68-961d-01489ee24c82",
"metadata": {},
"outputs": [],
"source": [
"# keep=False marks every member of a duplicated (latitude, longitude) group, not only the repeats.\n",
"dmask = s.duplicated(subset=['latitude','longitude'], keep=False)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "6e172fa9-cab8-4905-92ea-95571ef233ab",
"metadata": {},
"outputs": [],
"source": [
"# Keep only the first row for each exact (latitude, longitude) pair.\n",
"s1 = s.drop_duplicates(subset=['latitude','longitude'], keep='first')"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "e758c7af-a55d-48e5-8f64-dac9b76fbd11",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Station_Name</th>\n",
" <th>ioc_code</th>\n",
" <th>gloss_id</th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" <th>Country</th>\n",
" <th>connection</th>\n",
" <th>contacts</th>\n",
" <th>dcp_id</th>\n",
" <th>last_observation_level</th>\n",
" <th>...</th>\n",
" <th>start_date_time</th>\n",
" <th>end_date_time</th>\n",
" <th>number_of_years</th>\n",
" <th>time_zone_hours</th>\n",
" <th>datum_information</th>\n",
" <th>instrument</th>\n",
" <th>precision</th>\n",
" <th>null_value</th>\n",
" <th>gauge_type</th>\n",
" <th>overall_record_quality</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>A Coru\\u00f1a 2 tide gauge</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>43.357000</td>\n",
" <td>-8.389000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>A Coru\\u00f1a ACOR1 TG tide gauge</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>43.364400</td>\n",
" <td>-8.398900</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>A Coruña</td>\n",
" <td>acor1</td>\n",
" <td>NaN</td>\n",
" <td>43.364000</td>\n",
" <td>-8.399000</td>\n",
" <td>Spain</td>\n",
" <td>web</td>\n",
" <td>Instituto Geográfico Nacional ( Spain )</td>\n",
" <td>NaN</td>\n",
" <td>4.27</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>A121</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>55.400002</td>\n",
" <td>3.810000</td>\n",
" <td>NLD</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2015-06-28 00:00:00</td>\n",
" <td>2020-10-06 07:40:00</td>\n",
" <td>6.0</td>\n",
" <td>0.0</td>\n",
" <td>MSL</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>No obvious issues</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>A2</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>51.360600</td>\n",
" <td>3.118330</td>\n",
" <td>BEL</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2020-02-28 00:00:00</td>\n",
" <td>2021-01-12 08:55:00</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>MSL</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>No obvious issues</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6095</th>\n",
" <td>Zygi</td>\n",
" <td>zygi</td>\n",
" <td>NaN</td>\n",
" <td>34.727000</td>\n",
" <td>33.338000</td>\n",
" <td>Cyprus</td>\n",
" <td>ftp</td>\n",
" <td>Cyprus Oceanography Center ( Cyprus )</td>\n",
" <td>ZYGI1</td>\n",
" <td>1.91</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6097</th>\n",
" <td>papho</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>34.755100</td>\n",
" <td>32.408798</td>\n",
" <td>CYP</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2014-11-01 00:00:00</td>\n",
" <td>2016-08-26 01:00:00</td>\n",
" <td>3.0</td>\n",
" <td>0.0</td>\n",
" <td>Unknown</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>Possible quality control issues</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6098</th>\n",
" <td>s_Gravendeel</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>51.779151</td>\n",
" <td>4.625854</td>\n",
" <td>NLD</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>1970-10-30 07:00:00</td>\n",
" <td>1983-12-31 22:00:00</td>\n",
" <td>14.0</td>\n",
" <td>0.0</td>\n",
" <td>Normal Amsterdam Level</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>River</td>\n",
" <td>No obvious issues</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6099</th>\n",
" <td>s_Gravendeel_haven</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>51.782206</td>\n",
" <td>4.625803</td>\n",
" <td>NLD</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>1983-12-31 23:00:00</td>\n",
" <td>1992-12-31 22:50:00</td>\n",
" <td>10.0</td>\n",
" <td>0.0</td>\n",
" <td>Normal Amsterdam Level</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>River</td>\n",
" <td>No obvious issues</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6100</th>\n",
" <td>Ålesund</td>\n",
" <td>ales</td>\n",
" <td>NaN</td>\n",
" <td>62.469000</td>\n",
" <td>6.152000</td>\n",
" <td>Norway</td>\n",
" <td>web</td>\n",
" <td>Norwegian Hydrographic Service ( Norway )</td>\n",
" <td>NaN</td>\n",
" <td>2.05</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5821 rows × 71 columns</p>\n",
"</div>"
],
"text/plain": [
" Station_Name ioc_code gloss_id latitude \\\n",
"0 A Coru\\u00f1a 2 tide gauge NaN NaN 43.357000 \n",
"1 A Coru\\u00f1a ACOR1 TG tide gauge NaN NaN 43.364400 \n",
"2 A Coruña acor1 NaN 43.364000 \n",
"3 A121 NaN NaN 55.400002 \n",
"4 A2 NaN NaN 51.360600 \n",
"... ... ... ... ... \n",
"6095 Zygi zygi NaN 34.727000 \n",
"6097 papho NaN NaN 34.755100 \n",
"6098 s_Gravendeel NaN NaN 51.779151 \n",
"6099 s_Gravendeel_haven NaN NaN 51.782206 \n",
"6100 Ålesund ales NaN 62.469000 \n",
"\n",
" longitude Country connection contacts \\\n",
"0 -8.389000 NaN NaN NaN \n",
"1 -8.398900 NaN NaN NaN \n",
"2 -8.399000 Spain web Instituto Geográfico Nacional ( Spain ) \n",
"3 3.810000 NLD NaN NaN \n",
"4 3.118330 BEL NaN NaN \n",
"... ... ... ... ... \n",
"6095 33.338000 Cyprus ftp Cyprus Oceanography Center ( Cyprus ) \n",
"6097 32.408798 CYP NaN NaN \n",
"6098 4.625854 NLD NaN NaN \n",
"6099 4.625803 NLD NaN NaN \n",
"6100 6.152000 Norway web Norwegian Hydrographic Service ( Norway ) \n",
"\n",
" dcp_id last_observation_level ... start_date_time \\\n",
"0 NaN NaN ... NaN \n",
"1 NaN NaN ... NaN \n",
"2 NaN 4.27 ... NaN \n",
"3 NaN NaN ... 2015-06-28 00:00:00 \n",
"4 NaN NaN ... 2020-02-28 00:00:00 \n",
"... ... ... ... ... \n",
"6095 ZYGI1 1.91 ... NaN \n",
"6097 NaN NaN ... 2014-11-01 00:00:00 \n",
"6098 NaN NaN ... 1970-10-30 07:00:00 \n",
"6099 NaN NaN ... 1983-12-31 23:00:00 \n",
"6100 NaN 2.05 ... NaN \n",
"\n",
" end_date_time number_of_years time_zone_hours \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 2020-10-06 07:40:00 6.0 0.0 \n",
"4 2021-01-12 08:55:00 2.0 0.0 \n",
"... ... ... ... \n",
"6095 NaN NaN NaN \n",
"6097 2016-08-26 01:00:00 3.0 0.0 \n",
"6098 1983-12-31 22:00:00 14.0 0.0 \n",
"6099 1992-12-31 22:50:00 10.0 0.0 \n",
"6100 NaN NaN NaN \n",
"\n",
" datum_information instrument precision null_value gauge_type \\\n",
"0 NaN NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN NaN \n",
"2 NaN NaN NaN NaN NaN \n",
"3 MSL Unspecified Unspecified -99.9999 Coastal \n",
"4 MSL Unspecified Unspecified -99.9999 Coastal \n",
"... ... ... ... ... ... \n",
"6095 NaN NaN NaN NaN NaN \n",
"6097 Unknown Unspecified Unspecified -99.9999 Coastal \n",
"6098 Normal Amsterdam Level Unspecified Unspecified -99.9999 River \n",
"6099 Normal Amsterdam Level Unspecified Unspecified -99.9999 River \n",
"6100 NaN NaN NaN NaN NaN \n",
"\n",
" overall_record_quality \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 No obvious issues \n",
"4 No obvious issues \n",
"... ... \n",
"6095 NaN \n",
"6097 Possible quality control issues \n",
"6098 No obvious issues \n",
"6099 No obvious issues \n",
"6100 NaN \n",
"\n",
"[5821 rows x 71 columns]"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s1"
]
},
{
"cell_type": "markdown",
"id": "401c6212-dbfa-42f3-9139-a4502dfe4bee",
"metadata": {},
"source": [
"### Find latitude/longitude duplicates within a tolerance"
]
},
{
"cell_type": "markdown",
"id": "643144b6-c2ba-421b-b4be-55691c8259a1",
"metadata": {},
"source": [
"Approach adapted from this Stack Overflow question: https://stackoverflow.com/questions/65166038/how-to-find-duplicate-rows-in-a-dataframe-with-given-tolerance-based-on-a-single"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "acb4e5ed-6f11-442d-9e4d-b8eefdac71f3",
"metadata": {},
"outputs": [],
"source": [
"# Order rows by latitude, then longitude, ahead of the neighbour-to-neighbour comparison.\n",
"s2 = s1.sort_values(by=['latitude','longitude'])"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "2268b4a9-2ffe-48dc-81a4-9a66e106c8aa",
"metadata": {},
"outputs": [],
"source": [
"# Rows are sorted by (latitude, longitude), so each row is compared with its predecessor.\n",
"# .abs() is required: a signed longitude step of e.g. -100 is also '< 0.02' and would\n",
"# wrongly flag stations that merely share a similar latitude.\n",
"# The first row has no predecessor (NaN difference) and therefore compares as False.\n",
"ds = s2[['latitude','longitude']].diff().abs() < [0.02,0.02]  # tolerance in coordinate units (presumably degrees)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "0f99ce6a-4144-4d00-8b45-6a5d963f47fc",
"metadata": {},
"outputs": [],
"source": [
"# True only where both the latitude test and the longitude test passed.\n",
"ds_ = ds.all(axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "65de5f06-dcb0-4f6a-842b-e166d1194af4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Station_Name</th>\n",
" <th>ioc_code</th>\n",
" <th>gloss_id</th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" <th>Country</th>\n",
" <th>connection</th>\n",
" <th>contacts</th>\n",
" <th>dcp_id</th>\n",
" <th>last_observation_level</th>\n",
" <th>...</th>\n",
" <th>start_date_time</th>\n",
" <th>end_date_time</th>\n",
" <th>number_of_years</th>\n",
" <th>time_zone_hours</th>\n",
" <th>datum_information</th>\n",
" <th>instrument</th>\n",
" <th>precision</th>\n",
" <th>null_value</th>\n",
" <th>gauge_type</th>\n",
" <th>overall_record_quality</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>5397</th>\n",
" <td>Syowa</td>\n",
" <td>syow</td>\n",
" <td>95.0</td>\n",
" <td>-69.008000</td>\n",
" <td>39.570000</td>\n",
" <td>Antarctica</td>\n",
" <td>web</td>\n",
" <td>Japan Coast Guard ( Japan)</td>\n",
" <td>NaN</td>\n",
" <td>11.90</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5399</th>\n",
" <td>Syowa_Station_Antarctica</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>-69.007783</td>\n",
" <td>39.570283</td>\n",
" <td>ATA</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>1986-12-31 21:00:00</td>\n",
" <td>2019-12-31 20:00:00</td>\n",
" <td>33.0</td>\n",
" <td>0.0</td>\n",
" <td>Zero of Tide Height</td>\n",
" <td>Float</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>Possible datum issues</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1422</th>\n",
" <td>Dumont_dUrville</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>-66.662000</td>\n",
" <td>140.010000</td>\n",
" <td>ATA</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2008-01-16 07:00:00</td>\n",
" <td>2010-08-19 20:00:00</td>\n",
" <td>3.0</td>\n",
" <td>0.0</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>No obvious issues</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1421</th>\n",
" <td>Dumont_D_Urville</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>-66.661668</td>\n",
" <td>140.010001</td>\n",
" <td>FRA</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>1952-05-01 15:00:00</td>\n",
" <td>2000-03-02 05:00:00</td>\n",
" <td>8.0</td>\n",
" <td>0.0</td>\n",
" <td>Zero Hydrographique</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>No obvious issues</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1420</th>\n",
" <td>DumontDUrville_60minute</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>-66.661667</td>\n",
" <td>140.009990</td>\n",
" <td>FRA</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>1952-05-01 15:00:00</td>\n",
" <td>2017-11-30 23:00:00</td>\n",
" <td>22.0</td>\n",
" <td>0.0</td>\n",
" <td>chart datum</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>Possible datum issues</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>761</th>\n",
" <td>Cameron_Island_West_Side</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>76.500000</td>\n",
" <td>-104.580000</td>\n",
" <td>CAN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>1985-03-13 21:00:00</td>\n",
" <td>1985-04-25 21:00:00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>Chart Datum (CD)</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>No obvious issues</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3730</th>\n",
" <td>Ny_Alesund</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>78.928543</td>\n",
" <td>11.938015</td>\n",
" <td>NOR</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2014-01-01 00:00:00</td>\n",
" <td>2020-10-10 15:00:00</td>\n",
" <td>7.0</td>\n",
" <td>0.0</td>\n",
" <td>MSL</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>No obvious issues</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3731</th>\n",
" <td>Ny_alesund</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>78.928545</td>\n",
" <td>11.938015</td>\n",
" <td>NOR</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>1976-08-30 23:00:00</td>\n",
" <td>2020-12-31 23:00:00</td>\n",
" <td>45.0</td>\n",
" <td>0.0</td>\n",
" <td>Chart Datum</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>No obvious issues</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3729</th>\n",
" <td>Ny Ålesund</td>\n",
" <td>nyal</td>\n",
" <td>345.0</td>\n",
" <td>78.929000</td>\n",
" <td>11.938000</td>\n",
" <td>Norway</td>\n",
" <td>web</td>\n",
" <td>Norwegian Hydrographic Service ( Norway )</td>\n",
" <td>NaN</td>\n",
" <td>1.02</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3732</th>\n",
" <td>Nylesund</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>78.938000</td>\n",
" <td>11.938000</td>\n",
" <td>NOR</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>1976-08-30 23:00:00</td>\n",
" <td>2018-12-30 22:00:00</td>\n",
" <td>42.0</td>\n",
" <td>0.0</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>No obvious issues</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4330 rows × 71 columns</p>\n",
"</div>"
],
"text/plain": [
" Station_Name ioc_code gloss_id latitude longitude \\\n",
"5397 Syowa syow 95.0 -69.008000 39.570000 \n",
"5399 Syowa_Station_Antarctica NaN NaN -69.007783 39.570283 \n",
"1422 Dumont_dUrville NaN NaN -66.662000 140.010000 \n",
"1421 Dumont_D_Urville NaN NaN -66.661668 140.010001 \n",
"1420 DumontDUrville_60minute NaN NaN -66.661667 140.009990 \n",
"... ... ... ... ... ... \n",
"761 Cameron_Island_West_Side NaN NaN 76.500000 -104.580000 \n",
"3730 Ny_Alesund NaN NaN 78.928543 11.938015 \n",
"3731 Ny_alesund NaN NaN 78.928545 11.938015 \n",
"3729 Ny Ålesund nyal 345.0 78.929000 11.938000 \n",
"3732 Nylesund NaN NaN 78.938000 11.938000 \n",
"\n",
" Country connection contacts dcp_id \\\n",
"5397 Antarctica web Japan Coast Guard ( Japan) NaN \n",
"5399 ATA NaN NaN NaN \n",
"1422 ATA NaN NaN NaN \n",
"1421 FRA NaN NaN NaN \n",
"1420 FRA NaN NaN NaN \n",
"... ... ... ... ... \n",
"761 CAN NaN NaN NaN \n",
"3730 NOR NaN NaN NaN \n",
"3731 NOR NaN NaN NaN \n",
"3729 Norway web Norwegian Hydrographic Service ( Norway ) NaN \n",
"3732 NOR NaN NaN NaN \n",
"\n",
" last_observation_level ... start_date_time end_date_time \\\n",
"5397 11.90 ... NaN NaN \n",
"5399 NaN ... 1986-12-31 21:00:00 2019-12-31 20:00:00 \n",
"1422 NaN ... 2008-01-16 07:00:00 2010-08-19 20:00:00 \n",
"1421 NaN ... 1952-05-01 15:00:00 2000-03-02 05:00:00 \n",
"1420 NaN ... 1952-05-01 15:00:00 2017-11-30 23:00:00 \n",
"... ... ... ... ... \n",
"761 NaN ... 1985-03-13 21:00:00 1985-04-25 21:00:00 \n",
"3730 NaN ... 2014-01-01 00:00:00 2020-10-10 15:00:00 \n",
"3731 NaN ... 1976-08-30 23:00:00 2020-12-31 23:00:00 \n",
"3729 1.02 ... NaN NaN \n",
"3732 NaN ... 1976-08-30 23:00:00 2018-12-30 22:00:00 \n",
"\n",
" number_of_years time_zone_hours datum_information instrument \\\n",
"5397 NaN NaN NaN NaN \n",
"5399 33.0 0.0 Zero of Tide Height Float \n",
"1422 3.0 0.0 Unspecified Unspecified \n",
"1421 8.0 0.0 Zero Hydrographique Unspecified \n",
"1420 22.0 0.0 chart datum Unspecified \n",
"... ... ... ... ... \n",
"761 1.0 0.0 Chart Datum (CD) Unspecified \n",
"3730 7.0 0.0 MSL Unspecified \n",
"3731 45.0 0.0 Chart Datum Unspecified \n",
"3729 NaN NaN NaN NaN \n",
"3732 42.0 0.0 Unspecified Unspecified \n",
"\n",
" precision null_value gauge_type overall_record_quality \n",
"5397 NaN NaN NaN NaN \n",
"5399 Unspecified -99.9999 Coastal Possible datum issues \n",
"1422 Unspecified -99.9999 Coastal No obvious issues \n",
"1421 Unspecified -99.9999 Coastal No obvious issues \n",
"1420 Unspecified -99.9999 Coastal Possible datum issues \n",
"... ... ... ... ... \n",
"761 Unspecified -99.9999 Coastal No obvious issues \n",
"3730 Unspecified -99.9999 Coastal No obvious issues \n",
"3731 Unspecified -99.9999 Coastal No obvious issues \n",
"3729 NaN NaN NaN NaN \n",
"3732 Unspecified -99.9999 Coastal No obvious issues \n",
"\n",
"[4330 rows x 71 columns]"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show every row that ds_ flags together with the row directly before it\n",
"# (shift(-1) moves each flag up by one position).\n",
"s2[ds_ | ds_.shift(periods=-1)]"
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "978855e6-e983-49f2-8b00-38a12d156d7f",
"metadata": {},
"outputs": [],
"source": [
"# Labels to drop: every second row, starting from the first, among the rows\n",
"# that ds_ flags or that sit directly before a flagged row.\n",
"# NOTE(review): the [::2] step presumably relies on flagged rows forming\n",
"# consecutive pairs; confirm there are no runs of three or more.\n",
"dps = s2[ds_ | ds_.shift(periods=-1)].index[::2].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "ce94dbe4-dc84-49b9-ae4b-2cf1339ee6ed",
"metadata": {},
"outputs": [],
"source": [
"# Remove the rows listed in dps (matched against the index labels of s2).\n",
"s3 = s2.drop(index=dps)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "785b6b0c-7726-4404-add2-d37ed8d96a99",
"metadata": {},
"outputs": [],
"source": [
"s3 = s3.reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "968c156f-8648-4f1c-a4fe-477b907dc1d9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count rows whose Station_Name already appeared in an earlier row.\n",
"s3['Station_Name'].duplicated().sum()"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "4266408f-71fa-4da0-9f2b-75b4b8451089",
"metadata": {},
"outputs": [],
"source": [
"# Move Station_Name to a column called location: append a copy under the new\n",
"# name, then discard the original column.\n",
"s3 = s3.assign(location=s3['Station_Name']).drop(columns='Station_Name')"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "6da092c8-bd66-4ed7-91c9-a0daf0d7f032",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ioc_code</th>\n",
" <th>gloss_id</th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" <th>Country</th>\n",
" <th>connection</th>\n",
" <th>contacts</th>\n",
" <th>dcp_id</th>\n",
" <th>last_observation_level</th>\n",
" <th>last_observation_time</th>\n",
" <th>...</th>\n",
" <th>end_date_time</th>\n",
" <th>number_of_years</th>\n",
" <th>time_zone_hours</th>\n",
" <th>datum_information</th>\n",
" <th>instrument</th>\n",
" <th>precision</th>\n",
" <th>null_value</th>\n",
" <th>gauge_type</th>\n",
" <th>overall_record_quality</th>\n",
" <th>location</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>-77.850000</td>\n",
" <td>166.767000</td>\n",
" <td>NZL</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2017-06-13 12:00:00</td>\n",
" <td>14.0</td>\n",
" <td>0.0</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>No obvious issues</td>\n",
" <td>Scott_Base</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>-69.007783</td>\n",
" <td>39.570283</td>\n",
" <td>ATA</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2019-12-31 20:00:00</td>\n",
" <td>33.0</td>\n",
" <td>0.0</td>\n",
" <td>Zero of Tide Height</td>\n",
" <td>Float</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>Possible datum issues</td>\n",
" <td>Syowa_Station_Antarctica</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>-69.000000</td>\n",
" <td>39.600000</td>\n",
" <td>ATA</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2015-12-31 20:00:00</td>\n",
" <td>29.0</td>\n",
" <td>0.0</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>No obvious issues</td>\n",
" <td>Syowa_Antarctica</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>-68.450000</td>\n",
" <td>77.967000</td>\n",
" <td>AUS</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2010-01-28 08:00:00</td>\n",
" <td>18.0</td>\n",
" <td>0.0</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>No obvious issues</td>\n",
" <td>Davis</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>-68.133000</td>\n",
" <td>-67.100000</td>\n",
" <td>ARG</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>1999-02-09 23:00:00</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>No obvious issues</td>\n",
" <td>San_Martin</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3651</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>81.950000</td>\n",
" <td>-60.080000</td>\n",
" <td>CAN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>1975-06-03 12:00:00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>Chart Datum (CD)</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>No obvious issues</td>\n",
" <td>Newman_Bay</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3652</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>82.120000</td>\n",
" <td>-62.030000</td>\n",
" <td>CAN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>1975-05-29 08:00:00</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>Chart Datum (CD)</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>No obvious issues</td>\n",
" <td>Lincoln_Bay</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3653</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>82.270000</td>\n",
" <td>-86.800000</td>\n",
" <td>CAN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>1983-05-04 18:00:00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>Chart Datum (CD)</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>No obvious issues</td>\n",
" <td>Cape_Woods_Offshore</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3654</th>\n",
" <td>aler</td>\n",
" <td>333.0</td>\n",
" <td>82.490000</td>\n",
" <td>-62.320000</td>\n",
" <td>Canada</td>\n",
" <td>web</td>\n",
" <td>Fisheries and Oceans Canada ( Canada )</td>\n",
" <td>NaN</td>\n",
" <td>0.36</td>\n",
" <td>2023-07-31 06:03</td>\n",
" <td>...</td>\n",
" <td>2021-04-13 05:00:00</td>\n",
" <td>32.0</td>\n",
" <td>0.0</td>\n",
" <td>Chart Datum (CD)</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>No obvious issues</td>\n",
" <td>Alert</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3655</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>82.880000</td>\n",
" <td>-73.500000</td>\n",
" <td>CAN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>1971-07-22 05:00:00</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>Chart Datum (CD)</td>\n",
" <td>Unspecified</td>\n",
" <td>Unspecified</td>\n",
" <td>-99.9999</td>\n",
" <td>Coastal</td>\n",
" <td>Possible datum issues</td>\n",
" <td>Disraeli_Fiord</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3656 rows × 71 columns</p>\n",
"</div>"
],
"text/plain": [
" ioc_code gloss_id latitude longitude Country connection \\\n",
"0 NaN NaN -77.850000 166.767000 NZL NaN \n",
"1 NaN NaN -69.007783 39.570283 ATA NaN \n",
"2 NaN NaN -69.000000 39.600000 ATA NaN \n",
"3 NaN NaN -68.450000 77.967000 AUS NaN \n",
"4 NaN NaN -68.133000 -67.100000 ARG NaN \n",
"... ... ... ... ... ... ... \n",
"3651 NaN NaN 81.950000 -60.080000 CAN NaN \n",
"3652 NaN NaN 82.120000 -62.030000 CAN NaN \n",
"3653 NaN NaN 82.270000 -86.800000 CAN NaN \n",
"3654 aler 333.0 82.490000 -62.320000 Canada web \n",
"3655 NaN NaN 82.880000 -73.500000 CAN NaN \n",
"\n",
" contacts dcp_id last_observation_level \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"... ... ... ... \n",
"3651 NaN NaN NaN \n",
"3652 NaN NaN NaN \n",
"3653 NaN NaN NaN \n",
"3654 Fisheries and Oceans Canada ( Canada ) NaN 0.36 \n",
"3655 NaN NaN NaN \n",
"\n",
" last_observation_time ... end_date_time number_of_years \\\n",
"0 NaN ... 2017-06-13 12:00:00 14.0 \n",
"1 NaN ... 2019-12-31 20:00:00 33.0 \n",
"2 NaN ... 2015-12-31 20:00:00 29.0 \n",
"3 NaN ... 2010-01-28 08:00:00 18.0 \n",
"4 NaN ... 1999-02-09 23:00:00 2.0 \n",
"... ... ... ... ... \n",
"3651 NaN ... 1975-06-03 12:00:00 1.0 \n",
"3652 NaN ... 1975-05-29 08:00:00 2.0 \n",
"3653 NaN ... 1983-05-04 18:00:00 1.0 \n",
"3654 2023-07-31 06:03 ... 2021-04-13 05:00:00 32.0 \n",
"3655 NaN ... 1971-07-22 05:00:00 2.0 \n",
"\n",
" time_zone_hours datum_information instrument precision \\\n",
"0 0.0 Unspecified Unspecified Unspecified \n",
"1 0.0 Zero of Tide Height Float Unspecified \n",
"2 0.0 Unspecified Unspecified Unspecified \n",
"3 0.0 Unspecified Unspecified Unspecified \n",
"4 0.0 Unspecified Unspecified Unspecified \n",
"... ... ... ... ... \n",
"3651 0.0 Chart Datum (CD) Unspecified Unspecified \n",
"3652 0.0 Chart Datum (CD) Unspecified Unspecified \n",
"3653 0.0 Chart Datum (CD) Unspecified Unspecified \n",
"3654 0.0 Chart Datum (CD) Unspecified Unspecified \n",
"3655 0.0 Chart Datum (CD) Unspecified Unspecified \n",
"\n",
" null_value gauge_type overall_record_quality location \n",
"0 -99.9999 Coastal No obvious issues Scott_Base \n",
"1 -99.9999 Coastal Possible datum issues Syowa_Station_Antarctica \n",
"2 -99.9999 Coastal No obvious issues Syowa_Antarctica \n",
"3 -99.9999 Coastal No obvious issues Davis \n",
"4 -99.9999 Coastal No obvious issues San_Martin \n",
"... ... ... ... ... \n",
"3651 -99.9999 Coastal No obvious issues Newman_Bay \n",
"3652 -99.9999 Coastal No obvious issues Lincoln_Bay \n",
"3653 -99.9999 Coastal No obvious issues Cape_Woods_Offshore \n",
"3654 -99.9999 Coastal No obvious issues Alert \n",
"3655 -99.9999 Coastal Possible datum issues Disraeli_Fiord \n",
"\n",
"[3656 rows x 71 columns]"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s3"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "b5753d55-7038-4cf6-bb77-01d18869e986",
"metadata": {},
"outputs": [],
"source": [
"# Write the resulting table to disk.\n",
"# NOTE(review): absolute, machine-specific path; consider a configurable data directory.\n",
"s3.to_csv(path_or_buf='/Volumes/B1TB_1/global_stations.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b57e7f5a-66ff-4a8f-a7b8-c296c876d515",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "dev",
"language": "python",
"name": "dev"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment