Last active
January 25, 2019 14:33
-
-
Save Bakaniko/f847f0cb09e6747c6917d5215ba6063a to your computer and use it in GitHub Desktop.
Cleaning some spatial data in WGS84 with R: remove all cities closest to 20 km to each other
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!-- | |
Created by Nicolas Roelandt, IFSTTAR 2019 | |
Licence GPL v3.0 | |
--> | |
# Load needed packages | |
```{r load_libraries} | |
library(tidyverse) | |
library(sf) | |
library(units) | |
library(lwgeom) | |
library(reshape2) | |
``` | |
<!-- original data --> | |
# Load the data | |
```{r load_data_source} | |
cities <- read_csv(url("https://raw.githubusercontent.com/Bakaniko/shriveling_world/master/example/datas/cities.csv"), col_types = cols(radius = col_number(), | |
yearHST = col_integer(), yearMotorway = col_integer())) | |
head(cities) | |
population <- read_csv(url("https://raw.githubusercontent.com/Bakaniko/shriveling_world/master/example/datas/population.csv") | |
``` | |
The line 1371 is the only one to have a radius. Probably a typo. | |
So we have 10 columns, we will need latitude and longitude to create points. | |
# Create geometries | |
```{r Create geometry} | |
cities_df = st_as_sf(cities, coords = c("longitude", "latitude"), crs = 4326) | |
head(cities_df) | |
``` | |
```{r, df_simplification} | |
cities_df_simplified <- cities_df %>% | |
select("cityCode","urbanAgglomeration") | |
head(cities_df_simplified) | |
``` | |
<!-- sample data --> | |
```{r, eval=FALSE} | |
# Create a tribble from the data | |
df <- tibble::tribble( | |
~index, ~city, ~lat, ~lon, | |
1172, "Zaria", 11.11128, 7.7227, | |
1173, "Oslo", 59.91273, 10.74609, | |
1174, "Masqat (Muscat)", 23.61387, 58.5922, | |
1175, "Bahawalpur", 29.4, 71.68333, | |
1181, "Islamabad", 33.70351,73.059373, | |
1194, "Rawalpindi", 33.6,73.0666667 | |
) | |
df | |
cities_df_simplified <- st_as_sf(df, coords = c("lon", "lat"), crs = 4326) | |
``` | |
```{r, distance_matrix} | |
#Create distance matrix | |
#dist_mat <- st_distance(cities_df_simplified) | |
dist_mat <- st_distance(cities_df_simplified) | |
# Create a Truth matrix when 0 < distance < 200000 meters | |
# source: https://stackoverflow.com/questions/54294484/rstats-how-to-convert-kilometers-into-arc-degrees-in-order-to-create-buffers-w/54296072#54296072 | |
truth_matrix <- dist_mat < set_units(20000, 'm') & dist_mat > set_units(0,'m') | |
# Put cities index as row and column names | |
dimnames(truth_matrix) <- list(cities_df$cityCode, cities_df$cityCode) | |
# fusion de la matrice pour créer une colonne pour chaque identifiant | |
# les valeur sont filtrées pour ne garder que les valeurs posiitives. | |
# source: https://stackoverflow.com/questions/32024620/subset-data-frame-to-get-rowname-colname-and-value | |
extraction <- subset(melt(truth_matrix), value ==1) | |
#extraction | |
``` | |
```{r create_pairs} | |
# Create pairs from the extract of the truth table | |
vars <- c(cityCodeV1 = "Var1", cityCodeV2 ="Var2") | |
pairs <- extraction %>% | |
rowwise() %>% | |
mutate(unique_order = | |
paste(min(Var1, Var2), | |
max(Var1, Var2), sep = '-')) %>% | |
distinct(unique_order, .keep_all = TRUE) %>% | |
rename(!!vars) | |
pairs_geom <- pairs %>% | |
left_join(cities_df, by = c("cityCodeV1" = "cityCode")) %>% | |
st_as_sf() | |
head(pairs_geom) | |
``` | |
# Order by population | |
```{r population_order} | |
# Create a simple df with only cityCode and 2015 population | |
pop2015 <- population %>% | |
select(cityCode, pop2015) | |
pairs_pop <- pairs_geom %>% | |
left_join(pop2015, by = c("cityCodeV1" = "cityCode")) %>% | |
rename(v1_pop2015 = pop2015) %>% | |
left_join(pop2015, by =c("cityCodeV2" = "cityCode")) %>% | |
rename(v2_pop2015 = pop2015) | |
# head(pairs_pop) | |
to_clean_df <- setNames(data.frame(matrix(ncol = 2, nrow = dim(pairs_pop)[1])),c("to_keep", "to_keep_pop")) | |
# For each row, determine which city is the biggest and adds the two cities population | |
for (i in 1:dim(pairs_pop)[1]) { | |
if(pairs_pop$v1_pop2015[i] > pairs_pop$v2_pop2015[i]){ | |
to_clean_df$to_keep[i] = pairs_pop$cityCodeV1[i] | |
to_clean_df$to_keep_pop[i] = pairs_pop$v1_pop2015[i] + pairs_pop$v2_pop2015[i] | |
} | |
else | |
{ | |
to_clean_df$to_keep[i] = pairs_pop$cityCodeV2[i] | |
to_clean_df$to_keep_pop[i] = pairs_pop$v1_pop2015[i] + pairs_pop$v2_pop2015[i] | |
} | |
} | |
to_clean_df | |
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment