Last active
December 15, 2015 09:19
-
-
Save tts/5237811 to your computer and use it in GitHub Desktop.
Querying DBpedia for the GeoNames IDs of Finnish municipalities, and then Europeana for the number of items of each resource type enriched with these GeoNames IDs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#######################################################################
#
# Querying DBpedia on the GeoNameID of Finnish municipalities, and then
# Europeana on the amount of different resource types enriched with these
# GeoNameIDs
#
# http://tts2.blogspot.fi/2013/03/some-europeana-av-resources-related-to.html
#
# Tuija Sonkkila 17.3.2013
#
########################################################################

library(sorvi)

# Read municipality data from Land Survey Finland (MML) data
# (C) MML 2011
sp <- LoadMML(data.id = "kunta1_p", resolution = "1_milj_Shape_etrs_shape")
comms <- ConvertMunicipalityNames(sp$Kunta.FI)
N <- length(comms)

###################################
#
# GeoNameID via DBpedia
#
###################################

library(SPARQL)

dbpedia_endpoint <- "http://dbpedia.org/sparql"

# One row per municipality: `c` holds the municipality name and `id` the
# first GeoNames identifier DBpedia returned for it, or "0" when none was
# returned. "0" is the sentinel the rest of the script filters on.
geo.df <- data.frame(c = character(N),
                     id = character(N),
                     stringsAsFactors = FALSE)

# seq_len() instead of 1:N: with N == 0 the loop must not run at all,
# whereas 1:0 would iterate over c(1, 0).
for (i in seq_len(N)) {
  kunta <- comms[i]

  # Look up the resource whose Finnish label equals the municipality name and
  # keep only those owl:sameAs targets that point to GeoNames.
  dbq <- paste0("select ?geoid where {
    ?s <http://www.w3.org/2000/01/rdf-schema#label> '", kunta, "'@fi ;
    <http://www.w3.org/2002/07/owl#sameAs> ?geoid .
    FILTER(CONTAINS(str(?geoid), 'geonames')) }")

  cat("Next querying ", kunta, " (", i, "/", N, ")\n", sep = "")
  dbres <- SPARQL(url = dbpedia_endpoint, dbq)$results

  # length() on a data frame counts columns, so it cannot tell "no rows" from
  # "one row"; require both at least one row and one column before indexing.
  geoid <- NA_character_
  if (NROW(dbres) > 0 && NCOL(dbres) > 0) {
    geoid <- as.character(dbres[1, 1])
  }

  geo.df$c[i] <- kunta
  # Never store NA: the sentinel comparisons (== "0" / != "0") further down
  # would otherwise yield NA rows.
  geo.df$id[i] <- if (is.na(geoid)) "0" else geoid
}

# 8.3 % without a GeoNameID
# print() so that the share is also shown when the script is run via source().
print(paste0(round(mean(geo.df$id == "0") * 100, 1), "%"))

write.csv(geo.df, file = "kunta_geoid.csv", row.names = FALSE)
################################
#
# Query Europeana with
# GeoNameIDs
#
# Resource types:
# IMAGE, SOUND, TEXT, VIDEO
#
################################

eu_endpoint <- "http://europeana.ontotext.com/sparql"

resource_types <- c("IMAGE", "SOUND", "TEXT", "VIDEO")

# Only municipalities for which a GeoNameID was found ("0" marks a miss).
geoids <- geo.df[geo.df$id != "0", ]
N <- nrow(geoids)

# One result row per (municipality, resource type) pair; derived from the
# type vector so the preallocated size cannot drift from the inner loop.
Ndf <- N * length(resource_types)
geoitems.df <- data.frame(c = character(Ndf),
                          type = character(Ndf),
                          count = integer(Ndf),
                          stringsAsFactors = FALSE)

row <- 1
# seq_len() instead of 1:N: with no municipalities left, 1:0 would iterate
# over c(1, 0) and send a query built from NA values.
for (i in seq_len(N)) {
  kunta <- geoids$c[i]
  id <- geoids$id[i]

  for (j in resource_types) {
    # Count the distinct objects whose proxy is linked to this GeoNameID via
    # edm:hasMet and whose provider proxy has the given edm:type.
    # `id` is pasted in verbatim, exactly as it was returned by DBpedia.
    # NOTE(review): the ore: and edm: prefixes are not declared in the query;
    # presumably the endpoint predefines them -- confirm.
    idq <- paste0("SELECT (COUNT(DISTINCT ?object) AS ?count)
      WHERE {
        ?euProxy ore:proxyFor ?object ;
                 edm:hasMet ", id, " ;
                 ore:proxyIn ?euAggr .
        ?euAggr ore:aggregates ?providerAggr .
        ?providerProxy ore:proxyIn ?providerAggr ;
                       edm:type '", j, "' .
      }")

    cat("Next querying ", kunta, " (", i, "/", N, "): ", j, "\n", sep = "")
    idres <- SPARQL(url = eu_endpoint, idq)$results

    geoitems.df$c[row] <- kunta
    geoitems.df$type[row] <- j
    # Indexing [1, 1] on a result without rows/columns would abort the whole
    # run; record a missing count for this pair instead.
    if (NROW(idres) > 0 && NCOL(idres) > 0) {
      geoitems.df$count[row] <- idres[1, 1]
    } else {
      geoitems.df$count[row] <- NA_integer_
    }
    row <- row + 1
  }
}

write.csv(geoitems.df, file = "suomi_kunnat_geoitems.csv", row.names = FALSE)
##################################################
#
# Plotting GeoNameID-enriched items
# with a stacked bar chart a la Datavaalit
#
# https://github.com/louhos/sorvi/wiki/Datavaalit
#
##################################################

# Normalise the spelling of one municipality name.
# NOTE(review): the two literals render identically here; presumably they
# differ only in invisible characters / Unicode normalisation in the original
# file -- verify against the raw source before relying on this line.
# Assigning through which() is a no-op when nothing matches, whereas
# `df[cond, ]$c <- value` raises an error for an empty selection.
geoitems.df$c[which(geoitems.df$c == "Pedersören kunta")] <- "Pedersören kunta"

# Bins by item count. Rows are (municipality, resource type) pairs, so the
# thresholds apply to each type's count separately.
# `minis` previously read from an undefined object `items`.
minis <- geoitems.df[geoitems.df$count > 0 & geoitems.df$count <= 100, ]
smalls <- geoitems.df[geoitems.df$count > 0 & geoitems.df$count <= 500, ]
mediums <- geoitems.df[geoitems.df$count >= 500 & geoitems.df$count <= 5000, ]
bigs <- geoitems.df[geoitems.df$count >= 5000, ]

library(ggplot2)
theme_set(theme_bw(20))

# Build one horizontal stacked bar chart: one bar per municipality (`c`),
# bar length = summed `count`, stacked and coloured by resource `type`.
#
# items: data frame with columns c, type and count
# title: plot title
# Returns the ggplot object (not yet drawn).
plot_geoitems <- function(items, title) {
  ggplot(items, aes(x = c, weight = count, fill = type)) +
    # No explicit stat: geom_bar's default stat sums the weights per x value,
    # while stat = "bin" is rejected for a discrete x by ggplot2 >= 2.0.0.
    geom_bar(position = "stack") +
    ylab("") +
    xlab("") +
    ggtitle(title) +
    theme(axis.text.x = element_text(angle = 30),
          plot.title = element_text(size = 13)) +
    coord_flip()
}

p0 <- plot_geoitems(
  minis,
  "GeoNameID-enriched items (max 100) per Finnish municipality"
)
print(p0)

p1 <- plot_geoitems(
  smalls,
  "GeoNameID-enriched items (max 500) per Finnish municipality"
)
print(p1)

p2 <- plot_geoitems(
  mediums,
  "GeoNameID-enriched items (max 5000) per Finnish municipality"
)
print(p2)

p3 <- plot_geoitems(
  bigs,
  "GeoNameID-enriched items (min 5000) per Finnish municipality"
)
print(p3)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment