Skip to content

Instantly share code, notes, and snippets.

@tts tts/gist:5237811
Last active Dec 15, 2015

Embed
What would you like to do?
Querying DBpedia on the GeoNameID of Finnish municipalities, and then Europeana on the amount of different resource types enriched with these GeoNameIDs
#######################################################################
#
# Querying DBpedia for the GeoNames ID of each Finnish municipality, and then
# Europeana for the number of resources of each type enriched with these
# GeoNames IDs
#
# More on this
# http://tts2.blogspot.fi/2013/03/some-europeana-av-resources-related-to.html
#
# Tuija Sonkkila 17.3.2013
#
########################################################################
library(sorvi)
# Read municipality data from Land Survey Finland (MML) data
# (C) MML 2011
# LoadMML() and ConvertMunicipalityNames() are not defined in this file;
# presumably they come from the sorvi package loaded above -- confirm.
# NOTE(review): `sp` is assumed to expose the Finnish municipality names in
# its `Kunta.FI` column -- verify against the sorvi documentation.
sp <- LoadMML(data.id = "kunta1_p", resolution = "1_milj_Shape_etrs_shape")
# Municipality names after ConvertMunicipalityNames(); these are the labels
# that get spliced into the DBpedia label query further down.
comms <- ConvertMunicipalityNames(sp$Kunta.FI)
# Number of municipality names; sizes the DBpedia result table and bounds the
# DBpedia query loop.
N <- length(comms)
###################################
#
# GeoNameID via DBpedia
#
###################################
library(SPARQL)
# Look up a GeoNames link for every municipality name via DBpedia.
#
# For each name in `comms`, DBpedia is asked for the owl:sameAs targets of the
# resource whose Finnish rdfs:label equals that name, keeping only targets
# whose string form contains "geonames". The first cell of the result is
# stored in `geo.df$id`; names without a hit get the sentinel "0".
#
# Result: `geo.df` with columns `c` (municipality name) and `id` (first
# matching value returned by SPARQL(), or "0"), also written to
# kunta_geoid.csv.
dbpedia_endpoint <- "http://dbpedia.org/sparql"

# Sentinel for "no GeoNames link found". Written as the string "0" because
# `id` is a character column; a numeric 0 assigned into it ends up as "0"
# anyway, so the stored value and the CSV output are unchanged.
no_geoid <- "0"

geo.df <- data.frame(c = character(N),
                     id = character(N),
                     stringsAsFactors = FALSE)

# seq_len() rather than 1:N: with N == 0 the loop runs zero times instead of
# iterating over c(1, 0).
for (i in seq_len(N)) {
  # Deliberately not called `c`, which would mask base::c().
  kunta_name <- comms[i]

  # Same query text as before, assembled line by line.
  dbq <- paste0(
    "select ?geoid where {\n",
    "?s <http://www.w3.org/2000/01/rdf-schema#label> '", kunta_name,
    "'@fi ;\n",
    "<http://www.w3.org/2002/07/owl#sameAs> ?geoid .\n",
    "FILTER(CONTAINS(str(?geoid), 'geonames')) }"
  )

  cat("Next querying ", kunta_name, " (", i, "/", N, ")\n", sep = "")
  dbres <- SPARQL(url = dbpedia_endpoint, query = dbq)$results

  geo.df$c[i] <- kunta_name
  # Require at least one row AND one column before indexing [1, 1]; a result
  # with columns but no rows would otherwise store NA instead of the sentinel.
  if (NROW(dbres) > 0 && NCOL(dbres) > 0) {
    geo.df$id[i] <- dbres[1, 1]
  } else {
    geo.df$id[i] <- no_geoid
  }
}

# Share of municipalities without a GeoNameID (8.3 % in the original run).
# Printed explicitly so it also shows up when the script is source()d.
print(paste0(round(mean(geo.df$id == no_geoid) * 100, 1), "%"))

write.csv(geo.df, file = "kunta_geoid.csv", row.names = FALSE)
################################
#
# Query Europeana with
# GeoNameIDs
#
# Resource types:
# IMAGE, SOUND, TEXT, VIDEO
#
################################
# Count Europeana objects per municipality and resource type.
#
# For every municipality that received a GeoNames id (id other than the "0"
# sentinel), one COUNT query per resource type is sent to the Europeana SPARQL
# endpoint. The counts are collected in `geoitems.df` (columns `c` =
# municipality name, `type` = resource type, `count` = first cell of the query
# result) and written to suomi_kunnat_geoitems.csv.
eu_endpoint <- "http://europeana.ontotext.com/sparql"

# Compare against the string "0" explicitly (the id column is character) and
# drop NA ids, which would otherwise come through as all-NA rows.
geoids <- geo.df[!is.na(geo.df$id) & geo.df$id != "0", ]
N <- nrow(geoids)

# Single source of truth for the resource types, so the preallocated size and
# the inner loop cannot drift apart.
resource_types <- c("IMAGE", "SOUND", "TEXT", "VIDEO")
Ndf <- N * length(resource_types)

geoitems.df <- data.frame(c = character(Ndf),
                          type = character(Ndf),
                          count = integer(Ndf),
                          stringsAsFactors = FALSE)

# Next free row of geoitems.df (not named `row`, which masks base::row()).
out_row <- 1

# seq_len() rather than 1:N: if no municipality has a GeoNames id the loop
# must not run at all, whereas 1:0 would iterate over c(1, 0).
for (i in seq_len(N)) {
  kunta <- geoids$c[i]
  # NOTE(review): `id` is spliced into the query unquoted, so it is assumed to
  # already be in SPARQL IRI syntax as returned by the DBpedia step -- confirm.
  id <- geoids$id[i]

  for (j in resource_types) {
    # Same query text as before, assembled line by line. The ore: and edm:
    # prefixes are not declared here; presumably the endpoint predefines
    # them -- verify.
    idq <- paste0(
      "SELECT (COUNT(DISTINCT ?object) AS ?count)\n",
      "WHERE {\n",
      "?euProxy ore:proxyFor ?object ;\n",
      "edm:hasMet ", id, " ;\n",
      "ore:proxyIn ?euAggr .\n",
      "?euAggr ore:aggregates ?providerAggr .\n",
      "?providerProxy ore:proxyIn ?providerAggr ;\n",
      "edm:type '", j, "' .\n",
      "}"
    )

    cat("Next querying ", kunta, " (", i, "/", N, "): ", j, "\n", sep = "")
    idres <- SPARQL(url = eu_endpoint, query = idq)$results

    geoitems.df$c[out_row] <- kunta
    geoitems.df$type[out_row] <- j
    geoitems.df$count[out_row] <- idres[1, 1]
    out_row <- out_row + 1
  }
}

write.csv(geoitems.df, file = "suomi_kunnat_geoitems.csv", row.names = FALSE)
##################################################
#
# Plotting GeoNameID-enriched items
# with a stacked bar chart a la Datavaalit
#
# https://github.com/louhos/sorvi/wiki/Datavaalit
#
##################################################
# Relabel one municipality, then split the counts into size classes for
# plotting.
#
# NOTE(review): as written, the old and the new label are the same string, so
# this relabelling changes nothing; presumably the replacement was meant to be
# a different spelling -- confirm the intended label.
# Indexing the column directly (instead of `df[cond, ]$c <- value`) keeps this
# from failing with "replacement has 1 row, data has 0" when no row matches.
relabel_rows <- which(geoitems.df$c == 'Pedersören kunta')
geoitems.df$c[relabel_rows] <- 'Pedersören kunta'

item_count <- geoitems.df$count

# `minis` previously read from an undefined object `items`; it is meant to be
# a subset of geoitems.df like the other size classes.
# NOTE(review): the classes overlap -- `smalls` contains everything in
# `minis`, and counts of exactly 500 or 5000 fall into two classes. Left
# unchanged; confirm whether that is intended.
minis <- geoitems.df[item_count > 0 & item_count <= 100, ]
smalls <- geoitems.df[item_count > 0 & item_count <= 500, ]
mediums <- geoitems.df[item_count >= 500 & item_count <= 5000, ]
bigs <- geoitems.df[item_count >= 5000, ]
library(ggplot2)
theme_set(theme_bw(20))

# Build a horizontal stacked bar chart of item counts per municipality.
#
# items: data frame with columns `c` (municipality name), `type` (resource
#        type, used for the fill colour) and `count` (bar weight).
# title: plot title.
# Returns the ggplot object; the caller is responsible for printing it.
plot_geoitems <- function(items, title) {
  ggplot(items, aes(x = c, weight = count, fill = type)) +
    # No explicit `stat`: the default counting stat sums the `weight`
    # aesthetic per municipality. An explicit stat = "bin" is rejected for a
    # discrete x by ggplot2 >= 2.0, while the default works in old and new
    # versions alike.
    geom_bar(position = "stack") +
    ylab("") +
    xlab("") +
    ggtitle(title) +
    theme(axis.text.x = element_text(angle = 30),
          plot.title = element_text(size = 13)) +
    coord_flip()
}

p0 <- plot_geoitems(
  minis,
  "GeoNameID-enriched items (max 100) per Finnish municipality"
)
print(p0)

p1 <- plot_geoitems(
  smalls,
  "GeoNameID-enriched items (max 500) per Finnish municipality"
)
print(p1)

p2 <- plot_geoitems(
  mediums,
  "GeoNameID-enriched items (max 5000) per Finnish municipality"
)
print(p2)

p3 <- plot_geoitems(
  bigs,
  "GeoNameID-enriched items (min 5000) per Finnish municipality"
)
print(p3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.