Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@hrbrmstr
Last active August 29, 2015 13:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hrbrmstr/0a330b212b46517ffee9 to your computer and use it in GitHub Desktop.
Save hrbrmstr/0a330b212b46517ffee9 to your computer and use it in GitHub Desktop.
Answer to http://stackoverflow.com/questions/22855197/how-do-you-get-geom-map-to-show-all-parts-of-a-map/22875922 since it's long-ish and easier to cut/paste from here than on SO
library(ggplot2)
library(maps)
library(plyr)
library(gridExtra)
ARCTP53_SOExample <- read.csv("dat.csv")
# reduce all the distinct exon/introns to just exon or intron
ARCTP53_SOExample$EorI <- factor(ifelse(grepl("exon",
ARCTP53_SOExample$ExonIntron,
ignore.case = TRUE),
"exon", "intron"))
# extract summary data for the two variables we care about for the map
arc.combined <- count(ARCTP53_SOExample, .(Country, EorI))
colnames(arc.combined) <- c("region", "EorI", "ei.ct")
# get total for country (region) and add to the summary info
arc.combined <- merge(arc.combined, count(arc.combined, .(region), wt_var=.(ei.ct)))
colnames(arc.combined) <- c("region", "EorI", "ei.ct", "region.total")
# it wasn't specified if the "EorI" is going to be used on the map so
# we won't use it below (but we could, now)
# get map and intercourse Antarctica
world_map <- map_data("world")
world_map <- subset(world_map, region!="Antarctica")
# DEAL WITH MISSING REGIONS
# get our regions and the regions named in the "world"
wm.reg <- unique(as.character(world_map$region))
arc.reg <- unique(as.character(ARCTP53_SOExample$Country))
# this shows the missing ones
sort(arc.reg[!arc.reg %in% wm.reg])
# this shows all valid ones
sort(wm.reg)
# INSERT YOUR #SPIFFY CODE TO CLEANUP THE REGIONS HERE :-)
# this will show the counts by country with all of the "chart junk" removed
# and the "counts" scaled as a gradient, and with the legend at the top
gg <- ggplot(arc.combined)
gg <- gg + geom_map(dat=world_map, map = world_map, aes(map_id=region),
fill="white", color="#7f7f7f", size=0.25)
gg <- gg + geom_map(map = world_map, aes(map_id = region, fill = region.total), size=0.25)
gg <- gg + scale_fill_gradient(low="#fff7bc", high="#cc4c02", name="Tumor counts")
gg <- gg + expand_limits(x = world_map$long, y = world_map$lat)
gg <- gg + labs(x="", y="", title="Tumor contribution by country")
gg <- gg + theme(panel.grid=element_blank(), panel.border=element_blank())
gg <- gg + theme(axis.ticks=element_blank(), axis.text=element_blank())
gg <- gg + theme(legend.position="top")
gg
# BUT you might want to show the counts by intron/exon by country
gg <- ggplot(arc.combined[arc.combined$EorI=="exon",])
gg <- gg + geom_map(dat=world_map, map = world_map, aes(map_id=region),
fill="white", color="#7f7f7f", size=0.25)
gg <- gg + geom_map(map = world_map, aes(map_id = region, fill = ei.ct), size=0.25)
gg <- gg + scale_fill_gradient(low="#f7fcb9", high="#238443", name="Tumor counts")
gg <- gg + expand_limits(x = world_map$long, y = world_map$lat)
gg <- gg + labs(x="", y="", title="Tumor contribution by 'exon' & country")
gg <- gg + theme(panel.grid=element_blank(), panel.border=element_blank())
gg <- gg + theme(axis.ticks=element_blank(), axis.text=element_blank())
gg <- gg + theme(legend.position="top")
gg.exon <- gg
gg <- ggplot(arc.combined[arc.combined$EorI=="intron",])
gg <- gg + geom_map(dat=world_map, map = world_map, aes(map_id=region),
fill="white", color="#7f7f7f", size=0.25)
gg <- gg + geom_map(map = world_map, aes(map_id = region, fill = ei.ct),
colour = "#7f7f7f", size=0.25)
gg <- gg + scale_fill_gradient(low="#ece7f2", high="#0570b0", name="Tumor counts")
gg <- gg + expand_limits(x = world_map$long, y = world_map$lat)
gg <- gg + labs(x="", y="", title="Tumor contribution by 'intron' & country")
gg <- gg + theme(panel.grid=element_blank(), panel.border=element_blank())
gg <- gg + theme(axis.ticks=element_blank(), axis.text=element_blank())
gg <- gg + theme(legend.position="top")
gg.intron <- gg
grid.arrange(gg.exon, gg.intron, ncol=1)
library(ggplot2)
library(maps)
library(plyr)
library(gridExtra)
library(maptools)
ARCTP53_SOExample <- read.csv("dat.csv")
# reduce all the distinct exon/introns to just exon or intron
ARCTP53_SOExample$EorI <- factor(ifelse(grepl("exon",
ARCTP53_SOExample$ExonIntron,
ignore.case = TRUE),
"exon", "intron"))
# extract summary data for the two variables we care about for the map
arc.combined <- count(ARCTP53_SOExample, .(Country, EorI))
colnames(arc.combined) <- c("id", "EorI", "ei.ct")
# get total for country (region) and add to the summary info
arc.combined <- merge(arc.combined, count(arc.combined, .(id), wt_var=.(ei.ct)))
colnames(arc.combined) <- c("id", "EorI", "ei.ct", "region.total")
# it wasn't specified if the "EorI" is going to be used on the map so
# we won't use it below (but we could, now)
# get map and intercourse Antarctica
data(wrld_simpl)
world_map <- fortify(wrld_simpl, region="NAME")
world_map <- subset(world_map, id!="Antarctica")
world_map <- world[order(world_map$order), ]
# DEAL WITH MISSING REGIONS
# get our regions and the regions named in the "world"
# clean up our data first
arc.combined$id <- gsub("USA", "United States", arc.combined$id)
arc.combined$id <- gsub("UK", "United Kingdom", arc.combined$id)
arc.combined$id <- gsub("The Netherlands", "Netherlands", arc.combined$id)
arc.combined$id <- gsub("Iran", "Iran (Islamic Republic of)", arc.combined$id)
arc.combined$id <- gsub("China.*", "China", arc.combined$id)
arc.combined$id <- gsub("Chinese.*", "Taiwan", arc.combined$id)
wm.reg <- unique(as.character(world_map$id))
arc.reg <- unique(as.character(arc.combined$id))
# this shows the missing ones
sort(arc.reg[!arc.reg %in% wm.reg])
# this shows all valid ones
# INSERT YOUR #SPIFFY CODE TO CLEANUP THE REGIONS HERE :-)
# this will show the counts by country with all of the "chart junk" removed
# and the "counts" scaled as a gradient, and with the legend at the top
gg <- ggplot(arc.combined)
gg <- gg + geom_map(dat=world_map, map = world_map, aes(map_id=id),
fill="white", color="#7f7f7f", size=0.25)
gg <- gg + geom_map(map = world_map, aes(map_id = id, fill = region.total), size=0.25)
gg <- gg + scale_fill_gradient(low="#fff7bc", high="#cc4c02", name="Tumor counts")
gg <- gg + expand_limits(x = world_map$long, y = world_map$lat)
gg <- gg + labs(x="", y="", title="Tumor contribution by country")
gg <- gg + theme(panel.grid=element_blank(), panel.border=element_blank())
gg <- gg + theme(axis.ticks=element_blank(), axis.text=element_blank())
gg <- gg + theme(legend.position="top")
gg
# BUT you might want to show the counts by intron/exon by country
gg <- ggplot(arc.combined[arc.combined$EorI=="exon",])
gg <- gg + geom_map(dat=world_map, map = world_map, aes(map_id=id),
fill="white", color="#7f7f7f", size=0.25)
gg <- gg + geom_map(map = world_map, aes(map_id = id, fill = ei.ct), size=0.25)
gg <- gg + scale_fill_gradient(low="#f7fcb9", high="#238443", name="Tumor counts")
gg <- gg + expand_limits(x = world_map$long, y = world_map$lat)
gg <- gg + labs(x="", y="", title="Tumor contribution by 'exon' & country")
gg <- gg + theme(panel.grid=element_blank(), panel.border=element_blank())
gg <- gg + theme(axis.ticks=element_blank(), axis.text=element_blank())
gg <- gg + theme(legend.position="top")
gg.exon <- gg
gg <- ggplot(arc.combined[arc.combined$EorI=="intron",])
gg <- gg + geom_map(dat=world_map, map = world_map, aes(map_id=id),
fill="white", color="#7f7f7f", size=0.25)
gg <- gg + geom_map(map = world_map, aes(map_id = id, fill = ei.ct),
colour = "#7f7f7f", size=0.25)
gg <- gg + scale_fill_gradient(low="#ece7f2", high="#0570b0", name="Tumor counts")
gg <- gg + expand_limits(x = world_map$long, y = world_map$lat)
gg <- gg + labs(x="", y="", title="Tumor contribution by 'intron' & country")
gg <- gg + theme(panel.grid=element_blank(), panel.border=element_blank())
gg <- gg + theme(axis.ticks=element_blank(), axis.text=element_blank())
gg <- gg + theme(legend.position="top")
gg.intron <- gg
grid.arrange(gg.exon, gg.intron, ncol=1)
@OFish
Copy link

OFish commented Apr 7, 2014

Thanks so much. You're a star.
OFish

@OFish
Copy link

OFish commented Apr 7, 2014

Maybe its best to take our conversation from SO to here.

With regards to dealing with the "erroneus" country labelling in the WHO database.

First of all, this code is just awesome:

sort(arc.reg[!arc.reg %in% wm.reg])

It gives us this:

[1] "China, Hong-Kong" "Chinese Taipei (Taiwan)" "Czech Republic" "The Netherlands"

Interestingly, the sort(wm.reg), provides us with some interesting information, namely that:

"Czechoslovakia"

is used for both the Czech Republic and Slovakia, which as country hasn't existed for about 20 years.

And that

Tapei / Taiwan

doesn't exist in the world_map, or most likely (given the "Czechosolovakia" tag) is still a part of China...which is pretty harsh :-). By the way, Russia is still USSR - to me that means the map region names still stem from waaay back...maybe worth informing the maps authors?

Also RE:

INSERT YOUR #SPIFFY CODE TO CLEANUP THE REGIONS HERE :-)

If I knew how to do that, without having to use the cmd+F function and "replace" in Excel, I'd do it. But, because I obviously lack knowledge of how to re-structure data.frames and similar in R, I'm probably going to have to do it in Excel quickly :-(.

I'm wondering, where does one learn this cool stuff like the plyr package, grpl, aggregate etc.? I come from a biomedical background, so my introduction to R was for statistical purposes....but now I seem to realize that there's a whole other part to R that I've never thought of using........thanks for opening my eyes.

@hrbrmstr
Copy link
Author

hrbrmstr commented Apr 7, 2014

Whoa. That's a huge, keen insight re: country names (and I'll credit you with that find if I blog about it or submit a github pull request :-) RE: getting started with R. I read alot (not being flip, too. eidetic memory lets me background process alot of info). One way to keep current and see interesting things is to subscribe to r-bloggers (google finds that quickly) RSS. We (my coauthor of http://bit.ly/ddsec) have some resources logged for learning R more over at http://datadrivensecurity.info/blog (there's a link to resources somewhere there).

For data cleanup, check out openrefine - http://openrefine.org/. It doesn't have a huge learning curve is is waaaay better than excel. Prbly not simple enough for this activity but definitely something to check out. R's gsub() would help do substitutions but honestly Excel might just be as fast or faster if the differences are unique enough and not too widespread.

Feel free to ping me at bob at rudis dot net (that also works for Google+). I'm also hrbrmstr on skype if you ever need a more interactive explanation. Truly glad to be of help. Information security work rarely has definitive outcomes, which is one reason i try to help folks on SO. there some of my knowledge might help folks do some good :-)

@OFish
Copy link

OFish commented Apr 7, 2014

Haha. Thanks, can't really say its a "huge" find, esecially because it was simply due to luck (and I play a lot of Trivia in pubs after work, haha). But I was just composing an email to the author, as I do find it a bit "worrysome" that its plotting a map that's still using the USSR. Basically that would mean that all countries like lithuania, some of the "stans", georgia etc. don't turn up on the map. Let me know if you want me to hold off with the email...and write a blog post instead. Was just thinking that maybe the authors want to communicate that themselves. Seeing that the last "update" was in 2013...... :-/

Thanks for the links to the resources. I'll definitely have a look at them and try to learn the syntax. I must say, I've only been using R for say 6 months or so and it's pretty insane how much I learn every day. I'd love to learn how to clean up the data in R, as switching to excel is sometimes tedious and because of my dataset size I often have multiple text files open....for some reason this helps me keep an overview of what I'm working on instead of working out of one huge sheet....eventhough R-purists find that "ridiculous".

I'll send you an email with the other stuff I've been thinking about re the map. I really, really appreciate your help.

Cheers,

Oli

@hrbrmstr
Copy link
Author

hrbrmstr commented Apr 7, 2014

Well, this explains the real reason why hadley made the fortify() function for geom_map(). I added another sample source code file that uses a different set of maps that are far more current. code is mostly the same except for a change in variable name (region is now id).

I'm normally making more localized maps from shapefiles so i use that function alot. I didn't realize they weren't updating the base maps package maps anymore.

@OFish
Copy link

OFish commented Apr 7, 2014

Woha. Ok. Amazing. The OpenRefine platform is awesome, have just d/ld it to have a look at the data.frame as it might need some cleaning up anyways :-).

Regarding your code however, where did you specify wrdl_simpl? Getting an error from R there...and couldn't find it anywhere in the code.

@hrbrmstr
Copy link
Author

hrbrmstr commented Apr 7, 2014

typo (mebbe). or it cld be that you didn't library(maptools) first.. the 'new' code now has fully working code and cleans up your data frame :-) had to make sure it worked for your purposes.

@OFish
Copy link

OFish commented Apr 7, 2014

Thanks for this. So this is what I get from maptools

library(maptools)
Loading required package: sp
Checking rgeos availability: FALSE
Note: when rgeos is not available, polygon geometry computations in maptools depend on gpclib,
which has a restricted licence. It is disabled by default;
to enable gpclib, type gpclibPermit()
gpclibPermit()
[1] FALSE

It's weird, I can't enable it...?

EDIT: Didn't see you added the clean-up code!! Ha! Amazing, thanks!

@OFish
Copy link

OFish commented Apr 7, 2014

So the code's returning an error now by the way...can't figure out where it is....tried editing some things. Note, just changed the title names a bit.

gg <- ggplot(arc.combined)
gg <- gg + geom_map(dat=world_map, map = world_map, aes(map_id=id), fill="white", color="#7f7f7f", size=0.25)
gg <- gg + geom_map(map = world_map, aes(map_id = id, fill = region.total), size=0.25)
gg <- gg + scale_fill_gradient(low="#fff7bc", high="#cc4c02", name="Number of tumors")
gg <- gg + expand_limits(x = world_map$long, y = world_map$lat)
gg <- gg + labs(x="", y="", title="Number of tumors analyzed by country")
gg <- gg + theme(panel.grid=element_blank(), panel.border=element_blank())
gg <- gg + theme(axis.ticks=element_blank(), axis.text=element_blank())
gg <- gg + theme(legend.position="top")
gg
Don't know how to automatically pick scale for object of type function. Defaulting to continuous
Error in data.frame(map_id = function (.variables, drop = FALSE) :
arguments imply differing number of rows: 0, 24377

That's weird...

@OFish
Copy link

OFish commented Apr 9, 2014

Hi there. Took some time but worked my may around the issues with maptoolsand gpclibPermit(). On line 32 I believe there's a small mistake in your code:

world_map <- world[order(world_map$order), ] ## should be
world_map <- world_map[order(world_map$order), ]

If you change all of that I get the respective plots. Thanks for that!
Sent you an msge as well, which is not at all of priority. But thanks so much for everything until now. !
Cheers,

O.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment