espinielli/Another R script for plotting ship paths.md

## README.md

      
    Raw
  

              README.md
            
          
    R scripts from the plotting of shipping logs by Ben Schmidt.
He has also a nice D3 library for comet-like trails.

  
## Another R script for plotting ship paths.md

      
    Raw
  

              Another R script for plotting ship paths.md
            
          
    From http://rpubs.com/benmschmidt/shipLDA
opts_chunk$set(eval = FALSE)
Some preliminaries to get the data in order.

# Oceans2
rm(list = ls())
require(ggplot2)
require(plyr)
require(lubridate)
source("ICOADS parsing.R")
source("../Map Functions.R")
Pulling in the Maury data

data = loadInData("~/shipping/ICOADS/maury.txt")
Doing the split voyage level.

data = splitDataByVoyage(data)
# Save the R data.frame into a directory that MALLET will be able
# to interpret as a text, and run a topic model on it:

MALLEABLE = data.frame(text = data$voyageid, word = paste(round(data$lat, 1), 
    round(data$long, 1), sep = "X"))

### I'm p system('mkdir /tmp/MALLET') system('rm /tmp/MALLET/*')

# Write one text file for each voyage with all the points it passed
# through.
ddply(MALLEABLE, .(text), function(text) {
    write.table(text$word, paste("/tmp/MALLET/", text$text[1], ".txt", sep = ""), 
        row.names = F, col.names = F, quote = F)
}, .progress = "text")


# To get it to take numbers, I need to change the token-regex; that was
# the only major hiccup I found here.
system("cd ~/mallet-2.0.7/; bin/mallet import-dir 
    --input /tmp/MALLET --output ships.mallet --keep-sequence --token-regex \"\\S+\";")
# I ran this several times with topic topic sizes; only including in the
# post 9 and 25
system("cd ~/mallet-2.0.7/; bin/mallet train-topics
     --input ships.mallet --output-topic-keys shipKeys.txt --num-topics 16
     --output-doc-topics shipTopics.txt --output-state state.txt.gz;")

# This is the data I want to plot.
system("cd ~/mallet-2.0.7/; gunzip -f state.txt.gz")
Read in those MALLET results and clean them up for analysis.

malletresults = read.table("~/mallet-2.0.7/state.txt", sep = " ", col.names = c("?", 
    "file", "loc", "total", "point", "topic"), colClasses = c("NULL", "factor", 
    "NULL", "NULL", "factor", "factor"))
nrow(malletresults)
length(unique(malletresults$file))
levels(malletresults$file) = gsub(".*/", "", levels(malletresults$file))
levels(malletresults$file) = gsub(".txt", "", levels(malletresults$file))

words = (strsplit(as.character(malletresults$point), "x"))
plottable = ldply(words, as.numeric, progress = "text")

names(plottable) = c("lat", "long")
plottable$topic = malletresults$topic
plottable$long[plottable$long > 180] = plottable$long[plottable$long > 180] - 
    360
Make some plots:

ggplot(plottable, aes(x = as.numeric(long), y = as.numeric(lat))) + geom_point(alpha = 0.01, 
    size = 0.3) + facet_wrap(~topic, scales = "free", ncol = 4) + annotation_map(map = world, 
    fill = "#F3E0BD") + theme_nothing + labs(title = "Distribution of points across a 9-topic model")

ggplot(plottable[plottable$topic == 8, ], aes(x = as.numeric(long), y = as.numeric(lat))) + 
    geom_point(alpha = 0.04, size = 0.5) + facet_wrap(~topic, scales = "free", 
    ncol = 4) + annotation_map(map = world, fill = "#F3E0BD") + theme_nothing + 
    labs(title = "Topic 8 is a chimera of a sort text-based topic modelling analysis wouldn't uncover")

counts = table(plottable$word)
countz = as.data.frame(counts)
countz$rank = rank(-countz$Freq)
ggplot(countz, aes(x = rank, y = Freq)) + geom_point() + scale_x_log10() + scale_y_log10()
Try K-means clustering on the voyages in topic space
(note: this didn't work–most ended up in a single k-means cluster–so
I didn't include it in the blog post). Might work better with a higher number of topics,
a couple hundred or something, but that would take better priors.
topicdists = table(malletresults$file, malletresults$topic)
head(topicdists)
topicdists = apply(topicdists, 2, function(z) {
    z/sum(z, na.rm = T)
})
class(topicdists) = "matrix"
f = kmeans(topicdists, centers = 9)
data$cluster = f$cluster[match(data$voyageid, names(f$cluster))]

offset = 220
plotWorld = Recenter(world, offset, idfield = "group")
plotData = Recenter(data, offset, shapeType = "segment", idfield = "voyageid")
head(plotData)
ggplot(plotData[!is.na(plotData$cluster), ], aes(x = as.numeric(long), y = as.numeric(lat))) + 
    geom_path(aes(group = group), alpha = 0.02) + annotation_map(map = plotWorld, 
    fill = "beige") + facet_wrap(~cluster, ncol = 3) + theme_nothign


## Code.md

      
    Raw
  

              Code.md
            
          
    This can be used for either type of plot--it's agnostic about the time variable going,
and the lag can be set.
mapPlot = function(myframe = useful, mytime = 654100, timelag = 34, timevar = "time", ...) {
  # I define half the plot here, and half later. No real reason.
  # This is why I don't usually share that much code.
  myframe$timevar = get(timevar, myframe)
  myframe = myframe[myframe$timevar >= (mytime - timelag) & myframe$timevar <= mytime, ]
  if (timevar == "time") {
    myframe$group = myframe$voyageID
  }
  if (timevar == "yearday") {
    myframe$group = paste(myframe$voyageID, myframe$Year)
  }
  
  ggplot(myframe, aes(y = Lat, x = Long)) + 
    geom_polygon(data = world, aes(x = x, y = y, group = id), drop = T) + 
    geom_path(size = 0.5, aes(group = group,
      #This alpha here decaying by time is what makes the trails slowly erase.
      alpha = (timevar - mytime) / timelag, color = color)) + 
      scale_color_identity() +
      geom_point(data = myframe[myframe$timevar == mytime, ], size = 0.75, aes(group = voyageID,
        alpha = (timevar - mytime) / timelag, color = color)) +
        ylab("")+
        xlab("") +
        opts(legend.position = "none",
          axis.text.x = theme_blank(),
          axis.text.y = theme_blank(),
          axis.ticks = theme_blank()) +
        coord_map(...)
}

  
## Plotting Ship Paths.R
rm(list=ls())
source("SQLFunctions.R")

plotSet = function(filename,dck=701,alpha=.01,lwd=8) {
  library(grid)
  paths = tbl(tblsrc,"paths") %.%
    filter(DCK==dck) %.%
    arrange(voyagenum,yearday) %.%
    select(LON,LAT,voyagenum) %.%
    collect()

  paths = paths %.% mutate(LON=LON+180)
  paths$LON[paths$LON<0] = paths$LON[paths$LON<0] + 360
  paths$LON[paths$LON>360] = paths$LON[paths$LON>360] - 360

  #break groups that cross from 0 to 360 or vice versa, which otherwise leave streaks
  paths$split = c(0,abs(paths$LON[-1]-paths$LON[1:(nrow(paths)-1)])>20)
  paths$segment = 100*paths$voyagenum + cumsum(paths$split)


  #desperately try to remove all elements: code stole from stack overflow, somewhere.
  new_theme_empty <- theme_bw()
  new_theme_empty$line <- element_blank()
  new_theme_empty$rect <- element_blank()
  new_theme_empty$strip.text <- element_blank()
  new_theme_empty$axis.text <- element_blank()
  new_theme_empty$plot.title <- element_blank()
  new_theme_empty$axis.title <- element_blank()
  new_theme_empty$plot.margin <- structure(c(0, 0, -1, -1),
    unit = "lines", valid.unit = 3L, class = "unit")

  plotting = paths

  #Actually make the plot:
  plot = ggplot(plotting,
                aes(x=LON,y=LAT,group=segment)) +
    geom_path(alpha=alpha,lwd=lwd) +
    labs(x="",y="",title="") +
    theme(line=element_blank(),
          rect=element_blank(),
          axis.text=element_blank()
    )+ coord_equal() + scale_x_continuous(expand=c(0,0),limits=c(0,360)) +
    scale_y_continuous(expand=c(0,0),limits=c(-90,90))

  png(filename=filename,width=4096*2,height=2048*2)

  #plotting code from: [whoops forgot to paste it in, but SO somewhere again]
  #plots just the central panel, which makes it compatible with the format Science on a Sphere wants.
  gt <- ggplot_gtable(ggplot_build(plot))
  ge = subset(gt$layout,name=="panel")
  grid.draw(gt[ge$t:ge$b, ge$l:ge$r])
  graphics.off()
}


dcks = tbl(tblsrc,"voyages") %.%
  group_by(DCK) %.% summarize(count=n(),points=sum(points)) %.%
  arrange(-count) %.% collect()
dcks %.% filter(points<3000000,points>10000) %.%
  group_by(DCK) %.%
  do(function(row) {
    dck = row$DCK[1]
    filename = paste0("~/Dropbox/",dck,"-2.png")
    if (!file.exists(filename))  {
    plotSet(filename=filename,dck=dck,alpha=.04,lwd=3)
    }
})


## Reading the Maudy DB.md

      
    Raw
  

              Reading the Maudy DB.md
            
          
    From http://rpubs.com/benmschmidt/MauryClassification
# Oceans2
opts_chunk$set(eval = F)
rm(list = ls())
require(ggplot2)
## Loading required package: ggplot2
require(plyr)
## Loading required package: plyr
require(lubridate)
## Loading required package: lubridate
source("ICOADS parsing.R")
## Loading required package: grid
data = loadInData("~/shipping/ICOADS/maury.txt")
data = splitDataByVoyage(data)

data$color = as.character("#A65628")

source("~/gibbon/Map Functions.R")
require(plyr)

colors = data.frame(city = c("Boston", "New York", "Baltimore", "New Bedford", 
    "Sag Harbor", "Nantucket", "Salem", "Philadelphia", "Other"), color = c("green", 
    "#377EB8", "#FF7F00", "#4DAF4A", "#4DAF4A", "#4DAF4A", "#984EA3", "#F781BF", 
    "#A65628"))
for (city in colors$city[colors$city != "Other"]) {
    cat(city, "\n")
    data$color[grep(city, data$origin, ignore.case = T)] = as.character(colors$color[colors$city == 
        city])
    data$color[grep(city, data$destination, ignore.case = T)] = as.character(colors$color[colors$city == 
        city])
}
colors = colors[order(nchar(as.character(colors$city))), ]
colors$lat = seq(62, 32, length.out = nrow(colors))
colors$long = 80

offset = 200
data$whaling = NA
data$whaling[grepl("SAG HARBOR", data$origin) | grepl("SAG HARBOR", data$destination) | 
    grepl("NEW BEDFORD", data$destination) | grepl("NEW BEDFORD", data$origin) | 
    grepl("COLD SPRING", data$destination) | grepl("COLD SPRING", data$origin) | 
    grepl("NANTUCKET", data$origin) | grepl("NANTUCKET", data$destination) | 
    grepl("WHALING", data$origin) | grepl("WHALING", data$destination) | grepl("NEW LONDON", 
    data$origin) | grepl("NEW LONDON", data$destination)] = T

data$whaling[is.na(data$whaling)] = F
sort(-table(data$origin[data$whaling]))[1:15]
require(plyr)
plotWorld = Recenter(world, offset, idfield = "group")
plotData = Recenter(data, offset, shapeType = "segment", idfield = "voyageid")
Here's the chunk I used in my part that classifies whaling voyages:
data$bin = factor(round(data$lat/36) * 28000 + round(data$long/36) * 36)


ggplot(plotData[grepl("NEW BEDFORD", plotData$destination) | grepl("NEW BEDFORD", 
    plotData$origin), ], aes(x = long, y = lat, group = group)) + geom_path(alpha = 0.1, 
    color = "black") + geom_polygon(data = plotWorld, aes(group = group), fill = "grey") + 
    theme_nothing + coord_map(proj = "mollweide") + 
    labs(title = "The paths of ships that sailed from or to New Bedford in Maury's collection")

plotData$bin = factor(plotData$bin, levels = sample(levels(plotData$bin)))
ggplot(plotData, aes(x = long, y = lat, color = bin)) + geom_point(alpha = 0.1) + 
    annotation_map(plotWorld, aes(group = group), fill = "grey") +
    labs(title = "Each voyage is classified by the time it spend in these grid points") + 
    theme_nothing

control = plotData
control$voyageid = factor(control$voyageid)
ddply(control, .(whaling), function(frame) {
    length(unique(frame$voyageid))
})
keep = sample(unique(control$voyageid[!control$whaling]),
  length(unique(control$voyageid[control$whaling])))

control = control[control$whaling | control$voyageid %in% keep, ]
control$voyageid = factor(control$voyageid)
ddply(control, .(whaling), function(frame) {
    length(unique(frame$voyageid))
})

# Visual inspections confirms this works reasonably for
# whaling/nonwhaling:
ggplot(control, aes(x = long, y = lat, color = whaling, group = group)) + 
  geom_path(alpha = 0.1) + 
  geom_polygon(data = plotWorld, aes(group = group), color = "black") +
  theme_nothing + 
  coord_map(proj = "mollweide") +
  labs(title = "Training sets for whaling (blue) and non-whaling (red) voyages")


require(class)

crosscut = table(plotData$voyageid, plotData$bin)
# Not sure if normalizing or allowing whaling voyages to be naturally
# longer is better. Classifier accuracy is about the same.
crosscut = (crosscut/rowSums(crosscut))
class(crosscut) = "matrix"

training = crosscut[rownames(crosscut) %in% control$voyageid, ]
# applygroup = crosscut[!rownames(crosscut) %in% control$voyageid,]
applygroup = crosscut

classes = knn(training, training, control$whaling[match(rownames(training), 
    control$voyageid)], k = 3)

summary(classes)
# First round, only assign _new_ whaling voyages; no old ones.
classes[classes == FALSE & control$whaling[match(rownames(training), control$voyageid)]] = TRUE

classes = knn(training, training, classes, k = 3)

summary(classes)
# classes = knn(training,classes,k=5) classes =
# knn.cv(training,classes,k=5) classes = knn.cv(training,classes,k=5)

results = control[control$voyageid %in% rownames(training), ]
results$derived = classes[match(results$voyageid, rownames(training))]

testing = results[results$voyageid %in% unique(control$voyageid), ]

testing$derived = factor(testing$derived, labels = c("Classifier\nsays not whaling", 
    "Classifier\nsays whaling"))
testing$whaling = factor(testing$whaling, labels = c("Port-of-call\ndoesn't indicate whaling", 
    "Port-of-call\nindicates whaling"))

ggplot(testing, aes(x = long, y = lat, color = as.numeric(derived) == as.numeric(whaling), 
    group = group)) + geom_path(alpha = 0.31) + annotation_map(plotWorld, aes(group = group), 
    fill = "grey") + facet_grid(derived ~ whaling) + theme_nothing +
    labs(title = "Where the classifier disagrees with the port-of-call (red quadrants),\nthe classifier looks more accurate") + 
    coord_map(proj = "mercator", ylim = c(-60, 75))


plotData = Recenter(data, offset, shapeType = "segment", idfield = "voyageid")
plotData = plotData[!is.na(plotData$derived), ]

summary(plotData$derived)
ggplot(plotData[plotData$Year == 1849, ], aes(x = long, y = lat, color = derived, 
    group = group)) + geom_path(alpha = 0.31) + annotation_map(map = plotWorld, 
    aes(group = group), color = "black") +
      theme_nothing +
      labs(title = "Whaling (blue) and non-whaling voyages in 1849") + 
      coord_map(proj = "mollweide", ylim = c(-60, 75))

head(plotData)

dim(as.matrix(crosscut))

crosscut = table(plotData$voyageid, plotData$bin)
groups = kmeans(as.matrix(crosscut), centers = 9, iter.max = 25, nstart = 1)
clusters = groups$cluster
plotData$cluster = factor(clusters[match(plotData$voyageid, names(clusters))])


names = daply(plotData, .(cluster), function(cluster) {
    paste0("(", round(length(unique(cluster$voyageid))/length(levels(cluster$voyageid)) * 
        100, 1), "%)")
})

# These names will change with different kmeans clusters
levels(plotData$cluster) = c("Cape of Good Hope", "Around the Horn", "Near Atlantic", 
    "Europe and Back", "East Coast", "Hawaii", "Strait of Malacca", "Eastern Australia", 
    "Galapagos")

levels(plotData$cluster) = paste(levels(plotData$cluster), names)

ggplot(plotData, aes(x = long, y = lat, group = group)) + geom_path(alpha = 0.02, 
    color = "black") + annotation_map(map = plotWorld, aes(group = group), fill = "grey") + 
    theme_nothing + labs(title = "Clusters of voyages in the Maury collection") + 
    coord_map(proj = "mercator", ylim = c(-60, 75)) + facet_wrap(~cluster)


# Show common whaling ports
voyageCounts = plotData[!duplicated(plotData$voyageid), c("origin", "destination", 
    "derived")]
require(reshape2)
r = melt(voyageCounts, id.var = c("derived"))
head(r)
percents = ddply(r, .(value), function(cityframe) {
    data.frame(city = cityframe$value[1], total = nrow(cityframe),
      whalepercent = sum(as.logical(cityframe$derived == TRUE)) * 100/nrow(cityframe))
}, .progress = "text")
percents = percents[order(-percents$total), ]
head(percents)
capwords <- function(s, strict = FALSE) {
    cap <- function(s) paste(toupper(substring(s, 1, 1)), {
        s <- substring(s, 2)
        if (strict) 
            tolower(s) else s
    }, sep = "", collapse = " ")
    sapply(strsplit(s, split = " "), cap, USE.NAMES = !is.null(names(s)))
}
percents$city = capwords(as.character(percents$city), strict = TRUE)
ggplot(percents[1:150, ]) + geom_text(aes(y = whalepercent, x = total, label = city), 
    size = 3.5) + scale_x_log10() + labs(title = "Which ports are Whaling Ports?", 
    x = "Number of voyages mentioning location (log scale)",
    y = "Percent of voyages through location classified as whalers")
data$bin = factor(round(data$lat/14) * 28000 + round(data$long/14) * 14)
data$numericbin = as.numeric(data$bin)
modellable = dlply(data, .(voyageid), function(voyage) {
    tmp = voyage$numericbin - 1
    matrix(c(as.numeric(tmp), rep(1, length(as.numeric(tmp)))), nrow = 2, byrow = T)
}, .progress = "text")

prod(sapply(modellable, function(row) {
    nrow(row) == 2
}))
lda.collapsed.gibbs.sampler(documents = modellable, K = 15, unique(data$numericbin), 
    10, 0.1, 0.1)
require(reshape2)
day = median(plotData$time)
length(levels(plotData$voyageid))
day = day + 1
day = median(plotData$time)
day = day - 1000
day = 1
source("~/gibbon/Map Functions.R")
day = 17
SeasonalityPlots = llply(as.vector(1:365), function(day) {
    if (!file.exists(paste("~/shipping/ICOADS/images/MauryYear/", formatC(day, 
        digits = 7, format = "d", flag = "0"), ".png", sep = ""))) {
        altered = plotData
        if (day < 6) {
            altered$Year[altered$yearday > 355 & altered$yearday != 366] = altered$Year[altered$yearday > 
                355 & altered$yearday != 366] + 1
            altered$yearday[altered$yearday > 355 & altered$yearday != 366] = altered$yearday[altered$yearday > 
                355 & altered$yearday != 366] - 365
        }
        myplot = mapPlot(altered, mytime = day, timevar = "yearday", timelag = 4, 
            colorvar = "color", polygons = plotWorld, orientation = c(90, offset, 
                0), ylim = c(-70, 75), myalpha = 0.6, proj = "gall", lat0 = 10) + 
            theme_nothing + geom_text(data = colors, aes(label = city, color = color), 
            size = 4) + annotate("text", x = 260, y = 40, size = 6, label = paste(months(as.Date("1-1-1") + 
            day - 1), mday(as.Date("1-1-1") + day - 1), sep = "\n"), color = "yellow", 
            align = 1) + opts(plot.margin = unit(rep(0, 4), "lines"))
        # And then save it to a file.
        ggsave(filename = paste("~/shipping/ICOADS/images/MauryYear/", formatC(day, 
            digits = 7, format = "d", flag = "0"), ".png", sep = ""), plot = myplot, 
            width = 1920/200, height = 1080/200, dpi = 200)
    }
}, .progress = "text")
# in shell: for file in 0* ; do cp '$file' '3$file'; done; for file in 0*
# ; do cp '$file' '2$file'; done; for file in 0* ; do cp '$file' '1$file';
# done; Fill in from the middle to make prettier charts I can check on in
# progress:
days = sort(unique(plotData$time[plotData$Year >= 1825 & plotData$Year < 1855]))
# only plot even numbered days to keep the files smaller
days = sort(unique(plotData$time))
length(days)
days = arbitraryOrder(unique(data$time),2)
length(days)
AllPlots = llply(days,function(day) {

  if (!file.exists(paste("~/shipping/ICOADS/images/Maury/",
                         formatC(day,digits=7,format='d',flag='0')
                         ,".png",sep=""))) {
    myplot = 
      mapPlot(
        plotData,
        mytime=day,
        timevar="time",
        timelag=45,
        colorvar='color',
        polygons=plotWorld,
        orientation=c(90,offset,0),
        ylim=c(-70,75),
        xlim=range(plotData$long),
        myalpha=.9,
        proj='gall',lat0=10) + 
          theme_nothing+ geom_text(data=colors,aes(label=city,color=color),size=4) +
          annotate("text",x=260,y=40,size=6,
                   label=paste(months(as.Date("1-1-1") + day-1),year(as.Date("1-1-1") + day)-1,sep="\n"),
                   color='yellow',align=1) + 
                     theme(plot.margin = unit(rep(0, 4), "lines"))
    ggsave(filename=paste("~/shipping/ICOADS/images/Maury/",
                          formatC(day,digits=7,format='d',flag='0')
                          ,".png",sep=""), plot=myplot,width=1920/200,height=1080/200,dpi = 200)
  }
},.progress="text")
?ggsave

whaleData = plotData[plotData$derived==TRUE,]

days = sort(unique(whaleData$time[
  whaleData$Year>=1830 & whaleData$Year <= 1855]))
days = arbitraryOrder(days,2)
whaleData$color='red'
dim(whaleData)


require(grid)
require(lubridate)
day = day+3000
require(grid)
WhalePlots = llply(days,function(day) {
  if (!file.exists(paste("~/shipping/ICOADS/images/whalescar/",
                        formatC(day,digits=7,format='d',flag='0'),".png",sep=""))) {
  myplot = 
    mapPlot(
        whaleData,
        mytime=day, timevar="time",
        timelag=14,colorvar='color',landfill='#F3E0BD',
        polygons=plotWorld,
        orientation=c(90,offset,0),
        ylim=c(-70,75),
        xlim=range(plotData$long), myalpha=.9,
        proj='mollweide') + 
          theme_nothing +
          annotate("text",x=260,y=40,size=3.5,
                   label=paste(months(as.Date("1-1-1") + day-1),
                               year(as.Date("1-1-1") + day)-1,sep="\n"),
                   color='black',align=1) + 
                     theme(plot.margin = unit(rep(0, 4), "lines"))  + 
        scale_color_manual(values="red")  
    #And then save it to a file.
  myplot = myplot + geom_path(
    data=whaleData[whaleData$time<day,],
    color='black',lwd=.3,alpha=.05,aes(group=group))
  myplot
    ggsave(filename=paste("~/shipping/ICOADS/images/whalescar/",
                          formatC(day,digits=7,format='d',flag='0')
                          ,".png",sep=""), plot=myplot,width=1920/200,height=1080/200,dpi = 200)
    }
},.progress="text")

ggplot()
require(plyr)
require(ggplot2)
require(lubridate)

#plotting Melville's voyages
head(plotData[plotData$derived,])
ggplot(plotData[plotData$derived==TRUE,],aes(x=long,y=lat,group=group)) +
  geom_path(alpha=.02,color='black') +
  annotation_map(map=plotWorld,aes(group=group),
        fill='grey') + 
        theme_nothing + 
        labs(title="The Voyages of the Acushnet in the context of American Whaling\n(All ships in black; the Acushnet in green; and Melville's time on board in red)") + 
        coord_map(proj='mollweide',ylim=c(-60,75)) + geom_path(data=plotData[grep("ACUSHNET",plotData$ID),],alpha=.5,color='green',lwd=3)+ geom_path(data=plotData[grepl("ACUSHNET",plotData$ID) & 
    plotData$Year >=1841 &
    plotData$time <= 672988,],alpha=1,color='red',lwd=3)


ggplot(plotData[plotData$Year < 1815,],aes(x=long,y=lat,group=group)) +
  geom_path(alpha=.22,color='black') +
  annotation_map(map=plotWorld,aes(group=group),
        fill='grey') + 
        theme_nothing + 
        labs(title="The 1803-1805 voyages of the Essex (in red)\nin the context of pre-1815 American shipping") + 
        coord_map(proj='mollweide',ylim=c(-60,75)) +
        geom_path(data=plotData[grepl("ESSEX",plotData$ID) & plotData$Year >=1800,],
            alpha=1,
            color='red',
            lwd=3)


melvilleLength=as.Date("1842-7-31")-as.Date("1841-1-1")


plotData$origin[grep("ESSEX",plotData$ID)]


shipDates=plotData$date[grep("ACUSHNET",plotData$ID)]

dates = as.Date("1840-1-1"):max(shipDates)
as.Date("1840-1-1")-dates[1]
ship = data.frame(date = dates,variable="Have Ship Data for Acushnet",value=0)
ship$value[ship$date %in% shipDates] = 1
melvilleaboard = as.Date("1841-1-1"):as.Date("1842-7-31")
melville =data.frame(date = dates,variable="Melville On Acushnet",value=0)
melville$value[melville$date %in% melvilleaboard] = 1

mergt = rbind(ship,melville)
head(mergt)
mergt$date=as.Date(mergt$date,origin=as.Date("1970-01-01"))
ggplot(mergt,aes(x=date,y=variable,alpha=value)) +
    geom_tile() +
    scale_x_date() +
    theme(legend.position="None")+
    labs(y="",title="We have nearly-continuous logbooks from the Acushnet except for the beginning of Melville's voyage")


ggplot(data[data$destination %in% names(sort(-table(data$destination)))[1:10],],
    aes(x=destination))+geom_bar()

tmp=data[data$voyageid %in% 
  unique(data$voyageid[
    data$lat > -10 & data$lat < 0 & data$long >85 & data$lat < 95]),]
ggplot(data,aes(x=long,y=lat)) + geom_hex(bins=133)+ 
  theme_nothing() + 
  opts(title="Density of log readings in the Maury collection", 
        legend.position="right",
        panel.background = theme_rect(fill='grey', colour='grey'),
        panel.border = theme_blank())  +
    scale_fill_gradient("number of observations",trans='log',low="white",high="steelblue") 
data$yeargrp = factor(floor(data$Year/10)*10)
data$yeargrp[as.numeric(as.character(data$yeargrp))<1820] = "1820" 
data$yeargrp[as.numeric(as.character(data$yeargrp))>1850] = "1850" 

data$yeargrp = factor(data$yeargrp)

data$cities = NA
cities = c("NEW ORLEANS")
for (city in cities) {
  data$cities[grepl(city,data$origin) | grepl(city,data$destination)]=city
}
data$cities = factor(data$cities)
ggplot(coordShift(data[!is.na(data$cities),],0),
       aes(x=long,y=lat,group=voyageid)) + 
  geom_path(alpha=.1) + theme_nothing() + 
  opts(title="Ship paths from Maury's collection (ICOANS data)",  
       panel.background = theme_rect(fill='white', colour='white'), 
       panel.border = theme_blank())   +  
         annotate_map(data=world,aes(x=long,y=lat,group=id),drop=T,fill='lightgreen') + 
         coord_map(proj='mercator',orientation=c(90,0,0)) + facet_grid(yeargrp~cities)

max(world$long)
coordShift =function(myframe,meridian) {
  myframe$long[myframe$long>meridian+180] = myframe$long[myframe$long>meridian+180]-360
  myframe$long[myframe$long<=meridian-180] = myframe$long[myframe$long<=meridian-180]+360
  myframe
}
qplot(coordShift(world,180)$long)

melville = data[grepl("ACUSHNET",data$ID) & data$Year>=1841 & data$time < 687270,c(1:8,14)]
	rm(list=ls())
	source("SQLFunctions.R")

	plotSet = function(filename,dck=701,alpha=.01,lwd=8) {
	library(grid)
	paths = tbl(tblsrc,"paths") %.%
	filter(DCK==dck) %.%
	arrange(voyagenum,yearday) %.%
	select(LON,LAT,voyagenum) %.%
	collect()

	paths = paths %.% mutate(LON=LON+180)
	paths$LON[paths$LON<0] = paths$LON[paths$LON<0] + 360
	paths$LON[paths$LON>360] = paths$LON[paths$LON>360] - 360

	#break groups that cross from 0 to 360 or vice versa, which otherwise leave streaks
	paths$split = c(0,abs(paths$LON[-1]-paths$LON[1:(nrow(paths)-1)])>20)
	paths$segment = 100*paths$voyagenum + cumsum(paths$split)


	#desperately try to remove all elements: code stole from stack overflow, somewhere.
	new_theme_empty <- theme_bw()
	new_theme_empty$line <- element_blank()
	new_theme_empty$rect <- element_blank()
	new_theme_empty$strip.text <- element_blank()
	new_theme_empty$axis.text <- element_blank()
	new_theme_empty$plot.title <- element_blank()
	new_theme_empty$axis.title <- element_blank()
	new_theme_empty$plot.margin <- structure(c(0, 0, -1, -1),
	unit = "lines", valid.unit = 3L, class = "unit")

	plotting = paths

	#Actually make the plot:
	plot = ggplot(plotting,
	aes(x=LON,y=LAT,group=segment)) +
	geom_path(alpha=alpha,lwd=lwd) +
	labs(x="",y="",title="") +
	theme(line=element_blank(),
	rect=element_blank(),
	axis.text=element_blank()
	)+ coord_equal() + scale_x_continuous(expand=c(0,0),limits=c(0,360)) +
	scale_y_continuous(expand=c(0,0),limits=c(-90,90))

	png(filename=filename,width=40962,height=20482)

	#plotting code from: [whoops forgot to paste it in, but SO somewhere again]
	#plots just the central panel, which makes it compatible with the format Science on a Sphere wants.
	gt <- ggplot_gtable(ggplot_build(plot))
	ge = subset(gt$layout,name=="panel")
	grid.draw(gt[ge$t:ge$b, ge$l:ge$r])
	graphics.off()
	}



	dcks = tbl(tblsrc,"voyages") %.%
	group_by(DCK) %.% summarize(count=n(),points=sum(points)) %.%
	arrange(-count) %.% collect()
	dcks %.% filter(points<3000000,points>10000) %.%
	group_by(DCK) %.%
	do(function(row) {
	dck = row$DCK[1]
	filename = paste0("~/Dropbox/",dck,"-2.png")
	if (!file.exists(filename)) {
	plotSet(filename=filename,dck=dck,alpha=.04,lwd=3)
	}
	})