Skip to content

Instantly share code, notes, and snippets.

@zackw
Created October 30, 2015 00:30
Show Gist options
  • Save zackw/f2e74a8d7b31baa88002 to your computer and use it in GitHub Desktop.
Save zackw/f2e74a8d7b31baa88002 to your computer and use it in GitHub Desktop.
Compute the growth rate of Unicode.
#! /usr/bin/Rscript
suppressPackageStartupMessages({
library(ggplot2)
library(scales)
library(grid)
})
# Observed data: one row per entry transcribed from
# https://en.wikipedia.org/wiki/Unicode#Versions (as of 29 Oct 2015).
#   date   - date of the entry (every value is the first day of a month)
#   nchars - character count recorded for that entry
#   what   - series label; every observed row is tagged "Actual" so it can
#            share a legend with the predicted series built later on
ugrowth.d <- data.frame(
  date = as.Date(c(
    "1991-10-01", "1992-06-01", "1993-06-01", "1996-07-01",
    "1998-05-01", "1999-09-01", "2001-03-01", "2002-03-01",
    "2003-04-01", "2005-03-01", "2006-07-01", "2008-04-01",
    "2009-10-01", "2010-10-01", "2012-01-01", "2012-09-01",
    "2013-09-01", "2014-06-01", "2015-06-01"
  )),
  nchars = c(
      7161,  28359,  34233,  38950,
     38952,  49259,  94205,  95221,
     96447,  97720,  99089, 100713,
    107361, 109449, 110181, 110182,
    110187, 113021, 120737
  )
)
# A length-one factor is recycled to every row of the data frame.
ugrowth.d$what <- factor("Actual")
# Cutoff for the "recent" model: it falls just before the last large jump
# in the number of assigned code points, so rows dated after it describe
# growth since that jump.
mat.cutoff <- as.Date("2000-01-01")

# Total number of *publicly assignable* code points:
#   1,114,112  code points reachable by UTF-16
#   -      66  permanently unassigned
#   -   2,048  surrogates
#   - 137,468  reserved for private use
max.codepoint <- 1114112 - 66 - 2048 - 137468

# Same accounting for the BMP alone:
#   65,536 code points, minus 34 permanently unassigned, 2,048 surrogates
#   and 6,400 reserved for private use.
max.bmp <- 65536 - 34 - 2048 - 6400

# Every supplemental plane has 65,536 code points, 2 of which are
# permanently unassigned. At present two supplemental planes are entirely
# private-use; they are already accounted for in the private-use figure
# subtracted from max.codepoint above.
max.sup <- 65536 - 2
# Two straight-line fits of character count against date:
#   m.all - every observation
#   m.mat - only observations dated after mat.cutoff
m.all <- lm(nchars ~ date, data = ugrowth.d)
m.mat <- lm(nchars ~ date, data = subset(ugrowth.d, date > mat.cutoff))

# Dates at which to evaluate the fits: one per decade from 1991-10-01
# through 2991-10-01. The "recent" model is only evaluated after its cutoff.
p.dates <- data.frame(
  date = seq.Date(as.Date("1991-10-01"), as.Date("2991-10-01"),
                  by = "10 years")
)
p.mdate <- subset(p.dates, date > mat.cutoff)

# `newdata` is spelled out in full rather than relying on partial argument
# matching of `new`. With interval = "prediction", predict() returns three
# columns (point estimate, lower bound, upper bound), renamed below so they
# line up with the observed data frame and with the ymin/ymax aesthetics.
p.all <- as.data.frame(
  predict(m.all, newdata = p.dates, interval = "prediction")
)
p.mat <- as.data.frame(
  predict(m.mat, newdata = p.mdate, interval = "prediction")
)
colnames(p.all) <- c("nchars", "lb", "ub")
colnames(p.mat) <- c("nchars", "lb", "ub")

# Series labels used for the legend.
p.all$what <- factor(c("Predicted (all)"))
p.mat$what <- factor(c("Predicted (since 2000)"))

# Long-format table of both predicted series: date, nchars, lb, ub, what.
ugrowth.p <- rbind(cbind(p.dates, p.all), cbind(p.mdate, p.mat))
# Layers, colour scales and legend styling shared by the full-range plot
# and the inset: observed values as points, each predicted series as a line
# with a ribbon spanning its lb..ub interval.
baseplot <- ggplot() +
  geom_point(
    data = ugrowth.d,
    mapping = aes(x = date, y = nchars, colour = what, fill = what)
  ) +
  # stat = "identity": draw the precomputed nchars/lb/ub values as-is
  # instead of letting geom_smooth fit its own smoother.
  geom_smooth(
    data = ugrowth.p,
    mapping = aes(x = date, y = nchars, ymin = lb, ymax = ub,
                  colour = what, fill = what),
    stat = "identity"
  ) +
  # The same three colours are used for outlines and fills.
  scale_fill_manual(values = c("#000000", "#66c2a5", "#8da0cb")) +
  scale_colour_manual(values = c("#000000", "#66c2a5", "#8da0cb")) +
  # Untitled legend anchored by its top-left corner at position (0, 1).
  # NOTE(review): recent ggplot2 releases deprecate a numeric
  # legend.position in favour of legend.position.inside -- confirm against
  # the installed version before changing.
  theme(
    legend.justification = c("left", "top"),
    legend.position = c(0, 1),
    legend.background = element_rect(fill = "#f8f8f8"),
    legend.title = element_blank()
  )
# Full-range view: 1991 through 2541 on the x axis, and the whole
# publicly assignable code space on the y axis.
fullplot <-
  baseplot +
  scale_y_continuous(
    "Assigned codepoints",
    expand = c(0, 0),
    # Ticks at zero, at the BMP capacity, and after each of 14 further
    # supplemental planes. With the constants defined earlier in this file
    # the 14th plane boundary equals max.codepoint, so unique() keeps that
    # value from appearing twice in the break vector.
    breaks = unique(c(0, max.bmp, max.bmp + (1:14) * max.sup,
                      max.codepoint))
  ) +
  scale_x_date(
    "Year",
    labels = date_format("%Y"),
    expand = c(0.01, 0),
    limits = c(as.Date("1991-10-01"), as.Date("2541-10-01")),
    # Six evenly spaced ticks across the x range.
    breaks = seq.Date(as.Date("1991-10-01"),
                      as.Date("2541-10-01"),
                      length.out = 6)
  ) +
  # Visible y range padded by a quarter of a supplemental plane on each side.
  coord_cartesian(ylim = c(-max.sup/4, max.codepoint + max.sup/4))
# Close-up view: 1991 through 2021 on the x axis, and the BMP plus one
# supplemental plane on the y axis. It is drawn inside the full plot, so
# the legend and axis titles are dropped.
insetplot <- baseplot +
  scale_y_continuous(
    "Assigned codepoints",
    expand = c(0, 0),
    # Ticks at zero, half and full BMP capacity, then half-way through and
    # at the end of the first supplemental plane.
    breaks = c(
      0,
      max.bmp/2,
      max.bmp,
      max.bmp + max.sup/2,
      max.bmp+max.sup
    )
  ) +
  scale_x_date(
    "Year",
    expand = c(0.01, 0),
    labels = date_format("%Y"),
    breaks = seq.Date(as.Date("1991-10-01"),
                      as.Date("2021-10-01"),
                      length.out = 6),
    limits = c(as.Date("1991-10-01"), as.Date("2021-10-01"))
  ) +
  # Visible y range padded by a twelfth of a supplemental plane on each side.
  coord_cartesian(ylim = c(-max.sup/12, max.bmp + max.sup + max.sup/12)) +
  theme(
    axis.title = element_blank(),
    legend.position = "none"
  )
# Write unicode-growth-rate.png: the full-range plot with the close-up
# plot overlaid near its bottom-right corner.
#
# annotation_custom ... just doesn't work, and I don't understand why.
# We do it the hard way instead: draw the full plot on a fresh grid page,
# then push a viewport for the inset and draw the second plot inside it.
png("unicode-growth-rate.png", width = 1900, height = 1100, res = 96)
grid.newpage()
grid.draw(ggplotGrob(fullplot))
# The viewport's bottom-right corner sits at (0.98, 0.06) of the page and
# it spans 38% of the page width and 36% of its height. `width`/`height`
# are spelled out in full rather than relying on partial argument matching
# of `w`/`h`.
pushViewport(viewport(x = 0.98, y = 0.06, width = 0.38, height = 0.36,
                      just = c("right", "bottom")))
grid.draw(ggplotGrob(insetplot))
popViewport()
# dev.off() returns the newly active device; hide it from the console.
invisible(dev.off())
@zackw
Copy link
Author

zackw commented Mar 29, 2016

cshndbwwuaaxlkm png large

@felixdivo
Copy link

😁 Nice!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment