Skip to content

Instantly share code, notes, and snippets.

@timriffe
Last active August 29, 2015 13:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save timriffe/9220868 to your computer and use it in GitHub Desktop.
Save timriffe/9220868 to your computer and use it in GitHub Desktop.
Testing HMD death counts for conormity with Benford's law
# 1) install DemogBerkeley package from github
library(devtools)
install_github("DemogBerkeley", subdir = "DemogBerkeley", username = "UCBdemography")
library(DemogBerkeley)
library(parallel)
# handy-dandy function to get a vector of the country codes:
CNTRIES <- getHMDcountries()
# ----------------------------------------
# define a couple functions for testing Benford's law
# takes a vector of numbers and produces a table of the first digits
get1stdigittable <- function(x){
xx <- unlist(lapply(strsplit(as.character(x),split=""),
function(xxx){
xxx[xxx %in% as.character(1:9)][1]
}))
table(as.integer(xx[xx %in% as.character(1:9)]))
}
# gives discrete density of digits, per Benford's law:
dbenford <- function(d,base=10){
log(1+1/d,base=base)
}
# first image: the basic pattern
getwd()
png("/hdir/0/triffe/workspace/other/PLOTS/Benford.png")
barplot(dbenford(1:9),names.arg=1:9,main="Benford's law",xlab="digit",ylab="density")
dev.off()
# now grab data live from web using functions from DemogBerkeley:
user <- userInput() # username (select lines one at a time in R.) - this avoids you writing your info in an R script
pw <- userInput() # password
# iterate over countries, grabbing Deaths_lexis, and making the digits table for deaths (Male and Female columns)
DeathDigits <-do.call(rbind,mclapply(CNTRIES, function(XYZ,user,pw){
DD <- readHMDweb(XYZ,"Deaths_lexis",username=user,password=pw)
get1stdigittable(c(DD$Male,DD$Female))
},mc.cores=4,user=user,pw=pw)) # (change cores according to your machine.)
# how many ?
sum(DeathDigits)
rownames(DeathDigits) <- CNTRIES
# get discrete empirical density for each country
DeathDigPDF <- DeathDigits / rowSums(DeathDigits)
#fields::image.plot(t(DeathDigPDF-dbenford(1:9)))
#dbenford(1:9)
# second figure: compare all
png("/hdir/0/triffe/workspace/other/PLOTS/BenfordHMD1.png")
matplot(1:9,t(DeathDigPDF),type='l',col="#88888860",lty=1,main="First digit distributions, HMD",xlab="digit",ylab="density")
lines(1:9,dbenford(1:9),col="red",lwd=3)
lines(1:9,colSums(DeathDigits) / sum(DeathDigits), col="blue",lty=2,lwd=3)
legend("topright",lty=c(1,1,2),col=c("#88888860","blue","red"), lwd=c(1,2,1),legend = c("HMD country","All HMD","Benford"))
dev.off()
# get discrete distribution difference:
rownames(DeathDigPDF) <- CNTRIES
SEP <- colSums(abs(t(DeathDigPDF) - dbenford(1:9)))/2
# ranked dot plot
png("/hdir/0/triffe/workspace/other/PLOTS/BenfordHMD2.png",width=400,height=600)
dotchart(SEP[order(SEP)], main = "Difference from Benford [0,1]")
dev.off()
# test for abridged-age death counts, to compare:
DeathDigits5 <-do.call(rbind,mclapply(CNTRIES, function(XYZ,user,pw){
DD <- readHMDweb(XYZ,"Deaths_5x1",username=user,password=pw)
get1stdigittable(c(DD$Male,DD$Female))
},mc.cores=4,user=user,pw=pw))
rownames(DeathDigits5) <- CNTRIES
DeathDigPDF5 <- DeathDigits5 / rowSums(DeathDigits5)
rownames(DeathDigPDF5) <- CNTRIES
SEP <- colSums(abs(t(DeathDigPDF5) - dbenford(1:9)))/2
png("/hdir/0/triffe/workspace/other/PLOTS/BenfordHMD3.png",width=400,height=600)
dotchart(SEP[order(SEP)], main = "Difference from Benford [0,1] (abridged counts)")
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment