Last active
August 29, 2015 13:56
-
-
Save timriffe/9220868 to your computer and use it in GitHub Desktop.
Testing HMD death counts for conormity with Benford's law
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 1) install DemogBerkeley package from github | |
library(devtools) | |
install_github("DemogBerkeley", subdir = "DemogBerkeley", username = "UCBdemography") | |
library(DemogBerkeley) | |
library(parallel) | |
# handy-dandy function to get a vector of the country codes: | |
CNTRIES <- getHMDcountries() | |
# ---------------------------------------- | |
# define a couple functions for testing Benford's law | |
# takes a vector of numbers and produces a table of the first digits | |
get1stdigittable <- function(x){ | |
xx <- unlist(lapply(strsplit(as.character(x),split=""), | |
function(xxx){ | |
xxx[xxx %in% as.character(1:9)][1] | |
})) | |
table(as.integer(xx[xx %in% as.character(1:9)])) | |
} | |
# gives discrete density of digits, per Benford's law: | |
dbenford <- function(d,base=10){ | |
log(1+1/d,base=base) | |
} | |
# first image: the basic pattern | |
getwd() | |
png("/hdir/0/triffe/workspace/other/PLOTS/Benford.png") | |
barplot(dbenford(1:9),names.arg=1:9,main="Benford's law",xlab="digit",ylab="density") | |
dev.off() | |
# now grab data live from web using functions from DemogBerkeley: | |
user <- userInput() # username (select lines one at a time in R.) - this avoids you writing your info in an R script | |
pw <- userInput() # password | |
# iterate over countries, grabbing Deaths_lexis, and making the digits table for deaths (Male and Female columns) | |
DeathDigits <-do.call(rbind,mclapply(CNTRIES, function(XYZ,user,pw){ | |
DD <- readHMDweb(XYZ,"Deaths_lexis",username=user,password=pw) | |
get1stdigittable(c(DD$Male,DD$Female)) | |
},mc.cores=4,user=user,pw=pw)) # (change cores according to your machine.) | |
# how many ? | |
sum(DeathDigits) | |
rownames(DeathDigits) <- CNTRIES | |
# get discrete empirical density for each country | |
DeathDigPDF <- DeathDigits / rowSums(DeathDigits) | |
#fields::image.plot(t(DeathDigPDF-dbenford(1:9))) | |
#dbenford(1:9) | |
# second figure: compare all | |
png("/hdir/0/triffe/workspace/other/PLOTS/BenfordHMD1.png") | |
matplot(1:9,t(DeathDigPDF),type='l',col="#88888860",lty=1,main="First digit distributions, HMD",xlab="digit",ylab="density") | |
lines(1:9,dbenford(1:9),col="red",lwd=3) | |
lines(1:9,colSums(DeathDigits) / sum(DeathDigits), col="blue",lty=2,lwd=3) | |
legend("topright",lty=c(1,1,2),col=c("#88888860","blue","red"), lwd=c(1,2,1),legend = c("HMD country","All HMD","Benford")) | |
dev.off() | |
# get discrete distribution difference: | |
rownames(DeathDigPDF) <- CNTRIES | |
SEP <- colSums(abs(t(DeathDigPDF) - dbenford(1:9)))/2 | |
# ranked dot plot | |
png("/hdir/0/triffe/workspace/other/PLOTS/BenfordHMD2.png",width=400,height=600) | |
dotchart(SEP[order(SEP)], main = "Difference from Benford [0,1]") | |
dev.off() | |
# test for abridged-age death counts, to compare: | |
DeathDigits5 <-do.call(rbind,mclapply(CNTRIES, function(XYZ,user,pw){ | |
DD <- readHMDweb(XYZ,"Deaths_5x1",username=user,password=pw) | |
get1stdigittable(c(DD$Male,DD$Female)) | |
},mc.cores=4,user=user,pw=pw)) | |
rownames(DeathDigits5) <- CNTRIES | |
DeathDigPDF5 <- DeathDigits5 / rowSums(DeathDigits5) | |
rownames(DeathDigPDF5) <- CNTRIES | |
SEP <- colSums(abs(t(DeathDigPDF5) - dbenford(1:9)))/2 | |
png("/hdir/0/triffe/workspace/other/PLOTS/BenfordHMD3.png",width=400,height=600) | |
dotchart(SEP[order(SEP)], main = "Difference from Benford [0,1] (abridged counts)") | |
dev.off() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment