hpiwowar/ASIST2011_stats.R

## ASIST2011_stats.R
### Heather Piwowar
### MIT open license.
### blog post about interm results:

library(rms)

# if true, draw figures on screen instead of in a file
SCREEN=TRUE

# set to ".eps" or ".png"
FORMAT=".pdf"

# PLoS standards from http://www.plosone.org/static/figureGuidelines.action#quickref
MAX.FIGURE.WIDTH = 6.83
MAX.FIGURE.HEIGHT = 8

setup.figure = function
### Set up a graphics object for plotting a journal-ready .eps figure
### call this, then do the plot, then call dev.off()
(
    filename,   ##<< filename for resulting eps file.  Will get .eps or .png or .pdf appended
    width=6.83, ##<< figure width, in inches
    height=8,    ##<< figure height, in inches
    format=".eps", ##<< ".eps" or ".png" or ".pdf"
    screen=FALSE
) {
    if (screen) {
        quartz()
    } else {
      filename.with.extension = paste(filename, format, sep="")
      if (format==".eps") {
          postscript(file=filename.with.extension,
                      paper="special",
                      width=width,
                      height=height,
                      family="sans",
                      onefile=FALSE,
                      pagecentre=FALSE,
                      horizontal=FALSE)
        } else if (format==".png") {
            png(filename.with.extension)
        } else if (format==".pdf") {
            pdf(filename.with.extension)
        }
    }
    return()

   ##examples<<
   setup.figure("figure1", 6, 8)
   plot(c(1, 2, 3), c(2, 4, 6))
   close.figure("figure1")
}


close.figure = function
### Closes a graphics object and embeds fonts in he associated .eps figure
### To be used with setup.figure
(
    filename,    ##<< filename of eps file.  Will get .eps appended
    format=".eps", ##<< ".eps" or ".png"
    screen=FALSE
) {
    if (!screen) {
        dev.off()
        if (format==".eps") {
            filename.with.extension = paste(filename, ".eps", sep="")
            embedFonts(filename.with.extension, outfile=filename.with.extension)
        }
    }
    return()

   ##examples<<
   setup.figure("figure1", 6, 8)
   plot(c(1, 2, 3), c(2, 4, 6))
   close.figure("figure1")
}


read.raw.data = function
### Read the raw text data file
### change underscores in column names to periods
### return as a data frame
(
    filename="rawdata.txt"  ##<< name of raw data
)
{
    dat.raw = read.csv(filename, header=TRUE, sep="\t", stringsAsFactors=FALSE)
    names(dat.raw) = gsub("_", ".", names(dat.raw))
    #names.pretty = cat(names(dat.raw), sep="\n")
    #print names.pretty
    #save(dat.raw, file="dat_raw.Rdata")
    #load("dat_raw.Rdata")

    #dim(dat.raw)
    #summary(dat.raw)
    #names(dat.raw)
    #names(dat.raw)[0:20]
    return(dat.raw)
}

# Don't do transformations for this analysis.  Use ranks, etc.
tr = function(x) return(x)
log.tr = function(x) return(x)

#### Some of my variables are extracted by looking to see if MEDLINE records have a MeSH term.
# This is fine, but sometimes MEDLINE records are incomplete, they are in process or aren't on the list
# of journals indexed by MeSH indexers.  This means a lack of relevant MeSH term does not mean the MeSH
# term does not apply.  Thus, must replace these 0s with NAs to indicate the data is indeed missing
medline.values = function(raw_values, medline_status) {
    values = raw_values
    values[medline_status!="indexed for MEDLINE"] = NA
    return(values)
}

preprocess.raw.data = function
### Do preprocessing on raw data
### change variables to ordered factors, integers, etc as appropriate
### return cleaned data as a data frame
(
    dat.raw ##<< data frame containing raw data
)
{
    ####  CONVERT STRINGS TO NUMBERS
    dat.nums = get.dat.nums(dat.raw)  ##<<details this produces warnings

    #library(psych)
    #described.dat.nums = psych::describe(dat.nums)
    #write.table(described.dat.nums,"described_dat_nums.txt",append=F,quote=F,sep="\t",row.names=T)

    #### START TO BUILD STATS DATASET
    dat = data.frame(pmid = as.numeric(dat.nums$pmid))

    # skip issn
    # skip essn
    # My naming convention is that "ago" means number of years between 2010 and the original variable
    dat$years.ago = tr(2010 - dat.nums$pubmed.year.published)
    # skip date_in_pubmed
    # skip authors
    # Break journal into top contenders
    #dat$journal.bmc.genomics                = ifelse("BMC Genomics" == dat.raw$pubmed.journal, 1, 0)
    #dat$journal.cancer.res                  = ifelse("Cancer Res" == dat.raw$pubmed.journal, 1, 0)
    #dat$journal.pnas                        = ifelse("Proc Natl Acad Sci U S A" == dat.raw$pubmed.journal, 1, 0)
    #dat$journal.j.biol.chem                 = ifelse("J Biol Chem" == dat.raw$pubmed.journal, 1, 0)
    #dat$journal.physiol.genomics            = ifelse("Physiol Genomics" == dat.raw$pubmed.journal, 1, 0)
    #dat$journal.plos.one                    = ifelse("PLoS One" == dat.raw$pubmed.journal, 1, 0)

    dat$num.authors = tr(dat.nums$pubmed.number.authors)

    ####### FIRST AUTHOR VARIABLES

    # skip first_author_first_name
    dat$first.author.female                 = ordered(dat.nums$author.first.author.female)
    dat$first.author.female[which(pmax(dat.nums$author.first.author.male, dat.nums$author.first.author.female) <= 0)] = NA

    # Removing male because so highly correlated with female
    #dat$first.author.male                   = ordered(dat.nums$author.first.author.male)
    dat$first.author.gender.not.found       = ordered(dat.nums$author.first.author.gender.not.found)
    dat$first.author.num.prev.pubs       = tr(dat.nums$first.author.num.prev.pubs)
    dat$first.author.num.prev.pmc.cites  = tr(dat.nums$first.author.num.prev.pmc.cites)

    dat$first.author.year.first.pub.ago  = tr( 2010 - dat.nums$first.author.year.first.pub)
    # NAs are either because there was no cluster, or the cluster included no prior papers
    # (unfortunately, I didn't collect an indication of which was true, but in theory there should
    # be a cluster for all papers published before 2009 Author-ity data was run, so I will
    # assume it is the latter, that there we no prior papers.)
    # I will code no prior papers as zero years since the prior paper
    dat$first.author.year.first.pub.ago[is.na(dat.nums$first.author.year.first.pub)] = tr(0)

    dat$first.author.num.prev.microarray.creations = tr(dat.nums$first.author.num.prev.microarray.creations)
    dat$first.author.num.prev.oa         = tr(dat.nums$first.author.num.prev.oa)
    dat$first.author.num.prev.other.sharing  = tr(
                                                    dat.nums$first.author.num.prev.genbank.sharing +
                                                    dat.nums$first.author.num.prev.pdb.sharing +
                                                    dat.nums$first.author.num.prev.swissprot.sharing)
    dat$first.author.num.prev.geoae.sharing = tr(dat.nums$first.author.num.prev.geoae.sharing)
    dat$first.author.num.prev.geo.reuse  = tr(dat.nums$first.author.num.prev.geo.reuse)
    dat$first.author.num.prev.multi.center = tr(dat.nums$first.author.num.prev.multi.center)
    # skip first.author.prev.meta.analysis because not enough > 0

    # set to NA if the number of publications is unrealistically big, because this probably means that
    # Author-ity combined two or more authors together.  Better to set to NA and deal with bias than introduce
    # such large numbers into our data
    first.author.thresh.cluster.size = quantile(dat$first.author.num.prev.pubs, .98, na.rm=TRUE)
    first.author.unrealistic.pub.clusters = which(dat$first.author.num.prev.pubs > first.author.thresh.cluster.size)
    dat$first.author.num.prev.pubs[first.author.unrealistic.pub.clusters] = NA
    dat$first.author.num.prev.pmc.cites[first.author.unrealistic.pub.clusters] = NA
    dat$first.author.year.first.pub.ago[first.author.unrealistic.pub.clusters] = NA
    dat$first.author.num.prev.microarray.creations[first.author.unrealistic.pub.clusters] = NA
    dat$first.author.num.prev.oa[first.author.unrealistic.pub.clusters] = NA
    dat$first.author.num.prev.other.sharing [first.author.unrealistic.pub.clusters] = NA
    dat$first.author.num.prev.geoae.sharing[first.author.unrealistic.pub.clusters] = NA
    dat$first.author.num.prev.geo.reuse [first.author.unrealistic.pub.clusters] = NA
    dat$first.author.num.prev.multi.center[first.author.unrealistic.pub.clusters] = NA


    ####### LAST AUTHOR VARIABLES

    # skip last_author_first_name
    dat$last.author.female                 = ordered(dat.nums$last.first.author.female)
    dat$last.author.female[which(pmax(dat.nums$last.first.author.male, dat.nums$last.first.author.female) <= 0)] = NA

    # Removing male because so highly correlated with female
    #dat$last.author.male                   = ordered(dat.nums$last.first.author.male)
    dat$last.author.gender.not.found       = ordered(dat.nums$last.first.author.gender.not.found)
    dat$last.author.num.prev.pubs        = tr(dat.nums$last.author.num.prev.pubs)
    dat$last.author.num.prev.pmc.cites   = tr(dat.nums$last.author.num.prev.pmc.cites)

    dat$last.author.year.first.pub.ago   = tr(2010 - dat.nums$last.author.year.first.pub)
    # See comment on first author above for treatment of NAs for this variable
    dat$last.author.year.first.pub.ago[is.na(dat.nums$last.author.year.first.pub)] = tr(0)

    dat$last.author.num.prev.microarray.creations = tr(dat.nums$last.author.num.prev.microarray.creations)
    dat$last.author.num.prev.oa          = tr(dat.nums$last.author.num.prev.oa)
    dat$last.author.num.prev.other.sharing   = tr(
                                                    dat.nums$last.author.num.prev.genbank.sharing +
                                                    dat.nums$last.author.num.prev.pdb.sharing +
                                                    dat.nums$last.author.num.prev.swissprot.sharing )
    dat$last.author.num.prev.geoae.sharing = tr(dat.nums$last.author.num.prev.geoae.sharing)
    dat$last.author.num.prev.geo.reuse   = tr(dat.nums$last.author.num.prev.geo.reuse)
    dat$last.author.num.prev.multi.center = tr(dat.nums$last.author.num.prev.multi.center)
    # skip last.author.prev.meta.analysis because not enough > 0

    # set to NA if the number of publications is unrealistically big, because this probably means that
    # Author-ity combined two or more authors together.  Better to set to NA and deal with bias than introduce
    # such large numbers into our data
    last.author.thresh.cluster.size = quantile(dat$last.author.num.prev.pubs, .98, na.rm=TRUE)
    last.author.unrealistic.pub.clusters = which(dat$last.author.num.prev.pubs >= last.author.thresh.cluster.size)
    dat$last.author.num.prev.pubs[last.author.unrealistic.pub.clusters] = NA
    dat$last.author.num.prev.pmc.cites[last.author.unrealistic.pub.clusters] = NA
    dat$last.author.year.first.pub.ago[last.author.unrealistic.pub.clusters] = NA
    dat$last.author.num.prev.microarray.creations[last.author.unrealistic.pub.clusters] = NA
    dat$last.author.num.prev.oa[last.author.unrealistic.pub.clusters] = NA
    dat$last.author.num.prev.other.sharing [last.author.unrealistic.pub.clusters] = NA
    dat$last.author.num.prev.geoae.sharing[last.author.unrealistic.pub.clusters] = NA
    dat$last.author.num.prev.geo.reuse [last.author.unrealistic.pub.clusters] = NA
    dat$last.author.num.prev.multi.center[last.author.unrealistic.pub.clusters] = NA


    ####  SOME FUNDING VARIABLES AND STUDY VARIABLES
    # skip address
    # skip institution, use instition_clean instead below
    # skip country, use country_clean instead below
    # skip grant_numbers, use lots of other related variables below
    dat$num.grant.numbers = tr(dat.nums$num.grant.numbers)
    dat$num.nih.is.nci = tr(dat.nums$nih.is.nci)
    dat$num.nih.is.nhlbi = tr(dat.nums$nih.is.nhlbi)
    dat$num.nih.is.ncrr = tr(dat.nums$nih.is.ncrr)
    dat$num.nih.is.niehs = tr(dat.nums$nih.is.niehs)
    dat$num.nih.is.ninds = tr(dat.nums$nih.is.ninds)
    dat$num.nih.is.niddk = tr(dat.nums$nih.is.niddk)
    dat$num.nih.is.nigms = tr(dat.nums$nih.is.nigms)
    dat$num.nih.is.niaid = tr(dat.nums$nih.is.niaid)

    # skip is_review because all 0s, as it should be
    # need to use pubmed.medline.status to figure out what should really be NA because of incomplete mesh terms
    dat$pubmed.is.humans    = ordered(medline.values(dat.nums$pubmed.is.humans, dat.raw$pubmed.medline.status))
    dat$pubmed.is.animals   = ordered(medline.values(dat.nums$pubmed.is.animals, dat.raw$pubmed.medline.status))
    dat$pubmed.is.mice      = ordered(medline.values(dat.nums$pubmed.is.mice, dat.raw$pubmed.medline.status))
    dat$pubmed.is.fungi     = ordered(medline.values(dat.nums$pubmed.is.fungi, dat.raw$pubmed.medline.status))
    dat$pubmed.is.bacteria  = ordered(medline.values(dat.nums$pubmed.is.bacteria, dat.raw$pubmed.medline.status))
    dat$pubmed.is.plants    = ordered(medline.values(dat.nums$pubmed.is.plants, dat.raw$pubmed.medline.status))
    dat$pubmed.is.viruses   = ordered(medline.values(dat.nums$pubmed.is.viruses, dat.raw$pubmed.medline.status))
    dat$pubmed.is.cultured.cells    = ordered(medline.values(dat.nums$pubmed.is.cultured.cells, dat.raw$pubmed.medline.status))
    # is cancer appears to depend not entirely on MEDLINE completeness
    dat$pubmed.is.cancer        = ordered(dat.nums$pubmed.is.cancer)
    # Open access filter, and some others below, are by journal not article, so doesn't depend on MEDLINE completion status
    dat$pubmed.is.open.access   = ordered(dat.nums$pubmed.is.open.access)
    dat$pubmed.is.effectiveness = ordered(medline.values(dat.nums$pubmed.is.effectiveness, dat.raw$pubmed.medline.status))
    dat$pubmed.is.diagnosis     = ordered(medline.values(dat.nums$pubmed.is.diagnosis, dat.raw$pubmed.medline.status))
    dat$pubmed.is.prognosis     = ordered(medline.values(dat.nums$pubmed.is.prognosis, dat.raw$pubmed.medline.status))
    dat$pubmed.is.core.clinical.journal = ordered(dat.nums$pubmed.is.core.clinical.journal)
    # skip is_clinical_trial because not enough
    # skip is_randomized_clinical_trial because not enough
    # skip is_meta_analysis because none (yay!)
    dat$pubmed.is.comparative.study = ordered(medline.values(dat.nums$pubmed.is.comparative.study, dat.raw$pubmed.medline.status))
    # skip multicenter because not enough (surprising?)
    # skip validation because not enough (surprising?)
    # skip funded_stimulus because all 0s
    # external funding seems set sometimes even for 1s, so doesn't depend on MEDLINE completely
    # commenting out external in favour of a combined nih below
    #dat$pubmed.is.funded.nih.extramural   = ordered(dat.nums$pubmed.is.funded.nih.extramural)
    dat$pubmed.is.funded.nih.intramural   = ordered(dat.nums$pubmed.is.funded.nih.intramural)
    # Combining extramural, intramural, and whether the PMID had a link to funding in the NIH data sources
    dat$pubmed.is.funded.nih            = ordered(pmax(dat.nums$pubmed.is.funded.nih.extramural, dat.nums$pubmed.is.funded.nih.intramural, !is.na(dat.nums$num.grants)))
    dat$pubmed.is.funded.non.us.govt    = ordered(dat.nums$pubmed.is.funded.non.us.govt)
    # Sharing in PDB and Swissprot very small, so will combine
    dat$pubmed.is.shared.other          = ordered(pmax(dat.nums$pubmed.in.genbank, dat.nums$pubmed.in.pdb, dat.nums$pubmed.in.swissprot))
    # geo.reuse is tiny but I want to use it anyway!
    dat$pubmed.is.geo.reuse             = ordered(dat.nums$is.geo.reuse)
    dat$pubmed.num.cites.from.pmc    = tr(dat.nums$pubmed.number.times.cited.in.pmc)
    dat$pubmed.num.cites.from.pmc.per.year = dat.nums$pubmed.number.times.cited.in.pmc/(2010 - dat.nums$pubmed.year.published)
    # Comment these out for now, because they look wrong
    #dat$found.by.highwire               = ordered(dat.nums$found.by.highwire)
    #dat$found.by.scirus                 = ordered(dat.nums$found.by.highwire)
    #dat$found.by.googlescholar          = ordered(dat.nums$found.by.highwire)
    #dat$found.by.pmc                    = ordered(dat.nums$portal.pmids.found.by.pmc)

    ##### DEPENDENT VARIABLES
    dat$dataset.in.geo                  = ordered(dat.nums$pubmed.in.geo)
    # commenting this out in favour of a combined metric below
    #dat$dataset.in.arrayexpress         = ordered(dat.nums$in.arrayexpress)
    dat$dataset.in.geo.or.ae            = ordered(dat.nums$in.ae.or.geo)

    ###### INSTITUTION
    dat$country.australia              = ordered(ifelse("Australia" == dat.raw$country.clean, 1, 0))
    dat$country.canada                 = ordered(ifelse("Canada" == dat.raw$country.clean, 1, 0))
    dat$country.china                  = ordered(ifelse("China" == dat.raw$country.clean, 1, 0))
    dat$country.france                 = ordered(ifelse("France" == dat.raw$country.clean, 1, 0))
    dat$country.germany                = ordered(ifelse("Germany" == dat.raw$country.clean, 1, 0))
    dat$country.japan                  = ordered(ifelse("Japan" == dat.raw$country.clean, 1, 0))
    dat$country.korea                  = ordered(ifelse("Korea" == dat.raw$country.clean, 1, 0))
    dat$country.uk                     = ordered(ifelse("UK" == dat.raw$country.clean, 1, 0))
    dat$country.usa                    = ordered(ifelse("USA" == dat.raw$country.clean, 1, 0))
    dat$institution.is.medical         = ordered(dat.nums$institution.hospital.or.medlcal)
    dat$institution.rank               = dat.nums$institution.rank
    dat$institution.is.higher.ed       = ordered(ifelse("Higher educ." == dat.raw$institution.sector, 1, 0))
    dat$institution.is.higher.ed[which(is.na(dat$institution.rank))] = NA
    dat$institution.is.govnt           = ordered(ifelse("Government" == dat.raw$institution.sector, 1, 0))
    dat$institution.is.govnt[which(is.na(dat$institution.rank))] = NA
    dat$institution.research.output = tr(dat.nums$institution.output)
    dat$institution.international.collaboration = dat.nums$international.collaboration
    dat$institution.mean.norm.impact.factor     = dat.nums$institution.norm.sjr
    dat$institution.mean.norm.citation.score    = dat.nums$institution.norm.citation.score

    # Include variables for the three most common institutions
    #top.institutions = names(sort(table(dat.institutions$institutions.factor), decreasing=TRUE)[0:25])
    #display.institutions = c(top.institutions)
    dat$institution.harvard            = ordered(ifelse("Harvard University" == dat.raw$institution.clean, 1, 0))
    dat$institution.nci                = ordered(ifelse("National Cancer Institute" == dat.raw$institution.clean, 1, 0))
    dat$institution.stanford           = ordered(ifelse("Stanford University" == dat.raw$institution.clean, 1, 0))

    ### JOURNAL VARIABLES
    # skip total cites 2008 because it could be due to so many things, not very informative
    #dat$journal.cites.2008       = tr(dat.nums$journal.2008.cites)
    dat$journal.impact.factor   = log.tr(dat.nums$journal.impact.factor)

    # PLoS One is a big part of our sample, and doesn't have an official impact factor yet.
    # Important, because it is relatively low, so better to give it our estimate instead of leave missing
    # Estimate of 3 comes from http://pbeltrao.blogspot.com/2009/04/guestimating-plos-one-impact-factor.html
    dat$journal.impact.factor[which(dat.raw$pubmed.journal== "PLoS One")] = log.tr(3)

    dat$journal.5yr.impact.factor = log.tr(dat.nums$journal.5yr.impact.factor)
    dat$journal.immediacy.index = log.tr(dat.nums$journal.immediacy.index)
    dat$journal.num.articles.2008 = tr(dat.nums$journal.num.articles.2008)
    dat$journal.cited.halflife      = dat.nums$journal.cited.halflife


    # skip journal_policy_try because it is redundant
    dat$journal.policy.requires.microarray.accession = ordered(ifelse(dat.nums$journal.policy.requires.microarray.accession > 0, 1, 0))
    # says must deposit should be true whenever requires microarray accession is tru
    dat$journal.policy.says.must.deposit    = ordered(ifelse(pmax(dat.nums$journal.policy.says.must.deposit, dat.nums$journal.policy.requires.microarray.accession) > 0, 1, 0))
    dat$journal.policy.at.least.requests.sharing.microarray = ordered(ifelse(dat.nums$journal.policy.at.least.requests.sharing.microarray > 0, 1, 0))
    dat$journal.policy.mentions.any.sharing = ordered(ifelse(dat.nums$journal.policy.mentions.any.sharing > 0, 1, 0))
    dat$journal.policy.general.statement    = ordered(ifelse(dat.nums$journal.policy.general.statement > 0, 1, 0))
    # requests accession should be true whenenever requires accession is true, but looks like I didn't code it that way
    dat$journal.policy.requests.accession   = ordered(ifelse(pmax(dat.nums$journal.policy.requests.accession, dat.nums$journal.policy.requires.microarray.accession) > 0, 1, 0))
    dat$journal.policy.mentions.exceptions  = ordered(ifelse(dat.nums$journal.policy.mentions.exceptions > 0, 1, 0))
    dat$journal.policy.mentions.consequences    = ordered(ifelse(dat.nums$journal.policy.mentions.consequences > 0, 1, 0))
    dat$journal.policy.contains.word.microarray = ordered(ifelse(dat.nums$journal.policy.contains.word.microarray > 0, 1, 0))
    dat$journal.policy.contains.word.miame.mged = ordered(ifelse(dat.nums$journal.policy.contains.word.MIAME.MGED > 0, 1, 0))
    dat$journal.policy.contains.word.arrayexpress   = ordered(ifelse(dat.nums$journal.policy.contains.word.arrayexpress > 0, 1, 0))
    dat$journal.policy.contains.word.geo.omnibus    = ordered(ifelse(dat.nums$journal.policy.contains.word.geo.omnibus > 0, 1, 0))
    dat$journal.policy.requests.sharing.other.data  = ordered(ifelse(dat.nums$journal.policy.requests.sharing.other.data > 0, 1, 0))
    # For some reason this variable didn't have NAs, but it should have the same NAs as the other journal policy vars
    dat$journal.policy.requests.sharing.other.data[is.na(dat$journal.policy.requests.accession)] = NA

    # Add a variable saying how many microarray creating (as defined by being in this set)
    # papers the journal has published
    table.journal.counts = table(dat.raw$pubmed.journal)
    dat$journal.microarray.creating.count = tr(as.numeric(table.journal.counts[dat.raw$pubmed.journal]))

    #### MORE FUNDING VARIABLES
    # set the missing values to 0 or tr(0), as appropriate

    # skip pmid:1 as duplicate column
    dat$num.grants.via.nih   = tr(dat.nums$num.grants)
    dat$num.grants.via.nih[is.na(dat$num.grants.via.nih)] = tr(0)
    dat$max.grant.duration   = tr(dat.nums$longest.num.years)
    dat$max.grant.duration[is.na(dat$max.grant.duration)] = tr(0)

    dat$nih.first.year.ago   = tr(2010 - dat.nums$first.year)
    dat$nih.first.year.ago[is.na(dat$nih.first.year.ago)] = NA
    dat$nih.last.year.ago    = tr(2010 - dat.nums$last.year)
    dat$nih.last.year.ago[is.na(dat$nih.last.year.ago)] = NA
    dat$nih.cumulative.years = tr(dat.nums$cumulative.years)
    dat$nih.cumulative.years[is.na(dat$nih.cumulative.years)] = tr(0)

    dat$nih.sum.avg.dollars  = tr(dat.nums$sum.avg.dollars)
    dat$nih.sum.avg.dollars[is.na(dat$nih.sum.avg.dollars)] = tr(0)
    dat$nih.sum.sum.dollars  = tr(dat.nums$sum.sum.dollars)
    dat$nih.sum.sum.dollars[is.na(dat$nih.sum.sum.dollars)] = tr(0)
    dat$nih.max.max.dollars  = tr(dat.nums$max.max.dollars)
    dat$nih.max.max.dollars[is.na(dat$nih.max.max.dollars)] = tr(0)


    # skip
    # skip grant_activity_codes list, because captured below
    dat$has.R01.funding = ordered(dat.nums$has.R01.funding)
    dat$has.R01.funding[is.na(dat$has.R01.funding)] = 0
    #dat$has.T32.funding = ordered(dat.nums$has.T32.funding)
    #dat$has.T32.funding[is.na(dat$has.T32.funding)] = 0
    # other funding types dropped because don't occurr often enough

    # look for patterns here
    dat$has.P.funding = ordered(ifelse(grepl("P", dat.raw$grant.activity.codes), 1, 0))
    dat$has.R.funding = ordered(ifelse(grepl("R", dat.raw$grant.activity.codes), 1, 0))
    dat$has.T.funding = ordered(ifelse(grepl("T", dat.raw$grant.activity.codes), 1, 0))
    dat$has.U.funding = ordered(ifelse(grepl("U", dat.raw$grant.activity.codes), 1, 0))
    dat$has.K.funding = ordered(ifelse(grepl("K", dat.raw$grant.activity.codes), 1, 0))

    # see below this block for handling of their NAs
    # only pick one funding level, otherwise too singular
    # not enough post2007
    dat$num.post2003.morethan500k = tr(dat.nums$num.post2003.morethan500k)
    dat$num.post2004.morethan500k = tr(dat.nums$num.post2004.morethan500k)
    dat$num.post2005.morethan500k = tr(dat.nums$num.post2005.morethan500k)
    dat$num.post2006.morethan500k = tr(dat.nums$num.post2006.morethan500k)
    #dat$num.post2007.morethan500k = tr(dat.nums$num.post2007.morethan500k)
    dat$num.post2003.morethan750k = tr(dat.nums$num.post2003.morethan750k)
    dat$num.post2004.morethan750k = tr(dat.nums$num.post2004.morethan750k)
    dat$num.post2005.morethan750k = tr(dat.nums$num.post2005.morethan750k)
    dat$num.post2006.morethan750k = tr(dat.nums$num.post2006.morethan750k)
    #dat$num.post2007.morethan750k = tr(dat.nums$num.post2007.morethan750k)
    dat$num.post2003.morethan1000k = tr(dat.nums$num.post2003.morethan1000k)
    dat$num.post2004.morethan1000k = tr(dat.nums$num.post2004.morethan1000k)
    dat$num.post2005.morethan1000k = tr(dat.nums$num.post2005.morethan1000k)
    dat$num.post2006.morethan1000k = tr(dat.nums$num.post2006.morethan1000k)
    #dat$num.post2007.morethan1000k = tr(dat.nums$num.post2007.morethan1000k)

    dat$num.post2003.morethan500k[is.na(dat$num.post2003.morethan500k)] = tr(0)
    dat$num.post2004.morethan500k[is.na(dat$num.post2004.morethan500k)] = tr(0)
    dat$num.post2005.morethan500k[is.na(dat$num.post2005.morethan500k)] = tr(0)
    dat$num.post2006.morethan500k[is.na(dat$num.post2006.morethan500k)] = tr(0)
    #dat$num.post2007.morethan500k[is.na(dat$num.post2007.morethan500k)] = tr(0)
    dat$num.post2003.morethan750k[is.na(dat$num.post2003.morethan750k)] = tr(0)
    dat$num.post2004.morethan750k[is.na(dat$num.post2004.morethan750k)] = tr(0)
    dat$num.post2005.morethan750k[is.na(dat$num.post2005.morethan750k)] = tr(0)
    dat$num.post2006.morethan750k[is.na(dat$num.post2006.morethan750k)] = tr(0)
    #dat$num.post2007.morethan750k[is.na(dat$num.post2007.morethan750k)] = tr(0)
    dat$num.post2003.morethan1000k[is.na(dat$num.post2003.morethan1000k)] = tr(0)
    dat$num.post2004.morethan1000k[is.na(dat$num.post2004.morethan1000k)] = tr(0)
    dat$num.post2005.morethan1000k[is.na(dat$num.post2005.morethan1000k)] = tr(0)
    dat$num.post2006.morethan1000k[is.na(dat$num.post2006.morethan1000k)] = tr(0)
    #dat$num.post2007.morethan1000k[is.na(dat$num.post2007.morethan1000k)] = tr(0)

    dat$dataset.in.geo.or.ae.int = dat.nums$in.ae.or.geo

    #dim(dat)

    #save(dat, file="dat.Rdata")
    return(dat)
}


explore.cleaned.data = function
(
    dat
)
{
    library(Hmisc)
    library(psych)
    # both Hmisc and psych have a describe
    Hmisc::describe(dat)
    #psych::describe(dat)

    s = summary(dataset.in.geo.or.ae.int ~ ., dat)
    s
}

display.attributes = c(
    "num.authors"
    ,"first.author.female"
    ,"first.author.gender.not.found"
    ,"first.author.num.prev.pubs"
    ,"first.author.num.prev.pmc.cites"
    ,"first.author.year.first.pub.ago"
    ,"first.author.num.prev.microarray.creations"
    ,"first.author.num.prev.oa"
    ,"first.author.num.prev.other.sharing"
    ,"first.author.num.prev.geo.reuse"
    ,"first.author.num.prev.multi.center"
    ,"last.author.female"
    ,"last.author.gender.not.found"
    ,"last.author.num.prev.pubs"
    ,"last.author.num.prev.pmc.cites"
    ,"last.author.year.first.pub.ago"
    ,"last.author.num.prev.microarray.creations"
    ,"last.author.num.prev.oa"
    ,"last.author.num.prev.other.sharing"
    ,"last.author.num.prev.geo.reuse"
    ,"last.author.num.prev.multi.center"
    ,"num.grant.numbers"
    ,"num.nih.is.nci"
    ,"num.nih.is.nhlbi"
    ,"num.nih.is.ncrr"
    ,"num.nih.is.niehs"
    ,"num.nih.is.ninds"
    ,"num.nih.is.niddk"
    ,"num.nih.is.nigms"
    ,"num.nih.is.niaid"
    ,"pubmed.is.humans"
    ,"pubmed.is.animals"
    ,"pubmed.is.mice"
    ,"pubmed.is.fungi"
    ,"pubmed.is.bacteria"
    ,"pubmed.is.plants"
    ,"pubmed.is.viruses"
    ,"pubmed.is.cultured.cells"
    ,"pubmed.is.cancer"
    ,"pubmed.is.open.access"
    ,"pubmed.is.effectiveness"
    ,"pubmed.is.diagnosis"
    ,"pubmed.is.prognosis"
    ,"pubmed.is.core.clinical.journal"
    ,"pubmed.is.comparative.study"
    ,"pubmed.is.funded.nih.intramural"
    ,"pubmed.is.funded.nih"
    ,"pubmed.is.funded.non.us.govt"
    ,"pubmed.is.shared.other"
    ,"pubmed.num.cites.from.pmc"
    ,"pubmed.num.cites.from.pmc.per.year"
    ,"dataset.in.geo"
    ,"dataset.in.geo.or.ae"
    ,"country.australia"
    ,"country.canada"
    ,"country.china"
    ,"country.france"
    ,"country.germany"
    ,"country.japan"
    ,"country.korea"
    ,"country.uk"
    ,"country.usa"
    ,"institution.is.medical"
    ,"institution.rank"
    ,"institution.is.higher.ed"
    ,"institution.is.govnt"
    ,"institution.research.output"
    ,"institution.international.collaboration"
    ,"institution.mean.norm.impact.factor"
    ,"institution.mean.norm.citation.score"
    ,"journal.impact.factor"
    ,"journal.num.articles.2008"
    ,"journal.policy.requires.microarray.accession"
    ,"journal.policy.general.statement"
    ,"journal.policy.requests.accession"
    ,"journal.policy.mentions.exceptions"
    ,"journal.policy.mentions.consequences"
    ,"journal.policy.contains.word.miame.mged"
    ,"journal.policy.contains.word.arrayexpress"
    ,"journal.policy.contains.word.geo.omnibus"
    ,"journal.policy.requests.sharing.other.data"
    ,"journal.microarray.creating.count"
    ,"num.grants.via.nih"
    ,"max.grant.duration"
    ,"nih.first.year.ago"
    ,"nih.last.year.ago"
    ,"nih.cumulative.years"
    ,"nih.sum.avg.dollars"
    ,"nih.sum.sum.dollars"
    ,"nih.max.max.dollars"
    ,"has.R01.funding"
    ,"has.P.funding"
    ,"has.R.funding"
    ,"has.T.funding"
    ,"has.U.funding"
    ,"has.K.funding"
    ,"num.post2004.morethan1000k"
    ,"dataset.in.geo.or.ae.int"
    )

make.figure.S1toS5 = function
(
    dat.untransformed,
    screen=SCREEN,
    format=FORMAT
)
{
    label(dat.untransformed$first.author.num.prev.microarray.creations) <- "first.author.num.prev.micro.produce"
    label(dat.untransformed$last.author.num.prev.microarray.creations) <- "last.author.num.prev.micro.produce"
    label(dat.untransformed$journal.policy.at.least.requests.sharing.microarray) <- "journal.policy.requests.or.requires.micro.data"
    label(dat.untransformed$journal.policy.contains.word.geo.omnibus) <- "journal.policy.contains.word.geo.omnibus"

    # to see formatting, try ?dotchart2 or ?plot.summary.formula.response
    mynames = names(dat.untransformed)
    sections = 5
    section.length = ceil(length(mynames) / sections)
    #for (ii in 0:0) {
    for (ii in 0:(sections-1)) {
        ## Check to make sure this is an even divide do don't miss any!
        #first.count = ii*section.length
        #last.count = min(length(mynames), ((ii+1)*section.length)-1)
        #mynames.section = mynames[first.count:last.count]

        # group them by topic instead
        variable.groups = c("^num.authors|^first|^last",
            "journal",
            "pubmed",
            "nih|grant|num.post|funding",
            "country|institution")
        mynames.section = mynames[grep(variable.groups[ii+1], mynames)]
        mynames.section = intersect(mynames.section, display.attributes)

        if ("dataset.in.geo.or.ae.int" %nin% mynames.section) {
            mynames.section = c(mynames.section, "dataset.in.geo.or.ae.int")
        }
        print(ii)
        print(mynames.section)
        s = summary(dataset.in.geo.or.ae.int ~ ., dat.untransformed[,names(dat.untransformed) %in% mynames.section], continuous = 3)
        #s
        #quartz()
        filename = paste("figureS", ii+1, sep="")
        print(filename)
        setup.figure(filename, MAX.FIGURE.WIDTH, MAX.FIGURE.HEIGHT, screen=screen, format=format)
        par(mai=c(1,5,0.2,1)) # bottom, left, top, right
        response = try(
                plot.summary.formula.response.CIs(s, width.factor=2, cex.labels=0.4, cex=0.9, xlim=c(0,1), xlab="Proportion of studies with datasets\nfound in GEO or ArrayExpress", main="")
                , TRUE)
        if (class(response)=="try-error") {
            plot(s, width.factor=2, cex.labels=0.4, cex=0.9, xlim=c(0,1), xlab="Proportion of studies with datasets\nfound in GEO or ArrayExpress", main="")
        }
        close.figure(filename, screen=screen, format=format)
    }
}

get.dat.nums = function
( dat.raw )
{
    library(plyr)

    dat.nums = colwise(as.numeric)(dat.raw)  ##<<details this produces warnings
    return(dat.nums)
}


make.figure.journals = function
(
    dat,
    n.journals = 25,
    screen=SCREEN,
    format=FORMAT
)
{
    dat.journals = dat
    dat.journals$dataset.in.geo.or.ae.int = as.numeric(dat.journals$in.ae.or.geo)

    # Consolidate the less frequent journals into Other
    top.journals = names(sort(table(dat.journals$pubmed.journal), decreasing=TRUE)[0:n.journals])
    dat.journals$journal.factor = factor(dat.journals$pubmed.journal)
    levels(dat.journals$journal.factor)[levels(dat.journals$journal.factor) %nin% top.journals] <- "OTHER"

    s = summary(dataset.in.geo.or.ae.int ~ journal.factor, dat.journals)
    s

    setup.figure("journals", MAX.FIGURE.WIDTH, MAX.FIGURE.WIDTH, screen=screen, format=format)
    par(mai=c(1,5,0.2,1)) # bottom, left, top, right
    plot.summary.formula.response.CIs(s, width.factor=2, cex.labels=0.5, cex=0.9, xlim=c(0,1), xlab="Proportion of studies with datasets\nfound in GEO or ArrayExpress", main="")
    close.figure("journals", screen=screen, format=format)

    ps.chisq = c()
    for (var in top.journals) {
        pv1 <- chisq.test(table(dat.journals$dataset.in.geo.or.ae, dat.journals$journal.factor==var))$p.value
        ps.chisq = cbind(ps.chisq, pv1)
    }
    colnames(ps.chisq) = top.journals

    adjusted.ps.chisq = p.adjust(ps.chisq, method = c("none"))
    colnames(adjusted.ps.chisq) = colnames(ps.chisq)
    print(t(rbind(colnames(adjusted.ps.chisq)[order(adjusted.ps.chisq)], adjusted.ps.chisq[order(adjusted.ps.chisq)])))

}


###  RUN IT ALL!

    setwd("~/Documents/Projects/ASIST2011/paper")

    ## The source file below can be found at https://gist.github.com/999036
    source("plot.summary.formula.response.CIs.R")

    ## Dataset was downloaded from http://dx.doi.org/10.5061/dryad.mf1sd/1.
    ## Please cite use of this data in the references section of publications as
    ## Piwowar HA (2011) Data from: Who shares? Who doesn’t? Factors associated with openly archiving raw research data. Dryad Digital Repository. doi:10.5061/dryad.mf1sd
    raw.data.filename = "rawdata.txt"
    dat.raw.all = read.raw.data(raw.data.filename)

    ##Keep just the data that has journal policy information
    dat.raw.policycomplete = dat.raw.all
    dat.raw.policycomplete = dat.raw.policycomplete[!is.na(dat.raw.policycomplete$journal.policy.says.must.deposit),]
    dat.raw.policycomplete = dat.raw.policycomplete[(dat.raw.policycomplete$pubmed.year.published<2010),]

    ##Keep just the studies published in journals that require deposition
    dat.raw.policymustdeposit = dat.raw.policycomplete[(dat.raw.policycomplete$journal.policy.says.must.deposit>0),]

    ##Keep just the studies where neither the first nor last author have shared before (by my estimates)
    dat.raw.policymustdeposit.nevershared = dat.raw.policymustdeposit[intersect(
        which(dat.raw.policymustdeposit$first.author.num.prev.geoae.sharing==0),
        which(dat.raw.policymustdeposit$last.author.num.prev.geoae.sharing==0)),]

    ##Keep just the recent studies, published from 2007 through 2009
    dat.raw.policymustdeposit.nevershared.recent = subset(dat.raw.policymustdeposit.nevershared, subset=pubmed.year.published>=2007)
    dat.raw = dat.raw.policymustdeposit.nevershared.recent

    ##Process the data
    dat.clean = preprocess.raw.data(dat.raw)
    dat.clean = subset(dat.clean, select = -first.author.num.prev.geoae.sharing)
    dat.clean = subset(dat.clean, select = -last.author.num.prev.geoae.sharing)

    ##Display the results
    options(digits=4)
    options(width=100)
    explore.cleaned.data(dat.clean)
    make.figure.S1toS5(dat.clean, screen=TRUE, format=FORMAT)
    make.figure.journals(dat.raw, 25, screen=TRUE, format=FORMAT)

    #### Look at which categorical variables have significant differences
    dat = dat.clean
    ps.chisq = c()
    for (var in names(dat)) {
        pv1 <- chisq.test(table(dat[,"dataset.in.geo.or.ae.int"], dat[,var]))$p.value
        ps.chisq = cbind(ps.chisq, pv1)
    }
    colnames(ps.chisq) = names(dat)
    # Review the type of p-value multiple comparison adjustment used below before interpreting!
    adjusted.ps.chisq = p.adjust(ps.chisq, method = c("none"))
    colnames(adjusted.ps.chisq) = colnames(ps.chisq)
    varnames = colnames(adjusted.ps.chisq)
    ordered.results = order(adjusted.ps.chisq)
    print("chi squared p-values, so only look at the results for category variables (review multiple comparison adjustment used below before interpreting!):")
    print(t(rbind(varnames[ordered.results], adjusted.ps.chisq[ordered.results])))

    #### Look at which continuous variables have significant differences
    ps.wilcox = c()
    for (var in names(dat)) {
        pv1 = NA
        try(pv1 <- wilcox.test(dat[,var] ~ dat[,"dataset.in.geo.or.ae.int"])$p.value, TRUE)
        ps.wilcox = cbind(ps.wilcox, pv1)
    }
    colnames(ps.wilcox) = names(dat)
    # Review the type of p-value multiple comparison adjustment used below before interpreting!
    adjusted.ps.wilcox = p.adjust(ps.wilcox, method = c("none"))
    colnames(adjusted.ps.wilcox) = colnames(ps.wilcox)
    varnames = colnames(adjusted.ps.wilcox)
    ordered.results = order(adjusted.ps.wilcox)
    print("wilcox p-values, so only look at the results for continuous variables  (review multiple comparison adjustment used below before interpreting!):")
    print(t(rbind(varnames[ordered.results], adjusted.ps.wilcox[ordered.results])))