Skip to content

Instantly share code, notes, and snippets.

@wenjie1991
Last active July 17, 2018 15:06
Show Gist options
  • Save wenjie1991/549e287126a8076fc7ab56e59256010e to your computer and use it in GitHub Desktop.
Save wenjie1991/549e287126a8076fc7ab56e59256010e to your computer and use it in GitHub Desktop.
Extract gene frequencies from PubMed XML files
#################################################
# Count gene-symbol mentions in PubMed abstracts.
#
# Steps:
#   1. Read the PubMed XML export with pubmed.mineR::xmlreadabs().
#   2. Get the gene symbols/names detected by gene_atomization().
#   3. For every symbol, count the abstracts that contain it as a whole
#      word and collect the matching PMIDs (joined with "//").
#   4. Map each symbol to Entrez IDs (org.Hs.egALIAS2EG) and look those up
#      in org.Hs.egMAP to fill the Location column.
#   5. Write the table, sorted by descending count, as tab-separated text.
#################################################
# Input:
input <- "./pubmed_result.xml"
#################################################
# Output:
# NOTE: the file is tab-separated text even though it is named ".xls".
output <- "./abstract.xls"

# install.packages("pubmed.mineR")
library(pubmed.mineR)

# Input
dat0 <- xmlreadabs(input)
dat1 <- gene_atomization(dat0)

# Column 1 is used as the gene symbol and column 2 as the gene name.
symbol <- dat1[, 1]
name <- dat1[, 2]

# Wrap every symbol in word boundaries so that only whole-word
# occurrences are counted.
symbol_pattern <- paste0("\\b", symbol, "\\b")

# Preallocate the result vectors instead of growing them inside the loop.
n_symbol <- length(symbol_pattern)
Number <- integer(n_symbol)
PMID <- character(n_symbol)

for (i in seq_along(symbol_pattern)) {
  hit <- grepl(symbol_pattern[i], dat0@Abstract)
  n_hit <- sum(hit)
  Number[i] <- n_hit
  if (n_hit > 0) {
    PMID[i] <- paste(dat0@PMID[hit], collapse = "//")
    # Progress output for symbols that were found at least once.
    print(n_hit)
    print(symbol[i])
  }
  # Symbols without any hit keep the preallocated "" in PMID.
}

# Install the annotation package only when it is missing, instead of
# downloading and reinstalling it on every run.
# NOTE(review): biocLite() may no longer be available on recent
# Bioconductor releases; BiocManager::install("org.Hs.eg.db") is presumably
# the replacement there -- confirm for the R version in use.
if (!requireNamespace("org.Hs.eg.db", quietly = TRUE)) {
  source("https://bioconductor.org/biocLite.R")
  biocLite("org.Hs.eg.db")
}
library(org.Hs.eg.db)

# ifnotfound = NA keeps the script from aborting on the first symbol that
# is missing from the alias map; such symbols get an NA entry instead.
entrez <- if (length(symbol) > 0) {
  mget(symbol, org.Hs.egALIAS2EG, ifnotfound = NA)
} else {
  list()
}

# One Location string per symbol: all map entries of all Entrez IDs that
# the symbol resolves to, joined with "//". Symbols without an Entrez ID
# get NA.
chrom <- vapply(
  entrez,
  function(ids) {
    ids <- ids[!is.na(ids)]
    if (length(ids) == 0) {
      return(NA_character_)
    }
    paste(unlist(mget(ids, org.Hs.egMAP, ifnotfound = NA)), collapse = "//")
  },
  character(1),
  USE.NAMES = FALSE
)

dat2 <- data.frame(Symbol = symbol, Name = name, Number, Location = chrom, PMID)

# Number of symbols that were not found in any abstract.
sum(dat2$PMID == "")

write.table(dat2[order(dat2$Number, decreasing = TRUE), ], output, sep = "\t")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment