Skip to content

Instantly share code, notes, and snippets.

@Zoldin
Last active July 21, 2017 20:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Zoldin/47536af63182a0e8daf37a7b989e2e8d to your computer and use it in GitHub Desktop.
Save Zoldin/47536af63182a0e8daf37a7b989e2e8d to your computer and use it in GitHub Desktop.
parsingxml.R
#!/usr/bin/Rscript
library(XML)
# Convert an XML file made of <row .../> elements (one XML fragment per line)
# into a CSV file with the columns ID, label and text.
#
# Command-line arguments (trailing arguments only):
#   1. path of the input file; every non-blank line is parsed as its own
#      XML document
#   2. path of the CSV file to write
#
# Output columns:
#   ID    - numeric value of the row's "Id" attribute
#   label - 1 when the row's "Tags" attribute matches "python", otherwise 0
#   text  - the "Title" and "Body" attributes joined by a single space
args <- commandArgs(trailingOnly = TRUE)
if (length(args) != 2) {
  stop(
    "Two arguments must be supplied (input file name ,output file name - csv ext).",
    call. = FALSE
  )
}
input_path <- args[1]
output_path <- args[2]

# Read the input line by line. Handing readLines() the path (instead of a
# connection opened by hand) lets it open and close the file itself, so no
# file handle is left dangling.
lines <- readLines(input_path, -1)

# Blank lines cannot be parsed as XML; drop them rather than abort the run.
lines <- lines[nzchar(trimws(lines))]

# asText = TRUE states explicitly that each element is XML markup, so a line
# is never mistaken for a file name.
docs <- lapply(lines, function(x) {
  xmlTreeParse(x, asText = TRUE, useInternalNodes = TRUE)
})

# Collect one attribute from every <row> node of every parsed line as a flat
# character vector. A missing attribute becomes "" so that all attributes
# yield exactly one value per row and the resulting vectors stay aligned.
row_attribute <- function(docs, name) {
  values <- lapply(docs, function(doc) {
    xpathSApply(doc, "//row", function(node) {
      xmlGetAttr(node, name, default = "")
    })
  })
  as.character(unlist(values))
}

# Parse the XML to get the variables.
ID <- as.numeric(row_attribute(docs, "Id"))
Tags <- row_attribute(docs, "Tags")
Title <- row_attribute(docs, "Title")
Body <- row_attribute(docs, "Body")

text <- paste(Title, Body)
# "python" is matched as a regular expression anywhere inside the tag string.
label <- as.numeric(grepl("python", Tags))

# Final data frame for export; built directly so every column keeps its type.
df <- data.frame(ID = ID, label = label, text = text, stringsAsFactors = FALSE)

# Write to csv.
write.csv(df, file = output_path, row.names = FALSE)
print("output file created....")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment