Skip to content

Instantly share code, notes, and snippets.

@aojea
Created January 22, 2015 15:26
Show Gist options
  • Save aojea/cae32beca54d8440ec46 to your computer and use it in GitHub Desktop.
Save aojea/cae32beca54d8440ec46 to your computer and use it in GitHub Desktop.
Scrape for malware collection
library(XML)
library(RMySQL)
library(jsonlite)
# Look up the country that the RIPEstat "geoloc" data call reports for a
# resource.
#
# x: a single resource (e.g. an IP address); coerced to character.
# Returns the first `country` entry found under `data$locations` in the
# response, or NA_character_ when the response carries none. The result is
# always of length one, so it can be collected into a data-frame column.
geoloc <- function(x) {
  # Percent-encode the resource so that caller-supplied text cannot alter the
  # query string. as.character() also covers factor input.
  resource <- utils::URLencode(as.character(x), reserved = TRUE)
  reply <- fromJSON(
    paste0("https://stat.ripe.net/data/geoloc/data.json?resource=", resource)
  )
  country <- reply$data$locations$country
  if (length(country) == 0L) {
    return(NA_character_)
  }
  as.character(country[[1L]])
}
# Look up the first ASN that the RIPEstat "network-info" data call reports
# for a resource.
#
# x: a single resource (e.g. an IP address); coerced to character.
# Returns the first element of `data$asns` from the response, or NA when the
# response lists no ASN. The result is always of length one.
getasn <- function(x) {
  # Percent-encode the resource so that caller-supplied text cannot alter the
  # query string. as.character() also covers factor input.
  resource <- utils::URLencode(as.character(x), reserved = TRUE)
  reply <- fromJSON(
    paste0(
      "https://stat.ripe.net/data/network-info/data.json?resource=",
      resource
    )
  )
  # unlist() flattens an empty JSON array (parsed as an empty list) to NULL.
  asns <- unlist(reply$data$asns, use.names = FALSE)
  if (length(asns) == 0L) {
    return(NA)
  }
  asns[[1L]]
}
# Scrape the table on the urlquery.net front page, add the country code and
# ASN returned by geoloc() / getasn() for each IP, and append the rows to the
# "urlquery2" MySQL table.

# DB connection, change parameters as necessary
con <- dbConnect(
  MySQL(),
  user = "user", password = "pass", dbname = "db", host = "127.0.0.1"
)

# Collapse a lookup result to exactly one string: its first value, or NA when
# the lookup produced nothing. Keeps vapply() below type-stable even when a
# lookup returns NULL, an empty list, or several values.
first_or_na <- function(v) {
  v <- unlist(v, use.names = FALSE)
  if (length(v) == 0L) {
    return(NA_character_)
  }
  as.character(v[[1L]])
}

# Everything that can fail runs inside tryCatch() so the connection is
# always released, even when scraping or a lookup errors out.
tryCatch({
  # download the data and store in data frame
  url <- "http://urlquery.net/"
  html <- htmlTreeParse(url, useInternalNodes = TRUE)

  # Select the table with data
  url.stats <- getNodeSet(html, "//table[@class='test']")
  if (length(url.stats) == 0L) {
    stop("no table with class 'test' found at ", url, call. = FALSE)
  }
  table.stats <- lapply(url.stats, readHTMLTable, header = TRUE)
  df <- table.stats[[1]]
  if (is.null(df) || nrow(df) == 0L) {
    stop("the scraped table contains no rows", call. = FALSE)
  }
  if (!"IP" %in% names(df)) {
    stop("the scraped table has no 'IP' column", call. = FALSE)
  }

  # The second column packs three counters as "a - b - c"; split them into
  # three separate columns. Refuse rows that do not split into exactly three
  # parts, because matrix() would otherwise silently misalign the values.
  alarm_parts <- strsplit(as.character(df[, 2]), " - ", fixed = TRUE)
  if (any(vapply(alarm_parts, length, integer(1)) != 3L)) {
    stop("unexpected format in the alarms column", call. = FALSE)
  }
  df <- cbind(df, matrix(unlist(alarm_parts), ncol = 3, byrow = TRUE))

  # One lookup per row; each result is reduced to a single string.
  ips <- as.character(df$IP)
  df$CC <- vapply(
    ips,
    function(ip) first_or_na(geoloc(ip)),
    character(1),
    USE.NAMES = FALSE
  )
  df$ASN <- vapply(
    ips,
    function(ip) first_or_na(getasn(ip)),
    character(1),
    USE.NAMES = FALSE
  )

  # Name the columns (types are declared to the database below)
  names(df) <- c("Date", "Alarms", "URL", "IP", "UQ", "IDS", "BL", "CC", "ASN")

  # Insert data DB
  dbWriteTable(
    con,
    name = "urlquery2",
    value = df,
    field.types = list(
      Date = "timestamp", Alarms = "text", URL = "text", IP = "text",
      UQ = "int", IDS = "int", BL = "int", CC = "text", ASN = "int"
    ),
    append = TRUE,
    row.names = FALSE,
    overwrite = FALSE
  )
}, finally = {
  # disconnect DB
  dbDisconnect(con)
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment