Skip to content

Instantly share code, notes, and snippets.

@Jian-Qiao
Created August 22, 2017 17:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Jian-Qiao/4971347d512fd085e79438165318e0bc to your computer and use it in GitHub Desktop.
Save Jian-Qiao/4971347d512fd085e79438165318e0bc to your computer and use it in GitHub Desktop.
#Data Cleaning
# Repair rows of Fighters_Scrapped whose fields ended up in the wrong
# columns, then stack the repaired pieces back onto the clean rows.
# NOTE(review): every step below addresses columns and rows by position, so
# it is only valid for the exact scrape it was written against -- confirm
# before re-running on fresh data.

# Rows whose Class cell is the literal ' VS ' are set aside; columns 1-8, 10
# and 11 are kept (column 9 is dropped). Presumably these are records with a
# missing field that shifted the remaining values left -- TODO confirm.
Error=Fighters_Scrapped[Fighters_Scrapped$Class==' VS ',c(1:8,10,11)]
# Everything else stays in the main table.
Fighters_Scrapped=Fighters_Scrapped[Fighters_Scrapped$Class!=' VS ',]
# Error_p1: set-aside rows whose Height cell contains 'lbs' (presumably a
# weight value sitting in the Height column, i.e. shifted by one more field).
Error_p1=Error[grepl('lbs',Error$Height),]
# Error keeps the remaining set-aside rows.
Error=Error[!grepl('lbs',Error$Height),]
# Error_p2: rows 6 and 13 of Error_p1, picked by hand. They are relabelled
# below with a layout that has no Name column, and the two names are then
# filled in manually.
Error_p2=Error_p1[c(6,13),]
Error_p1=Error_p1[-c(6,13),]
colnames(Error_p2)=c('Birth_Date','Age','Birth_Place','Country','Height','Weight','Association','Class','Fighter_id','Url')
Error_p2$Name=c('Mirsad Bektic','Noad Lahat')
# Error_p3: rows of Error_p1 whose Association cell equals that of its third
# row; column 8 is dropped. Error_p1 still carries the original column names
# at this point (it is renamed further down), so `Association` here is the
# original column label, not necessarily the value's real meaning.
# NOTE(review): `==` yields NA for NA cells, and an NA row index produces an
# all-NA row -- verify Association has no NA in these rows.
Error_p3=Error_p1[Error_p1$Association==Error_p1$Association[3],c(1:7,9,10)]
Error_p1=Error_p1[!Error_p1$Association==Error_p1$Association[3],]
# Relabel each piece with the layout its values actually follow:
#   Error_p3 - no Birth_Place and no Association
#   Error_p1 - no Birth_Place
#   Error    - no Association
colnames(Error_p3)=c('Name','Birth_Date','Age','Country','Height','Weight','Class','Fighter_id','Url')
colnames(Error_p1)=c('Name','Birth_Date','Age','Country','Height','Weight','Association','Class','Fighter_id','Url')
colnames(Error)=c('Name','Birth_Date','Age','Birth_Place','Country','Height','Weight','Class','Fighter_id','Url')
# Stack the clean rows and the four repaired pieces. rbind.fill is not
# defined in this excerpt; presumably plyr::rbind.fill, which matches columns
# by name and fills the ones a piece lacks with NA -- confirm plyr is loaded.
Fighters_Scrapped=rbind.fill(Fighters_Scrapped,Error,Error_p1,Error_p2,Error_p3)
# Field parsing: split the nickname out of Name, strip labels and units, and
# turn Height from feet'inches" text into total inches.

# NickName is the text inside the first pair of double quotes in Name.
# vapply() guarantees exactly one string per row: names without a quoted
# nickname get NA. (With sapply() those rows returned character(0), which
# made the whole result a list column that the 'N/A' pass below then turned
# into the literal text "character(0)".)
Fighters_Scrapped$NickName=vapply(as.character(Fighters_Scrapped$Name),function(x){
  quoted=regmatches(x,gregexpr('"[^"]*"',x))[[1]]
  if (length(quoted)==0) NA_character_ else gsub('\\"','',quoted[1])
},character(1),USE.NAMES=FALSE)
# gsub() is already vectorised, so the remaining columns are cleaned in one
# call each instead of element by element.
# Drop the quoted nickname (first quote through last quote) from Name.
Fighters_Scrapped$Name=gsub('\\".*\\"','',Fighters_Scrapped$Name)
# Strip the 'AGE: ' label.
Fighters_Scrapped$Age=gsub('AGE: ','',Fighters_Scrapped$Age)
# Feet: everything before the apostrophe.
Fighters_Scrapped$Feet=gsub("\\'.*",'',Fighters_Scrapped$Height)
# Inch: remove "<char>'" (the feet digit and apostrophe), then the closing
# double quote.
Fighters_Scrapped$Inch=gsub('\\"','',gsub(".\\'",'',Fighters_Scrapped$Height))
# Total height in inches; cells that are not numeric text become NA.
Fighters_Scrapped$Height=as.integer(Fighters_Scrapped$Feet)*12 + as.integer(Fighters_Scrapped$Inch)
# Strip the ' lbs' unit.
Fighters_Scrapped$Weight=gsub(' lbs','',Fighters_Scrapped$Weight)
# Any cell containing 'N/A' becomes NA (gsub with an NA replacement sets the
# whole matching element to NA). Every column comes out as character.
# Assigning through `[]` keeps the data frame itself, so a one-row table is
# not collapsed to a vector and columns are never turned into factors.
Fighters_Scrapped[]=lapply(Fighters_Scrapped,function(x) gsub('N/A',NA,x))
# ymd is not defined in this excerpt; presumably lubridate::ymd -- confirm
# lubridate is loaded.
Fighters_Scrapped$Birth_Date=ymd(Fighters_Scrapped$Birth_Date)
#Data Formatting
#-----------------------------------------------------------------------------------------------------------------------------
# Build Fighters_Updated: one row per Fighter_id with typed columns.
Fighters_Updated=Fighters_Scrapped
# Numeric columns are converted via as.character() first: as.integer() on a
# factor returns the internal level codes, not the numbers that are printed.
# For columns that are already character or integer this is a no-op.
Fighters_Updated$Fighter_id=as.integer(as.character(Fighters_Updated$Fighter_id))
# Keep the first row seen for each Fighter_id and renumber the rows.
Fighters_Updated=Fighters_Updated[!duplicated(Fighters_Updated$Fighter_id),]
rownames(Fighters_Updated)=NULL
# ymd is not defined in this excerpt; presumably lubridate::ymd.
Fighters_Updated$Birth_Date=ymd(as.character(Fighters_Updated$Birth_Date))
Fighters_Updated$Name=as.character(Fighters_Updated$Name)
Fighters_Updated$NickName=as.character(Fighters_Updated$NickName)
Fighters_Updated$Height=as.integer(as.character(Fighters_Updated$Height))
Fighters_Updated$Weight=as.integer(as.character(Fighters_Updated$Weight))
# Columns 7:10 are rebuilt from columns 7, 8, 9 and 11: values are taken as
# text, "" and 'N/A' become NA, and the result is stored as a factor.
# Converting to character before blanking matters because ifelse() on a
# factor returns level codes.
# NOTE(review): column 10 is overwritten with the contents of column 11, and
# column 7 is re-typed right after the integer casts above. The positional
# mapping is kept exactly as written; confirm against the actual column
# order that no needed column (e.g. Fighter_id or Weight) is clobbered.
Fighters_Updated[,7:10]=lapply(Fighters_Updated[,c(7:9,11)],function(x){
  x=as.character(x)
  x[x %in% c("",'N/A')]=NA
  as.factor(x)
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment