# Mike Spaner (mspan)

## rf_phone_tutorial.R
library(randomForest)

# The cleaned-up dataset used below can be re-downloaded with the snippet that
# follows. It is kept for reference only and is intentionally commented out.
#
#   url <- "https://spark-public.s3.amazonaws.com/dataanalysis/samsungData.rda"
#   destfile <- "./samsungData2.rda"
#   download.file(url, destfile, method = "curl", quiet = FALSE, mode = "wb",
#                 cacheOK = TRUE)

# Restore the objects stored in the local .rda file into the calling
# environment.
load(file = "~/Dropbox/random_phone_tutorial/samsungData.rda")

## RandomForestsInTen_final.ipynb

      
# Gist mspan / RandomForestsInTen_final.ipynb: "Overview of Random Forests for
# newhaven.io presentation". Created September 26, 2013 00:10.
# 1 file, 0 forks, 0 comments, 1 star.
# The notebook could not be displayed here (the viewer requires an iframe).
      
    
## find-duplicate-files.R
library("digest")
# Directory that is scanned recursively for candidate files.
test_dir <- "/Volumes/Public/book_k/photo_backup"

# Full paths of every file under `test_dir` whose name matches "JPG|AVI".
# NOTE(review): the pattern is unanchored and case-sensitive, so it matches
# "JPG"/"AVI" anywhere in a file name and skips lower-case extensions such as
# ".jpg" -- confirm whether that is intended.
filelist <- dir(
  test_dir,
  pattern = "JPG|AVI",
  recursive = TRUE,
  all.files = TRUE,
  full.names = TRUE
)

# a concise, vectorized solution
# http://stackoverflow.com/questions/14060423/how-to-vectorize-this-r-code-using-plyr-apply-or-similar
#
# vapply() (rather than sapply()) guarantees a character vector even when
# `filelist` is empty, so split() below yields an empty list instead of failing.
# NOTE(review): `length = 5000` asks digest() to process only the start of each
# file, so files that share that leading portion get the same hash even if they
# differ later -- confirm this speed/accuracy trade-off is acceptable.
md5s <- vapply(
  filelist,
  digest,
  FUN.VALUE = character(1),
  file = TRUE,
  algo = "md5",
  length = 5000
)

# Group the file paths by hash: one list element per distinct MD5 value.
duplicate_files <- split(filelist, md5s)

# now let's divide the list into duplicates (length > 1) and uniques (length == 1)
	library(randomForest)

	# The cleaned-up dataset used below can be re-downloaded with the snippet
	# that follows. It is kept for reference only and is intentionally
	# commented out.
	#
	#   url <- "https://spark-public.s3.amazonaws.com/dataanalysis/samsungData.rda"
	#   destfile <- "./samsungData2.rda"
	#   download.file(url, destfile, method = "curl", quiet = FALSE, mode = "wb",
	#                 cacheOK = TRUE)

	# Restore the objects stored in the local .rda file into the calling
	# environment.
	load(file = "~/Dropbox/random_phone_tutorial/samsungData.rda")
	library("digest")
	# Directory that is scanned recursively for candidate files.
	test_dir <- "/Volumes/Public/book_k/photo_backup"

	# Full paths of every file under `test_dir` whose name matches "JPG|AVI".
	# The previous literal "JPG\|AVI" contained `\|`, which R rejects as an
	# unrecognized escape in a character string; a plain `|` is the regex
	# alternation that was intended.
	# NOTE(review): the pattern is unanchored and case-sensitive, so it matches
	# "JPG"/"AVI" anywhere in a file name and skips lower-case extensions such
	# as ".jpg" -- confirm whether that is intended.
	filelist <- dir(
	  test_dir,
	  pattern = "JPG|AVI",
	  recursive = TRUE,
	  all.files = TRUE,
	  full.names = TRUE
	)

	# a concise, vectorized solution
	# http://stackoverflow.com/questions/14060423/how-to-vectorize-this-r-code-using-plyr-apply-or-similar
	#
	# vapply() (rather than sapply()) guarantees a character vector even when
	# `filelist` is empty, so split() below yields an empty list instead of
	# failing.
	# NOTE(review): `length = 5000` asks digest() to process only the start of
	# each file, so files that share that leading portion get the same hash even
	# if they differ later -- confirm this speed/accuracy trade-off is acceptable.
	md5s <- vapply(
	  filelist,
	  digest,
	  FUN.VALUE = character(1),
	  file = TRUE,
	  algo = "md5",
	  length = 5000
	)

	# Group the file paths by hash: one list element per distinct MD5 value.
	duplicate_files <- split(filelist, md5s)

	# now let's divide the list into duplicates (length > 1) and uniques (length == 1)