Skip to content

Instantly share code, notes, and snippets.

View kendravant's full-sized avatar

Kendra Vant kendravant

View GitHub Profile
# FILE: Classifying Breast Cancer as Benign or Malignant
# AUTHOR: Timothy P. Jurka
library(RTextTools);
# Download the Wisconsin breast cancer data set from the UCI repository.
# The file has no header row, so columns get R's default names (V1, V2, ...).
# The first column is discarded right away (presumably the sample ID --
# confirm against the dataset description, e.g.
# http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.names).
data <- read.csv(
  file = "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
  header = FALSE
)[-1]
# ADD TEXTUAL DESCRIPTORS FOR EACH MASS CHARACTERISTIC FOR THE DOCUMENT-TERM MATRIX
# Set up the packages used by this script and read the debate transcript.
#
# NOTE: the original script began with rm(list = ls()), which silently wipes
# every object in the caller's workspace. It has been removed; start a fresh
# R session instead if a clean environment is required.
doInstall <- TRUE # Change to FALSE if you don't want packages installed.
toInstall <- c("zoo", "tm", "ggplot2", "Snowball")
# NOTE(review): "Snowball" appears to have been archived on CRAN (SnowballC is
# its successor) -- confirm before relying on this install step.
if (doInstall) {
  # Install only what is missing so that re-running the script does not
  # re-download packages that are already available.
  missingPackages <- setdiff(toInstall, rownames(installed.packages()))
  # Guard: calling install.packages() with an empty vector would prompt for a
  # selection (or fail when non-interactive) rather than do nothing.
  if (length(missingPackages) > 0) {
    install.packages(missingPackages, repos = "http://cran.r-project.org")
  }
}
# Attach every package; invisible() suppresses the printed search-path lists.
invisible(lapply(toInstall, library, character.only = TRUE))
# From: http://www.cnn.com/2012/10/03/politics/debate-transcript/index.html
# Read the transcript as a character vector, one element per line of the file.
Transcript <- readLines("https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt")
# Preview the first 20 lines.
head(Transcript, 20)
# Set up the packages used by this script, then search Twitter for a hashtag
# and look up the users who posted the matching tweets.
doInstall <- TRUE # Change to FALSE if you don't want packages installed.
toInstall <- c("twitteR", "dismo", "maps", "ggplot2")
if (doInstall) {
  # Install only what is missing so that re-running the script does not
  # re-download packages that are already available.
  missingPackages <- setdiff(toInstall, rownames(installed.packages()))
  # Guard: calling install.packages() with an empty vector would prompt for a
  # selection (or fail when non-interactive) rather than do nothing.
  if (length(missingPackages) > 0) {
    install.packages(missingPackages, repos = "http://cran.us.r-project.org")
  }
}
# Attach every package; invisible() suppresses the printed search-path lists.
invisible(lapply(toInstall, library, character.only = TRUE))
searchTerm <- "#rstats"
# NOTE(review): searchTwitter() presumably needs API credentials configured
# beforehand; none are set up in this snippet -- confirm.
searchResults <- searchTwitter(searchTerm, n = 1000) # Gather up to 1000 tweets
tweetFrame <- twListToDF(searchResults) # Convert the result list to a data frame
userInfo <- lookupUsers(tweetFrame$screenName) # Batch lookup of user info
@kendravant
kendravant / free text manipulation
Created March 4, 2013 05:35
Library and common commands to clean and manipulate free text data
library(stringr)
# Normalise whitespace in the free-text NOTE column of `df`.
# Step 1: turn every newline character into a space.
df$NOTE <- str_replace_all(string = df$NOTE, pattern = "\\n", replacement = " ")
# Step 2: turn every carriage return into a space.
df$NOTE <- str_replace_all(string = df$NOTE, pattern = "\\r", replacement = " ")
# Step 3: collapse each run of whitespace characters into a single space.
df$NOTE <- str_replace_all(string = df$NOTE, pattern = "\\s+", replacement = " ")
# Blank out a literal phrase. fixed() matches the text as-is rather than as a
# regular expression, which is faster. Note that this line works on the
# STRIPPED_NOTE column, not NOTE.
df$STRIPPED_NOTE <- str_replace_all(
  string = df$STRIPPED_NOTE,
  pattern = fixed("Travel,"),
  replacement = " "
)
# Produce a logical flag per row (TRUE when the phrase is present) instead of
# modifying the text.
str_detect(string = df$NOTE, pattern = fixed("Family's future,"))
@kendravant
kendravant / connect R to db
Created March 4, 2013 05:06
Library and steps to read data from a relational database into R
library(RODBC)
# Two ways of loading the data into `df`; the second assignment overwrites the
# first, so keep whichever applies. <file_name>, xxxx, <server_name>,
# <database_name> and <table_name> are placeholders to fill in.
#
# Option 1: read a pipe-delimited flat file with a header row, limited to the
# first xxxx rows. The argument is spelled `nrows` in full rather than relying
# on partial matching of `nrow`.
df <- read.table(
  "../01 raw_data/<file_name>.txt",
  sep = "|",
  header = TRUE,
  nrows = xxxx
)
# Option 2: open an ODBC connection to SQL Server using Windows authentication
# (trusted_connection=true) and pull a whole table.
dbname <- odbcDriverConnect('driver={SQL Server};server=<server_name>;database=<database_name>;trusted_connection=true')
df <- sqlQuery(dbname, "select * from <table_name>")
# NOTE(review): the connection handle `dbname` is never closed here; call
# odbcClose(dbname) once no further queries are needed.
@kendravant
kendravant / pig script for counting words
Created March 4, 2013 00:43
Pig script for counting words in small text comments stored in one column of a flat file.
-- Load the pipe-delimited input file and name its four columns,
-- reading every column as a chararray.
unstructuredText = LOAD '<file name>' USING PigStorage('|') AS (
    CUSTOMER_NUMBER:chararray,
    VISIT_TYPE:chararray,
    REVIEW_DATE:chararray,
    NOTE:chararray
);
tokenized = foreach unstructuredText