Kendra Vant kendravant

## cancer.R
# FILE: Classifying Breast Cancer as Benign or Malignant
# AUTHOR: Timothy P. Jurka

library(RTextTools)

# Download the breast cancer records from the UCI repository. The file is
# read without a header row, and the first column is then discarded
# (presumably a sample identifier -- confirm against the dataset notes).
# Dataset notes cited by the original script:
# http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.names
data <- read.csv(
  file = "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
  header = FALSE
)
data <- data[, -1, drop = FALSE]

# ADD TEXTUAL DESCRIPTORS FOR EACH MASS CHARACTERISTIC FOR THE DOCUMENT-TERM MATRIX

## tm_example.R
# Attach the packages this example needs, installing only the ones that are
# not already present. The original began with rm(list = ls()), which wiped
# every object in the workspace; that call has been removed so running this
# section no longer destroys unrelated objects.
doInstall <- TRUE  # Change to FALSE if you don't want packages installed.
# NOTE(review): "Snowball" may no longer be available from CRAN -- confirm
# whether "SnowballC" should be used instead.
toInstall <- c("zoo", "tm", "ggplot2", "Snowball")
if (doInstall) {
  # Only fetch what is missing, so re-running does not reinstall everything.
  missingPkgs <- setdiff(toInstall, rownames(installed.packages()))
  if (length(missingPkgs) > 0) {
    install.packages(missingPkgs, repos = "http://cran.r-project.org")
  }
}
# invisible() keeps the list returned by lapply() from being printed.
invisible(lapply(toInstall, library, character.only = TRUE))

# Read the debate transcript as a character vector (one element per line)
# and preview its first 20 lines.
# From: http://www.cnn.com/2012/10/03/politics/debate-transcript/index.html
Transcript <- readLines("https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt")
head(Transcript, 20)

## geocoded_Tweets.R
# Attach the packages used for the Twitter example, installing only the ones
# that are not already present (the original reinstalled all of them on
# every run).
doInstall <- TRUE  # Change to FALSE if you don't want packages installed.
toInstall <- c("twitteR", "dismo", "maps", "ggplot2")
if (doInstall) {
  missingPkgs <- setdiff(toInstall, rownames(installed.packages()))
  if (length(missingPkgs) > 0) {
    install.packages(missingPkgs, repos = "http://cran.us.r-project.org")
  }
}
# invisible() keeps the list returned by lapply() from being printed.
invisible(lapply(toInstall, library, character.only = TRUE))

searchTerm <- "#rstats"
searchResults <- searchTwitter(searchTerm, n = 1000)  # Gather Tweets
tweetFrame <- twListToDF(searchResults)  # Convert to a nice dF

userInfo <- lookupUsers(tweetFrame$screenName)  # Batch lookup of user info

## free text manipulation
library(stringr)

# Normalise whitespace in the notes. The named patterns are applied in order:
# newlines become spaces, carriage returns become spaces, and finally every
# run of one or more whitespace characters is collapsed to a single space.
df$NOTE <- str_replace_all(
  df$NOTE,
  c("\\n" = " ", "\\r" = " ", "\\s+" = " ")
)

# fixed() asks for a literal match instead of a regular expression, which is
# faster when the target is a plain text string.
df$STRIPPED_NOTE <- str_replace_all(df$STRIPPED_NOTE, fixed("Travel,"), " ")

# Returns TRUE/FALSE per note: use this to build a flag when the phrase is
# present rather than altering the text itself.
str_detect(df$NOTE, fixed("Family's future,"))

## connect R to db
library(RODBC)

# Read a pipe-delimited text file with a header row. <file_name> and xxxx are
# template placeholders that must be filled in before running. The argument
# is spelled out as `nrows` instead of relying on partial matching of `nrow`.
df <- read.table("../01 raw_data/<file_name>.txt", sep = "|", header = TRUE, nrows = xxxx)

# Open a trusted connection to SQL Server and pull a whole table; note that
# this overwrites the `df` read from the text file above.
dbname <- odbcDriverConnect('driver={SQL Server};server=<server_name>;database=<database_name>;trusted_connection=true')
df <- sqlQuery(dbname, "select * from <table_name>")
# Release the ODBC handle once the query result is in memory; the original
# left the connection open.
odbcClose(dbname)

## pig script for counting words
-- Load the input file with PigStorage using '|' as the field delimiter and
-- name its four fields; every field is declared as chararray.
unstructuredText = load '<file name>' using PigStorage('|')
  as
		(
		CUSTOMER_NUMBER:chararray,
		VISIT_TYPE:chararray,
		REVIEW_DATE:chararray,
		NOTE:chararray
		);

tokenized = foreach unstructuredText
	# FILE: Classifying Breast Cancer as Benign or Malignant
	# AUTHOR: Timothy P. Jurka

	library(RTextTools)

	# Download the breast cancer records from the UCI repository. The file is
	# read without a header row, and the first column is then discarded
	# (presumably a sample identifier -- confirm against the dataset notes).
	# Dataset notes cited by the original script:
	# http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.names
	data <- read.csv(
	  file = "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
	  header = FALSE
	)
	data <- data[, -1, drop = FALSE]

	# ADD TEXTUAL DESCRIPTORS FOR EACH MASS CHARACTERISTIC FOR THE DOCUMENT-TERM MATRIX
	# Attach the packages this example needs, installing only the ones that are
	# not already present. The original began with rm(list = ls()), which wiped
	# every object in the workspace; that call has been removed so running this
	# section no longer destroys unrelated objects.
	doInstall <- TRUE # Change to FALSE if you don't want packages installed.
	# NOTE(review): "Snowball" may no longer be available from CRAN -- confirm
	# whether "SnowballC" should be used instead.
	toInstall <- c("zoo", "tm", "ggplot2", "Snowball")
	if (doInstall) {
	  # Only fetch what is missing, so re-running does not reinstall everything.
	  missingPkgs <- setdiff(toInstall, rownames(installed.packages()))
	  if (length(missingPkgs) > 0) {
	    install.packages(missingPkgs, repos = "http://cran.r-project.org")
	  }
	}
	# invisible() keeps the list returned by lapply() from being printed.
	invisible(lapply(toInstall, library, character.only = TRUE))

	# Read the debate transcript as a character vector (one element per line)
	# and preview its first 20 lines.
	# From: http://www.cnn.com/2012/10/03/politics/debate-transcript/index.html
	Transcript <- readLines("https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt")
	head(Transcript, 20)
	# Attach the packages used for the Twitter example, installing only the ones
	# that are not already present (the original reinstalled all of them on
	# every run).
	doInstall <- TRUE # Change to FALSE if you don't want packages installed.
	toInstall <- c("twitteR", "dismo", "maps", "ggplot2")
	if (doInstall) {
	  missingPkgs <- setdiff(toInstall, rownames(installed.packages()))
	  if (length(missingPkgs) > 0) {
	    install.packages(missingPkgs, repos = "http://cran.us.r-project.org")
	  }
	}
	# invisible() keeps the list returned by lapply() from being printed.
	invisible(lapply(toInstall, library, character.only = TRUE))

	searchTerm <- "#rstats"
	searchResults <- searchTwitter(searchTerm, n = 1000) # Gather Tweets
	tweetFrame <- twListToDF(searchResults) # Convert to a nice dF

	userInfo <- lookupUsers(tweetFrame$screenName) # Batch lookup of user info
	library(stringr)

	# Normalise whitespace in the notes. The named patterns are applied in order:
	# newlines become spaces, carriage returns become spaces, and finally every
	# run of one or more whitespace characters is collapsed to a single space.
	df$NOTE <- str_replace_all(
	  df$NOTE,
	  c("\\n" = " ", "\\r" = " ", "\\s+" = " ")
	)

	# fixed() asks for a literal match instead of a regular expression, which is
	# faster when the target is a plain text string.
	df$STRIPPED_NOTE <- str_replace_all(df$STRIPPED_NOTE, fixed("Travel,"), " ")

	# Returns TRUE/FALSE per note: use this to build a flag when the phrase is
	# present rather than altering the text itself.
	str_detect(df$NOTE, fixed("Family's future,"))
	library(RODBC)

	# Read a pipe-delimited text file with a header row. <file_name> and xxxx are
	# template placeholders that must be filled in before running.
	# The separator is the plain "|" character: the original '\|' is not a valid
	# escape sequence in an R string and fails to parse. `nrows` is spelled out
	# instead of relying on partial matching of `nrow`.
	df <- read.table("../01 raw_data/<file_name>.txt", sep = "|", header = TRUE, nrows = xxxx)

	# Open a trusted connection to SQL Server and pull a whole table; note that
	# this overwrites the `df` read from the text file above.
	dbname <- odbcDriverConnect('driver={SQL Server};server=<server_name>;database=<database_name>;trusted_connection=true')
	df <- sqlQuery(dbname, "select * from <table_name>")
	# Release the ODBC handle once the query result is in memory; the original
	# left the connection open.
	odbcClose(dbname)
	-- Load the input file with PigStorage and name its four fields; every
	-- field is declared as chararray.
	-- NOTE(review): the delimiter here is written '\|' whereas the earlier copy
	-- of this statement uses '|' -- confirm the backslash is intended.
	unstructuredText = load '<file name>' using PigStorage('\|')
	as
	(
	CUSTOMER_NUMBER:chararray,
	VISIT_TYPE:chararray,
	REVIEW_DATE:chararray,
	NOTE:chararray
	);

	tokenized = foreach unstructuredText