# daattali/linkedin.R

## linkedin.R
# Get a person's name, location, summary, # of connections, and skills & endorsements from LinkedIn

# URL of the LinkedIn page
user_url <- "https://www.linkedin.com/in/daattali"

# The profile information isn't available without being logged in, so the web
# scraper needs to log in. Credentials are taken from the environment variables
# LINKEDIN_USERNAME / LINKEDIN_PASSWORD when they are set, so real secrets do
# not have to be written into this file; otherwise the placeholders below are
# used and should be replaced by hand. In this file they are only read by
# scrape_linkedin() to fill in the login form.
username <- Sys.getenv("LINKEDIN_USERNAME", unset = "yourusername")
password <- Sys.getenv("LINKEDIN_PASSWORD", unset = "yourpassword")

# Example usage (run it once scrape_linkedin() below has been defined). The call
# takes a couple of seconds and might throw a warning, which can be ignored:
# (linkedin_info <- scrape_linkedin(user_url))

############################

library(rvest)

# Scrape basic profile information from a LinkedIn profile page.
#
# Opens a session on the LinkedIn landing page, fills the first form found
# there with the credentials held in the variables `username` and `password`,
# submits it, then loads `user_url` in the same session and extracts fields
# with CSS selectors.
#
# NOTE: `username` and `password` are not parameters. They are free variables
# looked up in the enclosing environment, so they must be defined before this
# function is called.
#
# Args:
#   user_url: URL of the LinkedIn profile page to scrape.
#
# Returns:
#   A list with elements `name`, `location`, `num_connections` and `summary`
#   (the text of the nodes matched by each selector; empty when nothing
#   matched) and `skills`, a data frame with columns `name` and `num`, or NULL
#   when no skill nodes matched.
scrape_linkedin <- function(user_url) {
  # Build the data frame row(s) for a single skill node. A skill without an
  # endorsement-count node gets `num = NA` instead of making data.frame() fail
  # with "arguments imply differing number of rows"; a node without a skill
  # name contributes no rows at all.
  skill_row <- function(node) {
    skill_name <- node %>% html_nodes(".endorse-item-name-text") %>% html_text()
    num <- node %>% html_nodes(".num-endorsements") %>% html_text()
    if (length(skill_name) == 0) {
      return(data.frame(name = character(0), num = character(0)))
    }
    if (length(num) == 0) {
      num <- NA_character_
    }
    data.frame(name = skill_name, num = num)
  }

  linkedin_url <- "http://linkedin.com/"
  pgsession <- html_session(linkedin_url)
  pgform <- html_form(pgsession)[[1]]
  filled_form <- set_values(pgform,
                            session_key = username,
                            session_password = password)

  # The return value of submit_form() is discarded and the session created
  # above is reused for the profile request.
  # NOTE(review): this presumably relies on the login state being shared with
  # `pgsession` -- confirm with the rvest version in use.
  submit_form(pgsession, filled_form)

  pgsession <- jump_to(pgsession, user_url)
  page_html <- read_html(pgsession)

  name <-
    page_html %>% html_nodes("#name") %>% html_text()

  location <-
    page_html %>% html_nodes("#location .locality") %>% html_text()

  num_connections <-
    page_html %>% html_nodes(".member-connections strong") %>% html_text()

  summary <-
    page_html %>% html_nodes("#summary-item-view") %>% html_text()

  skills_nodes <-
    page_html %>% html_nodes("#profile-skills .skill-pill")

  # One small data frame per skill node, stacked into a single data frame.
  # do.call(rbind, list()) yields NULL when there are no skill nodes.
  skills <- do.call(rbind, lapply(skills_nodes, skill_row))

  list(
    name = name,
    location = location,
    num_connections = num_connections,
    summary = summary,
    skills = skills
  )
}

## twitter.R
# Make a wordcloud of the most common words in a person's tweets

# Load the packages first: setup_twitter_oauth() and userTimeline() come from
# twitteR, so they cannot be called before it is attached.
library(twitteR)
library(SnowballC)
library(wordcloud)
library(tm)
library(stringr)
library(dplyr)

# Need to create a Twitter App and get credentials
setup_twitter_oauth(USE_YOUR_CREDENTIALS_HERE)

# Username of the Twitter user
name <- "daattali"

#####################

user <- userTimeline(
  user = name,
  n = 3200,
  includeRts = FALSE,
  excludeReplies = TRUE
)

# One character vector of words per tweet: strip everything except letters,
# digits and spaces, then split on runs of spaces. lapply() (not sapply())
# keeps the result a list whatever the tweet lengths are.
tweets <- lapply(user, function(status) {
  strsplit(gsub("[^[:alnum:] ]", "", status$text), " +")[[1]]
})

# Flatten to a single vector of words before any string processing. Calling
# paste() directly on the list would deparse each multi-word tweet to
# 'c("a", "b")' and leave a stray "c" glued to the first word of every tweet.
topwords <-
  tweets %>%
  unlist %>%
  tolower %>%
  removePunctuation %>%
  removeWords(stopwords("english")) %>%
  #wordStem %>%
  .[. != ""] %>%
  table %>%
  sort(decreasing = TRUE) %>%
  head(100)

# `topwords` is a frequency table: its names are the words, its values the
# counts.
wordcloud(names(topwords), topwords, min.freq = 3)
	# Get a person's name, location, summary, # of connections, and skills & endorsements from LinkedIn

	# URL of the LinkedIn page
	user_url <- "https://www.linkedin.com/in/daattali"

	# The profile information isn't available without being logged in, so the web
	# scraper needs to log in. Credentials are taken from the environment variables
	# LINKEDIN_USERNAME / LINKEDIN_PASSWORD when they are set, so real secrets do
	# not have to be written into this file; otherwise the placeholders below are
	# used and should be replaced by hand. In this file they are only read by
	# scrape_linkedin() to fill in the login form.
	username <- Sys.getenv("LINKEDIN_USERNAME", unset = "yourusername")
	password <- Sys.getenv("LINKEDIN_PASSWORD", unset = "yourpassword")

	# Example usage (run it once scrape_linkedin() below has been defined). The call
	# takes a couple of seconds and might throw a warning, which can be ignored:
	# (linkedin_info <- scrape_linkedin(user_url))

	############################

	library(rvest)

	# Scrape basic profile information from a LinkedIn profile page.
	#
	# Opens a session on the LinkedIn landing page, fills the first form found
	# there with the credentials held in the variables `username` and `password`,
	# submits it, then loads `user_url` in the same session and extracts fields
	# with CSS selectors.
	#
	# NOTE: `username` and `password` are not parameters. They are free variables
	# looked up in the enclosing environment, so they must be defined before this
	# function is called.
	#
	# Args:
	#   user_url: URL of the LinkedIn profile page to scrape.
	#
	# Returns:
	#   A list with elements `name`, `location`, `num_connections` and `summary`
	#   (the text of the nodes matched by each selector; empty when nothing
	#   matched) and `skills`, a data frame with columns `name` and `num`, or NULL
	#   when no skill nodes matched.
	scrape_linkedin <- function(user_url) {
	  # Build the data frame row(s) for a single skill node. A skill without an
	  # endorsement-count node gets `num = NA` instead of making data.frame() fail
	  # with "arguments imply differing number of rows"; a node without a skill
	  # name contributes no rows at all.
	  skill_row <- function(node) {
	    skill_name <- node %>% html_nodes(".endorse-item-name-text") %>% html_text()
	    num <- node %>% html_nodes(".num-endorsements") %>% html_text()
	    if (length(skill_name) == 0) {
	      return(data.frame(name = character(0), num = character(0)))
	    }
	    if (length(num) == 0) {
	      num <- NA_character_
	    }
	    data.frame(name = skill_name, num = num)
	  }

	  linkedin_url <- "http://linkedin.com/"
	  pgsession <- html_session(linkedin_url)
	  pgform <- html_form(pgsession)[[1]]
	  filled_form <- set_values(pgform,
	                            session_key = username,
	                            session_password = password)

	  # The return value of submit_form() is discarded and the session created
	  # above is reused for the profile request.
	  # NOTE(review): this presumably relies on the login state being shared with
	  # `pgsession` -- confirm with the rvest version in use.
	  submit_form(pgsession, filled_form)

	  pgsession <- jump_to(pgsession, user_url)
	  page_html <- read_html(pgsession)

	  name <-
	    page_html %>% html_nodes("#name") %>% html_text()

	  location <-
	    page_html %>% html_nodes("#location .locality") %>% html_text()

	  num_connections <-
	    page_html %>% html_nodes(".member-connections strong") %>% html_text()

	  summary <-
	    page_html %>% html_nodes("#summary-item-view") %>% html_text()

	  skills_nodes <-
	    page_html %>% html_nodes("#profile-skills .skill-pill")

	  # One small data frame per skill node, stacked into a single data frame.
	  # do.call(rbind, list()) yields NULL when there are no skill nodes.
	  skills <- do.call(rbind, lapply(skills_nodes, skill_row))

	  list(
	    name = name,
	    location = location,
	    num_connections = num_connections,
	    summary = summary,
	    skills = skills
	  )
	}
	# Make a wordcloud of the most common words in a person's tweets

	# Load the packages first: setup_twitter_oauth() and userTimeline() come from
	# twitteR, so they cannot be called before it is attached.
	library(twitteR)
	library(SnowballC)
	library(wordcloud)
	library(tm)
	library(stringr)
	library(dplyr)

	# Need to create a Twitter App and get credentials
	setup_twitter_oauth(USE_YOUR_CREDENTIALS_HERE)

	# Username of the Twitter user
	name <- "daattali"

	#####################

	user <- userTimeline(
	  user = name,
	  n = 3200,
	  includeRts = FALSE,
	  excludeReplies = TRUE
	)

	# One character vector of words per tweet: strip everything except letters,
	# digits and spaces, then split on runs of spaces. lapply() (not sapply())
	# keeps the result a list whatever the tweet lengths are.
	tweets <- lapply(user, function(status) {
	  strsplit(gsub("[^[:alnum:] ]", "", status$text), " +")[[1]]
	})

	# Flatten to a single vector of words before any string processing. Calling
	# paste() directly on the list would deparse each multi-word tweet to
	# 'c("a", "b")' and leave a stray "c" glued to the first word of every tweet.
	topwords <-
	  tweets %>%
	  unlist %>%
	  tolower %>%
	  removePunctuation %>%
	  removeWords(stopwords("english")) %>%
	  #wordStem %>%
	  .[. != ""] %>%
	  table %>%
	  sort(decreasing = TRUE) %>%
	  head(100)

	# `topwords` is a frequency table: its names are the words, its values the
	# counts.
	wordcloud(names(topwords), topwords, min.freq = 3)