Scraping Twitter and LinkedIn info in R
# Get a person's name, location, summary, # of connections, and skills & endorsements from LinkedIn
# URL of the LinkedIn page
user_url <- "https://www.linkedin.com/in/daattali"
# Since this information isn't available without being logged in, the web
# scraper needs to log in. Provide your LinkedIn username/password here (as
# you can see, they aren't stored anywhere; they're only used to log in
# during the scraping session)
username <- "yourusername"
password <- "yourpassword"
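
# A sketch of a safer alternative: read the credentials from environment
# variables rather than hardcoding them in the script (LINKEDIN_USER and
# LINKEDIN_PW are just example variable names)
# username <- Sys.getenv("LINKEDIN_USER")
# password <- Sys.getenv("LINKEDIN_PW")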
# Calling the scraper takes a couple of seconds and might throw a warning,
# but the warning can safely be ignored
# (linkedin_info <- scrape_linkedin(user_url))
############################
library(rvest)
scrape_linkedin <- function(user_url) {
  # Note: this uses the rvest 0.x API; in rvest >= 1.0 these functions were
  # renamed (html_session -> session, set_values -> html_form_set,
  # submit_form -> session_submit, jump_to -> session_jump_to)

  # Start a session on the LinkedIn homepage and grab its login form
  linkedin_url <- "http://linkedin.com/"
  pgsession <- html_session(linkedin_url)
  pgform <- html_form(pgsession)[[1]]

  # Fill in the form with the credentials defined above and submit it,
  # keeping the returned (logged-in) session
  filled_form <- set_values(pgform,
                            session_key = username,
                            session_password = password)
  pgsession <- submit_form(pgsession, filled_form)

  # Navigate to the profile page and parse its HTML
  pgsession <- jump_to(pgsession, user_url)
  page_html <- read_html(pgsession)

  # Extract the profile fields with CSS selectors
  name <-
    page_html %>% html_nodes("#name") %>% html_text()
  location <-
    page_html %>% html_nodes("#location .locality") %>% html_text()
  num_connections <-
    page_html %>% html_nodes(".member-connections strong") %>% html_text()
  summary <-
    page_html %>% html_nodes("#summary-item-view") %>% html_text()

  # Each skill node holds a skill name and its number of endorsements;
  # build a one-row data frame per skill, then bind them into one data frame
  skills_nodes <-
    page_html %>% html_nodes("#profile-skills .skill-pill")
  skills <- lapply(skills_nodes, function(node) {
    num <- node %>% html_nodes(".num-endorsements") %>% html_text()
    name <- node %>% html_nodes(".endorse-item-name-text") %>% html_text()
    data.frame(name = name, num = num)
  })
  skills <- do.call(rbind, skills)

  # Return everything as a named list
  list(
    name = name,
    location = location,
    num_connections = num_connections,
    summary = summary,
    skills = skills
  )
}
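
# Example usage (a sketch): the CSS selectors above reflect LinkedIn's markup
# at the time of writing and may need updating if the page layout has changed
# linkedin_info <- scrape_linkedin(user_url)
# linkedin_info$name
# linkedin_info$num_connections
# head(linkedin_info$skills)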
# Make a wordcloud of the most common words in a person's tweets
# You need to create a Twitter app and get API credentials first; load the
# twitteR package (below) before authenticating
# setup_twitter_oauth(USE_YOUR_CREDENTIALS_HERE)
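# For reference, setup_twitter_oauth() takes the app's credentials as four
# arguments (the values here are placeholders):
# setup_twitter_oauth(consumer_key = "...", consumer_secret = "...",
#                     access_token = "...", access_secret = "...")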
# Username of the Twitter user
name <- "daattali"
#####################
library(twitteR)
library(SnowballC)
library(wordcloud)
library(tm)
library(stringr)
library(dplyr)
# Fetch the user's most recent tweets (3200 is the most the API will return),
# excluding retweets and replies
user <- userTimeline(user = name, n = 3200, includeRts = FALSE, excludeReplies = TRUE)

# Strip non-alphanumeric characters from each tweet and split it into words
# (unlist flattens the per-tweet word vectors into one character vector)
tweets <- unlist(sapply(user, function(x) {
  strsplit(gsub("[^[:alnum:] ]", "", x$text), " +")[[1]]
}))

# Build a frequency table of the 100 most common words, after lowercasing,
# removing punctuation, and dropping English stopwords
topwords <-
  tweets %>%
  paste(collapse = " ") %>%
  str_split("\\s") %>%
  unlist %>%
  tolower %>%
  removePunctuation %>%
  removeWords(stopwords("english")) %>%
  # wordStem %>%   # optionally stem words (this is what SnowballC is loaded for)
  .[. != ""] %>%
  table %>%
  sort(decreasing = TRUE) %>%
  head(100)

# Plot the wordcloud: word sizes are proportional to frequency, and only
# words appearing at least 3 times are shown
wordcloud(names(topwords), topwords, min.freq = 3)
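
# To save the cloud to a file instead of the active plotting device, one
# option (a sketch using base grDevices) is:
# png("wordcloud.png", width = 800, height = 800)
# wordcloud(names(topwords), topwords, min.freq = 3)
# dev.off()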