Create a dataset with stopwords in Portuguese (from Stopwords ISO)
### STOPWORDS IN PORTUGUESE
# Create a dataset with stopwords in Portuguese
# The original datasets used for this are from Stopwords ISO, at https://github.com/stopwords-iso/stopwords-pt
#rm(list = ls())
library(dplyr)
library(stringi)
library(readr)

## Create a unique dataset from several datasets with stopwords in Portuguese
# Dataset links
links_stopwords_pt <- c("https://raw.githubusercontent.com/stopwords-iso/stopwords-pt/master/raw/bbalet_stopwords_pt.txt",
                        "https://raw.githubusercontent.com/stopwords-iso/stopwords-pt/master/raw/fergiemcdowall_stopwords_pt.txt",
                        "https://raw.githubusercontent.com/stopwords-iso/stopwords-pt/master/raw/geonetwork-por.txt",
                        "https://raw.githubusercontent.com/stopwords-iso/stopwords-pt/master/raw/gh-stopwords-json-pt.txt",
                        "https://raw.githubusercontent.com/stopwords-iso/stopwords-pt/master/raw/language-resource-stopwords-variant.txt",
                        "https://raw.githubusercontent.com/stopwords-iso/stopwords-pt/master/raw/language-resource-stopwords.txt",
                        "https://raw.githubusercontent.com/stopwords-iso/stopwords-pt/master/raw/ranksnl-brazilian.txt",
                        "https://raw.githubusercontent.com/stopwords-iso/stopwords-pt/master/raw/ranksnl-portugese.txt",
                        "https://raw.githubusercontent.com/stopwords-iso/stopwords-pt/master/raw/stop-words-portugese.txt",
                        "https://raw.githubusercontent.com/stopwords-iso/stopwords-pt/master/raw/stopwords-filter-pt.txt")

# Read each list and stack them into a single data frame
stopwords_pt_final <- NULL
for(i in seq_along(links_stopwords_pt)){
  banco <- read_delim(links_stopwords_pt[i], delim = "\n", col_names = "word")
  stopwords_pt_final <- rbind.data.frame(stopwords_pt_final, banco)
}
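# A minimal alternative sketch of the same step (using only the readr and dplyr
# packages already loaded above): read each file as plain lines and stack the
# results, which avoids growing the data frame inside the loop.
# stopwords_pt_final <- lapply(links_stopwords_pt,
#                              function(link) data.frame(word = read_lines(link))) %>%
#   bind_rows()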
# Add the stopwords source
stopwords_pt_final$source <- "stopwords-iso"
# Trim whitespace around each word (some raw files prefix words with a tab)
stopwords_pt_final$word <- trimws(stopwords_pt_final$word)
# Create the dataset with unique stopwords
stopwords_pt_final <- stopwords_pt_final %>%
  mutate(word = tolower(word)) %>%
  distinct()
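# Optional check (a sketch): list the words that still carry accents at this
# point; the next step strips those accents, e.g. "não" becomes "nao".
# stopwords_pt_final %>%
#   filter(word != stri_trans_general(word, "Latin-ASCII"))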
# Create a second dataset with unique stopwords once accents are stripped from the strings
stopwords_pt_final_noaccent <- stopwords_pt_final %>%
  mutate(word = tolower(stri_trans_general(word, "Latin-ASCII"))) %>%
  distinct()
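# Example use (a sketch; `tokens` is a hypothetical one-word-per-row data frame
# with a `word` column, e.g. as produced by tidytext::unnest_tokens()):
# tokens_clean <- tokens %>%
#   anti_join(stopwords_pt_final, by = "word")
# Optionally save both versions (file paths are illustrative):
# write_csv(stopwords_pt_final, "stopwords_pt.csv")
# write_csv(stopwords_pt_final_noaccent, "stopwords_pt_noaccent.csv")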