Skip to content

Instantly share code, notes, and snippets.

@alessiot
Last active October 31, 2019 13:16
Show Gist options
  • Save alessiot/57c9858e1e19b5e01869f67ad6d2a818 to your computer and use it in GitHub Desktop.
Save alessiot/57c9858e1e19b5e01869f67ad6d2a818 to your computer and use it in GitHub Desktop.
Download YouTube Spam Comments with R
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# https://docs.anaconda.com/anaconda/packages/r-language-pkg-docs/\n",
"# conda install rpy2\n",
"# conda install -c r r-essentials\n",
"# conda install -c r r-base\n",
"# conda search -f r-EXACTNAME to search for existing packages and conda install EXACTNAME\n",
"# If any issue happens, remove /anaconda3/lib/R and install everything again\n",
"\n",
"import rpy2.ipython\n",
"\n",
"%reload_ext rpy2.ipython"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"%%R\n",
"\n",
"# Snippet from\n",
"# https://github.com/christophM/interpretable-ml-book/blob/master/R/get-SpamTube-dataset.R\n",
"\n",
"# Data from\n",
"# http://www.dt.fee.unicamp.br/~tiago//youtubespamcollection/\n",
"\n",
"data_dir = \"youtube_data\"\n",
"\n",
"# Download the youtube datasets \n",
"download.spam.data = function(){\n",
" urls = sprintf('http://lasid.sor.ufscar.br/labeling/datasets/%i/download/', 9:13)\n",
" ycomments = lapply(urls, read.csv, stringsAsFactors=FALSE)\n",
" ycomments = do.call('rbind', ycomments)\n",
" cleanFun <- function(htmlString) {\n",
" return(gsub(\"<.*?>\", \"\", htmlString))\n",
" }\n",
" ycomments$CONTENT = cleanFun(ycomments$CONTENT)\n",
" # Convert to ASCII\n",
" ycomments$CONTENT = iconv(ycomments$CONTENT, \"UTF-8\", \"ASCII\", sub=\"\")\n",
" write.csv( x = ycomments, file = sprintf('%s/TubeSpam.csv', data_dir),row.names=FALSE)\n",
"}\n",
"download.spam.data()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment