Last active
October 31, 2019 13:16
-
-
Save alessiot/57c9858e1e19b5e01869f67ad6d2a818 to your computer and use it in GitHub Desktop.
Download YouTube Spam Comments with R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# https://docs.anaconda.com/anaconda/packages/r-language-pkg-docs/\n", | |
"# conda install rpy2\n", | |
"# conda install -c r r-essentials\n", | |
"# conda install -c r r-base\n", | |
"# conda search -f r-EXACTNAME to search for existing packages and conda install EXACTNAME\n", | |
"# If any issue happens, remove /anaconda3/lib/R and install everything again\n", | |
"\n", | |
"import rpy2.ipython\n", | |
"\n", | |
"%reload_ext rpy2.ipython" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%R\n", | |
"\n", | |
"# Snippet from\n", | |
"# https://github.com/christophM/interpretable-ml-book/blob/master/R/get-SpamTube-dataset.R\n", | |
"\n", | |
"# Data from\n", | |
"# http://www.dt.fee.unicamp.br/~tiago//youtubespamcollection/\n", | |
"\n", | |
"data_dir = \"youtube_data\"\n", | |
"\n", | |
"# Download the youtube datasets \n", | |
"download.spam.data = function(){\n", | |
" urls = sprintf('http://lasid.sor.ufscar.br/labeling/datasets/%i/download/', 9:13)\n", | |
" ycomments = lapply(urls, read.csv, stringsAsFactors=FALSE)\n", | |
" ycomments = do.call('rbind', ycomments)\n", | |
" cleanFun <- function(htmlString) {\n", | |
" return(gsub(\"<.*?>\", \"\", htmlString))\n", | |
" }\n", | |
" ycomments$CONTENT = cleanFun(ycomments$CONTENT)\n", | |
" # Convert to ASCII\n", | |
" ycomments$CONTENT = iconv(ycomments$CONTENT, \"UTF-8\", \"ASCII\", sub=\"\")\n", | |
" write.csv( x = ycomments, file = sprintf('%s/TubeSpam.csv', data_dir),row.names=FALSE)\n", | |
"}\n", | |
"download.spam.data()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
}, | |
"toc": { | |
"base_numbering": 1, | |
"nav_menu": {}, | |
"number_sections": true, | |
"sideBar": true, | |
"skip_h1_title": false, | |
"title_cell": "Table of Contents", | |
"title_sidebar": "Contents", | |
"toc_cell": false, | |
"toc_position": {}, | |
"toc_section_display": true, | |
"toc_window_display": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment