diegotf30/Twitter_ADB.ipynb

## Twitter_ADB.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Proyecto de Twitter - Bases de Datos Avanzadas"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Inicializar MongoDB & API con Llaves"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import tweepy\n",
    "from pymongo import MongoClient  # needed by this cell; without it a fresh-kernel Run All fails with NameError\n",
    "\n",
    "# Credentials are read from the environment so real keys are never committed with the notebook.\n",
    "# The '-' fallback keeps the original placeholder value when a variable is not set.\n",
    "api_key = os.environ.get('TWITTER_API_KEY', '-')\n",
    "api_secret = os.environ.get('TWITTER_API_SECRET', '-')\n",
    "access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '-')\n",
    "access_secret = os.environ.get('TWITTER_ACCESS_SECRET', '-')\n",
    "\n",
    "# Authenticated API handle; the stream listener and the stream itself reuse `api`.\n",
    "auth = tweepy.OAuthHandler(api_key, api_secret)\n",
    "auth.set_access_token(access_token, access_secret)\n",
    "api = tweepy.API(auth)\n",
    "\n",
    "# Every query in this notebook goes through `db` (the `Twitter` database of this client).\n",
    "client = MongoClient('mongodb://localhost/TwitterADB')\n",
    "db = client.Twitter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Stream de Tweets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pymongo import MongoClient\n",
    "import time\n",
    "\n",
    "\n",
    "class StreamListener(tweepy.StreamListener):\n",
    "    \"\"\"Stream listener that stores each incoming tweet in `db.Tweets` for a limited time.\n",
    "\n",
    "    Uses the notebook-level `api` and `db` objects. The time limit is only checked when a\n",
    "    status arrives, so a quiet stream can run past it until the next tweet shows up.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, time_limit=60):\n",
    "        \"\"\"Start the clock and remember how many tweets are already stored.\n",
    "\n",
    "        time_limit: number of seconds during which incoming tweets are stored.\n",
    "        \"\"\"\n",
    "        super().__init__(api)  # initialise the tweepy base listener as well\n",
    "        self.start_time = time.time()\n",
    "        self.limit = time_limit\n",
    "        self.api = api\n",
    "        self.start_amount = db.Tweets.count_documents({}) # used to print # tweets the stream added\n",
    "\n",
    "    def on_status(self, status):\n",
    "        \"\"\"Store one tweet; once the time limit has passed, print the tally and return False.\n",
    "\n",
    "        Returning False tells tweepy to stop the stream.\n",
    "        \"\"\"\n",
    "        if (time.time() - self.start_time) >= self.limit:\n",
    "            actual_amount = db.Tweets.count_documents({})\n",
    "            print(f'Added {actual_amount - self.start_amount} tweets')\n",
    "            return False\n",
    "\n",
    "        # Build the document only when it is actually going to be inserted.\n",
    "        tweet = {\n",
    "            \"user\": {\n",
    "                \"accountName\": status.author.screen_name,\n",
    "                \"isVerified\": status.user.verified,\n",
    "                \"joinDate\": status.user.created_at,\n",
    "                \"followers\": status.user.followers_count,\n",
    "                \"headerColor\": status.user.profile_background_color,\n",
    "                \"location\": status.user.location,\n",
    "                \"isGeoEnabled\": status.user.geo_enabled\n",
    "            },\n",
    "            \"text\": status.text,\n",
    "            \"rts\": status.retweet_count,\n",
    "            \"favs\": status.favorite_count,\n",
    "            # `possibly_sensitive` is not present on every status, hence the default\n",
    "            \"mightBeSensitive\": getattr(status, 'possibly_sensitive', False),\n",
    "            \"containsMedia\": 'media' in status.entities,\n",
    "            \"hashtags\": [h['text'] for h in status.entities['hashtags']]\n",
    "        }\n",
    "        db.Tweets.insert_one(tweet)\n",
    "        return True\n",
    "\n",
    "    def on_error(self, status):\n",
    "        \"\"\"Print the error status code; stop the stream on 420 instead of reconnecting.\"\"\"\n",
    "        print(f'Error: {status}')\n",
    "        if status == 420:\n",
    "            # 420 = rate limited. The tweepy streaming guide recommends returning False here,\n",
    "            # because reconnecting while rate limited only lengthens the wait.\n",
    "            return False\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "HASHTAG STREAM\n",
      "Added 3239 tweets\n",
      "-------------------\n",
      "KEYWORD STREAM\n",
      "Added 11414 tweets\n",
      "-------------------\n",
      "LOCATION STREAM\n",
      "Added 12556 tweets\n",
      "-------------------\n"
     ]
    }
   ],
   "source": [
    "listener = StreamListener(time_limit=300) # Stream for 5 min.\n",
    "stream = tweepy.Stream(auth = api.auth, listener=listener)\n",
    "\n",
    "\n",
    "def run_stream(label, **filter_kwargs):\n",
    "    \"\"\"Run one timed stream and print how many tweets that stream alone stored.\n",
    "\n",
    "    label: heading printed before the stream starts.\n",
    "    filter_kwargs: passed straight to `stream.filter` (e.g. `track=` or `locations=`).\n",
    "    \"\"\"\n",
    "    print(label)\n",
    "    # Reset the clock AND the baseline count: the listener reports\n",
    "    # `current count - start_amount`, so a stale baseline would make every\n",
    "    # stream after the first print a running total instead of its own tally.\n",
    "    listener.start_time = time.time()\n",
    "    listener.start_amount = db.Tweets.count_documents({})\n",
    "    stream.filter(**filter_kwargs)\n",
    "    print('-------------------')\n",
    "\n",
    "\n",
    "run_stream('HASHTAG STREAM', track=['#Trump', '#HalfLifeAlyx', '#ParoNacional'])\n",
    "run_stream('KEYWORD STREAM', track=['mongoDB', 'liga mx', 'tec de monterrey', 'grammys', 'impeachment'])\n",
    "# Three bounding boxes of four numbers each - presumably [west, south, east, north]; verify against the Twitter API docs.\n",
    "run_stream('LOCATION STREAM', locations=[-4.62,41.97,10.49,51.1,-87.6,24.73,-75.41,32.12, -99.30,19.21, -98.85, 19.54])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. From all those stored tweets, how many are from “verified accounts”?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "106"
      ]
     },
     "execution_count": 118,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "verified_filter = {'user.isVerified': True}  # tweets whose author is a verified account\n",
    "db.Tweets.count_documents(verified_filter)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. How many tweets did you store?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12556"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "every_tweet = {}  # an empty filter matches all stored documents\n",
    "db.Tweets.count_documents(every_tweet)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. How many different accounts are the tweets from?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 181,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10060"
      ]
     },
     "execution_count": 181,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One entry per distinct author screen name across all stored tweets.\n",
    "account_names = db.Tweets.distinct('user.accountName')\n",
    "len(account_names)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4. How many of those tweets are location tagged?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 177,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8297"
      ]
     },
     "execution_count": 177,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): this matches tweets whose author profile has a location set;\n",
    "# no tweet-level geo field is stored, so confirm this is what 'location tagged' means.\n",
    "with_location = {'user.location': {'$ne': None}}\n",
    "db.Tweets.count_documents(with_location)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5. What was the most popular hashtag?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'_id': 'ParoNacional', 'uses': 1421}]"
      ]
     },
     "execution_count": 130,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "top_hashtag_pipeline = [\n",
    "    {'$unwind': '$hashtags'},  # one document per (tweet, hashtag) pair\n",
    "    {'$group': {'_id': '$hashtags', 'uses': {'$sum': 1}}},  # tally each hashtag\n",
    "    {'$sort': {'uses': -1}},  # most used first\n",
    "    {'$limit': 1},\n",
    "]\n",
    "list(db.Tweets.aggregate(top_hashtag_pipeline))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6. What is the oldest account from all the tweets you stored?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "    \"user\": {\n",
      "        \"accountName\": \"silas216\",\n",
      "        \"joinDate\": \"2007-01-02 23:59:35\"\n",
      "    }\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "oldest_matches = list(db.Tweets.aggregate([\n",
    "    {'$sort': {'user.joinDate': 1}},  # earliest join date first\n",
    "    {'$limit': 1},\n",
    "    {'$project': {'_id': 0, 'user.accountName': 1, 'user.joinDate': 1}}\n",
    "]))\n",
    "if oldest_matches:\n",
    "    # default=str renders the join date as text so the document can be pretty-printed\n",
    "    print(json.dumps(oldest_matches[0], indent=4, default=str))\n",
    "else:\n",
    "    # An empty collection used to raise IndexError here.\n",
    "    print('No tweets stored')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 7. The most used profile background color."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'_id': 'F5F8FA', 'uses': 4162}]"
      ]
     },
     "execution_count": 170,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "top_color_pipeline = [\n",
    "    {'$group': {'_id': '$user.headerColor', 'uses': {'$sum': 1}}},  # tally tweets per color\n",
    "    {'$sort': {'uses': -1}},  # most used first\n",
    "    {'$limit': 1},\n",
    "]\n",
    "list(db.Tweets.aggregate(top_color_pipeline))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 8. How many of those tweets are possibly sensitive?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "128"
      ]
     },
     "execution_count": 165,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sensitive_filter = {'mightBeSensitive': True}  # flag stored by the stream listener\n",
    "db.Tweets.count_documents(sensitive_filter)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 9. From all those accounts, how many of them have more than 2000 followers?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3290"
      ]
     },
     "execution_count": 120,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The question asks for accounts, not tweets: count distinct account names among\n",
    "# the tweets whose author had more than 2000 followers, so repeat posters count once.\n",
    "len(db.Tweets.distinct('user.accountName', {'user.followers': {'$gt': 2000}}))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 10. What percentage of those tweets included a media file (video, photo, gif.)?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "822"
      ]
     },
     "execution_count": 164,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The question asks for a percentage, so divide the media count by the total.\n",
    "total_tweets = db.Tweets.count_documents({})\n",
    "media_tweets = db.Tweets.count_documents({'containsMedia': True})\n",
    "# Guard the division so an empty collection reports 0.0 instead of raising ZeroDivisionError.\n",
    "round(100 * media_tweets / total_tweets, 2) if total_tweets else 0.0"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Proyecto de Twitter - Bases de Datos Avanzadas"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Inicializar MongoDB & API con Llaves"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 154,
	"metadata": {},
	"outputs": [],
	"source": [
	"import os\n",
	"\n",
	"import tweepy\n",
	"from pymongo import MongoClient  # needed by this cell; without it a fresh-kernel Run All fails with NameError\n",
	"\n",
	"# Credentials are read from the environment so real keys are never committed with the notebook.\n",
	"# The '-' fallback keeps the original placeholder value when a variable is not set.\n",
	"api_key = os.environ.get('TWITTER_API_KEY', '-')\n",
	"api_secret = os.environ.get('TWITTER_API_SECRET', '-')\n",
	"access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '-')\n",
	"access_secret = os.environ.get('TWITTER_ACCESS_SECRET', '-')\n",
	"\n",
	"# Authenticated API handle; the stream listener and the stream itself reuse `api`.\n",
	"auth = tweepy.OAuthHandler(api_key, api_secret)\n",
	"auth.set_access_token(access_token, access_secret)\n",
	"api = tweepy.API(auth)\n",
	"\n",
	"# Every query in this notebook goes through `db` (the `Twitter` database of this client).\n",
	"client = MongoClient('mongodb://localhost/TwitterADB')\n",
	"db = client.Twitter"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Stream de Tweets"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 113,
	"metadata": {},
	"outputs": [],
	"source": [
	"from pymongo import MongoClient\n",
	"import time\n",
	"\n",
	"\n",
	"class StreamListener(tweepy.StreamListener):\n",
	"    \"\"\"Stream listener that stores each incoming tweet in `db.Tweets` for a limited time.\n",
	"\n",
	"    Uses the notebook-level `api` and `db` objects. The time limit is only checked when a\n",
	"    status arrives, so a quiet stream can run past it until the next tweet shows up.\n",
	"    \"\"\"\n",
	"\n",
	"    def __init__(self, time_limit=60):\n",
	"        \"\"\"Start the clock and remember how many tweets are already stored.\n",
	"\n",
	"        time_limit: number of seconds during which incoming tweets are stored.\n",
	"        \"\"\"\n",
	"        super().__init__(api)  # initialise the tweepy base listener as well\n",
	"        self.start_time = time.time()\n",
	"        self.limit = time_limit\n",
	"        self.api = api\n",
	"        self.start_amount = db.Tweets.count_documents({}) # used to print # tweets the stream added\n",
	"\n",
	"    def on_status(self, status):\n",
	"        \"\"\"Store one tweet; once the time limit has passed, print the tally and return False.\n",
	"\n",
	"        Returning False tells tweepy to stop the stream.\n",
	"        \"\"\"\n",
	"        if (time.time() - self.start_time) >= self.limit:\n",
	"            actual_amount = db.Tweets.count_documents({})\n",
	"            print(f'Added {actual_amount - self.start_amount} tweets')\n",
	"            return False\n",
	"\n",
	"        # Build the document only when it is actually going to be inserted.\n",
	"        tweet = {\n",
	"            \"user\": {\n",
	"                \"accountName\": status.author.screen_name,\n",
	"                \"isVerified\": status.user.verified,\n",
	"                \"joinDate\": status.user.created_at,\n",
	"                \"followers\": status.user.followers_count,\n",
	"                \"headerColor\": status.user.profile_background_color,\n",
	"                \"location\": status.user.location,\n",
	"                \"isGeoEnabled\": status.user.geo_enabled\n",
	"            },\n",
	"            \"text\": status.text,\n",
	"            \"rts\": status.retweet_count,\n",
	"            \"favs\": status.favorite_count,\n",
	"            # `possibly_sensitive` is not present on every status, hence the default\n",
	"            \"mightBeSensitive\": getattr(status, 'possibly_sensitive', False),\n",
	"            \"containsMedia\": 'media' in status.entities,\n",
	"            \"hashtags\": [h['text'] for h in status.entities['hashtags']]\n",
	"        }\n",
	"        db.Tweets.insert_one(tweet)\n",
	"        return True\n",
	"\n",
	"    def on_error(self, status):\n",
	"        \"\"\"Print the error status code; stop the stream on 420 instead of reconnecting.\"\"\"\n",
	"        print(f'Error: {status}')\n",
	"        if status == 420:\n",
	"            # 420 = rate limited. The tweepy streaming guide recommends returning False here,\n",
	"            # because reconnecting while rate limited only lengthens the wait.\n",
	"            return False\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 116,
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"HASHTAG STREAM\n",
	"Added 3239 tweets\n",
	"-------------------\n",
	"KEYWORD STREAM\n",
	"Added 11414 tweets\n",
	"-------------------\n",
	"LOCATION STREAM\n",
	"Added 12556 tweets\n",
	"-------------------\n"
	]
	}
	],
	"source": [
	"listener = StreamListener(time_limit=300) # Stream for 5 min.\n",
	"stream = tweepy.Stream(auth = api.auth, listener=listener)\n",
	"\n",
	"\n",
	"def run_stream(label, **filter_kwargs):\n",
	"    \"\"\"Run one timed stream and print how many tweets that stream alone stored.\n",
	"\n",
	"    label: heading printed before the stream starts.\n",
	"    filter_kwargs: passed straight to `stream.filter` (e.g. `track=` or `locations=`).\n",
	"    \"\"\"\n",
	"    print(label)\n",
	"    # Reset the clock AND the baseline count: the listener reports\n",
	"    # `current count - start_amount`, so a stale baseline would make every\n",
	"    # stream after the first print a running total instead of its own tally.\n",
	"    listener.start_time = time.time()\n",
	"    listener.start_amount = db.Tweets.count_documents({})\n",
	"    stream.filter(**filter_kwargs)\n",
	"    print('-------------------')\n",
	"\n",
	"\n",
	"run_stream('HASHTAG STREAM', track=['#Trump', '#HalfLifeAlyx', '#ParoNacional'])\n",
	"run_stream('KEYWORD STREAM', track=['mongoDB', 'liga mx', 'tec de monterrey', 'grammys', 'impeachment'])\n",
	"# Three bounding boxes of four numbers each - presumably [west, south, east, north]; verify against the Twitter API docs.\n",
	"run_stream('LOCATION STREAM', locations=[-4.62,41.97,10.49,51.1,-87.6,24.73,-75.41,32.12, -99.30,19.21, -98.85, 19.54])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 1. From all those stored tweets, how many are from “verified accounts”?"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 118,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"106"
	]
	},
	"execution_count": 118,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"verified_filter = {'user.isVerified': True}  # tweets whose author is a verified account\n",
	"db.Tweets.count_documents(verified_filter)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 2. How many tweets did you store?"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 119,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"12556"
	]
	},
	"execution_count": 119,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"every_tweet = {}  # an empty filter matches all stored documents\n",
	"db.Tweets.count_documents(every_tweet)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 3. How many different accounts are the tweets from?"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 181,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"10060"
	]
	},
	"execution_count": 181,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# One entry per distinct author screen name across all stored tweets.\n",
	"account_names = db.Tweets.distinct('user.accountName')\n",
	"len(account_names)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 4. How many of those tweets are location tagged?"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 177,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"8297"
	]
	},
	"execution_count": 177,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# NOTE(review): this matches tweets whose author profile has a location set;\n",
	"# no tweet-level geo field is stored, so confirm this is what 'location tagged' means.\n",
	"with_location = {'user.location': {'$ne': None}}\n",
	"db.Tweets.count_documents(with_location)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 5. What was the most popular hashtag?"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 130,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[{'_id': 'ParoNacional', 'uses': 1421}]"
	]
	},
	"execution_count": 130,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"top_hashtag_pipeline = [\n",
	"    {'$unwind': '$hashtags'},  # one document per (tweet, hashtag) pair\n",
	"    {'$group': {'_id': '$hashtags', 'uses': {'$sum': 1}}},  # tally each hashtag\n",
	"    {'$sort': {'uses': -1}},  # most used first\n",
	"    {'$limit': 1},\n",
	"]\n",
	"list(db.Tweets.aggregate(top_hashtag_pipeline))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 6. What is the oldest account from all the tweets you stored?"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 163,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"{\n",
	" \"user\": {\n",
	" \"accountName\": \"silas216\",\n",
	" \"joinDate\": \"2007-01-02 23:59:35\"\n",
	" }\n",
	"}\n"
	]
	}
	],
	"source": [
	"import json\n",
	"\n",
	"oldest_matches = list(db.Tweets.aggregate([\n",
	"    {'$sort': {'user.joinDate': 1}},  # earliest join date first\n",
	"    {'$limit': 1},\n",
	"    {'$project': {'_id': 0, 'user.accountName': 1, 'user.joinDate': 1}}\n",
	"]))\n",
	"if oldest_matches:\n",
	"    # default=str renders the join date as text so the document can be pretty-printed\n",
	"    print(json.dumps(oldest_matches[0], indent=4, default=str))\n",
	"else:\n",
	"    # An empty collection used to raise IndexError here.\n",
	"    print('No tweets stored')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 7. The most used profile background color."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 170,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[{'_id': 'F5F8FA', 'uses': 4162}]"
	]
	},
	"execution_count": 170,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"top_color_pipeline = [\n",
	"    {'$group': {'_id': '$user.headerColor', 'uses': {'$sum': 1}}},  # tally tweets per color\n",
	"    {'$sort': {'uses': -1}},  # most used first\n",
	"    {'$limit': 1},\n",
	"]\n",
	"list(db.Tweets.aggregate(top_color_pipeline))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 8. How many of those tweets are possibly sensitive?"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 165,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"128"
	]
	},
	"execution_count": 165,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sensitive_filter = {'mightBeSensitive': True}  # flag stored by the stream listener\n",
	"db.Tweets.count_documents(sensitive_filter)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 9. From all those accounts, how many of them have more than 2000 followers?"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 120,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"3290"
	]
	},
	"execution_count": 120,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# The question asks for accounts, not tweets: count distinct account names among\n",
	"# the tweets whose author had more than 2000 followers, so repeat posters count once.\n",
	"len(db.Tweets.distinct('user.accountName', {'user.followers': {'$gt': 2000}}))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### 10. What percentage of those tweets included a media file (video, photo, gif.)?"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 164,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"822"
	]
	},
	"execution_count": 164,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# The question asks for a percentage, so divide the media count by the total.\n",
	"total_tweets = db.Tweets.count_documents({})\n",
	"media_tweets = db.Tweets.count_documents({'containsMedia': True})\n",
	"# Guard the division so an empty collection reports 0.0 instead of raising ZeroDivisionError.\n",
	"round(100 * media_tweets / total_tweets, 2) if total_tweets else 0.0"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.8"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}