Skip to content

Instantly share code, notes, and snippets.

@diegotf30
Created November 21, 2019 23:24
Show Gist options
  • Save diegotf30/067222c49db318d7b9ae9b338f304d28 to your computer and use it in GitHub Desktop.
Save diegotf30/067222c49db318d7b9ae9b338f304d28 to your computer and use it in GitHub Desktop.
Twitter ADB
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Proyecto de Twitter - Bases de Datos Avanzadas"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Inicializar MongoDB & API con Llaves"
]
},
{
"cell_type": "code",
"execution_count": 154,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import tweepy\n",
"from pymongo import MongoClient  # imported here so this cell runs on a fresh kernel\n",
"\n",
"# Twitter API credentials: taken from the environment when set, otherwise the\n",
"# original '-' placeholders, so no real secret has to be saved in the notebook.\n",
"api_key = os.environ.get('TWITTER_API_KEY', '-')\n",
"api_secret = os.environ.get('TWITTER_API_SECRET', '-')\n",
"access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '-')\n",
"access_secret = os.environ.get('TWITTER_ACCESS_SECRET', '-')\n",
"\n",
"# Build the authenticated tweepy client used by the stream listener.\n",
"auth = tweepy.OAuthHandler(api_key, api_secret)\n",
"auth.set_access_token(access_token, access_secret)\n",
"api = tweepy.API(auth)\n",
"\n",
"# Connect to the local MongoDB server; later cells read and write `db.Tweets`.\n",
"# NOTE(review): the URI names the `TwitterADB` database but the code uses `Twitter` - confirm which is intended.\n",
"client = MongoClient('mongodb://localhost/TwitterADB')\n",
"db = client.Twitter"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Stream de Tweets"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [],
"source": [
"from pymongo import MongoClient\n",
"import time\n",
"\n",
"\n",
"class StreamListener(tweepy.StreamListener):\n",
"    \"\"\"Tweepy listener that stores incoming tweets in `db.Tweets` for a limited time.\n",
"\n",
"    Uses the notebook-level `api` and `db` objects.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, time_limit=60):\n",
"        \"\"\"Start the clock and record how many tweets are already stored.\n",
"\n",
"        time_limit: seconds, counted from `start_time`, during which tweets are stored.\n",
"        \"\"\"\n",
"        super().__init__(api=api)  # let tweepy run its own listener setup\n",
"        self.start_time = time.time()\n",
"        self.limit = time_limit\n",
"        self.api = api\n",
"        self.start_amount = db.Tweets.count_documents({})  # baseline for the 'Added N tweets' report\n",
"\n",
"    def on_status(self, status):\n",
"        \"\"\"Store one tweet; return False (stop streaming) once the time limit has passed.\"\"\"\n",
"        if (time.time() - self.start_time) >= self.limit:\n",
"            actual_amount = db.Tweets.count_documents({})\n",
"            print(f'Added {actual_amount - self.start_amount} tweets')\n",
"            # Move the baseline forward so a reused listener reports the tweets\n",
"            # added by each stream instead of a running total.\n",
"            self.start_amount = actual_amount\n",
"            return False\n",
"\n",
"        tweet = {\n",
"            \"user\": {\n",
"                \"accountName\": status.author.screen_name,\n",
"                \"isVerified\": status.user.verified,\n",
"                \"joinDate\": status.user.created_at,\n",
"                \"followers\": status.user.followers_count,\n",
"                \"headerColor\": status.user.profile_background_color,\n",
"                \"location\": status.user.location,\n",
"                \"isGeoEnabled\": status.user.geo_enabled\n",
"            },\n",
"            \"text\": status.text,\n",
"            \"rts\": status.retweet_count,\n",
"            \"favs\": status.favorite_count,\n",
"            # The attribute is not present on every status, hence the default.\n",
"            \"mightBeSensitive\": getattr(status, 'possibly_sensitive', False),\n",
"            \"containsMedia\": 'media' in status.entities,\n",
"            \"hashtags\": [h['text'] for h in status.entities['hashtags']]\n",
"        }\n",
"        db.Tweets.insert_one(tweet)\n",
"        return True\n",
"\n",
"    def on_error(self, status):\n",
"        \"\"\"Print the error status code; return False (disconnect) when rate limited.\"\"\"\n",
"        print(f'Error: {status}')\n",
"        if status == 420:\n",
"            # 420 = rate limited; disconnecting avoids reconnect attempts that\n",
"            # lengthen the enforced wait.\n",
"            return False\n"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"HASHTAG STREAM\n",
"Added 3239 tweets\n",
"-------------------\n",
"KEYWORD STREAM\n",
"Added 11414 tweets\n",
"-------------------\n",
"LOCATION STREAM\n",
"Added 12556 tweets\n",
"-------------------\n"
]
}
],
"source": [
"listener = StreamListener(time_limit=300)  # each stream runs for 5 minutes\n",
"stream = tweepy.Stream(auth=api.auth, listener=listener)\n",
"\n",
"# (label, keyword arguments for stream.filter), run in this order.\n",
"stream_runs = [\n",
"    ('HASHTAG STREAM', {'track': ['#Trump', '#HalfLifeAlyx', '#ParoNacional']}),\n",
"    ('KEYWORD STREAM', {'track': ['mongoDB', 'liga mx', 'tec de monterrey', 'grammys', 'impeachment']}),\n",
"    # NOTE(review): presumably three [west, south, east, north] bounding boxes - confirm against the Twitter API docs.\n",
"    ('LOCATION STREAM', {'locations': [\n",
"        -4.62, 41.97, 10.49, 51.1,\n",
"        -87.6, 24.73, -75.41, 32.12,\n",
"        -99.30, 19.21, -98.85, 19.54,\n",
"    ]}),\n",
"]\n",
"\n",
"for label, filter_kwargs in stream_runs:\n",
"    print(label)\n",
"    # Reset the clock and the count baseline so 'Added N tweets' covers this\n",
"    # stream only; with a baseline set once it would be a running total.\n",
"    listener.start_time = time.time()\n",
"    listener.start_amount = db.Tweets.count_documents({})\n",
"    stream.filter(**filter_kwargs)\n",
"    print('-------------------')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1. Of all the stored tweets, how many are from “verified accounts”?"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"106"
]
},
"execution_count": 118,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of stored tweets whose author had the verified flag set.\n",
"db.Tweets.count_documents(\n",
"    filter={'user.isVerified': True}\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2. How many tweets did you store?"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"12556"
]
},
"execution_count": 119,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# An empty filter matches every document, so this is the collection size.\n",
"db.Tweets.count_documents(filter={})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3. How many different accounts are the tweets from?"
]
},
{
"cell_type": "code",
"execution_count": 181,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"10060"
]
},
"execution_count": 181,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One entry per distinct author account name; its length is the account count.\n",
"len(\n",
"    db.Tweets.distinct(key='user.accountName')\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 4. How many of those tweets are location-tagged?"
]
},
{
"cell_type": "code",
"execution_count": 177,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8297"
]
},
"execution_count": 177,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Counts tweets whose stored `user.location` is not null.\n",
"# NOTE(review): that field is the author's profile location, not a per-tweet geotag - confirm it answers the question.\n",
"db.Tweets.count_documents(\n",
"    filter={'user.location': {'$ne': None}}\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 5. What was the most popular hashtag?"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'_id': 'ParoNacional', 'uses': 1421}]"
]
},
"execution_count": 130,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Most used hashtag across all stored tweets.\n",
"list(\n",
"    db.Tweets.aggregate(pipeline=[\n",
"        # one document per hashtag occurrence\n",
"        {'$unwind': '$hashtags'},\n",
"        # tally occurrences per hashtag text\n",
"        {'$group': {'_id': '$hashtags', 'uses': {'$sum': 1}}},\n",
"        # highest tally first, keep only that one\n",
"        {'$sort': {'uses': -1}},\n",
"        {'$limit': 1},\n",
"    ])\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 6. What is the oldest account from all the tweets you stored?"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"user\": {\n",
" \"accountName\": \"silas216\",\n",
" \"joinDate\": \"2007-01-02 23:59:35\"\n",
" }\n",
"}\n"
]
}
],
"source": [
"import json\n",
"\n",
"# Sort by join date ascending and keep only the first (earliest) author,\n",
"# projecting just the account name and join date.\n",
"oldest_cursor = db.Tweets.aggregate([\n",
"    {'$sort': {'user.joinDate': 1}},\n",
"    {'$limit': 1},\n",
"    {'$project': {'_id': 0, 'user.accountName': 1, 'user.joinDate': 1}}\n",
"])\n",
"oldest_account = next(oldest_cursor, None)  # None when the collection is empty\n",
"\n",
"if oldest_account is None:\n",
"    print('No tweets stored')\n",
"else:\n",
"    # default=str renders the join date as text so it can be pretty-printed,\n",
"    # without modifying the fetched document.\n",
"    print(json.dumps(oldest_account, indent=4, default=str))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 7. What is the most used profile background color?"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'_id': 'F5F8FA', 'uses': 4162}]"
]
},
"execution_count": 170,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Most frequent stored `user.headerColor` value, tallied per tweet.\n",
"list(\n",
"    db.Tweets.aggregate(pipeline=[\n",
"        # tally tweets per color value\n",
"        {'$group': {'_id': '$user.headerColor', 'uses': {'$sum': 1}}},\n",
"        # highest tally first, keep only that one\n",
"        {'$sort': {'uses': -1}},\n",
"        {'$limit': 1},\n",
"    ])\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 8. How many of those tweets are possibly sensitive?"
]
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"128"
]
},
"execution_count": 165,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of stored tweets whose `mightBeSensitive` flag is true.\n",
"db.Tweets.count_documents(\n",
"    filter={'mightBeSensitive': True}\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 9. Of all those accounts, how many have more than 2,000 followers?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The question asks about accounts, so count distinct account names rather than\n",
"# tweets: an author with several stored tweets is counted once.\n",
"len(db.Tweets.distinct('user.accountName', {'user.followers': {'$gt': 2000}}))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 10. What percentage of those tweets included a media file (video, photo, GIF)?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The question asks for a percentage, so divide the media-tweet count by the\n",
"# total number of stored tweets (guarding against an empty collection).\n",
"total_tweets = db.Tweets.count_documents({})\n",
"media_tweets = db.Tweets.count_documents({'containsMedia': True})\n",
"round(100 * media_tweets / total_tweets, 2) if total_tweets else 0.0"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment