Skip to content

Instantly share code, notes, and snippets.

@N-Coder
Created May 3, 2018 16:40
Show Gist options
  • Save N-Coder/d149f010d192498b674816cf26c3c287 to your computer and use it in GitHub Desktop.
Save N-Coder/d149f010d192498b674816cf26c3c287 to your computer and use it in GitHub Desktop.
Twitter crawler to obtain information on hashtag users and their network for the Data Science Lab
from pprint import pprint
import twitter
# Module-wide python-twitter client. The four OAuth credentials are left
# blank here and have to be filled in before the script can talk to Twitter.
api = twitter.Api(
    consumer_key="",
    consumer_secret="",
    access_token_key="",
    access_token_secret="",
)

# Print the account the credentials belong to as an early sanity check.
pprint(api.VerifyCredentials())
def main():
    """Crawl the network around every interesting hashtag.

    For each hashtag the users who already tweeted it ("attendees") are
    collected first. Starting from their followers, every reachable user is
    then classified as attendee or non-attendee, and the matching part of
    that user's network is queued for classification as well.
    """
    for hashtag in find_intersting_hashtags():
        found_users, attendee_tweets = find_tweets_and_users_using_hashtag(hashtag)
        attendees = set(found_users)
        non_attendees = set()

        # Seed the work list with the followers of all initial attendees.
        users_to_check = []
        for attendee in attendees:
            users_to_check.extend(crawl_user_followers(attendee))

        # Walk the work list with an explicit cursor: it keeps growing while
        # it is being processed, and newly queued users must be visited too.
        position = 0
        while position < len(users_to_check):
            user = users_to_check[position]
            position += 1

            if user in attendees or user in non_attendees:
                continue  # user was already checked

            has_used, _oc_count, _rt_count = has_user_used_hashtag(user, hashtag)
            if not has_used:
                # This user hasn't used the tag yet: snapshot his network so
                # that it can be compared later.
                non_attendees.add(user)
                # Check how many of the people he follows have used the
                # hashtag, making the tag more likely to show up in his feed.
                users_to_check.extend(crawl_user_friends(user))
                continue

            # This user has already used the tag, so his future behaviour is
            # uninteresting for us.
            attendees.add(user)
            # Some of his followers may not have used the tag yet, but might
            # see this user's post with the tag.
            users_to_check.extend(crawl_user_followers(user))
# In-memory stores shared by all crawler functions.
tweets = {}  # tweet id -> tweet data dict (filled by add_tweet)
users = {}  # user id -> user data dict (filled by add_tweet and the crawl functions)
def add_tweet(tweet):
    """Store ``tweet`` and its author in the module-level ``tweets`` / ``users`` dicts.

    The tweet is flattened before storing: the embedded author is replaced by
    the author's ID, and an embedded retweeted status is stored as a tweet of
    its own and replaced by its ID.

    Args:
        tweet: Status object offering ``AsDict()`` and ``retweeted_status``.
    """
    tweet_data = tweet.AsDict()
    user_data = tweet_data["user"]
    tweet_data["user"] = user_data["id"]

    if tweet.retweeted_status:
        # Store the original tweet separately and keep only a reference to it.
        add_tweet(tweet.retweeted_status)
        tweet_data["retweeted_status"] = tweet_data["retweeted_status"]["id"]

    tweets[tweet_data["id"]] = tweet_data
    # Merge into an existing record instead of replacing it: other functions
    # attach extra keys ("followers", "friends", "used_hashtags_date") to the
    # user dict, and those must survive seeing another tweet of the same user.
    users.setdefault(user_data["id"], {}).update(user_data)
def find_intersting_hashtags():
    """Return the trending single-word topics of three cities as hashtags.

    Trends are fetched for Munich, New York and London. Trends containing a
    space are skipped, a leading ``#`` is added where missing, and a trend
    that is popular in several cities is returned only once.

    Returns:
        List of unique hashtag strings (each starting with ``#``), in the
        order in which they were first encountered.
    """
    trends = (
        api.GetTrendsWoeid(676757)  # Munich
        + api.GetTrendsWoeid(2459115)  # New York
        + api.GetTrendsWoeid(44418)  # London
        # + api.GetTrendsCurrent()
    )

    hashtags = []
    seen = set()
    for trend in trends:
        tag = trend.name
        # Skip multi-word trends (and empty names, which cannot be a hashtag).
        if not tag or " " in tag:
            continue
        if not tag.startswith("#"):
            tag = "#" + tag
        # The same trend can show up in more than one city; crawling it
        # twice would only waste rate-limited requests.
        if tag in seen:
            continue
        seen.add(tag)
        hashtags.append(tag)
    return hashtags
def find_tweets_and_users_using_hashtag(hashtag="#MSI2018"):
    """Search for tweets containing ``hashtag`` and record them and their authors.

    Every hit is stored through ``add_tweet`` and echoed to stdout together
    with a note on whether it is original content (OC) or a retweet (RT).

    Args:
        hashtag: Search term, normally a hashtag including the leading ``#``.

    Returns:
        Tuple ``(user_ids, tweet_ids)`` of two equally long lists;
        ``user_ids[i]`` is the author of ``tweet_ids[i]``. A user who tweeted
        several times therefore appears several times.
    """
    user_ids = []
    tweet_ids = []
    for tweet in api.GetSearch(hashtag, count=100, lang="en", return_json=False):
        add_tweet(tweet)
        print("%s: %s" % (tweet.user.name, tweet.text))
        if tweet.retweeted_status:
            print("\tRT, OC retweeted %s times" % (tweet.retweet_count,))
        else:
            print("\tOC, retweeted %s times" % (tweet.retweet_count,))
        user_ids.append(tweet.user.id)
        tweet_ids.append(tweet.id)
    return user_ids, tweet_ids
def crawl_user_followers(user_id=614754689):  # lolesports
    """Fetch the follower IDs of ``user_id`` and cache their profiles.

    The ID list is stored under ``users[user_id]["followers"]`` (creating the
    user record if it does not exist yet), and every follower that is not yet
    in ``users`` is looked up and added.

    Args:
        user_id: ID of the user whose followers are crawled.

    Returns:
        List of follower IDs of ``user_id``.
    """
    # Pass the user explicitly; without it the API call is not told whose
    # followers to fetch.
    # NOTE(review): python-twitter appears to follow result cursors itself
    # here - confirm for the installed version.
    follower_ids = api.GetFollowerIDs(user_id=user_id)
    users.setdefault(user_id, {})["followers"] = follower_ids
    pprint(follower_ids)
    _store_unknown_users(follower_ids)
    return follower_ids


def crawl_user_friends(user_id=614754689):  # lolesports
    """Fetch the IDs of the accounts ``user_id`` follows and cache their profiles.

    The ID list is stored under ``users[user_id]["friends"]`` (creating the
    user record if it does not exist yet), and every friend that is not yet
    in ``users`` is looked up and added.

    Args:
        user_id: ID of the user whose friends are crawled.

    Returns:
        List of IDs of the accounts followed by ``user_id``.
    """
    # Pass the user explicitly; without it the API call is not told whose
    # friends to fetch.
    friend_ids = api.GetFriendIDs(user_id=user_id)
    users.setdefault(user_id, {})["friends"] = friend_ids
    pprint(friend_ids)
    _store_unknown_users(friend_ids)
    return friend_ids


def _store_unknown_users(user_ids):
    """Look up and cache the profile of every ID in ``user_ids`` not yet in ``users``."""
    unknown_ids = [uid for uid in user_ids if uid not in users]
    # https://developer.twitter.com/en/docs/accounts-and-users/follow-search-get-users/api-reference/get-users-lookup
    # users/lookup takes at most 100 IDs per request, so query in batches.
    # With nothing to look up the loop does not run and no request is made.
    batch_size = 100
    for start in range(0, len(unknown_ids), batch_size):
        batch = unknown_ids[start:start + batch_size]
        for user in api.UsersLookup(user_id=batch):
            # Store plain dicts, the same representation add_tweet uses.
            user_data = user.AsDict()
            users[user_data["id"]] = user_data
def has_user_used_hashtag(user_id=123123, hashtag="#MSI2018"):
    """Find out whether a known user has tweeted ``hashtag`` and how often.

    Users with few statuses are checked by crawling their whole timeline;
    for everyone else the search API is queried for tweets of that user
    containing the hashtag. The outcome is remembered in
    ``users[user_id]["used_hashtags_date"][hashtag]``.

    Args:
        user_id: ID of a user that is present in the module-level ``users``.
        hashtag: Hashtag to look for, including the leading ``#``.

    Returns:
        Tuple ``(has_used, oc_count, rt_count)``: whether the hashtag was
        found at all, the number of original tweets (OC) using it and the
        number of retweets (RT) using it.

    Raises:
        KeyError: If ``user_id`` is not in ``users``.
    """
    # Getting the whole timeline is only worth it for users with less than
    # 1000 tweets: timelines allow 200 tweets / req * 900 req / 15 min, the
    # search API only 180 req / 15 min.
    SEARCH_API_THRESHOLD = (200 * 900) / 180
    user = users[user_id]

    # "statuses_count" is the number of Tweets (including retweets) issued by
    # the user; a record without that key is treated as having none.
    if user.get("statuses_count", 0) < SEARCH_API_THRESHOLD:
        # only up to 3200 statuses
        needle = hashtag.lower()
        matches = [
            status
            for status in _fetch_user_timeline(user_id)
            if _mentions_hashtag(status, needle)
        ]
    else:
        # only 7d of history; the query needs the screen name, not the ID.
        # NOTE(review): a single request returns at most 100 hits, so the
        # counts are capped for very active users.
        matches = api.GetSearch(
            "%s from:%s" % (hashtag, user["screen_name"]), count=100
        )

    # Count the number of usages in OC and RTs separately.
    oc_count = 0
    rt_count = 0
    for status in matches:
        if status.retweeted_status:
            rt_count += 1
        else:
            oc_count += 1

    has_used = bool(matches)
    # Create the per-user result dict on first use before storing the outcome.
    user.setdefault("used_hashtags_date", {})[hashtag] = has_used
    return has_used, oc_count, rt_count


def _fetch_user_timeline(user_id):
    """Return all statuses of ``user_id`` that the timeline API hands out."""
    statuses = []
    max_id = None
    while True:
        page = api.GetUserTimeline(user_id=user_id, count=200, max_id=max_id)
        if not page:
            return statuses
        statuses.extend(page)
        # max_id is inclusive, so continue just below the oldest status seen.
        max_id = min(status.id for status in page) - 1


def _mentions_hashtag(status, needle):
    """Tell whether the (lower-cased) ``needle`` occurs in the text of ``status``."""
    # For a retweet look at the original tweet: the retweet's own text is
    # prefixed with "RT @user:" and may be cut off before the hashtag.
    source = status.retweeted_status or status
    return needle in (source.text or "").lower()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment