Created
May 3, 2018 16:40
-
-
Save N-Coder/d149f010d192498b674816cf26c3c287 to your computer and use it in GitHub Desktop.
Twitter crawler to obtain information on hashtag users and their network for the Data Science Lab
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pprint import pprint | |
import twitter | |
# OAuth credentials for the Twitter REST API. They are intentionally blank
# here and must be filled in before the script can talk to Twitter.
_API_CREDENTIALS = {
    "consumer_key": "",
    "consumer_secret": "",
    "access_token_key": "",
    "access_token_secret": "",
}

# Shared API client used by every function in this module.
api = twitter.Api(**_API_CREDENTIALS)

# Print the authenticated account so that bad credentials are noticed
# immediately rather than in the middle of a crawl.
pprint(api.VerifyCredentials())
def main():
    """Crawl the user network around every hashtag of interest.

    For each hashtag, the users already known to have used it ("attendees")
    seed a breadth-first walk over the follower/friend graph.  Every newly
    reached user is classified once as attendee or non-attendee via
    ``has_user_used_hashtag`` and then expanded:

    * attendees are expanded through their followers, because those
      followers may see the tagged post;
    * non-attendees are expanded through their friends, to record how many
      of the accounts they follow have used the tag.
    """
    # Function-scope import keeps this block self-contained.
    from collections import deque

    hashtags = find_intersting_hashtags()
    for hashtag in hashtags:
        # The tweets themselves are recorded by the helper; only the users
        # are needed to drive the crawl.
        attendees, _attendee_tweets = find_tweets_and_users_using_hashtag(hashtag)
        attendees = set(attendees)
        non_attendees = set()

        # Explicit FIFO work queue instead of appending to a list while a
        # ``for`` loop is iterating over it.
        users_to_check = deque()

        # initialize users_to_check with followers(attendees)
        for attendee in attendees:
            users_to_check.extend(crawl_user_followers(attendee))

        # check for all yet unknown users whether they attended the event
        # NOTE(review): nothing bounds this walk; it only stops once every
        # reachable user has been classified.
        while users_to_check:
            user = users_to_check.popleft()
            if user in attendees or user in non_attendees:
                continue  # user was already checked

            # The OC/RT counts are not used by the crawl itself.
            has_used, _oc_count, _rt_count = has_user_used_hashtag(user, hashtag)
            if has_used:
                # this user has already used the tag, so their future
                # behaviour is uninteresting for us
                attendees.add(user)
                # maybe some of their followers haven't used the tag yet,
                # but might see this user's post with the tag
                users_to_check.extend(crawl_user_followers(user))
            else:
                # this user hasn't used the tag yet, make a snapshot of
                # their network so that we can compare later
                non_attendees.add(user)
                # check how many of the people they follow have used the
                # hashtag, making the tag more likely to show up in
                # their feed
                users_to_check.extend(crawl_user_friends(user))
# In-memory stores shared by all crawler functions in this module.
# tweets: tweet id -> tweet data dict (as written by add_tweet)
tweets = {}
# users: user id -> user data dict (plus keys added by the crawl functions)
users = {}
def add_tweet(tweet):
    """Record *tweet* and its author in the module-level stores.

    The tweet is converted with ``AsDict()``.  The embedded author record is
    moved into ``users`` and replaced in the tweet data by the author's id.
    A retweeted status is recorded recursively and likewise replaced by its
    id, so ``tweets`` holds flat records that reference each other by id.

    Args:
        tweet: a status object exposing ``AsDict()`` and
            ``retweeted_status``.
    """
    tweet_data = tweet.AsDict()
    user_data = tweet_data["user"]
    tweet_data["user"] = user_data["id"]

    if tweet.retweeted_status:
        add_tweet(tweet.retweeted_status)
        tweet_data["retweeted_status"] = tweet_data["retweeted_status"]["id"]

    tweets[tweet_data["id"]] = tweet_data
    # Merge instead of overwrite: the crawl functions attach extra keys
    # ("followers", "friends", ...) to the same record, and replacing the
    # whole dict would silently discard them.
    users.setdefault(user_data["id"], {}).update(user_data)
def find_intersting_hashtags(woeids=(676757, 2459115, 44418)):
    """Return the single-word trending topics for the given places.

    Args:
        woeids: iterable of Yahoo "Where On Earth" ids to query.  The
            defaults are Munich (676757), New York (2459115) and
            London (44418).

    Returns:
        A list of hashtag strings, each starting with ``#``, in the order
        they were first seen and without duplicates.
    """
    hashtags = []
    seen = set()
    for woeid in woeids:
        for trend in api.GetTrendsWoeid(woeid):
            tag = trend.name
            # Skip empty names and multi-word trends, which cannot be
            # expressed as a single hashtag.
            if not tag or " " in tag:
                continue
            if not tag.startswith("#"):
                tag = "#" + tag
            # A topic trending in several places must only be crawled once.
            if tag in seen:
                continue
            seen.add(tag)
            hashtags.append(tag)
    return hashtags
def find_tweets_and_users_using_hashtag(hashtag="#MSI2018"):
    """Search for tweets containing *hashtag* and record them.

    Every result is stored through ``add_tweet`` and echoed to stdout
    together with its retweet count.

    Args:
        hashtag: the search term, including the leading ``#``.

    Returns:
        A ``(attendees, attendee_tweets)`` tuple: the distinct ids of the
        users who authored the found tweets (in first-seen order), and the
        ids of those tweets.  Only a single result page of up to 100
        tweets is requested.
    """
    authors = {}  # used as an ordered set of user ids
    tweet_ids = []
    for tweet in api.GetSearch(hashtag, count=100, lang="en", return_json=False):
        add_tweet(tweet)
        print("%s: %s" % (tweet.user.name, tweet.text))
        if tweet.retweeted_status:
            print("\tRT, OC retweeted %s times" % (tweet.retweet_count,))
        else:
            print("\tOC, retweeted %s times" % (tweet.retweet_count,))
        authors[tweet.user.id] = None
        tweet_ids.append(tweet.id)
    return list(authors), tweet_ids
def crawl_user_followers(user_id=614754689):  # lolesports
    """Fetch the followers of *user_id* and record them in ``users``.

    The follower ids are stored under ``users[user_id]["followers"]`` and
    printed.  Profile data is then looked up for every follower not yet
    present in ``users``.

    Args:
        user_id: numeric id of the account whose followers are crawled.

    Returns:
        The list of follower ids.
    """
    # Pass the id explicitly; without it the call refers to the
    # authenticated account rather than the requested user.
    follower_ids = api.GetFollowerIDs(user_id=user_id)
    # setdefault: the crawled user may not have a record yet.
    users.setdefault(user_id, {})["followers"] = follower_ids
    pprint(follower_ids)

    # https://developer.twitter.com/en/docs/accounts-and-users/follow-search-get-users/api-reference/get-users-lookup
    # The lookup endpoint takes at most 100 ids per request, and must not
    # be called with an empty list.
    unknown_ids = [uid for uid in follower_ids if uid not in users]
    batch_size = 100
    for start in range(0, len(unknown_ids), batch_size):
        batch = unknown_ids[start:start + batch_size]
        for user in api.UsersLookup(user_id=batch):
            # Store plain dicts, matching the records written by add_tweet.
            users[user.id] = user.AsDict()
    return follower_ids
def crawl_user_friends(user_id=614754689):  # lolesports
    """Fetch the accounts *user_id* follows and record them in ``users``.

    The friend ids are stored under ``users[user_id]["friends"]`` and
    printed.  Profile data is then looked up for every friend not yet
    present in ``users``.

    Args:
        user_id: numeric id of the account whose friends are crawled.

    Returns:
        The list of friend ids.
    """
    # Pass the id explicitly; without it the call refers to the
    # authenticated account rather than the requested user.
    friend_ids = api.GetFriendIDs(user_id=user_id)
    # setdefault: the crawled user may not have a record yet.
    users.setdefault(user_id, {})["friends"] = friend_ids
    pprint(friend_ids)

    # https://developer.twitter.com/en/docs/accounts-and-users/follow-search-get-users/api-reference/get-users-lookup
    # The lookup endpoint takes at most 100 ids per request, and must not
    # be called with an empty list.
    unknown_ids = [uid for uid in friend_ids if uid not in users]
    batch_size = 100
    for start in range(0, len(unknown_ids), batch_size):
        batch = unknown_ids[start:start + batch_size]
        for user in api.UsersLookup(user_id=batch):
            # Store plain dicts, matching the records written by add_tweet.
            users[user.id] = user.AsDict()
    return friend_ids
def has_user_used_hashtag(user_id=123123, hashtag="#MSI2018"):
    """Determine whether *user_id* has tweeted or retweeted *hashtag*.

    Two strategies are available, chosen by the user's tweet count:

    * user timeline: 200 tweets / request, 900 requests / 15 min, but only
      the most recent ~3200 statuses are reachable;
    * search API: 180 requests / 15 min, and only about 7 days of history.

    Args:
        user_id: numeric id of the user to inspect.
        hashtag: the hashtag to look for, including the leading ``#``.

    Returns:
        A ``(has_used, oc_count, rt_count)`` tuple: whether the tag was
        found at all, how many original tweets used it, and how many
        retweets used it.  The boolean is also stored under
        ``users[user_id]["used_hashtags_date"][hashtag]``.
    """
    # getting the whole timeline is only worth it for users with less than
    # 1000 tweets
    SEARCH_API_THRESHOLD = (200 * 900) / 180

    user = users.setdefault(user_id, {})
    screen_name = user.get("screen_name")
    wanted = hashtag.lstrip("#").lower()

    def _uses_tag(status):
        # For a retweet, inspect the original status: its entity list is
        # the authoritative one.
        source = status.retweeted_status or status
        return any(
            (tag.text or "").lower() == wanted
            for tag in (source.hashtags or [])
        )

    # A record converted with AsDict() may lack "statuses_count" when the
    # count is zero, hence the default.  Without a screen name the search
    # query cannot be built, so fall back to the timeline.
    if user.get("statuses_count", 0) < SEARCH_API_THRESHOLD or not screen_name:
        statuses = []
        max_id = None
        while True:
            page = api.GetUserTimeline(user_id=user_id, count=200, max_id=max_id)
            if not page:
                break
            statuses.extend(page)
            # Continue strictly below the oldest status already fetched.
            max_id = min(status.id for status in page) - 1
    else:
        # Interpolate the screen name, not the whole user record.
        statuses = api.GetSearch(
            "%s from:%s" % (hashtag, screen_name),
            count=100,
            include_entities=True,
        )

    # also count the number of usages in OC and RTs
    oc_count = 0
    rt_count = 0
    for status in statuses:
        if not _uses_tag(status):
            continue
        if status.retweeted_status:
            rt_count += 1
        else:
            oc_count += 1

    has_used = (oc_count + rt_count) > 0
    user.setdefault("used_hashtags_date", {})[hashtag] = has_used
    return has_used, oc_count, rt_count
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment