-
-
Save shuHelicopter/5ea5bcfca444780eb9c56e8f131627f9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract per-company account statistics from Twitter with tweepy.
# NOTE(review): assumes an authenticated tweepy API object named `api`
# exists earlier in the script — the auth code is not visible in this chunk.
import csv
import time

import pandas as pd

# 'companies_mdf.csv' is the dataset after the first preprocessing with R;
# column index 11 stores the company's Twitter screen name.
with open('companies_mdf.csv') as fil:
    name_list = [row[11] for row in csv.reader(fil)]

twt = []
# name_list[0] is the CSV header row, so start from index 1.
for k, name in enumerate(name_list[1:], start=1):
    company = {'name': name}
    try:
        user = api.get_user(screen_name=name)
    except tweepy.RateLimitError:
        # Wait out Twitter's 15-minute rate-limit window, then retry the
        # SAME account.  (The original handler slept but then read counts
        # from the stale `user` object left over from the previous
        # iteration, silently recording the wrong company's numbers.)
        print('********* Rate Limit Error *********')
        time.sleep(60 * 16)
        user = api.get_user(screen_name=name)
    except tweepy.TweepError:
        # Account missing / suspended / protected: record None for every
        # metric, matching the original's behavior.
        user = None

    if user is None:
        company['friends_num'] = None
        company['followers_num'] = None
        company['statuses_num'] = None
        company['favourites_num'] = None
    else:
        company['friends_num'] = user.friends_count
        company['followers_num'] = user.followers_count
        company['statuses_num'] = user.statuses_count
        company['favourites_num'] = user.favourites_count

    twt.append(company)
    print('############## %d ############' % k)  # crawl-progress indicator

# Persist the crawled metrics.  DataFrame.to_csv opens and encodes the
# target file itself, so the original's separate
# codecs.open('twitter.csv', 'w', 'utf-8') handle — which only truncated
# the same file redundantly and was never written through — is dropped.
pd.DataFrame(twt).to_csv('twitter.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment