@shuHelicopter
Created August 22, 2016 03:29
# Extract data from Twitter with tweepy
import csv
import time
import tweepy
import pandas as pd

fil = open('companies_mdf.csv')  # 'companies_mdf.csv' is the dataset produced by the first preprocessing step in R
csv_f = csv.reader(fil)

name_list = []
# get the list of company names from the preprocessed data file
for row in csv_f:
    name_list.append(row[11])  # column 11 holds the company's Twitter screen name
fil.close()
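
# The original gist uses an `api` object without showing how it was created.
# A minimal sketch of the assumed setup: an authenticated tweepy.API client
# built with OAuth 1a credentials. The four credential strings below are
# placeholders, not values from the original gist.
CONSUMER_KEY = '...'
CONSUMER_SECRET = '...'
ACCESS_TOKEN = '...'
ACCESS_TOKEN_SECRET = '...'

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=False)  # rate limits are handled manually below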

twt = []
k = 0
for name in name_list[1:]:  # skip the header row
    company = {}
    try:
        user = api.get_user(screen_name=name)
        company['name'] = name
        print(company['name'])  # progress check
        company['friends_num'] = user.friends_count
        print(company['friends_num'])  # progress check
        company['followers_num'] = user.followers_count
        print(company['followers_num'])  # progress check
        company['statuses_num'] = user.statuses_count
        print(company['statuses_num'])  # progress check
        company['favourites_num'] = user.favourites_count
        print(company['favourites_num'])  # progress check
    except tweepy.RateLimitError:
        print('********* Rate Limit Error *********')
        time.sleep(60 * 16)  # wait out Twitter's 15-minute rate-limit window
        user = api.get_user(screen_name=name)  # retry the lookup after the wait
        company['name'] = name
        company['friends_num'] = user.friends_count
        company['followers_num'] = user.followers_count
        company['statuses_num'] = user.statuses_count
        company['favourites_num'] = user.favourites_count
    except tweepy.TweepError:
        # account not found, suspended, protected, etc. -- keep the name, record missing values
        company['name'] = name
        company['friends_num'] = None
        company['followers_num'] = None
        company['statuses_num'] = None
        company['favourites_num'] = None
    twt.append(company)
    k = k + 1
    print('############## %d ############' % k)  # progress check

# store the data extracted from Twitter in a new data file
twt_data = pd.DataFrame(twt)
twt_data.to_csv('twitter.csv', encoding='utf-8')
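
# Optional sanity check (not part of the original gist): read the exported
# file back and confirm the row count and expected columns are present.
check = pd.read_csv('twitter.csv', index_col=0)
print(check.shape)
print(check.head())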