Downloading Tweets by a List of Users -- SQLite Version
#!/usr/bin/env python
"""
Social_Metrics_Tutorial_Script_User_Timeline_All_Pages.py - DOWNLOADS ALL AVAILABLE RECENT
TWEETS FROM 5 MLB ACCOUNTS INTO AN SQLITE DATABASE
BEFORE RUNNING THIS SCRIPT, YOU WILL NEED TO:
1. HAVE ANACONDA PYTHON 2.7 INSTALLED
2. HAVE CREATED A CSV FILE (E.G., IN EXCEL) CONTAINING THE TWITTER HANDLES YOU
WISH TO DOWNLOAD (SEE TUTORIAL FOR DETAILS)
THE CODE IS DIVIDED INTO SIX PARTS:
1. Importing necessary Python packages
2. Importing Twython and the Twitter app key and access token
- YOU NEED TO MODIFY THIS SECTION TO GET THE SCRIPT TO WORK (SEE PART II)
3. Defining a function for getting Twitter data
4. Setting up columns for the database to allow use of SQLAlchemy
5. Defining a function for parsing the data returned by the Twitter API and creating the variables to store
6. Main loop over each of the Twitter handles in the accounts table of the database
- YOU CAN MODIFY THE DATABASE NAME BY CHANGING 'test.sqlite' (SEE PART VI)
- THE NUMBER OF PAGES OF TWEETS DOWNLOADED PER ACCOUNT CAN ALSO BE CHANGED (SEE PART VI)
"""
###### PART I: IMPORT PYTHON PACKAGES (ALL BUT SIMPLEJSON & TWYTHON COME W/ ANACONDA) ######
import sys
import string
import sqlite3
import time
from datetime import datetime
from pprint import pprint
import sqlalchemy
from sqlalchemy.orm import mapper, sessionmaker
from sqlalchemy import Column, Integer, String, ForeignKey, Text, DateTime, Unicode, Float
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import exc
import simplejson #NEEDS TO BE INSTALLED SEPARATELY ONCE: pip install simplejson
from twython import Twython #NEEDS TO BE INSTALLED SEPARATELY ONCE: pip install Twython
###### PART II: IMPORT TWYTHON, ADD TWITTER APP KEY & ACCESS TOKEN (TO ACCESS API) ######
#REPLACE 'APP_KEY' WITH YOUR APP KEY, ACCESS_TOKEN WITH YOUR ACCESS TOKEN, IN THE NEXT TWO LINES
APP_KEY = ' '
ACCESS_TOKEN = ' '
twitter = Twython(APP_KEY, access_token=ACCESS_TOKEN)
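#IF YOU DO NOT YET HAVE AN ACCESS TOKEN, ONE COMMON WAY TO OBTAIN AN APP-ONLY (OAUTH 2) TOKEN
#WITH TWYTHON IS SKETCHED BELOW -- 'APP_SECRET' IS A PLACEHOLDER FOR YOUR OWN APP SECRET
#(SEE THE TWYTHON DOCUMENTATION FOR DETAILS):
#    temp = Twython(APP_KEY, APP_SECRET, oauth_version=2)
#    ACCESS_TOKEN = temp.obtain_access_token()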
###### PART III: DEFINE TWYTHON FUNCTION FOR GETTING ALL AVAILABLE PAGES OF TWEETS PER USER ######
def get_data_user_timeline_all_pages(kid, page):
    try:
        '''
        'count' specifies the number of tweets to try and retrieve, up to a maximum of 200 per distinct request.
        The value of count is best thought of as a limit to the number of tweets to return because suspended or
        deleted content is removed after the count has been applied. Retweets are included in the count,
        even if include_rts is not supplied. It is recommended you always send include_rts=1 when using this API method.
        THE SCRIPT IS CURRENTLY SET UP TO DOWNLOAD ALL AVAILABLE PAGES -- 16 PAGES OF 200 TWEETS = 3,200 TWEETS/ACCOUNT TOTAL
        '''
        d = twitter.get_user_timeline(screen_name=kid, count="200", page=page, include_entities="true", include_rts="1")
    except Exception, e:
        print "Error reading id %s, exception: %s" % (kid, e)
        return None
    #print len(d), #d[0] #NUMBER OF ENTRIES RETURNED, FIRST ENTRY
    #print "d.keys(): ", d[0].keys()
    return d
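#EXAMPLE USAGE (hypothetical handle, shown for illustration only):
#    d = get_data_user_timeline_all_pages('MLB', 1)
#    if d: print len(d)  #NUMBER OF STATUS DICTIONARIES RETURNED (UP TO 200)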
###### PART IV: SET UP COLUMNS FOR TWEET AND ACCOUNT TABLES IN DATABASE ######
'''
THIS PART OF THE SCRIPT CAN BE SPLIT OFF TO A SEPARATE FILE -- WILL ALSO BE USED
FOR OTHER SCRIPTS TO ANALYZE THE DATA AFTER DOWNLOADING THE TWEETS
'''
Base = declarative_base()
#TWEET TABLE - THIS TABLE WILL BE CREATED AND POPULATED BY THIS PYTHON SCRIPT
class TWEET(Base):
    __tablename__ = 'user_timeline' #NAME OF TABLE HERE
    rowid = Column(Integer, primary_key=True)
    query = Column(String)
    tweet_id = Column(String)
    tweet_id_str = Column(String, unique=True) ##### UNIQUE CONSTRAINT #####
    inserted_date = Column(DateTime)
    truncated = Column(String)
    language = Column(String)
    possibly_sensitive = Column(String) ### NEW
    coordinates = Column(String) #Represents the geographic location of this Tweet as reported by the user or client application. The inner coordinates array is formatted as geoJSON (longitude first, then latitude).
    retweeted_status = Column(String)
    withheld_in_countries = Column(String)
    withheld_scope = Column(String)
    created_at_text = Column(String) #UTC time when this Tweet was created.
    created_at = Column(DateTime)
    month = Column(String)
    year = Column(String)
    content = Column(Text)
    from_user_screen_name = Column(String) #The screen name, handle, or alias that this user identifies themselves with. screen_names are unique but subject to change. Use id_str as a user identifier whenever possible.
    from_user_id = Column(String)
    from_user_followers_count = Column(Integer) #The number of followers this account currently has.
    from_user_friends_count = Column(Integer) #The number of users this account is following (AKA their "followings").
    from_user_listed_count = Column(Integer) #The number of public lists that this user is a member of.
    from_user_favourites_count = Column(Integer) #The number of tweets this user has favorited in the account's lifetime. British spelling used in the field name for historical reasons.
    from_user_statuses_count = Column(Integer) #The number of tweets (including retweets) issued by the user.
    from_user_description = Column(String) #The user-defined UTF-8 string describing their account.
    from_user_location = Column(String) #The user-defined location for this account's profile.
    from_user_created_at = Column(String) #The UTC datetime that the user account was created on Twitter.
    retweet_count = Column(Integer)
    favorite_count = Column(Integer) #Indicates approximately how many times this Tweet has been "favorited" by Twitter users.
    entities_urls = Column(Unicode(255))
    entities_urls_count = Column(Integer)
    entities_hashtags = Column(Unicode(255))
    entities_hashtags_count = Column(Integer)
    entities_mentions = Column(Unicode(255))
    entities_mentions_count = Column(Integer)
    in_reply_to_screen_name = Column(String)
    in_reply_to_status_id = Column(String)
    source = Column(String)
    entities_expanded_urls = Column(Text)
    entities_media_count = Column(Integer)
    media_expanded_url = Column(Text)
    media_url = Column(Text)
    media_type = Column(Text)
    video_link = Column(Integer)
    photo_link = Column(Integer)
    twitpic = Column(Integer)
    num_characters = Column(Integer)
    num_words = Column(Integer)
    retweeted_user = Column(Text)
    retweeted_user_description = Column(Text)
    retweeted_user_screen_name = Column(Text)
    retweeted_user_followers_count = Column(Integer)
    retweeted_user_listed_count = Column(Integer)
    retweeted_user_statuses_count = Column(Integer)
    retweeted_user_location = Column(Text)
    retweeted_tweet_created_at = Column(DateTime)
    Unique_ID = Column(Integer) #SPECIFIC TO EACH ACCOUNT
    def __init__(self, query, tweet_id, tweet_id_str, inserted_date, truncated, language, possibly_sensitive, coordinates,
                 retweeted_status, withheld_in_countries, withheld_scope, created_at_text, created_at, month, year, content,
                 from_user_screen_name, from_user_id, from_user_followers_count, from_user_friends_count,
                 from_user_listed_count, from_user_favourites_count, from_user_statuses_count, from_user_description,
                 from_user_location, from_user_created_at, retweet_count, favorite_count, entities_urls, entities_urls_count,
                 entities_hashtags, entities_hashtags_count, entities_mentions, entities_mentions_count,
                 in_reply_to_screen_name, in_reply_to_status_id, source, entities_expanded_urls,
                 entities_media_count, media_expanded_url, media_url, media_type, video_link, photo_link, twitpic,
                 num_characters, num_words,
                 retweeted_user, retweeted_user_description, retweeted_user_screen_name, retweeted_user_followers_count,
                 retweeted_user_listed_count, retweeted_user_statuses_count, retweeted_user_location,
                 retweeted_tweet_created_at,
                 Unique_ID,
                 ):
        self.query = query
        self.tweet_id = tweet_id
        self.tweet_id_str = tweet_id_str
        self.inserted_date = inserted_date
        self.truncated = truncated
        self.language = language
        self.possibly_sensitive = possibly_sensitive
        self.coordinates = coordinates
        self.retweeted_status = retweeted_status
        self.withheld_in_countries = withheld_in_countries
        self.withheld_scope = withheld_scope
        self.created_at_text = created_at_text
        self.created_at = created_at
        self.month = month
        self.year = year
        self.content = content
        self.from_user_screen_name = from_user_screen_name
        self.from_user_id = from_user_id
        self.from_user_followers_count = from_user_followers_count
        self.from_user_friends_count = from_user_friends_count
        self.from_user_listed_count = from_user_listed_count
        self.from_user_favourites_count = from_user_favourites_count
        self.from_user_statuses_count = from_user_statuses_count
        self.from_user_description = from_user_description
        self.from_user_location = from_user_location
        self.from_user_created_at = from_user_created_at
        self.retweet_count = retweet_count
        self.favorite_count = favorite_count
        self.entities_urls = entities_urls
        self.entities_urls_count = entities_urls_count
        self.entities_hashtags = entities_hashtags
        self.entities_hashtags_count = entities_hashtags_count
        self.entities_mentions = entities_mentions
        self.entities_mentions_count = entities_mentions_count
        self.in_reply_to_screen_name = in_reply_to_screen_name
        self.in_reply_to_status_id = in_reply_to_status_id
        self.source = source
        self.entities_expanded_urls = entities_expanded_urls
        self.entities_media_count = entities_media_count
        self.media_expanded_url = media_expanded_url
        self.media_url = media_url
        self.media_type = media_type
        self.video_link = video_link
        self.photo_link = photo_link
        self.twitpic = twitpic
        self.num_characters = num_characters
        self.num_words = num_words
        self.retweeted_user = retweeted_user
        self.retweeted_user_description = retweeted_user_description
        self.retweeted_user_screen_name = retweeted_user_screen_name
        self.retweeted_user_followers_count = retweeted_user_followers_count
        self.retweeted_user_listed_count = retweeted_user_listed_count
        self.retweeted_user_statuses_count = retweeted_user_statuses_count
        self.retweeted_user_location = retweeted_user_location
        self.retweeted_tweet_created_at = retweeted_tweet_created_at
        self.Unique_ID = Unique_ID
    def __repr__(self):
        return "<sender, created_at('%s', '%s')>" % (self.from_user_screen_name, self.created_at)
#ACCOUNTS TABLE -- THIS IS WHAT YOU IMPORTED INTO SQLITE EARLIER IN THE TUTORIAL
class ACCOUNT(Base):
    __tablename__ = 'accounts'
    Unique_ID = Column(Integer, primary_key=True)
    org_name = Column(String)
    org_URL = Column(String)
    Twitter_URL = Column(String)
    Twitter_handle = Column(String)
    earliest_tweet_in_db = Column(String)
    number_of_tweets_in_db = Column(Integer)
    def __init__(self, org_name, org_URL, Twitter_URL, Twitter_handle,
                 earliest_tweet_in_db, number_of_tweets_in_db,
                 ):
        self.org_name = org_name
        self.org_URL = org_URL
        self.Twitter_URL = Twitter_URL
        self.Twitter_handle = Twitter_handle
        self.earliest_tweet_in_db = earliest_tweet_in_db
        self.number_of_tweets_in_db = number_of_tweets_in_db
    def __repr__(self):
        #NOTE: THE TABLE HAS NO org_type COLUMN, SO org_name IS USED HERE INSTEAD
        return "<Twitter handle, org_name('%s', '%s')>" % (self.Twitter_handle, self.org_name)
###### PART V: DEFINE FUNCTION FOR PARSING JSON-BASED TWEET DATA RETURNED BY TWITTER API -- CREATE VARIABLES TO BE STORED ######
def write_data(self, d, Twitter_handle, Unique_ID):
    Unique_ID = Unique_ID
    query = Twitter_handle #d['search_metadata']['query'] #THIS IS DIFFERENT FROM MENTIONS AND DMS FILES
    #for sender in d['results']:
    for entry in d: #d['statuses']: #THIS IS DIFFERENT FROM MENTIONS AND DMS FILES
        #json_output = str(entry)
        #print json_output
        #json_output = unicode.join(u'\n',map(unicode,json_output_string))
        #ids.append(sender['id'])
        #print sender.keys()
        tweet_id = entry['id']
        tweet_id_str = entry['id_str']
        inserted_date = datetime.now() #CURRENT DATE AND TIME
        ##### NEW VARIABLES -- UNTESTED #####
        truncated = entry['truncated']
        language = entry['lang']
        #WE NEED THE 'IF' CONDITION CHECK HERE BECAUSE THERE IS NOT A VALUE FOR ALL TWEETS, UNLIKE,
        #SAY, WITH THE 'TEXT' VARIABLE
        if 'possibly_sensitive' in entry:
            possibly_sensitive = entry['possibly_sensitive']
        else:
            possibly_sensitive = ''
        coordinates = []
        if 'coordinates' in entry and entry['coordinates'] != None:
            #print entry['coordinates']['coordinates']
            #[-98.0738001, 27.75346355]
            #-98.0738001 <type 'float'>
            #27.75346355 <type 'float'>
            for coordinate in entry['coordinates']['coordinates']:
                #print coordinate, type(coordinate)
                #coordinates.append(str(coordinate))
                coordinates.append(coordinate)
            #print "HERE ARE THE COORDINATES---------------------->", "".join(format(x, "10.8f") for x in coordinates)
            coordinates = ', '.join(map(str, coordinates)) #JOIN THE LONGITUDE/LATITUDE FLOATS INTO A SINGLE STRING
            #coordinates = entry['coordinates']['coordinates']
            #coordinates = ",".join("'{0}'".format(n) for n in coordinates) #THIS IS A LIST OF FLOATS
            #print type(coordinates), len(coordinates), coordinates
            #print type(coordinates), coordinates
        else:
            coordinates = ''
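        #EXAMPLE (values taken from the debugging notes above): a tweet geotagged at
        #[-98.0738001, 27.75346355] is stored as the string '-98.0738001, 27.75346355'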
        if 'retweeted_status' in entry:
            retweeted_status = 'THIS IS A RETWEET'
            retweeted_user = entry['retweeted_status']['user']['id_str'] ###I SHOULD USE ALL STR OR NOT
            retweeted_user_description = entry['retweeted_status']['user']['description']
            retweeted_user_screen_name = entry['retweeted_status']['user']['screen_name']
            retweeted_user_followers_count = entry['retweeted_status']['user']['followers_count']
            retweeted_user_listed_count = entry['retweeted_status']['user']['listed_count']
            retweeted_user_statuses_count = entry['retweeted_status']['user']['statuses_count']
            retweeted_user_location = entry['retweeted_status']['user']['location']
            retweeted_tweet_created_at_text = entry['retweeted_status']['created_at']
            retweeted_tweet_created_at = datetime.strptime(retweeted_tweet_created_at_text, '%a %b %d %H:%M:%S +0000 %Y') #<type 'datetime.datetime'>
        else:
            retweeted_status = ''
            retweeted_user = ''
            retweeted_user_description = ''
            retweeted_user_screen_name = ''
            retweeted_user_followers_count = ''
            retweeted_user_listed_count = ''
            retweeted_user_statuses_count = ''
            retweeted_user_location = ''
            retweeted_tweet_created_at = None
        if 'withheld_in_countries' in entry:
            withheld_in_countries = 'WITHHELD --> CHECK JSON'
        else:
            withheld_in_countries = ''
        if 'withheld_scope' in entry:
            withheld_scope = entry['withheld_scope']
        else:
            withheld_scope = ''
        #print sender['text']
        # init field values
        content = entry['text']
        content = content.replace('\n','')
        print '\n', content.encode('utf-8')
        num_characters = len(content) #NUMBER OF CHARACTERS (SPACES INCLUDED)
        words = content.split()
        num_words = len(words)
        created_at_text = entry['created_at'] ##### Fri Jun 24 18:14:34 +0000 2011 --> <type 'str'>
        ##### EXAMPLE OF HOW TO CONVERT STRING TO DATETIME FORMAT AND THEN MODIFY
        #t = datetime.strptime("20091229050936", "%Y%m%d%H%M%S")
        #print t.strftime('%H:%M %d %B %Y (UTC)')
        #fred = t.strftime('%H:%M %d %B %Y (UTC)') ## 05:09 29 December 2009 (UTC) --> <type 'str'>
        #date_test = datetime.strptime('Fri Jun 24 18:14:34 +0000 2011', '%a %b %d %H:%M:%S +0000 %Y') #<type 'datetime.datetime'>
        created_at = datetime.strptime(created_at_text, '%a %b %d %H:%M:%S +0000 %Y') #<type 'datetime.datetime'>
        #date = twitter_date.strftime('%Y-%B-%d %H:%M') ### BETTER OPTION BELOW (HAS MONTH, DAY AS NUMBERS)
        created_at2 = created_at.strftime('%Y-%m-%d %H:%M:%S') ### A 'YYYY-MM-DD HH:MM:SS' STRING VERSION OF created_at (NOT STORED IN THE DATABASE)
        #month = datetime.strptime(created_at_text, '%b')
        #year = datetime.strptime(created_at_text, '%Y') #ValueError: time data 'Thu Oct 17 17:47:34 +0000 2013' does not match format '%Y'
        month = created_at.strftime('%m')
        year = created_at.strftime('%Y')
        #print 'month', month, 'year', year
        #print created_at, created_at2
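        #WORKED EXAMPLE (using the sample timestamp from the comments above):
        #    'Fri Jun 24 18:14:34 +0000 2011' --> created_at = datetime(2011, 6, 24, 18, 14, 34),
        #    month = '06', year = '2011'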
        #from_user = sender['from_user'] ##### THIS NEEDS TO BE UPDATED --> 7/11/13 IT'S UNDER user, WHICH HAS SUB-KEYS
        from_user_screen_name = entry['user']['screen_name']
        #from_user_id = sender['from_user_id'] ##### THIS NEEDS TO BE UPDATED --> 7/11/13 IT'S UNDER user, WHICH HAS SUB-KEYS
        from_user_id = entry['user']['id']
        from_user_followers_count = entry['user']['followers_count']
        from_user_friends_count = entry['user']['friends_count']
        from_user_listed_count = entry['user']['listed_count']
        from_user_favourites_count = entry['user']['favourites_count']
        print '\n', 'from_user_favourites_count-------------->', from_user_favourites_count
        from_user_statuses_count = entry['user']['statuses_count']
        from_user_description = entry['user']['description']
        from_user_location = entry['user']['location']
        from_user_created_at = entry['user']['created_at']
        retweet_count = entry['retweet_count']
        favorite_count = entry['favorite_count']
        in_reply_to_screen_name = entry['in_reply_to_screen_name']
        in_reply_to_status_id = entry['in_reply_to_status_id']
        #GENERATE COUNT VARIABLES FOR URLS, HASHTAGS, AND MENTIONS
        entities_urls_count = len(entry['entities']['urls'])
        entities_hashtags_count = len(entry['entities']['hashtags'])
        entities_mentions_count = len(entry['entities']['user_mentions'])
        source = entry['source']
        #NOW WE ARE GOING TO CREATE VARIABLES FOR URLS, TAGS, AND MENTIONS --> THE DIFFICULTY HERE
        #IS THAT THERE IS NO UNIFORM NUMBER OF, SAY, TAGS IN EACH TWEET, SO WE CANNOT CREATE
        #A FIXED NUMBER OF COLUMNS/VARIABLES BEFOREHAND. INSTEAD, WE'LL CREATE A STRING FOR EACH
        #VARIABLE --> THE STRINGS CAN BE PARSED OUT OR LOOPED OVER LATER AS NEEDED.
        entities_urls, entities_expanded_urls, entities_hashtags, entities_mentions = [], [], [], []
        #print type(entry['entities']['urls']) ##### IT'S A LIST
        #urls = entry['entities']['urls']
        for link in entry['entities']['urls']:
            if 'url' in link:
                url = link['url']
                expanded_url = link['expanded_url']
                #print link['url'], link['expanded_url']
                entities_urls.append(url)
                entities_expanded_urls.append(expanded_url)
            else:
                print "No urls in entry"
        for hashtag in entry['entities']['hashtags']:
            if 'text' in hashtag:
                tag = hashtag['text']
                #print hashtag['text']
                entities_hashtags.append(tag)
            else:
                print "No hashtags in entry"
        for at in entry['entities']['user_mentions']:
            if 'screen_name' in at:
                mention = at['screen_name']
                #print at['screen_name']
                entities_mentions.append(mention)
            else:
                print "No mentions in entry"
        entities_mentions = string.join(entities_mentions, u", ")
        entities_hashtags = string.join(entities_hashtags, u", ")
        entities_urls = string.join(entities_urls, u", ")
        entities_expanded_urls = string.join(entities_expanded_urls, u", ")
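        #EXAMPLE (hypothetical values): a tweet mentioning @MLB and @Cubs with the hashtag #OpeningDay
        #would yield entities_mentions = u"MLB, Cubs" and entities_hashtags = u"OpeningDay"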
        video_link = 0
        if 'vimeo' in entities_expanded_urls or 'youtube' in entities_expanded_urls or 'youtu' in entities_expanded_urls or 'vine' in entities_expanded_urls:
            video_link = 1 #All of these videos show up in 'View Media' in tweets
            #print "FOUND A VIDEO!!!"
        else:
            video_link = 0
        #if photo_text.find('twitpic'): # or photo_text.find('instagram') or photo_text.find('instagr'):
        #if 'twitpic' in photo_text:
        if 'twitpic' in entities_expanded_urls:
            twitpic = 1 #twitpic images show up in 'View Media' in tweets
            #print "FOUND A TWITPIC LINK!"
        else:
            twitpic = 0
        if 'twitpic' in entities_expanded_urls or 'instagram' in entities_expanded_urls or 'instagr' in entities_expanded_urls:
            photo_link = 1 #instagram images DO NOT show up in 'View Media' in tweets
            #print "FOUND A TWITPIC OR INSTAGRAM LINK!!!"
        else:
            photo_link = 0
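        #EXAMPLE (hypothetical URLs): an expanded URL containing 'youtu' sets video_link = 1 and leaves
        #photo_link and twitpic at 0; a twitpic URL sets both twitpic and photo_link to 1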
        #CONVERT TO UNICODE FOR INSERTION INTO SQLITE DB
        entities_urls = unicode(entities_urls)
        entities_expanded_urls = unicode(entities_expanded_urls)
        entities_hashtags = unicode(entities_hashtags)
        entities_mentions = unicode(entities_mentions)
        #print "urls...?....", entities_urls
        #print "user_mentions...?....", entities_mentions
        #print "hashtags...?....", entities_hashtags
        #if 'symbols' in entry['entities']:
        #    print "HERE ARE THE SYMBOLS.......", entry['entities']['symbols']
        #else:
        #    print "THERE AIN'T NO entry['entities']['symbols']"
        if 'media' in entry['entities']:
            #print "HERE ARE THE MEDIA.......", entry['entities']['media']
            entities_media_count = len(entry['entities']['media'])
            #'expanded_url' #The fully resolved media URL [FULL TWEET PLUS PICTURE] --> e.g., http://twitter.com/StJude/status/347801636351135744/photo/1
            #'display_url' # Not a URL but a string to display instead of the media URL --> e.g., pic.twitter.com/hO4BjuqrnE
            #'media_url_https' #The SSL URL of the media file --> e.g., https://pbs.twimg.com/media/BNOjvtuCUAA4S66.jpg
            #'type' # only photo for now --> e.g., photo
            #'media_url' # The URL of the media file --> e.g., http://pbs.twimg.com/media/BNOjvtuCUAA4S66.jpg
        else:
            entities_media_count = ''
        #for a in entry['entities']['media']:
        #    print a['expanded_url'], a['media_url'] #, a['media_type']
        #bob = entry['entities']['media'][0]
        #print bob
        #print bob['expanded_url']
        if 'media' in entry['entities']:
            if 'expanded_url' in entry['entities']['media'][0]:
                media_expanded_url = entry['entities']['media'][0]['expanded_url']
            else:
                #print "THERE AIN'T NO expanded_url in entry['entities']['media']"
                media_expanded_url = ''
            if 'media_url' in entry['entities']['media'][0]:
                media_url = entry['entities']['media'][0]['media_url']
            else:
                #print "THERE AIN'T NO media_url in entry['entities']['media']"
                media_url = ''
            if 'type' in entry['entities']['media'][0]:
                media_type = entry['entities']['media'][0]['type']
            else:
                #print "THERE AIN'T NO type in entry['entities']['media']"
                media_type = ''
        else:
            media_type = ''
            media_url = ''
            media_expanded_url = ''
        upd = TWEET(query, tweet_id, tweet_id_str, inserted_date, truncated, language, possibly_sensitive,
                    coordinates, retweeted_status, withheld_in_countries, withheld_scope, created_at_text,
                    created_at, month, year, content, from_user_screen_name, from_user_id, from_user_followers_count,
                    from_user_friends_count, from_user_listed_count, from_user_favourites_count, from_user_statuses_count, from_user_description,
                    from_user_location, from_user_created_at, retweet_count, favorite_count, entities_urls, entities_urls_count,
                    entities_hashtags, entities_hashtags_count, entities_mentions, entities_mentions_count,
                    in_reply_to_screen_name, in_reply_to_status_id, source, entities_expanded_urls, #json_output,
                    entities_media_count, media_expanded_url, media_url, media_type, video_link, photo_link, twitpic,
                    num_characters, num_words,
                    retweeted_user, retweeted_user_description, retweeted_user_screen_name, retweeted_user_followers_count,
                    retweeted_user_listed_count, retweeted_user_statuses_count, retweeted_user_location,
                    retweeted_tweet_created_at,
                    Unique_ID,
                    )
        self.session.add(upd)
        try:
            self.session.commit()
        except exc.SQLAlchemyError:
            self.session.rollback()
            print " NOT INSERTING --> IT'S A DUPLICATE"
###### PART VI: MAIN LOOP OVER EACH ID STORED IN 'ACCOUNTS' TABLE OF DATABASE - 16 PAGES OF 200 TWEETS PER ID ######
class Scrape:
    def __init__(self):
        engine = sqlalchemy.create_engine("sqlite:///test.sqlite", echo=False) # YOUR DATABASE NAME HERE
        Session = sessionmaker(bind=engine)
        self.session = Session()
        Base.metadata.create_all(engine)
    def main(self):
        all_ids = self.session.query(ACCOUNT).all()
        print 'len(all_ids)', len(all_ids)
        #CHECK IF THE 'ACCOUNTS' TABLE HAS BEEN POPULATED; IF NOT, ADD DATA FROM
        #THE CSV FILE TO THE TABLE
        if len(all_ids) < 1:
            conn = sqlite3.connect('test.sqlite')
            import pandas as pd
            df = pd.read_csv('accounts.csv')
            df.to_sql('accounts', conn, if_exists='append', index=False)
            all_ids = self.session.query(ACCOUNT).all()
        keys = []
        for i in all_ids:
            Unique_ID = i.Unique_ID
            Twitter_handle = i.Twitter_handle
            kid = Twitter_handle
            rowid = i.Unique_ID
            print '\n', "\rprocessing id %s/%s -- %s" % (rowid, len(all_ids), Twitter_handle),
            sys.stdout.flush()
            page = 1
            while page < 17: #TO DOWNLOAD FEWER PAGES OF TWEETS, LOWER THIS LIMIT (AND THE MATCHING PAGE CHECK BELOW)
                print "------XXXXXX------ STARTING PAGE", page
                d = get_data_user_timeline_all_pages(kid, page)
                if not d:
                    print "THERE WERE NO STATUSES RETURNED........MOVING TO NEXT ID"
                    break
                if len(d) == 0:
                    print "THERE WERE NO STATUSES RETURNED........MOVING TO NEXT ID"
                    break
                write_data(self, d, Twitter_handle, Unique_ID)
                #self.session.commit()
                #print 'pausing for 1 second'
                #time.sleep(1) #PAUSE FOR 1 SECOND
                page += 1
                if page > 16:
                    print "WE'RE AT THE END OF PAGE 16!!!!!"
                    break
        self.session.commit()
        print '\n', '\n', '\n', "FINISHED WITH ALL IDS"
        self.session.close()

if __name__ == "__main__":
    s = Scrape()
    s.main()
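#TO RUN FROM THE COMMAND LINE (ASSUMING THE SCRIPT IS SAVED UNDER THE FILENAME GIVEN IN THE DOCSTRING):
#    python Social_Metrics_Tutorial_Script_User_Timeline_All_Pages.py
#THE SCRIPT CAN BE RE-RUN LATER; THE UNIQUE CONSTRAINT ON tweet_id_str MEANS ONLY NEW TWEETS ARE ADDED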