# Gist by @enewe101, created June 6, 2017 16:13
# This ended up not working. Instead I crawled it sort of manually.
from bs4 import BeautifulSoup as Soup
import os
from datetime import datetime
from pod import ProgressTracker
import requests
import json
import sys
sys.path.append('..')
from LOCAL_SETTINGS import DATA_DIR
from helper import Timer
SITE_DIR = os.path.join(DATA_DIR, 'twitter')
HTML_DIR = os.path.join(SITE_DIR, 'html')
URL_FORMATTER = (
    'https://twitter.com/i/profiles/show/%s/timeline/tweets?'
    'include_available_features=1&include_entities=1&'
    'max_position=%s&reset_error_state=false'
)
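# e.g. URL_FORMATTER % ('realDonaldTrump', '805528947190616065') expands to
# https://twitter.com/i/profiles/show/realDonaldTrump/timeline/tweets?include_available_features=1&include_entities=1&max_position=805528947190616065&reset_error_state=false
# (illustrative only: the first TARGETS entry and START_POSITION substituted in)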
TARGETS = ['realDonaldTrump', 'HillaryClinton', 'BernieSanders', 'TedCruz']
START_POSITION = '805528947190616065'
END_DATE = datetime(2015, 11, 8)
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'


def crawl_all():
    if not os.path.exists(SITE_DIR):
        os.makedirs(SITE_DIR)
    if not os.path.exists(HTML_DIR):
        os.makedirs(HTML_DIR)

    # Open a tracker that records which targets have already been crawled
    targets_tracker_path = os.path.join(SITE_DIR, 'targets.tracker')
    targets_tracker = ProgressTracker(targets_tracker_path)

    for target in TARGETS:
        # Skip targets that were finished on a previous run
        if targets_tracker.check_or_add(target):
            continue
        targets_tracker.increment_tries(target)
        crawl_target(target)
        targets_tracker.mark_done(target)


def crawl_target(target):

    # Open a tracker for this target
    target_tracker_path = os.path.join(SITE_DIR, '%s.tracker' % target)
    target_tracker = ProgressTracker(target_tracker_path)

    # Make the first request
    url = URL_FORMATTER % (target, START_POSITION)
    cur_datetime = datetime.now()
    timer = Timer(4)

    # Page backwards through the timeline until we reach END_DATE
    while cur_datetime > END_DATE:

        # If this page was already fetched, reuse its stored position and date
        if target_tracker.check_or_add(url):
            print '\tskipping %s' % url
            position = target_tracker[url]['min_position']
            cur_datetime = datetime.fromtimestamp(
                target_tracker[url]['datetime'])
            url = URL_FORMATTER % (target, position)
            continue

        print 'crawling %s' % url

        # Get the next batch of tweets
        target_tracker.increment_tries(url)
        timer.throttle()    # Don't make requests too often
        r = requests.get(url)
        data = json.loads(r.text)

        # Save them to disk
        position = data['min_position']
        out_fname = '%s-%s.html' % (target, position)
        out_path = os.path.join(HTML_DIR, out_fname)
        open(out_path, 'w').write(data['items_html'].encode('utf8'))

        # Find the next date
        timestamp = get_timestamp(data)
        cur_datetime = datetime.fromtimestamp(timestamp)
        print '\t%s' % cur_datetime.strftime(TIME_FORMAT)

        # Update the tracker for this request
        target_tracker.hold()
        target_tracker[url]['datetime'] = timestamp
        target_tracker[url]['min_position'] = position
        target_tracker.mark_done(url)
        target_tracker.unhold()

        # Find the next url
        url = URL_FORMATTER % (target, position)


def get_timestamp(data):
    # The last <li> in the returned fragment is the oldest tweet in the batch
    soup = Soup(data['items_html'])
    tweets = soup.find('body').find_all('li', recursive=False)
    last_tweet_time_elm = tweets[-1].find('span', class_='_timestamp')
    return int(last_tweet_time_elm['data-time'])


if __name__ == '__main__':
    crawl_all()
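

# ---------------------------------------------------------------------------
# The script imports ProgressTracker (from pod) and Timer (from helper), which
# are local modules not included in this gist.  The classes below are only a
# minimal sketch of the interface the crawler appears to rely on, NOT the
# author's originals; drop them into pod.py and helper.py (or adapt your own)
# if you want to try running the script without those modules.
import json
import time


class ProgressTracker(object):
    """Sketch: per-key progress records persisted as JSON at `path`."""

    def __init__(self, path):
        self.path = path
        self.held = False
        try:
            self.data = json.load(open(path))
        except (IOError, ValueError):
            self.data = {}

    def __getitem__(self, key):
        return self.data[key]

    def check_or_add(self, key):
        # Return True if the key was already marked done; otherwise register it
        if key in self.data:
            return self.data[key].get('done', False)
        self.data[key] = {'tries': 0, 'done': False}
        self.save()
        return False

    def increment_tries(self, key):
        self.data[key]['tries'] += 1
        self.save()

    def mark_done(self, key):
        self.data[key]['done'] = True
        self.save()

    def hold(self):
        # Defer writes while several fields are updated together
        self.held = True

    def unhold(self):
        self.held = False
        self.save()

    def save(self):
        if not self.held:
            open(self.path, 'w').write(json.dumps(self.data))


class Timer(object):
    """Sketch: enforce a minimum delay (in seconds) between requests."""

    def __init__(self, interval):
        self.interval = interval
        self.last = None

    def throttle(self):
        if self.last is not None:
            wait = self.interval - (time.time() - self.last)
            if wait > 0:
                time.sleep(wait)
        self.last = time.time()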