Created
June 6, 2017 16:13
-
-
Save enewe101/689ba8c8862e91683de6dbad33e5d300 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This ended up not working. Instead I crawled it sort of manually. | |
from bs4 import BeautifulSoup as Soup | |
import os | |
from datetime import datetime | |
from pod import ProgressTracker | |
import requests | |
import json | |
import sys | |
sys.path.append('..') | |
from LOCAL_SETTINGS import DATA_DIR | |
from helper import Timer | |
# Output layout: everything under DATA_DIR/twitter, raw tweet HTML
# batches under DATA_DIR/twitter/html.
SITE_DIR = os.path.join(DATA_DIR, 'twitter')
HTML_DIR = os.path.join(SITE_DIR, 'html')

# Template for Twitter's incremental timeline endpoint.  First %s is the
# account's screen name, second %s is the pagination cursor
# ('max_position', a tweet id) — results continue backwards from there.
URL_FORMATTER = (
    'https://twitter.com/i/profiles/show/%s/timeline/tweets?'
    'include_available_features=1&include_entities=1&'
    'max_position=%s&reset_error_state=false'
)

# Accounts whose timelines get crawled.
TARGETS = ['realDonaldTrump', 'HillaryClinton', 'BernieSanders', 'TedCruz']

# Tweet id used as the cursor for the very first request.
START_POSITION = '805528947190616065'

# Crawling pages backwards in time stops once tweets older than this
# date are reached (see the loop condition in crawl_target).
END_DATE = datetime(2015, 11, 8)

# strftime format used when printing progress timestamps.
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
def crawl_all():
    """Crawl the timeline of every account in TARGETS.

    Creates the output directories on demand, and records per-target
    progress in a ProgressTracker file so an interrupted run skips
    targets that were already completed.
    """
    # Make sure both output directories exist.
    for directory in (SITE_DIR, HTML_DIR):
        if not os.path.exists(directory):
            os.makedirs(directory)

    # Tracker persisting which targets have already been handled.
    tracker = ProgressTracker(os.path.join(SITE_DIR, 'targets.tracker'))

    for screen_name in TARGETS:
        # Skip targets the tracker reports as already handled
        # (presumably check_or_add is truthy for known-done entries —
        # TODO confirm against ProgressTracker).
        if tracker.check_or_add(screen_name):
            continue
        tracker.increment_tries(screen_name)
        crawl_target(screen_name)
        tracker.mark_done(screen_name)
def crawl_target(target):
    """Page backwards through `target`'s timeline until END_DATE.

    Each response's rendered tweet HTML is written to HTML_DIR, and a
    per-target ProgressTracker stores the cursor and timestamp for every
    fetched URL so an interrupted crawl resumes without re-downloading.

    `target` is a Twitter screen name (one of TARGETS).
    """
    # Per-target tracker: one entry per request URL, holding the next
    # cursor ('min_position') and the oldest tweet's unix timestamp.
    target_tracker_path = os.path.join(SITE_DIR, '%s.tracker' % target)
    target_tracker = ProgressTracker(target_tracker_path)

    # First request starts from the globally configured cursor.
    url = URL_FORMATTER % (target, START_POSITION)
    cur_datetime = datetime.now()
    # Rate limiter — presumably at most one request per 4 seconds;
    # TODO confirm Timer's semantics in helper.py.
    timer = Timer(4)

    while cur_datetime > END_DATE:

        # URL already fetched on a previous run: advance to the next
        # page using the stored cursor/timestamp instead of refetching.
        if target_tracker.check_or_add(url):
            print('\tskipping %s' % url)  # typo 'skiping' fixed
            position = target_tracker[url]['min_position']
            cur_datetime = datetime.fromtimestamp(
                target_tracker[url]['datetime'])
            url = URL_FORMATTER % (target, position)
            continue

        print('crawling %s' % url)

        # Get the next batch of tweets.
        target_tracker.increment_tries(url)
        timer.throttle()  # Don't make requests too often
        r = requests.get(url)
        data = json.loads(r.text)

        # Save the batch to disk.  Binary mode + context manager: the
        # original wrote utf8-encoded bytes to a text-mode handle and
        # leaked the handle (open(...).write(...) with no close).
        position = data['min_position']
        out_fname = '%s-%s.html' % (target, position)
        out_path = os.path.join(HTML_DIR, out_fname)
        with open(out_path, 'wb') as out_file:
            out_file.write(data['items_html'].encode('utf8'))

        # The oldest tweet in this batch decides whether to keep going.
        timestamp = get_timestamp(data)
        cur_datetime = datetime.fromtimestamp(timestamp)
        # Use the module-level TIME_FORMAT instead of repeating the
        # literal (they are the same string).
        print('\t%s' % cur_datetime.strftime(TIME_FORMAT))

        # Persist this request's results; hold()/unhold() presumably
        # batch the tracker writes — TODO confirm.
        target_tracker.hold()
        target_tracker[url]['datetime'] = timestamp
        target_tracker[url]['min_position'] = position
        target_tracker.mark_done(url)
        target_tracker.unhold()

        # Next page continues backwards from this batch's cursor.
        url = URL_FORMATTER % (target, position)
def get_timestamp(data):
    """Return the unix timestamp (int, seconds) of the last tweet in
    the rendered HTML fragment data['items_html'].

    Assumes each top-level <li> is a tweet containing a
    <span class="_timestamp" data-time="..."> element — matches
    Twitter's timeline markup at crawl time; verify if reused.
    """
    # Name the parser explicitly: omitting it makes BeautifulSoup guess,
    # which warns and can pick different parsers on different machines.
    soup = Soup(data['items_html'], 'html.parser')
    # html.parser keeps fragments bare (no <html>/<body> wrapper), while
    # other parsers wrap them — search whichever root actually exists.
    root = soup.find('body') or soup
    tweets = root.find_all('li', recursive=False)
    last_tweet_time_elm = tweets[-1].find('span', class_='_timestamp')
    return int(last_tweet_time_elm['data-time'])
# Script entry point: crawl every configured target account.
if __name__ == '__main__':
    crawl_all()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment