Created
June 6, 2017 16:13
-
-
Save enewe101/689ba8c8862e91683de6dbad33e5d300 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This ended up not working. Instead I crawled it sort of manually. | |
from bs4 import BeautifulSoup as Soup | |
import os | |
from datetime import datetime | |
from pod import ProgressTracker | |
import requests | |
import json | |
import sys | |
sys.path.append('..') | |
from LOCAL_SETTINGS import DATA_DIR | |
from helper import Timer | |
# Output layout: everything under DATA_DIR/twitter, raw tweet HTML
# batches under DATA_DIR/twitter/html.
SITE_DIR = os.path.join(DATA_DIR, 'twitter')
HTML_DIR = os.path.join(SITE_DIR, 'html')

# Template for Twitter's incremental timeline endpoint.  First %s is the
# account's screen name, second %s is the pagination cursor
# ('max_position', a tweet id) — results continue backwards from there.
URL_FORMATTER = (
    'https://twitter.com/i/profiles/show/%s/timeline/tweets?'
    'include_available_features=1&include_entities=1&'
    'max_position=%s&reset_error_state=false'
)

# Accounts whose timelines get crawled.
TARGETS = ['realDonaldTrump', 'HillaryClinton', 'BernieSanders', 'TedCruz']

# Tweet id used as the cursor for the very first request.
START_POSITION = '805528947190616065'

# Crawling pages backwards in time stops once tweets older than this
# date are reached (see the loop condition in crawl_target).
END_DATE = datetime(2015, 11, 8)

# strftime format used when printing progress timestamps.
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
def crawl_all():
    """Crawl the timeline of every account in TARGETS.

    Creates the output directories on demand, and records per-target
    progress in a ProgressTracker file so an interrupted run skips
    targets that were already completed.
    """
    # Make sure both output directories exist.
    for directory in (SITE_DIR, HTML_DIR):
        if not os.path.exists(directory):
            os.makedirs(directory)

    # Tracker persisting which targets have already been handled.
    tracker = ProgressTracker(os.path.join(SITE_DIR, 'targets.tracker'))

    for screen_name in TARGETS:
        # Skip targets the tracker reports as already handled
        # (presumably check_or_add is truthy for known-done entries —
        # TODO confirm against ProgressTracker).
        if tracker.check_or_add(screen_name):
            continue
        tracker.increment_tries(screen_name)
        crawl_target(screen_name)
        tracker.mark_done(screen_name)
def crawl_target(target):
    """Page backwards through `target`'s timeline until END_DATE.

    Each response's rendered tweet HTML is written to HTML_DIR, and a
    per-target ProgressTracker stores the cursor and timestamp for every
    fetched URL so an interrupted crawl resumes without re-downloading.

    `target` is a Twitter screen name (one of TARGETS).
    """
    # Per-target tracker: one entry per request URL, holding the next
    # cursor ('min_position') and the oldest tweet's unix timestamp.
    target_tracker_path = os.path.join(SITE_DIR, '%s.tracker' % target)
    target_tracker = ProgressTracker(target_tracker_path)

    # First request starts from the globally configured cursor.
    url = URL_FORMATTER % (target, START_POSITION)
    cur_datetime = datetime.now()
    # Rate limiter — presumably at most one request per 4 seconds;
    # TODO confirm Timer's semantics in helper.py.
    timer = Timer(4)

    while cur_datetime > END_DATE:

        # URL already fetched on a previous run: advance to the next
        # page using the stored cursor/timestamp instead of refetching.
        if target_tracker.check_or_add(url):
            print('\tskipping %s' % url)  # typo 'skiping' fixed
            position = target_tracker[url]['min_position']
            cur_datetime = datetime.fromtimestamp(
                target_tracker[url]['datetime'])
            url = URL_FORMATTER % (target, position)
            continue

        print('crawling %s' % url)

        # Get the next batch of tweets.
        target_tracker.increment_tries(url)
        timer.throttle()  # Don't make requests too often
        r = requests.get(url)
        data = json.loads(r.text)

        # Save the batch to disk.  Binary mode + context manager: the
        # original wrote utf8-encoded bytes to a text-mode handle and
        # leaked the handle (open(...).write(...) with no close).
        position = data['min_position']
        out_fname = '%s-%s.html' % (target, position)
        out_path = os.path.join(HTML_DIR, out_fname)
        with open(out_path, 'wb') as out_file:
            out_file.write(data['items_html'].encode('utf8'))

        # The oldest tweet in this batch decides whether to keep going.
        timestamp = get_timestamp(data)
        cur_datetime = datetime.fromtimestamp(timestamp)
        # Use the module-level TIME_FORMAT instead of repeating the
        # literal (they are the same string).
        print('\t%s' % cur_datetime.strftime(TIME_FORMAT))

        # Persist this request's results; hold()/unhold() presumably
        # batch the tracker writes — TODO confirm.
        target_tracker.hold()
        target_tracker[url]['datetime'] = timestamp
        target_tracker[url]['min_position'] = position
        target_tracker.mark_done(url)
        target_tracker.unhold()

        # Next page continues backwards from this batch's cursor.
        url = URL_FORMATTER % (target, position)
def get_timestamp(data):
    """Return the unix timestamp (int, seconds) of the last tweet in
    the rendered HTML fragment data['items_html'].

    Assumes each top-level <li> is a tweet containing a
    <span class="_timestamp" data-time="..."> element — matches
    Twitter's timeline markup at crawl time; verify if reused.
    """
    # Name the parser explicitly: omitting it makes BeautifulSoup guess,
    # which warns and can pick different parsers on different machines.
    soup = Soup(data['items_html'], 'html.parser')
    # html.parser keeps fragments bare (no <html>/<body> wrapper), while
    # other parsers wrap them — search whichever root actually exists.
    root = soup.find('body') or soup
    tweets = root.find_all('li', recursive=False)
    last_tweet_time_elm = tweets[-1].find('span', class_='_timestamp')
    return int(last_tweet_time_elm['data-time'])
# Script entry point: crawl every configured target account.
if __name__ == '__main__':
    crawl_all()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment