Created
May 30, 2016 14:29
-
-
Save denistnguyen/39dc997b82c0b7b51b55050e22dfa5be to your computer and use it in GitHub Desktop.
Twitter Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Look for ~~ and insert your information
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import pymongo
import numpy as np
import pandas as pd

# Target page to scrape (a Twitter search/profile URL).
url = "~~INSERT URL~~"

# Seconds to wait after each scroll so lazy-loaded tweets can arrive.
# Was a hard-coded 2 inside the loop; raise on slow connections.
SCROLL_PAUSE_SECONDS = 2

# Open webdriver and url
driver = webdriver.Firefox()
driver.get(url)
driver.maximize_window()

# Scroll to the bottom repeatedly until the document height stops
# growing, i.e. the page has no more content to lazy-load.
lastHeight = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(SCROLL_PAUSE_SECONDS)
    newHeight = driver.execute_script("return document.body.scrollHeight")
    if newHeight == lastHeight:
        break  # height unchanged -> nothing new loaded, we are at the end
    lastHeight = newHeight

# Parse the fully expanded page once with BeautifulSoup.
pho = BeautifulSoup(driver.page_source, "html.parser")
# Connect to Mongo DB (default host/port: localhost:27017).
try:
    conn = pymongo.MongoClient()
    print("Connected successfully...")
except pymongo.errors.ConnectionFailure as e:
    # Nothing downstream can work without the connection, so stop here
    # instead of silently continuing (the original fell through and
    # crashed later with a confusing NameError).
    print("Could not connect to MongoDB: %s" % e)
    raise
mydb = conn['~~DATABASE NAME~~']
# Accumulator for the scraped tweets: one row per tweet, built up
# inside the scraping loop below.
dat = pd.DataFrame(
    {column: [] for column in ('time', 'twit', 'tag', 'at', 're', 'like', 'loc')}
)
def _parse_count(raw):
    """Parse a tweet action count like '12' or '1.2K' into an int.

    Returns np.nan when the text is missing or malformed, instead of
    the original bare ``except: pass`` which left the previous tweet's
    value (or no value at all) in scope.
    """
    try:
        if raw.endswith('K'):
            return int(float(raw[:-1]) * 1000)
        return int(float(raw))
    except (ValueError, AttributeError, IndexError):
        return np.nan


# Walk every tweet container on the fully loaded page.
for t in pho.find_all('div', {'class': 'content'}):
    # Timestamp lives in the permalink's title attribute.
    # (Renamed from `time`, which shadowed the stdlib time module.)
    stamp_link = t.find('a', {'class': 'tweet-timestamp'})
    tweet_time = stamp_link['title'] if stamp_link is not None else np.nan

    # Tweet body text; some entries (e.g. pure retweets) may lack it.
    body = t.find('p', {'class': 'TweetTextSize'})
    twit = body.text if body is not None else np.nan

    # Hashtags without the leading '#'.
    tag = [l.text[1:] for l in t.find_all('a', {'class': 'twitter-hashtag'})]
    if not tag:
        tag = np.nan

    # @-mentions: username is the href minus the leading '/'.
    at = [a['href'][1:]
          for a in t.find_all('a', {'class': 'twitter-atreply'})
          if a.has_attr('href')]
    if not at:
        at = np.nan

    # Action counts: index 0 is retweets, index 2 is likes.
    # (Renamed from `re`/`like`; `re` shadowed the stdlib module name.)
    thumbs = t.find_all('span',
                        {'class': 'ProfileTweet-actionCountForPresentation'})
    retweets = _parse_count(thumbs[0].text) if len(thumbs) > 0 else np.nan
    likes = _parse_count(thumbs[2].text) if len(thumbs) > 2 else np.nan

    # Optional geo tag; NaN when the tweet carries no location.
    geo = t.find('span', {'class': 'Tweet-geo'})
    loc = geo['title'] if geo is not None else np.nan

    entry = {'time': tweet_time, 'twit': twit, 'tag': [tag], 'at': [at],
             're': retweets, 'like': likes, 'loc': loc}
    # DataFrame.append was removed in pandas 2.0; concat is the
    # supported way to add a row.
    dat = pd.concat([dat, pd.DataFrame(entry)], ignore_index=True)
    # Collection.insert is deprecated in pymongo; dict-style access also
    # makes the ~~placeholder~~ usable once a real name is filled in.
    mydb['~~COLLECTION NAME~~'].insert_one(entry)
# Export as a csv for safekeeping; utf-8 keeps emoji and other
# non-ASCII tweet text intact.
dat.to_csv('~~INSERT FILE NAME~~', encoding='utf-8')
print("Mission Complete...")
# Release the browser window before exiting.
driver.close()
print("Have a nice day!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment