Created
May 30, 2016 14:29
-
-
Save denistnguyen/39dc997b82c0b7b51b55050e22dfa5be to your computer and use it in GitHub Desktop.
Twitter Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Look for ~~ and insert your information
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import pymongo
import numpy as np
import pandas as pd

# Target page to scrape (a Twitter search/profile URL).
url = "~~INSERT URL~~"

# Seconds to wait after each scroll so lazy-loaded tweets can arrive.
# Was a hard-coded 2 inside the loop; raise on slow connections.
SCROLL_PAUSE_SECONDS = 2

# Open webdriver and url
driver = webdriver.Firefox()
driver.get(url)
driver.maximize_window()

# Scroll to the bottom repeatedly until the document height stops
# growing, i.e. the page has no more content to lazy-load.
lastHeight = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(SCROLL_PAUSE_SECONDS)
    newHeight = driver.execute_script("return document.body.scrollHeight")
    if newHeight == lastHeight:
        break  # height unchanged -> nothing new loaded, we are at the end
    lastHeight = newHeight

# Parse the fully expanded page once with BeautifulSoup.
pho = BeautifulSoup(driver.page_source, "html.parser")
# Connect to Mongo DB (default host/port: localhost:27017).
try:
    conn = pymongo.MongoClient()
    print("Connected successfully...")
except pymongo.errors.ConnectionFailure as e:
    # Nothing downstream can work without the connection, so stop here
    # instead of silently continuing (the original fell through and
    # crashed later with a confusing NameError).
    print("Could not connect to MongoDB: %s" % e)
    raise
mydb = conn['~~DATABASE NAME~~']
# Accumulator for the scraped tweets: one row per tweet, built up
# inside the scraping loop below.
dat = pd.DataFrame(
    {column: [] for column in ('time', 'twit', 'tag', 'at', 're', 'like', 'loc')}
)
def _parse_count(raw):
    """Parse a tweet action count like '12' or '1.2K' into an int.

    Returns np.nan when the text is missing or malformed, instead of
    the original bare ``except: pass`` which left the previous tweet's
    value (or no value at all) in scope.
    """
    try:
        if raw.endswith('K'):
            return int(float(raw[:-1]) * 1000)
        return int(float(raw))
    except (ValueError, AttributeError, IndexError):
        return np.nan


# Walk every tweet container on the fully loaded page.
for t in pho.find_all('div', {'class': 'content'}):
    # Timestamp lives in the permalink's title attribute.
    # (Renamed from `time`, which shadowed the stdlib time module.)
    stamp_link = t.find('a', {'class': 'tweet-timestamp'})
    tweet_time = stamp_link['title'] if stamp_link is not None else np.nan

    # Tweet body text; some entries (e.g. pure retweets) may lack it.
    body = t.find('p', {'class': 'TweetTextSize'})
    twit = body.text if body is not None else np.nan

    # Hashtags without the leading '#'.
    tag = [l.text[1:] for l in t.find_all('a', {'class': 'twitter-hashtag'})]
    if not tag:
        tag = np.nan

    # @-mentions: username is the href minus the leading '/'.
    at = [a['href'][1:]
          for a in t.find_all('a', {'class': 'twitter-atreply'})
          if a.has_attr('href')]
    if not at:
        at = np.nan

    # Action counts: index 0 is retweets, index 2 is likes.
    # (Renamed from `re`/`like`; `re` shadowed the stdlib module name.)
    thumbs = t.find_all('span',
                        {'class': 'ProfileTweet-actionCountForPresentation'})
    retweets = _parse_count(thumbs[0].text) if len(thumbs) > 0 else np.nan
    likes = _parse_count(thumbs[2].text) if len(thumbs) > 2 else np.nan

    # Optional geo tag; NaN when the tweet carries no location.
    geo = t.find('span', {'class': 'Tweet-geo'})
    loc = geo['title'] if geo is not None else np.nan

    entry = {'time': tweet_time, 'twit': twit, 'tag': [tag], 'at': [at],
             're': retweets, 'like': likes, 'loc': loc}
    # DataFrame.append was removed in pandas 2.0; concat is the
    # supported way to add a row.
    dat = pd.concat([dat, pd.DataFrame(entry)], ignore_index=True)
    # Collection.insert is deprecated in pymongo; dict-style access also
    # makes the ~~placeholder~~ usable once a real name is filled in.
    mydb['~~COLLECTION NAME~~'].insert_one(entry)
# Export as a csv for safekeeping; utf-8 keeps emoji and other
# non-ASCII tweet text intact.
dat.to_csv('~~INSERT FILE NAME~~', encoding='utf-8')
print("Mission Complete...")
# Release the browser window before exiting.
driver.close()
print("Have a nice day!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment