Skip to content

Instantly share code, notes, and snippets.

@denistnguyen
Created May 30, 2016 14:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save denistnguyen/39dc997b82c0b7b51b55050e22dfa5be to your computer and use it in GitHub Desktop.
Save denistnguyen/39dc997b82c0b7b51b55050e22dfa5be to your computer and use it in GitHub Desktop.
Twitter Scraper
# Before running, search this file for every ~~PLACEHOLDER~~ and replace it with your own value.
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import pymongo
import numpy as np
import pandas as pd
# Address of the page to scrape (replace the placeholder before running).
url = "~~INSERT URL~~"

# Start Firefox, load the page and maximise the browser window.
driver = webdriver.Firefox()
driver.get(url)
driver.maximize_window()

# Jump to the bottom of the page over and over. After each jump wait two
# seconds and re-read the document height; stop as soon as a jump no longer
# changes the height.
HEIGHT_JS = "return document.body.scrollHeight"
known_height = driver.execute_script(HEIGHT_JS)
still_growing = True
while still_growing:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    current_height = driver.execute_script(HEIGHT_JS)
    still_growing = current_height != known_height
    known_height = current_height

# Parse the page as it stands after scrolling; the extraction step reads `pho`.
pho = BeautifulSoup(driver.page_source, "html.parser")
# Connect to Mongo DB
# MongoClient() with no arguments targets the default local server.
try:
    conn = pymongo.MongoClient()
    # PyMongo 3+ connects lazily, so constructing the client proves nothing.
    # A cheap 'ping' forces a real round trip and surfaces an unreachable
    # server here instead of at the first insert.
    conn.admin.command('ping')
except pymongo.errors.ConnectionFailure as e:
    print("Could not connect to MongoDB: %s" % e)
    # Nothing below can work without a connection; re-raise the real error
    # rather than continuing into a NameError on the undefined `conn`.
    raise
else:
    print("Connected successfully...")

# Database that receives the scraped tweets (replace the placeholder).
mydb = conn['~~DATABASE NAME~~']
# Extract information from url
def _parse_count(text):
    """Convert a displayed counter string to an int.

    Accepts plain numbers ('12'), thousands separators ('1,234') and the
    abbreviated forms '3.4K' (x 1,000) and '1.2M' (x 1,000,000). The scaled
    value is rounded rather than truncated so binary float error cannot drop
    a unit.

    Returns NaN when the text is empty or is not a recognisable number, so a
    missing counter is recorded as missing.
    """
    multipliers = {'K': 1000, 'M': 1000000}
    cleaned = text.strip().replace(',', '')
    if not cleaned:
        return np.nan
    factor = multipliers.get(cleaned[-1])
    if factor is None:
        number, factor = cleaned, 1
    else:
        number = cleaned[:-1]
    try:
        return int(round(float(number) * factor))
    except (ValueError, OverflowError):
        return np.nan


# Result when the page contains no tweets: an empty frame with the same
# columns as a populated one.
empty = pd.DataFrame({'time': [], 'twit': [], 'tag': [], 'at': [],
                      're': [], 'like': [], 'loc': []})

# One single-row frame per tweet. They are concatenated once after the loop,
# because growing a DataFrame row by row is quadratic and DataFrame.append
# no longer exists in pandas 2.x.
frames = []

# Text is kept as unicode throughout; it is encoded once when the CSV is
# written instead of field by field here.
for t in pho.find_all('div', {'class': 'content'}):
    # Timestamp is mandatory: a missing anchor or 'title' attribute raises,
    # exactly as before. The local is not called `time` so it does not shadow
    # the imported module.
    stamp = t.find('a', {'class': 'tweet-timestamp'})['title']

    # Every optional field is computed fresh for each tweet, so a missing
    # value becomes NaN instead of inheriting the previous tweet's value.
    body = t.find('p', {'class': 'TweetTextSize'})
    twit = body.text if body is not None else np.nan

    # Hashtag link text with its first character (the '#') removed.
    tag = [link.text[1:] for link in t.find_all('a', {'class': 'twitter-hashtag'})]
    if not tag:
        tag = np.nan

    # Mention hrefs with their first character (the leading '/') removed;
    # anchors without an href are skipped.
    at = [link['href'][1:]
          for link in t.find_all('a', {'class': 'twitter-atreply'})
          if link.get('href') is not None]
    if not at:
        at = np.nan

    # Counters are picked by position: index 0 is stored under 're' and
    # index 2 under 'like'.
    # NOTE(review): presumably retweets and likes -- confirm against the markup.
    thumbs = t.find_all('span',
                        {'class': 'ProfileTweet-actionCountForPresentation'})
    retweets = _parse_count(thumbs[0].text) if len(thumbs) > 0 else np.nan
    like = _parse_count(thumbs[2].text) if len(thumbs) > 2 else np.nan

    geo = t.find('span', {'class': 'Tweet-geo'})
    loc = geo['title'] if geo is not None else np.nan

    # 'tag' and 'at' are wrapped in a one-element list so pandas builds a
    # single row whose cell holds the whole list. The same wrapped dict is
    # what gets stored in Mongo, as before.
    entry = {'time': stamp, 'twit': twit, 'tag': [tag], 'at': [at],
             're': retweets, 'like': like, 'loc': loc}
    # Build the row before inserting: the insert adds an '_id' key to `entry`.
    frames.append(pd.DataFrame(entry))
    # Item access works for any collection name and matches conn[...] above.
    # insert_one needs PyMongo 3.0+; the old insert() is gone in PyMongo 4.
    mydb['~~COLLECTION NAME~~'].insert_one(entry)

dat = pd.concat(frames, ignore_index=True) if frames else empty
# Export as a csv for safekeeping
try:
    dat.to_csv('~~INSERT FILE NAME~~', encoding='utf-8')
    print("Mission Complete...")
finally:
    # Shut the browser down even if the export fails. quit() ends the whole
    # WebDriver session, whereas close() only closes the current window.
    driver.quit()
print("Have a nice day!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment