Skip to content

Instantly share code, notes, and snippets.

@heffo42
heffo42 / main.py
Created July 6, 2019 23:03
Twitter Scraping Script
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import json
import datetime
from pymongo import MongoClient
import pymongo
@heffo42
heffo42 / request.py
Created July 6, 2019 23:22
Launch a Twitter Advanced Search HTTP Request
import requests
# Template for the first (non-paginated) Twitter advanced-search request.
INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q={q}&l={lang}'

# Search parameters. `keywords` is already URL-encoded
# ("bitcoin, crypto, btc"); since/until bound the date window.
# NOTE: the original paste collapsed these onto one comma-separated line,
# which is a Python syntax error — they must be separate assignments.
lang = 'en'
keywords = "bitcoin%2C%20crypto%2C%20btc"
since = "2019-07-05"
until = "2019-07-05"

# Spoof a desktop browser so Twitter serves the HTML timeline.
myUa = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
HEADER = {'User-Agent': myUa}

# Build the advanced-search query string, then percent-encode the few
# characters Twitter's search URL is sensitive to.
query = '{} since:{} until:{}'.format(keywords, since, until)
query = query.replace(' ', '%20').replace('#', '%23').replace(':', '%3A')
def writeTweets(tweets):
    """Convert one page of scraped tweet elements into plain record dicts.

    Parameters
    ----------
    tweets : iterable
        The <li data-item-type="tweet"> elements (bs4 Tags) from one
        search-results page.

    Returns
    -------
    list of dict
        One record per organic tweet with keys "_id" (tweet id string),
        "text" (tweet body text) and "date" (epoch-milliseconds string from
        the timestamp element). Promoted tweets and malformed elements are
        skipped.

    NOTE(review): the original gist snippet was truncated (a ``try:`` with no
    ``except`` and the extracted values discarded); this is a minimal valid
    reconstruction that accumulates and returns the records.
    """
    newTweetRecords = []
    for tweet in tweets:
        try:
            # Promoted (ad) tweets carry this anchor class — skip them.
            if tweet.find("a", {"class": "js-action-profile-promoted"}):
                continue
            text = tweet.find("p", {"class": "tweet-text"}).get_text()
            date = tweet.find("span", {"class": "_timestamp"})["data-time-ms"]
            tweetId = tweet['data-item-id']
            newTweetRecords.append({"_id": tweetId, "text": text, "date": date})
        except (AttributeError, KeyError, TypeError):
            # Malformed markup (missing <p>/<span>/attribute) — skip element.
            continue
    return newTweetRecords
# --- Fetch-and-parse one search-results page (flattened gist snippet) ---
# Template for Twitter's incremental (paginated) search endpoint; {pos} is
# the cursor taken from "data-min-position" on the previous page.
RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' \
'default&include_available_features=1&include_entities=1&' \
'reset_error_state=false&src=typd&max_position={pos}&q={q}&l={lang}'
# NOTE(review): `url` is not defined in this snippet — presumably built from
# INIT_URL/RELOAD_URL in surrounding code not shown here; confirm.
response = requests.get(url, headers=HEADER)
soup = BeautifulSoup(response.text, 'lxml')
# Each tweet on the page is an <li data-item-type="tweet"> element.
tweets = soup.find_all("li", {"data-item-type": "tweet"})
writeTweets(tweets)
# Cursor for the next page, fed back through RELOAD_URL's {pos}.
next_pointer = soup.find("div", {"class": "stream-container"})["data-min-position"]
# --- Scrape/insert loop (flattened gist snippet; indentation and the tail of
# the `if` branch were lost in extraction) ---
for i in range(10000):
# Placeholder definition: tweet parsing into newTweetRecords lives in the
# extraction example referenced by the docstring.
def writeTweets(tweets):
'''
See Extraction_Example.py to see how to parse tweets into newTweetRecords
'''
try:
# Unordered bulk insert so one duplicate does not abort the whole batch.
result = collection.insert_many(newTweetRecords, ordered=False)
i += len(newTweetRecords)
except pymongo.errors.BulkWriteError as e:
# Error code 11000 is MongoDB's duplicate-key error — expected when
# re-scraping overlapping windows, so it is filtered out; anything else
# is treated as a real failure ("panic").
panic = list(filter(lambda x: x['code'] != 11000, e.details['writeErrors']))
# NOTE(review): the branch body is truncated in this snippet — presumably
# it re-raises or logs; confirm against the full gist.
if len(panic) > 0:
def main():
    """Drain the queriesTodo collection, executing each stored query.

    Each document is atomically claimed (removed) from the queue with
    find_one_and_delete, so multiple workers can share one queue; the loop
    stops when the collection is empty.
    """
    client = MongoClient('mongodb://mongodb:27017', username='<username>', password='<password>', authSource='admin')
    tweetCol = client['twitter']['tweets']
    todoCol = client['twitter']['queriesTodo']
    while True:
        # Claim the next pending query; None means the queue is drained.
        job = todoCol.find_one_and_delete({})
        if job is None:
            break
        executeQuery(job["qWords"], job["since"], job["until"], tweetCol)
# --- docker-compose stack definition (flattened snippet; original YAML
# indentation was lost in extraction) ---
version: '3.1'
services:
mongodb:
image: mongo:4.0.0-xenial
# Persist the database on the host so droplet restarts keep data.
volumes:
- './mongodb:/data/db'
networks:
- backend
deploy:
# NOTE(review): snippet is truncated here — the placement constraints
# follow in the full gist.
placement:
# --- Seed the queriesTodo work queue with date-window queries ---
client = MongoClient('mongodb://<mongo ip>:27017', username='<username>', password='<password>', authSource='admin')
todoCol = client['twitter']['queriesTodo']
queryDates = []
# Window start date.
# NOTE(review): `since` never advances inside the loop, so every query spans
# 2014-01-01 .. until (ever-growing overlapping windows). Daily windows
# (since = previous until) look intended, matching the one-day example
# elsewhere in the gist — confirm before relying on this.
since = date(2014,1,1)
# Already URL-encoded ("btc, OR bitcoin, OR crypto").
qWords = "btc%2C%20OR%20bitcoin%2C%20OR%20crypto"
daterange = pd.date_range(date(2014,1,2), date(2019,7,5))
for until in daterange:
query = {"since" : since.strftime("%Y-%m-%d"), "until": until.strftime("%Y-%m-%d"), "qWords": qWords}
queryDates.append(query)
# NOTE(review): snippet is truncated — queryDates is presumably bulk-inserted
# into todoCol in the full gist.
import digitalocean
from pymongo import MongoClient
# --- DigitalOcean worker-droplet provisioning (truncated gist snippet) ---
#Creation
# cloud-init user data injected into each droplet at first boot (the
# cloud-config shown later in the gist: firewall rules + docker restart).
with open('/Users/dhefferna/Desktop/TwitterProject/Admin/cloud-config.yml', 'r') as file:
USER_DATA = file.read()
SSH = ["ssh pubkey"]
key = "<DigitalOcean API Key>"
# Spin up 100 worker droplets.
for i in range(100):
# NOTE(review): the constructor call is truncated mid-arguments in this
# snippet — the remaining kwargs (region, image, size, user_data, ssh_keys)
# are in the full gist.
droplet = digitalocean.Droplet(token=key,
#cloud-config
# Commands run once at first boot: open the firewall ports the swarm needs —
# SSH (22), Docker daemon TLS (2376), swarm management (2377), node gossip
# (7946 tcp/udp) and the overlay network (4789/udp) — then reload ufw and
# restart docker so it picks up the new rules.
runcmd:
- ufw allow 22/tcp
- ufw allow 2376/tcp
- ufw allow 2377/tcp
- ufw allow 7946/tcp
- ufw allow 7946/udp
- ufw allow 4789/udp
- ufw reload
- systemctl restart docker