This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import pandas as pd | |
from bs4 import BeautifulSoup | |
import time | |
import json | |
import datetime | |
from pymongo import MongoClient | |
import pymongo | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
# Search-page URL template: {q} takes the percent-encoded query string and
# {lang} the language code.
INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q={q}&l={lang}'

# Example search parameters. Each name gets its own assignment statement;
# joining them with commas on a single line is a SyntaxError in Python
# ("cannot assign to literal").
lang = 'en'
keywords = "bitcoin%2C%20crypto%2C%20btc"  # already percent-encoded
since = "2019-07-05"
until = "2019-07-05"

# User-Agent string sent with every request.
myUa = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
HEADER = {'User-Agent': myUa}

# Build "<keywords> since:<date> until:<date>" and escape only the characters
# introduced here (space, '#', ':'). A general URL-quoting helper is avoided
# on purpose: `keywords` is already percent-encoded and its '%' characters
# must not be escaped a second time.
query = '{} since:{} until:{}'.format(keywords, since, until)
query = query.replace(' ', '%20').replace('#', '%23').replace(':', '%3A')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# tweets is an iterable of bs4 (BeautifulSoup) elements, one per tweet in the HTML
def writeTweets(tweets): | |
newTweetRecords = [] | |
for tweet in tweets: | |
try: | |
if tweet.find("a", {"class" : "js-action-profile-promoted"}): | |
continue | |
text = tweet.find("p", {"class" : "tweet-text"}).get_text() | |
date = tweet.find("span", {"class" : "_timestamp"})["data-time-ms"] | |
tweetId = tweet['data-item-id'] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' \ | |
'default&include_available_features=1&include_entities=1&' \ | |
'reset_error_state=false&src=typd&max_position={pos}&q={q}&l={lang}' | |
response = requests.get(url, headers=HEADER) | |
soup = BeautifulSoup(response.text, 'lxml') | |
tweets = soup.find_all("li", {"data-item-type": "tweet"}) | |
writeTweets(tweets) | |
next_pointer = soup.find("div", {"class": "stream-container"})["data-min-position"] | |
for i in range(10000): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def writeTweets(tweets): | |
''' | |
See Extraction_Example.py to see how to parse tweets into newTweetRecords
''' | |
try: | |
result = collection.insert_many(newTweetRecords, ordered=False) | |
i += len(newTweetRecords) | |
except pymongo.errors.BulkWriteError as e: | |
panic = list(filter(lambda x: x['code'] != 11000, e.details['writeErrors'])) | |
if len(panic) > 0: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def main():
    """Run every query stored in the ``queriesTodo`` collection.

    Connects to MongoDB, then repeatedly takes one document out of
    ``twitter.queriesTodo`` and passes its ``qWords``, ``since`` and
    ``until`` fields, together with the ``twitter.tweets`` collection, to
    ``executeQuery``. Returns once no document is left.
    """
    client = MongoClient('mongodb://mongodb:27017', username='<username>', password='<password>', authSource='admin')
    tweetCol = client['twitter']['tweets']
    todoCol = client['twitter']['queriesTodo']

    # find_one_and_delete removes the document it hands back, so each query
    # is taken off the to-do list as it is picked up. It returns None when
    # the collection has no matching document, which ends the loop.
    currentQuery = todoCol.find_one_and_delete({})
    while currentQuery is not None:
        executeQuery(currentQuery["qWords"], currentQuery["since"], currentQuery["until"], tweetCol)
        currentQuery = todoCol.find_one_and_delete({})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
version: '3.1' | |
services: | |
mongodb: | |
image: mongo:4.0.0-xenial | |
volumes: | |
- './mongodb:/data/db' | |
networks: | |
- backend | |
deploy: | |
placement: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
client = MongoClient('mongodb://<mongo ip>:27017', username='<username>', password='<password>', authSource='admin') | |
todoCol = client['twitter']['queriesTodo'] | |
queryDates = [] | |
since = date(2014,1,1) | |
qWords = "btc%2C%20OR%20bitcoin%2C%20OR%20crypto" | |
daterange = pd.date_range(date(2014,1,2), date(2019,7,5)) | |
for until in daterange: | |
query = {"since" : since.strftime("%Y-%m-%d"), "until": until.strftime("%Y-%m-%d"), "qWords": qWords} | |
queryDates.append(query) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import digitalocean | |
from pymongo import MongoClient | |
#Creation | |
with open('/Users/dhefferna/Desktop/TwitterProject/Admin/cloud-config.yml', 'r') as file: | |
USER_DATA = file.read() | |
SSH = ["ssh pubkey"] | |
key = "<DigitalOcean API Key>" | |
for i in range(100): | |
droplet = digitalocean.Droplet(token=key, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#cloud-config | |
runcmd: | |
- ufw allow 22/tcp | |
- ufw allow 2376/tcp | |
- ufw allow 2377/tcp | |
- ufw allow 7946/tcp | |
- ufw allow 7946/udp | |
- ufw allow 4789/udp | |
- ufw reload | |
- systemctl restart docker |
OlderNewer