Skip to content

Instantly share code, notes, and snippets.

@heffo42
heffo42 / main.py
Created July 6, 2019 23:03
Twitter Scraping Script
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import json
import datetime
from pymongo import MongoClient
import pymongo
@heffo42
heffo42 / request.py
Created July 6, 2019 23:22
Launch a Twitter Advanced Search HTTP Request
import requests
# Template for the first (non-paginated) Twitter advanced-search request.
INIT_URL = 'https://twitter.com/search?f=tweets&vertical=default&q={q}&l={lang}'

# Search parameters. `keywords` is already URL-encoded
# ("bitcoin, crypto, btc"); since/until bound the date window.
# NOTE: the original paste collapsed these onto one comma-separated line,
# which is a Python syntax error — they must be separate assignments.
lang = 'en'
keywords = "bitcoin%2C%20crypto%2C%20btc"
since = "2019-07-05"
until = "2019-07-05"

# Spoof a desktop browser so Twitter serves the HTML timeline.
myUa = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
HEADER = {'User-Agent': myUa}

# Build the advanced-search query string, then percent-encode the few
# characters Twitter's search URL is sensitive to.
query = '{} since:{} until:{}'.format(keywords, since, until)
query = query.replace(' ', '%20').replace('#', '%23').replace(':', '%3A')
def writeTweets(tweets):
    """Convert one page of scraped tweet elements into plain record dicts.

    Parameters
    ----------
    tweets : iterable
        The <li data-item-type="tweet"> elements (bs4 Tags) from one
        search-results page.

    Returns
    -------
    list of dict
        One record per organic tweet with keys "_id" (tweet id string),
        "text" (tweet body text) and "date" (epoch-milliseconds string from
        the timestamp element). Promoted tweets and malformed elements are
        skipped.

    NOTE(review): the original gist snippet was truncated (a ``try:`` with no
    ``except`` and the extracted values discarded); this is a minimal valid
    reconstruction that accumulates and returns the records.
    """
    newTweetRecords = []
    for tweet in tweets:
        try:
            # Promoted (ad) tweets carry this anchor class — skip them.
            if tweet.find("a", {"class": "js-action-profile-promoted"}):
                continue
            text = tweet.find("p", {"class": "tweet-text"}).get_text()
            date = tweet.find("span", {"class": "_timestamp"})["data-time-ms"]
            tweetId = tweet['data-item-id']
            newTweetRecords.append({"_id": tweetId, "text": text, "date": date})
        except (AttributeError, KeyError, TypeError):
            # Malformed markup (missing <p>/<span>/attribute) — skip element.
            continue
    return newTweetRecords
# --- Fetch-and-parse one search-results page (flattened gist snippet) ---
# Template for Twitter's incremental (paginated) search endpoint; {pos} is
# the cursor taken from "data-min-position" on the previous page.
RELOAD_URL = 'https://twitter.com/i/search/timeline?f=tweets&vertical=' \
'default&include_available_features=1&include_entities=1&' \
'reset_error_state=false&src=typd&max_position={pos}&q={q}&l={lang}'
# NOTE(review): `url` is not defined in this snippet — presumably built from
# INIT_URL/RELOAD_URL in surrounding code not shown here; confirm.
response = requests.get(url, headers=HEADER)
soup = BeautifulSoup(response.text, 'lxml')
# Each tweet on the page is an <li data-item-type="tweet"> element.
tweets = soup.find_all("li", {"data-item-type": "tweet"})
writeTweets(tweets)
# Cursor for the next page, fed back through RELOAD_URL's {pos}.
next_pointer = soup.find("div", {"class": "stream-container"})["data-min-position"]
# --- Scrape/insert loop (flattened gist snippet; indentation and the tail of
# the `if` branch were lost in extraction) ---
for i in range(10000):
# Placeholder definition: tweet parsing into newTweetRecords lives in the
# extraction example referenced by the docstring.
def writeTweets(tweets):
'''
See Extraction_Example.py to see how to parse tweets into newTweetRecords
'''
try:
# Unordered bulk insert so one duplicate does not abort the whole batch.
result = collection.insert_many(newTweetRecords, ordered=False)
i += len(newTweetRecords)
except pymongo.errors.BulkWriteError as e:
# Error code 11000 is MongoDB's duplicate-key error — expected when
# re-scraping overlapping windows, so it is filtered out; anything else
# is treated as a real failure ("panic").
panic = list(filter(lambda x: x['code'] != 11000, e.details['writeErrors']))
# NOTE(review): the branch body is truncated in this snippet — presumably
# it re-raises or logs; confirm against the full gist.
if len(panic) > 0:
def main():
    """Drain the queriesTodo collection, executing each stored query.

    Each document is atomically claimed (removed) from the queue with
    find_one_and_delete, so multiple workers can share one queue; the loop
    stops when the collection is empty.
    """
    client = MongoClient('mongodb://mongodb:27017', username='<username>', password='<password>', authSource='admin')
    tweetCol = client['twitter']['tweets']
    todoCol = client['twitter']['queriesTodo']
    while True:
        # Claim the next pending query; None means the queue is drained.
        job = todoCol.find_one_and_delete({})
        if job is None:
            break
        executeQuery(job["qWords"], job["since"], job["until"], tweetCol)
# --- docker-compose stack definition (flattened snippet; original YAML
# indentation was lost in extraction) ---
version: '3.1'
services:
mongodb:
image: mongo:4.0.0-xenial
# Persist the database on the host so droplet restarts keep data.
volumes:
- './mongodb:/data/db'
networks:
- backend
deploy:
# NOTE(review): snippet is truncated here — the placement constraints
# follow in the full gist.
placement:
# --- Seed the queriesTodo work queue with date-window queries ---
client = MongoClient('mongodb://<mongo ip>:27017', username='<username>', password='<password>', authSource='admin')
todoCol = client['twitter']['queriesTodo']
queryDates = []
# Window start date.
# NOTE(review): `since` never advances inside the loop, so every query spans
# 2014-01-01 .. until (ever-growing overlapping windows). Daily windows
# (since = previous until) look intended, matching the one-day example
# elsewhere in the gist — confirm before relying on this.
since = date(2014,1,1)
# Already URL-encoded ("btc, OR bitcoin, OR crypto").
qWords = "btc%2C%20OR%20bitcoin%2C%20OR%20crypto"
daterange = pd.date_range(date(2014,1,2), date(2019,7,5))
for until in daterange:
query = {"since" : since.strftime("%Y-%m-%d"), "until": until.strftime("%Y-%m-%d"), "qWords": qWords}
queryDates.append(query)
# NOTE(review): snippet is truncated — queryDates is presumably bulk-inserted
# into todoCol in the full gist.
import digitalocean
from pymongo import MongoClient
# --- DigitalOcean worker-droplet provisioning (truncated gist snippet) ---
#Creation
# cloud-init user data injected into each droplet at first boot (the
# cloud-config shown later in the gist: firewall rules + docker restart).
with open('/Users/dhefferna/Desktop/TwitterProject/Admin/cloud-config.yml', 'r') as file:
USER_DATA = file.read()
SSH = ["ssh pubkey"]
key = "<DigitalOcean API Key>"
# Spin up 100 worker droplets.
for i in range(100):
# NOTE(review): the constructor call is truncated mid-arguments in this
# snippet — the remaining kwargs (region, image, size, user_data, ssh_keys)
# are in the full gist.
droplet = digitalocean.Droplet(token=key,
#cloud-config
# Commands run once at first boot: open the firewall ports the swarm needs —
# SSH (22), Docker daemon TLS (2376), swarm management (2377), node gossip
# (7946 tcp/udp) and the overlay network (4789/udp) — then reload ufw and
# restart docker so it picks up the new rules.
runcmd:
- ufw allow 22/tcp
- ufw allow 2376/tcp
- ufw allow 2377/tcp
- ufw allow 7946/tcp
- ufw allow 7946/udp
- ufw allow 4789/udp
- ufw reload
- systemctl restart docker