Natalie Olivo nmolivo

## acronym_text_matcher.py
from nltk.corpus import stopwords

def find_org(text, acronym):
    """Title-case the tokens of ``text`` that appear before ``acronym``.

    Only the portion of ``text`` preceding the first occurrence of
    ``acronym`` is kept; it is split on single spaces and every token is
    title-cased.

    NOTE(review): the body visible here stops after building
    ``text_token_list`` and returns nothing -- presumably the matching
    logic continues elsewhere; confirm against the full script.
    """
    # good for abbrevs that are in all caps.
    try:
        text = text.split(acronym)[0]
    except (ValueError, TypeError):
        # str.split raises ValueError for an empty separator and TypeError
        # for a non-string one; in both cases keep the whole text. Anything
        # else (e.g. KeyboardInterrupt) is no longer silently swallowed.
        pass
    orig_text_token_list = text.split(" ")
    text_token_list = [x.title() for x in orig_text_token_list]

## s3-pickle-import.py
import pickle
import boto3
import boto3.session

cred = boto3.Session().get_credentials()
ACCESS_KEY = cred.access_key
SECRET_KEY = cred.secret_key
SESSION_TOKEN = cred.token  ## optional

s3client = boto3.client('s3',

## tesu_scraper_04
# programmatically edit metadata of s3 bucket files, so the links open in a browser tab and do not download
# to your computer when clicked
client = boto3.client('s3', aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)

for key in html_list:
    # Kept from the original: presumably an existence check on the object
    # before it is rewritten -- confirm.
    client.head_object(Bucket=bucket_name, Key=key)
    # Copy the object onto itself with MetadataDirective='REPLACE' so the
    # Content-Type becomes text/html. The source bucket is `bucket_name`
    # (the original referenced `bucket`, which is not defined in this
    # snippet), and the dict form of CopySource avoids hand-building a
    # "bucket/key" string.
    client.copy_object(
        Bucket=bucket_name,
        Key=key,
        CopySource={'Bucket': bucket_name, 'Key': key},
        ContentType='text/html',
        MetadataDirective='REPLACE',
    )


## tesu_scraper_03
  #open doc, from folder 'ec2docs', extract XML coding
  pathway = '/home/ec2-user/ec2docs/'+file
  # The context manager closes the archive even if reading the member raises.
  with zipfile.ZipFile(pathway) as document:
      xml_content = document.read('word/document.xml')
  # NOTE(review): on Python 3, ZipFile.read returns bytes, so str() yields the
  # "b'...'" repr rather than decoded text -- confirm whether
  # xml_content.decode('utf-8') is what is actually wanted here.
  xml_str = str(xml_content)

  #create linklist for doc, by going through the XML and finding the links
  # Raw string: '<' needs no escaping, and '\<' in a normal string literal is
  # an invalid escape sequence. The pattern matches exactly the same text.
  link_list = re.findall(r'>http.*?<', xml_str) #it returns text starting with '>http', ending with '<', inclusive.

## tesu_scraper_02
#Save our s3 word docs to the EC2 instance

bucket_name = 'name_of_s3_bucket' # replace with your bucket name
session = Session(aws_access_key_id=ACCESS_KEY,
                  aws_secret_access_key=SECRET_KEY)
s3 = session.resource('s3')

#for loop to go through all files in the file_list and download them to the EC2 instance.
for word_doc in file_list:
#word_doc is the name of our file, as a string.

## tesu_scraper_01
### Create list of files to download to EC2 Instance
from boto3.session import Session

ACCESS_KEY='your_access_key'
SECRET_KEY='your_secret_key'

session = Session(aws_access_key_id=ACCESS_KEY,
                  aws_secret_access_key=SECRET_KEY)
s3 = session.resource('s3')
your_bucket = s3.Bucket('name_of_s3_bucket')

## adfuller-test-viz
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
    """Compute rolling statistics of ``timeseries`` and start a plot.

    A rolling mean and rolling standard deviation are computed over a
    96-observation window, then a 12x8 figure is created.

    NOTE(review): the body visible here ends after creating the figure;
    the plotting and ADF test presumably follow -- confirm against the
    full script. ``timeseries`` is assumed to be a pandas Series or
    DataFrame (it must provide ``.rolling``).
    """

    #Determine rolling statistics
    #I use 96 as my window because that is how many of my observations are contained in one 24-hr-period
    # pd.rolling_mean / pd.rolling_std were removed from pandas; the
    # .rolling() accessor is the replacement.
    rolmean = timeseries.rolling(window=96).mean()
    rolstd = timeseries.rolling(window=96).std()

    #Plot rolling statistics:
    fig = plt.figure(figsize=(12, 8))

## bokeh-valenbisi-map
from bokeh.io import output_file, show
from bokeh.models import (
  GMapPlot, GMapOptions, ColumnDataSource, Circle, Range1d, PanTool, WheelZoomTool, BoxSelectTool, HoverTool,
)
from bokeh.plotting import figure

map_options = GMapOptions(lat=39.4699, lng=-0.3763, map_type="roadmap", zoom=13)

plot = GMapPlot(x_range=Range1d(), y_range=Range1d(), map_options=map_options)
plot.title.text = "Valenbisi Stations"

## long-lat-selenium
#function to webscrape
# Open Google Maps once, then search for each school by name.
driver = webdriver.Firefox()
wait = WebDriverWait(driver, 30)  # wait up to 30 s for elements to appear
coords = []
driver.get('https://www.google.com/maps')
for school in schools:
    searchbox = wait.until(EC.presence_of_element_located((By.ID, 'searchboxinput')))
    searchbox.clear()
    searchbox.send_keys(school + ' school Washington DC')
    # find_element_by_id was removed in Selenium 4.3; find_element(By.ID, ...)
    # is the supported form and also exists in older releases.
    driver.find_element(By.ID, 'searchbox-searchbutton').click()

## selenium-import
import os
import pandas as pd
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
	from nltk.corpus import stopwords

	def find_org(text, acronym):
		"""Title-case the tokens of ``text`` that appear before ``acronym``.

		Only the portion of ``text`` preceding the first occurrence of
		``acronym`` is kept; it is split on single spaces and every token is
		title-cased.

		NOTE(review): the body visible here stops after building
		``text_token_list`` and returns nothing -- presumably the matching
		logic continues elsewhere; confirm against the full script.
		"""
		# good for abbrevs that are in all caps.
		try:
			text = text.split(acronym)[0]
		except (ValueError, TypeError):
			# str.split raises ValueError for an empty separator and TypeError
			# for a non-string one; in both cases keep the whole text. Anything
			# else (e.g. KeyboardInterrupt) is no longer silently swallowed.
			pass
		orig_text_token_list = text.split(" ")
		text_token_list = [x.title() for x in orig_text_token_list]
	import pickle
	import boto3
	import boto3.session

	cred = boto3.Session().get_credentials()
	ACCESS_KEY = cred.access_key
	SECRET_KEY = cred.secret_key
	SESSION_TOKEN = cred.token ## optional

	s3client = boto3.client('s3',
	# programmatically edit metadata of s3 bucket files, so the links open in a browser tab and do not download
	# to your computer when clicked
	client = boto3.client('s3', aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)

	for key in html_list:
		# Kept from the original: presumably an existence check on the object
		# before it is rewritten -- confirm.
		client.head_object(Bucket=bucket_name, Key=key)
		# Copy the object onto itself with MetadataDirective='REPLACE' so the
		# Content-Type becomes text/html. The source bucket is `bucket_name`
		# (the original referenced `bucket`, which is not defined in this
		# snippet), and the dict form of CopySource avoids hand-building a
		# "bucket/key" string.
		client.copy_object(
			Bucket=bucket_name,
			Key=key,
			CopySource={'Bucket': bucket_name, 'Key': key},
			ContentType='text/html',
			MetadataDirective='REPLACE',
		)
	#open doc, from folder 'ec2docs', extract XML coding
	pathway = '/home/ec2-user/ec2docs/'+file
	# The context manager closes the archive even if reading the member raises.
	with zipfile.ZipFile(pathway) as document:
		xml_content = document.read('word/document.xml')
	# NOTE(review): on Python 3, ZipFile.read returns bytes, so str() yields the
	# "b'...'" repr rather than decoded text -- confirm whether
	# xml_content.decode('utf-8') is what is actually wanted here.
	xml_str = str(xml_content)

	#create linklist for doc, by going through the XML and finding the links
	# Raw string: '<' needs no escaping, and '\<' in a normal string literal is
	# an invalid escape sequence. The pattern matches exactly the same text.
	link_list = re.findall(r'>http.*?<', xml_str) #it returns text starting with '>http', ending with '<', inclusive.
	#Save our s3 word docs to the EC2 instance

	bucket_name = 'name_of_s3_bucket' # replace with your bucket name
	session = Session(aws_access_key_id=ACCESS_KEY,
	aws_secret_access_key=SECRET_KEY)
	s3 = session.resource('s3')

	#for loop to go through all files in the file_list and download them to the EC2 instance.
	for word_doc in file_list:
	#word_doc is the name of our file, as a string.
	### Create list of files to download to EC2 Instance
	from boto3.session import Session

	ACCESS_KEY='your_access_key'
	SECRET_KEY='your_secret_key'

	session = Session(aws_access_key_id=ACCESS_KEY,
	aws_secret_access_key=SECRET_KEY)
	s3 = session.resource('s3')
	your_bucket = s3.Bucket('name_of_s3_bucket')
	from statsmodels.tsa.stattools import adfuller
	def test_stationarity(timeseries):
		"""Compute rolling statistics of ``timeseries`` and start a plot.

		A rolling mean and rolling standard deviation are computed over a
		96-observation window, then a 12x8 figure is created.

		NOTE(review): the body visible here ends after creating the figure;
		the plotting and ADF test presumably follow -- confirm against the
		full script. ``timeseries`` is assumed to be a pandas Series or
		DataFrame (it must provide ``.rolling``).
		"""

		#Determine rolling statistics
		#I use 96 as my window because that is how many of my observations are contained in one 24-hr-period
		# pd.rolling_mean / pd.rolling_std were removed from pandas; the
		# .rolling() accessor is the replacement.
		rolmean = timeseries.rolling(window=96).mean()
		rolstd = timeseries.rolling(window=96).std()

		#Plot rolling statistics:
		fig = plt.figure(figsize=(12, 8))
	from bokeh.io import output_file, show
	from bokeh.models import (
	GMapPlot, GMapOptions, ColumnDataSource, Circle, Range1d, PanTool, WheelZoomTool, BoxSelectTool, HoverTool,
	)
	from bokeh.plotting import figure

	map_options = GMapOptions(lat=39.4699, lng=-0.3763, map_type="roadmap", zoom=13)

	plot = GMapPlot(x_range=Range1d(), y_range=Range1d(), map_options=map_options)
	plot.title.text = "Valenbisi Stations"
	#function to webscrape
	# Open Google Maps once, then search for each school by name.
	driver = webdriver.Firefox()
	wait = WebDriverWait(driver, 30)  # wait up to 30 s for elements to appear
	coords = []
	driver.get('https://www.google.com/maps')
	for school in schools:
		searchbox = wait.until(EC.presence_of_element_located((By.ID, 'searchboxinput')))
		searchbox.clear()
		searchbox.send_keys(school + ' school Washington DC')
		# find_element_by_id was removed in Selenium 4.3; find_element(By.ID, ...)
		# is the supported form and also exists in older releases.
		driver.find_element(By.ID, 'searchbox-searchbutton').click()
	import os
	import pandas as pd
	from time import sleep

	from selenium import webdriver
	from selenium.webdriver.common.by import By
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.support import expected_conditions as EC
	from selenium.webdriver.firefox.firefox_binary import FirefoxBinary