Skip to content

Instantly share code, notes, and snippets.

@nmolivo
nmolivo / acronym_text_matcher.py
Last active February 20, 2020 08:02
Enter a text and an acronym to find what the acronym stands for, provided the unabbreviated form of the acronym appears in the text. Assumes the text containing the acronym's meaning appears before the acronym itself, if the acronym is mentioned. Case-sensitive. Allows for 2 stop words.
from nltk.corpus import stopwords
def find_org(text, acronym):
# good for abbrevs that are in all caps.
try:
text = text.split(acronym)[0]
except:
text = text
orig_text_token_list = text.split(" ")
text_token_list = [x.title() for x in orig_text_token_list]
import pickle
import boto3
import boto3.session
cred = boto3.Session().get_credentials()
ACCESS_KEY = cred.access_key
SECRET_KEY = cred.secret_key
SESSION_TOKEN = cred.token ## optional
s3client = boto3.client('s3',
# Programmatically rewrite the metadata of the HTML files in the S3 bucket so
# that links to them open in a browser tab instead of downloading to your
# computer when clicked.
client = boto3.client('s3', aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)
for key in html_list:
    # Copy each object onto itself with MetadataDirective='REPLACE' so the new
    # ContentType is stored. Source and destination use the same bucket name.
    client.copy_object(
        Bucket=bucket_name,
        Key=key,
        CopySource={'Bucket': bucket_name, 'Key': key},
        ContentType='text/html',
        MetadataDirective='REPLACE',
    )
# Open the doc from the local docs folder and extract its XML.
pathway = '/home/ec2-user/ec2docs/' + file
# The context manager closes the archive even if reading the member fails.
with zipfile.ZipFile(pathway) as document:
    xml_content = document.read('word/document.xml')
# NOTE(review): on Python 3, str() of bytes yields the b'...' repr rather than
# decoded text -- confirm whether xml_content.decode('utf-8') is intended.
xml_str = str(xml_content)
# Create the link list for the doc by going through the XML and finding the links.
# Each match starts with '>http' and ends with '<', inclusive.
link_list = re.findall(r'>http.*?<', xml_str)
#Save our s3 word docs to the EC2 instance
bucket_name = 'name_of_s3_bucket' # replace with your bucket name
session = Session(aws_access_key_id=ACCESS_KEY,
aws_secret_access_key=SECRET_KEY)
s3 = session.resource('s3')
#loop through all files in file_list and download them to the EC2 instance.
for word_doc in file_list:
#word_doc is the name of our file, as a string.
### Create list of files to download to EC2 Instance
from boto3.session import Session

# Placeholder credentials: substitute your own values before running.
ACCESS_KEY = 'your_access_key'
SECRET_KEY = 'your_secret_key'

# Build a session from the keys, then an S3 resource, then the bucket object.
session = Session(aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)
s3 = session.resource('s3')
your_bucket = s3.Bucket('name_of_s3_bucket')
@nmolivo
nmolivo / adfuller-test-viz
Created March 20, 2018 20:34
adfuller-test
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
#Determine rolling statistics
rolmean = pd.rolling_mean(timeseries, window=96)
rolstd = pd.rolling_std(timeseries, window=96)
#I use 96 as my window because that is how many of my observations are contained in one 24-hr-period
#Plot rolling statistics:
fig = plt.figure(figsize=(12, 8))
@nmolivo
nmolivo / bokeh-valenbisi-map
Created March 17, 2018 18:51
bokeh-valenbisi-map
from bokeh.io import output_file, show
from bokeh.models import (
GMapPlot, GMapOptions, ColumnDataSource, Circle, Range1d, PanTool, WheelZoomTool, BoxSelectTool, HoverTool,
)
from bokeh.plotting import figure
# Map options: centre at lat 39.4699 / lng -0.3763, roadmap style, zoom level 13.
map_options = GMapOptions(lat=39.4699, lng=-0.3763, map_type="roadmap", zoom=13)
# Google-map plot with empty x/y ranges, configured by the options above.
plot = GMapPlot(x_range=Range1d(), y_range=Range1d(), map_options=map_options)
# Title shown on the plot.
plot.title.text = "Valenbisi Stations"
@nmolivo
nmolivo / long-lat-selenium
Created February 8, 2018 08:17
long-lat-selenium
# Search Google Maps for each school using a Firefox WebDriver session.
driver = webdriver.Firefox()
wait = WebDriverWait(driver, 30)
coords = []
driver.get('https://www.google.com/maps')
for school in schools:
    # Wait (up to the 30 s configured above) for the search box to be present.
    searchbox = wait.until(EC.presence_of_element_located((By.ID, 'searchboxinput')))
    searchbox.clear()
    searchbox.send_keys(school + ' school Washington DC')
    # The find_element_by_* helpers were removed in Selenium 4; the By-locator
    # form works on both Selenium 3 and 4.
    driver.find_element(By.ID, 'searchbox-searchbutton').click()
@nmolivo
nmolivo / selenium-import
Created February 8, 2018 08:12
selenium-import
import os
import pandas as pd
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary