Haaya Naushan (haayanau)

haayanau / get_job_posting.py
Created October 20, 2021 17:07
Scrape job posting sample
import time
import pandas as pd
import numpy as np
from newspaper import Article, ArticleException, Config
url = "https://neuvoo.com/view/?jpos=&jp=1&l=&id=5cc0b117a2cf&lang=en&sal-job=1&ss=1&context=serp&testid=champion&nb=true&reqid=05412eb2337b92dbca9c2ea510dc2053&source=neuvoosearch"
def download_single_article(link):
    practice = False  # flag kept from the gist preview, which truncates after this line
    article = Article(link, config=Config())  # assumed completion: fetch and parse with newspaper3k
    try:
        article.download()
        article.parse()
        return article.text
    except ArticleException:
        return None
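The preview imports pandas and numpy but is cut off before either is used; a hypothetical usage sketch (links and postings_df are illustrative names, not from the gist):

links = [url]  # in practice, a scraped list of posting urls
texts = [download_single_article(link) for link in links]
postings_df = pd.DataFrame({'url': links, 'text': texts})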
haayanau / resolve_urls.py
Last active October 20, 2021 17:44
Resolve urls
import requests

def resolve_url(my_website):
    # follow redirects to recover the final destination of a shortened url
    result = {}
    result['cleaned_url'] = my_website
    r = requests.get(my_website, allow_redirects=True, timeout=5)
    my_resolved_website = r.url
    result['resolved_url'] = my_resolved_website
    return result
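A hypothetical call (the t.co link and its destination are made up for illustration):

resolve_url("https://t.co/exampl3")
# -> {'cleaned_url': 'https://t.co/exampl3',
#     'resolved_url': 'https://www.example.com/full-article'}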
haayanau / sample_code_DIME_world_bank_data_projects.py
Last active September 30, 2021 15:37
Sample code for DIME Data Science Consultant (World Bank data projects) position
"""This script is for a disinformation investigation about voter fraud done in collaboration with
researchers at the Shorenstein Center at the Harvard Kennedy School. The code contained in this
file is written by Haaya Naushan. This investigation required processing roughly 30 million tweets,
many of these tweets contained urls linking to fake news. Analysis required resolving the urls from
their shortened "tco" form (Twitter's format for shortening urls) and determining the Alexa web rank.
Processing data of this size required multi-threading of several processes, and many supporting
functions that are contained within a separate Python library for modularity. Concurrency
speeds up this script significantly, and is possible with multi-threading since we are only running
I/O operations in parallel. The script runs from the command line with optional arguments to change
default settings. """
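The multi-threaded url resolution the docstring describes can be sketched with the standard library; this is an illustrative pattern reusing resolve_url from the gist above, not the original script (resolve_all and the worker count are assumed names and values):

from concurrent.futures import ThreadPoolExecutor

def resolve_all(urls, max_workers=32):
    # url resolution is pure I/O, so threads run it concurrently despite the GIL
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(resolve_url, urls))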
haayanau / hard_traveling_plot_causalforest.py
Last active May 13, 2021 13:31
Hard traveling plot Causal Forest
import matplotlib.pyplot as plt

# set plot size
fig, ax = plt.subplots(figsize=(12, 8))
# plot treatment effects for peripheral neighbourhoods
ax.plot(z_per['cate'],
        marker='.', linestyle='-', linewidth=0.5, label='CATE_below_median', color='steelblue')
# uncomment for confidence intervals for peripheral neighbourhoods
# ax.plot(z_per['lb'],
#         marker='.', linestyle='-', linewidth=0.5, color='green')
# ax.plot(z_per['ub'],
#         marker='.', linestyle='-', linewidth=0.5, color='green')  # completed to mirror the lb line above
haayanau / hard_traveling_causalforest_predict_CATE.py
Last active May 13, 2021 11:44
Hard traveling Causal Forest predict treatment effects
# predict CATEs with 95% confidence intervals for each subgroup
treatment_effects_per, lb_per, ub_per = est_per.predict(X, interval=True, alpha=0.05)
treatment_effects_not_per, lb_not_per, ub_not_per = est_not_per.predict(X2, interval=True, alpha=0.05)
te_per = []
for i in range(len(treatment_effects_per)):
    dict_te = {}
    # sum the effects of the two treatments (obstruction + protection) per observation
    dict_te['cate'] = treatment_effects_per[i][0] + treatment_effects_per[i][1]
    dict_te['lb'] = lb_per[i][0] + lb_per[i][1]
    dict_te['ub'] = ub_per[i][0] + ub_per[i][1]
    te_per.append(dict_te)
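The plotting gist above reads a smoothed z_per; a plausible bridge from te_per, mirroring the sort-and-smooth steps of the DeepIV prediction gist below (the window size is copied from there, not confirmed for this script):

te_df_per = pd.DataFrame(te_per)
te_df_per.sort_values('cate', inplace=True, ascending=True)
te_df_per.reset_index(inplace=True, drop=True)
z_per = te_df_per.rolling(window=15, center=True).mean()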
haayanau / hard_traveling_causalforest_model.py
Last active May 13, 2021 11:41
Hard traveling Causal Forest model
from econml.grf import CausalIVForest
# set parameters for the causal forest for each subgroup
est_per = CausalIVForest(criterion='het',
                         n_estimators=10000,
                         min_samples_leaf=5,
                         max_depth=None,
                         max_samples=0.5,
                         honest=True,
                         inference=True)  # closing paren added; any further arguments are truncated in the preview
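A hedged sketch of the fit step that would precede the prediction gist above, assuming econml's generalized random forest API; X (covariates), T (the two treatments), y (outcome), and the instrument Z are placeholder names, not taken from the gist:

est_per.fit(X, T, y, Z=Z)  # an instrumental-variable forest also needs an instrument Z
treatment_effects_per, lb_per, ub_per = est_per.predict(X, interval=True, alpha=0.05)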
haayanau / hard_traveling_causalforest_variables.py
Last active May 12, 2021 21:47
Hard traveling Causal Forest variables
from sklearn.model_selection import train_test_split
# split dataset by subgroup: the 1884/1885 cutoff matches the below-/above-median
# population labels used in the plotting gists
df_per = df[df['population_total'] <= 1884]
df_not_per = df[df['population_total'] >= 1885]
# split for train and test sets for each subgroup
train_per, test_per = train_test_split(df_per, test_size=0.2)
train_not_per, test_not_per = train_test_split(df_not_per, test_size=0.2)
# treatment and outcome column names
treatment = ['obstruction', 'protection']
outcome = ['chng_employment']
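One plausible way these column lists feed the estimators, assuming every remaining column serves as a covariate (the gist does not show this step):

X = train_per.drop(columns=treatment + outcome).values  # covariates (assumed)
T = train_per[treatment].values                         # two treatment columns
y = train_per[outcome].values.ravel()                   # single outcome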
haayanau / hard_traveling_plot_CATE.py
Last active May 12, 2021 21:08
Hard traveling - plot conditional average treatment effects
import matplotlib.pyplot as plt

# set plot size
fig, ax = plt.subplots(figsize=(12, 8))
# plot lines for treatment effects
ax.plot(z_per['cate'],
        marker='.', linestyle='-',
        linewidth=0.5,
        label='CATE_population_below_median',
        color='indigo'
        )
haayanau / hard_traveling_predict_CATE.py
Created May 12, 2021 13:23
Hard traveling - predict conditional average treatment effects
import pandas as pd

# predict treatment effects for smaller peripheral neighbourhoods
y_pred_per = deepIvEst_per.predict(t, x)
te_df_per = pd.DataFrame(y_pred_per, columns=['cate'])
te_df_per.sort_values('cate', inplace=True, ascending=True)
te_df_per.reset_index(inplace=True, drop=True)
# smooth with a centered rolling mean for plotting
z_per = te_df_per.rolling(window=15, center=True).mean()
# predict treatment effects for larger central neighbourhoods
y_pred_not_per = deepIvEst_not_per.predict(t2, x2)
te_df_not_per = pd.DataFrame(y_pred_not_per, columns=['cate'])
haayanau / hard_traveling_deepiv_parameters.py
Last active May 12, 2021 16:21
Hard traveling DeepIV parameters
# code adapted from https://microsoft.github.io/dowhy/example_notebooks/dowhy-conditional-treatment-effects.html
import keras

# early stopping keeps the first- and second-stage networks from overfitting
keras_fit_options_1 = {"epochs": 50,
                       "validation_split": 0.1,
                       "callbacks": [keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)]
                       }
keras_fit_options_2 = {"epochs": 100,
                       "validation_split": 0.1,
                       "callbacks": [keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)]
                       }
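These option dicts plug into econml's DeepIV estimator as first- and second-stage fit options. A hedged sketch of that wiring, adapted from the dowhy notebook cited above; the import path, the dense-net architectures, and n_components=10 are assumptions, not the gist's confirmed settings:

from econml.iv.nnet import DeepIV  # older econml versions expose this as econml.deepiv.DeepIVEstimator

# simple dense nets; layer sizes are illustrative, input shapes are inferred on first call
treatment_model = keras.Sequential([keras.layers.Dense(64, activation='relu'),
                                    keras.layers.Dense(32, activation='relu')])
response_model = keras.Sequential([keras.layers.Dense(64, activation='relu'),
                                   keras.layers.Dense(32, activation='relu'),
                                   keras.layers.Dense(1)])

deepIvEst_per = DeepIV(n_components=10,  # mixture components for the first-stage density
                       m=lambda z, x: treatment_model(keras.layers.concatenate([z, x])),
                       h=lambda t, x: response_model(keras.layers.concatenate([t, x])),
                       n_samples=1,
                       first_stage_options=keras_fit_options_1,
                       second_stage_options=keras_fit_options_2)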