Haaya Naushan (haayanau)

haayanau / get_job_posting.py
Created October 20, 2021 17:07
Scrape job posting sample
import time
import pandas as pd
import numpy as np
from newspaper import Article, ArticleException, Config
url = "https://neuvoo.com/view/?jpos=&jp=1&l=&id=5cc0b117a2cf&lang=en&sal-job=1&ss=1&context=serp&testid=champion&nb=true&reqid=05412eb2337b92dbca9c2ea510dc2053&source=neuvoosearch"
def download_single_article(link):
    practice = False  # flag kept from the gist preview, which truncates after this line
    article = Article(link, config=Config())  # assumed completion: fetch and parse with newspaper3k
    try:
        article.download()
        article.parse()
        return article.text
    except ArticleException:
        return None
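The preview imports pandas and numpy but is cut off before either is used; a hypothetical usage sketch (links and postings_df are illustrative names, not from the gist):

links = [url]  # in practice, a scraped list of posting urls
texts = [download_single_article(link) for link in links]
postings_df = pd.DataFrame({'url': links, 'text': texts})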
haayanau / resolve_urls.py
Last active October 20, 2021 17:44
Resolve urls
import requests

def resolve_url(my_website):
    # follow redirects to recover the final destination of a shortened url
    result = {}
    result['cleaned_url'] = my_website
    r = requests.get(my_website, allow_redirects=True, timeout=5)
    my_resolved_website = r.url
    result['resolved_url'] = my_resolved_website
    return result
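A hypothetical call (the t.co link and its destination are made up for illustration):

resolve_url("https://t.co/exampl3")
# -> {'cleaned_url': 'https://t.co/exampl3',
#     'resolved_url': 'https://www.example.com/full-article'}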
haayanau / sample_code_DIME_world_bank_data_projects.py
Last active September 30, 2021 15:37
Sample code for DIME Data Science Consultant (World Bank data projects) position
"""This script is for a disinformation investigation about voter fraud done in collaboration with
researchers at the Shorenstein Center at the Harvard Kennedy School. The code contained in this
file is written by Haaya Naushan. This investigation required processing roughly 30 million tweets,
many of these tweets contained urls linking to fake news. Analysis required resolving the urls from
their shortened "tco" form (Twitter's format for shortening urls) and determining the Alexa web rank.
Processing data of this size required multi-threading of several processes, and many supporting
functions that are contained within a separate Python library for modularity. Concurrency
speeds up this script significantly, and is possible with multi-threading since we are only running
I/O operations in parallel. The script runs from the command line with optional arguments to change
default settings. """
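The multi-threaded url resolution the docstring describes can be sketched with the standard library; this is an illustrative pattern reusing resolve_url from the gist above, not the original script (resolve_all and the worker count are assumed names and values):

from concurrent.futures import ThreadPoolExecutor

def resolve_all(urls, max_workers=32):
    # url resolution is pure I/O, so threads run it concurrently despite the GIL
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(resolve_url, urls))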
haayanau / hard_traveling_plot_causalforest.py
Last active May 13, 2021 13:31
Hard traveling plot Causal Forest
import matplotlib.pyplot as plt

# set plot size
fig, ax = plt.subplots(figsize=(12, 8))
# plot treatment effects for peripheral neighbourhoods
ax.plot(z_per['cate'],
        marker='.', linestyle='-', linewidth=0.5, label='CATE_below_median', color='steelblue')
# uncomment for confidence intervals for peripheral neighbourhoods
# ax.plot(z_per['lb'],
#         marker='.', linestyle='-', linewidth=0.5, color='green')
# ax.plot(z_per['ub'],
#         marker='.', linestyle='-', linewidth=0.5, color='green')  # completed to mirror the lb line above
haayanau / hard_traveling_causalforest_predict_CATE.py
Last active May 13, 2021 11:44
Hard traveling Causal Forest predict treatment effects
# predict CATEs with 95% confidence intervals for each subgroup
treatment_effects_per, lb_per, ub_per = est_per.predict(X, interval=True, alpha=0.05)
treatment_effects_not_per, lb_not_per, ub_not_per = est_not_per.predict(X2, interval=True, alpha=0.05)
te_per = []
for i in range(len(treatment_effects_per)):
    dict_te = {}
    # sum the effects of the two treatments (obstruction + protection) per observation
    dict_te['cate'] = treatment_effects_per[i][0] + treatment_effects_per[i][1]
    dict_te['lb'] = lb_per[i][0] + lb_per[i][1]
    dict_te['ub'] = ub_per[i][0] + ub_per[i][1]
    te_per.append(dict_te)
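The plotting gist above reads a smoothed z_per; a plausible bridge from te_per, mirroring the sort-and-smooth steps of the DeepIV prediction gist below (the window size is copied from there, not confirmed for this script):

te_df_per = pd.DataFrame(te_per)
te_df_per.sort_values('cate', inplace=True, ascending=True)
te_df_per.reset_index(inplace=True, drop=True)
z_per = te_df_per.rolling(window=15, center=True).mean()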
haayanau / hard_traveling_causalforest_model.py
Last active May 13, 2021 11:41
Hard traveling Causal Forest model
from econml.grf import CausalIVForest
# set parameters for the causal forest for each subgroup
est_per = CausalIVForest(criterion='het',
                         n_estimators=10000,
                         min_samples_leaf=5,
                         max_depth=None,
                         max_samples=0.5,
                         honest=True,
                         inference=True)  # closing paren added; any further arguments are truncated in the preview
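A hedged sketch of the fit step that would precede the prediction gist above, assuming econml's generalized random forest API; X (covariates), T (the two treatments), y (outcome), and the instrument Z are placeholder names, not taken from the gist:

est_per.fit(X, T, y, Z=Z)  # an instrumental-variable forest also needs an instrument Z
treatment_effects_per, lb_per, ub_per = est_per.predict(X, interval=True, alpha=0.05)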
haayanau / hard_traveling_causalforest_variables.py
Last active May 12, 2021 21:47
Hard traveling Causal Forest variables
from sklearn.model_selection import train_test_split
# split dataset by subgroup: the 1884/1885 cutoff matches the below-/above-median
# population labels used in the plotting gists
df_per = df[df['population_total'] <= 1884]
df_not_per = df[df['population_total'] >= 1885]
# split for train and test sets for each subgroup
train_per, test_per = train_test_split(df_per, test_size=0.2)
train_not_per, test_not_per = train_test_split(df_not_per, test_size=0.2)
# treatment and outcome column names
treatment = ['obstruction', 'protection']
outcome = ['chng_employment']
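One plausible way these column lists feed the estimators, assuming every remaining column serves as a covariate (the gist does not show this step):

X = train_per.drop(columns=treatment + outcome).values  # covariates (assumed)
T = train_per[treatment].values                         # two treatment columns
y = train_per[outcome].values.ravel()                   # single outcome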
haayanau / hard_traveling_plot_CATE.py
Last active May 12, 2021 21:08
Hard traveling - plot conditional average treatment effects
import matplotlib.pyplot as plt

# set plot size
fig, ax = plt.subplots(figsize=(12, 8))
# plot lines for treatment effects
ax.plot(z_per['cate'],
        marker='.', linestyle='-',
        linewidth=0.5,
        label='CATE_population_below_median',
        color='indigo'
        )
haayanau / hard_traveling_predict_CATE.py
Created May 12, 2021 13:23
Hard traveling - predict conditional average treatment effects
import pandas as pd

# predict treatment effects for smaller peripheral neighbourhoods
y_pred_per = deepIvEst_per.predict(t, x)
te_df_per = pd.DataFrame(y_pred_per, columns=['cate'])
te_df_per.sort_values('cate', inplace=True, ascending=True)
te_df_per.reset_index(inplace=True, drop=True)
# smooth with a centered rolling mean for plotting
z_per = te_df_per.rolling(window=15, center=True).mean()
# predict treatment effects for larger central neighbourhoods
y_pred_not_per = deepIvEst_not_per.predict(t2, x2)
te_df_not_per = pd.DataFrame(y_pred_not_per, columns=['cate'])
haayanau / hard_traveling_deepiv_parameters.py
Last active May 12, 2021 16:21
Hard traveling DeepIV parameters
# code adapted from https://microsoft.github.io/dowhy/example_notebooks/dowhy-conditional-treatment-effects.html
import keras

# early stopping keeps the first- and second-stage networks from overfitting
keras_fit_options_1 = {"epochs": 50,
                       "validation_split": 0.1,
                       "callbacks": [keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)]
                       }
keras_fit_options_2 = {"epochs": 100,
                       "validation_split": 0.1,
                       "callbacks": [keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)]
                       }
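These option dicts plug into econml's DeepIV estimator as first- and second-stage fit options. A hedged sketch of that wiring, adapted from the dowhy notebook cited above; the import path, the dense-net architectures, and n_components=10 are assumptions, not the gist's confirmed settings:

from econml.iv.nnet import DeepIV  # older econml versions expose this as econml.deepiv.DeepIVEstimator

# simple dense nets; layer sizes are illustrative, input shapes are inferred on first call
treatment_model = keras.Sequential([keras.layers.Dense(64, activation='relu'),
                                    keras.layers.Dense(32, activation='relu')])
response_model = keras.Sequential([keras.layers.Dense(64, activation='relu'),
                                   keras.layers.Dense(32, activation='relu'),
                                   keras.layers.Dense(1)])

deepIvEst_per = DeepIV(n_components=10,  # mixture components for the first-stage density
                       m=lambda z, x: treatment_model(keras.layers.concatenate([z, x])),
                       h=lambda t, x: response_model(keras.layers.concatenate([t, x])),
                       n_samples=1,
                       first_stage_options=keras_fit_options_1,
                       second_stage_options=keras_fit_options_2)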