This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Standard library
import time

# Third-party: dataframes, numerics, and article scraping
import pandas as pd
import numpy as np
from newspaper import Article, ArticleException, Config

# Sample job-posting URL used to exercise the article downloader below.
url = "https://neuvoo.com/view/?jpos=&jp=1&l=&id=5cc0b117a2cf&lang=en&sal-job=1&ss=1&context=serp&testid=champion&nb=true&reqid=05412eb2337b92dbca9c2ea510dc2053&source=neuvoosearch"
def download_single_article(link): | |
practice = False |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def resolve_url(my_website):
    """Follow redirects for *my_website* and report where it lands.

    Returns a dict with the original URL under ``'cleaned_url'`` and the
    final post-redirect URL under ``'resolved_url'``.  Any network error
    raised by ``requests.get`` propagates to the caller.
    """
    # GET with redirects enabled; the 5 s timeout keeps a dead link from
    # hanging a worker thread.
    response = requests.get(my_website, allow_redirects=True, timeout=5)
    return {
        'cleaned_url': my_website,
        'resolved_url': response.url,
    }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""This script is for a disinformation investigation about voter fraud done in collaboration with | |
researchers at the Shorenstein Center at the Harvard Kennedy School. The code contained in this | |
file is written by Haaya Naushan. This investigation required processing roughly 30 million tweets, | |
many of these tweets contained urls linking to fake news. Analysis required resolving the urls from | |
their shortened "t.co" form (Twitter's format for shortening URLs) and determining the Alexa web rank. | |
Processing data of this size required multi-threading of several processes, and many supporting | |
functions that are contained within a separate Python library for modularity. Concurrency | |
speeds up this script significantly, and is possible with multi-threading since we are only running | |
I/O operations in parallel. The script runs from the command line with optional arguments to change | |
default settings. """ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create a 12x8-inch figure for the treatment-effect curve.
fig, ax = plt.subplots(figsize=(12, 8))

# Smoothed CATEs for the peripheral (below-median) neighbourhoods.
ax.plot(
    z_per['cate'],
    marker='.',
    linestyle='-',
    linewidth=0.5,
    label='CATE_below_median',
    color='steelblue',
)

# Uncomment to overlay the confidence band for the peripheral group.
# ax.plot(z_per['lb'], marker='.', linestyle='-', linewidth=0.5, color='green')
# ax.plot(z_per['ub'],
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Point estimates and 95% confidence bounds from each subgroup's fitted forest.
treatment_effects_per, lb_per, ub_per = est_per.predict(X, interval=True, alpha=0.05)
treatment_effects_not_per, lb_not_per, ub_not_per = est_not_per.predict(X2, interval=True, alpha=0.05)

# Collapse the two treatment columns into one combined effect per observation,
# keeping the matching lower/upper confidence bounds.
# NOTE(review): summing the two treatments' effects assumes the combined
# effect is additive — confirm this is the intended estimand.
te_per = [
    {'cate': te[0] + te[1], 'lb': lo[0] + lo[1], 'ub': hi[0] + hi[1]}
    for te, lo, hi in zip(treatment_effects_per, lb_per, ub_per)
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from econml.grf import CausalIVForest | |
# set parameters for the causal forest for each subgroup | |
est_per = CausalIVForest(criterion='het', | |
n_estimators=10000, | |
min_samples_leaf=5, | |
max_depth=None, | |
max_samples=0.5, | |
honest=True, | |
inference=True, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.model_selection import train_test_split

# Partition neighbourhoods on the median population cut-off (1884 residents).
# NOTE(review): the <=1884 / >=1885 pair silently drops any non-integer
# value strictly between them — confirm 'population_total' is integer-valued.
df_per = df[df['population_total'] <= 1884]      # peripheral (smaller)
df_not_per = df[df['population_total'] >= 1885]  # central (larger)

# Hold out 20% of each subgroup for evaluation.
train_per, test_per = train_test_split(df_per, test_size=0.2)
train_not_per, test_not_per = train_test_split(df_not_per, test_size=0.2)

# Column names: two treatment indicators and the employment-change outcome.
treatment = ['obstruction', 'protection']
outcome = ['chng_employment']
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 12x8-inch canvas for the subgroup treatment-effect comparison.
fig, ax = plt.subplots(figsize=(12, 8))

# Treatment-effect curve for the below-median-population subgroup.
ax.plot(z_per['cate'],
        marker='.', linestyle='-', linewidth=0.5,
        label='CATE_population_below_median', color='indigo')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Peripheral (smaller) neighbourhoods: predict CATEs, rank them ascending,
# and smooth with a centered 15-observation rolling mean for plotting.
y_pred_per = deepIvEst_per.predict(t, x)
te_df_per = (pd.DataFrame(y_pred_per, columns=['cate'])
             .sort_values('cate', ascending=True)
             .reset_index(drop=True))
z_per = te_df_per.rolling(window=15, center=True).mean()

# Central (larger) neighbourhoods: same pipeline on the second estimator.
y_pred_not_per = deepIvEst_not_per.predict(t2, x2)
te_df_not_per = pd.DataFrame(y_pred_not_per, columns=['cate'])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Keras training settings, adapted from the DoWhy conditional-treatment-effects
# example notebook:
# https://microsoft.github.io/dowhy/example_notebooks/dowhy-conditional-treatment-effects.html
# Both configurations hold out 10% of the data for validation and stop early
# after two stagnant epochs, restoring the best weights seen.  A fresh
# EarlyStopping instance is built for each config (callbacks are stateful).
keras_fit_options_1 = {
    "epochs": 50,
    "validation_split": 0.1,
    "callbacks": [keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)],
}
keras_fit_options_2 = {
    "epochs": 100,
    "validation_split": 0.1,
    "callbacks": [keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)],
}
NewerOlder