import numpy as np
import pandas as pd
from collections import Counter
from bs4 import BeautifulSoup
from scipy import stats
from matplotlib import pyplot as plt
path = '/path/to/export.xml'
with open(path) as f:
import numpy as np
import pandas as pd
import forestci as fci
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict
# set seed
import os
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
basepath = '/Volumes/UNTITLED/wimoveis/anuncios/'
hrefs = pd.read_csv('hrefs.csv') # get URLs
hrefs = set(hrefs['href']) # remove duplicate URLs
import os
from bs4 import BeautifulSoup
hrefs = []
path = '/Volumes/UNTITLED/wimoveis/paginas/'
for fname in os.listdir(path):
if ('.html' in fname) and ('._' not in fname):
with open(path + fname, mode = 'r') as f:
html =
import time
import requests
destination = '/Volumes/UNTITLED/wimoveis/paginas/'
base_url = ''
num_pages = 1557 # number of results pages
for i in range(1, num_pages):
print('page', i)
query_url = base_url + 'apartamentos-venda-distrito-federal-goias-pagina-{}.html'.format(i)
response = requests.get(query_url)
thiagomarzagao /
Created September 2, 2018 21:00
Automatically replies to dad with 6 random numbers between 1 and 60 every time he asks for Lotto numbers
import time
import random
import telepot
from telepot.loop import MessageLoop
bot = telepot.Bot('xxxxx')
family_group_id = 'xxxxx'
def handle(msg):
thiagomarzagao /
Created October 29, 2017 02:08
Stata script to replicate "Why is democracy declining in Latin America?"
* The main model
xtscc pressfree leftism presvrl leftismXpresvrl ief gdpcapita
* Effect of presvrl at the in-sample minimum, average, and maximum values of presidential leftism
lincom _b[presvrl] + _b[leftismXpresvrl]*1.5
lincom _b[presvrl] + _b[leftismXpresvrl]*7.194014
lincom _b[presvrl] + _b[leftismXpresvrl]*18
* Figure 1 data (the 'xtscc' add-on does not accept the # operator, so the 'margins'
* command could not be used to graph the interaction)

Keybase proof

I hereby claim:

  • I am thiagomarzagao on github.
  • I am thiagomarzagao on keybase.
  • I have a public key ASAVdCRfhIPn2ajeJwty8IPOGKV0NmlWGuSLIU5iSELYego

To claim this, I am signing this object:

thiagomarzagao / howto.html
Last active May 28, 2017 18:57
replicating "Using NLP to measure democracy"
To produce the ADS I relied on supervised learning. I tried three different approaches, compared the results,
and picked the approach that worked best. More specifically, I tried: a) a combination of Latent Semantic Analysis
and tree-based regression methods; b) a combination of Latent Dirichlet Allocation and tree-based regression methods;
and c) the Wordscores algorithm. The Wordscores algorithm outperformed the alternatives.
I created a <a href="">web application</a> where anyone can tweak the training data and
see how the results change (no coding required). <u>Data and code</u>. The two corpora (A and B) are available
in <a href="">MatrixMarket format</a>.
Each corpus is accompanied by other files: an internal index; a Python pickle with a dictionary mapping word IDs
to words; and a Python pickle with a dictionary mapping words to word IDs. Here are the links:
<a href="
# Build a TF-IDF weighted document-term matrix from the text in 'subset.csv'.
# NOTE(review): Corpus, VectorSource, tm_map, PlainTextDocument,
# DocumentTermMatrix and weightTfIdf are presumably provided by the 'tm'
# package, which must be loaded before this point -- confirm.

# Read at most the first 1000 rows of the comma-separated file, keeping
# string columns as character vectors instead of converting them to factors.
comprasnet <- read.table('subset.csv',
stringsAsFactors = FALSE,
sep = ',',
nrows = 1000)
# Treat each entry of column V2 as one document of the corpus.
# NOTE(review): V2 is presumably the auto-generated name of the second column
# (no header row is requested in the read.table call) -- confirm.
corpus <- Corpus(VectorSource(comprasnet$V2))
# Apply PlainTextDocument to every document in the corpus.
corpus <- tm_map(corpus, PlainTextDocument)
# Document-term matrix whose entries use the weightTfIdf weighting function.
tfidf <- DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))