I hereby claim:
- I am thiagomarzagao on github.
- I am thiagomarzagao (https://keybase.io/thiagomarzagao) on keybase.
- I have a public key ASAVdCRfhIPn2ajeJwty8IPOGKV0NmlWGuSLIU5iSELYego
To claim this, I am signing this object:
import os | |
import re | |
import pickle | |
from nltk.stem import RSLPStemmer | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
def pre_process(): | |
vanilla = u'[^\u0041-\u005A \ | |
\u0061-\u007A \ | |
\u00C0-\u00D6 \ |
import os | |
import pickle | |
import numpy as np | |
basepath = '/caminho/ate/CSVs/' # altere conforme necessario | |
flist = [fname for fname in os.listdir(basepath) if '.csv' in fname] | |
grupos = [] | |
classes = [] | |
counter = 0 | |
for fname in flist: |
### WORDSCORES (LBG-2003) | |
### author: Thiago Marzagao | |
### contact: marzagao ddott 1 at osu ddott edu | |
import os | |
import numpy as np | |
import pandas as pd | |
ipath = '/Users/username/inputdata/' # folder containing the CSV files | |
opath = '/Users/username/outputdata/' # folder where output will be saved |
### FIGHTIN' WORDS (MCQ-2008) | |
### author: Thiago Marzagao | |
### contact: marzagao ddott 1 at osu ddott edu | |
import os | |
import sys | |
import pandas as pd | |
import numpy as np | |
from numpy import matrix as m | |
<div class="highlight"><pre><code class="language-python" data-lang="python"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 | |
2 | |
3 | |
4 | |
5 | |
6 | |
7 | |
8 | |
9 | |
10 |
library(tm) | |
library(Matrix) | |
setwd('/Users/thiagomarzagao/Dropbox/dataScience/UnB-CIC/aulaText/') | |
comprasnet <- read.table('subset.csv', | |
stringsAsFactors = FALSE, | |
sep = ',', | |
nrows = 1000) | |
corpus <- Corpus(VectorSource(comprasnet$V2)) | |
corpus <- tm_map(corpus, PlainTextDocument) |
library(tm)

# Course example: build a TF-IDF document-term matrix from a comprasnet sample.
setwd('/Users/thiagomarzagao/Dropbox/dataScience/UnB-CIC/aulaText/')

# First 1000 rows only; keep text columns as character (no factors).
# NOTE: read.table defaults to header = FALSE, so columns come back as V1, V2, ...
comprasnet <- read.table('subset.csv',
                         stringsAsFactors = FALSE,
                         sep = ',',
                         nrows = 1000)

# Column V2 holds the document text; wrap it as a tm corpus and
# coerce every document to plain text before weighting.
corpus <- Corpus(VectorSource(comprasnet$V2))
corpus <- tm_map(corpus, PlainTextDocument)

# Document-term matrix weighted by TF-IDF rather than raw term counts.
tfidf <- DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))
To produce the ADS I relied on supervised learning. I tried three different approaches, compared the results,
and picked the approach that worked best. More specifically, I tried: a) a combination of Latent Semantic Analysis
and tree-based regression methods; b) a combination of Latent Dirichlet Allocation and tree-based regression methods;
and c) the Wordscores algorithm. The Wordscores algorithm outperformed the alternatives.
I created a <a href="http://democracy-scores.org">web application</a> where anyone can tweak the training data and
see how the results change (no coding required). <u>Data and code</u>. The two corpora (A and B) are available
in <a href="http://math.nist.gov/MatrixMarket/formats.html#MMformat">MatrixMarket format</a>.
Each corpus is accompanied by other files: an internal index; a Python pickle with a dictionary mapping word IDs
to words; and a Python pickle with a dictionary mapping words to word IDs. Here are the links:
<a href="https://s3.amazonaws.com/thiagomarzagao/corpor |
I hereby claim:
To claim this, I am signing this object:
import time | |
import random | |
import telepot | |
from telepot.loop import MessageLoop | |
bot = telepot.Bot('xxxxx') | |
family_group_id = 'xxxxx' | |
def handle(msg): |