I hereby claim:
- I am thiagomarzagao on github.
- I am thiagomarzagao (https://keybase.io/thiagomarzagao) on keybase.
- I have a public key ASAVdCRfhIPn2ajeJwty8IPOGKV0NmlWGuSLIU5iSELYego
To claim this, I am signing this object:
import os | |
import re | |
import pickle | |
from nltk.stem import RSLPStemmer | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
def pre_process(): | |
vanilla = u'[^\u0041-\u005A \ | |
\u0061-\u007A \ | |
\u00C0-\u00D6 \ |
import os | |
import pickle | |
import numpy as np | |
basepath = '/caminho/ate/CSVs/' # altere conforme necessario | |
flist = [fname for fname in os.listdir(basepath) if '.csv' in fname] | |
grupos = [] | |
classes = [] | |
counter = 0 | |
for fname in flist: |
### WORDSCORES (LBG-2003) | |
### author: Thiago Marzagao | |
### contact: marzagao ddott 1 at osu ddott edu | |
import os | |
import numpy as np | |
import pandas as pd | |
ipath = '/Users/username/inputdata/' # folder containing the CSV files | |
opath = '/Users/username/outputdata/' # folder where output will be saved |
### FIGHTIN' WORDS (MCQ-2008) | |
### author: Thiago Marzagao | |
### contact: marzagao ddott 1 at osu ddott edu | |
import os | |
import sys | |
import pandas as pd | |
import numpy as np | |
from numpy import matrix as m | |
<div class="highlight"><pre><code class="language-python" data-lang="python"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 | |
2 | |
3 | |
4 | |
5 | |
6 | |
7 | |
8 | |
9 | |
10 |
library(tm) | |
library(Matrix) | |
setwd('/Users/thiagomarzagao/Dropbox/dataScience/UnB-CIC/aulaText/') | |
comprasnet <- read.table('subset.csv', | |
stringsAsFactors = FALSE, | |
sep = ',', | |
nrows = 1000) | |
corpus <- Corpus(VectorSource(comprasnet$V2)) | |
corpus <- tm_map(corpus, PlainTextDocument) |
library(tm)

# Course example: build a TF-IDF document-term matrix from a comprasnet sample.
setwd('/Users/thiagomarzagao/Dropbox/dataScience/UnB-CIC/aulaText/')

# First 1000 rows only; keep text columns as character (no factors).
# NOTE: read.table defaults to header = FALSE, so columns come back as V1, V2, ...
comprasnet <- read.table('subset.csv',
                         stringsAsFactors = FALSE,
                         sep = ',',
                         nrows = 1000)

# Column V2 holds the document text; wrap it as a tm corpus and
# coerce every document to plain text before weighting.
corpus <- Corpus(VectorSource(comprasnet$V2))
corpus <- tm_map(corpus, PlainTextDocument)

# Document-term matrix weighted by TF-IDF rather than raw term counts.
tfidf <- DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))
To produce the ADS I relied on supervised learning. I tried three different approaches, compared the results,
and picked the approach that worked best. More specifically, I tried: a) a combination of Latent Semantic Analysis
and tree-based regression methods; b) a combination of Latent Dirichlet Allocation and tree-based regression methods;
and c) the Wordscores algorithm. The Wordscores algorithm outperformed the alternatives.
I created a <a href="http://democracy-scores.org">web application</a> where anyone can tweak the training data and
see how the results change (no coding required). <u>Data and code</u>. The two corpora (A and B) are available
in <a href="http://math.nist.gov/MatrixMarket/formats.html#MMformat">MatrixMarket format</a>.
Each corpus is accompanied by other files: an internal index; a Python pickle with a dictionary mapping word IDs
to words; and a Python pickle with a dictionary mapping words to word IDs. Here are the links:
<a href="https://s3.amazonaws.com/thiagomarzagao/corpor |
I hereby claim:
To claim this, I am signing this object:
import time | |
import random | |
import telepot | |
from telepot.loop import MessageLoop | |
bot = telepot.Bot('xxxxx') | |
family_group_id = 'xxxxx' | |
def handle(msg): |