Skip to content

Instantly share code, notes, and snippets.

View thiagomarzagao's full-sized avatar

Thiago Marzagão thiagomarzagao

View GitHub Profile
library(tm)
library(Matrix)
# Work from the course data folder; the relative file name below is resolved
# against this directory.
setwd("/Users/thiagomarzagao/Dropbox/dataScience/UnB-CIC/aulaText/")

# Read at most the first 1000 rows of the comma-separated file, keeping text
# columns as character vectors rather than factors.
comprasnet <- read.table(
  file = "subset.csv",
  sep = ",",
  nrows = 1000,
  stringsAsFactors = FALSE
)

# Build a tm corpus from column V2 (one document per element) and pass every
# document through PlainTextDocument.
corpus <- tm_map(Corpus(VectorSource(comprasnet$V2)), PlainTextDocument)
@thiagomarzagao
thiagomarzagao / mangled.html
Last active February 11, 2016 23:11
HTML that shows mangled code block
<div class="highlight"><pre><code class="language-python" data-lang="python"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1
2
3
4
5
6
7
8
9
10
@thiagomarzagao
thiagomarzagao / fightinwords.py
Created February 11, 2016 17:45
Fightin' Words in Python
### FIGHTIN' WORDS (MCQ-2008)
### author: Thiago Marzagao
### contact: marzagao ddott 1 at osu ddott edu
import os
import sys
import pandas as pd
import numpy as np
from numpy import matrix as m
@thiagomarzagao
thiagomarzagao / wordscores.py
Created February 11, 2016 17:41
Wordscores in Python
### WORDSCORES (LBG-2003)
### author: Thiago Marzagao
### contact: marzagao ddott 1 at osu ddott edu
import os
import numpy as np
import pandas as pd
ipath = '/Users/username/inputdata/' # folder containing the CSV files
opath = '/Users/username/outputdata/' # folder where output will be saved
@thiagomarzagao
thiagomarzagao / parseY.py
Created December 7, 2015 16:20
cria Y.pkl
import os
import pickle
import numpy as np
# Folder that is scanned for input files.
basepath = '/caminho/ate/CSVs/' # change as needed
# Names of all entries in basepath whose name contains '.csv'.
flist = [fname for fname in os.listdir(basepath) if '.csv' in fname]
# Accumulators and a counter initialised before the loop over flist.
# NOTE(review): the loop body is not visible here -- confirm how each is filled.
grupos = []
classes = []
counter = 0
for fname in flist:
@thiagomarzagao
thiagomarzagao / parseX.py
Created December 7, 2015 16:18
cria X.pkl
import os
import re
import pickle
from nltk.stem import RSLPStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
def pre_process():
vanilla = u'[^\u0041-\u005A \
\u0061-\u007A \
\u00C0-\u00D6 \
@thiagomarzagao
thiagomarzagao / catmat_svm.py
Created December 5, 2015 20:19
código p/ treinar classificador SVM p/ CATMAT
import os
import pickle
from sklearn.utils import shuffle
from sklearn import linear_model
from sklearn import cross_validation
# Load X from 'X.pkl' in the current working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load X.pkl when it comes from a trusted source.
with open('X.pkl', mode = 'rb') as fbuffer:
    X = pickle.load(fbuffer)
@thiagomarzagao
thiagomarzagao / index.html
Created February 1, 2015 20:04
plot Brazil
<!DOCTYPE html>
<meta charset="utf-8">
<body>
<script src="http://d3js.org/d3.v3.min.js"></script>
<script src="http://d3js.org/topojson.v1.min.js"></script>
<script>
var width = 960,
height = 1160;
'''
parse (HTML -> JSON) e-Compras GDF content
'''
import os
import re
import json
import socket
from bs4 import BeautifulSoup
# scrape e-Compras GDF (https://www.compras.df.gov.br/)
import os
import requests
from bs4 import BeautifulSoup
baseurl = 'https://www.compras.df.gov.br/publico/'
basepath = '/Users/thiagomarzagao/Desktop/HTML/'
primeiro_id = 0 # ID of the first auction
ultimo_id = 48355 # ID of the last auction (as of 12/18/14)