larsmans /
Last active Aug 29, 2015
Brat-to-CSV converter
# Quick and dirty Brat-to-CSV conversion.
from __future__ import print_function
import csv
import io
import re
import sys
# Requires server/src/{gtbtokenize,tokenise}.py, copied from the Brat source tree.
from tokenise import gtb_token_boundary_gen
larsmans /
Created Nov 10, 2014
Detach subdirectory from Git repository as separate repo
# Usage: git-detach <directory> <target>
# Creates a new Git repository at <target> from the contents of <directory>
# and its history. Does not remove the directory from its original repo.
# For example, if project/ is a Git repository with a subdirectory lib/, then
# git-detach project/lib/ standalone-lib/
# creates a new repository standalone-lib/ holding the contents and history
# of project/lib/.
larsmans /
Created Oct 9, 2014
Supervised tf (tf-chi², tf-rf) for scikit-learn
import numpy as np
#from scipy.special import chdtrc
from scipy.sparse import spdiags
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelBinarizer
def _chisquare(f_obs, f_exp, reduce):
"""Replacement for scipy.stats.chisquare with custom reduction.
larsmans / README
Created Dec 24, 2013
Sentiment analysis with scikit-learn
Sentiment analysis experiment using scikit-learn
The script reproduces the sentiment analysis approach from Pang,
Lee and Vaithyanathan (2002), who tried to classify movie reviews as positive
or negative, with three differences:
* tf-idf weighting is applied to terms
* the three-fold cross validation split is different
* regularization is tuned by cross validation
larsmans /
Created Oct 21, 2013
Upgrade Ubuntu 13.04 to Linux Mint 15
# Updated version of Jeff Shaffner's instructions.
# Run with "sudo sh".
set -e
set -x
apt-get update && apt-get --yes upgrade
add-apt-repository "deb olivia \
import numpy as np
import os
import sys
from collections import defaultdict
from git import Repo
from scipy.sparse import csc_matrix
path = sys.argv[1]
extensions = [".py", ".pyx", ".pxd"]
larsmans /
Created Jul 14, 2013
k-means feature mapper for scikit-learn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import rbf_kernel
class KMeansTransformer(BaseEstimator, TransformerMixin):
def __init__(self, centroids):
self.centroids = centroids
def fit(self, X, y=None):
return self
larsmans / csc_columnwise_max.pyx
Created Mar 11, 2013
Columnwise maximum of scipy.sparse.csc_matrix, in Cython
View csc_columnwise_max.pyx
cimport numpy as np
def csc_columnwise_max(np.ndarray[np.float64_t, ndim=1] data,
np.ndarray[int, ndim=1] indices,
np.ndarray[int, ndim=1] indptr,
np.ndarray[np.float64_t, ndim=1] out):
cdef double mx
cdef int n_features = indptr.shape[0] - 1
cdef int i, j
larsmans /
Created Feb 14, 2013
k-means clustering in pure Python
# K-means clustering using Lloyd's algorithm in pure Python.
# Written by Lars Buitinck. This code is in the public domain.
# The main program runs the clustering algorithm on a bunch of text documents
# specified as command-line arguments. These documents are first converted to
# sparse vectors, represented as lists of (index, value) pairs.
from collections import defaultdict
larsmans / gist:3745866
Created Sep 18, 2012
Inspecting scikit-learn CountVectorizer output with a Pandas DataFrame
View gist:3745866
>>> from pandas import DataFrame
>>> from sklearn.feature_extraction.text import CountVectorizer
>>> docs = ["You can catch more flies with honey than you can with vinegar.",
... "You can lead a horse to water, but you can't make him drink."]
>>> vect = CountVectorizer(min_df=0., max_df=1.0)
>>> X = vect.fit_transform(docs)
>>> print(DataFrame(X.A, columns=vect.get_feature_names()).to_string())
   but  can  catch  drink  flies  him  honey  horse  lead  make  more  than  to  vinegar  water  with  you
0    0    2      1      0      1    0      1      0     0     0     1     1   0        1      0     2    2
1    1    2      0      1      0    1      0      1     1     1     0     0   1        0      1     0    2
