Skip to content

Instantly share code, notes, and snippets.

@larsmans
larsmans / tocsv.py
Last active August 29, 2015 14:10
Brat-to-CSV converter
# Quick and dirty Brat-to-CSV conversion.
from __future__ import print_function
import csv
import io
import re
import sys
# copy server/src/{gtbtokenize,tokenise}.py from Brat
from tokenise import gtb_token_boundary_gen
@larsmans
larsmans / git-detach.sh
Created November 10, 2014 13:16
Detach subdirectory from Git repository as separate repo
#!/bin/sh
# Usage: git-detach <directory> <target>
# Creates a new Git repository at <target> from the contents of <directory>
# and its history. Does not remove the directory from its original repo.
#
# Example: suppose project/ is a Git repository with a subdirectory lib/. Then
#
#     git-detach project/lib/ standalone-lib/
#
# creates a new repository standalone-lib/ holding the contents and history of
# project/lib/ as a separate repo.
@larsmans
larsmans / supervised_tf.py
Created October 9, 2014 11:19
Supervised tf (tf-chi², tf-rf) for scikit-learn
import numpy as np
#from scipy.special import chdtrc
from scipy.sparse import spdiags
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelBinarizer
def _chisquare(f_obs, f_exp, reduce):
"""Replacement for scipy.stats.chisquare with custom reduction.
@larsmans
larsmans / README
Created December 24, 2013 18:53
Sentiment analysis with scikit-learn
Sentiment analysis experiment using scikit-learn
================================================
The script sentiment.py reproduces the sentiment analysis approach from Pang,
Lee and Vaithyanathan (2002), who tried to classify movie reviews as positive
or negative, with three differences:
* tf-idf weighting is applied to terms
* the three-fold cross-validation split is different
* regularization is tuned by cross-validation
@larsmans
larsmans / ubuntu-to-mint.sh
Created October 21, 2013 17:47
Upgrade Ubuntu 13.04 to Linux Mint 15
# Updated version of Jeff Shaffner's instructions,
# http://jeffshaffner.wordpress.com/2012/09/27/how-to-convert-ubuntu-12-04-to-linux-mint-13/
# Run with "sudo sh ubuntu-to-mint.sh".
set -e
set -x
apt-get update && apt-get --yes upgrade
add-apt-repository "deb http://packages.linuxmint.com/ olivia \
import numpy as np
import os
import sys
from collections import defaultdict
from git import Repo
from scipy.sparse import csc_matrix
path = sys.argv[1]
extensions = [".py", ".pyx", ".pxd"]
@larsmans
larsmans / kmtransformer.py
Created July 14, 2013 21:12
k-means feature mapper for scikit-learn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import rbf_kernel
class KMeansTransformer(BaseEstimator, TransformerMixin):
def __init__(self, centroids):
self.centroids = centroids
def fit(self, X, y=None):
return self
@larsmans
larsmans / csc_columnwise_max.pyx
Created March 11, 2013 22:36
Columnwise maximum of scipy.sparse.csc_matrix, in Cython
cimport numpy as np
def csc_columnwise_max(np.ndarray[np.float64_t, ndim=1] data,
np.ndarray[int, ndim=1] indices,
np.ndarray[int, ndim=1] indptr,
np.ndarray[np.float64_t, ndim=1] out):
cdef double mx
cdef int n_features = indptr.shape[0] - 1
cdef int i, j
@larsmans
larsmans / kmeans.py
Created February 14, 2013 13:38
k-means clustering in pure Python
#!/usr/bin/python
#
# K-means clustering using Lloyd's algorithm in pure Python.
# Written by Lars Buitinck. This code is in the public domain.
#
# The main program runs the clustering algorithm on a bunch of text documents
# specified as command-line arguments. These documents are first converted to
# sparse vectors, represented as lists of (index, value) pairs.
from collections import defaultdict
@larsmans
larsmans / gist:3745866
Created September 18, 2012 21:00
Inspecting scikit-learn CountVectorizer output with a Pandas DataFrame
>>> from pandas import DataFrame
>>> from sklearn.feature_extraction.text import CountVectorizer
>>> docs = ["You can catch more flies with honey than you can with vinegar.",
... "You can lead a horse to water, but you can't make him drink."]
>>> vect = CountVectorizer(min_df=0., max_df=1.0)
>>> X = vect.fit_transform(docs)
>>> print(DataFrame(X.A, columns=vect.get_feature_names()).to_string())
   but  can  catch  drink  flies  him  honey  horse  lead  make  more  than  to  vinegar  water  with  you
0    0    2      1      0      1    0      1      0     0     0     1     1   0        1      0     2    2
1    1    2      0      1      0    1      0      1     1     1     0     0   1        0      1     0    2