Lars larsmans

## tocsv.py
# Quick and dirty Brat-to-CSV conversion.

from __future__ import print_function
import csv
import io
import re
import sys

# copy server/src/{gtbtokenize,tokenise}.py from Brat
from tokenise import gtb_token_boundary_gen

## csc_columnwise_max.pyx
cimport numpy as np


# Takes three 1-d arrays named data/indices/indptr plus a float64 output
# array. Judging by the function name these are the components of a CSC
# sparse matrix and `out` receives the per-column maximum -- TODO confirm,
# the function body is not part of this excerpt.
def csc_columnwise_max(np.ndarray[np.float64_t, ndim=1] data,
                       np.ndarray[int, ndim=1] indices,
                       np.ndarray[int, ndim=1] indptr,
                       np.ndarray[np.float64_t, ndim=1] out):
    cdef double mx  # presumably the running maximum of the current column
    # One less than the length of indptr (assumes the usual CSC layout where
    # indptr has n_columns + 1 entries -- TODO confirm).
    cdef int n_features = indptr.shape[0] - 1
    cdef int i, j  # loop counters; their use lies outside this excerpt

## kmtransformer.py
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import rbf_kernel


class KMeansTransformer(BaseEstimator, TransformerMixin):
    """Estimator that is parameterised by a fixed set of centroids.

    Only the constructor and ``fit`` are visible in this excerpt; any
    transformation logic is defined elsewhere.

    Parameters
    ----------
    centroids : object
        Stored unchanged as ``self.centroids``. Presumably an array of
        cluster centres -- TODO confirm against the caller.
    """

    def __init__(self, centroids):
        self.centroids = centroids

    def fit(self, X, y=None):
        """Do nothing and return ``self``; ``X`` and ``y`` are ignored."""
        return self

## nearest_developers.py
import numpy as np
import os
import sys

from collections import defaultdict
from git import Repo
from scipy.sparse import csc_matrix

# First command-line argument. Presumably the location of the Git repository
# to inspect (`Repo` is imported above) -- TODO confirm against the rest of
# the script. Raises IndexError when the script is run without arguments.
path = sys.argv[1]
# File suffixes of interest: Python sources plus Cython sources and headers.
extensions = [".py", ".pyx", ".pxd"]

## ubuntu-to-mint.sh
# Updated version of Jeff Shaffner's instructions,
# http://jeffshaffner.wordpress.com/2012/09/27/how-to-convert-ubuntu-12-04-to-linux-mint-13/
# Run with "sudo sh ubuntu-to-mint.sh".

# Abort the script as soon as any command exits with a non-zero status.
set -e
# Trace mode: print each command as it is executed.
set -x

# Refresh the package index and, only if that succeeds, upgrade the installed
# packages without prompting for confirmation.
apt-get update && apt-get --yes upgrade

add-apt-repository "deb http://packages.linuxmint.com/ olivia \

## README
Sentiment analysis experiment using scikit-learn
================================================

The script sentiment.py reproduces the sentiment analysis approach from Pang,
Lee and Vaithyanathan (2002), who tried to classify movie reviews as positive
or negative, with three differences:

* tf-idf weighting is applied to terms
* the three-fold cross validation split is different
* regularization is tuned by cross validation

## git-detach.sh
#!/bin/sh

# Usage: git-detach <directory> <target>
# Creates a new Git repository at <target> from the contents of <directory>
# and its history. Does not remove the directory from its original repo.
#
# E.g.: suppose project/ is a Git repository with a subdirectory lib/, then
#   git-detach project/lib/ standalone-lib/
# creates a new repository standalone-lib/ holding the contents of project/lib/
# as a separate repo.

## gist:3745866
>>> from pandas import DataFrame
>>> from sklearn.feature_extraction.text import CountVectorizer
>>> docs = ["You can catch more flies with honey than you can with vinegar.",
...         "You can lead a horse to water, but you can't make him drink."]
>>> vect = CountVectorizer(min_df=0., max_df=1.0)
>>> X = vect.fit_transform(docs)
>>> print(DataFrame(X.A, columns=vect.get_feature_names()).to_string())
   but  can  catch  drink  flies  him  honey  horse  lead  make  more  than  to  vinegar  water  with  you
0    0    2      1      0      1    0      1      0     0     0     1     1   0        1      0     2    2
1    1    2      0      1      0    1      0      1     1     1     0     0   1        0      1     0    2

## heaps.erl
% Copyright (c) 2010-2014, Lars Buitinck
% May be used, redistributed and modified under the terms of the
% GNU Lesser General Public License (LGPL), version 2.1 or later

% Heaps/priority queues in Erlang

% Heaps are data structures that return the entries inserted into them in
% sorted order. This makes them the data structure of choice for implementing
% priority queues, a central element of algorithms such as best-first/A*
% search and Kruskal's minimum-spanning-tree algorithm.

## kmeans.py
#!/usr/bin/python
#
# K-means clustering using Lloyd's algorithm in pure Python.
# Written by Lars Buitinck. This code is in the public domain.
#
# The main program runs the clustering algorithm on a bunch of text documents
# specified as command-line arguments. These documents are first converted to
# sparse vectors, represented as lists of (index, value) pairs.

from collections import defaultdict
	# Quick and dirty Brat-to-CSV conversion.

	from __future__ import print_function
	import csv
	import io
	import re
	import sys

	# copy server/src/{gtbtokenize,tokenise}.py from Brat
	from tokenise import gtb_token_boundary_gen
	cimport numpy as np


	# Takes three 1-d arrays named data/indices/indptr plus a float64 output
	# array. Judging by the function name these are the components of a CSC
	# sparse matrix and `out` receives the per-column maximum -- TODO confirm,
	# the function body is not part of this excerpt.
	def csc_columnwise_max(np.ndarray[np.float64_t, ndim=1] data,
			np.ndarray[int, ndim=1] indices,
			np.ndarray[int, ndim=1] indptr,
			np.ndarray[np.float64_t, ndim=1] out):
		cdef double mx  # presumably the running maximum of the current column
		# One less than the length of indptr (assumes the usual CSC layout
		# where indptr has n_columns + 1 entries -- TODO confirm).
		cdef int n_features = indptr.shape[0] - 1
		cdef int i, j  # loop counters; their use lies outside this excerpt
	from sklearn.base import BaseEstimator, TransformerMixin
	from sklearn.metrics.pairwise import rbf_kernel


	class KMeansTransformer(BaseEstimator, TransformerMixin):
		"""Estimator that is parameterised by a fixed set of centroids.

		Only the constructor and ``fit`` are visible in this excerpt; any
		transformation logic is defined elsewhere.

		Parameters
		----------
		centroids : object
			Stored unchanged as ``self.centroids``. Presumably an array of
			cluster centres -- TODO confirm against the caller.
		"""

		def __init__(self, centroids):
			self.centroids = centroids

		def fit(self, X, y=None):
			"""Do nothing and return ``self``; ``X`` and ``y`` are ignored."""
			return self
	import numpy as np
	import os
	import sys

	from collections import defaultdict
	from git import Repo
	from scipy.sparse import csc_matrix

	# First command-line argument. Presumably the location of the Git repository
	# to inspect (`Repo` is imported above) -- TODO confirm against the rest of
	# the script. Raises IndexError when the script is run without arguments.
	path = sys.argv[1]
	# File suffixes of interest: Python sources plus Cython sources and headers.
	extensions = [".py", ".pyx", ".pxd"]
	# Updated version of Jeff Shaffner's instructions,
	# http://jeffshaffner.wordpress.com/2012/09/27/how-to-convert-ubuntu-12-04-to-linux-mint-13/
	# Run with "sudo sh ubuntu-to-mint.sh".

	# Abort the script as soon as any command exits with a non-zero status.
	set -e
	# Trace mode: print each command as it is executed.
	set -x

	# Refresh the package index and, only if that succeeds, upgrade the installed
	# packages without prompting for confirmation.
	apt-get update && apt-get --yes upgrade

	add-apt-repository "deb http://packages.linuxmint.com/ olivia \
	Sentiment analysis experiment using scikit-learn
	================================================

	The script sentiment.py reproduces the sentiment analysis approach from Pang,
	Lee and Vaithyanathan (2002), who tried to classify movie reviews as positive
	or negative, with three differences:

	* tf-idf weighting is applied to terms
	* the three-fold cross validation split is different
	* regularization is tuned by cross validation
	#!/bin/sh

	# Usage: git-detach <directory> <target>
	# Creates a new Git repository at <target> from the contents of <directory>
	# and its history. Does not remove the directory from its original repo.
	#
	# E.g.: suppose project/ is a Git repository with a subdirectory lib/, then
	#   git-detach project/lib/ standalone-lib/
	# creates a new repository standalone-lib/ holding the contents of project/lib/
	# as a separate repo.
	>>> from pandas import DataFrame
	>>> from sklearn.feature_extraction.text import CountVectorizer
	>>> docs = ["You can catch more flies with honey than you can with vinegar.",
	... "You can lead a horse to water, but you can't make him drink."]
	>>> vect = CountVectorizer(min_df=0., max_df=1.0)
	>>> X = vect.fit_transform(docs)
	>>> print(DataFrame(X.A, columns=vect.get_feature_names()).to_string())
	   but  can  catch  drink  flies  him  honey  horse  lead  make  more  than  to  vinegar  water  with  you
	0    0    2      1      0      1    0      1      0     0     0     1     1   0        1      0     2    2
	1    1    2      0      1      0    1      0      1     1     1     0     0   1        0      1     0    2
	% Copyright (c) 2010-2014, Lars Buitinck
	% May be used, redistributed and modified under the terms of the
	% GNU Lesser General Public License (LGPL), version 2.1 or later

	% Heaps/priority queues in Erlang

	% Heaps are data structures that return the entries inserted into them in
	% sorted order. This makes them the data structure of choice for implementing
	% priority queues, a central element of algorithms such as best-first/A*
	% search and Kruskal's minimum-spanning-tree algorithm.
	#!/usr/bin/python
	#
	# K-means clustering using Lloyd's algorithm in pure Python.
	# Written by Lars Buitinck. This code is in the public domain.
	#
	# The main program runs the clustering algorithm on a bunch of text documents
	# specified as command-line arguments. These documents are first converted to
	# sparse vectors, represented as lists of (index, value) pairs.

	from collections import defaultdict