This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Quick and dirty Brat-to-CSV conversion. | |
from __future__ import print_function | |
import csv | |
import io | |
import re | |
import sys | |
# copy server/src/{gtbtokenize,tokenise}.py from Brat | |
from tokenise import gtb_token_boundary_gen |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh | |
# Usage: git-detach <directory> <target> | |
# Creates a new Git repository at <target> from the contents of <directory> | |
# and its history. Does not remove the directory from its original repo. | |
# | |
# E.g.: suppose project/ is a Git repository with a subdirectory lib/, then | |
# git-detach project/lib/ standalone-lib/ | |
# creates a new repository standalone-lib/ holding the contents of project/lib/ | |
# as a separate repo. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
#from scipy.special import chdtrc | |
from scipy.sparse import spdiags | |
from sklearn.base import BaseEstimator, TransformerMixin | |
from sklearn.preprocessing import LabelBinarizer | |
def _chisquare(f_obs, f_exp, reduce): | |
"""Replacement for scipy.stats.chisquare with custom reduction. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sentiment analysis experiment using scikit-learn | |
================================================ | |
The script sentiment.py reproduces the sentiment analysis approach from Pang, | |
Lee and Vaithyanathan (2002), who tried to classify movie reviews as positive | |
or negative, with three differences: | |
* tf-idf weighting is applied to terms | |
* the three-fold cross validation split is different | |
* regularization is tuned by cross validation |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Updated version of Jeff Shaffner's instructions, | |
# http://jeffshaffner.wordpress.com/2012/09/27/how-to-convert-ubuntu-12-04-to-linux-mint-13/ | |
# Run with "sudo sh ubuntu-to-mint.sh". | |
set -e | |
set -x | |
apt-get update && apt-get --yes upgrade | |
add-apt-repository "deb http://packages.linuxmint.com/ olivia \ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import os | |
import sys | |
from collections import defaultdict | |
from git import Repo | |
from scipy.sparse import csc_matrix | |
path = sys.argv[1] | |
extensions = [".py", ".pyx", ".pxd"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.base import BaseEstimator, TransformerMixin | |
from sklearn.metrics.pairwise import rbf_kernel | |
class KMeansTransformer(BaseEstimator, TransformerMixin): | |
def __init__(self, centroids): | |
self.centroids = centroids | |
def fit(self, X, y=None): | |
return self |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
cimport numpy as np | |
def csc_columnwise_max(np.ndarray[np.float64_t, ndim=1] data, | |
np.ndarray[int, ndim=1] indices, | |
np.ndarray[int, ndim=1] indptr, | |
np.ndarray[np.float64_t, ndim=1] out): | |
cdef double mx | |
cdef int n_features = indptr.shape[0] - 1 | |
cdef int i, j |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# | |
# K-means clustering using Lloyd's algorithm in pure Python. | |
# Written by Lars Buitinck. This code is in the public domain. | |
# | |
# The main program runs the clustering algorithm on a bunch of text documents | |
# specified as command-line arguments. These documents are first converted to | |
# sparse vectors, represented as lists of (index, value) pairs. | |
from collections import defaultdict |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> from pandas import DataFrame | |
>>> from sklearn.feature_extraction.text import CountVectorizer | |
>>> docs = ["You can catch more flies with honey than you can with vinegar.", | |
... "You can lead a horse to water, but you can't make him drink."] | |
>>> vect = CountVectorizer(min_df=0., max_df=1.0) | |
>>> X = vect.fit_transform(docs) | |
>>> print(DataFrame(X.A, columns=vect.get_feature_names()).to_string()) | |
but can catch drink flies him honey horse lead make more than to vinegar water with you | |
0 0 2 1 0 1 0 1 0 0 0 1 1 0 1 0 2 2 | |
1 1 2 0 1 0 1 0 1 1 1 0 0 1 0 1 0 2 |
NewerOlder