Skip to content

Instantly share code, notes, and snippets.

"""Tool to check if function/class definitions in Python files match with
their __all__ attribute. Rudimentary support for Cython.
"""
import sys
import re
from collections import Counter
for filename in sys.argv[1:]:
with open(filename, 'rt') as inp:
@andreasvc
andreasvc / aclrename.py
Created January 31, 2016 17:37
Script to rename papers from ACL Anthology to 'author year title.pdf'
"""Script to rename papers from ACL Anthology to 'author year title.pdf'
Given PDF files from the ACL anthology http://aclweb.org/anthology/
this script downloads the bibtex file and extracts author, year, and title
to suggest more descriptive names.
Before: N04-1016.pdf
After: Lapata & Keller 2004 The Web as a Baseline: Evaluating the Perform[...]
Usage:
"""Interactive scatter plot using MPLD3 with API inspired by seaborn."""
import mpld3
import numpy as np
import pandas
import matplotlib
import matplotlib.cm as cm
import matplotlib.lines as mlines
import matplotlib.pyplot as plt
@andreasvc
andreasvc / lineidx.py
Last active August 29, 2015 14:26
Benchmark of indexing of line offsets in text file.
"""Benchmark of indexing of line offsets in text file.
Usage example:
>>> index = indexfile_iter('1027.txt')
>>> index[5]
115
>>> import bisect
>>> bisect.bisect(index, 115) - 1
5
@andreasvc
andreasvc / bow.py
Created July 8, 2015 15:58
Extract Bag-of-Words (BOW) models from a corpus of text files.
"""Extract several BOW models from a corpus of text files.
The models are stored in Matrix Market format which can be read
by gensim. The texts are read from .txt files in the directory
specified as TOPDIR. The output is written to the current directory."""
# NB: All strings are utf8 (not unicode).
import os
import glob
import nltk
import gensim
# -*- coding: UTF-8 -*-
"""Preprocessing of text files.
Writes one paragraph per line, and normalizes punctuation & whitespace.
No sentence or word tokenization.
Usage: preprocess.py [FILE]
or: preprocess.py --batch FILES...
By default, produce cleaned version given a single filename to standard output.
Diagnostic information is written to standard error.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# requires sdsl (sdsl-lite):
# git clone https://github.com/simongog/sdsl-lite.git
# cd sdsl-lite
# ./install.sh $HOME/.local
# uses pv to display progress (not essential)
# http://www.ivarch.com/programs/pv.shtml
all: fm-index indices
@andreasvc
andreasvc / jsoneq.py
Last active August 29, 2015 14:09
Unordered equality test of JSON data
"""Convert JSON to an immutable representation so that equality can be tested
without regard for order."""
import json
class decoder(json.JSONDecoder):
# http://stackoverflow.com/questions/10885238/python-change-list-type-for-json-decoding
def __init__(self, list_type=list, **kwargs):
json.JSONDecoder.__init__(self, **kwargs)
# Use the custom JSONArray