Skip to content

Instantly share code, notes, and snippets.

View bwang482's full-sized avatar

Bo Wang bwang482

View GitHub Profile
@bwang482
bwang482 / sentence_embedding.py
Last active March 10, 2017 19:06
Sentence embedding method in [Arora et al. ICLR 2017] - https://openreview.net/pdf?id=SyK00v5xx
from __future__ import division
import gensim
import itertools
import numpy as np
from collections import Counter
from sklearn.decomposition import PCA
def gensim_load_vec(path):
w2v_model = gensim.models.Word2Vec.load_word2vec_format(path, binary=False)
@bwang482
bwang482 / kld.py
Last active February 20, 2019 17:26
A small modification of https://gist.github.com/mrorii/961963
#!/usr/bin/python
import re, math, collections
from collections import Counter
def tokenize(_str):
stopwords = ['and', 'for', 'if', 'the', 'then', 'be', 'is', 'are', 'will', 'in', 'it', 'to', 'that']
tokens = collections.defaultdict(lambda: 0.)
for m in re.finditer(r"(\w+)", _str, re.UNICODE):
m = m.group(1).lower()