Skip to content

Instantly share code, notes, and snippets.

View alexanderpanchenko's full-sized avatar

Alexander Panchenko alexanderpanchenko

View GitHub Profile
%matplotlib inline
python WikiExtractor.py wiki-20180120-pages-articles-multistream.xml.bz2 --discard_elements gallery,timeline,noinclude --processes $(nproc) --filter_disambig_pages -b 100M
https://dumps.wikimedia.org/enwiki/20180720/
sed -e 's/<[^>]*>//g' file.html
@alexanderpanchenko
alexanderpanchenko / cprofile.py
Created February 1, 2018 14:05
cProfile usage
def profiling(function):
    """Run ``function`` with no arguments while a ``cProfile`` profiler is on.

    The profiler is switched off again even when ``function`` raises, so a
    failing call cannot leave profiling enabled. Any exception raised by
    ``function`` propagates to the caller unchanged.

    Args:
        function: Zero-argument callable to execute under the profiler.

    Returns:
        None. The return value of ``function`` is discarded.
    """
    import cProfile
    # NOTE(review): pstats and StringIO are imported by the original snippet
    # but never used in the visible lines; presumably a later step formats the
    # collected statistics with them -- confirm before removing.
    import pstats
    from io import StringIO

    pr = cProfile.Profile()
    pr.enable()
    try:
        function()
    finally:
        # Always stop collecting, including on the error path.
        pr.disable()
from multiprocessing import Pool
num_cores = 8
def foo(ego):
    """Return a one-entry dict mapping ``ego`` to the constant 200.

    Args:
        ego: Hashable value used as the single dictionary key.

    Returns:
        ``{ego: 200}``.
    """
    return {ego: 200}
# Fan the inputs 0..9 out to foo across a pool of num_cores worker processes.
# imap_unordered yields each result as soon as it is ready, in no fixed order.
with Pool(num_cores) as pool:
for res in pool.imap_unordered(foo, range(10)):
# NOTE(review): the loop body is not part of this excerpt.
from gensim.models.keyedvectors import KeyedVectors
import faiss
import numpy as np
from time import time
import codecs
# Start building a faiss index over the word2vec vectors stored at w2v_fpath.
# NOTE(review): this excerpt stops right after the empty index is created; no
# vectors are added and nothing is returned in the visible lines.
def build_vector_index(w2v_fpath):
# Load the vectors from the textual (non-binary) word2vec format, ignoring
# bytes that cannot be decoded.
w2v = KeyedVectors.load_word2vec_format(w2v_fpath, binary=False, unicode_errors='ignore')
# Normalise the vectors, replacing the originals (replace=True).
# NOTE(review): presumably so that inner product equals cosine similarity --
# confirm against the gensim documentation for the installed version.
w2v.init_sims(replace=True)
# Flat inner-product index whose dimensionality matches the vectors.
index = faiss.IndexFlatIP(w2v.vector_size)
%load_ext autoreload
%autoreload 2
@alexanderpanchenko
alexanderpanchenko / index.py
Created August 30, 2017 09:50
How to create an index of sentences in ES
import codecs
import json
import requests
ES_ENDPOINT = "http://localhost:9200"
# Builder for an index of sentences in Elasticsearch, per the gist description.
# NOTE(review): only the first lines of the constructor are visible here.
class IndexBuilder(object):
def __init__(self):
# presumably the name of the target index -- verify against the request code
self._index = "wsd"
# presumably the document type stored in that index -- verify
self._dtype = "sentence"
@alexanderpanchenko
alexanderpanchenko / aws-emr.sh
Created April 27, 2017 13:07
forward port to EMR
# Public DNS name of the EMR master node (placeholder: replace the xx parts).
master=ec2-xx-xx-xx-xx.eu-west-1.compute.amazonaws.com
# Forward local ports 8088, 20888 and 19888 to the same ports on the master
# over SSH. -N opens the tunnel without running a remote command, and the
# trailing & puts each tunnel in the background.
# NOTE(review): presumably the YARN ResourceManager, application proxy and
# job history web UIs -- confirm against the cluster configuration.
ssh -i ~/.ssh/ireland.pem -N -L 8088:$master:8088 hadoop@$master &
ssh -i ~/.ssh/ireland.pem -N -L 20888:$master:20888 hadoop@$master &
ssh -i ~/.ssh/ireland.pem -N -L 19888:$master:19888 hadoop@$master &
# Create a new ext4 filesystem on /dev/xvdcb.
# WARNING: this destroys any data already on the device.
sudo mkfs -t ext4 /dev/xvdcb
# Create the mount point.
sudo mkdir /mnt2
# Mount the freshly formatted device at /mnt2.
sudo mount -t ext4 /dev/xvdcb /mnt2
import os
import glob
import urllib3
import argparse
import requests
import mimetypes
from PIL import Image
from io import BytesIO
import re
from os.path import splitext, join