Skip to content

Instantly share code, notes, and snippets.

View urigoren's full-sized avatar

Uri Goren urigoren

View GitHub Profile
from collections import namedtuple
from datetime import datetime
# strftime/strptime-style format string: ISO-8601-like timestamp with
# fractional seconds (%f) and a literal trailing "Z".
date_pattern = "%Y-%m-%dT%H:%M:%S.%fZ"
# Lightweight immutable 2-D point record with fields ``x`` and ``y``.
Point = namedtuple("Point", ("x", "y"))
def serialize_datetime(nt):
assert hasattr(nt, '_asdict')
import numpy as np
import pdfplumber
import itertools, collections, sys, os, re, json
from pprint import pprint as pr
from copy import deepcopy
from operator import itemgetter as at
class CartesianText:
__slots__ = ["text", "x0", "x1", "y0", "y1", "page_height"]
import collections
import wikipedia
from bs4 import BeautifulSoup
def infobox(wiki_page):
"""Returns the infobox of a given wikipedia page"""
if isinstance(wiki_page, str):
wiki_page = wikipedia.page(wiki_page)
try:
soup = BeautifulSoup(wiki_page.html()).find_all("table", {"class": "infobox"})[0]
import numpy as np
import collections, itertools, string
from scipy.cluster import hierarchy
from scipy.spatial import distance
from sklearn.feature_extraction import text
from editdistance import distance as editdistance
def edit_pdist(toks, normalize=False):
"""Return pairwise editdistance matrix"""
n = len(toks)
"""
A python wrapper for the icount.co.il api
https://www.icount.co.il/api-v3/
"""
import json
from urllib import request, parse
def post(url, data):
req = request.Request(url, data=parse.urlencode(data).encode())
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
import re
from html import unescape
def html2text(htm):
ret = unescape(htm)
ret = ret.translate({
8209: ord('-'),
ord('`'): ord("'"),
ord('’'): ord("'"),
8220: ord('"'),
8221: ord('"'),
@urigoren
urigoren / json2csv.py
Created September 26, 2018 21:49
A simple command line tool that transforms a json of format: `{"word": count}` to a readable CSV format
import json, sys
from operator import itemgetter as at
def json2csv(fname):
    """Convert a ``{"word": count}`` JSON file into a two-column CSV file.

    The CSV is written next to the input, with the trailing ``.json``
    extension swapped for ``.csv``. Rows are ordered by value, largest
    first; keys are always double-quoted, with embedded double quotes
    doubled as CSV requires.

    Args:
        fname: Path to the input file; must end with ``.json``.

    Returns:
        The path of the CSV file that was written.

    Raises:
        ValueError: If ``fname`` does not end with ``.json``.
    """
    suffix = '.json'
    if not fname.endswith(suffix):
        raise ValueError('expected a .json file, got: {}'.format(fname))
    # Swap only the trailing extension. ``str.replace`` would also rewrite
    # any earlier ".json" in the path (e.g. a directory named "data.json.d").
    out_name = fname[:-len(suffix)] + '.csv'
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(fname, 'r', encoding='utf-8') as f:
        d = json.load(f)
    with open(out_name, 'w', encoding='utf-8') as f:
        f.write('{k},{v}\n'.format(k="key", v="val"))
        for k, v in sorted(d.items(), key=at(1), reverse=True):
            f.write('"{k}",{v}\n'.format(k=k.replace('"', '""'), v=v))
    return out_name


if __name__ == "__main__":
    # Command-line entry point: json2csv.py FILE.json
    if len(sys.argv) < 2:
        sys.exit('usage: json2csv.py FILE.json')
    try:
        json2csv(sys.argv[1])
    except ValueError as err:
        sys.exit(str(err))
categorical_data = [
(0,1,2),
(0,1),
(0,1,3),
(0,1,3),
(0,1,3),
(0,1,2,3),
(2, 3),
(2, 3),
(2, 3),
import sys, os, json, subprocess
from argparse import ArgumentParser
# Absolute path of the directory that contains this file.
# NOTE(review): since Python 3.7 (PEP 562) a module-level ``__dir__`` is the
# hook that ``dir(module)`` calls; binding a string to it makes ``dir()`` on
# this module raise TypeError. Consider renaming if the module is ever
# imported and introspected — confirm against the rest of the script first.
__dir__ = os.path.dirname(os.path.abspath(__file__))
def shell(cmd):
"""Run bash command"""
process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = process.communicate()