Skip to content

Instantly share code, notes, and snippets.

View arnehuang's full-sized avatar

Arne Huang arnehuang

View GitHub Profile
@arnehuang
arnehuang / PySpark Logging
Last active April 30, 2020 20:14
PySpark Logging
# Logging setup for the 'driver_logger' logger (requires `import logging`).
# Each record is rendered as:
#   [timestamp] LEVEL [file:function:line] message
_LOG_FORMAT = '[%(asctime)s] %(levelname)s [%(filename)s:%(funcName)s:%(lineno)d] %(message)s'
logging.basicConfig(format=_LOG_FORMAT)

# Named logger with its threshold lowered to DEBUG so that debug-level
# records are not filtered out by this logger.
logger = logging.getLogger('driver_logger')
logger.setLevel(logging.DEBUG)
@arnehuang
arnehuang / db_helpers
Last active May 25, 2018 17:11
psycopg2 (Postgres, Python 3): connect to a database and execute a query
import psycopg2.extras
import os
def connect_db(autocommit=False):
""" Connects to a data base and returns a connection and a cursor. """
conn = psycopg2.connect(
database=os.environ.get('DATABASE_DBNAME'),
user=os.environ.get('DATABASE_USERNAME'),
host=os.environ.get('DATABASE_HOST'),
password=os.environ.get('DATABASE_PASSWORD')
@arnehuang
arnehuang / ngram_cosine.py
Created April 11, 2018 13:45
Matching two addresses via cosine similarity of one-hot-encoded character n-grams
from nltk.util import ngrams
from sklearn.metrics.pairwise import cosine_similarity
import string
import itertools
# Every possible 3-character string over the lowercase ASCII letters and the
# digits 0-9 (36 symbols -> 36**3 entries), in itertools.product order
# ('aaa', 'aab', ..., '999').
_TRIGRAM_ALPHABET = string.ascii_lowercase + string.digits
vector_of_possibilities = list(map(''.join, itertools.product(_TRIGRAM_ALPHABET, repeat=3)))
def get_3grams(astring):
    """Return the character trigrams of *astring* as a list of strings.

    The input is lowercased and every character for which ``isalnum()`` is
    false is discarded before the trigrams are formed, so a trigram may span
    characters that were separated by spaces or punctuation in the original.
    """
    kept_chars = []
    for ch in astring.lower():
        if ch.isalnum():
            kept_chars.append(ch)
    # ngrams yields tuples of 3 consecutive characters; join each into a string.
    return list(map(''.join, ngrams(kept_chars, 3)))