Skip to content

Instantly share code, notes, and snippets.

View mndrake's full-sized avatar
🏠
Working from home

David Carlson mndrake

🏠
Working from home
View GitHub Profile
@mndrake
mndrake / anomaly_comments_recipe.py
Last active June 3, 2022 17:51
Extract Cluster Comments for Anomaly
from urllib.parse import urlparse
import re
import dataiku
import pandas as pd
PROJECT_ID = 'CUSTOMERSEGMENTATION'
ANALYSIS_ID = 'UjW24hJ1'
ML_TASK_ID = 'LsiobCLw'
MODEL_ID = 'A-CUSTOMERSEGMENTATION-UjW24hJ1-LsiobCLw-s1-pp1-m1'
from io import StringIO
import logging
class StreamingLog(object):
    """Collect log output in an in-memory buffer.

    Wires a ``logging.StreamHandler`` backed by an ``io.StringIO`` object so
    that formatted log records can later be read back from ``self.stream``.

    NOTE(review): this gist preview appears truncated — as shown, the handler
    is never attached to the logger and ``level`` is unused; the full gist
    presumably does both (``setLevel`` / ``addHandler``). Confirm against the
    complete source before relying on this excerpt.
    """

    def __init__(self, logger_name, level=logging.INFO):
        # In-memory buffer that receives the formatted log records.
        self.stream = StringIO()
        self.handler = logging.StreamHandler(self.stream)
        # Standard "timestamp - logger - level - message" record layout.
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        self.handler.setFormatter(formatter)
        # Named logger; other code sharing logger_name gets the same object.
        self.log = logging.getLogger(logger_name)
@mndrake
mndrake / add_packages.R
Created March 22, 2022 16:22
miniCRAN Example
# miniCRAN builds a local, partial CRAN repository; remotes provides
# package download/installation helpers.
library(miniCRAN)
library(remotes)
# CRAN mirror to use (recommend checkpoint date for installed version of R https://mran.microsoft.com/timemachine)
cran_repo <- c(CRAN = "https://cran.microsoft.com/snapshot/2018-11-30")
# local path to create miniCRAN repo
# (presumably consumed by makeRepo()/addPackage() calls later in the gist)
miniCRAN_dir <- "/data/dataiku/miniCRAN"
@mndrake
mndrake / create_cluster.py
Last active May 22, 2020 19:05
Databricks cluster creation and config for Databricks Connect
#!python
import functools
import json
import os
import requests
import urllib
import uuid
import configparser
# TODO: CURRENTLY ONLY WORKS FOR AWS, NEED TO ADD ADDITIONAL PARSING FOR AZURE
@mndrake
mndrake / 00-setup.py
Last active May 2, 2020 00:22
Setup Jupyter kernel for Databricks dbconnect
from IPython.core.magic import line_magic, line_cell_magic, Magics, magics_class
from pyspark.sql import SparkSession
from pyspark.dbutils import DBUtils
# Obtain (or create) the active Spark session and expose the Databricks
# dbutils helper — mirrors the names a Databricks notebook provides, so
# notebook-style code runs unchanged under Databricks Connect.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
dbutils = DBUtils(sc)
@magics_class
class DatabricksConnectMagics(Magics):
@mndrake
mndrake / hocr_parse.py
Created September 12, 2019 15:23
HOCR output parsing from pytesseract
#!/usr/bin/env python
# coding: utf-8
# In[11]:
# dependencies
import pytesseract
from bs4 import BeautifulSoup
from PIL import Image
@mndrake
mndrake / .vimrc
Last active June 18, 2018 02:42
python 2.7 vim config
"*****************************************************************************
"" Vim-PLug core
"*****************************************************************************
" Turn off legacy vi compatibility while vim is still starting up.
if has('vim_starting')
  set nocompatible               " Be iMproved
endif

" Flags consumed by the vim-bootstrap generated sections of this config.
let g:vim_bootstrap_langs = "html, javascript,python,scala"
let g:vim_bootstrap_editor = "vim"
@mndrake
mndrake / init.vim
Last active June 17, 2018 19:39
neovim init.vim
"*****************************************************************************
"" Vim-PLug core
"*****************************************************************************
" Turn off legacy vi compatibility while (neo)vim is still starting up.
if has('vim_starting')
  set nocompatible               " Be iMproved
endif

" Path where vim-plug installs itself for neovim; presumably checked below
" to auto-download vim-plug on first run (continuation not shown here).
let vimplug_exists=expand('~/.config/nvim/autoload/plug.vim')
" Languages consumed by the vim-bootstrap generated sections of this config.
let g:vim_bootstrap_langs = "html,javascript,python,scala"
@mndrake
mndrake / start.sh
Last active June 17, 2018 15:22
domino start script for jupyterlab
#!/bin/bash
# Fail fast: abort on undefined variables, non-zero exits, and pipeline errors.
set -o nounset -o errexit -o pipefail
# IP bound to eth0, parsed from ifconfig output
# (the "inet addr:" format is the legacy net-tools layout).
IP_ADDR=$(/sbin/ifconfig eth0 | grep "inet addr" | cut -d ":" -f2 | cut -d " " -f1)
# IPython profile directory and the notebook config file appended to below.
CONF_DIR="$HOME/.ipython/profile_default"
CONF_FILE="${CONF_DIR}/ipython_notebook_config.py"
mkdir -p "${CONF_DIR}"
cat <<EOF >>"${CONF_FILE}"
@mndrake
mndrake / parquet_split.py
Created May 27, 2018 13:20
read/write to split parquet files
import os
from io import BytesIO
import pyarrow as pa
import pyarrow.parquet as pq
# Size constants for splitting parquet output into chunks.
kilobytes = 1024
# NOTE(review): a "megabyte" here is 1024 * 1000 = 1,024,000 bytes — neither
# a MiB (2**20) nor an SI MB (10**6). This matches the classic Python
# Cookbook file-split recipe; confirm the mixed 1024/1000 factor is intended.
megabytes = kilobytes * 1000
# Target size per split: ~10 "megabytes" (10,240,000 bytes).
chunksize = int(10 * megabytes)