drj42 / org-mode-reference-in.org
Created February 6, 2012 23:53
This is a cheat sheet for Emacs org-mode... in org-mode format!

drj42 / gist:7221275
Created October 29, 2013 19:44
Mojolicious Ingress cemetery portal name generator
#!/usr/bin/env perl
use Mojolicious::Lite;
# Instructions:
# $ cpanm Mojolicious
# $ morbo grave_names.pl
#
# browse to http://localhost:3000
# Documentation browser under "/perldoc"

drj42 / csvprocessor.py
Last active August 29, 2015 14:18 — forked from miku/csvprocessor.py
from luigi.format import Format
import csvkit


class CSVOutputProcessor(object):
    """
    A simple CSV output processor to be hooked into Format's
    `pipe_writer`.

    If `cols` are given, the names are used as the CSV header; otherwise no
    explicit header is written.
    """
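
The preview stops inside the docstring. A rough sketch of what such a
processor can look like, using the stdlib csv module and hypothetical
method names rather than the gist's actual API:

import csv


class CSVOutputProcessor(object):
    """Wrap an output pipe and write rows to it as CSV.

    If `cols` is given, it is written first as the header row.
    (Illustrative sketch only.)
    """
    def __init__(self, pipe, cols=None):
        self._pipe = pipe
        self._writer = csv.writer(pipe)
        if cols:
            self._writer.writerow(cols)

    def write(self, row):
        self._writer.writerow(row)

    def close(self):
        self._pipe.close()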

drj42 / gist:a1ff0e57e11e930291da
Last active August 29, 2015 14:22
Load CSVs in Spark
"""
Launch pyspark with the flag:
--packages com.databricks:spark-csv_2.10:1.0.3
"""
from pyspark.sql import SQLContext
from pyspark.sql.types import StructField, StringType, StructType
def create_schema(fields):
""" Create a dataframe schema from a list of field names.

drj42 / task_params.py
Created November 21, 2015 16:32
TaskParameter quirks
import luigi


class Foo(luigi.Task):
    message = 'Foo'


class RunOnceTask(luigi.Task):
    my_task = luigi.TaskParameter()
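
The preview does not show which quirk is meant. One well-known TaskParameter
wrinkle, sketched here as a guess at the gist's point: the parameter holds a
task class, not an instance, so you instantiate it yourself.

import luigi


class Foo(luigi.Task):
    message = 'Foo'

    def complete(self):
        return True


class RunOnceTask(luigi.Task):
    # TaskParameter carries a Task *class* (on the command line you pass the
    # task family name); self.my_task() creates the instance.
    my_task = luigi.TaskParameter()

    def run(self):
        print(self.my_task().message)


# e.g.: luigi.build([RunOnceTask(my_task=Foo)], local_scheduler=True)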

drj42 / luigi_sftp.py
Last active January 18, 2022 17:47
SFTP remote target for luigi
"""sftp.py - PySftp connections wrapped up in a luigi.Target.
TODO: get rid of the redundant stuff, write some tests, contribute to luigi
upstream.
"""
# -*- coding: utf-8 -*-
import io
import os
import random
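
The preview ends in the imports. The core of such a target is an exists()
check over an SFTP connection; a minimal sketch with a hypothetical class
name and constructor (the gist's real interface is richer):

import luigi
import pysftp


class SftpTarget(luigi.Target):
    """Minimal luigi.Target backed by pysftp (illustrative sketch)."""

    def __init__(self, host, path, username=None, password=None):
        self.host = host
        self.path = path
        self.username = username
        self.password = password

    def exists(self):
        # luigi only requires exists(); open()/upload helpers would go here.
        with pysftp.Connection(self.host, username=self.username,
                               password=self.password) as sftp:
            return sftp.exists(self.path)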

drj42 / df_reduce_by_key.py
Created March 15, 2016 18:45
PySpark Helper Function - perform reduceByKey on a dataframe
# Removes a lot of the boilerplate involved in converting a pyspark DataFrame
# to and from an RDD in order to do a reduceByKey operation.
#
# Lifted from:
# - http://codereview.stackexchange.com/questions/115082/generic-reduceby-or-groupby-aggregate-functionality-with-spark-dataframe
from pyspark.sql import Row
from pyspark.sql.functions import struct
from pyspark.sql import DataFrame
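
The preview stops after the imports. One way such a helper can be written
(hypothetical name and signature, following the linked Code Review answer's
general approach rather than the gist verbatim):

from pyspark.sql import Row


def reduce_by_key(df, key_cols, value_col, func):
    """Fold `value_col` with `func` for each distinct combination of
    `key_cols`, returning the result as a new DataFrame."""
    def to_pair(row):
        return tuple(row[c] for c in key_cols), row[value_col]

    def to_row(kv):
        key, value = kv
        fields = dict(zip(key_cols, key))
        fields[value_col] = value
        return Row(**fields)

    reduced = df.rdd.map(to_pair).reduceByKey(func)
    return df.sql_ctx.createDataFrame(reduced.map(to_row))


# e.g.: totals = reduce_by_key(df, ['country'], 'amount', lambda a, b: a + b)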

drj42 / s3.py
Created January 27, 2017 18:56 — forked from stalkerg/s3.py
Async Tornado S3 uploader with AWS4 sign
import hashlib
import hmac
import mimetypes
import binascii
from calendar import timegm
from datetime import datetime
import time
from email.utils import formatdate
from urllib.parse import quote, urlparse
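
The preview shows only the imports. The heart of any AWS4 signer is the
signing-key derivation, which is fixed by Amazon's spec; the helper names
below are mine, not necessarily the gist's:

import hashlib
import hmac


def _hmac_sha256(key, msg):
    return hmac.new(key, msg.encode('utf-8'), hashlib.sha256).digest()


def aws4_signing_key(secret_key, date_stamp, region, service):
    """Derive the AWS Signature Version 4 signing key.

    date_stamp is 'YYYYMMDD'; region is e.g. 'us-east-1'; service e.g. 's3'.
    """
    k_date = _hmac_sha256(('AWS4' + secret_key).encode('utf-8'), date_stamp)
    k_region = _hmac_sha256(k_date, region)
    k_service = _hmac_sha256(k_region, service)
    return _hmac_sha256(k_service, 'aws4_request')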

drj42 / asyncio_producer_consumer.py
Created January 30, 2017 19:47 — forked from akrylysov/asyncio_producer_consumer.py
Python 3 asyncio basic producer / consumer example
import asyncio
import random

q = asyncio.Queue()


async def producer(num):
    while True:
        await q.put(num + random.random())
        await asyncio.sleep(random.random())
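
The preview cuts off before the consumer side. A complete minimal version of
the same pattern (the consumer and event-loop wiring are my reconstruction,
not necessarily the gist's):

import asyncio
import random

q = asyncio.Queue()


async def producer(num):
    while True:
        await q.put(num + random.random())
        await asyncio.sleep(random.random())


async def consumer():
    while True:
        value = await q.get()
        print('consumed', value)


loop = asyncio.get_event_loop()
for i in range(3):
    loop.create_task(producer(i))
loop.create_task(consumer())
loop.run_forever()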

drj42 / async_worker_pool.py
Created January 30, 2017 21:06 — forked from thehesiod/async_worker_pool.py
Asynchronous Worker Pool
import asyncio
from datetime import datetime, timezone
import os


def utc_now():
    # utcnow() returns a naive datetime, so we have to set the timezone
    # manually <sigh>
    return datetime.utcnow().replace(tzinfo=timezone.utc)


class Terminator:
    pass
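
The preview ends at the Terminator sentinel. A minimal sketch of the pattern
it implies, a fixed set of workers draining a queue until each sees a
Terminator (names beyond those in the preview are assumptions):

import asyncio


class Terminator:
    pass


async def worker(queue):
    while True:
        item = await queue.get()
        if isinstance(item, Terminator):
            break
        await item  # each queued item is an awaitable to execute


async def run_pool(awaitables, num_workers=4):
    queue = asyncio.Queue()
    workers = [asyncio.ensure_future(worker(queue))
               for _ in range(num_workers)]
    for aw in awaitables:
        await queue.put(aw)
    for _ in range(num_workers):
        await queue.put(Terminator())
    await asyncio.gather(*workers)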