Skip to content

Instantly share code, notes, and snippets.


Izhar Firdaus kagesenshi

View GitHub Profile
View sysinfo-satsuki.txt
Computer Information:
Manufacturer: Unknown
Model: Unknown
Form Factor: Desktop
No Touch Input Detected
Processor Information:
CPU Vendor: GenuineIntel
CPU Brand: Intel(R) Core(TM) i5-4460 CPU @ 3.20GHz
CPU Family: 0x6
kagesenshi /
Last active Oct 24, 2019
Hubcap data transformation helpers
# Data cleansing helper functions
from pyspark.sql import functions as F
from pyspark.sql import Window, DataFrame
from IPython.display import HTML, Markdown
from import VectorAssembler
import pandas as pd
import matplotlib.pyplot as plt
def transform(self, function, *args, **kwargs):
    """Pipe *self* through *function* and return whatever it produces.

    Monkey-patch helper (presumably attached to pyspark's DataFrame,
    judging by the surrounding imports -- confirm against the caller)
    that enables a fluent df.transform(f, ...) chaining style.
    Extra positional and keyword arguments are forwarded to *function*.
    """
    result = function(self, *args, **kwargs)
    return result
kagesenshi /
Created Jan 22, 2016
Jupyterhub Spawner with Kerberos keytabs
# to use this, set REALM to your KRB realm, and create keytabs for each user in
# /etc/security/keytabs/<username>.jupyter.keytab
# Save this file in your site-packages directory as
# then in your JupyterHub config (e.g. /etc/jupyterhub/, set:
# c.JupyterHub.spawner_class = 'krbspawner.KerberosSpawner'
kagesenshi /
Last active Dec 2, 2015
Template for writing Python Hive UDF Transform that supports multiprocessing
from __future__ import print_function
import sys
from multiprocessing import Process, Pool, cpu_count
def transform(*args):
    """Template stub: turn one parsed input record into output rows.

    Part of a Hive UDF transform skeleton -- replace the body with the
    real per-record logic. Until then it yields no output rows.
    """
    # -- do something here --
    return []
# Parse one tab-separated input line from the Hive TRANSFORM stream.
# NOTE(review): this snippet appears truncated by the page scrape --
# the visible body ends right after the split, so as shown the
# function returns None; the original presumably goes on to call
# transform() on the split fields. Confirm against the full gist.
def process_line(line):
line = line.strip().split('\t')
kagesenshi /
Last active Nov 27, 2015
PySpark CSV to DataFrame
# Convert an RDD of raw CSV text lines into an RDD of pyspark Rows.
# NOTE(review): indentation was lost in the page scrape and the
# function is cut off here -- extract_row below is presumably nested
# inside it; confirm against the original gist before reuse.
def csvRDD_to_rowRDD(rdd):
# expects an RDD whose elements are raw CSV lines,
# eg: rdd = sc.textFile('myfile.csv')
from pyspark.sql import Row
# pair every line with its position so e.g. a header row can be told apart
rdd = rdd.zipWithIndex()
# sentinel key -- presumably used to tag rows that fail to parse; the
# code that uses it is not visible in this fragment
fail_key = 'X_IMPORT_FAIL'
def extract_row(keys):
from datetime import datetime, timedelta
import urllib
import re
from lxml.html import fromstring
from cssselect import GenericTranslator, SelectorError
import os
import json
# NOTE(review): empty base URL -- the literal was likely stripped when
# this snippet was scraped; it must be filled in before the scraper runs.
base_url = ''
import urllib
import json
import re
from dateutil.parser import parse as parse_date
from datetime import datetime
# NOTE(review): Python 2 idiom -- urllib.urlopen was removed in Python 3
# (use urllib.request.urlopen). The URL literal is empty, presumably
# stripped by the page scrape; must be restored before this runs.
f = urllib.urlopen("").read()
# accumulator for the first transformation stage (rest of script not visible)
stage1 = []
kagesenshi /
Last active Jan 12, 2016
Epidemic Week Calculator
from datetime import date
from datetime import timedelta
import copy
# ported from npmjs epi-week package
#getFirstWeek = (year) ->
# end = new Date(year, 0, 1)
kagesenshi /
Created Jul 9, 2015
Berita Harian Headline Scraper
import scrapy
import argh
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
kagesenshi /
Last active May 12, 2017
Facebook GraphAPI Crawler
import facebook
import argh
import requests
from ConfigParser import ConfigParser
from pprint import pprint
import time
import json
import logging
import traceback