Izhar Firdaus (kagesenshi)

kagesenshi / sysinfo-satsuki.txt
Computer Information:
Manufacturer: Unknown
Model: Unknown
Form Factor: Desktop
No Touch Input Detected
Processor Information:
CPU Vendor: GenuineIntel
CPU Brand: Intel(R) Core(TM) i5-4460 CPU @ 3.20GHz
CPU Family: 0x6
kagesenshi / hubcap.py
Last active Oct 24, 2019
Hubcap data transformation helpers
# Data cleansing helper functions
from pyspark.sql import functions as F
from pyspark.sql import Window, DataFrame
from IPython.display import HTML, Markdown
from pyspark.ml.feature import VectorAssembler
import pandas as pd
import matplotlib.pyplot as plt
def transform(self, function, *args, **kwargs):
    # apply `function` to this DataFrame and return the result, so transformations can be chained
    return function(self, *args, **kwargs)
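The helper above is meant to be attached to the DataFrame class so cleansing steps can be chained (newer Spark releases ship a built-in DataFrame.transform with similar behaviour). A minimal usage sketch, assuming a running SparkSession; add_total is a hypothetical cleansing step, not part of the gist:

from pyspark.sql import SparkSession

DataFrame.transform = transform   # attach the helper defined above

def add_total(df, cols):
    # hypothetical step: sum the given columns into a new 'total' column
    return df.withColumn('total', sum(F.col(c) for c in cols))

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 2), (3, 4)], ['a', 'b'])
cleaned = (df.transform(add_total, ['a', 'b'])
             .transform(lambda d: d.filter(F.col('total') > 3)))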
kagesenshi / krbspawner.py
Created Jan 22, 2016
JupyterHub Spawner with Kerberos keytabs
# to use this, set REALM to your KRB realm, and create keytabs for each user in
# /etc/security/keystabs/<username>.jupyter.keytab
#
# Save this file in your site-packages directory as krbspawner.py
#
# then in /etc/jupyterhub_config.py, set:
#
# c.JupyterHub.spawner_class = 'krbspawner.KerberosSpawner'
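The preview only shows the setup notes. A minimal sketch of what such a spawner could look like, assuming jupyterhub's LocalProcessSpawner, a kinit binary on the hub host, and the keytab layout from the comments above; REALM and the credential-cache path are placeholders, not taken from the gist:

import subprocess
from jupyterhub.spawner import LocalProcessSpawner

REALM = 'EXAMPLE.COM'  # placeholder; set to your Kerberos realm

class KerberosSpawner(LocalProcessSpawner):
    def _ccache(self):
        # per-user ticket cache; must be readable by the spawned user's process
        return '/tmp/krb5cc_%s_jupyter' % self.user.name

    def get_env(self):
        env = super().get_env()
        env['KRB5CCNAME'] = self._ccache()
        return env

    async def start(self):
        user = self.user.name
        keytab = '/etc/security/keystabs/%s.jupyter.keytab' % user  # path as given above
        # obtain a ticket from the user's keytab before the notebook server starts
        subprocess.check_call(['kinit', '-kt', keytab, '-c', self._ccache(),
                               '%s@%s' % (user, REALM)])
        return await super().start()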
kagesenshi / multiprocess_python_udf.py
Last active Dec 2, 2015
Template for writing a Python Hive TRANSFORM UDF that supports multiprocessing
from __future__ import print_function
import sys
from multiprocessing import Process, Pool, cpu_count
def transform(*args):
    # -- do something here --
    return []

def process_line(line):
    fields = line.strip().split('\t')
    return transform(*fields)
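Hive's TRANSFORM streams rows as tab-separated lines on stdin and reads results from stdout, so the per-line work can be fanned out over a process pool. A sketch of the driver loop such a template would end with, reusing the imports above; the chunk size is illustrative, not from the gist:

def main():
    pool = Pool(cpu_count())
    # each worker parses a line and returns zero or more output rows
    for rows in pool.imap(process_line, sys.stdin, chunksize=100):
        for row in rows:
            print('\t'.join(str(col) for col in row))

if __name__ == '__main__':
    main()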
kagesenshi / pyspark_csvrdd_to_rowrdd.py
Last active Nov 27, 2015
PySpark CSV to DataFrame
def csvRDD_to_rowRDD(rdd):
    # expects an RDD of CSV text lines,
    # e.g.: rdd = sc.textFile('myfile.csv')
    from pyspark.sql import Row
    rdd = rdd.zipWithIndex()
    fail_key = 'X_IMPORT_FAIL'
    def extract_row(keys):
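The preview cuts off inside extract_row. A self-contained sketch of the same idea, assumed rather than taken from the gist: treat the first CSV line as the header and turn the remaining lines into Row objects that Spark can infer a DataFrame schema from:

from pyspark.sql import Row

def csv_rdd_to_row_rdd(sc, path):
    rdd = sc.textFile(path)
    header = rdd.first()
    keys = [k.strip() for k in header.split(',')]
    def extract_row(line):
        values = [v.strip() for v in line.split(',')]
        return Row(**dict(zip(keys, values)))
    return rdd.filter(lambda line: line != header).map(extract_row)

# usage: df = spark.createDataFrame(csv_rdd_to_row_rdd(sc, 'myfile.csv'))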
kagesenshi / doe_air_polution_index_historical_scraper.py
from datetime import datetime, timedelta
import urllib
import re
from lxml.html import fromstring
from cssselect import GenericTranslator, SelectorError
import os
import json
base_url = 'http://apims.doe.gov.my/v2/'
HOURS = {
kagesenshi / doe_air_polution_index_scraper.py
import urllib
import json
import re
from dateutil.parser import parse as parse_date
from datetime import datetime
f = urllib.urlopen("http://apims.doe.gov.my/v2/").read()
stage1 = []
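Note that urllib.urlopen exists only on Python 2, which this gist targets; on Python 3 the equivalent fetch would be roughly:

from urllib.request import urlopen
f = urlopen("http://apims.doe.gov.my/v2/").read().decode('utf-8')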
kagesenshi / epiweek.py
Last active Jan 12, 2016
Epidemic Week Calculator
from datetime import date
from datetime import timedelta
import copy
# ported from npmjs epi-week package
# https://github.com/wombleton/epi-week
#
#getFirstWeek = (year) ->
# end = new Date(year, 0, 1)
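The quoted CoffeeScript computes where the first epidemiological week of a year starts. A hedged Python sketch of that calculation, following the usual MMWR/epi-week convention (week 1 is the Sunday-to-Saturday week containing at least four days of January); the function name is illustrative, not from the gist:

def first_epi_week_start(year):
    jan1 = date(year, 1, 1)
    # date.weekday(): Monday=0 .. Sunday=6; step back to the preceding Sunday
    start = jan1 - timedelta(days=(jan1.weekday() + 1) % 7)
    if (jan1 - start).days >= 4:
        # fewer than four January days fall in that week, so week 1 starts a week later
        start += timedelta(days=7)
    return start

# first_epi_week_start(2016) -> date(2016, 1, 3)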
kagesenshi / bharian_scraper.py
Created Jul 9, 2015
Berita Harian Headline Scraper
import scrapy
import argh
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
USER_AGENTS = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
kagesenshi / facebook-crawler.py
Last active May 12, 2017
Facebook GraphAPI Crawler
import facebook
import argh
import requests
from ConfigParser import ConfigParser
from pprint import pprint
import time
import json
import logging
import traceback
logging.basicConfig(level=logging.INFO)
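The preview stops at the imports. A sketch of the crawling loop such a script would typically use with the facebook-sdk client, assumed rather than taken from the gist; the access token and page id would come from the ConfigParser config:

def crawl_posts(access_token, page_id):
    graph = facebook.GraphAPI(access_token=access_token)
    result = graph.get_connections(page_id, 'posts')
    while True:
        for post in result.get('data', []):
            yield post
        # follow the paging cursor until the Graph API returns no 'next' URL
        next_url = result.get('paging', {}).get('next')
        if not next_url:
            break
        result = requests.get(next_url).json()
        time.sleep(1)  # stay well under Graph API rate limits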