Skip to content

Instantly share code, notes, and snippets.

View kagesenshi's full-sized avatar

Izhar Firdaus kagesenshi

View GitHub Profile
@kagesenshi
kagesenshi / create_root.sh
Created February 7, 2012 01:51
Fedora 16 LXC script
#!/bin/bash
# create_root.sh — prepare a root directory for a Fedora 16 LXC container.
#
# Usage: create_root.sh <directory> <hostname>
# Exits 1 with a usage message unless exactly two arguments are given.

# -ne is the arithmetic comparison for argument counts (original used the
# string test !=, which happens to work but is not the conventional form).
# Usage errors belong on stderr, not stdout.
if [ "$#" -ne 2 ]; then
    echo "usage: $0 <directory> <hostname>" >&2
    exit 1
fi

# Create the target directory (and any missing parents) if it does not exist.
# Quoting "$1" keeps paths containing spaces or glob characters intact;
# "--" guards against directory names that start with a dash.
if [ ! -d "$1" ]; then
    mkdir -p -- "$1"
fi
@kagesenshi
kagesenshi / fedora-contributor-emails.py
Created March 7, 2012 16:36
Query for email addresses of Fedora contributors by country
#!/usr/bin/python
from fedora.client import AccountSystem
from getpass import getpass
import sys
# Guard: require exactly one positional argument (the country code);
# otherwise print a usage line and exit with status 1.
# NOTE(review): this is a Python 2 script (print statement); the usage
# message goes to stdout, and sys.argv[0] is the invoked program name.
if len(sys.argv) != 2:
    print "Usage: %s <country code>" % (sys.argv[0])
    sys.exit(1)
@kagesenshi
kagesenshi / tranalyzer.py
Created September 7, 2012 04:18
ZODB Tranalyzer (tranalyzer.py)
#!/usr/bin/env python
#
# $Id: tranalyzer.py,v 1.5 1999/10/22 14:23:55 tsarna Exp tsarna $
#
# Copyright (c) 1999 Tyler C. Sarna
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:

Most active GitHub users in Malaysia

The count of contributions (summary of Pull Requests, opened issues and commits) to public repos at GitHub.com from Wed, 14 Nov 2012 03:35:10 GMT till Thu, 14 Nov 2013 03:35:10 GMT.

Only first 1000 GitHub users according to the count of followers are taken. This is because of limitations of GitHub search. Sorting algo in pseudocode:

githubUsers
 .filter((user) -> user.followers > 0)
@kagesenshi
kagesenshi / facebook-crawler.py
Last active May 12, 2017 14:26
Facebook GraphAPI Crawler
import facebook
import argh
import requests
from ConfigParser import ConfigParser
from pprint import pprint
import time
import json
import logging
import traceback
logging.basicConfig(level=logging.INFO)
@kagesenshi
kagesenshi / bharian_scraper.py
Created July 9, 2015 03:54
Berita Harian Headline Scraper
import scrapy
import argh
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
USER_AGENTS = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
@kagesenshi
kagesenshi / epiweek.py
Last active January 12, 2016 02:42
Epidemic Week Calculator
from datetime import date
from datetime import timedelta
import copy
# ported from npmjs epi-week package
# https://github.com/wombleton/epi-week
#
#getFirstWeek = (year) ->
# end = new Date(year, 0, 1)
import urllib
import json
import re
from dateutil.parser import parse as parse_date
from datetime import datetime
# Fetch the APIMS (Malaysian DOE air-quality) v2 page as raw HTML text.
# Python 2 urllib: the file-like handle returned by urlopen is read in full
# and never explicitly closed — NOTE(review): consider contextlib.closing;
# the endpoint is plain HTTP.
f = urllib.urlopen("http://apims.doe.gov.my/v2/").read()
# Accumulator list — presumably filled by parsing stages later in the
# script (not visible in this preview).
stage1 = []
from datetime import datetime, timedelta
import urllib
import re
from lxml.html import fromstring
from cssselect import GenericTranslator, SelectorError
import os
import json
base_url = 'http://apims.doe.gov.my/v2/'
HOURS = {
@kagesenshi
kagesenshi / pyspark_csvrdd_to_rowrdd.py
Last active November 27, 2015 08:18
PySpark CSV to DataFrame
def csvRDD_to_rowRDD(rdd):
#expect a RDD that stores csv
# eg: rdd = sc.textFile('myfile.csv')
from pyspark.sql import Row
rdd = rdd.zipWithIndex()
fail_key = 'X_IMPORT_FAIL'
def extract_row(keys):