Skip to content

Instantly share code, notes, and snippets.

View rohitdholakia's full-sized avatar

Rohit Dholakia rohitdholakia

View GitHub Profile
@rohitdholakia
rohitdholakia / accepted_tags.py
Created December 2, 2013 01:33
Accepted answers for tags
from __future__ import division
from collections import defaultdict
# Count, for every tag, how many questions carrying that tag have an accepted
# answer.  `question_details` must already be defined when this runs; each of
# its values is read through the 'acceptedId' and 'tags' keys.
accepted_answers_for_tags = defaultdict(int)
count = 0  # not read by the loop below; kept for any code that follows
for key in question_details:
    details = question_details[key]
    # A falsy 'acceptedId' means the question contributes nothing.
    if details['acceptedId']:
        for t in details['tags']:
            accepted_answers_for_tags[t] += 1
@rohitdholakia
rohitdholakia / tags_dict.py
Created December 2, 2013 01:25
num questions for tags
from collections import defaultdict
# Count how many questions carry each tag, then print the 20 most frequent
# (tag, count) pairs.  `question_details` must already be defined; each of its
# values is read through the 'tags' key.
tags_dict = defaultdict(int)
for key in question_details:
    for t in question_details[key]['tags']:
        tags_dict[t] += 1
# A parenthesised single-argument print and .items() give the same output on
# Python 2 and Python 3.
print(sorted(tags_dict.items(), key=lambda x: x[1], reverse=True)[:20])
@rohitdholakia
rohitdholakia / question_details.py
Created December 2, 2013 01:21
Storing the details of questions
import os
import Utils
# Starts out empty; presumably filled with per-question details by the parse
# loop below, but the code that fills it is not visible in this snippet.
question_details = {}
# Placeholder path: point this at the directory that holds posts.xml.
top_dir = '/directory/with/your/site/data'
with open(os.path.join(top_dir, 'posts.xml')) as posts:
# NOTE(review): `etree` is used here but is not imported in the visible lines;
# presumably `from lxml import etree` is needed -- confirm.
for event, elem in etree.iterparse(posts):
# Skip every element whose post type id is not "1" (presumably "1" marks a
# question -- verify against Utils.getPostTypeId).
if Utils.getPostTypeId(elem) != "1":
continue
# NOTE(review): indentation was lost in this copy and the snippet appears to
# end mid-loop; the handling of matching elements is not shown.
@rohitdholakia
rohitdholakia / Summary.py
Created December 2, 2013 01:16
Summary of stackexchange site
# Give the top-level XML directory and this script will return:
'''
num users
Num epic users
num famous questions
num questions
num answers
'''
from lxml import etree
@rohitdholakia
rohitdholakia / loading_data_0.py
Last active December 29, 2015 18:49
Script to load all ngrams as keys to redis
import time
import os
import redis
# Placeholder location of the unigram files; replace before running.
path = '/path/to/unigrams'
# Placeholder host; the client targets port 6385, database 0.
client = redis.Redis(host = 'host-ip-here', port = 6385, db = 0)
# transaction = False presumably yields a plain command pipeline without
# MULTI/EXEC wrapping -- confirm against the redis-py documentation.
pipeline = client.pipeline(transaction = False)
# Visit every entry that os.listdir reports for `path`.
for f in os.listdir(path):
print ' starting with file ', f
# Timestamp taken for this file; nothing visible here reads it back.
# NOTE(review): indentation was lost in this copy and the loop body appears to
# be cut off after the next line.
start = time.time()
@rohitdholakia
rohitdholakia / get_data.sh
Created November 30, 2013 00:15
getting data from google books
# Fetch the Google Books English 1-gram shards (version 20120701) for the
# letters a through j, one wget call per shard, in alphabetical order.
base_url=http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701
for letter in a b c d e f g h i j; do
    wget "${base_url}-${letter}.gz"
done

Get Moses

Assuming you want to clone a branch "b" (sticking to RELEASE-1.0 is highly recommended), do the following:

git clone -b RELEASE-1.0 https://github.com/moses-smt/mosesdecoder.git

Set up Boost

@rohitdholakia
rohitdholakia / NumEpic.py
Last active December 17, 2015 13:19
Num Epic users
import sys
from lxml import etree
import Utils
# The XML file to scan is given as the first command-line argument.
with open(sys.argv[1]) as userXml:
# iterparse is consumed below as a stream of (event, element) pairs.
context = etree.iterparse(userXml)
# Running tally, presumably of "Epic" badges -- the code that updates it is
# not visible in this snippet.
countEpic = 0
for event, elem in context:
# Utils.getBadgeName is a project helper not shown here; presumably it returns
# the badge name stored on the element -- confirm.
# NOTE(review): indentation was lost in this copy and the loop body appears to
# be cut off after the next line.
name = Utils.getBadgeName(elem)
@rohitdholakia
rohitdholakia / Ordinal.py
Created May 20, 2013 03:35
Ordinal add up
def hashing(l):
    """Return the sum of the character ordinals of ``l`` modulo 3.

    Args:
        l: An iterable of single characters, typically a string.

    Returns:
        An int in ``{0, 1, 2}``; ``0`` when ``l`` is empty.

    Raises:
        TypeError: If an item of ``l`` is not a single character.
    """
    return sum(ord(letter) for letter in l) % 3
@rohitdholakia
rohitdholakia / outputlogreg.txt
Created May 18, 2013 20:54
Example Output of logit reg
[[ -1.26705603e-04 1.22045489e-01 -1.11196205e-05 0.00000000e+00
-3.09116956e-07 -1.04906557e-05 -3.00129134e-03 -8.41874652e-04
-2.14845603e-01 -8.51142269e-05 1.18718039e-01 2.82917922e-02]]
Error is 0.00478333333333