

hiropppe / nb.py
Last active December 16, 2015 03:52
# -*- coding: utf-8 -*-
from abc import ABCMeta, abstractmethod
import math
import sys
from collections import defaultdict
class NB:
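The nb.py preview stops at the class header, so the actual implementation is not shown. As a rough sketch of the kind of classifier those imports (math, defaultdict) point at, and not the gist's own code, a multinomial Naive Bayes could be counted and scored like this; MultinomialNBSketch and its method names are made up for illustration:

# Illustrative only: a minimal multinomial Naive Bayes built on defaultdict
# counters; not the implementation from the truncated gist.
from collections import defaultdict
import math


class MultinomialNBSketch:
    def __init__(self, alpha=1.0):
        self.alpha = alpha                      # Laplace smoothing
        self.word_count = defaultdict(lambda: defaultdict(int))
        self.cat_count = defaultdict(int)
        self.vocab = set()

    def train(self, docs):
        # docs: iterable of (category, [token, ...]) pairs
        for cat, tokens in docs:
            self.cat_count[cat] += 1
            for t in tokens:
                self.word_count[cat][t] += 1
                self.vocab.add(t)

    def score(self, cat, tokens):
        # log P(cat) + sum over tokens of log P(token | cat), with smoothing
        total_docs = sum(self.cat_count.values())
        log_prob = math.log(self.cat_count[cat] / float(total_docs))
        denom = sum(self.word_count[cat].values()) + self.alpha * len(self.vocab)
        for t in tokens:
            log_prob += math.log((self.word_count[cat][t] + self.alpha) / denom)
        return log_prob

    def predict(self, tokens):
        # pick the category with the highest log-probability
        return max(self.cat_count, key=lambda c: self.score(c, tokens))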

drop table if exists cat;
drop table if exists item;
drop table if exists item_cat;
create table if not exists cat (
cid int(11) not null,
cpath varchar(10) not null,
cname varchar(100) not null,
primary key(cid)
);
# Pipeline (Spark ML, pyspark.ml)
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
pos_files = sc.wholeTextFiles("hdfs://hdp1.containers.dev:9000/user/root/data/binary_clf/small/1")
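A minimal sketch of how these pyspark.ml pieces fit together, assuming a SQLContext named sqlContext exists next to sc; the toy rows below stand in for whatever (text, label) pairs get built from the pos_files (path, content) RDD:

# Sketch only: toy training data, assuming an existing SQLContext `sqlContext`.
training = sqlContext.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0)
], ["id", "text", "label"])

tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="features")
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Grid search over TF dimensionality and regularization, scored by AUC.
paramGrid = (ParamGridBuilder()
             .addGrid(hashingTF.numFeatures, [1000, 10000])
             .addGrid(lr.regParam, [0.1, 0.01])
             .build())
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=3)
cvModel = cv.fit(training)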
# Example Pipeline
X = ["a b c d e spark", "b d", "spark f g h", "hadoop mapreduce"]
X_rdd = sc.parallelize(X, 2)
y = [1, 0, 1, 0]
y_rdd = sc.parallelize(y, 2)
Z = DictRDD((X_rdd, y_rdd), columns=('X', 'y'), dtype=[np.ndarray, np.ndarray])
dist_pipeline = SparkPipeline((
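The preview cuts off at the SparkPipeline constructor. Below is a self-contained sketch of a typical sparkit-learn pipeline over the same toy data; the chosen steps (hashing vectorizer + multinomial NB) and the splearn.pipeline import path follow the sparkit-learn examples and are assumptions, not the gist's actual configuration.

# Sketch only: distributed fit/predict with sparkit-learn over the toy data above.
import numpy as np
from splearn.rdd import DictRDD
from splearn.pipeline import SparkPipeline
from splearn.feature_extraction.text import SparkHashingVectorizer
from splearn.naive_bayes import SparkMultinomialNB

X = ["a b c d e spark", "b d", "spark f g h", "hadoop mapreduce"]
y = [1, 0, 1, 0]
Z = DictRDD((sc.parallelize(X, 2), sc.parallelize(y, 2)),
            columns=('X', 'y'), dtype=[np.ndarray, np.ndarray])

dist_pipeline = SparkPipeline((
    ('vect', SparkHashingVectorizer()),
    ('clf', SparkMultinomialNB())
))
# classes are passed up front, as in the sparkit-learn examples
dist_pipeline.fit(Z, clf__classes=np.array([0, 1]))
y_pred = dist_pipeline.predict(Z[:, 'X'])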

# RDD-based MLlib workflow: hashed TF-IDF features into SVM / Naive Bayes models
import numpy as np
from pyspark.sql import Row, SQLContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
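A rough sketch of the RDD-based MLlib flow these imports imply, assuming a SparkContext sc; the corpus below is toy data and lambda_/iterations are just example settings:

# Sketch only: (label, tokens) toy corpus
corpus = sc.parallelize([
    (1.0, "a b c d e spark".split()),
    (0.0, "b d".split()),
    (1.0, "spark f g h".split()),
    (0.0, "hadoop mapreduce".split())
])

labels = corpus.map(lambda x: x[0])
tf = HashingTF().transform(corpus.map(lambda x: x[1]))
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

# Pair labels back with features and train the classifiers.
training = labels.zip(tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
nb_model = NaiveBayes.train(training, lambda_=1.0)
svm_model = SVMWithSGD.train(training, iterations=100)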

# sparkit-learn (splearn) / scikit-learn text classification imports; jpjplearn
# provides the corpus loader and the MeCab-based analyzer.
import numpy as np
from splearn.rdd import ArrayRDD
from splearn.rdd import DictRDD
from splearn.feature_extraction.text import SparkCountVectorizer
from splearn.feature_extraction.text import SparkHashingVectorizer
from splearn.feature_extraction.text import SparkTfidfTransformer
from splearn.naive_bayes import SparkMultinomialNB
from splearn.naive_bayes import SparkGaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score
from jpjplearn.datasets import load_clf_corpus
from jpjplearn.analyzer import mecab_analyzer
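A hedged sketch of a local scikit-learn baseline built from these imports; jpjplearn looks like a private helper package, so the return shape of load_clf_corpus (parallel lists of documents and labels) and mecab_analyzer being a callable tokenizer are assumptions:

# Sketch only: local TF-IDF + multinomial NB baseline with cross validation.
docs, labels = load_clf_corpus()   # assumed: parallel lists of texts and labels

local_pipeline = Pipeline([
    ('vect', TfidfVectorizer(analyzer=mecab_analyzer)),
    ('clf', MultinomialNB())
])

scores = cross_val_score(local_pipeline, docs, labels, cv=5)
print(scores.mean())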

public class IdWorker {
    // Snowflake-style 64-bit ID layout: timestamp | datacenter id | worker id | sequence
    long workerBits = 5L;        // worker id fits in 5 bits (0-31)
    long datacenterBits = 5L;    // datacenter id fits in 5 bits (0-31)
    long sequenceBits = 12L;     // up to 4096 IDs per worker per millisecond
    long workerIdShift = sequenceBits;                                    // 12
    long datacenterIdShift = sequenceBits + workerBits;                   // 17
    long timestampLeftShift = sequenceBits + workerBits + datacenterBits; // 22
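These constants follow Twitter's Snowflake layout: a millisecond timestamp in the high bits, then 5 datacenter bits, 5 worker bits, and a 12-bit per-millisecond sequence. A small Python illustration of how the shifts pack one 64-bit ID, with made-up input values:

# Illustration only: compose an ID from its parts using the shifts above.
sequence_bits, worker_bits, datacenter_bits = 12, 5, 5
worker_shift = sequence_bits                                      # 12
datacenter_shift = sequence_bits + worker_bits                    # 17
timestamp_shift = sequence_bits + worker_bits + datacenter_bits   # 22

def make_id(timestamp_ms, datacenter_id, worker_id, sequence, epoch=0):
    # Snowflake subtracts a custom epoch from the timestamp; epoch=0 for brevity.
    return (((timestamp_ms - epoch) << timestamp_shift)
            | (datacenter_id << datacenter_shift)
            | (worker_id << worker_shift)
            | sequence)

print(hex(make_id(1450000000000, 1, 1, 0)))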
hiropppe / zip.sql
Last active October 2, 2015 13:52
A MySQL function that behaves like the zip function: it pairs up the elements of two delimited strings.
CREATE FUNCTION `zip`(_first text, _second text, _separator text, _pair_separator text) RETURNS text CHARSET utf8
BEGIN
DECLARE _ret text;
IF 0 < LENGTH(_first) THEN
SELECT
GROUP_CONCAT(
CONCAT_WS(
_pair_separator,
REPLACE(SUBSTRING_INDEX(v.`first`, _separator, p.rownum), CONCAT(SUBSTRING_INDEX(v.`first`, _separator, p.rownum - 1), _separator), ''),

import pymongo
from bson.dbref import DBRef
from collections import OrderedDict
from random import randrange, choice

# Connect to a local MongoDB instance and get handles to the test collections.
client = pymongo.MongoClient()
test = client.test
spot = test.spot
play = test.play
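A hedged sketch of how these collections might be used together: store a spot document and point a play document back at it through a DBRef. The field names and values are made up for illustration:

# Sketch only: insert a spot, reference it from a play document via DBRef.
spot_id = spot.insert_one({"name": "spot-%d" % randrange(100),
                           "tags": [choice(["park", "cafe", "museum"])]}).inserted_id

play.insert_one(OrderedDict([
    ("spot", DBRef("spot", spot_id)),   # reference back to the spot collection
    ("score", randrange(5))
]))

# Resolve the reference when reading a play document back.
doc = play.find_one()
referenced_spot = test.dereference(doc["spot"])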