Skip to content

Instantly share code, notes, and snippets.

View kretes's full-sized avatar

Tomasz Bartczak kretes

View GitHub Profile
import akka.NotUsed
import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import akka.stream.scaladsl.{Flow, Sink, Source}
import org.scalatest.mockito.MockitoSugar
import org.scalatest.{FlatSpec, Matchers}
import scala.collection.immutable.Seq
import scala.concurrent.duration._
import akka.NotUsed
import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import akka.stream.scaladsl.{Flow, Keep, Sink, Source}
import pl.allegro.offer.imagescore.floki.streams.TupledFlow
import scala.concurrent.duration.DurationInt
import scala.concurrent.{Await, ExecutionContext, Future}
// Actor system backing the stream materializer; declared implicit so the
// ActorMaterializer / stream run calls in the rest of the file can pick it up.
// NOTE(review): never terminated in this snippet — acceptable for a throwaway
// test, but leaks threads in anything long-lived.
implicit val as = ActorSystem()
@kretes
kretes / striatum example.py
Created April 26, 2017 15:37
understanding striatum linucb bandit API
from striatum.storage import history
from striatum.storage import model
from striatum.bandit import linucb
from striatum.storage.action import ActionStorage, MemoryActionStorage, Action
# In-memory backends for the bandit's pull history and model parameters.
historystorage = history.MemoryHistoryStorage()
modelstorage = model.MemoryModelStorage()

# Register three candidate actions (ids 1-3) with the action store.
actionstorage = MemoryActionStorage()
actionstorage.add([Action(action_id) for action_id in (1, 2, 3)])
@kretes
kretes / keras-tfcontrib.py
Created July 5, 2017 13:25
plain keras vs tf.contrib.keras model
from tensorflow.contrib.keras import layers
from tensorflow.contrib.keras import models
from tensorflow.contrib.keras import backend as K
import numpy as np
# Toy graph: 10x10 single-channel input -> flatten -> 1-unit sigmoid head.
# Built identically against plain keras and tf.contrib.keras to compare the APIs.
input_shape = (10,10, 1)
input_data = layers.Input(name='the_input', shape=input_shape, dtype='float32')
inner = layers.Flatten()(input_data)
# he_normal init on a single-unit dense layer producing the binary logit.
binary = layers.Dense(1, kernel_initializer='he_normal', name='densebin')(inner)
# Sigmoid squashes the logit to a probability; presumably trained with a
# binary cross-entropy loss further down the file — TODO confirm.
y_pred = layers.Activation('sigmoid', name='output_bin')(binary)
@kretes
kretes / Dockerfile
Created February 26, 2017 18:28
Docker for im2txt on AWS
FROM gcr.io/tensorflow/tensorflow:latest-gpu

# Bug fix: add-apt-repository (from software-properties-common) and curl are
# used below but never installed; the base TF image does not guarantee either,
# so the build would fail at the PPA / Bazel-key steps without this line.
RUN apt-get update && apt-get install -y software-properties-common curl

# Oracle JDK 8 via the webupd8team PPA; pre-accept the license so the
# installer runs non-interactively.
RUN add-apt-repository ppa:webupd8team/java && apt-get update
RUN echo debconf shared/accepted-oracle-license-v1-1 select true | debconf-set-selections && echo debconf shared/accepted-oracle-license-v1-1 seen true | debconf-set-selections
RUN apt-get install -y oracle-java8-installer

# Bazel from the official apt repository (needed to build the model code).
RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && curl https://bazel.build/bazel-release.pub.gpg | apt-key add -
RUN apt-get update && apt-get install -y bazel

# NLTK plus all corpora, installed system-wide for the text preprocessing step.
RUN pip install -U nltk
RUN python -m nltk.downloader -d /usr/local/share/nltk_data all
@kretes
kretes / xgb_segfault_reproduce.py
Last active February 2, 2019 21:52
reproduce for issue 4098 of xgboost
import xgboost as xgb
import pandas as pd
import numpy as np
import random
import os
# Print the xgboost build under test so the repro report is version-tagged.
print(xgb.__version__)
def generate_observations(n_rows, n_cols):
    """Return n_rows lists, each of n_cols random ints drawn uniformly from [0, 100]."""
    def random_row():
        return [random.randint(0, 100) for _ in range(n_cols)]
    return [random_row() for _ in range(n_rows)]
@kretes
kretes / train.libsvm
Last active March 4, 2019 21:29
xgboost_rank_ndcg_vs_rank_pairwise
0 qid:1369666032782981875 0:1.2172619104385376
0 qid:1369666032782981875 0:1.5916666984558105
1 qid:1369666032782981875 0:1.3103448152542114
0 qid:1369666032782981875 0:0.7198443412780762
0 qid:1369666032782981875 0:0.6421052813529968
0 qid:1369666032782981875 0:2.450000047683716
0 qid:1369666032782981875 0:0.3511904776096344
0 qid:1369666032782981875 0:2.110119104385376
0 qid:1369666032782981875 0:1.7380952835083008
0 qid:1369666032782981875 0:1.2692307233810425
@kretes
kretes / csv
Created October 11, 2019 08:57
Repro for spark csv escape issue
a,b
at_the_end\,1
in_\_side,1
"comma,at_the_end\\",1
"comma,in_\\_side",1
@kretes
kretes / dmatrix_from_big_csr.py
Created December 31, 2019 18:42
Reproducing the problem in xgboost - impossible to create a DMatrix from a big sparse matrix
import xgboost as xgb
from scipy.sparse.csr import csr_matrix
import numpy as np
from sklearn.datasets import load_svmlight_file
# Size a matrix whose total element count just exceeds the signed 32-bit
# maximum, to trigger the DMatrix-from-big-CSR failure.
int32_max = np.iinfo(np.int32).max
num_rows = int(int32_max / 1000)
num_cols = 1001
# How many elements past the int32 limit the matrix is.
more_than_int32_count = num_rows * num_cols - int32_max
print(more_than_int32_count)
@kretes
kretes / bpe_hang.py
Created March 4, 2020 18:42
reproduce of tokenizers hang on encode_batch
from multiprocessing import Process
import os
from tokenizers.implementations import ByteLevelBPETokenizer
import tokenizers
# Report which tokenizers build is under test.
print(tokenizers.__version__)

# Baseline: a freshly constructed byte-level BPE tokenizer handles
# encode_batch without hanging.
tok = ByteLevelBPETokenizer()
batch_result = tok.encode_batch(['ala'])
print(batch_result)