Dmitry Petrov (dmpetrov)

dmpetrov / datasets.sh
Last active April 2, 2020 17:04
DVC storage proposal #1487
### BASIC SCENARIO ###
# Create dataset
# Assign dataset name `car-images`, a version, and a version comment (not Git)
$ tar zxf images.tgz
$ du -sh images/
8.1G images
$ dvc dataset add images/ car-images 1.0.0 -m "Import car images"
Dataset car-images@1.0.0 was added
dmpetrov / reddit-dataset-spark.scala
Created March 6, 2017 05:31
Read reddit dataset to Spark
// Code for blog post:
// https://fullstackml.com/2015/11/24/where-to-find-terabyte-size-dataset-for-machine-learning/
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
val fileName = "reddit-May2015.tsv"
val textFile = sc.textFile(fileName)
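The preview stops at the raw text read. A minimal PySpark sketch of the same load as a DataFrame (an illustration, not part of the gist; it assumes a SparkSession and that the TSV carries a header row):

from pyspark.sql import SparkSession

# Sketch only: adjust the options to the actual layout of the reddit dump.
spark = SparkSession.builder.appName("reddit-tsv").getOrCreate()
reddit = (spark.read
          .option("sep", "\t")       # tab-separated file, as in reddit-May2015.tsv
          .option("header", "true")  # assumption: first row holds column names
          .csv("reddit-May2015.tsv"))
reddit.printSchema()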
dmpetrov / datascience_memory_step2.R
Created March 6, 2017 05:14
How Much Memory Does A Data Scientist Need (step2)
data.slice.reg <- data.slice.cum_freq %>%
filter(log10(sizeGB) >= -2) %>%
filter(log10(sizeGB) <= 4)
ggplot(data.slice.reg, aes(x=log10(sizeGB), y=cum_freq, color=factor(year))) +
geom_line(aes(group = factor(year)))
attach(data.slice.reg)
model <- lm(log10(sizeGB) ~ cum_freq + year, na.action=na.exclude)
summary(model)
dmpetrov / datascience_memory_base.R
Created March 6, 2017 05:12
How Much Memory Does A Data Scientist Need (base)
# Code from blogpost:
# https://fullstackml.com/2015/12/06/how-much-memory-does-a-data-scientist-need/
library(ggplot2)
library(dplyr)
file <- "dataset-sizes.cv"
data <- read.csv(file, sep="\t")
data.slice <- data %>%
dmpetrov / save_dataframe_in_single_csv.scala
Created March 6, 2017 05:07
Save Spark dataframe to a single CSV file
// Code for blogpost:
// https://fullstackml.com/2015/12/21/how-to-export-data-frame-from-apache-spark/
def saveDfToCsv(df: DataFrame, tsvOutput: String,
sep: String = ",", header: Boolean = false): Unit = {
val tmpParquetDir = "Posts.tmp.parquet"
df.repartition(1).write.
format("com.databricks.spark.csv").
option("header", header.toString).
dmpetrov / bootstrap_spark.scala
Created March 6, 2017 04:49
Check hypotheses with bootstrap using Spark
// Code for blogpost https://fullstackml.com/2016/01/19/how-to-check-hypotheses-with-bootstrap-and-apache-spark/
import scala.util.Sorting.quickSort
def getConfInterval(input: org.apache.spark.rdd.RDD[Double], N: Int, left: Double, right:Double)
: (Double, Double) = {
// Simulate by sampling and calculating averages for each of subsamples
val hist = Array.fill(N){0.0}
for (i <- 0 to N-1) {
hist(i) = input.sample(withReplacement = true, fraction = 1.0).mean
}
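The same percentile-bootstrap idea, sketched in plain NumPy for readers without a Spark cluster (names and data are illustrative, not from the gist): resample with replacement, record each sample's mean, and take empirical percentiles as the interval bounds.

import numpy as np

def boot_conf_interval(values, n_iter=1000, left=2.5, right=97.5, seed=0):
    # Percentile bootstrap confidence interval for the mean of `values`.
    rng = np.random.default_rng(seed)
    values = np.asarray(values, dtype=float)
    means = np.empty(n_iter)
    for i in range(n_iter):
        sample = rng.choice(values, size=values.size, replace=True)
        means[i] = sample.mean()
    return np.percentile(means, left), np.percentile(means, right)

low, high = boot_conf_interval(np.random.normal(10, 2, size=500))
print(low, high)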
dmpetrov / imagehash_whash_barbara.py
Created March 6, 2017 04:43
Wavelet imagehash for Barbara image
# Code for the blogpost https://fullstackml.com/2016/07/02/wavelet-image-hash-in-python/
barb = PIL.Image.open('barbara.jpg')
w_b = imagehash.whash(barb)
h_b = imagehash.phash(barb)
a_b = imagehash.average_hash(barb)
# `a` is the average hash of the reference (Lenna) image from earlier in the post
(a - a_b)/len(a.hash)**2
# > 0.5
dmpetrov / imagehash_whash.py
Created March 6, 2017 04:39
Wavelet imagehash for Lenna image
# Code for the blogpost https://fullstackml.com/2016/07/02/wavelet-image-hash-in-python/
import PIL
from PIL import Image
import imagehash
w = imagehash.whash(PIL.Image.open('lenna.png'))
w1 = imagehash.whash(PIL.Image.open('lenna1.jpg'))
w2 = imagehash.whash(PIL.Image.open('lenna2.jpg'))
(w - w1)/len(w.hash)**2
dmpetrov / imagehash_phash_lenna.py
Created March 6, 2017 04:32
Perceptual image hashes difference for Lenna images
# Code for the blogpost https://fullstackml.com/2016/07/02/wavelet-image-hash-in-python/
import PIL
from PIL import Image
import imagehash
lenna = PIL.Image.open('lenna.png')
lenna1 = PIL.Image.open('lenna1.jpg')
h = imagehash.phash(lenna)
h1 = imagehash.phash(lenna1)
dmpetrov / imagehash_phash.py
Created March 6, 2017 04:26
Perceptual image hashing
# Code for the blogpost https://fullstackml.com/2016/07/02/wavelet-image-hash-in-python/
import PIL
from PIL import Image
import imagehash
hash1 = imagehash.phash(Image.open('test1.jpg'))
print(hash1)
# > 354adab5054af0b7
hash2 = imagehash.phash(Image.open('test2.jpg'))
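The preview cuts off before the two hashes are compared. With imagehash, subtracting two hashes yields their Hamming distance, so the comparison step would look roughly like this (the threshold below is an illustrative assumption):

# Hamming distance between the two perceptual hashes (0 = identical hashes).
distance = hash1 - hash2
print(distance)

# Normalized difference in [0, 1], as in the other snippets above.
print(distance / len(hash1.hash) ** 2)

# Illustrative threshold: small distances usually mean visually similar images.
if distance <= 8:
    print("test1.jpg and test2.jpg look similar")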