Skip to content

Instantly share code, notes, and snippets.

View bepcyc's full-sized avatar
🙃
Sparkling

Viacheslav Rodionov bepcyc

🙃
Sparkling
  • Qualcomm
  • Germany
View GitHub Profile
@bepcyc
bepcyc / extract_table.py
Created December 27, 2015 13:52
A script to OCR a table from a PDF in Python. Source found here: http://craiget.com/blog/extracting-table-data-from-pdfs-with-ocr/
# Python 2-era PIL imports; under modern Pillow this would be
# `from PIL import Image, ImageOps` — TODO confirm target environment.
import Image, ImageOps
import subprocess, sys, os, glob
# minimum run of adjacent pixels to call something a line
H_THRESH = 300  # horizontal run length (pixels)
V_THRESH = 300  # vertical run length (pixels)
def get_hlines(pix, w, h):
"""Get start/end pixels of lines containing horizontal runs of at least THRESH black pix"""
hlines = []
package com.avira.ds.sparser.spark
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
import org.apache.spark.{SparkContext, SparkConf}
import scala.language.implicitConversions
// Closed ADT of event types: `sealed` restricts subclasses to this file,
// so pattern matches on Event can be checked for exhaustiveness.
sealed trait Event
// Placeholder payload field (`blaBla`) — real fields presumably replace it.
case class ClickEvent(blaBla: String) extends Event
case class ViewEvent(blaBla: String) extends Event
@bepcyc
bepcyc / Vagrantfile
Created June 28, 2015 18:42
edX CS 100.1X. Vagrant on steroids.
# -*- mode: ruby -*-
# vi: set ft=ruby :

# IPython port to forward (must also be set in the IPython notebook config).
# NOTE(review): no config.vm.network line is visible in this chunk; the
# forwarding rule presumably lives elsewhere or was lost in the paste.
ipythonPort = 8001

# The original paste contained a second, nested `Vagrant.configure(2)` block
# defining "myvm", with mismatched `end`s — invalid Ruby. Merged here into a
# single configure block: the provider tweaks apply to the one VM, "sparkvm".
Vagrant.configure(2) do |config|
  config.ssh.insert_key = true
  config.vm.define "sparkvm" do |master|
    master.vm.box = "sparkmooc/base"
    master.vm.box_download_insecure = true
    # set 8 cores and 6GB of RAM
    master.vm.provider :virtualbox do |v|
      v.customize ["modifyvm", :id, "--ioapic", "on"] # this one is important for setting cores
      v.customize ["modifyvm", :id, "--cpus", 8]
      v.customize ["modifyvm", :id, "--memory", 6144]
    end
  end
end
# Insert somewhere in a function that works with `sc` directly.
# Order matters: the running context must be stopped before the system
# properties take effect, and the new context is created last.
sc.stop()
from pyspark import SparkContext
SparkContext.setSystemProperty('spark.executor.memory', '6g') # not sure which one works, so set both
SparkContext.setSystemProperty('spark.python.worker.memory', '6g') # not sure which one works, so set both
SparkContext.setSystemProperty('spark.shuffle.spill', 'false')
SparkContext.setSystemProperty('spark.driver.memory', '2g')
SparkContext.setSystemProperty('spark.io.compression.codec', 'snappy') # just to be sure
sc = SparkContext("local[8]", "Simple App") # set to your number of cores
@bepcyc
bepcyc / clean_sbt_mvn.sh
Last active March 15, 2016 17:12
Clean all mvn and sbt projects
# Run `mvn clean` / `sbt clean` in every project directory under the current
# tree. Quoting "$0" (and using a single `sh -c` per file) makes this safe
# for paths containing spaces, which the original unquoted `cd $0` was not.
# maven
find . -name pom.xml -type f | xargs -L1 sh -c 'cd "$(dirname "$0")" && mvn clean'
# sbt
find . -name build.sbt -type f | xargs -L1 sh -c 'cd "$(dirname "$0")" && sbt clean'
# Hint: put 'git pull' as a last command and you will get all your repos updated
# add these lines to .bashrc or to the other start script
# Cloudera Search MapReduce job jar shipped with the CDH parcel.
export SEARCH_MR_JOB_JAR="/opt/cloudera/parcels/CDH/lib/solr/contrib/mr/search-mr-job.jar"
# HdfsFindTool provides a GNU-find-like interface over HDFS paths.
alias dfsFind="hadoop jar ${SEARCH_MR_JOB_JAR} org.apache.solr.hadoop.HdfsFindTool"
#alias MapReduceIndexerTool="hadoop jar ${SEARCH_MR_JOB_JAR} org.apache.solr.hadoop.MapReduceIndexerTool"
# use it like regular find:
# dfsFind / -name "*.snappy" | grep flume
# List the hostnames of all YARN nodes: take the 3rd tab-separated field,
# drop the header line, keep only host:port entries, then strip the port.
yarn node -list -all 2>>/dev/null|cut -f3|grep -v "Total Nodes"|grep -P "\:\d{2,}$"|cut -d':' -f1
# Distribute a local file to every node by staging it in HDFS and pulling it
# with pdsh. Assumes ${tmp_dir} and ${dest} are set by the caller — TODO confirm.
hadoop fs -mkdir /tmp/${tmp_dir}
hadoop fs -put ${dest} /tmp/${tmp_dir}/
pdsh hadoop fs -get /tmp/${tmp_dir}/${dest}
@bepcyc
bepcyc / jpgtoh264.sh
Created February 14, 2014 08:58
convert jpg files into h.264 video
# You'll need the mencoder and x264 packages installed.
# Usage: append an output video name, e.g. `jpgtoh264 output.mp4`.
# Reads all *.jpg in the current directory at 30 fps into an MP4 (x264,
# single pass, bitrate 2000 / crf 24).
alias jpgtoh264="mencoder mf://*.jpg -nosound -of lavf -lavfopts format=mp4 -ovc x264 -x264encopts pass=1:bitrate=2000:crf=24 -mf type=jpg:fps=30 -o"
@bepcyc
bepcyc / wc_hdfs
Last active December 25, 2015 15:29 — forked from abicky/wc_hdfs
wc for Hadoop HDFS files
#!/bin/bash
# wc-like counting for files stored in HDFS (forked from abicky/wc_hdfs).
# NOTE(review): the rest of the script lies beyond this chunk.
#set correct path
HADOOP_HOME="/usr/lib/hadoop"
condition=""    # filter expression, presumably filled by option parsing below — TODO confirm
fs="\t"         # output field separator
words=""        # flags for which counts to produce; empty means "off"
lines=""
chars=""