Skip to content

Instantly share code, notes, and snippets.

View frank-leap's full-sized avatar

Francisco Lopez frank-leap

  • GSR
  • Spain
View GitHub Profile
@frank-leap
frank-leap / greenplum.sh
Last active August 29, 2015 14:19
Expect script used for Greenplum SNE 4.2.6.1 installer
#!/usr/bin/env bash
set timeout 60
cd /tmp
/usr/bin/expect << EOF
spawn /bin/bash greenplum-db-4.2.6.1-build-1-RHEL5-x86_64.bin
expect {
-ex "--More--" {
send "\n"
@frank-leap
frank-leap / Dockerfile
Last active February 15, 2016 17:33
Dockerfile for Greenplum SNE 4.2.6.1
FROM centos:6.6
MAINTAINER "Francisco Lopez" teraflopx@gmail.com
# update & upgrade packages
# The two commands are chained with `&&` so they run one after the other and the
# build fails if the first one fails. A single `&` would push `yum update` into
# the background, letting both yum processes compete for the yum lock and
# hiding the exit status of the update.
RUN yum update -y && yum upgrade -y
# install extra packages and basic tools (including some required by the Greenplum installer)
RUN yum install -y epel-release git unzip which tar sed wget curl nano expect
# cleanup packages
@frank-leap
frank-leap / ExercisesCh1.sc
Last active August 29, 2015 14:19
s4di - Chapter 1 - Exercises
package chapter1
import math._
import math.BigInt._
import util._
object ExercisesCh1 {
3 //> res0: Int(3) = 3
pow(sqrt(3),2) //> res1: Double = 2.9999999999999996
@frank-leap
frank-leap / git-config.sh
Last active August 29, 2015 14:19
Setup Git on Windows
# Install Git from http://git-scm.com/ and then launch Git Bash
# Generate SSH key pair
# Replace <username> with your own account name. The comment is quoted so the
# shell does not interpret `<` and `>` as input/output redirections.
ssh-keygen -t rsa -C "<username>@gmail.com"
# Turn on SSH agent
# `ssh-agent -s` prints the shell commands that export the agent's environment
# variables; `eval` applies them to the current shell. The agent is started
# only once here: a bare `ssh-agent -s` before this line would leave a second,
# unused agent process running.
eval "$(ssh-agent -s)"
# List SSH keys registered in the SSH agent
@frank-leap
frank-leap / ExercisesCh2.sc
Created April 25, 2015 09:45
s4di - Chapter 2 - Exercises
package chapter2
import math._
object ExercisesCh2 {
/** Returns the sign of `i`: -1 if it is negative, 1 if it is positive, 0 if it is zero. */
def signum(i: Int): Int = i match {
  case n if n > 0 => 1
  case n if n < 0 => -1
  case _          => 0
}
//> signum: (i: Int)Int
signum(5) //> res0: Int = 1
signum(-5) //> res1: Int = -1
@frank-leap
frank-leap / ExercisesCh3.sc
Created April 28, 2015 00:46
s4di - Chapter 3 - Exercises
package chapter3
import util._
import scala.collection.mutable.ArrayBuffer
import java.awt.datatransfer._
import scala.collection.JavaConverters._
import scala.collection.mutable.Buffer
object ExercisesCh3 {
// 1. array of random ints
@frank-leap
frank-leap / JoinDuplicatedLines.scala
Created June 15, 2015 13:27
Reads a CSV file that has a header row and whose first column is the key, then writes a new file without the header in which lines sharing the same key are merged into a single line
import java.io.File
import java.io.PrintWriter
import scala.annotation.migration
import scala.collection.immutable.ListMap
import scala.collection.mutable.Map
object JoinDuplicatedLines {
def main(args: Array[String]) {
val input = io.Source fromFile "input.csv"
@frank-leap
frank-leap / SimpleWordTokenizer.py
Last active August 29, 2015 14:23
Simple word tokenizer that returns a list of non-empty words in lowercase
def simpleWordTokenizer(string):
    """Split the input string into lowercase, non-empty tokens.

    The string is lowercased first and then split with the ``split_regex``
    pattern (defined outside this function); empty fragments produced by the
    split are dropped.

    Args:
        string (str): input string
    Returns:
        list: a list of tokens in lowercase and no empty strings
    """
    lowered = string.lower()
    fragments = re.split(split_regex, lowered)
    # filter(None, ...) keeps only truthy items, i.e. discards empty fragments
    return list(filter(None, fragments))
starWarsDarkSide = 'Only at the end do you realize the power of the Dark Side.'
# copy the Hive configuration file hive-site.xml to the spark configuration folder
# sudo cp /etc/hive/conf.dist/hive-site.xml /usr/lib/spark/conf/
# launch pyspark with the spark-csv package (note: version 1.2.0 has some issues thus better use 1.3.0)
# PYSPARK_DRIVER_PYTHON=ipython pyspark --packages com.databricks:spark-csv_2.10:1.3.0
# check dataframes are working
sqlCtx.createDataFrame([("somekey", 1)])
# load yelp dataset
# load the "orders" table from Hive into a DataFrame
orders_df=sqlCtx.sql("select * from orders")
orders_df.printSchema()
# 1) calculate number of orders in SUSPECTED_FRAUD status
# query strings are executed with sqlCtx.sql(); the SQL context has no select()
# method (select() belongs to DataFrame), so calling it would raise AttributeError
sqlCtx.sql("select count(order_id) from orders where order_status='SUSPECTED_FRAUD'").show(5)
# load the "order_items" table from Hive into a DataFrame
order_items_df=sqlCtx.sql("select * from order_items")
order_items_df.printSchema()