Kipchumba Bett (corneliouzbett)
@corneliouzbett
corneliouzbett / wordCount.py
Created March 15, 2019 14:00
A simple Python script that uses Apache Spark to count the words in a text file
from pyspark import SparkContext, SparkConf

def display_words(words):
    for word, count in words.items():
        print("{} : {}".format(word, count))

if __name__ == "__main__":
    conf = SparkConf().setAppName("word count").setMaster("local[2]")
    sc = SparkContext(conf=conf)
    # Read the input file (placeholder path), split each line into words, and count occurrences
    lines = sc.textFile("input.txt")
    word_counts = lines.flatMap(lambda line: line.split(" ")).countByValue()
    display_words(word_counts)
# A separate snippet: taking the first few elements of an RDD
from pyspark import SparkContext, SparkConf

if __name__ == "__main__":
    conf = SparkConf().setAppName("take").setMaster("local[*]")
    sc = SparkContext(conf=conf)
    inputWords = ["spark", "hadoop", "spark", "hive", "pig", "cassandra", "hadoop"]
    wordRdd = sc.parallelize(inputWords)
    # take(3) returns the first three elements of the RDD as a Python list
    words = wordRdd.take(3)
    print(words)
@corneliouzbett
corneliouzbett / SparkSession.py
Created March 16, 2019 19:07
Creating a Spark session in Python
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "") \
    .getOrCreate()
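As a quick check (a hedged sketch, not part of the original gist), the session can build a small DataFrame directly from local data:

# Example data and column names are made up for illustration
df = spark.createDataFrame([(1, "spark"), (2, "hadoop")], ["id", "name"])
df.show()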
@corneliouzbett
corneliouzbett / datasets.java
Created March 16, 2019 19:09
Creating Datasets in Java
import java.util.Arrays;
import java.util.Collections;
import java.io.Serializable;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

SparkSession spark = SparkSession.builder().appName("datasets").getOrCreate();
// Create a Dataset from a Java list using a built-in encoder, then square each element
Dataset<Integer> primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), Encoders.INT());
Dataset<Integer> squaredDS = primitiveDS.map((MapFunction<Integer, Integer>) x -> x * x, Encoders.INT());
squaredDS.show();
# A separate Python snippet: inferring a DataFrame schema from Row objects
from pyspark.sql import Row

sc = spark.sparkContext

# Load a text file and convert each line to a Row.
lines = sc.textFile("examples/src/main/resources/people.txt")
parts = lines.map(lambda l: l.split(","))
people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))

# Infer the schema, and register the DataFrame as a table.
schemaPeople = spark.createDataFrame(people)
schemaPeople.createOrReplaceTempView("people")
@corneliouzbett
corneliouzbett / JDBCConnector.py
Created March 17, 2019 06:27
JDBC to other database sources
# Load a DataFrame from a JDBC source
jdbcDF = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql:dbserver") \
    .option("dbtable", "schema.tablename") \
    .option("user", "username") \
    .option("password", "password") \
    .load()
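For completeness, a hedged sketch of the reverse direction: the same connection options can be used to write a DataFrame back over JDBC (URL, table, and credentials are placeholders).

# Write the DataFrame back to a JDBC table (placeholder URL, table, and credentials)
jdbcDF.write \
    .format("jdbc") \
    .option("url", "jdbc:postgresql:dbserver") \
    .option("dbtable", "schema.tablename") \
    .option("user", "username") \
    .option("password", "password") \
    .save()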
# A separate snippet: loading a JSON dataset into a DataFrame
sc = spark.sparkContext

# A JSON dataset is pointed to by path.
# The path can be either a single text file or a directory storing text files.
path = "examples/src/main/resources/people.json"
peopleDF = spark.read.json(path)

# The inferred schema can be visualized using the printSchema() method
peopleDF.printSchema()
# root
#  |-- age: long (nullable = true)
#  |-- name: string (nullable = true)
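A hedged follow-up sketch (assuming the standard people.json shipped with the Spark examples): the DataFrame can be registered as a temporary view and queried with SQL.

# Register the DataFrame as a temporary view and query it with SQL
peopleDF.createOrReplaceTempView("people")
teenagerNamesDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19")
teenagerNamesDF.show()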
# A lambda function can take any number of arguments, but can only have one expression
x = lambda a, b, c: (a + b) * c
print(x(1, 2, 3))
# output = 9
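Lambdas are most useful when passed inline to higher-order functions, as in the Spark snippets above; a small illustrative sketch:

# Sort a list of words by length, using a lambda as the key function
words = ["spark", "pig", "cassandra", "hive"]
print(sorted(words, key=lambda w: len(w)))  # ['pig', 'hive', 'spark', 'cassandra']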
# A RegEx, or Regular Expression, is a sequence of characters that forms a search pattern.
# RegEx can be used to check if a string contains the specified search pattern.
import re

text = "The above code is for dummies like you"
# Check if the string starts with "The" and ends with "you":
x = re.search("^The.*you$", text)
print(x is not None)  # True, the pattern matches
# Get the current date and time, then access individual components
import datetime

dt = datetime.datetime.now()
print(dt)        # full timestamp
print(dt.year)   # year component
print(dt.month)  # month component
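A small follow-up sketch: strftime formats the same datetime object as a string using standard format codes.

# Format the current datetime as a string
print(dt.strftime("%Y-%m-%d %H:%M:%S"))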