Netanel Malka netanel246

## geojson-conversion.sh
# Bulk convert shapefiles to geojson using ogr2ogr
# For more information, see http://ben.balter.com/2013/06/26/how-to-convert-shapefiles-to-geojson-for-use-on-github/

# Note: Assumes you're in a folder with one or more zip files containing shapefiles,
# and outputs GeoJSON with the crs:84 SRS (for use on GitHub or elsewhere)

#geojson conversion
#######################################
# Convert a single shapefile to GeoJSON with ogr2ogr.
# Arguments:
#   $1 - path of the shapefile without its ".shp" extension; the result is
#        written to "$1.geojson".
# Outputs:
#   A usage message on stderr when $1 is missing or empty.
# Returns:
#   2 when no base name is given, otherwise the exit status of ogr2ogr.
#######################################
function shp2geojson() {
  # Without this guard an empty argument would make ogr2ogr operate on the
  # literal names ".geojson" and ".shp".
  if [[ -z "${1:-}" ]]; then
    printf 'usage: shp2geojson <basename-without-.shp>\n' >&2
    return 2
  fi
  # -t_srs crs:84 requests the crs:84 SRS for the output.
  ogr2ogr -f GeoJSON -t_srs crs:84 "$1.geojson" "$1.shp"
}

## spark_notes.md

      
              2 files
            
          
              0 forks
            
          
              0 comments
            
          
              2 stars
            
          
                juanignaciosl
                / spark_notes.md
            
            
              Last active
              February 15, 2022 07:09
            
              
                Spark notes
              
          
    Spark notes

Pieces

Mostly taken from [3]
The RDD is how Spark simplifies complex operations like join or groupBy and hides the fact that under the hood, you’re dealing with fragmented data.
The number of partitions is important because each task in a Spark stage operates on one partition at a time (and loads the data in that partition into memory). Consequently, if you have fewer partitions than available cores, you will wind up under-utilizing your cluster. Furthermore, since with fewer partitions there’s more data in each partition, you increase the memory pressure on your program. On the flip side, with too many partitions, your performance may degrade as you take a greater hit from network and disk I/O.

  
## get_job_status.sh
# Fetch the status of submission driver-20151008145126-0000 from the
# /v1/submissions/status endpoint on spark-cluster-ip:6066.
# NOTE(review): host name and driver id look like example values — replace as needed.
curl http://spark-cluster-ip:6066/v1/submissions/status/driver-20151008145126-0000

## pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.spnotes.spark</groupId>
    <artifactId>HelloSparkStreaming</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>

## mic_client.py
#!/usr/bin/env python

import pyaudio
import socket
import sys

# Audio settings for the microphone client.
# NOTE(review): the code that consumes these constants is not visible here;
# the meanings below are assumptions to confirm against the stream setup.
FORMAT = pyaudio.paInt16  # sample format constant taken from pyaudio
CHANNELS = 1  # presumably the number of audio channels (mono) — confirm
RATE = 44100  # presumably the sample rate in Hz — confirm
CHUNK = 4096  # presumably the number of frames handled per read — confirm

## self-signed-certificate-with-custom-ca.md

      
              1 file
            
          
              424 forks
            
          
              49 comments
            
          
              1355 stars
            
          
                fntlnz
                / self-signed-certificate-with-custom-ca.md
            
            
              Last active
              May 26, 2024 06:07
            
              
                Self Signed Certificate with Custom Root CA
              
          
    Create Root CA (Done once)

Create Root Key

Attention: this is the key used to sign the certificate requests; anyone holding it can sign certificates on your behalf, so keep it in a safe place!
openssl genrsa -des3 -out rootCA.key 4096

  
## hide_single_cell.py
from IPython.display import HTML
from IPython.display import display

# Taken from https://stackoverflow.com/questions/31517194/how-to-hide-one-specific-cell-input-or-output-in-ipython-notebook
tag = HTML('''<script>
code_show=true;
function code_toggle() {
    if (code_show){
        $('div.cell.code_cell.rendered.selected div.input').hide();
    } else {

## Dockerfile
# Image that runs app-assembly.jar through run_jar.sh on the openjdk:8-jre-alpine base.
FROM openjdk:8-jre-alpine

# Create the application directory and make it the working directory.
RUN mkdir -p /opt/app
WORKDIR /opt/app

# Copy the launcher script and the jar into the working directory (/opt/app).
COPY ./run_jar.sh ./app-assembly.jar ./

# Exec-form entrypoint: the launcher script is started directly, without a shell wrapper.
# NOTE(review): run_jar.sh is not visible here; it must be executable in the build context.
ENTRYPOINT ["./run_jar.sh"]

## main.scala
// reference: https://stackoverflow.com/questions/36795680/copy-schema-from-one-dataframe-to-another-dataframe?rq=1

// Row type for the sample data: Name and Age plus two string columns
// (Dummy, Timestamp) that the schema below does not include.
case class Person(Dummy: String, Name: String, Timestamp: String, Age: Int)

// Two-row sample DataFrame built from Person instances.
// NOTE(review): toDF() needs spark.implicits._ in scope — presumably provided by the shell/notebook; confirm.
val personDF = spark.sparkContext.parallelize(Seq(Person("dummy", "Ray", "12345", 23), Person("dummy", "John", "12345", 44))).toDF()

// Schema holding only the Name and Age columns, both declared nullable.
val personSchema = StructType(
    Seq(StructField("Name", StringType, true),
        StructField("Age", IntegerType, true)))
// Starts as an empty RDD[Row]; declared as var, presumably so it can be reassigned later — confirm.
var dataRDD = spark.sparkContext.emptyRDD[Row]

## Zeppelin-Dockerfile
# Extends apache/zeppelin:0.8.0 with the spark-2.3.1-bin-hadoop2.7 distribution at /spark-2.3.1-bin-hadoop2.7.
FROM apache/zeppelin:0.8.0

# Workaround to "fix" https://issues.apache.org/jira/browse/ZEPPELIN-3586

# Download the Spark tarball to /tmp, unpack it in the current working directory,
# delete the tarball, then move the unpacked directory to the filesystem root.
# NOTE(review): $LOG_TAG is not defined in this file — presumably set by the base image; confirm.
# NOTE(review): the download uses plain HTTP and the archive is not checksum-verified.
RUN echo "$LOG_TAG Download Spark binary" && \
    wget -O /tmp/spark-2.3.1-bin-hadoop2.7.tgz http://apache.panu.it/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz && \
    tar -zxvf /tmp/spark-2.3.1-bin-hadoop2.7.tgz && \
    rm -rf /tmp/spark-2.3.1-bin-hadoop2.7.tgz && \
    mv spark-2.3.1-bin-hadoop2.7 /spark-2.3.1-bin-hadoop2.7
	# Bulk convert shapefiles to geojson using ogr2ogr
	# For more information, see http://ben.balter.com/2013/06/26/how-to-convert-shapefiles-to-geojson-for-use-on-github/

	# Note: Assumes you're in a folder with one or more zip files containing shapefiles,
	# and outputs GeoJSON with the crs:84 SRS (for use on GitHub or elsewhere)

	#geojson conversion
	#######################################
	# Convert a single shapefile to GeoJSON with ogr2ogr.
	# Arguments:
	#   $1 - path of the shapefile without its ".shp" extension; the result is
	#        written to "$1.geojson".
	# Outputs:
	#   A usage message on stderr when $1 is missing or empty.
	# Returns:
	#   2 when no base name is given, otherwise the exit status of ogr2ogr.
	#######################################
	function shp2geojson() {
	  # Without this guard an empty argument would make ogr2ogr operate on the
	  # literal names ".geojson" and ".shp".
	  if [[ -z "${1:-}" ]]; then
	    printf 'usage: shp2geojson <basename-without-.shp>\n' >&2
	    return 2
	  fi
	  # -t_srs crs:84 requests the crs:84 SRS for the output.
	  ogr2ogr -f GeoJSON -t_srs crs:84 "$1.geojson" "$1.shp"
	}
	<?xml version="1.0" encoding="UTF-8"?>
	<project xmlns="http://maven.apache.org/POM/4.0.0"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<groupId>com.spnotes.spark</groupId>
	<artifactId>HelloSparkStreaming</artifactId>
	<version>1.0-SNAPSHOT</version>

	<properties>
	#!/usr/bin/env python

	import pyaudio
	import socket
	import sys

	# Audio settings for the microphone client.
	# NOTE(review): the code that consumes these constants is not visible here;
	# the meanings below are assumptions to confirm against the stream setup.
	FORMAT = pyaudio.paInt16  # sample format constant taken from pyaudio
	CHANNELS = 1  # presumably the number of audio channels (mono) — confirm
	RATE = 44100  # presumably the sample rate in Hz — confirm
	CHUNK = 4096  # presumably the number of frames handled per read — confirm
	from IPython.display import HTML
	from IPython.display import display

	# Taken from https://stackoverflow.com/questions/31517194/how-to-hide-one-specific-cell-input-or-output-in-ipython-notebook
	tag = HTML('''<script>
	code_show=true;
	function code_toggle() {
	if (code_show){
	$('div.cell.code_cell.rendered.selected div.input').hide();
	} else {
	# Image that runs app-assembly.jar through run_jar.sh on the openjdk:8-jre-alpine base.
	FROM openjdk:8-jre-alpine

	# Create the application directory and make it the working directory.
	RUN mkdir -p /opt/app
	WORKDIR /opt/app

	# Copy the launcher script and the jar into the working directory (/opt/app).
	COPY ./run_jar.sh ./app-assembly.jar ./

	# Exec-form entrypoint: the launcher script is started directly, without a shell wrapper.
	# NOTE(review): run_jar.sh is not visible here; it must be executable in the build context.
	ENTRYPOINT ["./run_jar.sh"]
	// reference: https://stackoverflow.com/questions/36795680/copy-schema-from-one-dataframe-to-another-dataframe?rq=1

	// Row type for the sample data: Name and Age plus two string columns
	// (Dummy, Timestamp) that the schema below does not include.
	case class Person(Dummy: String, Name: String, Timestamp: String, Age: Int)

	// Two-row sample DataFrame built from Person instances.
	// NOTE(review): toDF() needs spark.implicits._ in scope — presumably provided by the shell/notebook; confirm.
	val personDF = spark.sparkContext.parallelize(Seq(Person("dummy", "Ray", "12345", 23), Person("dummy", "John", "12345", 44))).toDF()

	// Schema holding only the Name and Age columns, both declared nullable.
	val personSchema = StructType(
	Seq(StructField("Name", StringType, true),
	StructField("Age", IntegerType, true)))
	// Starts as an empty RDD[Row]; declared as var, presumably so it can be reassigned later — confirm.
	var dataRDD = spark.sparkContext.emptyRDD[Row]
	# Extends apache/zeppelin:0.8.0 with the spark-2.3.1-bin-hadoop2.7 distribution at /spark-2.3.1-bin-hadoop2.7.
	FROM apache/zeppelin:0.8.0

	# Workaround to "fix" https://issues.apache.org/jira/browse/ZEPPELIN-3586

	# Download the Spark tarball to /tmp, unpack it in the current working directory,
	# delete the tarball, then move the unpacked directory to the filesystem root.
	# NOTE(review): $LOG_TAG is not defined in this file — presumably set by the base image; confirm.
	# NOTE(review): the download uses plain HTTP and the archive is not checksum-verified.
	RUN echo "$LOG_TAG Download Spark binary" && \
	wget -O /tmp/spark-2.3.1-bin-hadoop2.7.tgz http://apache.panu.it/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz && \
	tar -zxvf /tmp/spark-2.3.1-bin-hadoop2.7.tgz && \
	rm -rf /tmp/spark-2.3.1-bin-hadoop2.7.tgz && \
	mv spark-2.3.1-bin-hadoop2.7 /spark-2.3.1-bin-hadoop2.7