amt
# bootstrap Spark from a plain Python interpreter via findspark
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

sc = pyspark.SparkContext.getOrCreate()
print(sc.version)
spark = SparkSession.builder.appName("vivek").getOrCreate()
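A quick sanity check that the session is alive (spark.range just builds a tiny single-column DataFrame):
df = spark.range(5)
df.show()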
# interactive alternative: launch the PySpark shell against a local master
pyspark --master local
puppeteer on bitnami/ubuntu
apt-get update && apt-get install -yq --no-install-recommends libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 libnss3
const puppeteer = require('puppeteer');

async function run() {
  // executablePath pins the bundled Chromium; drop it to use the default download
  const browser = await puppeteer.launch({ headless: true, executablePath: "/root/node_modules/puppeteer/.local-chromium/linux-555668/chrome-linux/chrome" });
  //const browser = await puppeteer.launch()
  const page = await browser.newPage();
  await page.goto('https://github.com');
  await page.screenshot({ path: 'screenshots/github.png' });
  await browser.close();
}

run();
val table = sparkSession.read
  .format("jdbc")
  .option("url", "jdbc:oracle:thin://@XXX")
  .option("dbtable", "tcga.%s".format(tableName))
  .option("user", "XXX")
  .option("password", "XXX")
  .option("driver", "oracle.jdbc.driver.OracleDriver")
  .option("fetchsize", "50000")   // rows fetched per round trip
  .option("numPartitions", "200") // upper bound on concurrent JDBC partitions
  .load()
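For reference, a minimal PySpark sketch of the same JDBC read (XXX placeholders kept from above; table_name is assumed defined; note that numPartitions alone only caps parallelism, and a truly partitioned scan also needs partitionColumn/lowerBound/upperBound):
table = (
    spark.read.format("jdbc")
    .option("url", "jdbc:oracle:thin://@XXX")          # placeholder, as in the Scala version
    .option("dbtable", "tcga.{}".format(table_name))   # table_name assumed defined
    .option("user", "XXX")
    .option("password", "XXX")
    .option("driver", "oracle.jdbc.driver.OracleDriver")
    .option("fetchsize", "50000")
    .option("numPartitions", "200")
    .load()
)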
import pandas as pd

# needs an Excel engine (openpyxl/xlrd) and a parquet engine (pyarrow or fastparquet)
data = pd.read_excel('pandas_example.xlsx', sheet_name=0)
data = data.astype({'A': 'int32', 'B': 'object'})
data.to_parquet('example.parquet')
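To sanity-check the file, it can be read back with the Spark session from above (assuming the file is visible to Spark on the local filesystem):
df_check = spark.read.parquet('example.parquet')
df_check.printSchema()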
import functools

def unionAll(dfs):
    # positional union, aligning every DataFrame to the first one's column order
    return functools.reduce(lambda df1, df2: df1.union(df2.select(df1.columns)), dfs)
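Example usage, with df1/df2/df3 standing in for any DataFrames that share the same columns (hypothetical names):
combined = unionAll([df1, df2, df3])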
# Save schema from the original DataFrame into json:
schema_json = df.schema.json()

# Restore schema from json:
import json
from pyspark.sql.types import StructType
new_schema = StructType.fromJson(json.loads(schema_json))
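The restored schema can then be passed to the reader so Spark skips inference (path below is hypothetical):
df2 = spark.read.schema(new_schema).json("s3a://path/to/json")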
# read a part of the whole datalake just to extract the schema
part = spark.read.json("s3a://path/to/json/part")
# create a temporary rdd in order to store the schema as a binary (pickle) file
temp_rdd = sc.parallelize(part.schema)
temp_rdd.coalesce(1).saveAsPickleFile("s3a://path/to/destination_schema.pickle")

# from now on the schema is persisted and can be reused
# to skip schema inference when reading the json files
schema_rdd = sc.pickleFile("s3a://path/to/destination_schema.pickle")
reading_schema = StructType(schema_rdd.collect())
your_data_set = spark.read.json("s3a://path/to/entire_data_lake", reading_schema)  # faster than plain spark.read.json()
# multiple dataframes
import os
from glob import glob
from functools import reduce
import pyspark

# Load GWAS dfs (gwas_pattern / mol_pattern are glob patterns defined elsewhere)
gwas_dfs = []
for inf in glob(gwas_pattern):
    inf = os.path.abspath(inf)
    df = spark.read.parquet(inf)
    gwas_dfs.append(df)

# Load molecular trait dfs
mol_dfs = []
for inf in glob(mol_pattern):
    inf = os.path.abspath(inf)
    df = (
        spark.read.parquet(inf)
        .drop('num_tests')
    )
    mol_dfs.append(df)

# Take union by column name
df = reduce(
    pyspark.sql.DataFrame.unionByName,
    gwas_dfs + mol_dfs
)
# empty schema
from pyspark.sql.types import StructType
empty_schema = StructType([])
joined_df = spark.createDataFrame([], empty_schema)
# read a sample CSV with header and inferred schema
df = spark.read.format("csv")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .load("/Users/vivshri/Downloads/2010-12-02.csv")
from pyspark.sql.functions import expr, locate

simpleColors = ["black", "white", "red", "green", "blue"]

def color_locator(column, color_string):
    # 1-based position of the color in the column, cast to a boolean flag
    return locate(color_string.upper(), column)\
        .cast("boolean")\
        .alias("is_" + color_string)

selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
selectedColumns.append(expr("*"))  # has to be a Column type
print(selectedColumns)
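One way to use the generated flag columns is as a filter (is_white/is_red come from simpleColors above; Description is assumed to exist in the CSV):
df.select(*selectedColumns)\
    .where(expr("is_white OR is_red"))\
    .select("Description")\
    .show(3, False)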
Try with the AWS Glue option
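A minimal sketch of the Glue route, assuming the code runs inside an AWS Glue job and a Glue Data Catalog database/table already point at the data (my_database/my_table are hypothetical names):
from pyspark.context import SparkContext
from awsglue.context import GlueContext

glue_context = GlueContext(SparkContext.getOrCreate())
# my_database and my_table are hypothetical catalog entries
dyf = glue_context.create_dynamic_frame.from_catalog(
    database="my_database",
    table_name="my_table",
)
df = dyf.toDF()  # convert back to a regular Spark DataFrame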
from urllib.parse import urlparse  # Python 3; `urlparse.urlparse` was the Python 2 form
hostname = urlparse(url).hostname or ''
zip -r deployment.zip .\index.js ..\..\node_modules\puppeteer-core\ ..\..\node_modules\chrome-aws-lambda\ ..\..\node_modules\lambdafs\ ..\..\node_modules\debug\ ..\..\node_modules\ms\ ..\..\node_modules\mime\ ..\..\node_modules\extract-zip\ ..\..\node_modules\yauzl\ ..\..\node_modules\pend\ ..\..\node_modules\mkdirp\ ..\..\node_modules\concat-stream\ ..\..\node_modules\readable-stream\ ..\..\node_modules\process-nextick-args\ ..\..\node_modules\isarray\ ..\..\node_modules\safe-buffer\ ..\..\node_modules\core-util-is\ ..\..\node_modules\inherits\ ..\..\node_modules\util-deprecate\ ..\..\node_modules\buffer-from\ ..\..\node_modules\rimraf\ ..\..\node_modules\proxy-from-env\ ..\..\node_modules\ws\ ..\..\node_modules\async-limiter\
Nightmare.js installation
zip -r new_deployment.zip .\index.js .\lib\ .\node_modules\nightmare\ .\node_modules\electron\ ..\node_modules\debug\ .\node_modules\ms\ .\node_modules\mime\ .\node_modules\extract-zip\ .\node_modules\yauzl\ .\node_modules\pend\ .\node_modules\mkdirp\ .\node_modules\concat-stream\ .\node_modules\readable-stream\ .\node_modules\process-nextick-args\ .\node_modules\isarray\ .\node_modules\safe-buffer\ .\node_modules\core-util-is\ .\node_modules\inherits\ .\node_modules\util-deprecate\ .\node_modules\buffer-from\ .\node_modules\rimraf\ .\node_modules\proxy-from-env\ .\node_modules\ws\ .\node_modules\async-limiter\ .\node_modules\sliced\ .\node_modules\jsesc\ .\node_modules\once\ .\node_modules\wrappy\
aws lambda update-function-code --function-name myNightmareFunction --zip-file fileb://new_deployment.zip
# the CLI requires an outfile argument for the response (output.json here)
aws lambda invoke --function-name myNightmareFunction --payload '{ "name": "Bob" }' output.json
zip -r new_deployment.zip .\index.js .\lib\ .\node_modules\nightmare\ .\node_modules\electron\ ..\node_modules\debug\ .\node_modules\ms\ .\node_modules\mime\ .\node_modules\extract-zip\ .\node_modules\yauzl\ .\node_modules\pend\ .\node_modules\mkdirp\ .\node_modules\concat-stream\ .\node_modules\readable-stream\ .\node_modules\process-nextick-args\ .\node_modules\isarray\ .\node_modules\safe-buffer\ .\node_modules\core-util-is\ .\node_modules\inherits\ .\node_modules\util-deprecate\ .\node_modules\buffer-from\ .\node_modules\rimraf\ .\node_modules\proxy-from-env\ .\node_modules\ws\ .\node_modules\async-limiter\ .\node_modules\sliced\ .\node_modules\jsesc\ .\node_modules\once\ .\node_modules\wrappy\ .\node_modules\split2\ .\node_modules\defaults\ .\node_modules\clone\ .\node_modules\minstache\ ; aws lambda update-function-code --function-name myNightmareFunction --zip-file fileb://new_deployment.zip ; aws lambda invoke --function-name myNightmareFunction --payload '{ "name": "Bob" }' output.json
---Docker---
## run a container (make sure to use -d so it stays running in the background)
docker run -it -d shykes/pybuilder /bin/bash
## attach a shell to a running container (the name comes from `docker ps`)
docker exec -t -i amazing_elbakyan /bin/bash
# note: with -d the shell redirect captures docker's own stdout (the container id),
# not the app output; use `docker logs` for the app output
docker run -v /tmp:/tmp -it -d dashboard_validation_without_cmd xvfb-run -a node dashboard-validation/automation.js > /tmp/logs.log
import boto3
ACCOUNT_ID = boto3.client('sts').get_caller_identity()['Account']
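The same invoke as the CLI calls above, done from Python via boto3 (function name reused from the Lambda section; the payload is just illustrative):
import json
import boto3

lambda_client = boto3.client('lambda')
response = lambda_client.invoke(
    FunctionName='myNightmareFunction',
    Payload=json.dumps({'name': 'Bob'}),
)
print(response['StatusCode'], response['Payload'].read())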
pip install -U --pre pipenv