amt
# bootstrap Spark from a plain Python interpreter via findspark
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

sc = pyspark.SparkContext.getOrCreate()
print(sc.version)
spark = SparkSession.builder.appName("vivek").getOrCreate()
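A quick sanity check that the session is alive (spark.range just builds a tiny single-column DataFrame):
df = spark.range(5)
df.show()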
# interactive alternative: launch the PySpark shell against a local master
pyspark --master local
puppeteer on bitnami/ubuntu
apt-get update && apt-get install -yq --no-install-recommends libasound2 libatk1.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 libnss3
const puppeteer = require('puppeteer');

async function run() {
  // executablePath pins the bundled Chromium; drop it to use the default download
  const browser = await puppeteer.launch({ headless: true, executablePath: "/root/node_modules/puppeteer/.local-chromium/linux-555668/chrome-linux/chrome" });
  //const browser = await puppeteer.launch()
  const page = await browser.newPage();
  await page.goto('https://github.com');
  await page.screenshot({ path: 'screenshots/github.png' });
  await browser.close();
}

run();
val table = sparkSession.read
  .format("jdbc")
  .option("url", "jdbc:oracle:thin://@XXX")
  .option("dbtable", "tcga.%s".format(tableName))
  .option("user", "XXX")
  .option("password", "XXX")
  .option("driver", "oracle.jdbc.driver.OracleDriver")
  .option("fetchsize", "50000")   // rows fetched per round trip
  .option("numPartitions", "200") // upper bound on concurrent JDBC partitions
  .load()
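For reference, a minimal PySpark sketch of the same JDBC read (XXX placeholders kept from above; table_name is assumed defined; note that numPartitions alone only caps parallelism, and a truly partitioned scan also needs partitionColumn/lowerBound/upperBound):
table = (
    spark.read.format("jdbc")
    .option("url", "jdbc:oracle:thin://@XXX")          # placeholder, as in the Scala version
    .option("dbtable", "tcga.{}".format(table_name))   # table_name assumed defined
    .option("user", "XXX")
    .option("password", "XXX")
    .option("driver", "oracle.jdbc.driver.OracleDriver")
    .option("fetchsize", "50000")
    .option("numPartitions", "200")
    .load()
)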
import pandas as pd

# needs an Excel engine (openpyxl/xlrd) and a parquet engine (pyarrow or fastparquet)
data = pd.read_excel('pandas_example.xlsx', sheet_name=0)
data = data.astype({'A': 'int32', 'B': 'object'})
data.to_parquet('example.parquet')
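To sanity-check the file, it can be read back with the Spark session from above (assuming the file is visible to Spark on the local filesystem):
df_check = spark.read.parquet('example.parquet')
df_check.printSchema()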
import functools

def unionAll(dfs):
    # positional union, aligning every DataFrame to the first one's column order
    return functools.reduce(lambda df1, df2: df1.union(df2.select(df1.columns)), dfs)
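Example usage, with df1/df2/df3 standing in for any DataFrames that share the same columns (hypothetical names):
combined = unionAll([df1, df2, df3])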
# Save schema from the original DataFrame into json:
schema_json = df.schema.json()

# Restore schema from json:
import json
from pyspark.sql.types import StructType
new_schema = StructType.fromJson(json.loads(schema_json))
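The restored schema can then be passed to the reader so Spark skips inference (path below is hypothetical):
df2 = spark.read.schema(new_schema).json("s3a://path/to/json")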
# read a part of the whole datalake just to extract the schema
part = spark.read.json("s3a://path/to/json/part")
# create a temporary rdd in order to store the schema as a binary (pickle) file
temp_rdd = sc.parallelize(part.schema)
temp_rdd.coalesce(1).saveAsPickleFile("s3a://path/to/destination_schema.pickle")

# from now on the schema is persisted and can be reused
# to skip schema inference when reading the json files
schema_rdd = sc.pickleFile("s3a://path/to/destination_schema.pickle")
reading_schema = StructType(schema_rdd.collect())
your_data_set = spark.read.json("s3a://path/to/entire_data_lake", reading_schema)  # faster than plain spark.read.json()
# multiple dataframes
import os
from glob import glob
from functools import reduce
import pyspark

# Load GWAS dfs (gwas_pattern / mol_pattern are glob patterns defined elsewhere)
gwas_dfs = []
for inf in glob(gwas_pattern):
    inf = os.path.abspath(inf)
    df = spark.read.parquet(inf)
    gwas_dfs.append(df)

# Load molecular trait dfs
mol_dfs = []
for inf in glob(mol_pattern):
    inf = os.path.abspath(inf)
    df = (
        spark.read.parquet(inf)
        .drop('num_tests')
    )
    mol_dfs.append(df)

# Take union by column name
df = reduce(
    pyspark.sql.DataFrame.unionByName,
    gwas_dfs + mol_dfs
)
# empty schema
from pyspark.sql.types import StructType
empty_schema = StructType([])
joined_df = spark.createDataFrame([], empty_schema)
# read a sample CSV with header and inferred schema
df = spark.read.format("csv")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .load("/Users/vivshri/Downloads/2010-12-02.csv")
from pyspark.sql.functions import expr, locate

simpleColors = ["black", "white", "red", "green", "blue"]

def color_locator(column, color_string):
    # 1-based position of the color in the column, cast to a boolean flag
    return locate(color_string.upper(), column)\
        .cast("boolean")\
        .alias("is_" + color_string)

selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
selectedColumns.append(expr("*"))  # has to be a Column type
print(selectedColumns)
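One way to use the generated flag columns is as a filter (is_white/is_red come from simpleColors above; Description is assumed to exist in the CSV):
df.select(*selectedColumns)\
    .where(expr("is_white OR is_red"))\
    .select("Description")\
    .show(3, False)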
Try with the AWS Glue option
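A minimal sketch of the Glue route, assuming the code runs inside an AWS Glue job and a Glue Data Catalog database/table already point at the data (my_database/my_table are hypothetical names):
from pyspark.context import SparkContext
from awsglue.context import GlueContext

glue_context = GlueContext(SparkContext.getOrCreate())
# my_database and my_table are hypothetical catalog entries
dyf = glue_context.create_dynamic_frame.from_catalog(
    database="my_database",
    table_name="my_table",
)
df = dyf.toDF()  # convert back to a regular Spark DataFrame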
from urllib.parse import urlparse  # Python 3; `urlparse.urlparse` was the Python 2 form
hostname = urlparse(url).hostname or ''
zip -r deployment.zip .\index.js ..\..\node_modules\puppeteer-core\ ..\..\node_modules\chrome-aws-lambda\ ..\..\node_modules\lambdafs\ ..\..\node_modules\debug\ ..\..\node_modules\ms\ ..\..\node_modules\mime\ ..\..\node_modules\extract-zip\ ..\..\node_modules\yauzl\ ..\..\node_modules\pend\ ..\..\node_modules\mkdirp\ ..\..\node_modules\concat-stream\ ..\..\node_modules\readable-stream\ ..\..\node_modules\process-nextick-args\ ..\..\node_modules\isarray\ ..\..\node_modules\safe-buffer\ ..\..\node_modules\core-util-is\ ..\..\node_modules\inherits\ ..\..\node_modules\util-deprecate\ ..\..\node_modules\buffer-from\ ..\..\node_modules\rimraf\ ..\..\node_modules\proxy-from-env\ ..\..\node_modules\ws\ ..\..\node_modules\async-limiter\
Nightmare.js installation
zip -r new_deployment.zip .\index.js .\lib\ .\node_modules\nightmare\ .\node_modules\electron\ ..\node_modules\debug\ .\node_modules\ms\ .\node_modules\mime\ .\node_modules\extract-zip\ .\node_modules\yauzl\ .\node_modules\pend\ .\node_modules\mkdirp\ .\node_modules\concat-stream\ .\node_modules\readable-stream\ .\node_modules\process-nextick-args\ .\node_modules\isarray\ .\node_modules\safe-buffer\ .\node_modules\core-util-is\ .\node_modules\inherits\ .\node_modules\util-deprecate\ .\node_modules\buffer-from\ .\node_modules\rimraf\ .\node_modules\proxy-from-env\ .\node_modules\ws\ .\node_modules\async-limiter\ .\node_modules\sliced\ .\node_modules\jsesc\ .\node_modules\once\ .\node_modules\wrappy\
aws lambda update-function-code --function-name myNightmareFunction --zip-file fileb://new_deployment.zip
# the CLI requires an outfile argument for the response (output.json here)
aws lambda invoke --function-name myNightmareFunction --payload '{ "name": "Bob" }' output.json
zip -r new_deployment.zip .\index.js .\lib\ .\node_modules\nightmare\ .\node_modules\electron\ ..\node_modules\debug\ .\node_modules\ms\ .\node_modules\mime\ .\node_modules\extract-zip\ .\node_modules\yauzl\ .\node_modules\pend\ .\node_modules\mkdirp\ .\node_modules\concat-stream\ .\node_modules\readable-stream\ .\node_modules\process-nextick-args\ .\node_modules\isarray\ .\node_modules\safe-buffer\ .\node_modules\core-util-is\ .\node_modules\inherits\ .\node_modules\util-deprecate\ .\node_modules\buffer-from\ .\node_modules\rimraf\ .\node_modules\proxy-from-env\ .\node_modules\ws\ .\node_modules\async-limiter\ .\node_modules\sliced\ .\node_modules\jsesc\ .\node_modules\once\ .\node_modules\wrappy\ .\node_modules\split2\ .\node_modules\defaults\ .\node_modules\clone\ .\node_modules\minstache\ ; aws lambda update-function-code --function-name myNightmareFunction --zip-file fileb://new_deployment.zip ; aws lambda invoke --function-name myNightmareFunction --payload '{ "name": "Bob" }' output.json
---Docker---
## run a container (make sure to use -d so it stays running in the background)
docker run -it -d shykes/pybuilder /bin/bash
## attach a shell to a running container (the name comes from `docker ps`)
docker exec -t -i amazing_elbakyan /bin/bash
# note: with -d the shell redirect captures docker's own stdout (the container id),
# not the app output; use `docker logs` for the app output
docker run -v /tmp:/tmp -it -d dashboard_validation_without_cmd xvfb-run -a node dashboard-validation/automation.js > /tmp/logs.log
import boto3
ACCOUNT_ID = boto3.client('sts').get_caller_identity()['Account']
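The same invoke as the CLI calls above, done from Python via boto3 (function name reused from the Lambda section; the payload is just illustrative):
import json
import boto3

lambda_client = boto3.client('lambda')
response = lambda_client.invoke(
    FunctionName='myNightmareFunction',
    Payload=json.dumps({'name': 'Bob'}),
)
print(response['StatusCode'], response['Payload'].read())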
pip install -U --pre pipenv