GitHub Gists by Siaterlis Konstantinos (siakon89)
import os
from decimal import Decimal  # DynamoDB numbers must be Decimal, not float

import boto3

# The table name is injected through the Lambda environment
DYNAMODB_TABLE = os.environ.get("DYNAMODB_TABLE")
dynamodb = boto3.resource("dynamodb")
dynamodb_table = dynamodb.Table(DYNAMODB_TABLE)
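# A minimal put_item sketch showing why Decimal is imported above: DynamoDB
# rejects Python floats, so numeric attributes are wrapped in Decimal.
# The key and attribute names here are assumptions, not from the original gist.
dynamodb_table.put_item(
    Item={
        "id": "image-123",              # assumed partition key
        "confidence": Decimal("0.97"),  # numbers go in as Decimal
    }
)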
AWSTemplateFormatVersion: '2010-09-09'
Transform: AWS::Serverless-2016-10-31
Description: >
  My Image App - Label images

# Global definitions
# In this example the globals apply only to the Lambda functions
Globals:
  Function:
    Runtime: python3.6
# Filesystem schemes and URIs
|------------|--------------------|
| Filesystem | URI Structure      |
|------------|--------------------|
| Local FS   | file:///path       |
| HDFS       | hdfs://hdfs_path   |
| S3         | s3://bucket/object |
|------------|--------------------|
# Loading a file into an RDD (swap the scheme per the table above)
rdd = sc.textFile("file:///path/to/file.txt")
# GroupBy a column and count
df.groupby("column").count().show()
# GroupBy a column and sum
df.groupby("column1").sum("column2").show()
# GroupBy with multiple columns
df.groupby("column1", "column2").count().show()
# GroupBy with multiple columns and sum multiple columns
df.groupby("column1", "column2").sum("column3", "column4").show()
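# A mixed-aggregation sketch via the standard agg() API; the column names and
# the sum/max pairing are placeholders, not from the original snippet
df.groupby("column1").agg({"column2": "sum", "column3": "max"}).show()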
# Filter your DataFrame and select a column
df.filter(df.column1 > 20).select("column2").show()
# Filter your DataFrame with AND
df.filter((df.column1 > 20) & (df.column2 < 10)).select("column2").show()
# Filter your DataFrame with OR
df.filter((df.column1 > 20) | (df.column2 < 10)).select("column2").show()
# Select one column from your DataFrame
df.select("column").show()
# Select multiple columns from your DataFrame
df.select("column1", "column2").show()
# Select a column and add 1 to every entry
df.select(df.column + 1).show()
# Reading a CSV
df = spark.read.csv("filename.csv")
# Reading a CSV with header
df = spark.read.csv("filename.csv", header=True)
# Reading a CSV using the load method
df = spark.read.format("csv").load("filename.csv")
# Reading a CSV using the load method with header
df = spark.read.format("csv").option("header", "true").load("filename.csv")
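# A common companion option (standard Spark API): ask Spark to infer column
# types instead of reading every field as a string
df = spark.read.csv("filename.csv", header=True, inferSchema=True)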
import boto3

# SQL query to run
query = "SELECT * FROM table LIMIT 10"
# Create the client
athena = boto3.client("athena")
# Submit the query - the response carries a QueryExecutionId;
# the database name and output location below are placeholders
response = athena.start_query_execution(
    QueryString=query,
    QueryExecutionContext={"Database": "my_database"},
    ResultConfiguration={"OutputLocation": "s3://my-bucket/athena-results/"},
)
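# A follow-up sketch using the standard boto3 Athena calls: poll the execution
# until it finishes, then fetch the result rows
import time

query_id = response["QueryExecutionId"]
while True:
    status = athena.get_query_execution(QueryExecutionId=query_id)
    state = status["QueryExecution"]["Status"]["State"]
    if state in ("SUCCEEDED", "FAILED", "CANCELLED"):
        break
    time.sleep(1)

if state == "SUCCEEDED":
    rows = athena.get_query_results(QueryExecutionId=query_id)["ResultSet"]["Rows"]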
# List all buckets
aws s3 ls
# Use a different profile
aws s3 ls --profile <profile name>
# List objects in a bucket
aws s3 ls s3://<my bucket>
# Create a bucket
aws s3 mb s3://<my bucket>