Criterion,VMs,CloudSQL,Winner
Pricing per instance (n1-highmem-16),$686.33,"$1,387.98",VMs as it is much cheaper
Failover,"Master-slave switchover (extra work to set up and maintain); the slave can double as a read replica (no additional cost)","HA configuration (handled by GCP without additional work, see https://cloud.google.com/sql/docs/mysql/high-availability); an additional, exclusive standby instance is needed that cannot be used as a read replica (a fair amount of additional cost)","VMs are more cost-effective; CloudSQL is easier to set up and needs no maintenance work from us"
Multiple read replicas,"Chained replication is feasible, e.g. db01 -> db02 -> db03","All replicas have to replicate the same primary, e.g. db02 and db03 both have to replicate directly from db01","VMs, because AASF needs chained replication"
Maintenance window,"No maintenance window is required; we control when to upgrade or patch the system","A maintenance window has to be set, and maintenance takes place automatically during that window","VMs, because we keep full control over when maintenance happens"
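To make the CloudSQL column concrete, here is a minimal sketch of creating the HA primary and a read replica through the Cloud SQL Admin API (v1beta4) from Python; the project id and instance names (db01, db02) are hypothetical, and application-default credentials are assumed.

from googleapiclient import discovery

sqladmin = discovery.build('sqladmin', 'v1beta4')  # uses application-default credentials
project = 'my-gcp-project'  # hypothetical project id

# primary with the HA (REGIONAL) configuration: GCP manages a standby that cannot serve reads
sqladmin.instances().insert(project=project, body={
    'name': 'db01',
    'databaseVersion': 'MYSQL_5_7',
    'settings': {'tier': 'db-n1-highmem-16', 'availabilityType': 'REGIONAL'},
}).execute()

# read replica: it must replicate the primary directly (no db01 -> db02 -> db03 chains)
sqladmin.instances().insert(project=project, body={
    'name': 'db02',
    'masterInstanceName': 'db01',
    'settings': {'tier': 'db-n1-highmem-16'},
}).execute()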
wwwbbb8510 / pyspark-example-third-output.py
Created September 20, 2019 04:53
from pyspark.sql import Row

# convert the pyspark dataframe to a pandas dataframe, which will be scanned for each user
pdf_user = df_user.toPandas()

# function to find the superior_id: the id of the most recently modified user
# sharing the same first and last name
def find_superior_user(row):
    first_name, last_name = row.first_name, row.last_name
    pdf_matched_users = pdf_user[(pdf_user['first_name'] == first_name) & (pdf_user['last_name'] == last_name)]
    pdf_matched_users = pdf_matched_users.sort_values(by=['modified_date'], ascending=False)
    return Row(id=row.id, superior_id=int(pdf_matched_users.iloc[0]['id']))

# apply the superior-user lookup to every row of the user dataframe
df_superior_user = df_user.rdd.map(find_superior_user)
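A possible next step, not shown in the snippet: materialise the mapped RDD of Rows back into a dataframe (spark_session is defined in pyspark-example-init-session.py below).

df_superior = spark_session.createDataFrame(df_superior_user)  # assumed follow-up
df_superior.show(5)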
wwwbbb8510 / pyspark-example-second-output.py
Created September 20, 2019 04:41
from datetime import datetime
from pyspark.sql import Row

curr_datetime = datetime.now()
# function to calculate the age from the date_of_birth field
def calculate_age_from_dob(row):
    age = None
    dob_datetime = row.date_of_birth
    try:
        # compare month/day in a fixed year to check whether the birthday has passed
        curr_month_datetime = datetime(2018, curr_datetime.month, curr_datetime.day)
        dob_month_datetime = datetime(2018, dob_datetime.month, dob_datetime.day)
        age = curr_datetime.year - dob_datetime.year
        if curr_month_datetime < dob_month_datetime:  # birthday not reached yet this year
            age -= 1  # assumed completion: the preview truncates the function here
    except (TypeError, ValueError):
        pass
    return Row(id=row.id, age=age)
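Assumed usage of the function above, mirroring the map pattern of the other snippets:

df_user_age = df_user.rdd.map(calculate_age_from_dob).toDF()
df_user_age.show(5)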
wwwbbb8510 / pyspark-example-first-output.py
Created September 20, 2019 04:30
# function to register a pyspark dataframe as a temporary table/view
def register_dataframe_as_table(df_data, table_name):
    loaded = False
    try:
        df_data.createOrReplaceTempView(table_name)
        loaded = True
    except Exception:
        pass
    return loaded

# SQL to join user and transaction through user_transaction, sorted by the transaction's created_date
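The SQL itself is cut off in the preview; the sketch below is one plausible reading of that comment, assuming view names user, transaction, user_transaction and hypothetical column names (user_id, transaction_id, created_date).

df_user_transactions = spark_session.sql("""
    SELECT u.id AS user_id, u.first_name, u.last_name, t.*
    FROM user u
    JOIN user_transaction ut ON ut.user_id = u.id
    JOIN transaction t ON t.id = ut.transaction_id
    ORDER BY t.created_date
""")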
wwwbbb8510 / pyspark-example-load-data.py
Created September 20, 2019 04:25
# load a parquet folder as a table/view (spark_session comes from pyspark-example-init-session.py)
def load_table_from_parquet_file(table_name, parquet_path):
    loaded = False
    try:
        lc_parquet_file = spark_session.read.option("mergeSchema", "true").parquet(parquet_path)
        lc_parquet_file.createOrReplaceTempView(table_name)
        loaded = True
    except Exception:
        pass
    return loaded
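Assumed usage, with a hypothetical parquet folder under hdfs_base_uri (defined in pyspark-example-init-session.py):

loaded = load_table_from_parquet_file('user', hdfs_base_uri + '/user_parquet')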
wwwbbb8510 / pyspark-example-fake-data.py
Created September 20, 2019 04:20
# function to save a pyspark dataframe to HDFS as csv or parquet
def save_pyspark_dataframe(df_data, folder, format='csv'):
    saved_folder = hdfs_base_uri + '/' + folder
    if format == 'csv':
        df_data.write.csv(saved_folder, header=True)
    elif format == 'parquet':
        df_data.write.parquet(saved_folder)
    return saved_folder
# user table fields: 'id', 'email', 'first_name', 'last_name', 'created_date', 'modified_date'
# generate 100 users
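The generation code itself is truncated; the sketch below is one way to produce the 100 users described by the comments, with placeholder values (the original may well have used a faker library instead).

from datetime import datetime
from pyspark.sql import Row

users = [Row(id=i, email='user{}@example.com'.format(i),
             first_name='First{}'.format(i), last_name='Last{}'.format(i),
             created_date=datetime.now(), modified_date=datetime.now())
         for i in range(1, 101)]
df_user = spark_session.createDataFrame(users)
save_pyspark_dataframe(df_user, 'user_parquet', format='parquet')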
wwwbbb8510 / pyspark-example-init-session.py
Last active September 20, 2019 04:06
from pyspark.sql import SparkSession

# function to init a spark session
def init_spark_session(app_name):
    spark_session = SparkSession.builder.appName(app_name).getOrCreate()
    return spark_session
# init spark session
app_name = 'pyspark example of basic functions'
spark_session = init_spark_session(app_name)
# define the base uri of hadoop file system
hdfs_base_uri = 'hdfs://node-master:9000//user/hadoop/spark_examples'
# define user, transaction, user transaction data folder (csv and parquet)
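The folder definitions are cut off in the preview; the names below are hypothetical, chosen to match the save_pyspark_dataframe calls sketched earlier:

user_csv_folder, user_parquet_folder = 'user_csv', 'user_parquet'
transaction_csv_folder, transaction_parquet_folder = 'transaction_csv', 'transaction_parquet'
user_transaction_csv_folder, user_transaction_parquet_folder = 'user_transaction_csv', 'user_transaction_parquet'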