| Criterion | VMs | CloudSQL | Winner |
| --- | --- | --- | --- |
| Pricing per instance (n1-highmem-16) | $686.33 | $1,387.98 | VMs, as they are much cheaper |
| Failover | Switching master-slave needs extra work to set up and maintain; the slave can serve as a read replica (no additional cost) | HA configuration is supported by GCP without additional work (https://cloud.google.com/sql/docs/mysql/high-availability), but an additional, exclusive standby instance is needed, which cannot be used as a read replica (a fair amount of additional cost) | VMs are more cost-effective; CloudSQL is easy to set up and needs no maintenance work from us |
| Multiple read replicas | Chain-like replication is feasible, e.g. db01->db02->db03 | All replicas have to replicate the same primary, e.g. db02 and db03 have to chain from db01 | VMs, because AASF needs chain-like replication |
| Maintenance window | No maintenance window is required; we control when to upgrade or patch the system | A maintenance window has to be set, and maintenance automatically takes place during the window | VMs, because we have full control |
from pyspark.sql import Row

# convert the pyspark dataframe to a pandas dataframe, which will be scanned for each user
pdf_user = df_user.toPandas()

# function to find the superior_id: the most recently modified user with the same name
def find_superior_user(row):
    first_name, last_name = row.first_name, row.last_name
    pdf_matched_users = pdf_user[(pdf_user['first_name'] == first_name) & (pdf_user['last_name'] == last_name)]
    pdf_matched_users = pdf_matched_users.sort_values(by=['modified_date'], ascending=False)
    return Row(id=row.id, superior_id=int(pdf_matched_users.iloc[0]['id']))

# apply the function of finding the superior user to every row of the user dataframe
df_superior_user = df_user.rdd.map(find_superior_user)
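
# A minimal usage sketch (an assumption, not in the original): the map above
# yields an RDD of Rows, so convert it back to a pyspark dataframe to inspect it.
spark_session.createDataFrame(df_superior_user).show(5)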
from datetime import datetime

# function to calculate the age
curr_datetime = datetime.now()

def calculate_age_from_dob(row):
    age = None
    dob_datetime = row.date_of_birth
    try:
        # map both dates onto the same fixed year so only month/day are compared
        curr_month_datetime = datetime(2018, curr_datetime.month, curr_datetime.day)
        dob_month_datetime = datetime(2018, dob_datetime.month, dob_datetime.day)
        age = curr_datetime.year - dob_datetime.year
        # assumed continuation: subtract a year if the birthday has not occurred yet this year
        if dob_month_datetime > curr_month_datetime:
            age -= 1
    except (TypeError, ValueError):
        pass
    return age
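
# A hedged usage sketch (not part of the original): apply the age function to
# each user row, mirroring the rdd.map pattern used elsewhere in this gist.
from pyspark.sql import Row
rdd_user_age = df_user.rdd.map(lambda row: Row(id=row.id, age=calculate_age_from_dob(row)))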
# function to register a pyspark dataframe as a table/view
def register_dataframe_as_table(df_data, table_name):
    loaded = False
    try:
        df_data.createOrReplaceTempView(table_name)
        loaded = True
    except Exception:
        pass
    return loaded

# sql to join user and transaction through user_transaction and sort user transactions by the created_date of the transaction
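# A sketch of the query the comment above describes, assuming the views 'user',
# 'transaction', and 'user_transaction' were registered with
# register_dataframe_as_table; the join-column names are assumptions.
sql_user_transaction = """
    SELECT u.id AS user_id, t.id AS transaction_id, t.created_date
    FROM `user` u
    JOIN `user_transaction` ut ON ut.user_id = u.id
    JOIN `transaction` t ON t.id = ut.transaction_id
    ORDER BY t.created_date
"""
df_user_transaction = spark_session.sql(sql_user_transaction)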
# load a parquet folder as a table/view
def load_table_from_parquet_file(table_name, parquet_path):
    loaded = False
    try:
        lc_parquet_file = spark_session.read.option("mergeSchema", "true").parquet(parquet_path)
        lc_parquet_file.createOrReplaceTempView(table_name)
        loaded = True
    except Exception:
        pass
    return loaded
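
# A hedged usage sketch (the folder name is an assumption): register the user
# parquet folder as the view 'user' and query it if loading succeeded.
if load_table_from_parquet_file('user', hdfs_base_uri + '/user_parquet'):
    spark_session.sql('SELECT COUNT(*) FROM `user`').show()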
# define a function for saving a pyspark dataframe
def save_pyspark_dataframe(df_data, folder, format='csv'):
    saved_folder = hdfs_base_uri + '/' + folder
    if format == 'csv':
        df_data.write.csv(saved_folder, header=True)
    elif format == 'parquet':
        df_data.write.parquet(saved_folder)
    return saved_folder

# user table fields: 'id', 'email', 'first_name', 'last_name', 'created_date', 'modified_date'
# generate 100 users
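# A minimal sketch of the generation step (all field values are assumptions),
# producing 100 users with the fields listed above and saving them as csv.
from datetime import datetime
from pyspark.sql import Row

now = datetime.now()
rows_user = [Row(id=i, email='user%03d@example.com' % i, first_name='First%03d' % i,
                 last_name='Last%03d' % i, created_date=now, modified_date=now)
             for i in range(100)]
df_user = spark_session.createDataFrame(rows_user)
save_pyspark_dataframe(df_user, 'user_csv')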
from pyspark.sql import SparkSession

# function to init a spark session
def init_spark_session(app_name):
    spark_session = SparkSession.builder.appName(app_name).getOrCreate()
    return spark_session

# init the spark session
app_name = 'pyspark example of basic functions'
spark_session = init_spark_session(app_name)

# define the base uri of the hadoop file system
hdfs_base_uri = 'hdfs://node-master:9000//user/hadoop/spark_examples'

# define the user, transaction, and user transaction data folders (csv and parquet)
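# A hedged sketch of the folder definitions the comment above refers to; the
# exact names are assumptions, chosen to match save_pyspark_dataframe's layout.
user_csv_folder, user_parquet_folder = 'user_csv', 'user_parquet'
transaction_csv_folder, transaction_parquet_folder = 'transaction_csv', 'transaction_parquet'
user_transaction_csv_folder, user_transaction_parquet_folder = 'user_transaction_csv', 'user_transaction_parquet'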