- Snowflake architecture
- Virtual Warehouse
- Internal stage
- AWS/Azure/GCP based external stage
- Snowpipe
- File formats
- Task
- Dependent Task
- Micro-partitioning
- ADF to Snowflake
/Users/welcome/sai_workspace/python_envs
welcome@welcomes-MacBook-Pro python_envs % python3 -m venv mlops_water_metrics
welcome@welcomes-MacBook-Pro python_envs % pwd
/Users/welcome/sai_workspace/python_envs
cd /Users/welcome/sai_workspace/python_envs/mlops_water_metrics/bin
source activate
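Once activated, a quick sanity check from Python confirms the interpreter is the venv's own (a minimal sketch; the paths assume the mlops_water_metrics env created above):

```python
# Confirm the active interpreter lives inside the venv created above.
import sys

print(sys.prefix)      # expect .../python_envs/mlops_water_metrics
print(sys.executable)  # expect .../mlops_water_metrics/bin/python3
```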
aws emr create-cluster \
--name "ModelServing" \
--log-uri "s3n://aws-logs-654288303595-us-east-1/elasticmapreduce/" \
--release-label "emr-6.7.0" \
--service-role "arn:aws:iam::654288303595:role/EMR_DefaultRole" \
--ec2-attributes '{"InstanceProfile":"EMR_EC2_DefaultRole","EmrManagedMasterSecurityGroup":"sg-0cf75a954ffb6d02e","EmrManagedSlaveSecurityGroup":"sg-0fc72ac9d5e6a6759","KeyName":"welcome_emr","AdditionalMasterSecurityGroups":[],"AdditionalSlaveSecurityGroups":[],"SubnetId":"subnet-0066baf1bb164b3de"}' \
--applications Name=Hadoop Name=Hive Name=Hue Name=Pig Name=Spark \
--instance-groups '[{"InstanceCount":1,"InstanceGroupType":"MASTER","Name":"Master - 1","InstanceType":"m5.xlarge","EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"VolumeType":"gp2","SizeInGB":40},"VolumesPerInstance":1}],"EbsOptimized":true}},{"InstanceCount":2,"InstanceGroupType":"CORE","Name":"Core - 2","InstanceType":"m5.xlarge","EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"VolumeType":"gp2","SizeInGB":40},"VolumesPerInstance":1}],"EbsOptimized":true}}]'
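For scripted setups, the same cluster can also be launched from Python. Below is a hedged boto3 sketch of an equivalent run_job_flow call; the account ID, roles, key pair, subnet, and security groups are taken from the CLI command above and would need to exist in your account:

```python
# Sketch: launch the same EMR cluster via boto3 (assumes AWS credentials
# are configured and the roles/subnet/security groups above exist).
import boto3

emr = boto3.client("emr", region_name="us-east-1")

# One gp2 40 GB EBS volume per instance, mirroring the CLI command.
ebs = {
    "EbsBlockDeviceConfigs": [
        {"VolumeSpecification": {"VolumeType": "gp2", "SizeInGB": 40},
         "VolumesPerInstance": 1}
    ],
    "EbsOptimized": True,
}

response = emr.run_job_flow(
    Name="ModelServing",
    LogUri="s3n://aws-logs-654288303595-us-east-1/elasticmapreduce/",
    ReleaseLabel="emr-6.7.0",
    Applications=[{"Name": a} for a in ["Hadoop", "Hive", "Hue", "Pig", "Spark"]],
    ServiceRole="arn:aws:iam::654288303595:role/EMR_DefaultRole",
    JobFlowRole="EMR_EC2_DefaultRole",
    Instances={
        "Ec2KeyName": "welcome_emr",
        "Ec2SubnetId": "subnet-0066baf1bb164b3de",
        "EmrManagedMasterSecurityGroup": "sg-0cf75a954ffb6d02e",
        "EmrManagedSlaveSecurityGroup": "sg-0fc72ac9d5e6a6759",
        "KeepJobFlowAliveWhenNoSteps": True,
        "InstanceGroups": [
            {"Name": "Master - 1", "InstanceRole": "MASTER",
             "InstanceType": "m5.xlarge", "InstanceCount": 1,
             "EbsConfiguration": ebs},
            {"Name": "Core - 2", "InstanceRole": "CORE",
             "InstanceType": "m5.xlarge", "InstanceCount": 2,
             "EbsConfiguration": ebs},
        ],
    },
)
print(response["JobFlowId"])
```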
https://nodejs.org/dist/v18.14.2/node-v18.14.2.pkg
This package will install:
• Node.js v18.14.2 to /usr/local/bin/node
• npm v9.5.0 to /usr/local/bin/npm
# Create a File
seq 1 100000000 > my_file.txt
# Check File Size
welcome@welcomes-MacBook-Pro temp_data % ls -hl
total 2326664
-rw-r--r-- 1 welcome staff 1.1G Mar 3 20:27 my_file.txt
-rw-r--r-- 1 welcome staff 71B Mar 3 20:22 read_file.sh
welcome@welcomes-MacBook-Pro temp_data %
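The read_file.sh script in the listing isn't shown in these notes; as a sketch, the same large file can be streamed line by line from Python so the full 1.1G never has to sit in memory at once:

```python
# Stream through my_file.txt one line at a time; Python's file iterator
# reads buffered chunks, so memory use stays flat regardless of file size.
line_count = 0
with open("my_file.txt") as f:
    for line in f:
        line_count += 1
print(f"lines: {line_count}")  # expect 100000000 for the seq output above
```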
# Parquet compression options

Parquet is designed for large-scale data and supports several compression codecs. Depending on your data and workload, a different codec may be the better fit.

- LZ4: codec loosely based on the LZ4 compression algorithm, but with an additional undocumented framing scheme. The framing comes from the original Hadoop compression library; it was historically copied first into parquet-mr, then emulated with mixed results by parquet-cpp.
- LZO: codec based on or interoperable with the LZO compression library.
- GZIP: codec based on the GZIP format (not the closely related "zlib" or "deflate" formats) defined by RFC 1952.
- Snappy: the default compression for Parquet files.
- ZSTD: codec based on the Zstandard format defined by RFC 8478; the highest compression ratio of the codecs listed here.
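To see the trade-off in practice, here is a minimal sketch (assuming pandas with the pyarrow engine installed) that writes the same frame under several codecs and compares the resulting file sizes:

```python
# Write one DataFrame as Parquet under different codecs and compare sizes.
# Assumes: pip install pandas pyarrow
import os
import pandas as pd

df = pd.DataFrame({
    "id": range(1_000_000),
    "value": [i % 97 for i in range(1_000_000)],
})

for codec in ["snappy", "gzip", "zstd", "lz4"]:
    path = f"data_{codec}.parquet"
    df.to_parquet(path, compression=codec)
    print(f"{codec:8s} {os.path.getsize(path):>12,} bytes")
```

Snappy generally favors speed over ratio, while zstd and gzip trade CPU for smaller files; actual sizes depend heavily on the data.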
1. Hive joins
2. SQL functions and window functions (write out one example in notepad)
3. Top 3 records, or top-n records
4. Best file format for Hive (answer should be Parquet; know why)
5. Map-side vs reduce-side join
6. Spark connectors, Spark-with-Hive connectors
7. reduceByKey vs groupByKey (good answer: groupByKey causes more shuffle, reduceByKey less, because it combines on the map side; see the sketch after this list)
8. cache vs persist
9. repartition vs coalesce
10. RDD vs DataFrame
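For item 7, a minimal PySpark sketch (assuming a local SparkContext) that contrasts the two: reduceByKey combines values within each partition before the shuffle, so less data crosses the network than with groupByKey, which ships every (key, value) pair.

```python
# Contrast reduceByKey and groupByKey on a tiny word-count-style RDD.
from pyspark import SparkContext

sc = SparkContext("local[*]", "shuffle-demo")
pairs = sc.parallelize([("a", 1), ("b", 1), ("a", 1), ("a", 1)])

# Pre-aggregates per partition (map-side combine), then shuffles partial sums.
reduced = pairs.reduceByKey(lambda x, y: x + y).collect()

# Shuffles every (key, value) pair, then sums after grouping.
grouped = pairs.groupByKey().mapValues(sum).collect()

print(reduced)  # [('a', 3), ('b', 1)] (order may vary)
print(grouped)  # same totals, but computed after a full shuffle
sc.stop()
```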
1. https://www.youtube.com/watch?v=AK7_m-aThfw
2. https://www.youtube.com/watch?v=Oo2FoYgRBvE&list=PLaIYQ9kvDKjroSixUUJJEdlvh-jr8X3ER
3. https://www.youtube.com/watch?v=-WEpWH1NHGU
4. https://www.mygreatlearning.com/blog/sql-interview-questions/
# Hive
1. https://www.youtube.com/watch?v=8tFyr02GYzc&list=PLIeFQEzpZM9gDh5UWCPeI11M-Ykm4scq1