The script creates a Dataproc cluster with two worker nodes and a master node in GCP. The cluster is scaled by an autoscaling policy and includes the optional components ZooKeeper and HBase. In addition, Kafka is installed through an initialization action. The exact procedure is explained within the file.
# To use this file, you need to install Terraform. Installation instructions can be found here: https://learn.hashicorp.com/tutorials/terraform/install-cli.
# Put this file in an empty directory and name it main.tf. Terraform loads every .tf file in the working directory; main.tf is simply the conventional name.
# Then create a subdirectory called key.
# Get a service account key from your GCP project. In the Cloud Console, go to IAM & Admin -> Service Accounts -> Compute Engine default service account -> Keys -> Add Key -> Create new key -> JSON.
# The key file will be downloaded to your computer.
# Move the downloaded key into the key directory and rename it to GCPkey.json.
# Then replace all values that are in this format: <<< value >>>.
# Open a shell and change into the directory that contains main.tf.
# Run the command "terraform init" there. Terraform will now download the required provider plugins.
# After Terraform has been initialized, run the "terraform apply" command.
# Terraform will now show you what it will build. To confirm this, type "yes" and press enter.
# The build process may take three to four minutes.
# Terraform may now fail with an error like this: Error 409: The requested bucket name is not available. The bucket namespace is shared by all users of the system. Please choose another name and try again.
# This means the bucket name is already taken: the cluster's buckets are derived from the cluster name, and bucket names must be globally unique across all of Cloud Storage. Choose a different cluster name and apply again.
# When Terraform is finished, you should see 3 new VMs in the GCP -> Compute Engine. The names should be <cluster-name>-w-0, <cluster-name>-w-1 and <cluster-name>-m.
# The cluster itself can be found in GCP -> Dataproc -> Clusters.
# The buckets for the cluster can be found under GCP -> Cloud Storage -> Buckets. The names should be <cluster-name>-staging and <cluster-name>-temp.
# To destroy the cluster, run the "terraform destroy" command where you ran the "terraform apply" command.
# Terraform will ask you for confirmation before destroying the cluster.
# Type yes to agree to destroy the cluster.
# After that, terraform will destroy the cluster and you should no longer see it in GCP -> Dataproc -> Clusters. The VMs in GCP -> Compute Engine and the buckets under GCP -> Cloud Storage -> Buckets should also be gone.
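#
# In short, the typical command sequence from a shell in this directory is
# (a rough sketch; adjust to your environment):
#   terraform init      # download the required provider
#   terraform plan      # optional: preview the planned changes
#   terraform apply     # create the buckets, the autoscaling policy and the cluster
#   terraform destroy   # tear everything down again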
variable "project" {
type = string
default = "<<< your project >>>"
description = "The GCP-project in which Dataproc cluster will be set up."
}
variable "zone" {
type = string
default = "<<< your zone >>>"
description = "The zone in which Dataproc cluster will be set up."
}
variable "region" {
type = string
default = "<<< your region >>>"
description = "The region in which Dataproc cluster will be set up."
}
variable "clusterName" {
type = string
default = "<<< your cluster Name >>>"
description = "The name of the Dataproc cluster."
}
provider "google" {
credentials = file("./key/GCPkey.json")
project = var.project
region = var.region
zone = var.zone
}
terraform {
required_providers {
google = {
source = "hashicorp/google-beta"
version = "3.84.0"
}
}
}
resource "google_storage_bucket" "staging-bucket" {
name = "${var.clusterName}-staging"
storage_class = "STANDARD"
project = var.project
location = var.region
force_destroy = true
}
resource "google_storage_bucket" "temp-bucket" {
name = "${var.clusterName}-temp"
storage_class = "STANDARD"
project = var.project
location = var.region
force_destroy = true
}
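# Autoscaling policy referenced by the cluster below: the two primary workers
# stay fixed (min = max = 2), while up to 10 secondary (by default preemptible)
# workers are added or removed based on YARN memory metrics.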
resource "google_dataproc_autoscaling_policy" "dataproc-scaling" {
policy_id = "dataproc-policy"
location = var.region
worker_config {
min_instances = 2
max_instances = 2
}
secondary_worker_config {
min_instances = 0
max_instances = 10
}
basic_algorithm {
yarn_config {
graceful_decommission_timeout = "30s"
scale_up_factor = 1
scale_down_factor = 0.3
}
}
}
resource "google_dataproc_cluster" "cluster_instance"{
name = "${var.clusterName}"
region = var.region
cluster_config {
staging_bucket = google_storage_bucket.staging-bucket.name
temp_bucket = google_storage_bucket.temp-bucket.name
autoscaling_config {
policy_uri = google_dataproc_autoscaling_policy.dataproc-scaling.name
}
gce_cluster_config {
zone = var.zone
subnetwork = "projects/${var.project}/regions/${var.region}/subnetworks/${var.project}-vpc-sub1"
tags = [<<< your Tags >>>]
metadata = {
ssh-keys = <<EOF
<<< your SSH key >>>
EOF
}
}
master_config {
num_instances = 1
machine_type = "e2-highmem-4"
disk_config {
boot_disk_size_gb = 100
}
}
worker_config {
num_instances = 2
machine_type = "e2-highmem-4"
disk_config {
boot_disk_size_gb = 100
}
}
initialization_action {
script = "gs://goog-dataproc-initialization-actions-${var.region}/kafka/kafka.sh"
}
software_config {
image_version = "2.0-ubuntu18"
optional_components = ["HBASE","ZOOKEEPER"]
}
endpoint_config {
enable_http_port_access = "true"
}
}
}
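
# Optional: a minimal sketch of output values, assuming you want the created
# resource names printed after "terraform apply". Remove this block if you do
# not need it.
output "dataproc_cluster_name" {
  value       = google_dataproc_cluster.cluster_instance.name
  description = "Name of the created Dataproc cluster."
}

output "dataproc_buckets" {
  value       = [google_storage_bucket.staging-bucket.name, google_storage_bucket.temp-bucket.name]
  description = "Staging and temp buckets created for the cluster."
}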