# Extra instructions for accessing and querying the Hail tables from the Pan-UK Biobank release
# https://pan.ukbb.broadinstitute.org/docs/hail-format/index.html
#
# Open a Google Cloud account, set up billing, and note your project ID (word-word-number)
# Create a virtual machine
# Select Ubuntu 16.04
# Allow access to all Cloud APIs (unknown if needed)
# Hail needs Python 3.7
$ sudo apt install software-properties-common
$ sudo add-apt-repository ppa:deadsnakes/ppa
$ sudo apt-get update
$ sudo apt install python3.7
$ sudo update-alternatives --install /usr/bin/python python /usr/bin/python3.7 1
$ sudo apt-get install python3-pip
$ python3.7 -m pip install hail
# Had to do $ python3.7 -m pip install pypandoc
# https://hail.is/docs/0.2/cloud/google_cloud.html
" To allow hail to read from GCS when running locally, you need to install the Cloud Storage Connector."
" The easiest way to do that is to run the following script from your command line:"
# Had to do the gcloud login for the gcs install to work (follow login link and prompt)
# $ gcloud auth application-default login
$ curl -sSL https://broad.io/install-gcs-connector | python3.7
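# Optional sanity check (a sketch, not part of the original notes): confirm Hail can read from GCS
# with requester-pays billing before going further. The project ID is a placeholder, and the bucket
# path is the public GWAS-significant table mentioned further down.
# python3.7
import hail as hl
hl.init(spark_conf={'spark.hadoop.fs.gs.requester.pays.mode': 'AUTO',
                    'spark.hadoop.fs.gs.requester.pays.project.id': 'my-project-id'})
# If this prints a list of file entries, the connector and billing project are set up correctly
print(hl.hadoop_ls('gs://ukbb-mega-gwas-results-public/round2/ukbb_imputed_v3_gwas_significant.GRCh37.ht'))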
$ git clone https://github.com/atgu/ukbb_pan_ancestry
$ git clone https://github.com/Nealelab/ukb_common
# python3.7 script to test that it's working (run it from the directory containing the cloned repos
# so the import below resolves)
import hail as hl
from ukbb_pan_ancestry import *
hl.init(spark_conf={'spark.hadoop.fs.gs.requester.pays.mode': 'AUTO',
                    'spark.hadoop.fs.gs.requester.pays.project.id': 'my-project-id'})
mt = load_final_sumstats_mt()
mt.describe()
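# Optional follow-up queries (a sketch; 'phenocode' is a guess at one of the column field names in
# the Pan-UKB schema -- check the mt.describe() output before relying on it)
mt.cols().show(5)                               # peek at the first few phenotype (column) records
height = mt.filter_cols(mt.phenocode == '50')   # keep a single phenotype (UKB field 50 = standing height)
print(height.count_cols())                      # should print a small number of remaining columns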
# An alternate way is to submit a job to a cluster with hailctl after installing hail
# Don't know yet how to prevent the error finding the ukbb_pan_ancestry functions
# (shipping the cloned repo directory via hailctl's --pyfiles option at submit time might help, untested)
$ hailctl dataproc start --region us-central1 --requester-pays-allow-all --master-machine-type=n1-standard-2 --worker-machine-type=n1-standard-2 tester
$ gcloud config set dataproc/region us-central1
$ hailctl dataproc submit tester hailtest.py
# Getting error:
# from ukbb_pan_ancestry import *
# ModuleNotFoundError: No module named 'ukbb_pan_ancestry'
$ hailctl dataproc stop tester
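# For reference, hailtest.py was presumably something like the local test script above (a sketch;
# on a cluster started with --requester-pays-allow-all the spark_conf settings shouldn't be needed):
import hail as hl
from ukbb_pan_ancestry import *   # this import is what raises the ModuleNotFoundError above
hl.init()
mt = load_final_sumstats_mt()
mt.describe()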
# This is only needed if running ukb_common/create_gwas_sig_file.py
# Output of that already exists at gs://ukbb-mega-gwas-results-public/round2/ukbb_imputed_v3_gwas_significant.GRCh37.ht
# To install hdbscan, sudo apt-get install python3.7-dev first
# Now can install gnomad
$ python3.7 -m pip install gnomad
# Convert the hail table to a flat table (python3.7)
# Note: Table.export() failed with "Out of space" while merging all the temp files together, but the
# temp files themselves are complete, so just use them directly (about 2 GB total)
import hail as hl
from ukbb_pan_ancestry import *
hl.init(spark_conf={'spark.hadoop.fs.gs.requester.pays.mode': 'AUTO',
                    'spark.hadoop.fs.gs.requester.pays.project.id': 'projectid'},
        tmp_dir='/data/tmpdir', local_tmpdir='/data/tmpdir')
ht = hl.read_table('gs://ukbb-mega-gwas-results-public/round2/ukbb_imputed_v3_gwas_significant.GRCh37.ht')
ht = ht.naive_coalesce(50)  # repartition into 50 fragments, about 1,000,000 rows each
ht.export('/data/export.txt.bgz')
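# A possible workaround for the out-of-space merge (a sketch, untested here): export one compressed
# shard per partition and skip the merge step entirely, then concatenate the shards afterwards
ht.export('/data/export_shards', parallel='header_per_shard')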
# To download the data... not sure if this is the best way
# Make a gsbucket and copy files into it
$ gsutil cp -r output gs://ukbbdefaultdownloadsig
# Install gsutil on a local computer and do
$ gsutil -u projectname -m cp -r gs://ukbbdefaultdownloadsig/output/ .
# Notes on installing gsutil on local computer
$ sudo apt-get update
$ sudo apt-get install apt-transport-https ca-certificates gnupg
$ curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -
$ curl https://dl.google.com/dl/cloudsdk/release/install_google_cloud_sdk.bash | bash
# Notes on installing gsutil/google cloud sdk without sudo privileges
$ wget https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-265.0.0-linux-x86_64.tar.gz
$ tar -zxf google-cloud-sdk-*
$ cd google-cloud-sdk
$ ./install.sh
# Add to .bash_profile and restart the session
$ export PATH=$PATH:/home/asldfkj/google-cloud-sdk/bin