# fxia22/process_ukbb.py

## process_ukbb.py
import os
import numpy as np
import matplotlib.pyplot as plt
import sys
from tqdm import tqdm
import pandas as pd

# Annotation inputs, opened relative to the current working directory.
PHASTCON_FILE = 'chr1phastcon.txt'
CHROMHMM_BED_FILE = 'wgEncodeAwgSegmentationChromhmmGm12878.bed'

# Length of the per-position chromatin-state lookup array; variant positions
# are used directly as indices into it.
CHR1_TRACK_LENGTH = 250000000


def read_chr1_variants(rows):
    """Collect names and p-values of the rows whose text starts with '1:'.

    Parameters
    ----------
    rows : iterable of str
        Tab-separated lines. The first field is kept as the variant name
        and the last field is parsed with ``float`` as the p-value.

    Returns
    -------
    tuple (list of str, list of float)
        Variant names and their p-values, in input order.

    Raises
    ------
    ValueError
        If the last field of a selected row is not a valid float.
    """
    names = []
    pvals = []
    for row in rows:
        if not row.startswith('1:'):
            continue
        fields = row.strip().split('\t')
        names.append(fields[0])
        pvals.append(float(fields[-1]))
    return names, pvals


def parse_chromhmm_bed(bed_rows, chrom='chr1'):
    """Parse whitespace-separated BED rows in a single pass.

    Parameters
    ----------
    bed_rows : iterable of str
        Rows with at least four fields: chromosome, start, end, state label.
    chrom : str
        Chromosome whose intervals are returned.

    Returns
    -------
    tuple (list of str, list of (int, int, str))
        The sorted unique state labels found on *all* rows, and the
        ``(start, end, label)`` intervals of the rows matching ``chrom``.
    """
    labels = set()
    intervals = []
    for row in bed_rows:
        fields = row.split()
        if not fields:
            continue  # tolerate blank lines
        labels.add(fields[3])
        # Filter rather than stop at the first foreign chromosome, so the
        # result does not depend on the file being sorted with `chrom` first.
        if fields[0] == chrom:
            intervals.append((int(fields[1]), int(fields[2]), fields[3]))
    return sorted(labels), intervals


def build_state_track(intervals, state_index, length=CHR1_TRACK_LENGTH):
    """Return a float array of ``length`` holding a state index per position.

    Each ``(start, end, label)`` interval writes ``state_index[label]`` into
    ``track[start:end]``; later intervals overwrite earlier ones.

    NOTE: positions not covered by any interval keep the initial value 0,
    which is also the index of the first label when indices start at 0, so
    the two cases cannot be told apart in the output.
    """
    track = np.zeros(length)
    for start, end, label in intervals:
        track[start:end] = state_index[label]
    return track


def annotate_variants(names, chrom_state, phastcon):
    """Build the ``(n, 2)`` feature matrix for the given variants.

    The position is the integer in the second ':'-separated field of each
    name. Column 0 is ``chrom_state[position]`` and column 1 is
    ``phastcon[position]``.

    NOTE(review): the state track is filled with ``track[start:end]``, i.e.
    0-based half-open indexing. If the variant positions are 1-based, the
    matching index would be ``position - 1`` -- confirm the coordinate
    convention of both inputs before relying on exact base alignment.

    Raises
    ------
    IndexError
        If a position is outside either lookup array.
    """
    positions = np.array([int(n.split(':')[1]) for n in names], dtype=np.int64)
    features = np.zeros((len(names), 2))
    features[:, 0] = chrom_state[positions]
    features[:, 1] = phastcon[positions]
    return features


def output_csv_path(input_path):
    """Return the output path: the input file name cut at its first '.', plus '.csv'.

    Only the file name is split, so dots in the directory part (for example
    './file.tsv' or 'data.v2/file.tsv') no longer truncate the path.
    """
    directory, filename = os.path.split(input_path)
    return os.path.join(directory, filename.split('.')[0] + '.csv')


def format_csv_lines(features, pvalues):
    """Yield the CSV header, then one 'x1, x2, p_value, h' line per variant.

    The ``h`` column is always written as NaN.
    """
    yield 'x1, x2, p_value, h\n'
    for (x1, x2), p_value in zip(features, pvalues):
        yield "{}, {}, {}, {}\n".format(x1, x2, p_value, np.nan)


def main(argv=None):
    """Annotate the chr1 variants of a results file and write them as CSV.

    ``argv[1]`` is the tab-separated results file. The output is written to
    the path given by ``output_csv_path(argv[1])``.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit('usage: {} <ukbb_results_file>'.format(
            argv[0] if argv else 'process_ukbb.py'))
    main_ukbb_file = argv[1]

    with open(main_ukbb_file) as f:
        name, pval = read_chr1_variants(tqdm(f))
    print(len(name))

    # NOTE(review): read_csv treats the first line as a header by default.
    # If the file has no header row, the first value is consumed as the
    # column name and every index shifts by one -- confirm the file layout.
    phastcon = np.array(pd.read_csv(PHASTCON_FILE)).flatten()

    with open(CHROMHMM_BED_FILE) as f:
        state, intervals = parse_chromhmm_bed(tqdm(f))
    print(state)

    state_dict = {label: idx for idx, label in enumerate(state)}
    chrom_state = build_state_track(intervals, state_dict)

    x = annotate_variants(name, chrom_state, phastcon)
    p = np.asarray(pval, dtype=float)

    with open(output_csv_path(main_ukbb_file), 'w') as f:
        f.writelines(format_csv_lines(x, p))


if __name__ == '__main__':
    main()