Last active
February 14, 2018 04:07
-
-
Save dillondaudert/94785e9cc0318ac69243c6283da3a032 to your computer and use it in GitHub Desktop.
Parse DSSP files into csv files using multiprocessing and pandas
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NOTE: This requires the DSSPData class shown in this gist: https://gist.github.com/jlhg/5181883 | |
import os | |
import pandas as pd | |
from multiprocessing import Process, Queue | |
from DSSPData import DSSPData | |
datapath = "/path/to/dssp/data/" | |
def _parse(filename):
    '''Parse a single .dssp file and return its per-residue properties.

    Args:
        filename: path to a .dssp file readable by DSSPData.

    Returns a 7-tuple of strings:
        seq   - amino-acid sequence, one character per residue
        ss    - secondary-structure codes, one character per residue;
                blank (unassigned) entries are mapped to 'U'
        tco   - comma-joined TCO values
        kappa - comma-joined KAPPA values
        alpha - comma-joined ALPHA values
        phi   - comma-joined PHI values
        psi   - comma-joined PSI values

    NOTE: all fields are joined into strings (not lists) so the tuple can
    be written directly as one CSV row.
    '''
    dssp = DSSPData()
    dssp.parseDSSP(filename)
    seq = ''.join(dssp.aa)
    # The secondary-structure code is the 3rd character of each struct
    # field; a blank means "no assigned structure", which we label 'U'.
    ss = ''.join('U' if dssp.struct[i][2] == ' ' else dssp.struct[i][2]
                 for i in range(len(dssp.aa)))
    tco = ','.join(dssp.getTCO())
    kappa = ','.join(dssp.getKAPPA())
    alpha = ','.join(dssp.getALPHA())
    phi = ','.join(dssp.getPHI())
    psi = ','.join(dssp.getPSI())
    return (seq, ss, tco, kappa, alpha, phi, psi)
def parser(worker_queue, done_queue):
    '''Worker loop: pull DSSP filenames off worker_queue, parse each one,
    and push the resulting tuple onto done_queue.

    A None filename acts as the shutdown sentinel and ends the loop.
    '''
    # iter(callable, sentinel) keeps calling get() until it yields None.
    for path in iter(worker_queue.get, None):
        done_queue.put(_parse(path))
if __name__ == '__main__':
    num_workers = 8
    COLUMNS = ["seq", "ss", "tco", "kappa", "alpha", "phi", "psi"]

    files = os.listdir(datapath)
    num_files = len(files)
    # Size of each output chunk (~10 output files). max() guards against a
    # ZeroDivisionError in the modulo below when there are fewer than 10
    # input files.
    tenth = max(1, num_files // 10)

    worker_queue = Queue()
    done_queue = Queue()

    print("Spawning %d workers..." % (num_workers))
    workers = []
    for _ in range(num_workers):
        p = Process(target=parser, args=(worker_queue, done_queue))
        workers.append(p)
        p.start()

    # Fill the queue with the files to parse.
    for filename in files:
        worker_queue.put(datapath + filename)
    # One None sentinel per worker signals shutdown (see parser()).
    for _ in range(num_workers):
        worker_queue.put(None)

    print("Parsing DSSP files...")
    results = []
    file_index = 1
    for _ in range(num_files):
        results.append(done_queue.get())
        # Flush every `tenth` results so the data is split across ~10 files.
        if len(results) % tenth == 0:
            print("%2d%% done, saving to dssp_%d.csv" % (file_index * 10, file_index))
            df = pd.DataFrame.from_records(results, columns=COLUMNS)
            df.to_csv("dssp_%d.csv" % file_index, index=False)
            file_index += 1
            results = []

    # Save any remainder (num_files not divisible by the chunk size);
    # skip the write entirely when there is nothing left, instead of
    # emitting an empty CSV.
    if results:
        print("Finished. Saving remaining %d to dssp_%d.csv" % (len(results), file_index))
        df = pd.DataFrame.from_records(results, columns=COLUMNS)
        df.to_csv("dssp_%d.csv" % file_index, index=False)

    print("Joining %d workers..." % (num_workers))
    for p in workers:
        p.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment