Skip to content

Instantly share code, notes, and snippets.

@dillondaudert
Last active February 14, 2018 04:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dillondaudert/94785e9cc0318ac69243c6283da3a032 to your computer and use it in GitHub Desktop.
Parse DSSP files into csv files using multiprocessing and pandas
# NOTE: This requires the DSSPData class shown in this gist: https://gist.github.com/jlhg/5181883
import os
import pandas as pd
from multiprocessing import Process, Queue
from DSSPData import DSSPData
# Directory containing the input .dssp files. NOTE(review): must end with a
# path separator, because filenames are appended by plain string concatenation
# in the main block below.
datapath = "/path/to/dssp/data/"
def _parse(filename):
    '''Parse a single .dssp file and return its properties.

    Args:
        filename: path to a .dssp file.

    Returns:
        A 7-tuple of strings:
            seq   - amino acid sequence (one letter per residue)
            ss    - secondary structure labels, with blank (' ') recoded as 'U'
            tco, kappa, alpha, phi, psi - per-residue values joined with commas
        (The original docstring claimed the last five were lists; they are
        comma-joined strings, which is what the csv-writing caller expects.)
    '''
    dssp = DSSPData()
    dssp.parseDSSP(filename)
    seq = ''.join(dssp.aa)
    # The secondary-structure code is the 3rd character of each struct entry.
    # A blank code means "no assigned structure"; recode it as 'U'.
    # Slice to len(dssp.aa) to preserve the original one-label-per-residue
    # pairing (assumes struct and aa are parallel — TODO confirm in DSSPData).
    ss = ''.join('U' if entry[2] == ' ' else entry[2]
                 for entry in dssp.struct[:len(dssp.aa)])
    tco = ','.join(dssp.getTCO())
    kappa = ','.join(dssp.getKAPPA())
    alpha = ','.join(dssp.getALPHA())
    phi = ','.join(dssp.getPHI())
    psi = ','.join(dssp.getPSI())
    return (seq, ss, tco, kappa, alpha, phi, psi)
def parser(worker_queue, done_queue):
    '''Worker loop: pull filenames off worker_queue, parse each DSSP file,
    and push the resulting record tuple onto done_queue.

    A None filename is the shutdown sentinel and ends the loop.
    '''
    # iter(callable, sentinel) keeps calling worker_queue.get() until it
    # returns None, which is exactly the original while/if-None/return loop.
    for filename in iter(worker_queue.get, None):
        done_queue.put(_parse(filename))
if __name__ == '__main__':
    num_workers = 8
    columns = ["seq", "ss", "tco", "kappa", "alpha", "phi", "psi"]

    files = os.listdir(datapath)
    num_files = len(files)
    # Flush results to disk roughly every 10% of the files. max(1, ...)
    # prevents a ZeroDivisionError in the modulo below when there are
    # fewer than 10 files.
    tenth = max(1, num_files // 10)

    worker_queue = Queue()
    done_queue = Queue()

    print("Spawning %d workers..." % (num_workers))
    workers = []
    for _ in range(num_workers):
        p = Process(target=parser, args=(worker_queue, done_queue))
        workers.append(p)
        p.start()

    # Fill the queue with files, then one None sentinel per worker so each
    # worker receives exactly one shutdown signal.
    for filename in files:
        worker_queue.put(datapath + filename)
    for _ in range(num_workers):
        worker_queue.put(None)

    print("Parsing DSSP files...")
    results = []
    file_index = 1
    for _ in range(num_files):
        results.append(done_queue.get())
        # Every `tenth` results, save a numbered csv and start a fresh batch.
        if len(results) % tenth == 0:
            print("%2d%% done, saving to dssp_%d.csv" % (file_index * 10, file_index))
            filename = "dssp_" + str(file_index) + ".csv"
            df = pd.DataFrame.from_records(results, columns=columns)
            df.to_csv(filename, index=False)
            file_index += 1
            results = []

    # Save the remainder, if any (when num_files divides evenly into tenths
    # there is nothing left, and we skip writing an empty csv).
    # Fixed: the original message said "dssp_N.dssp" although a .csv is written.
    if results:
        print("Finished. Saving remaining %d to dssp_%d.csv" % (len(results), file_index))
        filename = "dssp_" + str(file_index) + ".csv"
        df = pd.DataFrame.from_records(results, columns=columns)
        df.to_csv(filename, index=False)

    print("Joining %d workers..." % (num_workers))
    for p in workers:
        p.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment