Skip to content

Instantly share code, notes, and snippets.

@PatWalters
Created November 4, 2023 23:12
Show Gist options
  • Save PatWalters/854a73154feaaeadce8b3f33bf1ce121 to your computer and use it in GitHub Desktop.
Save PatWalters/854a73154feaaeadce8b3f33bf1ce121 to your computer and use it in GitHub Desktop.
Parse a larger ChemFP output file
#!/usr/bin/env python
import os
import sys
import pandas as pd
from tqdm.auto import tqdm
def low_similarity_hits(row, cutoff):
    """Return the ``[query, name, sim]`` records of one row that fall below *cutoff*.

    Args:
        row: A sequence laid out as ``count, query, name1, sim1, name2,
            sim2, ...``. The leading count is not used.
        cutoff: Only pairs whose similarity, converted to ``float``, is
            strictly less than this value are kept.

    Returns:
        A list of ``[query, name, sim]`` lists. ``sim`` is passed through
        exactly as it appears in *row* (it is not replaced by its float value).

    Raises:
        ValueError: If a similarity field cannot be converted to ``float``.
    """
    query = row[1]
    fields = row[2:]
    # Names sit at even offsets and similarities at odd offsets. zip() stops
    # at the shorter slice, so a dangling name without a similarity is ignored
    # rather than raising StopIteration out of the loop.
    return [
        [query, name, sim]
        for name, sim in zip(fields[0::2], fields[1::2])
        # NaN compares False against any number, so NaN cells never pass.
        if float(sim) < cutoff
    ]


def filter_chemfp_output(cutoff, infile_name, outfile_name, chunksize=1000):
    """Stream a tab-separated similarity file and write the hits below *cutoff*.

    The input is read in chunks of *chunksize* rows so the whole file never
    has to be held in memory. The first 8 lines of the input are skipped
    (presumably the ChemFP header block -- confirm against the input format).

    Args:
        cutoff: Similarity threshold; pairs strictly below it are written.
        infile_name: Path of the tab-separated input file.
        outfile_name: Path of the CSV file to write, with the columns
            ``query``, ``name`` and ``sim``.
        chunksize: Number of input rows parsed per chunk.
    """
    reader = pd.read_csv(
        infile_name, sep="\t", skiprows=8, header=None, chunksize=chunksize
    )
    for idx, chunk in enumerate(tqdm(reader)):
        result_list = []
        for row in chunk.values:
            result_list.extend(low_similarity_hits(row, cutoff))
        result_df = pd.DataFrame(result_list, columns=["query", "name", "sim"])
        # The first chunk creates the file and writes the header row; every
        # later chunk is appended without repeating the header.
        first_chunk = idx == 0
        result_df.to_csv(
            outfile_name,
            index=False,
            mode="w" if first_chunk else "a",
            header=first_chunk,
        )


def main():
    """Parse ``cutoff infile outfile`` from the command line and run the filter."""
    if len(sys.argv) != 4:
        print(f"usage: {sys.argv[0]} cutoff infile outfile")
        sys.exit(1)
    cutoff = float(sys.argv[1])
    infile_name = sys.argv[2]
    outfile_name = sys.argv[3]
    filter_chemfp_output(cutoff, infile_name, outfile_name)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment