Last active
November 17, 2023 09:53
-
-
Save KhepryQuixote/00946f2f7dd5f89324d8 to your computer and use it in GitHub Desktop.
Python 3 script to extract Chemical Abstract Society Registry Numbers (CASRNs) from PubChem's CID Synonym file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
''' | |
Name: PyPubChemXtractor.py | |
Author: Khepry Quixote | |
Date: 11 Nov 2014 | |
Language: Python 3.4 | |
Narrative: | |
This Python 3.4 program will extract CASRN values from PubChem's | |
CID Synonym file, which can be found at the following address: | |
ftp://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Extras/CID-Synonym-filtered.gz | |
As of 11 Nov 2014, this file is approximately 1.2 GB in size. | |
Lists of Chemical Abstract Society (CAS) Registration Numbers (CASRNs) are hard to find, | |
and this list is no exception. Within this list, the CASRNs are NOT explicitly identified as such, | |
possibly because CAS would rather have you go through them for access to the numbers in question. | |
However, CASRNs do seem to follow a particular pattern and as such can be extracted via | |
a regular expression pattern: "^[1-9][0-9]{1,6}\\-[0-9]{2}\\-[0-9]$". This is what this | |
program does, extracting CASRNs from the synonym list via regular expression pattern recognition. | |
The "master" output file contains a row for each CID, CASRN combination. | |
The "summary" output file contains a row for each CID, with all of the CASRNs for that CID number. | |
The "synonym" output file contains a row for each CID and synonym combination. | |
TODO: output these files as tables within one SQLite database file. | |
''' | |
import codecs | |
import csv | |
import os | |
import re | |
import sys | |
import time | |
# source file full path | |
srcFileFullPath = "~/data/fracking/CID-Synonym-filtered.txt" | |
# master file's full path | |
masFileFullPath = "~/data/fracking/CID-CASRN-SynXref-Master.txt" | |
# summary file's full path | |
sumFileFullPath = "~/data/fracking/CID-CASRN-SynXref-Summary.txt" | |
# synonyms file's full path | |
synFileFullPath = "~/data/fracking/CID-CASRN-SynXref-Synonyms.txt"; | |
# maximum number of matches desired | |
# 0 = unlimited number of matches | |
# greater than 0 = specified number of matches allowed | |
maxMatches = 0 | |
maxRows = 0 | |
flushCount = 100000 | |
# source file column separator | |
srcDelim = "\t"; | |
srcFieldNames = ['cid','synonym'] | |
# master file column separator | |
masDelim = "\t"; | |
masFieldNames = ['cid','casrn'] | |
# summary file column separator | |
sumDelim = "\t"; | |
sumCasrnDelim = "|"; | |
sumFieldNames = ['cid','casrns'] | |
# synonyms file column separator | |
synDelim = "\t"; | |
synFieldNames = ['cid','synonym'] | |
# summary value separator (ALT+1 is ☺) | |
sumValSeparator = "|"; | |
# Chemical Abstract Service Registry Number regular expression pattern | |
regExprCasRegNbr = "^[1-9][0-9]{1,6}\\-[0-9]{2}\\-[0-9]$" | |
# ============================================================================= | |
# Main routine: | |
# ============================================================================= | |
def main(): | |
global srcFileFullPath | |
global masFileFullPath | |
global sumFileFullPath | |
global synFileFullPath | |
bgnTime = time.time() | |
prevCID = "" | |
# if necessary | |
if srcFileFullPath.startswith('~'): | |
# expand the source file path with the user's folder | |
srcFileFullPath = os.path.expanduser(srcFileFullPath) | |
# verify that the source file exists | |
if not os.path.exists(srcFileFullPath): | |
sys.stderr.write('srcFileFullPath does NOT exist: %s%s' % (srcFileFullPath, os.linesep)) | |
return | |
# make sure the target folders exist | |
masFileFullPath = os.path.expanduser(masFileFullPath) | |
if not os.path.exists(os.path.dirname(masFileFullPath)): | |
os.makedirs(os.path.dirname(masFileFullPath)) | |
sumFileFullPath = os.path.expanduser(sumFileFullPath) | |
if not os.path.exists(os.path.dirname(sumFileFullPath)): | |
os.makedirs(os.path.dirname(synFileFullPath)) | |
synFileFullPath = os.path.expanduser(synFileFullPath) | |
if not os.path.exists(os.path.dirname(synFileFullPath)): | |
os.makedirs(os.path.dirname(synFileFullPath)) | |
srcFile = codecs.open(srcFileFullPath, 'rb', 'cp1252') | |
masFile = codecs.open(masFileFullPath, 'wb', 'cp1252') | |
sumFile = codecs.open(sumFileFullPath, 'wb', 'cp1252') | |
synFile = codecs.open(synFileFullPath, 'wb', 'cp1252') | |
srcReader = csv.DictReader(srcFile, delimiter=srcDelim, fieldnames=srcFieldNames) | |
masWriter = csv.DictWriter(masFile, delimiter=masDelim, fieldnames=masFieldNames) | |
sumWriter = csv.DictWriter(sumFile, delimiter=sumDelim, fieldnames=sumFieldNames) | |
synWriter = csv.DictWriter(synFile, delimiter=synDelim, fieldnames=synFieldNames) | |
masWriter.writeheader() | |
sumWriter.writeheader() | |
synWriter.writeheader() | |
# compile the CASRN regex pattern | |
# for later use in CASRN matching | |
pattern = re.compile(regExprCasRegNbr) | |
casrns = [] | |
synonyms = [] | |
rows = 0 | |
for rowDict in srcReader: | |
rows += 1 | |
# break on exceeding maximum rows | |
if maxRows > 0 and rows > maxRows: | |
break | |
# if level-break on CID | |
if prevCID != "" and rowDict['cid'] != prevCID: | |
# if CASRNs were found | |
if len(casrns) > 0: | |
# output rows to appropriate files | |
for casrn in casrns: | |
masWriter.writerow({'cid':prevCID, 'casrn':casrn}) | |
sumWriter.writerow({'cid':prevCID, 'casrns':sumCasrnDelim.join(casrns)}) | |
for synonym in synonyms: | |
synWriter.writerow({'cid':prevCID, 'synonym':synonym}) | |
del casrns[:] | |
del synonyms[:] | |
prevCID = rowDict['cid'].strip() | |
synonyms.append(rowDict['synonym'].strip()) | |
# if synonym matches CASRN regex pattern | |
if pattern.match(rowDict['synonym'].strip()): | |
# append it for later file output | |
casrns.append(rowDict['synonym'].strip()) | |
if rows % flushCount == 0: | |
masFile.flush() | |
sumFile.flush() | |
synFile.flush() | |
endTime = time.time() | |
seconds = endTime - bgnTime | |
if seconds > 0: | |
rcdsPerSec = rows / seconds | |
else: | |
rcdsPerSec = 0 | |
print ("CID Synonym Rows: {:,} in {:,.0f} seconds @ {:,.0f} records/second".format(rows, seconds, rcdsPerSec)) | |
# end-of-source-file processing | |
if prevCID != "": | |
# if CASRNs were found | |
if len(casrns) > 0: | |
# output rows to appropriate files | |
for casrn in casrns: | |
masWriter.writerow({'cid':prevCID, 'casrn':casrn}) | |
sumWriter.writerow({'cid':prevCID, 'casrns':sumCasrnDelim.join(casrns)}) | |
for synonym in synonyms: | |
synWriter.writerow({'cid':prevCID, 'synonym':synonym}) | |
del casrns[:] | |
del synonyms[:] | |
# close all files | |
srcFile.close() | |
masFile.close() | |
sumFile.close() | |
synFile.close() | |
print ("-----------------------------------") | |
endTime = time.time() | |
seconds = endTime - bgnTime | |
if seconds > 0: | |
rcdsPerSec = rows / seconds | |
else: | |
rcdsPerSec = 0 | |
print ("CID Synonym Rows: {:,} in {:,.0f} seconds @ {:,.0f} records/second".format(rows, seconds, rcdsPerSec)) | |
return | |
# ============================================================================ | |
# execute the mainline processing routine | |
# ============================================================================ | |
if (__name__ == "__main__"): | |
retval = main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment