Created
October 25, 2011 14:13
-
-
Save lsauer/1312860 to your computer and use it in GitHub Desktop.
Regular Expressions for validating SMILES, InChi, InChiKey
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//SMILES, Inchi Regex , by lo sauer - lsauer.com | |
//Here's a PREG version for SMILES validation (JavaScript); it only accepts strings of 7 or more characters (1 leading character + at least 6 more): | |
var x = "OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H]2[C@@H]1c3c(O)c(OC)c(O)cc3C(=O)O2" | |
x.trim().match(/^([^J][a-z0-9@+\-\[\]\(\)\\\/%=#$]{6,})$/ig)[0] | |
>"OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H]2[C@@H]1c3c(O)c(OC)c(O)cc3C(=O)O2" | |
//for the most frequent organic molecules | |
x.trim().match(/^([^J][0-9BCOHNSOPrIFla@+\-\[\]\(\)\\\/%=#$]{6,})$/ig) | |
//generic Perl RegEx: | |
/^([^J][A-Za-z0-9@+\-\[\]\(\)\\\/%=#$]+)$/ | |
//Note: The only letter not appearing on the Periodic Table is the letter "J" | |
//Annotated | |
x.trim().match(/^([^J][0-9BCOHNSOPrIFla@+\-\[\]\(\)\\\/%=#$]{6,})$/ig) | |
//if you need a carbon count: | |
x.toLowerCase().split('').map(function(v,k){return +'c'==v;}) | |
>[false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, | |
false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, | |
false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, | |
false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, | |
false, false, false, false] | |
x.toLowerCase().split('').map(function(v,k){return 'c'==v|0;}) | |
>[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, | |
0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0] | |
// Polyfill: add Array.prototype.sum unless something already defined it.
// sum() returns the numeric total of the array's elements; [] sums to 0.
if(!Array.prototype.hasOwnProperty('sum'))
{
  Array.prototype.sum = function () {
    // Seeding reduce with 0 keeps it from throwing a TypeError on an empty array.
    return this.reduce(function (a, b) { return a + b; }, 0);
  };
}
x.toLowerCase().split('').map(function(v,k){return 'c'==v|0;}).sum() | |
>14 | |
/**
 * Counts how many elements of this array equal `t`.
 * Intended for character arrays, e.g. smiles.toLowerCase().split('').atomCount('c').
 * Comparison is the loose `==`, so 1 and '1' count as equal.
 *
 * @param {*} t - value to look for (typically a one-character atom symbol)
 * @returns {number} number of matching elements; 0 for an empty array
 */
Array.prototype.atomCount = function (t) {
  return this.reduce(function (count, v) {
    // (t == v) | 0 turns the boolean match into 0 or 1.
    return count + (t == v | 0);
  }, 0); // initial value 0: reduce would otherwise throw on an empty array
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// International Chemical Identifier Regex, by lo sauer - lsauer.com | |
// Morphine InchI: | |
var x="InChI=1S/C17H19NO3/c1-18-7-6-17-10-3-5-13(20)16(17)21-15-12(19)4-2-9(14(15)17)8-11(10)18/h2-5,10-11,13,16,19-20H,6-8H2,1H3/t10-,11+,13-,16-,17-/m0/s1" | |
// applying an organic character-subset | |
// we could check for the length property, but in case of 0 matches 'null' is returned -> hence !!.. \ generally equal to Boolean(..) | |
!!x.trim().match(/^((InChI=)?[^J][0-9BCOHNSOPrIFla+\-\(\)\\\/,pqbtmsih]{6,})$/ig) | |
>true | |
//generic: | |
x.trim().match(/^((InChI=)?[^J][0-9a-z+\-\(\)\\\/,]+)$/ig) | |
>true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// International Chemical Identifier KEY Regex, by lo sauer - lsauer.com | |
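// The InChIKey, or hashed InChI, is a fixed-length condensed digital representation (27 characters in the current standard, 14-10-1 plus two hyphens; 25 characters in the early specification quoted below) | |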
// of the InChI, which tries to be unique but is not human-comprehensible. It uses a BASE26 alphabet (Hexavigesimal)! | |
// The last character of an InChIKey is computed from the rest of the InChIKey | |
// The InChIKey specification facilitates web searches for chemical compounds, owing to unique referencing | |
// of compounds with a concise key, which is problematic with the full-length InChI (e.g. GET url limit is 1600 chars) | |
// From the official documents ( http://chemdata.nist.gov/InChI/inchi-hash.pdf ): | |
// "The InChIKey is a character signature based on a hash code of the InChI string. Also, this hash | |
// may serve as a checksum for verifying InChI, for example, after transmission over a network." | |
// InChIKey has four (4) distinct components: a 14-character hash of the basic (Mobile-H) | |
// InChI layer (without /p segment accounting for added or removed protons); a 8-character | |
// hash of the remaining layers; a 1 character is a flag indicating selected features (e.g. | |
// presence of fixed-H layer); a 1 character is a “check” character. The overall length of | |
// InChIKey is fixed at 25 characters, including separator: | |
// AAAAAAAAAAAAAA-BBBBBBBBCD | |
// This is significantly shorter than a typical InChI string (for example, the average length | |
// of InChI string for Pubchem collection is 146 characters). | |
// -------------------------------- | |
// InChIKey layout is as follows: | |
// -------------------------------- | |
// AAAAAAAAAAAAAA | |
// First block (14 letters) | |
// Encodes molecular skeleton (connectivity) | |
// BBBBBBBB | |
// Second block (8 letters) | |
// Encodes proton positions (tautomers), stereochemistry, isotopomers, reconnected layer | |
// C | |
// Flag character | |
// Indicates InChI version, presence of a fixed-H layer, isotopes, and stereochemical | |
// information. | |
// D | |
// Check character, obtained from all symbols except delimiters, i.e. from | |
// AAAAAAAAAAAAAABBBBBBBBC | |
// All symbols except the delimiter (a dash, that is, a minus) are uppercase English letters | |
// representing a “base-26” encoding. | |
// see also:http://en.wikipedia.org/wiki/Hexavigesimal | |
//InChiKey v1.2 length: 14-10-1 | |
//InChIKey v1.2 for morphine is BQJCRHHNABKAKU-KBQPJGBKSA-N | |
var x = 'BQJCRHHNABKAKU-KBQPJGBKSA-N' | |
25===x.length && '-'===x[14] | |
&& !!x.match(/^([0-9A-Z\-]+)$/) | |
>false | |
//enzyme ligand Copper - InchiKey: RYGMFSIKBFXOCR-UHFFFAOYSA-N | |
var x = 'RYGMFSIKBFXOCR-UHFFFAOYSA-N' | |
27===x.length && '-'===x[14] && '-'===x[25] | |
&& !!x.match(/^([0-9A-Z\-]+)$/) | |
>true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Simplified Molecular Input Line Entry Specification (SMILES) Regex annotated, by lo sauer - lsauer.com | |
/^( //starting with | |
[^J] //first character must not be "J" (or "j", because of the /i flag); J is the only letter absent from the periodic table | |
[0-9 // number for cyclic connection quantifiers and charge; e.g. [Co+3] or [Co+++] | |
BCOHNSOPrIFla //"organic subset": B, C, N, O, P, S, F, Cl, Br, I -> do not require [] in SMILES; additionally: Na, Cl, Fl | |
@ // Stereocenter configuration descriptor, usually tetrahedral carbon e.g. L-Ala N[C@@H](C)C(=O)O vs. D-Ala N[C@H](C)C(=O)O | |
% // unique labels: C12 is a carbon holding the two ring-closure labels 1 and 2; C%12 holds the single two-digit label 12 | |
+\- // +- the charge sign | |
\[\] // [] is used to delineate atoms e.g. [Co+++] | |
\(\) // () branching-descriptors of the sparse-tree (rings are broken in SMILES), e.g. CCC(=O)O for propionic acid | |
\\\/ // /\ for configuration around double bonds e.g. F/C=C/F... trans-difluoroethene, | |
= // double bonds e.g. O=C=O (carbon dioxide) | |
# // triple bonds e.g. C#N (hydrogen cyanide) | |
$ // quadruple bonds e.g. [Ga-]$[As+] (gallium arsenide) | |
]{6,} // at least 6 characters from the set after the first character, i.e. total length must be > 6 | |
)$ | |
/ig //ending with, search global, case-insensitive | |
/** | |
* SMARTS | |
*/ | |
// SMARTS...Smiles arbitrary target specification has commas in addition to the SMILES character-set | |
// bonds are defined as: '-' (single), '=' (double), '#' (triple), ':' (aromatic) and '~' (any) | |
// logic operators: | |
// OR operator ',' | |
// AND operator '&' (lower priority ';') | |
// NOT operator '!' | |
// more information: http://en.wikipedia.org/wiki/Smiles_arbitrary_target_specification | |
//REGEX | |
/^([^J][0-9BCOHNSOPrIFla@+\-\[\]\(\)\\\/%=#$,.~&!]{6,})$/ | |
//examples | |
// definitions of hydrogen bond donors and acceptors used to apply Lipinski's Rule of Five: | |
var x="[N,n,O;!H0]" | |
var y="[#7,#8;!H0]" | |
!!x.trim().match(/^([^J][0-9BCOHNSOPrIFla@+\-\[\]\(\)\\\/%=#$,.~;&!]{6,})$/ig) | |
>true | |
y.trim().match(/^([^J][0-9BCOHNSOPrIFla@+\-\[\]\(\)\\\/%=#$,.~;&!]{6,})$/ig) | |
>true | |
// definition of aliphatic amines, likely to protonate at physiological pH: | |
// Three recursive-SMARTS environments, each written as $( ... ): primary, secondary and tertiary amine.
// The third alternative needs its opening "$(" so that the parentheses balance.
var x="[$([NH2][CX4]),$([NH]([CX4])[CX4]),$([NX3]([CX4])([CX4])[CX4])]";
>true |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment