Skip to content

Instantly share code, notes, and snippets.

@mbohun
Created July 13, 2012 02:20
Show Gist options
  • Save mbohun/3102307 to your computer and use it in GitHub Desktop.
Save mbohun/3102307 to your computer and use it in GitHub Desktop.
Martin Bohun's SMILES tokenizer
// spidermonkey: /usr/local/spider-monkey-1.8.5/bin/js -f ./smiles.js
// rhino: java -jar /usr/local/rhino/js.jar -f ./smiles.js
//
// smiles_tokenize(smiles) -> array of token strings.
//
// Splits a SMILES string into atom symbols and single-character structural
// symbols (branches, bonds, ring-closure digits, charges, chirality marks).
//
// The tokenizer is bracket-aware:
//   * outside [...] only the "organic subset" atoms (B, C, N, O, P, S, F, Cl,
//     Br, I) and the aromatic atoms (b, c, n, o, p, s) are recognised, so
//     "Sc1ccccc1" is S,c,1,... and not scandium, and "Cn1cccc1" is C,n,1,...
//     and not copernicium;
//   * inside [...] every element symbol of the periodic table is recognised
//     (longest symbol first), plus the aromatic forms including "se" and "as".
//
// Multi-character constructs other than element symbols are NOT merged:
// "%12" yields "%","1","2" and "@@" yields "@","@".
//
// A character that matches nothing is reported (via print() when the host
// shell provides it, otherwise console.log()) and skipped; tokenizing then
// continues with the next character.
var smiles_tokenize = function ( ) {
    var PTE = [
            "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na",
            "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti",
            "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge",
            "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo",
            "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te",
            "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm",
            "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf",
            "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb",
            "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U",
            "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No",
            "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn",
            // systematic placeholder names, kept for backward compatibility
            "Uut", "Uuq", "Uup", "Uuh", "Uus", "Uuo",
            // the permanent IUPAC symbols for the same elements (113-118)
            "Nh", "Fl", "Mc", "Lv", "Ts", "Og"
        ],
        // Atoms that may be written without brackets. The two-letter symbols
        // come first so that "Cl" is never split into "C" + an unknown "l".
        organic = [ "Cl", "Br", "B", "C", "N", "O", "P", "S", "F", "I" ],
        aromatic = [ "b", "c", "n", "o", "p", "s" ],
        // "se" and "as" are only legal inside brackets; listed before the
        // one-letter aromatic symbols so that "se" wins over "s".
        bracket_aromatic = [ "se", "as" ].concat(aromatic),
        special = [
            "(", ")", "[", "]", "=", "#", "$", ":", "@", "*", "%",
            "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
            ".", "/", "\\", "+", "-"
        ],
        longest_first = function (a, b) {
            return b.length - a.length;
        },
        bare_table = organic.concat(aromatic, special),
        // slice() so that sorting does not reorder PTE itself
        bracket_table = PTE.slice().sort(longest_first)
            .concat(bracket_aromatic, special),
        // Emit a diagnostic through whatever the host offers; never throw
        // just because the reporting primitive is missing.
        report = function (message) {
            if (typeof print === "function") {
                print(message);
            } else if (typeof console !== "undefined" && console.log) {
                console.log(message);
            }
        },
        // Appends the first symbol of `table` found at `offset` to `tokens`
        // and returns its length, or returns 0 when nothing matches.
        match_symbol = function (smiles, offset, tokens, table) {
            var i, symbol;
            for (i = 0; i < table.length; i++) {
                symbol = table[i];
                if (symbol === smiles.slice(offset, offset + symbol.length)) {
                    tokens.push(symbol);
                    return symbol.length;
                }
            }
            return 0;
        };

    return function (smiles) {
        var tok = [],
            i = 0,
            in_bracket = false,
            match,
            last;
        while (i < smiles.length) {
            match = match_symbol(smiles, i, tok,
                                 in_bracket ? bracket_table : bare_table);
            if (match > 0) {
                // Track bracket state from the token that was just emitted.
                last = tok[tok.length - 1];
                if (last === "[") {
                    in_bracket = true;
                } else if (last === "]") {
                    in_bracket = false;
                }
                i = i + match;
            } else {
                report("smiles_tokenize error - no match[" + i + "]:"
                       + smiles.substr(i, 1));
                i = i + 1;
            }
        }
        return tok;
    };
}( );
// Driver: reads "data.smi" (one SMILES string per line) and prints every line
// together with its comma-joined token list.
//
// The host shells name their file-reading primitive differently: the recorded
// runs at the end of this file show read() under SpiderMonkey and readFile()
// under Rhino. typeof is used for the detection because it is safe on
// undeclared names and does not depend on `this` being the global object.
var has_read = typeof read === "function",
    has_read_file = typeof readFile === "function",
    read_file;
print("read():" + has_read);
print("readFile():" + has_read_file);
if (has_read) {
    read_file = read;
}
// When both exist readFile() takes precedence, as it always has.
if (has_read_file) {
    read_file = readFile;
}
// Fail with an explicit message instead of an opaque ReferenceError when the
// host provides neither primitive.
if (typeof read_file !== "function") {
    throw new Error("no file reading function available: "
                    + "neither read() nor readFile() is defined");
}
// Accept both LF and CRLF line endings so that a stray "\r" is never handed
// to the tokenizer.
var smiles = read_file("data.smi").trim().split(/\r?\n/);
for (var i = 0; i < smiles.length; i++) {
    print("smiles[" + i + "]:" + smiles[i]
          + " " + smiles_tokenize(smiles[i]));
}
//
// bash-3.1$ cat data.smi
// COc1c(OC)c(OC)cc(c1)CCN
// COc(c1)cccc1C#N
// COc1ccc(Br)cc1
// C1CN(C)[C@@H]2CC(=O)CC[C@@]21c3ccc(OC)c(OC)c3
// bash-3.1$ java -jar /usr/local/rhino/js.jar -f ./smiles.js
// Picked up _JAVA_OPTIONS: -Dawt.useSystemAAFontSettings=on
// read():false
// readFile():true
// smiles[0]:COc1c(OC)c(OC)cc(c1)CCN C,O,c,1,c,(,O,C,),c,(,O,C,),c,c,(,c,1,),C,C,N
// smiles[1]:COc(c1)cccc1C#N C,O,c,(,c,1,),c,c,c,c,1,C,#,N
// smiles[2]:COc1ccc(Br)cc1 C,O,c,1,c,c,c,(,Br,),c,c,1
// smiles[3]:C1CN(C)[C@@H]2CC(=O)CC[C@@]21c3ccc(OC)c(OC)c3 C,1,C,N,(,C,),[,C,@,@,H,],2,C,C,(,=,O,),C,C,[,C,@,@,],2,1,c,3,c,c,c,(,O,C,),c,(,O,C,),c,3
//
// bash-3.1$ /usr/local/spider-monkey-1.8.5/bin/js -f ./smiles.js
// read():true
// readFile():false
// smiles[0]:COc1c(OC)c(OC)cc(c1)CCN C,O,c,1,c,(,O,C,),c,(,O,C,),c,c,(,c,1,),C,C,N
// smiles[1]:COc(c1)cccc1C#N C,O,c,(,c,1,),c,c,c,c,1,C,#,N
// smiles[2]:COc1ccc(Br)cc1 C,O,c,1,c,c,c,(,Br,),c,c,1
// smiles[3]:C1CN(C)[C@@H]2CC(=O)CC[C@@]21c3ccc(OC)c(OC)c3 C,1,C,N,(,C,),[,C,@,@,H,],2,C,C,(,=,O,),C,C,[,C,@,@,],2,1,c,3,c,c,c,(,O,C,),c,(,O,C,),c,3
//
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment