# Collection of scraped code snippets; scraper boilerplate replaced with comment separators.
# --- Indigo: R-group decomposition ---
# The query may also be passed by the user directly.
# Prepare the query scaffold (e.g. '(R1)C1CC(R3)CCC1(R2)').
# scaffold = indigo.loadQueryMoleculeFromFile(r"D:/tmp/query_mol.mol")
scaffold = indigo.loadQueryMolecule('C1%91CCC%92CC%931.[*:1]%91.[*:2]%93.[*:3]%92 |$;;;;;;_R1;_R2;_R3$|')
# Initialise the decomposition with the scaffold.
deco = indigo.createDecomposer(scaffold)
# Load the target molecule.
# NOTE: if Br were H it would not match, even with implicit hydrogen atoms;
# hence the need to repeat with multiple queries with R groups removed.
mol = indigo.loadMolecule('NC1CC(Br)CCC1(O)')
# ----------------------------------------------------------------------
##### Method 1: built-in highlighting.
# Experiment with substructure matching (when tautomers, show the scaffold as in the target).
# NOTE(review): assumes Indigo and IndigoRenderer are imported elsewhere — confirm.
indigo = Indigo()
renderer = IndigoRenderer(indigo)
indigo.setOption("render-output-format", "png")
smiles1 = 'CCC(O)=CCCCC'
mol1 = indigo.loadMolecule(smiles1)
smiles2 = 'CC(=O)CC'
mol2 = indigo.loadQueryMolecule(smiles2)
flag = 'TAU'  # other flags: 'RES', 'TAU', 'TAU INCHI', 'TAU RSMARTS'
# ----------------------------------------------------------------------
# Compute sentence embeddings with sentence-transformers.
# For more details see
# https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1
from sentence_transformers import SentenceTransformer, util

docs = ["Around 9 Million people live in London", "This is nice"]
# Load the pretrained embedding model (downloads on first use).
model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
# ----------------------------------------------------------------------
# Map column values in pyspark using a literal mapping.
import pyspark.sql.functions as F
from itertools import chain

data = [['a', 1], ['b', 2], ['a', 3], ['d', 4]]
# NOTE(review): assumes an existing `spark` session — confirm against caller.
data = spark.createDataFrame(data, schema=['name', 'val'])
data.show()
# Create the mapping column; note there is no entry for 'd'.
mapping = {'a': 'hello a', 'b': 'hello b', 'c': 'hello c'}
# ----------------------------------------------------------------------
# Build (or fetch) the SparkSession; commented configs kept for tuning reference.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('learn')
    # .config('spark.sql.shuffle.partitions', 10)
    # .config('spark.default.parallelism', 10)
    # .config('spark.executor.memory', '1g')
    # .config('spark.driver.memory', '1g')
    # .config('spark.executor.instances', 1)
    # .config('spark.executor.cores', 2)
    .getOrCreate()
)
# ----------------------------------------------------------------------
# Create a dataframe from a list of dicts, without an explicit schema;
# column types are inferred from the data.
df = [{'one': 1, 'two': [1, 2, 3]}, {'one': 101}]
# NOTE(review): assumes an existing `spark` session — confirm against caller.
df = spark.createDataFrame(df)
df.printSchema()
# root
#  |-- one: long (nullable = true)
#  |-- two: array (nullable = true)
#  |    |-- element: long (containsNull = true)
df.show()
# |one| two|
# ----------------------------------------------------------------------
# Simple example: build a struct column from two existing columns.
import pyspark.sql.functions as F

df = [[1, 'mplah', 'gogo'], [2, 'mplah2', 'gogo2'], [3, 'mplah3', 'gogo3']]
# NOTE(review): assumes an existing `spark` session — confirm against caller.
df = spark.createDataFrame(df, schema=['x', 'y', 'z'])
res = df.select(F.col('x'), F.struct(F.col('x').alias('_x'), F.col('y').alias('_y')).alias('_xy'))
res.show()
# |  x|        _xy|
# +---+-----------+
# |  1| {1, mplah}|
# |  2|{2, mplah2}|
# ----------------------------------------------------------------------
# Simple example: explode an array column into one row per element.
import pyspark.sql.functions as F

df = [[1, 'mplah', ['a', 'b', 'c']], [2, 'mplah2', ['a2', 'b2', 'c2']], [3, 'mplah3', ['a3', 'b3', 'c3', 'd3']]]
# NOTE(review): assumes an existing `spark` session — confirm against caller.
df = spark.createDataFrame(df, schema=['x', 'y', 'z'])
df.printSchema()
res = df.select(F.col('x'), F.explode(F.col('z')).alias('z'))
res.show(truncate=False)
# posexplode additionally emits each element's position in the array.
res = df.select(F.col('x'), F.posexplode(F.col('z')).alias('z_id', 'z'))
res.show(truncate=False)
# ----------------------------------------------------------------------
# From a pyspark schema to JSON and back again.
import pyspark.sql.types as T
from pprint import pprint

# -- create a simple dataframe and schema
df = [[1, 'mplah', ['Panos', 'George'], {'a': 'b', 'c': 'd'}, ('mplip1', 'mplip1_')], [2, 'mplah2', ['Panos2', 'George2'], {'a2': 'b2', 'c2': 'd2'}, ('mplip2', 'mplip2_')]]
schema = T.StructType([
    T.StructField('x1', T.LongType()),
    T.StructField('x2', T.StringType()),
    # BUG FIX: the inner type constructors were bare `StringType()` — a NameError
    # under `import pyspark.sql.types as T`; qualify them with `T.`.
    T.StructField('x3', T.ArrayType(T.StringType())),
    T.StructField('x4', T.MapType(T.StringType(), T.StringType())),
    # NOTE(review): snippet was truncated by the scraper; the fifth (tuple)
    # column's struct field is reconstructed here — confirm field names.
    T.StructField('x5', T.StructType([
        T.StructField('y1', T.StringType()),
        T.StructField('y2', T.StringType()),
    ])),
])
# ----------------------------------------------------------------------
# Write a small tab-separated file with a date and a datetime column,
# using the default ISO formats:
#   date:     yyyy-MM-dd
#   datetime: yyyy-MM-ddTHH:mm:ss.SSS
# NOTE: when reading it back, prefer FAILFAST mode over PERMISSIVE to catch
# parse errors early.
import pyspark.sql.types as T

with open(r'D:/junk/tmp.csv', 'wt') as f:
    f.write('1\t2022-10-03\t2022-10-03T06:02:01.657\n')
    f.write('1\t2022-10-13\t2021-10-03T06:32:01.001')
# (end of scraped snippet list)