karpanGit / indigo, R-group decomposition with user specified query.py
Created March 30, 2024 08:34
indigo, R-group decomposition with user specified query
from indigo import Indigo
indigo = Indigo()
# the query may be passed by the user directly
# prepare query scaffold (e.g. '(R1)C1CC(R3)CCC1(R2)')
# scaffold = indigo.loadQueryMoleculeFromFile(r"D:/tmp/query_mol.mol")
scaffold = indigo.loadQueryMolecule('C1%91CCC%92CC%931.[*:1]%91.[*:2]%93.[*:3]%92 |$;;;;;;_R1;_R2;_R3$|')
# init decomposition
deco = indigo.createDecomposer(scaffold)
# load molecule
# if Br were an H it would not match, even with implicit hydrogens
# hence the need to repeat the search with multiple queries with R-groups removed
mol = indigo.loadMolecule('NC1CC(Br)CCC1(O)')
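# --- a possible continuation (not part of the gist preview): a minimal sketch of
# --- iterating the decomposition, assuming the standard Indigo decomposer API
item = deco.decomposeMolecule(mol)
decomposed = item.decomposedMoleculeWithRGroups()
for rgroup in decomposed.iterateRGroups():
    for fragment in rgroup.iterateRGroupFragments():
        print('R%d -> %s' % (rgroup.index(), fragment.smiles()))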
karpanGit / indigo, substructure search with tautomers.py
Last active March 29, 2024 21:00
indigo, substructure search with tautomers
##### method 1, builtin highlighting
# experiment with substructure matching (with tautomers, highlight the scaffold as it appears in the target)
from indigo import Indigo
from indigo.renderer import IndigoRenderer
indigo = Indigo()
renderer = IndigoRenderer(indigo)
indigo.setOption("render-output-format", "png")
smiles1 = 'CCC(O)=CCCCC'
mol1 = indigo.loadMolecule(smiles1)
smiles2 = 'CC(=O)CC'
mol2 = indigo.loadQueryMolecule(smiles2)
flag = 'TAU'  # other matcher modes: 'RES', 'TAU INCHI', 'TAU RSMARTS'
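# --- a possible continuation (not part of the gist preview): a minimal sketch of
# --- matching with the chosen tautomer mode; the output path is illustrative
matcher = indigo.substructureMatcher(mol1, flag)
match = matcher.match(mol2)
if match is not None:
    # render the target with the matched substructure highlighted
    renderer.renderToFile(match.highlightedTarget(), r'D:/tmp/match.png')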
karpanGit / embeddings through sentencepiece or with PyTorch directly.py
Created November 20, 2023 06:44
embeddings through sentencepiece or with PyTorch directly
# for more details see
# https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1
# compute embeddings with sentence-transformers
from sentence_transformers import SentenceTransformer, util
docs = ["Around 9 Million people live in London", "This is nice"]
# Load the model
model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
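# --- a possible continuation (not part of the gist preview): scoring a query
# --- against the docs, following the model card linked above; the query string is illustrative
query_emb = model.encode('How many people live in London?')
doc_emb = model.encode(docs)
scores = util.dot_score(query_emb, doc_emb)[0].tolist()
for doc, score in sorted(zip(docs, scores), key=lambda pair: pair[1], reverse=True):
    print(round(score, 3), doc)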
karpanGit / pyspark, apply mapping.py
Last active February 7, 2023 06:00
pyspark, apply mapping
# map values in pyspark
import pyspark.sql.functions as F
from itertools import chain
data = [['a', 1], ['b', 2], ['a', 3], ['d', 4]]
data = spark.createDataFrame(data, schema=['name', 'val'])
data.show()
# create mapping column
mapping = {'a': 'hello a', 'b': 'hello b', 'c': 'hello c'}
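# --- a possible continuation (not part of the gist preview): the usual
# --- create_map + chain idiom for applying the dict as a mapping column
mapping_expr = F.create_map([F.lit(x) for x in chain(*mapping.items())])
res = data.withColumn('mapped', mapping_expr[F.col('name')])
res.show()
# names absent from the mapping (here 'd') come back as null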
karpanGit / pyspark, local vs global views.py
Created May 3, 2022 19:18
pyspark, local vs global views
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName('learn')
    # .config('spark.sql.shuffle.partitions', 10)
    # .config('spark.default.parallelism', 10)
    # .config('spark.executor.memory', '1g')
    # .config('spark.driver.memory', '1g')
    # .config('spark.executor.instances', 1)
    # .config('spark.executor.cores', 2)
    .getOrCreate()
)
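# --- a possible continuation (not part of the gist preview): a minimal sketch of
# --- session-scoped vs global temporary views
df = spark.createDataFrame([(1, 'a'), (2, 'b')], schema=['id', 'name'])
df.createOrReplaceTempView('people')            # visible only to this SparkSession
df.createOrReplaceGlobalTempView('people_glb')  # visible to every session of the application
spark.sql('SELECT * FROM people').show()
spark.sql('SELECT * FROM global_temp.people_glb').show()
# a new session still sees the global view, but not the local one
spark.newSession().sql('SELECT * FROM global_temp.people_glb').show()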
karpanGit / pyspark, generate dataframe from dictionary with and without a schema.py
Created May 1, 2022 15:27
pyspark, generate dataframe from dictionary with and without a schema
# create dataframe from dictionary, without a schema
df = [{'one': 1, 'two': [1,2,3]}, {'one': 101}]
df = spark.createDataFrame(df)
df.printSchema()
# root
# |-- one: long (nullable = true)
# |-- two: array (nullable = true)
# | |-- element: long (containsNull = true)
df.show()
# |one| two|
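# --- a possible continuation (not part of the gist preview): the "with a schema"
# --- half of the title; field names/types follow the inferred schema printed above
import pyspark.sql.types as T
schema = T.StructType([
    T.StructField('one', T.LongType()),
    T.StructField('two', T.ArrayType(T.LongType())),
])
df2 = spark.createDataFrame([{'one': 1, 'two': [1, 2, 3]}, {'one': 101}], schema=schema)
df2.printSchema()
df2.show()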
karpanGit / pyspark, create struct from columns.py
Created May 1, 2022 14:20
pyspark, create struct from columns
# simple example, create struct
import pyspark.sql.functions as F
df = [[1, 'mplah', 'gogo'], [2, 'mplah2', 'gogo2'], [3, 'mplah3', 'gogo3']]
df = spark.createDataFrame(df, schema=['x', 'y', 'z'])
res = df.select(F.col('x'), F.struct(F.col('x').alias('_x'), F.col('y').alias('_y')).alias('_xy'))
res.show()
# | x| _xy|
# +---+-----------+
# | 1| {1, mplah}|
# | 2|{2, mplah2}|
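# --- a possible continuation (not part of the gist preview): pulling the fields
# --- back out of the struct, one at a time or all at once
res.select(F.col('_xy._x').alias('x_again'), F.col('_xy._y').alias('y_again')).show()
res.select('x', '_xy.*').show()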
karpanGit / pyspark, explode array, explode list, collect_list, collect_set.py
Created May 1, 2022 14:17
pyspark, explode array, explode list, collect_list, collect_set
# simple example: explode array
import pyspark.sql.functions as F
df = [[1, 'mplah', ['a', 'b', 'c']], [2, 'mplah2', ['a2', 'b2', 'c2']], [3, 'mplah3', ['a3', 'b3', 'c3', 'd3']]]
df = spark.createDataFrame(df, schema=['x', 'y', 'z'])
df.printSchema()
res = df.select(F.col('x'), F.explode(F.col('z')).alias('z'))
res.show(truncate=False)
res = df.select(F.col('x'), F.posexplode(F.col('z')).alias('z_id', 'z'))
res.show(truncate=False)
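# --- a possible continuation (not part of the gist preview): collect_list /
# --- collect_set as the rough inverse of explode (element order is not guaranteed)
collected = res.groupBy('x').agg(F.collect_list('z').alias('z_list'),
                                 F.collect_set('z').alias('z_set'))
collected.show(truncate=False)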
karpanGit / pyspark, from pyspark to json and back.py
Created May 1, 2022 09:38
pyspark, from pyspark to json and back
# from pyspark schema to json and the other way round
import pyspark.sql.types as T
from pprint import pprint
# -- create simple dataframe and schema
df = [[1, 'mplah', ['Panos', 'George'], {'a': 'b', 'c': 'd'}, ('mplip1', 'mplip1_')], [2, 'mplah2', ['Panos2', 'George2'], {'a2': 'b2', 'c2': 'd2'}, ('mplip2', 'mplip2_')] ]
schema = T.StructType([
    T.StructField('x1', T.LongType()),
    T.StructField('x2', T.StringType()),
    T.StructField('x3', T.ArrayType(T.StringType())),
    T.StructField('x4', T.MapType(T.StringType(), T.StringType())),
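    # --- a possible continuation (not part of the gist preview): the last field and
    # --- its inner field names are assumptions made to close the truncated schema
    T.StructField('x5', T.StructType([T.StructField('a', T.StringType()),
                                      T.StructField('b', T.StringType())])),
])
df = spark.createDataFrame(df, schema=schema)
# schema -> json and back again
schema_json = df.schema.jsonValue()
pprint(schema_json)
schema_back = T.StructType.fromJson(schema_json)
print(schema_back == df.schema)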
karpanGit / pyspark, read csv by specifying date and datetime format.py
Created May 1, 2022 08:39
pyspark, read csv by specifying date and datetime format
# note the FAILFAST mode; it is much preferred to PERMISSIVE because it surfaces parsing errors early
# read dates and datetimes using the default ISO format
# date: yyyy-MM-dd
# datetime: yyyy-MM-ddTHH:mm:ss.SSS
import pyspark.sql.types as T
with open(r'D:/junk/tmp.csv', 'wt') as f:
f.write('1\t2022-10-03\t2022-10-03T06:02:01.657\n')
f.write('1\t2022-10-13\t2021-10-03T06:32:01.001')
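# --- a possible continuation (not part of the gist preview): reading the file back
# --- with FAILFAST and explicit formats; the schema/column names are assumptions
schema = T.StructType([
    T.StructField('id', T.LongType()),
    T.StructField('d', T.DateType()),
    T.StructField('ts', T.TimestampType()),
])
df = (
    spark.read
    .schema(schema)
    .option('sep', '\t')
    .option('mode', 'FAILFAST')
    .option('dateFormat', 'yyyy-MM-dd')
    .option('timestampFormat', "yyyy-MM-dd'T'HH:mm:ss.SSS")
    .csv(r'D:/junk/tmp.csv')
)
df.show(truncate=False)
df.printSchema()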