Spark on Databricks

A Databricks notebook comes with a preconfigured Spark session and a few related globals:

print(dir())             # sc, spark, sql, sqlContext
print(type(spark))       # <class 'pyspark.sql.session.SparkSession'>
print(type(sc))          # <class 'dbruntime.spark_connection.RemoteContext'>
print(type(sql))         # <class 'method'>  (per help(sql): method sql in module pyspark.sql.context)
print(type(sqlContext))  # <class 'pyspark.sql.context.SQLContext'>
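
Because the session already exists, it can be used straight away; a minimal sketch:

df = spark.range(10)     # small DataFrame with a single `id` column
df.show(3)               # print the first 3 rows
print(spark.version)     # the Spark version string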

All functions/attributes available in a Python cell:

dir()

Out[1]: ['In',
 'Out',
 '_',
 '__',
 '___',
 '__builtin__',
 '__builtins__',
 '__name__',
 '_dh',
 '_i',
 '_i1',
 '_ih',
 '_ii',
 '_iii',
 '_oh',
 'dbutils',
 'display',
 'displayHTML',
 'exit',
 'getArgument',
 'get_ipython',
 'quit',
 'sc',
 'spark',
 'sql',
 'sqlContext',
 'table',
 'udf']
%py
for f in dir():
    print(f"{f} {type(eval(f))}")

Output (trimmed to the Databricks/Spark-specific names):

dbutils      <class 'dbruntime.dbutils.DBUtils'>
display      <class 'method'>
displayHTML  <class 'method'>
exit         <class 'IPython.core.autocall.ExitAutocall'>
getArgument  <class 'method'>
get_ipython  <class 'method'>
quit         <class 'IPython.core.autocall.ExitAutocall'>
sc           <class 'dbruntime.spark_connection.RemoteContext'>
spark        <class 'pyspark.sql.session.SparkSession'>
sql          <class 'method'>
sqlContext   <class 'pyspark.sql.context.SQLContext'>
table        <class 'method'>
udf          <class 'function'>
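
Of these, dbutils is the Databricks-specific helper object (filesystem, secrets, widgets) and displayHTML renders raw HTML. A minimal sketch, assuming the standard sample-datasets mount exists in the workspace:

# dbutils.fs is the DBFS filesystem helper; each FileInfo has .path/.name/.size
for info in dbutils.fs.ls("/databricks-datasets")[:5]:
    print(info.path, info.size)

displayHTML("<b>rendered as HTML in the cell output</b>")
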
dir(spark)
Out[10]: ['Builder',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_activeSession',
 '_conf',
 '_convert_from_pandas',
 '_createFromLocal',
 '_createFromLocalTrusted',
 '_createFromRDD',
 '_create_dataframe',
 '_create_from_pandas_with_arrow',
 '_create_rdd_from_local_trusted',
 '_create_shell_session',
 '_get_numpy_record_dtype',
 '_get_pandas_num_slices',
 '_inferSchema',
 '_inferSchemaFromList',
 '_instantiatedSession',
 '_jsc',
 '_jsparkSession',
 '_jvm',
 '_jwrapped',
 '_repr_html_',
 '_sc',
 '_wrap_data_schema',
 '_wrapped',
 '_write_to_trusted_path',
 'builder',
 'catalog',
 'conf',
 'createDataFrame',
 'getActiveSession',
 'newSession',
 'range',
 'read',
 'readStream',
 'sparkContext',
 'sql',
 'stop',
 'streams',
 'table',
 'udf',
 'version']
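
The public names at the end of that listing are the everyday API. A short sketch (the table name in the commented line is hypothetical):

df = spark.range(5)                                   # DataFrame with one `id` column
df2 = spark.createDataFrame([(1, "a"), (2, "b")], ["n", "s"])  # from local data
df3 = spark.sql("select 1 as one")                    # SQL in, DataFrame out
# df4 = spark.table("my_db.my_table")                 # hypothetical table name
print(spark.version)
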
dir(spark.conf)
Out[11]: ['__class__',
'__delattr__',
'__dict__',
'__dir__',
'__doc__',
'__eq__',
'__format__',
'__ge__',
'__getattribute__',
'__gt__',
'__hash__',
'__init__',
'__init_subclass__',
'__le__',
'__lt__',
'__module__',
'__ne__',
'__new__',
'__reduce__',
'__reduce_ex__',
'__repr__',
'__setattr__',
'__sizeof__',
'__str__',
'__subclasshook__',
'__weakref__',
'_checkType',
'_jconf',
'get',
'isModifiable',
'set',
'unset'] 
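
spark.conf (a pyspark.sql.conf.RuntimeConfig) is the supported way to read and change session-level settings. A quick sketch using a real, modifiable Spark SQL key:

spark.conf.set("spark.sql.shuffle.partitions", "64")
print(spark.conf.get("spark.sql.shuffle.partitions"))           # '64'
print(spark.conf.isModifiable("spark.sql.shuffle.partitions"))  # True
spark.conf.unset("spark.sql.shuffle.partitions")                # revert to default
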
dir(spark.conf._jconf)   # the underlying JVM RuntimeConfig object, accessed via Py4J
Out[14]: ['$anonfun$requireNonStaticConf$1',
 '$anonfun$unset$1',
 'contains',
 'equals',
 'get',
 'getAll',
 'getClass',
 'getOption',
 'hashCode',
 'isModifiable',
 'notify',
 'notifyAll',
 'set',
 'sqlConf',
 'sqlConf_$eq',
 'toString',
 'unset',
 'wait']

Several ways to inspect the full configuration:

help(spark.conf._jconf.getAll)           # docs for the JVM-side getAll method
sc._conf.getAll()                        # SparkConf key/value pairs
spark.sparkContext.getConf().getAll()    # same pairs, via the session's context
for item in sorted(sc._conf.getAll()):
    print(item)

Read data directly (SQL over files, no table registration required)

select * from parquet.`s3://....`
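
The same query from a Python cell (the bucket and path here are hypothetical):

# parquet.`...` queries the files in place; no table needs to exist first
df = spark.sql("select * from parquet.`s3://my-bucket/some/path/`")  # hypothetical path
df.printSchema()
display(df.limit(10))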