Skip to content

Instantly share code, notes, and snippets.

@xhochy
Created November 9, 2018 10:28
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save xhochy/2f0b497e474f4c97031e1e92e9ca7a76 to your computer and use it in GitHub Desktop.
Save xhochy/2f0b497e474f4c97031e1e92e9ca7a76 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import jpype\n",
"import pyarrow.jvm\n",
"import fletcher as fr\n",
"import pandas as pd\n",
"import os\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"tools_jar = os.path.realpath(\"../java/tools/target/arrow-tools-0.11.0-SNAPSHOT-jar-with-dependencies.jar\")\n",
"jdbc_jar = os.path.realpath(\"../java/adapter/jdbc/target/arrow-jdbc-0.11.0-SNAPSHOT.jar\")\n",
"drill_jar = os.path.realpath(\"apache-drill-1.14.0/jars/jdbc-driver/drill-jdbc-all-1.14.0.jar\")\n",
"jpype.startJVM(jpype.getDefaultJVMPath(), \"-Djava.class.path={}:{}:{}\".format(tools_jar, jdbc_jar, drill_jar))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"ra = jpype.JPackage(\"org\").apache.arrow.memory.RootAllocator(sys.maxsize)\n",
"jpype.JPackage(\"java\").lang.Class.forName(\"org.apache.drill.jdbc.Driver\");\n",
"dm = jpype.JPackage(\"java\").sql.DriverManager"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"connection = dm.getConnection('jdbc:drill:drillbit=127.0.0.1')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"query = \"\"\"\n",
" SELECT *\n",
" FROM dfs.`/Users/ukorn/Development/arrow-repos-3/arrow/python/green_tripdata_2018-06.parquet`\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 16.1 s, sys: 845 ms, total: 16.9 s\n",
"Wall time: 12 s\n"
]
}
],
"source": [
"%%time\n",
"batch = jpype.JPackage(\"org\").apache.arrow.adapter.jdbc.JdbcToArrow.sqlToArrow(\n",
" connection,\n",
" query,\n",
" ra\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"record_batch = pyarrow.jvm.record_batch(batch)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"df = fr.pandas_from_arrow(record_batch)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>VendorID</th>\n",
" <th>lpep_pickup_datetime</th>\n",
" <th>lpep_dropoff_datetime</th>\n",
" <th>store_and_fwd_flag</th>\n",
" <th>RatecodeID</th>\n",
" <th>PULocationID</th>\n",
" <th>DOLocationID</th>\n",
" <th>passenger_count</th>\n",
" <th>trip_distance</th>\n",
" <th>fare_amount</th>\n",
" <th>extra</th>\n",
" <th>mta_tax</th>\n",
" <th>tip_amount</th>\n",
" <th>tolls_amount</th>\n",
" <th>ehail_fee</th>\n",
" <th>improvement_surcharge</th>\n",
" <th>total_amount</th>\n",
" <th>payment_type</th>\n",
" <th>trip_type</th>\n",
" <th>__index_level_0__</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>2018-05-31 22:33:55</td>\n",
" <td>2018-05-31 22:36:13</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>66</td>\n",
" <td>33</td>\n",
" <td>5</td>\n",
" <td>0.51</td>\n",
" <td>4.0</td>\n",
" <td>0.5</td>\n",
" <td>0.5</td>\n",
" <td>0.70</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>0.3</td>\n",
" <td>6.00</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2018-05-31 22:40:36</td>\n",
" <td>2018-05-31 22:49:46</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>25</td>\n",
" <td>49</td>\n",
" <td>5</td>\n",
" <td>1.97</td>\n",
" <td>9.0</td>\n",
" <td>0.5</td>\n",
" <td>0.5</td>\n",
" <td>2.06</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>0.3</td>\n",
" <td>12.36</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>2018-05-31 22:57:12</td>\n",
" <td>2018-05-31 23:02:58</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>61</td>\n",
" <td>49</td>\n",
" <td>5</td>\n",
" <td>1.40</td>\n",
" <td>6.5</td>\n",
" <td>0.5</td>\n",
" <td>0.5</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>0.3</td>\n",
" <td>7.80</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>2018-05-31 22:10:13</td>\n",
" <td>2018-05-31 22:16:27</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>49</td>\n",
" <td>97</td>\n",
" <td>1</td>\n",
" <td>1.36</td>\n",
" <td>7.0</td>\n",
" <td>0.5</td>\n",
" <td>0.5</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>0.3</td>\n",
" <td>8.30</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2018-05-31 22:32:08</td>\n",
" <td>2018-05-31 22:52:06</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>75</td>\n",
" <td>127</td>\n",
" <td>1</td>\n",
" <td>7.90</td>\n",
" <td>24.0</td>\n",
" <td>0.5</td>\n",
" <td>0.5</td>\n",
" <td>6.30</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>0.3</td>\n",
" <td>31.60</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" VendorID lpep_pickup_datetime lpep_dropoff_datetime store_and_fwd_flag \\\n",
"0 2 2018-05-31 22:33:55 2018-05-31 22:36:13 False \n",
"1 2 2018-05-31 22:40:36 2018-05-31 22:49:46 False \n",
"2 2 2018-05-31 22:57:12 2018-05-31 23:02:58 False \n",
"3 2 2018-05-31 22:10:13 2018-05-31 22:16:27 False \n",
"4 1 2018-05-31 22:32:08 2018-05-31 22:52:06 False \n",
"\n",
" RatecodeID PULocationID DOLocationID passenger_count trip_distance \\\n",
"0 1 66 33 5 0.51 \n",
"1 1 25 49 5 1.97 \n",
"2 1 61 49 5 1.40 \n",
"3 1 49 97 1 1.36 \n",
"4 1 75 127 1 7.90 \n",
"\n",
" fare_amount extra mta_tax tip_amount tolls_amount ehail_fee \\\n",
"0 4.0 0.5 0.5 0.70 0.0 NaN \n",
"1 9.0 0.5 0.5 2.06 0.0 NaN \n",
"2 6.5 0.5 0.5 0.00 0.0 NaN \n",
"3 7.0 0.5 0.5 0.00 0.0 NaN \n",
"4 24.0 0.5 0.5 6.30 0.0 NaN \n",
"\n",
" improvement_surcharge total_amount payment_type trip_type __index_level_0__ \n",
"0 0.3 6.00 1 1 0 \n",
"1 0.3 12.36 1 1 1 \n",
"2 0.3 7.80 2 1 2 \n",
"3 0.3 8.30 2 1 3 \n",
"4 0.3 31.60 1 1 4 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 739373 entries, 0 to 739372\n",
"Data columns (total 20 columns):\n",
"VendorID 739373 non-null fletcher[int64]\n",
"lpep_pickup_datetime 739373 non-null fletcher[timestamp[ms, tz=UTC]]\n",
"lpep_dropoff_datetime 739373 non-null fletcher[timestamp[ms, tz=UTC]]\n",
"store_and_fwd_flag 739373 non-null fletcher[bool]\n",
"RatecodeID 739373 non-null fletcher[int64]\n",
"PULocationID 739373 non-null fletcher[int64]\n",
"DOLocationID 739373 non-null fletcher[int64]\n",
"passenger_count 739373 non-null fletcher[int64]\n",
"trip_distance 739373 non-null fletcher[double]\n",
"fare_amount 739373 non-null fletcher[double]\n",
"extra 739373 non-null fletcher[double]\n",
"mta_tax 739373 non-null fletcher[double]\n",
"tip_amount 739373 non-null fletcher[double]\n",
"tolls_amount 739373 non-null fletcher[double]\n",
"ehail_fee 0 non-null fletcher[double]\n",
"improvement_surcharge 739373 non-null fletcher[double]\n",
"total_amount 739373 non-null fletcher[double]\n",
"payment_type 739373 non-null fletcher[int64]\n",
"trip_type 739373 non-null fletcher[int64]\n",
"__index_level_0__ 739373 non-null fletcher[int64]\n",
"dtypes: fletcher[bool](1), fletcher[double](9), fletcher[int64](8), fletcher[timestamp[ms, tz=UTC]](2)\n",
"memory usage: 154.6 MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@dclong
Copy link

dclong commented Nov 21, 2020

I followed your example here to use pyarrow.jvm to query a Hive database.
However,
after running the following code

batch = jpype.JPackage("org").apache.arrow.adapter.jdbc.JdbcToArrow.sqlToArrow(
    connection,
    query,
    ra
)

it returns only 1024 row.
Basically, batch.getRowCount() return 1024. The table I queries is a huge which has way more than 1024 rows.
Do you have an idea what might have caused the issue?
Do I have to use customized configuration?

@xhochy
Copy link
Author

xhochy commented Dec 8, 2020

@dclong I'm also seeing this behaviour with the newest Arrow release. Did you open an upstream bug about this already?

@dclong
Copy link

dclong commented Dec 9, 2020

No. I figured out that the default configuration has changed. So, one way to fix the issue is to customize the underlying configuration.

@ortap
Copy link

ortap commented Jul 21, 2021

This looks awesome and I am trying to implement this but I run into errors. Like @dclong , I am trying to connect to hive. I am successfully able to get the "batch" (VectorSchemeRoot object). However, when I try to pass the VectorSchemeRoot through pyarrow.jvm.record_batch. It gives me the following error: expected bytes, java.lang.String found.

Have either of you tried this gist out recently? Would you know what the issue might be?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment