Skip to content

Instantly share code, notes, and snippets.

@icexelloss
Created August 22, 2023 15:26
Show Gist options
  • Save icexelloss/26ac259d85cdcf3c971c49854ffced7e to your computer and use it in GitHub Desktop.
Scan V2 Ordering
Code:
// Experiment 9: scan a fixed list of Parquet files through the "scan2" node,
// project the "time" column, and print the resulting table so that the order
// in which rows come back can be inspected.
//
// Returns Status::OK() on success. Any failure reported by the Arrow calls
// wrapped in ARROW_ASSIGN_OR_RAISE is returned to the caller unchanged.
Status exp_9() {
  // For local filesystem reading, file:/// may work here as well. You may
  // need to #include <arrow/dataset/file_base.h>
  const std::string uri_string = "ts3://";
  // const std::string uri_string = "gs://";

  // Filesystem and paths.
  ARROW_ASSIGN_OR_RAISE(const std::shared_ptr<arrow::fs::FileSystem> filesystem,
                        arrow::fs::FileSystemFromUri(uri_string, nullptr));

  // Every input file shares the same prefix and file name; only the
  // "date=YYYYMMDD" directory differs, so the paths are built from the list
  // of dates. The list order is the order the files are handed to the
  // dataset factory.
  const std::string path_prefix =
      "mdp-bamboo-equity-model-state-us/US/30/20230526/"
      "MdpBambooEquityStateNode/equity_state/date=";
  const std::string path_suffix = "/data.gzip.parquet";
  const std::vector<std::string> dates = {
      "20230331", "20230403", "20230404", "20230405", "20230406",
      "20230410", "20230411", "20230412", "20230413", "20230414",
      "20230417", "20230418", "20230419", "20230420", "20230421",
      "20230424", "20230425", "20230426", "20230427", "20230428",
  };
  std::vector<std::string> paths;
  paths.reserve(dates.size());
  for (const std::string& date : dates) {
    paths.push_back(path_prefix + date + path_suffix);
  }

  ARROW_ASSIGN_OR_RAISE(
      auto factory, arrow::dataset::FileSystemDatasetFactory::Make(
                        filesystem, paths,
                        std::make_shared<arrow::dataset::ParquetFileFormat>(),
                        arrow::dataset::FileSystemFactoryOptions()));
  // ARROW_ASSIGN_OR_RAISE(auto schema, factory->Inspect());
  ARROW_ASSIGN_OR_RAISE(auto dataset, factory->Finish());
  // std::cout << schema->ToString() << std::endl;

  // Register the dataset exec nodes (including "scan2") before a declaration
  // refers to them by name.
  arrow::dataset::internal::Initialize();

  // Scan every column of the dataset schema.
  arrow::dataset::ScanV2Options scan_v2_options(dataset);
  scan_v2_options.columns =
      arrow::dataset::ScanV2Options::AllColumns(*dataset->schema());
  ac::Declaration scan2{"scan2", std::move(scan_v2_options)};

  // Keep only the "time" column so the ordering of rows is easy to check.
  cp::Expression sel_time = cp::field_ref("time");
  ac::Declaration project{"project",
                          {std::move(scan2)},
                          ac::ProjectNodeOptions({sel_time}, {"time"})};

  // The second argument (use_threads) is false, i.e. threading is disabled
  // for this run.
  ARROW_ASSIGN_OR_RAISE(auto table,
                        ac::DeclarationToTable(std::move(project), false));
  std::cout << table->ToString() << std::endl;
  return Status::OK();
}
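Output (note that the chunks are not in input-file order: rows for 2023-04-05 are printed before rows for 2023-04-03):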
time: timestamp[us, tz=UTC]
----
time:
[
[
2023-04-05 19:30:00.000000,
2023-04-05 19:30:00.000000,
2023-04-05 19:30:00.000000,
2023-04-05 19:30:00.000000,
2023-04-05 19:30:00.000000,
2023-04-05 19:30:00.000000,
2023-04-05 19:30:00.000000,
2023-04-05 19:30:00.000000,
2023-04-05 19:30:00.000000,
2023-04-05 19:30:00.000000,
...
2023-04-05 16:30:00.000000,
2023-04-05 16:30:00.000000,
2023-04-05 16:30:00.000000,
2023-04-05 16:30:00.000000,
2023-04-05 16:30:00.000000,
2023-04-05 16:30:00.000000,
2023-04-05 16:30:00.000000,
2023-04-05 16:30:00.000000,
2023-04-05 16:30:00.000000,
2023-04-05 16:30:00.000000
],
[
2023-04-03 19:30:00.000000,
2023-04-03 19:30:00.000000,
2023-04-03 19:30:00.000000,
2023-04-03 19:30:00.000000,
2023-04-03 19:30:00.000000,
2023-04-03 19:30:00.000000,
2023-04-03 19:30:00.000000,
2023-04-03 19:30:00.000000,
2023-04-03 19:30:00.000000,
2023-04-03 19:30:00.000000,
...
2023-04-03 16:30:00.000000,
2023-04-03 16:30:00.000000,
2023-04-03 16:30:00.000000,
2023-04-03 16:30:00.000000,
2023-04-03 16:30:00.000000,
2023-04-03 16:30:00.000000,
2023-04-03 16:30:00.000000,
2023-04-03 16:30:00.000000,
2023-04-03 16:30:00.000000,
2023-04-03 16:30:00.000000
],
...,
[
2023-04-28 16:30:00.000000,
2023-04-28 16:30:00.000000,
2023-04-28 16:30:00.000000,
2023-04-28 16:30:00.000000,
2023-04-28 16:30:00.000000,
2023-04-28 16:30:00.000000,
2023-04-28 16:30:00.000000,
2023-04-28 16:30:00.000000,
2023-04-28 16:30:00.000000,
2023-04-28 16:30:00.000000,
...
2023-04-28 18:30:00.000000,
2023-04-28 18:30:00.000000,
2023-04-28 18:30:00.000000,
2023-04-28 18:30:00.000000,
2023-04-28 18:30:00.000000,
2023-04-28 18:30:00.000000,
2023-04-28 18:30:00.000000,
2023-04-28 18:30:00.000000,
2023-04-28 18:30:00.000000,
2023-04-28 18:30:00.000000
],
[
2023-04-28 18:30:00.000000,
2023-04-28 18:30:00.000000,
2023-04-28 18:30:00.000000,
2023-04-28 18:30:00.000000,
2023-04-28 18:30:00.000000,
2023-04-28 18:30:00.000000,
2023-04-28 18:30:00.000000,
2023-04-28 18:30:00.000000,
2023-04-28 18:30:00.000000,
2023-04-28 18:30:00.000000,
...
2023-04-28 19:00:00.000000,
2023-04-28 19:00:00.000000,
2023-04-28 19:00:00.000000,
2023-04-28 19:00:00.000000,
2023-04-28 19:00:00.000000,
2023-04-28 19:00:00.000000,
2023-04-28 19:00:00.000000,
2023-04-28 19:00:00.000000,
2023-04-28 19:00:00.000000,
2023-04-28 19:00:00.000000
]
]
OK
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment