Skip to content

Instantly share code, notes, and snippets.

@gshimansky
Last active October 9, 2020 19:45
Show Gist options
  • Save gshimansky/727d7194c80a7a7b24cdf469f95dd059 to your computer and use it in GitHub Desktop.
Save gshimansky/727d7194c80a7a7b24cdf469f95dd059 to your computer and use it in GitHub Desktop.
Generate data files for h2o benchmark https://github.com/h2oai/db-benchmark
import subprocess
import pandas as pd
# Drive the R data generators from the control file _control/data.csv.
#
# Each row whose "active" field equals "1" is turned into an Rscript command:
# the generator script is selected by the "task" field and is followed by the
# row's nrow, k, na and sort fields. The command is printed, executed with
# stderr merged into stdout, and its output and return code are printed.
columns = ["task", "data", "nrow", "k", "na", "sort", "active"]
# Read every column as a string instead of letting pandas infer types.
dtypes = dict.fromkeys(columns, str)
# NOTE(review): because names= is given without header=, a header line in the
# CSV (if present) is read as an ordinary row; it would be skipped below only
# because its "active" field is not "1" -- confirm against the actual file.
data = pd.read_csv("_control/data.csv", names=columns, dtype=dtypes)

# Task name -> R script that generates the data for that task.
generator_scripts = {
    "groupby": "_data/groupby-datagen.R",
    "join": "_data/join-datagen.R",
}
# Row fields passed to the generator, in command-line order.
argument_fields = ("nrow", "k", "na", "sort")

for _, row in data.iterrows():
    if row["active"] != "1":
        continue

    generator = generator_scripts.get(row["task"])
    if generator is None:
        raise Exception("Unknown task type {}".format(row["task"]))

    command = ["Rscript", generator]
    command += [str(row[field]) for field in argument_fields]
    print(command)

    try:
        child = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        # stderr is redirected into stdout, so only the first element of the
        # communicate() result carries data.
        captured, _unused = child.communicate()
    except OSError:
        print("Failed to start ", command)
        raise
    else:
        print(captured.strip().decode())
        print("Command returned", child.returncode)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment