Skip to content

Instantly share code, notes, and snippets.

@eddyxu
Last active May 5, 2022 18:46
Show Gist options
  • Save eddyxu/dbaeb75da8fb9dca356f3715f86975ef to your computer and use it in GitHub Desktop.
Save eddyxu/dbaeb75da8fb9dca356f3715f86975ef to your computer and use it in GitHub Desktop.
Prepare the Hardhat dataset in Rikai format
import xml.etree.ElementTree as ET
from pyspark.sql import Row
from rikai.types import Box2d, Image
# Convert the XML annotations found under ``basedir`` into a Rikai dataset.
#
# NOTE(review): ``basedir`` and ``spark`` are not defined in this snippet;
# presumably a ``pathlib.Path`` and an active ``SparkSession`` created
# earlier (e.g. in a notebook cell) -- confirm before running standalone.

# Index every ``*.jpg`` below ``basedir`` by its bare file name so that the
# ``<filename>`` element of an annotation can be resolved to a path.
all_images = {jpg.name: jpg for jpg in basedir.glob("**/*.jpg")}

images = []
for split in ("Train", "Test"):
    for voc_file in basedir.glob(f"{split}/**/*.xml"):
        root = ET.parse(voc_file).getroot()

        # One Row per <object> element: its class label plus bounding box.
        annotations = []
        for obj in root.iter("object"):
            label = obj.find("name").text
            corners = {
                key: int(obj.find(f"./bndbox/{key}").text)
                for key in ("xmin", "ymin", "xmax", "ymax")
            }
            annotations.append(Row(label=label, box=Box2d(**corners)))

        filename = root.find("filename").text
        width = int(root.find("./size/width").text)
        height = int(root.find("./size/height").text)
        depth = int(root.find("./size/depth").text)

        record = {
            "filename": filename,
            "width": width,
            "height": height,
            "depth": depth,
            "annotations": annotations,
            "split": split.lower(),
            # Raises KeyError when the annotation names an image that the
            # glob above did not find.
            # NOTE(review): ``to_embedded()`` presumably inlines the image
            # bytes into the record -- confirm against the Rikai docs.
            "image": Image(all_images[filename]).to_embedded(),
        }
        images.append(record)

# Build the DataFrame from the collected records and write it out in the
# "rikai" format.
dataset = spark.createDataFrame(images)
dataset.write.format("rikai").save("datasets/hardhat")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment