Skip to content

Instantly share code, notes, and snippets.

@eddyxu
eddyxu / hd-vila-ray.py
Last active April 12, 2024 19:19
hd-vila-ray
#!/usr/bin/env python3
#
# Generate HD-Vila-100M dataset
#
# https://github.com/microsoft/XPretrain/tree/main/hd-vila-100m
import argparse
import datetime
import logging
-- Select rows of `hardhat` where the `hardhat` model's predicted labels
-- contain both 'helmet' and 'head' and the `coco_det` model's predicted
-- labels contain 'truck'. Both models are applied to the `image` column.
SELECT * FROM (
  SELECT
    *,
    ML_PREDICT(coco_det, image) AS det,
    ML_PREDICT(hardhat, image) AS hat
  FROM hardhat
)
-- Fixed the misspelled label 'helmat'.
-- NOTE(review): assumes the model emits the label 'helmet' -- confirm against
-- the label set the hardhat model was trained with.
WHERE array_contains(hat.label, 'helmet')
  AND array_contains(hat.label, 'head')
  AND array_contains(det.label, 'truck')
-- Define the model named `hardhat`: flavor `pytorch`, model type `ssd`,
-- with the artifact located at the S3 URI given in the USING clause.
CREATE MODEL hardhat -- Load from model registry
FLAVOR pytorch
MODEL_TYPE ssd
USING 's3://bucket/to/hardhat.pth';
-- Define the model named `coco_det`: flavor `pytorch`, model type
-- `fasterrcnn`. No USING clause is given, so no artifact URI is specified.
CREATE MODEL coco_det -- Load pretrained models.
FLAVOR pytorch
MODEL_TYPE fasterrcnn;
@eddyxu
eddyxu / hardhat-train.py
Last active May 5, 2022 18:55
Train on hardhat dataset
from rikai.pytorch.data import Dataset
def train(
name: str,
uri: Dataset,
model_type: str,
epochs: int = 150,
batch_size: int = 4,
num_workers: int = 4,
lr: float = 0.02,
@eddyxu
eddyxu / hardhat-rikai.py
Last active May 5, 2022 18:46
Prepare Hardhat dataset into Rikai
import xml.etree.ElementTree as ET
from pyspark.sql import Row
from rikai.types import Box2d, Image
all_images = {p.name: p for p in basedir.glob("**/*.jpg")}
images = []
for split in ["Train", "Test"]:
for voc_file in basedir.glob(f"{split}/**/*.xml"):
root = ET.parse(voc_file).getroot()
annotations = [Row(
-- Run the `ssd` model over `raw_data`, explode its predictions into one row
-- per detection, and keep detections that:
--   * score below 0.6,
--   * are labelled 'chair', 'remote' or 'cell phone' (via coco_name), and
--   * have a box area strictly between 100 and 10000.
SELECT
  image_id,
  image,
  detection.*
FROM (
  SELECT
    image_id,
    image,
    explode(ML_PREDICT(ssd, image)) AS detection
  FROM raw_data
)
WHERE detection.score < 0.6
  AND coco_name(detection.label_id) IN ('chair', 'remote', 'cell phone')
  AND area(detection.box) > 100
  AND area(detection.box) < 10000
-- Histogram of detection box areas in `least_margin`, bucketed in steps of
-- 500. Only rows whose first two entries of `detection.label_ids` both map
-- (via coco_name) to 'chair' or 'person' are counted.
SELECT
  box_area,
  count(*) AS cnt
FROM (
  SELECT
    -- Integer-truncate area / 500, then scale back up to get the bucket.
    CAST(area(detection.box) / 500 AS int) * 500 AS box_area
  FROM least_margin
  WHERE coco_name(detection.label_ids[0]) IN ('chair', 'person')
    AND coco_name(detection.label_ids[1]) IN ('chair', 'person')
)
GROUP BY box_area
ORDER BY box_area
-- Count `ssd` detections over `coco` per label, restricted to six label
-- names, with the most frequent label first.
SELECT
  coco_name(detection.label_id) AS label,
  count(detection.label_id) AS cnt
FROM (
  -- One output row per detection predicted for each image.
  SELECT explode(ML_PREDICT(ssd, image)) AS detection
  FROM coco
)
WHERE coco_name(detection.label_id) IN (
  'person', 'chair', 'remote', 'cell phone', 'sheep', 'cow'
)
GROUP BY detection.label_id
ORDER BY cnt DESC
WITH least_margin AS (
SELECT
image_id, image, detection,
detection.scores[0] - detection.scores[1] as margin FROM (
SELECT
image_id,
image,
explode(ML_PREDICT(class_scores, image)) AS detection
FROM coco
) ORDER BY margin LIMIT 1000
SELECT
image_id,
image,
detection,
entropy(detection.scores) as entropy
FROM (
SELECT
image_id,
image,
explode(ML_PREDICT(class_scores, image)) AS detection