Skip to content

Instantly share code, notes, and snippets.

@mattsgithub
Last active November 5, 2021 15:43
Show Gist options
  • Save mattsgithub/9f04ead3e51dabd8c17f9dc3355f2905 to your computer and use it in GitHub Desktop.
Save mattsgithub/9f04ead3e51dabd8c17f9dc3355f2905 to your computer and use it in GitHub Desktop.
2D Hole Dataset
import numpy as np
import pandas as pd
def get_disk_holes(r, r_hole, n_hole, allow_center=True):
    """
    Returns the disks ("holes") that can be used to
    cut data from a disk of radius `r`.

    Holes are spaced evenly in angle on a ring of radius `r / 2`.
    If `allow_center` is truthy, one of the `n_hole` holes is placed
    at the origin instead of on the ring.

    Args
        r: float
            Radius of the outer disk
        r_hole: float
            Radius of every hole
        n_hole: int
            Total number of holes (including the centered one, if any)
        allow_center: bool
            If True, one hole is centered at the origin

    Returns
        pandas.DataFrame with one row per hole and the columns
        'r_hole' (hole radius), 'r' (ring radius, `r / 2` on every
        row, including the centered hole), 'theta' (angle on the ring,
        NaN for the centered hole), 'x' and 'y' (hole center).
        The frame is empty when `n_hole < 1`.
    """
    columns = ['r_hole', 'r', 'theta', 'x', 'y']
    if n_hole < 1:
        return pd.DataFrame(columns=columns)

    has_center = int(bool(allow_center))
    n_non_center_holes = n_hole - has_center
    ring_radius = r / 2.

    theta = np.linspace(0, 2 * np.pi, num=n_non_center_holes, endpoint=False)
    x = ring_radius * np.cos(theta)
    y = ring_radius * np.sin(theta)

    # Special case for centered hole: its angle is undefined, so it is
    # stored as NaN. It is placed last, after the ring holes.
    if has_center:
        theta = np.append(theta, np.nan)
        x = np.append(x, 0.)
        y = np.append(y, 0.)

    # 'r_hole' and 'r' are constants; the constructor broadcasts the
    # scalars over all rows. Building the frame in one go avoids
    # DataFrame.append, which was removed in pandas 2.0.
    return pd.DataFrame({'r_hole': r_hole,
                         'r': ring_radius,
                         'theta': theta,
                         'x': x,
                         'y': y},
                        columns=columns)
def get_sample_from_disk(n, r):
    """
    Draws `n` random points from a disk of radius `r`
    centered at the origin.

    Returns a DataFrame with the polar coordinates ('r', 'theta')
    and the cartesian coordinates ('x', 'y') of every point.
    """
    # The radius is scaled by the square root of a uniform draw;
    # without it the points would not be spread with uniform
    # density across the disk
    radial = r * np.sqrt(np.random.random(size=n))
    angle = 2 * np.pi * np.random.random(size=len(radial))
    return pd.DataFrame({
        'r': radial,
        'theta': angle,
        'x': radial * np.cos(angle),
        'y': radial * np.sin(angle),
    })
def get_2d_holes_dataset(n_neg=500,
                         n_pos=500,
                         n_hole=9,
                         r=1.,
                         r_hole=.1,
                         r_pos=.02,
                         allow_center=True):
    """
    Generates a complex topological dataset
    consisting of two manifolds.

    Negative examples are sampled from a disk of radius `r` with
    circular holes cut out of it; positive examples are sampled from
    small disks of radius `r_pos` centered inside those holes.

    Args
        n_neg: int
            How many negative examples to sample
        n_pos: int
            How many positive examples to sample
        n_hole: int
            Number of holes to generate
        r: float
            Radius of entire circle of which all data resides
            This is the disk from which negative examples
            are sampled from
        r_hole: float
            Radius of hole(s)
        r_pos: float
            Radius of disks for positive examples
        allow_center: bool
            If True, allow a hole to be created in the center

    Returns
        pandas.DataFrame with the columns 'r', 'theta', 'x', 'y' and
        'label' (0 = negative, 1 = positive), rows in random order.
        For positive examples 'r' and 'theta' are relative to the
        center of their hole, only 'x' and 'y' are translated.
        The index is not reset, so index values may repeat.

    Raises
        ValueError: if `r_hole` is larger than `r / 6`, if `r_pos` is
            larger than `r_hole`, or if positive examples are
            requested while `n_hole < 1`.
    """
    # First, perform data checks
    # We can't allow for example, r_hole > r
    r_hole_diameter = 2 * r_hole
    max_stacked_holes = 3
    holes_max_width = max_stacked_holes * r_hole_diameter
    if holes_max_width > r:
        raise ValueError('r_hole must be no more than r/6')
    if r_pos > r_hole:
        raise ValueError('r_pos cannot be greater than r_hole')
    # Positive examples live inside holes; without this check the
    # even split below would divide by zero
    if n_pos > 0 and n_hole < 1:
        raise ValueError('n_hole must be at least 1 when n_pos > 0')

    # Oversample. Will delete after
    # Need a smarter approach instead of factor of 5 approach
    # Sampling will be proportional to the number of holes
    # Need to calculate how many points we expect to be
    # removed for each hole
    df_neg = get_sample_from_disk(n=5 * n_neg, r=r)
    df_neg['label'] = 0

    # These are the regions where negative
    # examples are forbidden
    df_holes = get_disk_holes(r, r_hole, n_hole,
                              allow_center=allow_center)

    # Find the rows of df_neg that fall inside any hole:
    # recenter on each hole and compare the squared distance
    # against the squared hole radius
    in_hole = np.zeros(len(df_neg), dtype=bool)
    for hole in df_holes.itertuples(index=False):
        dist_sq = (hole.x - df_neg.x) ** 2 + (hole.y - df_neg.y) ** 2
        in_hole |= (dist_sq <= hole.r_hole ** 2).to_numpy()

    # Drop points contained in holes, then thin the oversample
    # down to the requested size
    df_neg = df_neg[~in_hole]
    df_neg = df_neg.sample(n=n_neg, replace=False)

    # Sample points inside the holes
    # We can't always get an even split: the first `extra`
    # holes receive one additional point
    pos_frames = []
    if n_pos > 0:
        n_pos_per_disk, extra = divmod(n_pos, n_hole)
        for i, hole in enumerate(df_holes.itertuples(index=False)):
            count = n_pos_per_disk + (1 if i < extra else 0)
            if count == 0:
                continue
            df_pos_disk = get_sample_from_disk(count, r=r_pos)
            # Translate disk to center of hole
            df_pos_disk['x'] = df_pos_disk.x + hole.x
            df_pos_disk['y'] = df_pos_disk.y + hole.y
            pos_frames.append(df_pos_disk)

    # Collecting the frames and concatenating once replaces the
    # repeated DataFrame.append calls (removed in pandas 2.0)
    if pos_frames:
        df_pos = pd.concat(pos_frames)
        df_pos['label'] = 1
        df_neg = pd.concat([df_neg, df_pos])

    # Shuffle so that labels are not grouped together
    return df_neg.sample(frac=1.)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment