nitaku/README.md

## README.md

      
    Raw
  

              README.md
            
          
    This experiment proposes a diagram for displaying the results of a data matching (aka record linkage) problem. In this kind of problem, two different datasets A and B are automatically compared in order to find pairs of records that refer to the same real-world entity. We assume a constrained matching problem, where the matching function is a one-to-one mapping (a bijection between the matched records of A and the matched records of B). The implication is that the number of matches found in A is equal to the number of matches found in B.
The number of elements in A and B is represented by the length of two juxtaposed, "misaligned" bars. A is depicted in brown-orange, while B is depicted in different shades of cyan. The number of matches is proportional to the length of the aligned portion of the bars (represented with more vivid colors). The remaining parts show the unmatched records in A (brown) and B (darker cyan).
The diagram could have been made more theoretically correct by aligning the two bars on the same y coordinate, thus keeping both the bars representing unmatched records to the right. This would have enabled a better evaluation of the quantities encoded in the diagram, making it easier to compare the amount of unmatched records found in the two datasets. However, we feel that our design presents a more intuitive depiction of a matching process, yielding a better metaphor than the theoretical approach, sort of a quantitative version of the classic two-sets Venn diagram.
Terminology about data matching tasks is mainly from Cohen et al. 2002.

  
## index.coffee
# data
# Return a pseudo-random integer n with min <= n < max (max is exclusive).
randint = (min, max) ->
  span = max - min
  min + Math.floor(Math.random() * span)

# Random demo counts, drawn in the order a, b, ab:
#   a  - records of dataset A shown as unmatched
#   b  - records of dataset B shown as unmatched
#   ab - number of matches (shared by both datasets)
[a, b, ab] = [
  randint(1000, 3000)
  randint(2000, 4000)
  randint(3000, 5000)
]

# setup
svg = d3.select('svg')

# Measure the rendered <svg> once and take both dimensions from that rect.
{width, height} = svg.node().getBoundingClientRect()

# translate the viewBox to have (0,0) at the center of the vis
svg.attr('viewBox', "#{-width/2} #{-height/2} #{width} #{height}")

# Horizontal scale: record counts -> x coordinates, leaving a margin on
# both sides of the drawing.
margin = 40
x = d3.scale.linear()
  .domain([0, a+b+ab])
  .range([-width/2 + margin, width/2 - margin])

H = 40  # height of each bar
P = 30  # vertical gap between the upper and the lower bar
D = 6   # distance between a bar and its bracket
T = 10  # length of the bracket ticks

# Dataset A uses hues around base_hue, dataset B hues around the opposite
# side of the hue circle; the matched segments are lighter and shifted by
# 30 degrees towards each other.
base_hue = 45
opposite_hue = base_hue + 180

color_a = d3.hcl(base_hue, 50, 50)
color_ab = d3.hcl(base_hue + 30, 55, 70)
color_b = d3.hcl(opposite_hue, 50, 50)
color_ba = d3.hcl(opposite_hue - 30, 55, 70)

# vis
# The four bar segments, in drawing order: unmatched A and matched A on the
# upper row, matched B and unmatched B on the lower row. `start` and `end`
# are cumulative record counts, converted to pixels through the x scale.
upper_y = -H - P/2
lower_y = P/2

bar_segments = [
  {start: 0,    end: a,      y: upper_y, fill: color_a}
  {start: a,    end: a+ab,   y: upper_y, fill: color_ab}
  {start: a,    end: a+ab,   y: lower_y, fill: color_ba}
  {start: a+ab, end: a+ab+b, y: lower_y, fill: color_b}
]

for segment in bar_segments
  svg.append('rect')
    .attr
      class: 'bar'
      x: x(segment.start)
      y: segment.y
      width: x(segment.end) - x(segment.start)
      height: H
      fill: segment.fill

# labels
# One count per segment, centred horizontally on the segment. No y is set,
# so the text sits on the y=0 line, i.e. in the gap between the two bars.
thousands = d3.format(',')

segment_labels = [
  {text: thousands(a),               center: a/2}
  {text: thousands(ab) + ' matches', center: a + ab/2}
  {text: thousands(b),               center: a + ab + b/2}
]

for item in segment_labels
  svg.append('text')
    .text(item.text)
    .attr
      class: 'label'
      x: x(item.center)
      dy: '0.35em'

# Bracket above the upper bar, spanning all of dataset A (x(0) .. x(a+ab)):
# tick up, horizontal run, tick back down.
a_bracket_left = x(0)
a_bracket_run = x(a+ab) - x(0)
svg.append('path')
  .attr('class', 'axis')
  .attr('d', "M#{a_bracket_left} #{-H-P/2-D} l0 #{-T} l#{a_bracket_run} 0 l0 #{T}")

# Bracket below the lower bar, spanning all of dataset B (x(a) .. x(a+ab+b)):
# tick down, horizontal run, tick back up.
b_bracket_left = x(a)
b_bracket_run = x(a+ab+b) - x(a)
svg.append('path')
  .attr('class', 'axis')
  .attr('d', "M#{b_bracket_left} #{H+P/2+D} l0 #{T} l#{b_bracket_run} 0 l0 #{-T}")

# Append a horizontally centred text label at (cx, cy) and put a white
# 'halo' rect of the label's bounding-box size directly underneath it, so
# that the bracket line drawn at the same height does not show through the
# text. Returns the d3 selection holding the new <text> element.
add_haloed_label = (text, cx, cy) ->
  label = svg.append('text')
    .text(text)
    .attr
      class: 'label'
      x: cx
      y: cy
      dy: '0.35em'

  # The bounding box can only be measured once the text is in the document.
  bbox = label.node().getBBox()

  # Insert the halo right before the label node itself. Anchoring on the
  # node (instead of the '.label:last-child' selector) keeps the halo under
  # the correct label even if more elements get appended to the svg later.
  svg.insert('rect', -> label.node())
    .attr
      class: 'halo'
      x: bbox.x
      y: bbox.y
      width: bbox.width
      height: bbox.height

  label

dataset_total = d3.format(',')

# Total size of dataset A, centred on the bracket above the upper bar.
label_a = add_haloed_label(
  dataset_total(a+ab) + ' (dataset A)'
  x((a+ab)/2)
  -H-P/2-D-T
)

# Total size of dataset B, centred on the bracket below the lower bar.
label_b = add_haloed_label(
  dataset_total(b+ab) + ' (dataset B)'
  x(a+(b+ab)/2)
  +H+P/2+D+T
)

## index.css
/* White canvas behind the whole diagram. */
svg {
  background: white;
}
/* Bar segments: disable anti-aliasing so the rectangle edges stay sharp. */
.bar {
  shape-rendering: crispEdges;
}
/* NOTE(review): neither index.html nor the script assigns the classes
   a, ab, ba or b (the bars get their colour through the fill attribute),
   so the next four rules look unused - confirm before removing them. */
.a {
  fill: teal;
}
.ab {
  fill: blue;
}
.ba {
  fill: red;
}
.b {
  fill: orange;
}

/* Text labels: x is interpreted as the horizontal centre of the text. */
.label {
  text-anchor: middle;
  font-family: sans-serif;
  font-size: 14px;
  fill: #333;
}

/* Bracket paths above and below the bars: thin grey outline, no fill. */
.axis {
  stroke: #CCC;
  stroke-width: 1;
  shape-rendering: crispEdges;
  fill: none;
}

/* White rectangle placed underneath a label by the script; the thick white
   stroke enlarges the painted area beyond the text's bounding box. */
.halo {
  fill: white;
  stroke: white;
  stroke-width: 12px;
  shape-rendering: crispEdges;
}

## index.html
<!DOCTYPE html>
<html lang="en">
	<head>
		<meta charset="utf-8">
		<meta name="description" content="Data matching diagram" />
		<title>Data matching diagram</title>
		<link type="text/css" href="index.css" rel="stylesheet"/>
		<!-- Loaded over HTTPS: browsers block http:// scripts as mixed content
		     when the page itself is served over HTTPS. -->
		<script src="https://d3js.org/d3.v3.min.js"></script>
	</head>
	<body>
		<!-- index.js selects and measures this element as soon as it runs,
		     so the <svg> has to come before the script tag. -->
		<svg height="500" width="960"></svg>
		<script src="index.js"></script>
	</body>
</html>

## index.js
// Data matching diagram: draws dataset A as an upper bar and dataset B as a
// lower bar, shifted so that their overlapping section is the number of
// matches. Requires d3 v3 and an <svg> element already present in the page.
// NOTE(review): index.coffee in this gist contains the same logic - keep the
// two files in sync.
(function() {
  // Pseudo-random integer n with min <= n < max.
  function randint(min, max) {
    return Math.floor(Math.random() * (max - min)) + min;
  }

  // Demo counts: a / b are shown as the unmatched records of A / B,
  // ab as the number of matches.
  var a = randint(1000, 3000);
  var b = randint(2000, 4000);
  var ab = randint(3000, 5000);

  var svg = d3.select('svg');
  var bounds = svg.node().getBoundingClientRect();
  var width = bounds.width;
  var height = bounds.height;

  // Shift the viewBox so that (0,0) is the centre of the drawing.
  svg.attr('viewBox', [-width / 2, -height / 2, width, height].join(' '));

  // Record counts -> x coordinates, with a 40px margin on both sides.
  var x = d3.scale.linear()
    .domain([0, a + b + ab])
    .range([-width / 2 + 40, width / 2 - 40]);

  var H = 40; // height of each bar
  var P = 30; // vertical gap between the upper and the lower bar
  var D = 6;  // distance between a bar and its bracket
  var T = 10; // length of the bracket ticks

  var upperY = -H - P / 2;
  var lowerY = P / 2;

  var baseHue = 45;
  var colorA = d3.hcl(baseHue, 50, 50);
  var colorAB = d3.hcl(baseHue + 30, 55, 70);
  var colorB = d3.hcl(baseHue + 180, 50, 50);
  var colorBA = d3.hcl(baseHue + 180 - 30, 55, 70);

  // Bar segments in drawing order; start/end are cumulative record counts.
  [
    { start: 0, end: a, y: upperY, fill: colorA },
    { start: a, end: a + ab, y: upperY, fill: colorAB },
    { start: a, end: a + ab, y: lowerY, fill: colorBA },
    { start: a + ab, end: a + ab + b, y: lowerY, fill: colorB }
  ].forEach(function(segment) {
    svg.append('rect').attr({
      'class': 'bar',
      x: x(segment.start),
      y: segment.y,
      width: x(segment.end) - x(segment.start),
      height: H,
      fill: segment.fill
    });
  });

  // Per-segment counts; without a y attribute they sit on the y=0 line,
  // i.e. in the gap between the two bars.
  var thousands = d3.format(',');
  [
    { text: thousands(a), center: a / 2 },
    { text: thousands(ab) + ' matches', center: a + ab / 2 },
    { text: thousands(b), center: a + ab + b / 2 }
  ].forEach(function(item) {
    svg.append('text').text(item.text).attr({
      'class': 'label',
      x: x(item.center),
      dy: '0.35em'
    });
  });

  // Bracket above the upper bar (all of dataset A): tick up, run, tick down.
  var upperBracketY = upperY - D;
  svg.append('path').attr({
    'class': 'axis',
    d: ['M' + x(0), upperBracketY, 'l0', -T, 'l' + (x(a + ab) - x(0)), '0', 'l0', T].join(' ')
  });

  // Bracket below the lower bar (all of dataset B): tick down, run, tick up.
  var lowerBracketY = lowerY + H + D;
  svg.append('path').attr({
    'class': 'axis',
    d: ['M' + x(a), lowerBracketY, 'l0', T, 'l' + (x(a + ab + b) - x(a)), '0', 'l0', -T].join(' ')
  });

  // Appends a centred label at (cx, cy) and inserts a white halo rect of the
  // label's bounding-box size right before it, hiding the bracket line
  // behind the text. The label is the svg's last child at that moment, which
  // is what the '.label:last-child' selector relies on.
  function addHaloedLabel(text, cx, cy) {
    var label = svg.append('text').text(text).attr({
      'class': 'label',
      x: cx,
      y: cy,
      dy: '0.35em'
    });
    var box = label.node().getBBox();
    svg.insert('rect', '.label:last-child').attr({
      'class': 'halo',
      x: box.x,
      y: box.y,
      width: box.width,
      height: box.height
    });
    return label;
  }

  // Dataset totals, centred on the horizontal run of each bracket.
  addHaloedLabel(thousands(a + ab) + ' (dataset A)', x((a + ab) / 2), upperBracketY - T);
  addHaloedLabel(thousands(b + ab) + ' (dataset B)', x(a + (b + ab) / 2), lowerBracketY + T);

}).call(this);

## thumbnail.png

      
    Raw
  

              thumbnail.png
	# data
	randint = (min, max) -> Math.floor(Math.random()*(max-min))+min

	a = randint(1000,3000)
	b = randint(2000,4000)
	ab = randint(3000,5000)

	# setup
	svg = d3.select('svg')
	width = svg.node().getBoundingClientRect().width
	height = svg.node().getBoundingClientRect().height

	# translate the viewBox to have (0,0) at the center of the vis
	svg
	.attr
	viewBox: "#{-width/2} #{-height/2} #{width} #{height}"

	x = d3.scale.linear()
	.domain([0, a+b+ab])
	.range([-width/2+40, width/2-40])

	H = 40
	P = 30
	D = 6
	T = 10

	base_hue = 45

	color_a = d3.hcl(base_hue, 50, 50)
	color_ab = d3.hcl(base_hue+30, 55, 70)
	color_b = d3.hcl(base_hue+180, 50, 50)
	color_ba = d3.hcl(base_hue+180-30, 55, 70)

	# vis
	svg.append('rect')
	.attr
	class: 'bar'
	x: x(0)
	y: -H-P/2
	width: x(a)-x(0)
	height: H
	fill: color_a

	svg.append('rect')
	.attr
	class: 'bar'
	x: x(a)
	y: -H-P/2
	width: x(a+ab)-x(a)
	height: H
	fill: color_ab

	svg.append('rect')
	.attr
	class: 'bar'
	x: x(a)
	y: +P/2
	width: x(a+ab)-x(a)
	height: H
	fill: color_ba

	svg.append('rect')
	.attr
	class: 'bar'
	x: x(a+ab)
	y: +P/2
	width: x(a+ab+b)-x(a+ab)
	height: H
	fill: color_b

	# labels
	svg.append('text')
	.text(d3.format(',')(a))
	.attr
	class: 'label'
	x: x(a/2)
	dy: '0.35em'

	svg.append('text')
	.text(d3.format(',')(ab) + ' matches')
	.attr
	class: 'label'
	x: x(a+ab/2)
	dy: '0.35em'

	svg.append('text')
	.text(d3.format(',')(b))
	.attr
	class: 'label'
	x: x(a+ab+b/2)
	dy: '0.35em'

	svg.append('path')
	.attr
	class: 'axis'
	d: "M#{x(0)} #{-H-P/2-D} l0 #{-T} l#{x(a+ab)-x(0)} 0 l0 #{T}"

	svg.append('path')
	.attr
	class: 'axis'
	d: "M#{x(a)} #{+H+P/2+D} l0 #{+T} l#{x(a+ab+b)-x(a)} 0 l0 #{-T}"

	label_a = svg.append('text')
	.text(d3.format(',')(a+ab) + ' (dataset A)')
	.attr
	class: 'label'
	x: x((a+ab)/2)
	y: -H-P/2-D-T
	dy: '0.35em'

	a_bbox = label_a.node().getBBox();
	svg.insert('rect', '.label:last-child')
	.attr
	class: 'halo'
	x: a_bbox.x
	y: a_bbox.y
	width: a_bbox.width
	height: a_bbox.height

	label_b = svg.append('text')
	.text(d3.format(',')(b+ab) + ' (dataset B)')
	.attr
	class: 'label'
	x: x(a+(b+ab)/2)
	y: +H+P/2+D+T
	dy: '0.35em'

	b_bbox = label_b.node().getBBox();
	svg.insert('rect', '.label:last-child')
	.attr
	class: 'halo'
	x: b_bbox.x
	y: b_bbox.y
	width: b_bbox.width
	height: b_bbox.height
	svg {
	background: white;
	}
	.bar {
	shape-rendering: crispEdges;
	}
	.a {
	fill: teal;
	}
	.ab {
	fill: blue;
	}
	.ba {
	fill: red;
	}
	.b {
	fill: orange;
	}

	.label {
	text-anchor: middle;
	font-family: sans-serif;
	font-size: 14px;
	fill: #333;
	}

	.axis {
	stroke: #CCC;
	stroke-width: 1;
	shape-rendering: crispEdges;
	fill: none;
	}

	.halo {
	fill: white;
	stroke: white;
	stroke-width: 12px;
	shape-rendering: crispEdges;
	}
	<!DOCTYPE html>
	<html>
	<head>
	<meta charset="utf-8">
	<meta name="description" content="Data matching diagram" />
	<title>Data matching diagram</title>
	<link type="text/css" href="index.css" rel="stylesheet"/>
	<script src="http://d3js.org/d3.v3.min.js"></script>
	</head>
	<body>
	<svg height="500" width="960"></svg>
	<script src="index.js"></script>
	</body>
	</html>
	(function() {
	var D, H, P, T, a, a_bbox, ab, b, b_bbox, base_hue, color_a, color_ab, color_b, color_ba, height, label_a, label_b, randint, svg, width, x;

	randint = function(min, max) {
	return Math.floor(Math.random() * (max - min)) + min;
	};

	a = randint(1000, 3000);

	b = randint(2000, 4000);

	ab = randint(3000, 5000);

	svg = d3.select('svg');

	width = svg.node().getBoundingClientRect().width;

	height = svg.node().getBoundingClientRect().height;

	svg.attr({
	viewBox: "" + (-width / 2) + " " + (-height / 2) + " " + width + " " + height
	});

	x = d3.scale.linear().domain([0, a + b + ab]).range([-width / 2 + 40, width / 2 - 40]);

	H = 40;

	P = 30;

	D = 6;

	T = 10;

	base_hue = 45;

	color_a = d3.hcl(base_hue, 50, 50);

	color_ab = d3.hcl(base_hue + 30, 55, 70);

	color_b = d3.hcl(base_hue + 180, 50, 50);

	color_ba = d3.hcl(base_hue + 180 - 30, 55, 70);

	svg.append('rect').attr({
	"class": 'bar',
	x: x(0),
	y: -H - P / 2,
	width: x(a) - x(0),
	height: H,
	fill: color_a
	});

	svg.append('rect').attr({
	"class": 'bar',
	x: x(a),
	y: -H - P / 2,
	width: x(a + ab) - x(a),
	height: H,
	fill: color_ab
	});

	svg.append('rect').attr({
	"class": 'bar',
	x: x(a),
	y: +P / 2,
	width: x(a + ab) - x(a),
	height: H,
	fill: color_ba
	});

	svg.append('rect').attr({
	"class": 'bar',
	x: x(a + ab),
	y: +P / 2,
	width: x(a + ab + b) - x(a + ab),
	height: H,
	fill: color_b
	});

	svg.append('text').text(d3.format(',')(a)).attr({
	"class": 'label',
	x: x(a / 2),
	dy: '0.35em'
	});

	svg.append('text').text(d3.format(',')(ab) + ' matches').attr({
	"class": 'label',
	x: x(a + ab / 2),
	dy: '0.35em'
	});

	svg.append('text').text(d3.format(',')(b)).attr({
	"class": 'label',
	x: x(a + ab + b / 2),
	dy: '0.35em'
	});

	svg.append('path').attr({
	"class": 'axis',
	d: "M" + (x(0)) + " " + (-H - P / 2 - D) + " l0 " + (-T) + " l" + (x(a + ab) - x(0)) + " 0 l0 " + T
	});

	svg.append('path').attr({
	"class": 'axis',
	d: "M" + (x(a)) + " " + (+H + P / 2 + D) + " l0 " + (+T) + " l" + (x(a + ab + b) - x(a)) + " 0 l0 " + (-T)
	});

	label_a = svg.append('text').text(d3.format(',')(a + ab) + ' (dataset A)').attr({
	"class": 'label',
	x: x((a + ab) / 2),
	y: -H - P / 2 - D - T,
	dy: '0.35em'
	});

	a_bbox = label_a.node().getBBox();

	svg.insert('rect', '.label:last-child').attr({
	"class": 'halo',
	x: a_bbox.x,
	y: a_bbox.y,
	width: a_bbox.width,
	height: a_bbox.height
	});

	label_b = svg.append('text').text(d3.format(',')(b + ab) + ' (dataset B)').attr({
	"class": 'label',
	x: x(a + (b + ab) / 2),
	y: +H + P / 2 + D + T,
	dy: '0.35em'
	});

	b_bbox = label_b.node().getBBox();

	svg.insert('rect', '.label:last-child').attr({
	"class": 'halo',
	x: b_bbox.x,
	y: b_bbox.y,
	width: b_bbox.width,
	height: b_bbox.height
	});

	}).call(this);