Skip to content

Instantly share code, notes, and snippets.

@nitaku
Last active August 1, 2016 04:16
Show Gist options
  • Save nitaku/1fc26ae44b5865700d22 to your computer and use it in GitHub Desktop.
Save nitaku/1fc26ae44b5865700d22 to your computer and use it in GitHub Desktop.
Arc Diagram III

Like the previous example, but with the option to provide a custom string. The application relies on a server-side computation that uses Oliver Mader's code to compute the essential matching pairs.

The example shows the first 93 digits of PI.

A translucent white stroke has also been introduced to avoid mistaking palindrome repetition of matching subsequences with large subsequences (e.g. 793238 and 383279 on the left side of the visualization).

# Copyright (C) 2013 Oliver Mader <b52@reaktor42.de>
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# -*- coding: utf-8 -*-
import naive_algorithm
from collections import defaultdict
from itertools import product, islice
from suffix_tree import SuffixTree
__all__ = [
'maximal_matching_pairs',
'repetition_regions',
'essential_matching_pairs'
]
def maximal_matching_pairs(tree):
"""
Find all substring pairs fulfilling the properties specified in
definition 1, namely _identical_, _non-overlapping_, _consecutive_ and
_maximal_.
Args:
tree (SuffixTree): A suffix tree, build from the string to be searched.
Returns:
generator. A generator of tuples, each describing one matching pair as
composed of the start of the first and the second substring,
as well as the length of the substring.
"""
s = tree.string
substrings = defaultdict(set)
# get all substring starts repeated at least once
for leaf in tree.leaves:
node = leaf.parent
end = leaf.start
while node is not None and \
(node.pathLabel not in substrings or end - len(node.pathLabel) \
not in substrings[node.pathLabel]) and \
node.edgeLabel != '':
for i in range(len(node.pathLabel)):
substrings[node.pathLabel[:i+1]].add(end - len(node.pathLabel))
end -= len(node.edgeLabel)
node = node.parent
def contained(x, y, l, largers):
for large in largers:
starts = substrings[large]
idx = list(find_all(large, sub))
for i,j in product(idx, idx):
if (x - i) + len(large) <= (y - j) and (x - i) in starts and \
(y - j) in starts:
return True
return False
# apply constraints and yield legit pairs
for sub in sorted(substrings, key=len, reverse=True):
starts = sorted(substrings[sub])
l = len(sub)
cs = [k for k in substrings if len(k) > len(sub) and
k.startswith(sub) and k.endswith(sub)]
for x, y in zip(starts, islice(starts, 1, None)):
# overlapping
if x + l > y:
continue
# non maximal: left and right expansion
if (x > 0 and x + l < y and s[x-1] == s[y-1]) or \
(y + l < len(s) and x + l < y and s[x+l] == s[y+l]):
continue
# not maximal: inner and outer expansion
if contained(x, y, l, cs):
continue
yield x, y, l
def repetition_regions(tree):
"""
Find all repetition regions as specified in definition 2 and the following
further limiting rules:
2.1) _Minimal_: There does not exist any other repetition region R'
containing R, with the fundamental substring P' containing P.
2.2) _Leftmost_: There doesn't exist another repetition region R',
originating from R shifted one character to the left, with equal
length of the region and of the fundamental substring.
Args:
tree (SuffixTree): A suffix tree, build from the string to be searched.
Returns:
list. A list of tuples, each describing one repetition region build
from multiple matching pairs. The tuples contain the start of
the region, the end of the region not inclusive and the length
of the fundamental substring, respectively.
"""
return naive_algorithm.repetition_regions(tree.string)
s = tree.string
substrings = defaultdict(set)
for leaf in tree.leaves:
node = leaf.parent
end = leaf.start
while node is not None and \
(node.pathLabel not in substrings or end - len(node.pathLabel) \
not in substrings[node.pathLabel]) and \
node.edgeLabel != '':
for i in range(len(node.pathLabel)):
substrings[node.pathLabel[:i+1]].add(end - len(node.pathLabel))
end -= len(node.edgeLabel)
node = node.parent
regions = []
for sub in sorted(substrings, key=len):
starts = sorted(substrings[sub])
l = len(sub)
a = 0
while a < len(starts) - 1:
f = starts[a]
e = f + l
b = a + 1
while b < len(starts) and starts[b] - starts[b - 1] == l:
e += l
b += 1
if b - a > 1:
regions.append((f, e, l))
a = b
return regions
def essential_matching_pairs(string):
"""
Find all essential matching pairs as specified in definition 3. These
pairs might be used to build arc diagrams from.
Args:
string (str): The string to be searched.
Returns:
generator. Yields tuples, each describing one matching pair as composed
of the start of the first and the second substring, as well
as the length of the substring.
"""
tree = SuffixTree(string)
regions = repetition_regions(tree)
# definition 3.1 and 3.2
for x,y,l in maximal_matching_pairs(tree):
if not any(x >= r and y + l <= e for r,e,_ in regions) or \
any(int((x - r)/f) == int((y + l - r - 1)/f) for r,_,f in regions):
yield x, y, l
# definition 3.3
for r,e,l in regions:
for x in range(r, e - l, l):
yield x, x + l, l
def children(node):
""" Returns a generator to iterate all direct node children. """
c = node.firstChild
while c is not None:
yield c
c = c.next
def find_all(a_str, sub):
""" Yield all occurences of a substring. """
start = 0
while True:
start = a_str.find(sub, start)
if start == -1: return
yield start
start += 1
# vim: set expandtab shiftwidth=4 softtabstop=4 textwidth=79:
svg = d3.select('svg')
width = svg.node().getBoundingClientRect().width
height = svg.node().getBoundingClientRect().height
# translate the viewBox to have (0,0) at the center of the vis
svg
.attr
viewBox: "#{-width/2} #{-height+60} #{width} #{height}"
width -= 60
redraw = () ->
sequence = d3.select('#sequence_input').node().value
x = d3.scale.ordinal()
.domain([0..sequence.length])
.rangeBands([-width/2,width/2])
d3.json "main.php?sequence=#{sequence}", (matches) ->
svg.selectAll('.character').remove()
svg.selectAll('.arc').remove()
svg.select('.subsequence').remove()
characters = svg.selectAll('.character')
.data(sequence)
characters.enter().append('text')
.text (d) -> d
.attr
class: 'character'
x: (d,i) -> x(i) + x.rangeBand()/2
dy: '1em'
arcs = svg.selectAll('.arc')
.data(matches.filter (m) -> m.length > 1)
enter_arcs = arcs.enter().append('path')
.attr
class: 'arc'
d: (match) ->
start_left = x(match.source)
start_right = x(match.source + match.length)
end_left = x(match.target)
end_right = x(match.target + match.length)
big_radius = (end_right-start_left)/2
small_radius = (end_left-start_right)/2
return "M#{start_left} 0 A#{big_radius} #{big_radius} 0 1 1 #{end_right} 0 L#{end_left} 0 A#{small_radius} #{small_radius} 0 1 0 #{start_right} 0 z"
# subsequence selection
subsequence = svg.append 'text'
.attr
class: 'subsequence'
y: -350
enter_arcs.on 'click', (d1) ->
subsequence
.text d1.subsequence
svg.selectAll('.arc')
.classed 'selected', (d2) -> d1.subsequence is d2.subsequence
.classed 'dimmed', (d2) -> d1.subsequence isnt d2.subsequence
d3.event.stopPropagation()
svg.on 'click', () ->
subsequence
.text ''
svg.selectAll('.arc')
.classed 'selected', false
.classed 'dimmed', false
redraw()
d3.select('#sequence_input').on 'change', () ->
console.log 'change'
redraw()
body, html {
padding: 0;
margin: 0;
height: 100%;
font-family: sans-serif;
font-size: 12px;
}
svg {
width: 100%;
height: 100%;
background: white;
}
.character {
text-anchor: middle;
font-size: 16px;
font-family: Georgia;
}
.arc {
opacity: 0.2;
cursor: pointer;
stroke: white
}
.arc.selected {
fill: steelblue;
opacity: 0.4;
}
.arc.dimmed {
opacity: 0.07;
}
.subsequence {
fill: steelblue;
text-anchor: middle;
font-size: 40px;
font-family: Georgia;
text-transform: uppercase;
}
#input_box {
position: absolute;
top: 6px;
left: 6px;
width: 100%
}
#sequence_input {
width: 50%;
}
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Arc Diagram III</title>
<link type="text/css" href="index.css" rel="stylesheet"/>
<script src="http://d3js.org/d3.v3.min.js"></script>
</head>
<body>
<svg></svg>
<div id="input_box">
<label for="sequence_input">Sequence to be analyzed (press ENTER to redraw):</label>
<input id="sequence_input" type="text" value="314159265358979323846264338327950288419716939937510582097494459230781640628620899862803482534">
</div>
<script src="index.js"></script>
</body>
</html>
// Generated by CoffeeScript 1.10.0
(function() {
var height, redraw, svg, width;
svg = d3.select('svg');
width = svg.node().getBoundingClientRect().width;
height = svg.node().getBoundingClientRect().height;
svg.attr({
viewBox: (-width / 2) + " " + (-height + 60) + " " + width + " " + height
});
width -= 60;
redraw = function() {
var j, ref, results, sequence, x;
sequence = d3.select('#sequence_input').node().value;
x = d3.scale.ordinal().domain((function() {
results = [];
for (var j = 0, ref = sequence.length; 0 <= ref ? j <= ref : j >= ref; 0 <= ref ? j++ : j--){ results.push(j); }
return results;
}).apply(this)).rangeBands([-width / 2, width / 2]);
return d3.json("main.php?sequence=" + sequence, function(matches) {
var arcs, characters, enter_arcs, subsequence;
svg.selectAll('.character').remove();
svg.selectAll('.arc').remove();
svg.select('.subsequence').remove();
characters = svg.selectAll('.character').data(sequence);
characters.enter().append('text').text(function(d) {
return d;
}).attr({
"class": 'character',
x: function(d, i) {
return x(i) + x.rangeBand() / 2;
},
dy: '1em'
});
arcs = svg.selectAll('.arc').data(matches.filter(function(m) {
return m.length > 1;
}));
enter_arcs = arcs.enter().append('path').attr({
"class": 'arc',
d: function(match) {
var big_radius, end_left, end_right, small_radius, start_left, start_right;
start_left = x(match.source);
start_right = x(match.source + match.length);
end_left = x(match.target);
end_right = x(match.target + match.length);
big_radius = (end_right - start_left) / 2;
small_radius = (end_left - start_right) / 2;
return "M" + start_left + " 0 A" + big_radius + " " + big_radius + " 0 1 1 " + end_right + " 0 L" + end_left + " 0 A" + small_radius + " " + small_radius + " 0 1 0 " + start_right + " 0 z";
}
});
subsequence = svg.append('text').attr({
"class": 'subsequence',
y: -350
});
enter_arcs.on('click', function(d1) {
subsequence.text(d1.subsequence);
svg.selectAll('.arc').classed('selected', function(d2) {
return d1.subsequence === d2.subsequence;
}).classed('dimmed', function(d2) {
return d1.subsequence !== d2.subsequence;
});
return d3.event.stopPropagation();
});
return svg.on('click', function() {
subsequence.text('');
return svg.selectAll('.arc').classed('selected', false).classed('dimmed', false);
});
});
};
redraw();
d3.select('#sequence_input').on('change', function() {
console.log('change');
return redraw();
});
}).call(this);
<?php
echo shell_exec('python main.py ' . $_GET['sequence']);
?>
import sys
import algorithm as arc
import json
sequence = sys.argv[1]
matches = arc.essential_matching_pairs(sequence)
result = [{'source': match[0], 'target': match[1], 'length': match[2], 'subsequence': sequence[match[0]:match[0]+match[2]]} for match in matches if match[2] > 1]
print json.dumps(result)
# Copyright (C) 2013 Oliver Mader <b52@reaktor42.de>
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# -*- coding: utf-8 -*-
def maximal_matching_pairs(string):
"""
Find all substring pairs fulfilling the properties specified in
definition 1, namely _identical_, _non-overlapping_, _consecutive_ and
_maximal_.
Args:
string (str): The string to be searched.
Returns:
generator. A generator of tuples, each describing one matching pair as
composed of the start of the first and the second substring,
as well as the length of the substring.
"""
n = len(string)
for x in range(0, n - 1):
for l in range(int((n - x)/2) + 1, 0, -1):
c = string[x:x+l]
y = string.find(c, x + 1)
# not found or not consecutive
if y == -1 or y < x + l:
continue
# not maximal: left or right expansion
if (y + l < n and x + l < y and string[x+l] == string[y+l]) or \
(x - 1 >= 0 and x + l < y and string[x-1] == string[y-1]):
continue
# not maximal: inner expansion
if any(string[x:x+l+i] == string[y-i:y+l]
for i in range(1, (y - x - l)/2 + 1)):
continue
# not maximal: outer expansion
if any(string[x-i:x+l] == string[y:y+l+i]
for i in range(1, max(x, n - y - l) + 1)):
continue
yield x, y, l
def repetition_regions(string):
"""
Find all repetition regions as specified in definition 2 and the following
further limiting rules:
2.1) _Minimal_: There doesn't exist any other repetition region R'
containing R, with an equal or smaller fundamental substring P',
contained in P.
2.2) _Leftmost_: There doesn't exist another repetition region R',
originating from R shifted one character to the left, with equal
length of the region and of the fundamental substring.
Args:
string (str): The string to be searched.
Returns:
list. A list of tuples, each describing one repetition region build
from multiple matching pairs. The tuples contain the start of
the region, the end of the region not inclusive and the length
of the fundamental substring, respectively.
"""
n = len(string)
s = 0
regions = []
while s < n - 1:
for l in range(1, (n - s)/2 + 1):
candidate = string[s:s+l]
end = s + l
while string.find(candidate, end, end + l) == end:
end += l
# not enough pairs
if end == s + l:
continue
# not left most
if (s-1, end-1, len(candidate)) in regions:
continue
# not minimal
if any(r[0] <= s and r[1] >= end and r[2] <= len(candidate)
for r in regions):
continue
regions.append((s, end, len(candidate)))
s += 1
return regions
def essential_matching_pairs(string):
"""
Find all essential matching pairs as specified in definition 3. These
pairs might be used to build arc diagrams from.
Args:
string (str): The string to be searched.
Returns:
generator. Yields tuples, each describing one matching pair as composed
of the start of the first and the second substring, as well
as the length of the substring.
"""
regions = repetition_regions(string)
# definition 3.1 and 3.2
for x,y,l in maximal_matching_pairs(string):
if not any(x >= r and y + l <= e for r,e,_ in regions) or \
any(int((x - r)/f) == int((y + l - r - 1)/f) for r,_,f in regions):
yield (x, y, l)
# definition 3.3
for r,e,l in regions:
for x in range(r, e - l, l):
yield (x, x + l, l)
# vim: set expandtab shiftwidth=4 softtabstop=4 textwidth=79:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment