Skip to content

Instantly share code, notes, and snippets.

@alexshpilkin
Last active September 26, 2021 19:46
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alexshpilkin/bf25962064e570d10aca9a8a4b325b78 to your computer and use it in GitHub Desktop.
Save alexshpilkin/bf25962064e570d10aca9a8a4b325b78 to your computer and use it in GitHub Desktop.
Data obfuscation on the Russian Central Election Commission website
#!/usr/bin/env python3
#{ SPDX-License-Identifier: CC0-1.0 }
from collections import namedtuple
from fontTools.ttLib import TTFont
from io import BytesIO
from lxml.html import document_fromstring
from lxml.etree import tostring
from re import finditer, compile as re_compile
from requests import get
from sys import stdin, stdout
# Read the obfuscated page from standard input as Windows-1251 text;
# newline=None turns on universal-newline translation while reading.
stdin.reconfigure(encoding='cp1251', newline=None)
tree = document_fromstring(stdin.read())

# Exactly one element with the "show" class is expected: the one-element
# unpacking raises ValueError for zero or several matches.
container, = tree.xpath('//*[contains(concat(" ", @class, " "), " show ")]')

# Detach the single <style> element from the container, keeping its text.
style_elt, = container.xpath('.//style')
style_elt.drop_tree()
css = str(style_elt.text)

# Likewise for the single <script> element.
script_elt, = container.xpath('.//script')
script_elt.drop_tree()
js = str(script_elt.text)

# Index every element that has a class attribute under each of its classes.
byclass = {}
for node in container.xpath('.//*[@class]'):
    for cls in node.classes:
        if cls not in byclass:
            byclass[cls] = []
        byclass[cls].append(node)
# What the deobfuscator needs to know about one group of declarations:
#   visible  -- False once any hiding declaration has been seen
#   scramble -- True once the substitution font has been applied
#   content  -- the quoted value of a `content:` declaration, if any
Style = namedtuple('Style',
                   'visible scramble content',
                   defaults=(True, False, None))

# Each pattern is matched against a whole, stripped declaration.
HIDE = re_compile(
    r"display: *none"
    r"|(top|left): *-9+px"
    r"|z-index: *-9+"
    r"|(font-size|opacity): *0"
    r"|(width|height): *0(px)?"
    r"|color: *(white|transparent)"
    r"|visibility: *hidden")
CONTENT = re_compile(r"content: *'([^\']*)'")
FONTFAM = re_compile(r'font-family: *"([^\"]*)"( *!important)?')
FONTURL = re_compile(r'src:.* url\("\./([^\"]*\.ttf)"\).*')

# Filled in by parsestyle() as a side effect: the one font family name seen
# so far and the relative URL of its .ttf file.
fontfam = fonturl = None


def parsestyle(decs):
    """Summarise a semicolon-separated list of CSS declarations as a Style.

    Declarations that match none of the patterns above are ignored.

    Side effects: the first font family encountered is stored in the global
    `fontfam`, and every later one is asserted to be equal to it; a font
    source declaration stores its .ttf path in the global `fonturl`, which
    is asserted to be unset beforehand.
    """
    global fontfam, fonturl
    visible, scramble, content = True, False, None
    for dec in map(str.strip, decs.split(';')):
        if HIDE.fullmatch(dec):
            visible = False
            continue
        if found := CONTENT.fullmatch(dec):
            content = found[1]
            continue
        if found := FONTFAM.fullmatch(dec):
            family = found[1]
            if fontfam is None:
                fontfam = family
            assert family == fontfam
            scramble = True
            continue
        if found := FONTURL.fullmatch(dec):
            assert fonturl is None
            fonturl = found[1]
    return Style(visible, scramble, content)
# The second half of a rule selector: ".name" or ".name::after".
SELECTOR = re_compile(r"\.([a-z_]*(::after)?)")

# styles: class name -> Style applied to the element itself
# afters: class name -> Style applied to its ::after pseudo-element
styles, afters = {}, {}

end = 0
for rule in finditer(r' *([-@a-z_.: ]+?) *\{([^}]*)\}', css):
    # Consecutive rules must tile the stylesheet with no gap between them.
    assert rule.start() == end
    end = rule.end()
    sel, decs = rule.groups()
    # Parsed even for @font-face: that is where parsestyle() records the
    # font URL as a side effect.
    style = parsestyle(decs)
    if sel != '@font-face':
        # Every other selector has exactly two parts, ".parent .name", and
        # the parent class must belong to exactly one element.
        parent, sel = sel.split()
        assert parent[0] == '.' and len(byclass.get(parent[1:], ())) == 1
        m = SELECTOR.fullmatch(sel)
        assert m is not None
        name = m[1]
        if name.endswith('::after'):
            target, table = name.removesuffix('::after'), afters
        else:
            target, table = name, styles
        # No class may be styled twice in the same table.
        assert target not in table
        table[target] = style
# Nothing but whitespace may follow the last rule.
assert not css[end:].strip()
# Download the substitution font referenced by the stylesheet and derive the
# character -> digit table used by unscramble().
assert fonturl is not None, "stylesheet did not reference a .ttf font"
response = get('http://www.vybory.izbirkom.ru/' + fonturl,
               headers={'User-Agent': 'Mozilla/5.0'},
               timeout=60)  # never hang forever on a stalled connection
# Fail here on an HTTP error instead of handing an error page to the font
# parser.
response.raise_for_status()
ttf = TTFont(BytesIO(response.content))
# Invert the font's character map (code point -> glyph name) so that glyphs
# can be looked up by name.
glyph_to_code = {glyph: code for code, glyph in ttf.getBestCmap().items()}
# Map the character that is drawn with each digit glyph to that digit.
subst = {chr(glyph_to_code[name]): str(digit) for digit, name in
         enumerate('zero one two three four five six seven eight nine'.split())}
def unscramble(s):
    """Return `s` with every character replaced through the `subst` table.

    Raises KeyError for a character that has no entry in `subst`.
    """
    return ''.join(map(subst.__getitem__, s))
def string(src):
    """Return the contents of a single-quoted literal given as source text.

    The text must begin and end with an apostrophe and contain no
    backslash (i.e. no escape sequences); this is asserted.
    """
    opening, closing = src[0], src[-1]
    assert opening == "'" and closing == "'" and "\\" not in src
    return src[1:-1]
# Elements whose inline hiding declarations were removed by reveal().
revealed = set()


def reveal(cls):
    """Strip the hiding declarations from the one element of class `cls`.

    The class is registered in `styles` (with no Style) if it is not there
    yet.  Exactly one element must carry the class and it must have a
    `style` attribute.  Declarations matching HIDE are dropped from that
    attribute; the attribute is removed altogether when nothing remains.
    The element is then recorded in `revealed`.
    """
    styles.setdefault(cls, None)
    node, = byclass.get(cls, ())
    kept = []
    for dec in node.attrib.pop('style').split(';'):
        if HIDE.fullmatch(dec.strip()) is None:
            kept.append(dec)
    remaining = ';'.join(kept)
    if remaining:
        node.set('style', remaining)
    revealed.add(node)
def dosetinner(cls, val, elt):
    """Replace the text of every element of a class with a fixed string.

    `cls` and `val` are single-quoted literals in source form; `elt` is a
    class name whose first element must already have been revealed.  The
    new text must contain neither `<` nor `&`, and the target elements must
    have no children, so plain text assignment is sufficient.
    """
    target = string(cls)
    replacement = string(val)
    assert byclass[elt][0] in revealed
    styles.setdefault(target, None)
    assert not ('<' in replacement or '&' in replacement)
    for node in byclass.get(target, ()):
        assert len(node) == 0
        node.text = replacement
def dosplice(cls, idx, elt):
    """Delete one character from the text of every element of a class.

    `cls` is a single-quoted literal in source form and `idx` the source
    text of an integer; `elt` is a class name whose first element must
    already have been revealed.  For a negative index on an element that
    has children, the character is removed from the tail of the last
    child; in every other case it is removed from the element's own
    leading text.
    """
    target, position = string(cls), int(idx)
    assert byclass[elt][0] in revealed
    styles.setdefault(target, None)
    for node in byclass.get(target, ()):
        children = list(node)
        use_tail = bool(children) and position < 0
        text = children[-1].tail if use_tail else node.text
        assert position < len(text) and -position <= len(text)
        if position == -1:
            # text[position+1:] would be the whole string here.
            text = text[:-1]
        else:
            text = text[:position] + text[position + 1:]
        if use_tail:
            children[-1].tail = text
        else:
            node.text = text
def lec(node):
    """Follow last children downwards from `node` and return the deepest one.

    Returns `node` itself when it has no children.
    """
    while children := list(node):
        node = children[-1]
    return node
def doswaplast(fst, snd, elt):
    """Swap the text of the deepest last descendants of two table cells.

    `fst` and `snd` are single-quoted literals in source form holding
    integer indices into the list of `td` descendants of the one element of
    class `elt`, which must already have been revealed.
    """
    first_index = int(string(fst))
    second_index = int(string(snd))
    table, = byclass[elt]
    assert table in revealed
    cells = table.xpath('.//td')  # FIXME: could use a precompiled XPath
    first, second = lec(cells[first_index]), lec(cells[second_index])
    first_text, second_text = first.text, second.text
    first.text = second_text
    second.text = first_text
# Patterns recognising the fragments of the page's inline JavaScript.  Each is
# tried with .match() at the current offset by the interpreter loop below.

# Fragments skipped without effect: runs of spaces, a lone semicolon, an
# `if (!lec) {...}` block with one nested pair of braces, and the opening
# `var a = function() {`.
IGNORE = re_compile(r" +|;|if *\(!lec\) *\{[^}]*\{[^}]*\}[^}]*\}|var *a *= *function\(\) *\{")
# Definition of a three-parameter function whose body assigns to innerHTML;
# group 1 is the name the function is bound to.
SETINNER = re_compile(r"var +([a-z_]+) *= *function\([a-z_]+, *[a-z_]+, *[a-z_]+\) *\{[^}]*\{[^}]*innerHTML *= *[a-z_]+ *;[^}]*\}[^}]*\} *;")
# Definition of a three-parameter function whose body mentions `splice`;
# group 1 is the name the function is bound to.
SPLICE = re_compile(r"var +([a-z_]+) *= *function\([a-z_]+, *[a-z_]+, *[a-z_]+\) *\{[^}]*\{[^}]*splice[^}]*\}[^}]*\} *;")
# Definition of a three-parameter function whose body calls
# getElementsByTagName('td'); group 1 is the name the function is bound to.
SWAPLAST = re_compile(r"var +([a-z_]+) *= *function\([a-z_]+, *[a-z_]+, *[a-z_]+\) *\{[^}]*getElementsByTagName\('td'\)[^}]*\} *;")
# A variable initialised from document.getElementsByClassName and followed by
# a setTimeout(function () {...}) call; group 1 is the variable name, which
# the interpreter loop passes to reveal() as a class name.
REVEAL = re_compile(r"var +([a-z_]+) *= *document\.getElementsByClassName[^}]*setTimeout\(function *\(\) *\{[^}]*\}[^)]*\) *;")
# A call statement `name('...', <integer or '...'>, identifier);` with groups
# for the function name and its three arguments in source form.
CALL = re_compile(r"([a-z_]*)\(('[^\']*'), *(-?[0-9]*|'[^\']*'), *([a-z_]*)\) *;")
# The end of the script: closing brace of `a` and its registration as the
# DOMContentLoaded listener.
QUIT = re_compile(r"\} *; *document\.addEventListener\('DOMContentLoaded', *a\) *;")
# Names the script gave to the three helper functions, once their
# definitions have been seen.
setinner = splice = swaplast = None
# Interpret the inline JavaScript: walk through it fragment by fragment,
# remembering the names of the helper functions as their definitions go by
# and replaying each recognised call on the document tree.
#
# The "cannot happen" cases raise AssertionError explicitly rather than via
# an `assert` statement: with assertions disabled (python -O) an
# unrecognised fragment would otherwise leave `i` unchanged and the loop
# would never terminate.
i = 0
while True:
    if m := IGNORE.match(js, i):
        i = m.end()
    elif m := SETINNER.match(js, i):
        i = m.end()
        setinner = m[1]
    elif m := SPLICE.match(js, i):
        i = m.end()
        splice = m[1]
    elif m := SWAPLAST.match(js, i):
        i = m.end()
        swaplast = m[1]
    elif m := REVEAL.match(js, i):
        i = m.end()
        reveal(*m.groups())
    elif m := CALL.match(js, i):
        i = m.end()
        func, *args = m.groups()
        # Dispatch on the names learnt from the definitions seen so far.
        if func == setinner:
            dosetinner(*args)
        elif func == splice:
            dosplice(*args)
        elif func == swaplast:
            doswaplast(*args)
        else:
            raise AssertionError(f"call to unknown function {func!r}")
    elif m := QUIT.match(js, i):
        i = m.end()
        # Nothing but whitespace may follow the end of the script.
        if js[i:].strip():
            raise AssertionError(
                f"unexpected JavaScript after the end: {js[i:i + 40]!r}")
        break
    else:
        raise AssertionError(
            f"unrecognised JavaScript at offset {i}: {js[i:i + 40]!r}")
# Materialise the ::after generated content: append it to the text at the
# very end of every element of the class, unless the pseudo-element is hidden
# or has no (or empty) content.
for cls, style in afters.items():
    styles.setdefault(cls, None)
    if style.visible and style.content:
        # Generated content is never expected to use the scrambled font.
        assert not style.scramble
        for node in byclass.get(cls, ()):
            children = list(node)
            if children:
                # The element's trailing text lives in its last child's tail.
                last = children[-1]
                last.tail = (last.tail or '') + style.content
            else:
                node.text = (node.text or '') + style.content
def applystyle(node, style):
    """Apply the effect of `style` to the element `node` in place.

    A hidden element is dropped from the tree with `drop_tree()`.  For a
    visible element using the scrambled font, the element's own text and the
    text and tail of every descendant are passed through unscramble().  The
    element's own tail is left alone.

    Missing text (None) is skipped everywhere, including on `node` itself:
    an element that holds only child elements has no leading text, and
    unscramble() cannot take None.
    """
    if not style.visible:
        node.drop_tree()
        return
    if not style.scramble:
        return
    if node.text is not None:
        node.text = unscramble(node.text)
    for n in node.iterdescendants():
        if n.text is not None:
            n.text = unscramble(n.text)
        if n.tail is not None:
            n.tail = unscramble(n.tail)
# Apply inline styles.  An element with an inline style must not also carry
# a class that has a stylesheet Style of its own.
for node in container.xpath('.//*[@style]'):
    assert not any(styles.get(cls) is not None for cls in node.classes)
    style = parsestyle(node.get('style'))
    if node not in revealed:
        del node.attrib['style']
        applystyle(node, style)
    else:
        # Revealed elements keep what is left of their style attribute and
        # are never expected to use the scrambled font.
        assert not style.scramble
# Apply stylesheet styles.  Every class recorded in `styles` is removed from
# its elements; classes recorded without a Style (None) have no further
# effect.
for cls, style in styles.items():
    members = byclass.get(cls, ())
    for node in members:
        node.classes.remove(cls)
        if style is None:
            continue
        applystyle(node, style)
# Unwrap every <span> that is left without attributes, then write the
# cleaned-up container to standard output as HTML.
bare_spans = [span for span in container.xpath('.//span') if not span.attrib]
for span in bare_spans:
    span.drop_tag()
html = tostring(container, encoding='unicode', method='html')
stdout.write(html)
/* (C) Jeremy Ashkenas <https://www.lillicense.org/v1.html> */
/*--------------------- Layout and Typography ----------------------------*/
html { height: 100%; }
body {
font-family: 'Palatino Linotype', 'Book Antiqua', Palatino, FreeSerif, serif;
font-size: 14px;
line-height: 16px;
color: #252519;
margin: 0; padding: 0;
height:100%;
}
#container { min-height: 100%; }
a {
color: #261a3b;
}
a:visited {
color: #261a3b;
}
p, ul, ol {
margin: 0 0 15px;
}
h1, h2, h3, h4, h5, h6 {
margin: 30px 0 15px 0;
}
h1 {
line-height: 1.25;
margin-top: 40px;
}
hr {
border: 0 none;
border-top: 1px solid #e5e5ee;
height: 1px;
margin: 20px 0;
}
pre, tt, code {
font-size: 12px; line-height: 16px;
font-family: Menlo, Monaco, Consolas, "Lucida Console", monospace;
margin: 0; padding: 0;
}
/* The section list is a plain, unbulleted stack of rows. */
ul.sections {
  list-style: none;
  padding: 0 0 5px 0; /* stray second semicolon (empty declaration) removed */
  margin: 0;
}
/*
Force border-box so that % widths fit the parent
container without overlap because of margin/padding.
More Info : http://www.quirksmode.org/css/box.html
*/
ul.sections > li > div {
-moz-box-sizing: border-box; /* firefox */
-ms-box-sizing: border-box; /* ie */
-webkit-box-sizing: border-box; /* webkit */
-khtml-box-sizing: border-box; /* konqueror */
box-sizing: border-box; /* css3 */
}
/*---------------------- Jump Page -----------------------------*/
/* Shared appearance of the jump-to widget and its drop-down page list. */
#jump_to, #jump_page {
  margin: 0;
  background: white;
  /* Vendor-prefixed forms are kept for old engines; the standard properties
     follow them so that current browsers, which ignore the prefixed forms,
     still draw the shadow and the rounded corner. */
  -webkit-box-shadow: 0 0 25px #777; -moz-box-shadow: 0 0 25px #777;
  box-shadow: 0 0 25px #777;
  -webkit-border-bottom-left-radius: 5px; -moz-border-radius-bottomleft: 5px;
  border-bottom-left-radius: 5px;
  font: 16px Arial;
  cursor: pointer;
  text-align: right;
  list-style: none;
}
#jump_to a {
text-decoration: none;
}
#jump_to a.large {
display: none;
}
#jump_to a.small {
font-size: 22px;
font-weight: bold;
color: #676767;
}
#jump_to, #jump_wrapper {
position: fixed;
right: 0; top: 0;
padding: 10px 15px;
margin:0;
}
#jump_wrapper {
display: none;
padding:0;
}
#jump_to:hover #jump_wrapper {
display: block;
}
#jump_page {
padding: 5px 0 3px;
margin: 0 0 25px 25px;
}
#jump_page .source {
display: block;
padding: 15px;
text-decoration: none;
border-top: 1px solid #eee;
}
#jump_page .source:hover {
background: #f5f5ff;
}
#jump_page .source:first-child {
}
/*---------------------- Low resolutions (> 320px) ---------------------*/
@media only screen and (min-width: 320px) {
.sswrap { display: none; }
ul.sections > li > div {
display: block;
padding:5px 10px 0 10px;
}
ul.sections > li > div.annotation {
background: #fff;
}
ul.sections > li > div.annotation ul, ul.sections > li > div.annotation ol {
padding-left: 30px;
}
ul.sections > li > div.content {
background: #f5f5ff;
overflow-x:auto;
-webkit-box-shadow: inset 0 0 5px #e5e5ee;
box-shadow: inset 0 0 5px #e5e5ee;
border: 1px solid #dedede;
margin:5px 10px 5px 10px;
padding-bottom: 5px;
}
ul.sections > li > div.annotation pre {
margin: 7px 0 7px;
padding-left: 15px;
}
ul.sections > li > div.annotation p tt, .annotation code {
background: #f8f8ff;
border: 1px solid #dedede;
font-size: 12px;
padding: 0 0.2em;
}
}
/*---------------------- (> 481px) ---------------------*/
@media only screen and (min-width: 481px) {
#container {
position: relative;
}
body {
background-color: #F5F5FF;
font-size: 15px;
line-height: 22px;
}
pre, tt, code {
line-height: 18px;
}
#jump_to {
padding: 5px 10px;
}
#jump_wrapper {
padding: 0;
}
#jump_to, #jump_page {
font: 10px Arial;
text-transform: uppercase;
}
#jump_page .source {
padding: 5px 10px;
}
#jump_to a.large {
display: inline-block;
}
#jump_to a.small {
display: none;
}
#background {
position: absolute;
top: 0; bottom: 0;
width: 350px;
background: #ffffff;
border-right: 1px solid #e5e5ee;
z-index: -1;
}
ul.sections > li > div.annotation ul, ul.sections > li > div.annotation ol {
padding-left: 40px;
}
ul.sections > li {
white-space: nowrap;
}
ul.sections > li > div {
display: inline-block;
}
ul.sections > li > div.annotation {
max-width: 350px;
min-width: 350px;
min-height: 5px;
padding: 13px;
overflow-x: hidden;
white-space: normal;
vertical-align: top;
text-align: left;
}
ul.sections > li > div.annotation pre {
margin: 15px 0 15px;
padding-left: 15px;
}
ul.sections > li > div.content {
padding: 13px;
vertical-align: top;
background: #f5f5ff;
border: none;
-webkit-box-shadow: none;
box-shadow: none;
}
.sswrap {
position: relative;
display: inline;
}
/* The section-sign permalink: invisible until its annotation is hovered. */
.ss {
  font: 12px Arial;
  text-decoration: none;
  color: #454545;
  position: absolute;
  top: 3px; left: -20px;
  padding: 1px 2px;
  opacity: 0;
  /* Prefixed form kept for old WebKit; the standard property makes the
     fade work in other engines too. */
  -webkit-transition: opacity 0.2s linear;
  transition: opacity 0.2s linear;
}
.for-h1 .ss {
top: 47px;
}
.for-h2 .ss, .for-h3 .ss, .for-h4 .ss {
top: 35px;
}
ul.sections > li > div.annotation:hover .ss {
opacity: 1;
}
}
/*---------------------- (> 1025px) ---------------------*/
@media only screen and (min-width: 1025px) {
#background {
width: 525px;
}
ul.sections > li > div.annotation {
max-width: 525px;
min-width: 525px;
padding: 10px 25px 1px 50px;
}
ul.sections > li > div.content {
padding: 9px 15px 16px 25px;
}
}
/*---------------------- Syntax Highlighting -----------------------------*/
td.linenos { background-color: #f0f0f0; padding-right: 10px; }
span.lineno { background-color: #f0f0f0; padding: 0 5px 0 5px; }
/*
github.com style (c) Vasily Polovnyov <vast@whiteants.net>
*/
pre code {
display: block; padding: 0.5em;
color: #000;
background: #f8f8ff
}
pre .hljs-comment,
pre .hljs-template_comment,
pre .hljs-diff .hljs-header,
pre .hljs-javadoc {
color: #408080;
font-style: italic
}
pre .hljs-keyword,
pre .hljs-assignment,
pre .hljs-literal,
pre .hljs-css .hljs-rule .hljs-keyword,
pre .hljs-winutils,
pre .hljs-javascript .hljs-title,
pre .hljs-lisp .hljs-title,
pre .hljs-subst {
color: #954121;
/*font-weight: bold*/
}
pre .hljs-number,
pre .hljs-hexcolor {
color: #40a070
}
pre .hljs-string,
pre .hljs-tag .hljs-value,
pre .hljs-phpdoc,
pre .hljs-tex .hljs-formula {
color: #219161;
}
pre .hljs-title,
pre .hljs-id {
color: #19469D;
}
pre .hljs-params {
color: #00F;
}
pre .hljs-javascript .hljs-title,
pre .hljs-lisp .hljs-title,
pre .hljs-subst {
font-weight: normal
}
pre .hljs-class .hljs-title,
pre .hljs-haskell .hljs-label,
pre .hljs-tex .hljs-command {
color: #458;
font-weight: bold
}
pre .hljs-tag,
pre .hljs-tag .hljs-title,
pre .hljs-rules .hljs-property,
pre .hljs-django .hljs-tag .hljs-keyword {
color: #000080;
font-weight: normal
}
pre .hljs-attribute,
pre .hljs-variable,
pre .hljs-instancevar,
pre .hljs-lisp .hljs-body {
color: #008080
}
pre .hljs-regexp {
color: #B68
}
pre .hljs-class {
color: #458;
font-weight: bold
}
pre .hljs-symbol,
pre .hljs-ruby .hljs-symbol .hljs-string,
pre .hljs-ruby .hljs-symbol .hljs-keyword,
pre .hljs-ruby .hljs-symbol .hljs-keymethods,
pre .hljs-lisp .hljs-keyword,
pre .hljs-tex .hljs-special,
pre .hljs-input_number {
color: #990073
}
pre .hljs-builtin,
pre .hljs-constructor,
pre .hljs-built_in,
pre .hljs-lisp .hljs-title {
color: #0086b3
}
pre .hljs-preprocessor,
pre .hljs-pi,
pre .hljs-doctype,
pre .hljs-shebang,
pre .hljs-cdata {
color: #999;
font-weight: bold
}
pre .hljs-deletion {
background: #fdd
}
pre .hljs-addition {
background: #dfd
}
pre .hljs-diff .hljs-change {
background: #0086b3
}
pre .hljs-chunk {
color: #aaa
}
pre .hljs-tex .hljs-formula {
opacity: 0.5;
}
*[role="doc-epigraph"] { text-align: right; font-style: italic; }
<!DOCTYPE html>
<html lang="en">
<head>
<title>Data obfuscation on the Russian Central Election Commission website</title>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<meta name="viewport" content="width=device-width, target-densitydpi=160dpi, initial-scale=1.0, maximum-scale=1.0, user-scalable=0">
<link rel="stylesheet" media="all" href="unfuck.py.css" />
</head>
<body>
<div id="container">
<div id="background"></div>
<ul class="sections">
<li id="section-1">
<div class="annotation">
<div class="sswrap ">
<a class="ss" href="#section-1">&#x00a7;</a>
</div>
<h1 id="data-obfuscation-on-the-russian-central-election-commission-website">Data obfuscation on the Russian Central Election Commission website</h1>
</div>
</li>
<li id="section-2">
<div class="annotation">
<div class="sswrap ">
<a class="ss" href="#section-2">&#x00a7;</a>
</div>
<p role="doc-epigraph" lang="la">Unum facit, aliud vastat</p>
<p>Since the early 2000s, the Central Election Commission of Russia, the
ultimate arbiter of Russian elections, has <a href="http://www.vybory.izbirkom.ru/">published</a> detailed
election results and related data down to full records from each polling
station. For most of the existence of the service, the only functional
output format was HTML, so researchers who studied that data (see,
<em>e.&hairsp;g.</em>, <a href="https://dx.doi.org/10.1214/16-AOAS904">Kobak <em>et al.</em> (2016)</a>, <a href="https://dx.doi.org/10.1073/pnas.1206770110">Enikolopov <em>et al.</em>
(2013)</a>, and other sources referenced in <a href="https://arxiv.org/abs/1204.0307">Shen’s living review</a>)
mostly had to resort to scraping the web site, which was never
particularly pleasant. Nevertheless, useful (if <a href="https://www.esquire.com/news-politics/a12952/russia-2012-elections-0312/">politically
provocative</a>) results were obtained, and <a href="https://github.com/dkobak/elections">collections</a> of more
easily accessible datasets were gradually being amassed.</p>
<p>The winds first started changing after the 2018 gubernatorial elections.
Amid allegations of widespread fraud, original versions of election
records that were later modified <a href="https://www.golosinfo.org/articles/142892">were discovered</a> to be readily
available in the public system. A hasty frontend patch appears to have
been applied several days later to bar access to any and all addresses
containing the string <code>version</code>, though it could be easily bypassed
using standard <a href="https://capec.mitre.org/data/definitions/267.html">alternate encoding</a> techniques.</p>
<p>In December 2019, the <a href="http://cikrf.ru/activity/docs/postanovleniya/45415/">second revision</a> of the regulations governing
the publication of election data contained a <a href="https://www.facebook.com/stas.klerk/posts/2975430105841942">subtly different
wording</a> that excluded any mention of automated access, this being
almost the only change to the <a href="http://cikrf.ru/activity/docs/postanovleniya/25386/">preceding version from 2010</a>. A
subpar but inconvenient mandatory CAPTCHA was imposed shortly afterwards
on all visitors and subsequently underwent several rounds of relaxation
and tightening after analysts cried foul. The next year saw an
introduction of IP-address&ndash;based rate limiting of about 100
requests per <em>hour</em>, invisible to normal visitors but thoroughly
thwarting any attempts at real-time large-scale downloads without the
use of proxies (obtaining the complete precinct-level data for a single
federal election requires fetching almost 3000 summary reports, and
gathering supplementary information such as early voting numbers can
require visiting the pages of all 98000 precincts).</p>
<p>The <a href="http://www.cikrf.ru/activity/docs/postanovleniya/50452/">third revision</a>, put in effect shortly before the federal
parliamentary elections of 17&ndash;19 September 2021, contained <a href="https://www.golosinfo.org/articles/145481">a
further change of wording</a> to exclude mentions of users being able
to search or copy the data to their machines and to formally require
“protection” from automated tools. This change was at first thought to
be a largely symbolic formalization of the existing practice and
declaration of intent, until, on 19 September, it <a href="https://zona.media/chronicle/gosduma#42367">came to light</a>
that the Commission introduced a form of obfuscation onto its web pages.
In a graphical browser with JavaScript enabled, the results appeared
correctly, but attempting to copy them into the clipboard yielded
gibberish&thinsp;&mdash;&thinsp;or worse, wrong numeric
values&thinsp;&mdash;&thinsp;even though no interception of clipboard
events was taking place. Direct inspection of the HTML markup revealed
different and even more mangled numbers, misplaced table cells, garbage
characters in alphabetic strings, and a soup of seemingly meaningless
nested <a href="https://developer.mozilla.org/docs/Web/HTML/Element/span"><code>span</code> elements</a>.</p>
<p>This note describes the techniques used by the code on the page to
deobfuscate the mangled markup for the user’s consumption. It also
doubles as example Python code for performing that deobfuscation without
a full web browser. It was written by <a href="mailto:ashpilkin@gmail.com">Alexander Shpilkin</a> and
describes research current as of 21 September. Basic knowledge of web
technology is required, but the Python can be ignored without harm.</p>
<p>The canonical distribution point for this document as of the present
version is <a href="https://gist.github.com/alexshpilkin/bf25962064e570d10aca9a8a4b325b78">on GitHub</a>, which hosts the Markdown source, the
extracted Python code, and the web version generated by <a href="http://ashkenas.com/docco/">Docco</a>.
All of this is freely redistributable and modifiable without legal
restrictions as per the Creative Commons <a href="https://creativecommons.org/publicdomain/zero/1.0/">CC0 1.0</a> public domain
dedication, although the author asks you to exercise your judgment
regarding the degree of dissemination in light of the Commission’s
hostile behaviour and to follow common-sense attribution practices.</p>
</div>
<div class="content"><div class='highlight'><pre><span class="hljs-comment">#!/usr/bin/env python3</span>
<span class="hljs-comment">#{ SPDX-License-Identifier: CC0-1.0 }</span>
<span class="hljs-keyword">from</span> collections <span class="hljs-keyword">import</span> namedtuple
<span class="hljs-keyword">from</span> fontTools.ttLib <span class="hljs-keyword">import</span> TTFont
<span class="hljs-keyword">from</span> io <span class="hljs-keyword">import</span> BytesIO
<span class="hljs-keyword">from</span> lxml.html <span class="hljs-keyword">import</span> document_fromstring
<span class="hljs-keyword">from</span> lxml.etree <span class="hljs-keyword">import</span> tostring
<span class="hljs-keyword">from</span> re <span class="hljs-keyword">import</span> finditer, <span class="hljs-built_in">compile</span> <span class="hljs-keyword">as</span> re_compile
<span class="hljs-keyword">from</span> requests <span class="hljs-keyword">import</span> get
<span class="hljs-keyword">from</span> sys <span class="hljs-keyword">import</span> stdin, stdout</pre></div></div>
</li>
<li id="section-3">
<div class="annotation">
<div class="sswrap ">
<a class="ss" href="#section-3">&#x00a7;</a>
</div>
<h2 id="outline">Outline</h2>
</div>
</li>
<li id="section-4">
<div class="annotation">
<div class="sswrap ">
<a class="ss" href="#section-4">&#x00a7;</a>
</div>
<p>The Central Election Commission website serves its pages in the
<a href="https://www.iana.org/assignments/charset-reg/windows-1251">Windows-1251</a> character encoding with Windows <a href="https://www.rfc-editor.org/old/EOLstory.txt">line endings</a>.
The example code accepts the HTML markup of the page to be deobfuscated
on standard input.</p>
</div>
<div class="content"><div class='highlight'><pre>stdin.reconfigure(encoding=<span class="hljs-string">&#x27;cp1251&#x27;</span>, newline=<span class="hljs-literal">None</span>)
tree = document_fromstring(stdin.read())</pre></div></div>
</li>
<li id="section-5">
<div class="annotation">
<div class="sswrap ">
<a class="ss" href="#section-5">&#x00a7;</a>
</div>
<p>Apart from the HTML markup for the data table, data necessary for
deobfuscation include the CSS stylesheet and JavaScript code that are
output on a single line just afterwards, as well as an external font
file referenced in the stylesheet. These deobfuscate the data for the
user’s consumption by applying the following largely independent
transformations:</p>
<ol>
<li><p>Permute, replace, or delete some of the text using JavaScript.</p>
</li>
<li><p>Hide some HTML elements using either references to styles in the
stylesheet or inline styles. As an additional obfuscation measure,
some of the style declarations appear like they should result in
hiding the element, but are in fact illegal CSS ignored by the user
agent <a href="https://www.w3.org/TR/CSS21/syndata.html#parsing-errors">in accordance with the specification</a>.</p>
</li>
<li><p>Insert additional text after some HTML elements using <a href="https://developer.mozilla.org/docs/Web/CSS/CSS_Generated_Content">CSS generated
content</a>. As an additional obfuscation measure, some of this
text is hidden using the techniques of the previous point.</p>
</li>
<li><p>Apply a <a href="https://math.libretexts.org/@go/page/34276">simple substitution cipher</a> to some of the text by
displaying it in a special font that has its characters in the wrong
positions. This is only done for strings of digits, and only a
single font is used on a given page, although it changes if the
page is reloaded.</p>
</li>
</ol>
</div>
<div class="content"><div class='highlight'><pre>container, = tree.xpath(<span class="hljs-string">&#x27;//*[contains(concat(&quot; &quot;, @class, &quot; &quot;), &quot; show &quot;)]&#x27;</span>)
css, = container.xpath(<span class="hljs-string">&#x27;.//style&#x27;</span>); css.drop_tree(); css = <span class="hljs-built_in">str</span>(css.text)
js, = container.xpath(<span class="hljs-string">&#x27;.//script&#x27;</span>); js.drop_tree(); js = <span class="hljs-built_in">str</span>(js.text)
byclass = {}
<span class="hljs-keyword">for</span> node <span class="hljs-keyword">in</span> container.xpath(<span class="hljs-string">&#x27;.//*[@class]&#x27;</span>):
<span class="hljs-keyword">for</span> cls <span class="hljs-keyword">in</span> node.classes:
byclass.setdefault(cls, []).append(node)</pre></div></div>
</li>
<li id="section-6">
<div class="annotation">
<div class="sswrap ">
<a class="ss" href="#section-6">&#x00a7;</a>
</div>
<h2 id="style-syntax">Style syntax</h2>
</div>
</li>
<li id="section-7">
<div class="annotation">
<div class="sswrap ">
<a class="ss" href="#section-7">&#x00a7;</a>
</div>
<p>There are three main things that interest the deobfuscator in a style
declaration: whether it hides the element, whether it applies the
scrambled font to it, and, for declarations applying to the <code>::after</code>
pseudoelement, what the element content will be set to. Additionally,
the URL of the font has to be extracted from the <code>@font-face</code>
declaration. As the CSS cascade is effectively not used, which specific
properties are used to attain these ends can be ignored.</p>
<p>Property declarations used for hiding are:</p>
<ul>
<li><code>display: none</code>; other valid values such as <code>inline</code> and
<code>inline-block</code>, as well as the illegal <code>inlineblock</code>, occur as well
and must be ignored;</li>
<li><code>top: -9...9px</code>, <code>left: -9...9px</code>; the value <code>0</code> or <code>0px</code> and values
using the illegal unit <code>xp</code> occur as well and must be ignored;</li>
<li><code>z-index: -9...9</code>; the value <code>1</code> occurs as well and must be
ignored;</li>
<li><code>font-size: 0</code>; the default value <code>inherit</code> occurs as well and must
be ignored;</li>
<li><code>opacity: 0</code>; the illegal value <code>0px</code> occurs as well and must be
ignored;</li>
<li><code>width: 0</code> or <code>0px</code>, <code>height: 0</code> or <code>0px</code>; values using the illegal
unit <code>xp</code> occur here as well and must be ignored;</li>
<li><code>color: white</code> or <code>transparent</code>; the default value <code>inherit</code> occurs
as well and must be ignored;</li>
<li><code>visibility: hidden</code>; the illegal value <code>true</code> occurs as well and must
be ignored.</li>
</ul>
<p>As we can see, a relatively substantial effort seems to have been put
into thwarting simple substring matching, but no really broken CSS is
output, so matching each declaration completely against a good enough
set of regular expressions while ignoring all unrecognized declarations
is sufficient.</p>
<p>The font file is provided in multiple formats for compatibility,
including <a href="https://developer.apple.com/fonts/TrueType-Reference-Manual/">TTF</a>, <a href="https://docs.microsoft.com/en-us/typography/opentype/spec/">OTF</a>, <a href="https://www.w3.org/Submission/EOT/">EOT</a>, <a href="https://w3.org/TR/WOFF/">WOFF</a> and <a href="https://w3.org/TR/WOFF2/">WOFF2</a>,
but all of these contain equivalent data.</p>
</div>
<div class="content"><div class='highlight'><pre>Style = namedtuple(<span class="hljs-string">&#x27;Style&#x27;</span>,
<span class="hljs-string">&#x27;visible scramble content&#x27;</span>,
defaults=(<span class="hljs-literal">True</span>, <span class="hljs-literal">False</span>, <span class="hljs-literal">None</span>))
HIDE = re_compile(<span class="hljs-string">r&quot;display: *none|(top|left): *-9+px|z-index: *-9+|(font-size|opacity): *0|(width|height): *0(px)?|color: *(white|transparent)|visibility: *hidden&quot;</span>)
CONTENT = re_compile(<span class="hljs-string">r&quot;content: *&#x27;([^\&#x27;]*)&#x27;&quot;</span>)
FONTFAM = re_compile(<span class="hljs-string">r&#x27;font-family: *&quot;([^\&quot;]*)&quot;( *!important)?&#x27;</span>)
FONTURL = re_compile(<span class="hljs-string">r&#x27;src:.* url\(&quot;\./([^\&quot;]*\.ttf)&quot;\).*&#x27;</span>)
fontfam = fonturl = <span class="hljs-literal">None</span>
<span class="hljs-keyword">def</span> <span class="hljs-title function_">parsestyle</span>(<span class="hljs-params">decs</span>):
<span class="hljs-keyword">global</span> fontfam, fonturl
style = Style()
<span class="hljs-keyword">for</span> dec <span class="hljs-keyword">in</span> decs.split(<span class="hljs-string">&#x27;;&#x27;</span>):
dec = dec.strip()
<span class="hljs-keyword">if</span> HIDE.fullmatch(dec):
style = style._replace(visible=<span class="hljs-literal">False</span>)
<span class="hljs-keyword">elif</span> m := CONTENT.fullmatch(dec):
style = style._replace(content=m[<span class="hljs-number">1</span>])
<span class="hljs-keyword">elif</span> m := FONTFAM.fullmatch(dec):
<span class="hljs-keyword">if</span> fontfam <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span>:
fontfam = m[<span class="hljs-number">1</span>]
<span class="hljs-keyword">assert</span> m[<span class="hljs-number">1</span>] == fontfam
style = style._replace(scramble=<span class="hljs-literal">True</span>)
<span class="hljs-keyword">elif</span> m := FONTURL.fullmatch(dec):
<span class="hljs-keyword">assert</span> fonturl <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span>
fonturl = m[<span class="hljs-number">1</span>]
<span class="hljs-keyword">return</span> style</pre></div></div>
</li>
<li id="section-8">
<div class="annotation">
<div class="sswrap ">
<a class="ss" href="#section-8">&#x00a7;</a>
</div>
<p>Each rule in the stylesheet has a selector of the form <code>.T .C</code> or <code>.T .C::after</code>, where <em>T</em> is the randomly named class assigned to the data
table as a whole and <em>C</em> is a randomly named class assigned to one or
more of its children. There is also a single <code>@font-face</code> rule to point
the browser to the font file.</p>
</div>
<div class="content"><div class='highlight'><pre>SELECTOR = re_compile(<span class="hljs-string">r&quot;\.([a-z_]*(::after)?)&quot;</span>)
styles, afters = {}, {}
end = <span class="hljs-number">0</span>
<span class="hljs-keyword">for</span> m <span class="hljs-keyword">in</span> finditer(<span class="hljs-string">r&#x27; *([-@a-z_.: ]+?) *\{([^}]*)\}&#x27;</span>, css):
<span class="hljs-keyword">assert</span> m.start() == end; end = m.end()
sel, decs = m.groups()
style = parsestyle(decs)
<span class="hljs-keyword">if</span> sel == <span class="hljs-string">&#x27;@font-face&#x27;</span>:
<span class="hljs-keyword">continue</span>
parent, sel = sel.split()
<span class="hljs-keyword">assert</span> parent[<span class="hljs-number">0</span>] == <span class="hljs-string">&#x27;.&#x27;</span> <span class="hljs-keyword">and</span> <span class="hljs-built_in">len</span>(byclass.get(parent[<span class="hljs-number">1</span>:], ())) == <span class="hljs-number">1</span>
m = SELECTOR.fullmatch(sel)
<span class="hljs-keyword">assert</span> m <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>
<span class="hljs-keyword">if</span> m[<span class="hljs-number">1</span>].endswith(<span class="hljs-string">&#x27;::after&#x27;</span>):
<span class="hljs-keyword">assert</span> m[<span class="hljs-number">1</span>].removesuffix(<span class="hljs-string">&#x27;::after&#x27;</span>) <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> afters
afters[m[<span class="hljs-number">1</span>].removesuffix(<span class="hljs-string">&#x27;::after&#x27;</span>)] = style
<span class="hljs-keyword">else</span>:
<span class="hljs-keyword">assert</span> m[<span class="hljs-number">1</span>] <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> styles
styles[m[<span class="hljs-number">1</span>]] = style
<span class="hljs-keyword">assert</span> <span class="hljs-keyword">not</span> css[end:].strip()</pre></div></div>
</li>
<li id="section-9">
<div class="annotation">
<div class="sswrap ">
<a class="ss" href="#section-9">&#x00a7;</a>
</div>
<h2 id="font">Font</h2>
</div>
</li>
<li id="section-10">
<div class="annotation">
<div class="sswrap ">
<a class="ss" href="#section-10">&#x00a7;</a>
</div>
<p>The server generates a new URL for the font on each request, but the
file it refers to only appears to change once per several seconds. The
generated URL is relative to the <a href="https://www.rfc-editor.org/rfc/rfc3986.html#section-3">scheme and domain</a> of the page
URL, which is usually <code>http://www.R.vybory.izbirkom.ru/</code> for some region
<em>R</em>, but, as always, all of the region-specific domains point to exactly
the same data as <code>http://www.vybory.izbirkom.ru/</code>.</p>
<p>It is unlikely that the unique URLs will be retained forever, so care
must be taken when scraping the obfuscated HTML to extract the URL for
and save the font file as well, as the substitution doesn’t seem to be
recoverable from the URL string alone. A simple if technically
incorrect regular expression such as <code>url\(&quot;\./([^\&quot;]*\.ttf)&quot;\)</code> can be
used to extract the necessary URL from the markup. The example code
ignores this potential problem and downloads the font during execution.</p>
<p>Sometime on 20 September, the web server started rejecting requests
mentioning some common HTTP automation tools (in particular, <a href="https://curl.se/">curl</a>
and <a href="https://2.python-requests.org/">requests</a>, but not <a href="https://www.gnu.org/software/wget/">wget</a>) in their <a href="https://www.rfc-editor.org/rfc/rfc7231.html#section-5.5.3"><code>User-Agent</code> HTTP
header</a> with a <a href="https://www.rfc-editor.org/rfc/rfc7231.html#section-6.5.3">403 Forbidden status code</a>; changing the header
value to <code>Mozilla/5.0</code> (which <a href="https://webaim.org/blog/user-agent-string-history/">is contained</a> in the values sent by
popular web browsers) appears to be enough to avoid the ban.</p>
<p>A number of advanced approaches could have been used to recover the
substitution from the font file, from matching the character contours
exactly against the original font (<a href="https://www.paratype.com/fonts/pt/pt-sans">PT Sans</a> by Paratype) to
rendering the characters into a bitmap and applying perceptual hashes or
<abbr title="optical character recognition">OCR</abbr>. None of this
turns out to be necessary, as while the glyph <abbr
title="identifiers">IDs</abbr>, character positions, and glyph order in
the font file all appear to be scrambled, the <a href="https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6post.html">PostScript glyph
names</a> (which for <a href="https://www.unicode.org/glossary/#european_digits">European digits</a> are <code>zero</code>, <code>one</code>, <em>etc.</em>)
are intact.</p>
</div>
<div class="content"><div class='highlight'><pre>ttf = get(<span class="hljs-string">&#x27;http://www.vybory.izbirkom.ru/&#x27;</span> + fonturl,
headers={<span class="hljs-string">&#x27;User-Agent&#x27;</span>: <span class="hljs-string">&#x27;Mozilla/5.0&#x27;</span>})
ttf = TTFont(BytesIO(ttf.content))
subst = {v: k <span class="hljs-keyword">for</span> k, v <span class="hljs-keyword">in</span> ttf.getBestCmap().items()}
subst = {<span class="hljs-built_in">chr</span>(subst[n]): <span class="hljs-built_in">str</span>(k) <span class="hljs-keyword">for</span> k, n <span class="hljs-keyword">in</span>
<span class="hljs-built_in">enumerate</span>(<span class="hljs-string">&#x27;zero one two three four five six seven eight nine&#x27;</span>.split())}
<span class="hljs-keyword">def</span> <span class="hljs-title function_">unscramble</span>(<span class="hljs-params">s</span>):
<span class="hljs-keyword">return</span> <span class="hljs-string">&#x27;&#x27;</span>.join(subst[c] <span class="hljs-keyword">for</span> c <span class="hljs-keyword">in</span> s)</pre></div></div>
</li>
<li id="section-11">
<div class="annotation">
<div class="sswrap ">
<a class="ss" href="#section-11">&#x00a7;</a>
</div>
<h2 id="javascript-semantics">JavaScript semantics</h2>
</div>
</li>
<li id="section-12">
<div class="annotation">
<div class="sswrap ">
<a class="ss" href="#section-12">&#x00a7;</a>
</div>
<p>The three possible operations applied from JavaScript are fixed,
and no attempt seems to have been made at obfuscating their
implementation. The original names of the transformations are unknown,
so code names corresponding to their behaviour are used below.</p>
<ul>
<li><p>The operation <em>setInner</em>(<em>C</em>, <em>V</em>, <em>E</em>) replaces the
<a href="https://developer.mozilla.org/docs/Web/API/Element/innerHTML"><code>innerHTML</code></a> of each element with class <em>C</em> (passed as a
single-quoted JavaScript string literal) with <em>V</em> (passed the same
way). The scope of the operation is <em>E</em>, which is invariably the
whole data table (passed as a variable with name identical to the
randomly-generated class name of the table). The element can only
contain text with no markup both before and after the operation, and
the text may or may not be numeric.</p>
</li>
<li><p>The operation <em>splice</em>(<em>C</em>, <em>I</em>, <em>E</em>) deletes the character at
zero-based position <em>I</em> (passed as a decimal JavaScript integer
literal) in the <code>innerHTML</code> of each element with class <em>C</em> (passed as
above). If <em>I</em> is negative, then -1 refers to the last character, -2
to the second-to-last character, <em>etc.</em> The scope of the operation is
<em>E</em>, as above. The character is always inside the text either before
or after any child elements, which may be present.</p>
</li>
<li><p>The operation <em>swapLast</em>(<em>I</em>, <em>J</em>, <em>E</em>) exchanges the text content
(or, equivalently, <code>innerHTML</code>) of the <em>last leaf children</em> of the
<code>td</code> elements with zero-based numbers <em>I</em> and <em>J</em> (passed, likely to
encourage confusion with <em>setInner</em>, as single-quoted JavaScript
strings containing non-negative decimal numerals) during a <a href="https://xlinux.nist.gov/dads/HTML/preorderTraversal.html">preorder
traversal</a> of the element tree. The scope of the operation is
<em>E</em>, as above. Here, the <em>last leaf child</em> of an element is the last
node encountered during a preorder traversal that contains only text.</p>
</li>
</ul>
<p>Several successive references can be made to the same element using the
same class name.</p>
<p>Finally, until the deobfuscation is complete, several elements including
the data table are completely hidden by inline styles, probably so that
the user does not see the document change. They are then found by their
randomly-generated class names and revealed after a short delay. Those
are the elements that can be passed as the scope <em>E</em> to the operations
above.</p>
</div>
<div class="content"><div class='highlight'><pre><span class="hljs-keyword">def</span> <span class="hljs-title function_">string</span>(<span class="hljs-params">src</span>):
<span class="hljs-keyword">assert</span> src[<span class="hljs-number">0</span>] == <span class="hljs-string">&quot;&#x27;&quot;</span> <span class="hljs-keyword">and</span> src[-<span class="hljs-number">1</span>] == <span class="hljs-string">&quot;&#x27;&quot;</span> <span class="hljs-keyword">and</span> <span class="hljs-string">&quot;\\&quot;</span> <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> src
<span class="hljs-keyword">return</span> src[<span class="hljs-number">1</span>:-<span class="hljs-number">1</span>]
revealed = <span class="hljs-built_in">set</span>()
<span class="hljs-keyword">def</span> <span class="hljs-title function_">reveal</span>(<span class="hljs-params">cls</span>):
styles.setdefault(cls, <span class="hljs-literal">None</span>)
node, = byclass.get(cls, ())
decs = <span class="hljs-string">&#x27;;&#x27;</span>.join(dec <span class="hljs-keyword">for</span> dec <span class="hljs-keyword">in</span> node.attrib.pop(<span class="hljs-string">&#x27;style&#x27;</span>).split(<span class="hljs-string">&#x27;;&#x27;</span>)
<span class="hljs-keyword">if</span> <span class="hljs-keyword">not</span> HIDE.fullmatch(dec.strip()))
<span class="hljs-keyword">if</span> decs:
node.<span class="hljs-built_in">set</span>(<span class="hljs-string">&#x27;style&#x27;</span>, decs)
revealed.add(node)
<span class="hljs-keyword">def</span> <span class="hljs-title function_">dosetinner</span>(<span class="hljs-params">cls, val, elt</span>):
cls, val = string(cls), string(val)
<span class="hljs-keyword">assert</span> byclass[elt][<span class="hljs-number">0</span>] <span class="hljs-keyword">in</span> revealed
styles.setdefault(cls, <span class="hljs-literal">None</span>)
<span class="hljs-keyword">assert</span> <span class="hljs-string">&#x27;&lt;&#x27;</span> <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> val <span class="hljs-keyword">and</span> <span class="hljs-string">&#x27;&amp;&#x27;</span> <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> val
<span class="hljs-keyword">for</span> node <span class="hljs-keyword">in</span> byclass.get(cls, ()):
<span class="hljs-keyword">assert</span> <span class="hljs-keyword">not</span> <span class="hljs-built_in">list</span>(node)
node.text = val
<span class="hljs-keyword">def</span> <span class="hljs-title function_">dosplice</span>(<span class="hljs-params">cls, idx, elt</span>):
cls, idx = string(cls), <span class="hljs-built_in">int</span>(idx)
<span class="hljs-keyword">assert</span> byclass[elt][<span class="hljs-number">0</span>] <span class="hljs-keyword">in</span> revealed
styles.setdefault(cls, <span class="hljs-literal">None</span>)
<span class="hljs-keyword">for</span> node <span class="hljs-keyword">in</span> byclass.get(cls, ()):
children = <span class="hljs-built_in">list</span>(node)
<span class="hljs-keyword">if</span> children <span class="hljs-keyword">and</span> idx &lt; <span class="hljs-number">0</span>:
text = children[-<span class="hljs-number">1</span>].tail
<span class="hljs-keyword">else</span>:
text = node.text
<span class="hljs-keyword">assert</span> idx &lt; <span class="hljs-built_in">len</span>(text) <span class="hljs-keyword">and</span> -idx &lt;= <span class="hljs-built_in">len</span>(text)
text = text[:idx] + text[idx+<span class="hljs-number">1</span>:] <span class="hljs-keyword">if</span> idx != -<span class="hljs-number">1</span> <span class="hljs-keyword">else</span> text[:-<span class="hljs-number">1</span>]
<span class="hljs-keyword">if</span> children <span class="hljs-keyword">and</span> idx &lt; <span class="hljs-number">0</span>:
children[-<span class="hljs-number">1</span>].tail = text
<span class="hljs-keyword">else</span>:
node.text = text
<span class="hljs-keyword">def</span> <span class="hljs-title function_">lec</span>(<span class="hljs-params">node</span>):
children = <span class="hljs-built_in">list</span>(node)
<span class="hljs-keyword">return</span> lec(children[-<span class="hljs-number">1</span>]) <span class="hljs-keyword">if</span> children <span class="hljs-keyword">else</span> node
<span class="hljs-keyword">def</span> <span class="hljs-title function_">doswaplast</span>(<span class="hljs-params">fst, snd, elt</span>):
fst, snd = <span class="hljs-built_in">int</span>(string(fst)), <span class="hljs-built_in">int</span>(string(snd))
table, = byclass[elt]
<span class="hljs-keyword">assert</span> table <span class="hljs-keyword">in</span> revealed
nodes = table.xpath(<span class="hljs-string">&#x27;.//td&#x27;</span>) <span class="hljs-comment"># FIXME compile?</span>
fst, snd = lec(nodes[fst]), lec(nodes[snd])
fst.text, snd.text = snd.text, fst.text</pre></div></div>
</li>
<li id="section-13">
<div class="annotation">
<div class="sswrap ">
<a class="ss" href="#section-13">&#x00a7;</a>
</div>
<h2 id="javascript-syntax">JavaScript syntax</h2>
</div>
</li>
<li id="section-14">
<div class="annotation">
<div class="sswrap ">
<a class="ss" href="#section-14">&#x00a7;</a>
</div>
<p>The JavaScript code of the deobfuscator has a straightforward structure.
First, functions implementing some or all of the operations above, as
well as an auxiliary function for finding the <em>last leaf child</em> of an
element, are defined as necessary and in random order. The auxiliary
function is always called <code>lec</code>, but the rest have random names which
an independent deobfuscator needs to extract and remember. The
implementations used are fixed except for some of the variable names (so
can be distinguished using regular expressions) and, in a pretty-printed
form, are as follows (randomized names in capitals):</p>
<pre><code class="language-javascript"><span class="hljs-keyword">var</span> <span class="hljs-title class_">SetInner</span> = <span class="hljs-keyword">function</span>(<span class="hljs-params">C, V, E</span>) {
<span class="hljs-keyword">var</span> L = E.<span class="hljs-title function_">getElementsByClassName</span>(C);
<span class="hljs-keyword">for</span> (<span class="hljs-keyword">var</span> i = <span class="hljs-number">0</span>; i &lt; L.<span class="hljs-property">length</span>; i++) {
L[i].<span class="hljs-property">innerHTML</span> = V;
};
};
<span class="hljs-keyword">var</span> <span class="hljs-title class_">Splice</span> = <span class="hljs-keyword">function</span>(<span class="hljs-params">C, I, E</span>) {
<span class="hljs-keyword">var</span> L = E.<span class="hljs-title function_">getElementsByClassName</span>(C);
<span class="hljs-keyword">for</span> (<span class="hljs-keyword">var</span> i = <span class="hljs-number">0</span>; i &lt; L.<span class="hljs-property">length</span>; i++) {
<span class="hljs-keyword">var</span> v = L[i].<span class="hljs-property">innerHTML</span>.<span class="hljs-title function_">split</span>(<span class="hljs-string">&#x27;&#x27;</span>);
v.<span class="hljs-title function_">splice</span>(I, <span class="hljs-number">1</span>);
L[i].<span class="hljs-property">innerHTML</span> = v.<span class="hljs-title function_">join</span>(<span class="hljs-string">&#x27;&#x27;</span>);
};
};
<span class="hljs-keyword">if</span> (!lec) {
<span class="hljs-keyword">var</span> lec = <span class="hljs-keyword">function</span>(<span class="hljs-params">a</span>) {
<span class="hljs-keyword">var</span> b = a.<span class="hljs-property">lastElementChild</span>;
<span class="hljs-keyword">if</span> (!b) <span class="hljs-keyword">return</span> a;
<span class="hljs-keyword">if</span> (b.<span class="hljs-property">lastElementChild</span>) <span class="hljs-keyword">return</span> <span class="hljs-title function_">lec</span>(b);
<span class="hljs-keyword">return</span> b;
};
};;
<span class="hljs-keyword">var</span> <span class="hljs-title class_">SwapLast</span> = <span class="hljs-keyword">function</span>(<span class="hljs-params">I, J, E</span>) {
<span class="hljs-keyword">var</span> L = E.<span class="hljs-title function_">getElementsByTagName</span>(<span class="hljs-string">&#x27;td&#x27;</span>);
<span class="hljs-keyword">var</span> X = <span class="hljs-title function_">lec</span>(L[I]);
<span class="hljs-keyword">var</span> Y = <span class="hljs-title function_">lec</span>(L[J]);
<span class="hljs-keyword">var</span> S = X.<span class="hljs-property">innerHTML</span>;
<span class="hljs-keyword">var</span> T = Y.<span class="hljs-property">innerHTML</span>;
X.<span class="hljs-property">innerHTML</span> = T;
Y.<span class="hljs-property">innerHTML</span> = S;
};
</code></pre>
<p>The two redundant semicolons after the <code>if</code> are not a typo in this
document, but apparently a mistake made by the original programmer.</p>
<p>Next, a function called <code>a</code> is defined that calls the above three to
perform the deobfuscation, and also schedules elements to be revealed
using a copy of a code snippet for each. Finally, <code>a</code> is scheduled to
be executed as soon as the user agent has <a href="https://developer.mozilla.org/docs/Web/API/Window/DOMContentLoaded_event">finished building the
document tree</a>. The general structure, using the same conventions
as above, is</p>
<pre><code class="language-javascript"><span class="hljs-keyword">var</span> a = <span class="hljs-keyword">function</span>(<span class="hljs-params"></span>) {
<span class="hljs-comment">/* ... */</span>
<span class="hljs-keyword">var</span> X = <span class="hljs-variable language_">document</span>.<span class="hljs-title function_">getElementsByClassName</span>(<span class="hljs-string">&#x27;X&#x27;</span>)[<span class="hljs-number">0</span>];
X.<span class="hljs-property">style</span>.<span class="hljs-property">position</span> = <span class="hljs-string">&#x27;relative&#x27;</span>;
<span class="hljs-built_in">setTimeout</span>(<span class="hljs-keyword">function</span>(<span class="hljs-params"></span>) {
X.<span class="hljs-property">style</span>.<span class="hljs-title function_">removeProperty</span>(<span class="hljs-string">&#x27;opacity&#x27;</span>);
X.<span class="hljs-property">style</span>.<span class="hljs-title function_">removeProperty</span>(<span class="hljs-string">&#x27;visibility&#x27;</span>);
}, <span class="hljs-number">700</span>);
<span class="hljs-comment">/* ... */</span>
<span class="hljs-title function_">F</span>(<span class="hljs-string">&#x27;Y&#x27;</span>, <span class="hljs-string">&#x27;data&#x27;</span>, X);
<span class="hljs-comment">/* ... */</span>
};
<span class="hljs-variable language_">document</span>.<span class="hljs-title function_">addEventListener</span>(<span class="hljs-string">&#x27;DOMContentLoaded&#x27;</span>, a);
</code></pre>
</div>
<div class="content"><div class='highlight'><pre>IGNORE = re_compile(<span class="hljs-string">r&quot; +|;|if *\(!lec\) *\{[^}]*\{[^}]*\}[^}]*\}|var *a *= *function\(\) *\{&quot;</span>)
SETINNER = re_compile(<span class="hljs-string">r&quot;var +([a-z_]+) *= *function\([a-z_]+, *[a-z_]+, *[a-z_]+\) *\{[^}]*\{[^}]*innerHTML *= *[a-z_]+ *;[^}]*\}[^}]*\} *;&quot;</span>)
SPLICE = re_compile(<span class="hljs-string">r&quot;var +([a-z_]+) *= *function\([a-z_]+, *[a-z_]+, *[a-z_]+\) *\{[^}]*\{[^}]*splice[^}]*\}[^}]*\} *;&quot;</span>)
SWAPLAST = re_compile(<span class="hljs-string">r&quot;var +([a-z_]+) *= *function\([a-z_]+, *[a-z_]+, *[a-z_]+\) *\{[^}]*getElementsByTagName\(&#x27;td&#x27;\)[^}]*\} *;&quot;</span>)
REVEAL = re_compile(<span class="hljs-string">r&quot;var +([a-z_]+) *= *document\.getElementsByClassName[^}]*setTimeout\(function *\(\) *\{[^}]*\}[^)]*\) *;&quot;</span>)
CALL = re_compile(<span class="hljs-string">r&quot;([a-z_]*)\((&#x27;[^\&#x27;]*&#x27;), *(-?[0-9]*|&#x27;[^\&#x27;]*&#x27;), *([a-z_]*)\) *;&quot;</span>)
QUIT = re_compile(<span class="hljs-string">r&quot;\} *; *document\.addEventListener\(&#x27;DOMContentLoaded&#x27;, *a\) *;&quot;</span>)
setinner = splice = swaplast = <span class="hljs-literal">None</span>
i = <span class="hljs-number">0</span>
<span class="hljs-keyword">while</span> <span class="hljs-literal">True</span>:
<span class="hljs-keyword">if</span> m := IGNORE.match(js, i):
i = m.end()
<span class="hljs-keyword">elif</span> m := SETINNER.match(js, i):
i = m.end(); setinner = m[<span class="hljs-number">1</span>]
<span class="hljs-keyword">elif</span> m := SPLICE.match(js, i):
i = m.end(); splice = m[<span class="hljs-number">1</span>]
<span class="hljs-keyword">elif</span> m := SWAPLAST.match(js, i):
i = m.end(); swaplast = m[<span class="hljs-number">1</span>]
<span class="hljs-keyword">elif</span> m := REVEAL.match(js, i):
i = m.end(); reveal(*m.groups())
<span class="hljs-keyword">elif</span> m := CALL.match(js, i):
i = m.end(); func = m[<span class="hljs-number">1</span>]
<span class="hljs-keyword">if</span> func == setinner:
dosetinner(*m.groups()[<span class="hljs-number">1</span>:])
<span class="hljs-keyword">elif</span> func == splice:
dosplice(*m.groups()[<span class="hljs-number">1</span>:])
<span class="hljs-keyword">elif</span> func == swaplast:
doswaplast(*m.groups()[<span class="hljs-number">1</span>:])
<span class="hljs-keyword">else</span>:
<span class="hljs-keyword">assert</span> <span class="hljs-keyword">not</span> <span class="hljs-string">&quot;possible&quot;</span>
<span class="hljs-keyword">elif</span> m := QUIT.match(js, i):
i = m.end()
<span class="hljs-keyword">assert</span> <span class="hljs-keyword">not</span> js[i:].strip()
<span class="hljs-keyword">break</span>
<span class="hljs-keyword">else</span>:
<span class="hljs-keyword">assert</span> <span class="hljs-keyword">not</span> <span class="hljs-string">&quot;possible&quot;</span></pre></div></div>
</li>
<li id="section-15">
<div class="annotation">
<div class="sswrap ">
<a class="ss" href="#section-15">&#x00a7;</a>
</div>
<h2 id="style-semantics">Style semantics</h2>
</div>
</li>
<li id="section-16">
<div class="annotation">
<div class="sswrap ">
<a class="ss" href="#section-16">&#x00a7;</a>
</div>
<p>While a full implementation of CSS, even restricted to the properties
listed above, would be unbearably complicated, the deobfuscation styles
do not make use of any particularly tricky features like the
interpretation of <code>top</code> and <code>left</code> with <code>position: static</code>, cascading or
inheritance. Furthermore, either a class reference or an inline style
can be used on any given element, but not both. A na&iuml;ve quasi-CSS
processor is thus sufficient.</p>
<p>The processor needs to run after JavaScript execution. It should add
the generated content to the document (unless it is hidden by the
accompanying styles), remove nodes which are hidden, and decipher all
text under nodes which have the scrambled font applied to them.
Generated content is never scrambled.</p>
</div>
<div class="content"><div class='highlight'><pre><span class="hljs-keyword">for</span> cls, style <span class="hljs-keyword">in</span> afters.items():
styles.setdefault(cls, <span class="hljs-literal">None</span>)
<span class="hljs-keyword">if</span> <span class="hljs-keyword">not</span> style.visible <span class="hljs-keyword">or</span> <span class="hljs-keyword">not</span> style.content:
<span class="hljs-keyword">continue</span>
<span class="hljs-keyword">assert</span> <span class="hljs-keyword">not</span> style.scramble
<span class="hljs-keyword">for</span> node <span class="hljs-keyword">in</span> byclass.get(cls, ()):
<span class="hljs-keyword">if</span> children := <span class="hljs-built_in">list</span>(node):
children[-<span class="hljs-number">1</span>].tail = ((children[-<span class="hljs-number">1</span>].tail <span class="hljs-keyword">or</span> <span class="hljs-string">&#x27;&#x27;</span>) +
style.content)
<span class="hljs-keyword">else</span>:
node.text = (node.text <span class="hljs-keyword">or</span> <span class="hljs-string">&#x27;&#x27;</span>) + style.content
<span class="hljs-keyword">def</span> <span class="hljs-title function_">applystyle</span>(<span class="hljs-params">node, style</span>):
<span class="hljs-keyword">if</span> <span class="hljs-keyword">not</span> style.visible:
node.drop_tree()
<span class="hljs-keyword">elif</span> style.scramble:
node.text = unscramble(node.text)
<span class="hljs-keyword">for</span> n <span class="hljs-keyword">in</span> node.iterdescendants():
<span class="hljs-keyword">if</span> n.text <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>:
n.text = unscramble(n.text)
<span class="hljs-keyword">if</span> n.tail <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>:
n.tail = unscramble(n.tail)
<span class="hljs-keyword">for</span> node <span class="hljs-keyword">in</span> container.xpath(<span class="hljs-string">&#x27;.//*[@style]&#x27;</span>):
<span class="hljs-keyword">assert</span> <span class="hljs-built_in">all</span>(styles.get(cls) <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">for</span> cls <span class="hljs-keyword">in</span> node.classes)
style = parsestyle(node.get(<span class="hljs-string">&#x27;style&#x27;</span>))
<span class="hljs-keyword">if</span> node <span class="hljs-keyword">in</span> revealed:
<span class="hljs-keyword">assert</span> <span class="hljs-keyword">not</span> style.scramble
<span class="hljs-keyword">continue</span>
<span class="hljs-keyword">del</span> node.attrib[<span class="hljs-string">&#x27;style&#x27;</span>]
applystyle(node, style)
<span class="hljs-keyword">for</span> cls, style <span class="hljs-keyword">in</span> styles.items():
<span class="hljs-keyword">for</span> node <span class="hljs-keyword">in</span> byclass.get(cls, ()):
node.classes.remove(cls)
<span class="hljs-keyword">if</span> style <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>:
applystyle(node, style)</pre></div></div>
</li>
<li id="section-17">
<div class="annotation">
<div class="sswrap ">
<a class="ss" href="#section-17">&#x00a7;</a>
</div>
<h2 id="cleanup">Cleanup</h2>
</div>
</li>
<li id="section-18">
<div class="annotation">
<div class="sswrap ">
<a class="ss" href="#section-18">&#x00a7;</a>
</div>
<p>After both the JavaScript execution and the CSS processing are complete,
the randomly-generated classes and the inline styles can be discarded
for clarity, as they do not affect the user experience in any other way.</p>
<p>A lot of both empty and nonempty <code>span</code> elements will be left as well,
which can also be deleted if their content is retained. The rest of
the markup is significant.</p>
</div>
<div class="content"><div class='highlight'><pre><span class="hljs-keyword">for</span> node <span class="hljs-keyword">in</span> container.xpath(<span class="hljs-string">&#x27;.//span&#x27;</span>):
<span class="hljs-keyword">if</span> <span class="hljs-keyword">not</span> node.attrib:
node.drop_tag()</pre></div></div>
</li>
<li id="section-19">
<div class="annotation">
<div class="sswrap ">
<a class="ss" href="#section-19">&#x00a7;</a>
</div>
<h2 id="all-done">All done!</h2>
</div>
</li>
<li id="section-20">
<div class="annotation">
<div class="sswrap ">
<a class="ss" href="#section-20">&#x00a7;</a>
</div>
<p>The example code writes (only) the deobfuscated data to standard output
according to the system encoding and line ending convention.</p>
</div>
<div class="content"><div class='highlight'><pre>stdout.write(tostring(container, encoding=<span class="hljs-string">&#x27;unicode&#x27;</span>, method=<span class="hljs-string">&#x27;html&#x27;</span>))</pre></div></div>
</li>
</ul>
</div>
</body>
</html>

Data obfuscation on the Russian Central Election Commission website

Unum facit, aliud vastat

Since the early 2000s, the Central Election Commission of Russia, the ultimate arbiter of Russian elections, has published detailed election results and related data down to full records from each polling station. For most of the existence of the service, the only functional output format was HTML, so researchers who studied that data (see, e.g., Kobak et al. (2016), Enikolopov et al. (2013), and other sources referenced in Shen's living review) mostly had to resort to scraping the web site, which was never particularly pleasant. Nevertheless, useful (if politically provocative) results were obtained, and collections of more easily accessible datasets were gradually being amassed.

The winds first started changing after the 2018 gubernatorial elections. Amid allegations of widespread fraud, original versions of election records that were later modified were discovered to be readily available in the public system. A hasty frontend patch appears to have been applied several days later to bar access to any and all addresses containing the string version, though it could be easily bypassed using standard alternate encoding techniques.

In December 2019, the second revision of the regulations governing the publication of election data contained a subtly different wording that excluded any mention of automated access, this being almost the only change to the preceding version from 2010. A subpar but inconvenient mandatory CAPTCHA was imposed shortly afterwards on all visitors and subsequently underwent several rounds of relaxation and tightening after analysts cried foul. The next year saw an introduction of IP-address–based rate limiting of about 100 requests per hour, invisible to normal visitors but thoroughly thwarting any attempts at real-time large-scale downloads without the use of proxies (obtaining the complete precinct-level data for a single federal election requires fetching almost 3000 summary reports, and gathering supplementary information such as early voting numbers can require visiting the pages of all 98000 precincts).

The third revision, put in effect shortly before the federal parliamentary elections of 17–19 September 2021, contained a further change of wording to exclude mentions of users being able to search or copy the data to their machines and to formally require "protection" from automated tools. This change was at first thought to be a largely symbolic formalization of the existing practice and declaration of intent, until, on 19 September, it came to light that the Commission introduced a form of obfuscation onto its web pages. In a graphical browser with JavaScript enabled, the results appeared correctly, but attempting to copy them into the clipboard yielded gibberish — or worse, wrong numeric values — even though no interception of clipboard events was taking place. Direct inspection of the HTML markup revealed different and even more mangled numbers, misplaced table cells, garbage characters in alphabetic strings, and a soup of seemingly meaningless nested span elements.

This note describes the techniques used by the code on the page to deobfuscate the mangled markup for the user's consumption. It also doubles as example Python code for performing that deobfuscation without a full web browser. It was written by Alexander Shpilkin and describes research current as of 21 September. Basic knowledge of web technology is required, but the Python can be ignored without harm.

The canonical distribution point for this document as of the present version is on GitHub, which hosts the Markdown source, the extracted Python code, and the web version generated by Docco. All of this is freely redistributable and modifiable without legal restrictions as per the Creative Commons CC0 1.0 public domain dedication, although the author asks you to exercise your judgment regarding the degree of dissemination in light of the Commission's hostile behaviour and to follow common-sense attribution practices.

#!/usr/bin/env python3
#{ SPDX-License-Identifier: CC0-1.0 }

from collections import namedtuple
from fontTools.ttLib import TTFont
from io import BytesIO
from lxml.html import document_fromstring
from lxml.etree import tostring
from re import finditer, compile as re_compile
from requests import get
from sys import stdin, stdout

Outline

The Central Election Commission website serves its pages in the Windows-1251 character encoding with Windows line endings. The example code accepts the HTML markup of the page to be deobfuscated on standard input.

# The page arrives in Windows-1251 with Windows line endings; newline=None
# makes the text layer translate those line endings while reading.
stdin.reconfigure(encoding='cp1251', newline=None)
markup = stdin.read()
tree = document_fromstring(markup)

Apart from the HTML markup for the data table, data necessary for deobfuscation include the CSS stylesheet and JavaScript code that are output on a single line just afterwards, as well as an external font file referenced in the stylesheet. These deobfuscate the data for the user's consumption by applying the following largely independent transformations:

  1. Permute, replace, or delete some of the text using JavaScript.

  2. Hide some HTML elements using either references to styles in the stylesheet or inline styles. As an additional obfuscation measure, some of the style declarations appear like they should result in hiding the element, but are in fact illegal CSS ignored by the user agent in accordance with the specification.

  3. Insert additional text after some HTML elements using CSS generated content. As an additional obfuscation measure, some of this text is hidden using the techniques of the previous point.

  4. Apply a simple substitution cipher to some of the text by displaying it in a special font that has its characters in the wrong positions. This is only done for strings of digits, and only a single font is used on a given page, although it changes if the page is reloaded.

# Exactly one element carries the class "show"; it wraps the data table
# together with the stylesheet and the script that deobfuscate it.
container, = tree.xpath('//*[contains(concat(" ", @class, " "), " show ")]')

# Detach the single <style> element and keep only its text.
style_node, = container.xpath('.//style')
style_node.drop_tree()
css = str(style_node.text)

# Same for the single <script> element.
script_node, = container.xpath('.//script')
script_node.drop_tree()
js = str(script_node.text)

# Index of class name -> elements below the container carrying that class,
# in document order.
byclass = {}
for element in container.xpath('.//*[@class]'):
	for name in element.classes:
		if name not in byclass:
			byclass[name] = []
		byclass[name].append(element)

Style syntax

There are three main things that interest the deobfuscator in a style declaration: whether it hides the element, whether it applies the scrambled font to it, and, for declarations applying to the ::after pseudoelement, what the element content will be set to. Additionally, the URL of the font has to be extracted from the @font-face declaration. As the CSS cascade is effectively not used, which specific properties are used to attain these ends can be ignored.

Property declarations used for hiding are:

  • display: none; other valid values such as inline and inline-block, as well as the illegal inlineblock, occur as well and must be ignored;
  • top: -9...9px, left: -9...9px; the value 0 or 0px and values using the illegal unit xp occur as well and must be ignored;
  • z-index: -9...9; the value 1 occurs as well and must be ignored;
  • font-size: 0; the default value inherit occurs as well and must be ignored;
  • opacity: 0; the illegal value 0px occurs as well and must be ignored;
  • width: 0 or 0px, height: 0 or 0px; values using the illegal unit xp occur here as well and must be ignored;
  • color: white or transparent; the default value inherit occurs as well and must be ignored;
  • visibility: hidden; the illegal value true occurs as well and must be ignored.

As we can see, a relatively substantial effort seems to have been put into thwarting simple substring matching, but no really broken CSS is output, so matching each declaration completely against a good enough set of regular expressions while ignoring all unrecognized declarations is sufficient.

The font file is provided in multiple formats for compatibility, including TTF, OTF, EOT, WOFF and WOFF2, but all of these contain equivalent data.

# What the deobfuscator needs to know about a block of declarations:
# whether the element stays visible, whether the scrambled font is applied,
# and the generated content (if any).
Style = namedtuple('Style',
                   ['visible', 'scramble', 'content'],
                   defaults=(True, False, None))

# Each pattern is matched against one complete declaration (fullmatch), so
# look-alike decoys with illegal values or units are rejected.
HIDE    = re_compile(r"display: *none|(top|left): *-9+px|z-index: *-9+|(font-size|opacity): *0|(width|height): *0(px)?|color: *(white|transparent)|visibility: *hidden")
CONTENT = re_compile(r"content: *'([^\']*)'")
FONTFAM = re_compile(r'font-family: *"([^\"]*)"( *!important)?')
FONTURL = re_compile(r'src:.* url\("\./([^\"]*\.ttf)"\).*')

# Font family name and relative TTF URL, recorded by parsestyle() as it
# encounters them.
fontfam = None
fonturl = None

def parsestyle(decs):
	"""Summarize a ';'-separated list of CSS declarations as a Style.

	Unrecognized declarations are ignored.  As side effects, the first
	font family seen is remembered in the global ``fontfam`` (every later
	one is asserted to be the same), and the TTF URL of an ``src``
	declaration is stored in the global ``fonturl`` (asserted to happen
	at most once).
	"""
	global fontfam, fonturl
	hidden = False
	scrambled = False
	content = None
	for piece in decs.split(';'):
		dec = piece.strip()
		if HIDE.fullmatch(dec) is not None:
			hidden = True
			continue
		found = CONTENT.fullmatch(dec)
		if found is not None:
			content = found.group(1)
			continue
		found = FONTFAM.fullmatch(dec)
		if found is not None:
			if fontfam is None:
				fontfam = found.group(1)
			# Only a single scrambled font is expected per page.
			assert found.group(1) == fontfam
			scrambled = True
			continue
		found = FONTURL.fullmatch(dec)
		if found is not None:
			assert fonturl is None
			fonturl = found.group(1)
	return Style(visible=not hidden, scramble=scrambled, content=content)

Each rule in the stylesheet has a selector of the form .T .C or .T .C::after, where T is the randomly named class assigned to the data table as a whole and C is a randomly named class assigned to one or more of its children. There is also a single @font-face rule to point the browser to the font file.

SELECTOR = re_compile(r"\.([a-z_]*(::after)?)")

# styles: class name -> Style applied to the element itself.
# afters: class name -> Style of its ::after generated content.
styles, afters = {}, {}
end = 0
for rule in finditer(r' *([-@a-z_.: ]+?) *\{([^}]*)\}', css):
	# The rules must tile the stylesheet with nothing unrecognized between.
	assert rule.start() == end
	end = rule.end()
	selector, decs = rule.groups()
	# @font-face is parsed too: that is how the font URL gets recorded.
	style = parsestyle(decs)
	if selector == '@font-face':
		continue

	# Every other selector has the form ".T .C" or ".T .C::after".
	parent, child = selector.split()
	assert parent[0] == '.' and len(byclass.get(parent[1:], ())) == 1
	matched = SELECTOR.fullmatch(child)
	assert matched is not None
	name = matched[1]
	if name.endswith('::after'):
		name = name.removesuffix('::after')
		target = afters
	else:
		target = styles
	assert name not in target
	target[name] = style

# Nothing but whitespace may follow the last rule.
assert not css[end:].strip()

Font

The server generates a new URL for the font on each request, but the file it refers to only appears to change once per several seconds. The generated URL is relative to the scheme and domain of the page URL, which is usually http://www.R.vybory.izbirkom.ru/ for some region R, but, as always, all of the region-specific domains point to exactly the same data as http://www.vybory.izbirkom.ru/.

It is unlikely that the unique URLs will be retained forever, so care must be taken when scraping the obfuscated HTML to extract the URL for and save the font file as well, as the substitution doesn't seem to be recoverable from the URL string alone. A simple if technically incorrect regular expression such as url\("\./([^\"]*\.ttf)"\) can be used to extract the necessary URL from the markup. The example code ignores this potential problem and downloads the font during execution.

Sometime on 20 September, the web server started rejecting requests mentioning some common HTTP automation tools (in particular, curl and requests, but not wget) in their User-Agent HTTP header with a 403 Forbidden status code; changing the header value to Mozilla/5.0 (which is contained in the values sent by popular web browsers) appears to be enough to avoid the ban.

A number of advanced approaches could have been used to recover the substitution from the font file, from matching the character contours exactly against the original font (PT Sans by Paratype) to rendering the characters into a bitmap and applying perceptual hashes or OCR. None of this turns out to be necessary, as while the glyph IDs, character positions, and glyph order in the font file all appear to be scrambled, the PostScript glyph names (which for European digits are zero, one, etc.) are intact.

# The font URL is only known if the stylesheet contained a usable
# @font-face rule; fail clearly instead of on the string concatenation.
assert fonturl is not None
# The server rejects the default User-Agent of requests with 403 Forbidden,
# so pose as a browser.
response = get('http://www.vybory.izbirkom.ru/' + fonturl,
               headers={'User-Agent': 'Mozilla/5.0'})
# Stop on an HTTP error (such as that 403) rather than handing an error page
# to the font parser.
response.raise_for_status()
ttf = TTFont(BytesIO(response.content))
# The glyph names are intact, so invert the character map (code point ->
# glyph name) and look up which character now displays each digit.
codepoints = {glyph: code for code, glyph in ttf.getBestCmap().items()}
subst = {chr(codepoints[name]): str(digit) for digit, name in
         enumerate('zero one two three four five six seven eight nine'.split())}

def unscramble(s):
	"""Return ``s`` with every character replaced by the digit it displays as.

	Raises KeyError for a character that is not in the ``subst`` table.
	"""
	return ''.join(map(subst.__getitem__, s))

JavaScript semantics

The three possible operations applied from JavaScript are fixed, and no attempt seems to have been made at obfuscating their implementation. The original names of the transformations are unknown, so code names corresponding to their behaviour are used below.

  • The operation setInner(C, V, E) replaces the innerHTML of each element with class C (passed as a single-quoted JavaScript string literal) with V (passed the same way). The scope of the operation is E, which is invariably the whole data table (passed as a variable with name identical to the randomly-generated class name of the table). The element can only contain text with no markup both before and after the operation, and the text may or may not be numeric.

  • The operation splice(C, I, E) deletes the character at zero-based position I (passed as a decimal JavaScript integer literal) in the innerHTML of each element with class C (passed as above). If I is negative, then -1 refers to the last character, -2 to the second-to-last character, etc. The scope of the operation is E, as above. The character is always inside the text either before or after any child elements that may be present.

  • The operation swapLast(I, J, E) exchanges the text content (or, equivalently, innerHTML) of the last leaf children of the td elements with zero-based numbers I and J (passed, likely to encourage confusion with setInner, as single-quoted JavaScript strings containing non-negative decimal numerals) during a preorder traversal of the element tree. The scope of the operation is E, as above. Here, the last leaf child of an element is the last node encountered during a preorder traversal that contains only text.

Several successive references can be made to the same element using the same class name.

Finally, until the deobfuscation is complete, several elements including the data table are completely hidden by inline styles, probably so that the user does not see the document change. They are then found by their randomly-generated class names and revealed after a short delay. Those are the elements that can be passed as the scope E to the operations above.

def string(src):
	"""Return the contents of a single-quoted JavaScript string literal.

	Asserts that ``src`` is wrapped in single quotes and contains no
	backslash, so no escape sequences need decoding.
	"""
	quoted = src[0] == "'" and src[-1] == "'"
	assert quoted and "\\" not in src
	return src[1:-1]

# Elements the script un-hides after a delay; these are the ones that can be
# passed as the scope argument of the operations.
revealed = set()
def reveal(cls):
	"""Un-hide the one element with class ``cls`` and record it in ``revealed``.

	The hiding declarations are stripped from its inline style; any other
	declarations are kept, and the attribute is dropped if none remain.
	The class is also registered in ``styles`` as carrying no style.
	"""
	styles.setdefault(cls, None)
	node, = byclass.get(cls, ())
	kept = [dec for dec in node.attrib.pop('style').split(';')
	        if HIDE.fullmatch(dec.strip()) is None]
	decs = ';'.join(kept)
	if decs:
		node.set('style', decs)
	revealed.add(node)

def dosetinner(cls, val, elt):
	"""Replace the text of every element with class ``cls`` by ``val``.

	``cls`` and ``val`` are single-quoted JavaScript literals; ``elt`` is
	the scope variable, whose name is the class of a revealed element.
	The value must be plain text and the targets must have no children.
	"""
	cls = string(cls)
	val = string(val)
	scope = byclass[elt][0]
	assert scope in revealed
	styles.setdefault(cls, None)
	# No markup or character references: the value can be stored as text.
	assert not ('<' in val or '&' in val)
	for target in byclass.get(cls, ()):
		assert len(target) == 0
		target.text = val

def dosplice(cls, idx, elt):
	"""Delete one character from every element with class ``cls``.

	``idx`` is a decimal integer literal; negative values count from the
	end.  With children present and a negative index, the character is
	taken from the text after the last child, otherwise from the text
	before the first.  ``elt`` names the scope, which must be revealed.
	"""
	cls = string(cls)
	idx = int(idx)
	assert byclass[elt][0] in revealed
	styles.setdefault(cls, None)
	for node in byclass.get(cls, ()):
		children = list(node)
		in_tail = bool(children) and idx < 0
		text = children[-1].tail if in_tail else node.text
		assert -len(text) <= idx < len(text)
		# Normalize to a non-negative position before cutting.
		pos = idx % len(text)
		text = text[:pos] + text[pos + 1:]
		if in_tail:
			children[-1].tail = text
		else:
			node.text = text

def lec(node):
	"""Return the last leaf under ``node``: keep descending into the last
	child until an element without children is reached."""
	while True:
		children = list(node)
		if not children:
			return node
		node = children[-1]

def doswaplast(fst, snd, elt):
	"""Exchange the text of the last leaves of two table cells.

	``fst`` and ``snd`` are single-quoted literals holding the zero-based
	numbers of the ``td`` elements in document order within the scope
	``elt``, which must be the single revealed element of that class.
	"""
	i = int(string(fst))
	j = int(string(snd))
	table, = byclass[elt]
	assert table in revealed
	cells = table.xpath('.//td')  # FIXME: precompile the XPath?
	first, second = lec(cells[i]), lec(cells[j])
	held = first.text
	first.text = second.text
	second.text = held

JavaScript syntax

The JavaScript code of the deobfuscator has a straightforward structure. First, functions implementing some or all of the operations above, as well as an auxiliary function for finding the last leaf child of an element, are defined as necessary and in random order. The auxiliary function is always called lec, but the rest have random names which an independent deobfuscator needs to extract and remember. The implementations used are fixed except for some of the variable names (so can be distinguished using regular expressions) and, in a pretty-printed form, are as follows (randomized names in capitals):

var SetInner = function(C, V, E) {
	var L = E.getElementsByClassName(C);
	for (var i = 0; i < L.length; i++) {
		L[i].innerHTML = V;
	};
};

var Splice = function(C, I, E) {
	var L = E.getElementsByClassName(C);
	for (var i = 0; i < L.length; i++) {
		var v = L[i].innerHTML.split('');
		v.splice(I, 1);
		L[i].innerHTML = v.join('');
	};
};

if (!lec) {
	var lec = function(a) {
		var b = a.lastElementChild;
		if (!b) return a;
		if (b.lastElementChild) return lec(b);
		return b;
	};
};;

var SwapLast = function(I, J, E) {
	var L = E.getElementsByTagName('td');
	var X = lec(L[I]);
	var Y = lec(L[J]);
	var S = X.innerHTML;
	var T = Y.innerHTML;
	X.innerHTML = T;
	Y.innerHTML = S;
};

The two redundant semicolons after the if are not a typo in this document, but apparently a mistake made by the original programmer.

Next, a function called a is defined that calls the above three to perform the deobfuscation, and also schedules elements to be revealed using a copy of a code snippet for each. Finally, a is scheduled to be executed as soon as the user agent has finished building the document tree. The general structure, using the same conventions as above, is

var a = function() {
	/* ... */
	var X = document.getElementsByClassName('X')[0];
	X.style.position = 'relative';
	setTimeout(function() {
		X.style.removeProperty('opacity');
		X.style.removeProperty('visibility');
	}, 700);
	/* ... */
	F('Y', 'data', X);
	/* ... */
};
document.addEventListener('DOMContentLoaded', a);
# Patterns for the top-level fragments of the script.  Each is anchored at
# the current position with .match(), and they are tried in the order used
# by the loop below.

# Fragments to skip: spaces, stray semicolons, the `if (!lec) {...}` helper
# definition, and the opening of `var a = function() {`.
IGNORE   = re_compile(r" +|;|if *\(!lec\) *\{[^}]*\{[^}]*\}[^}]*\}|var *a *= *function\(\) *\{")
# Definitions of the three operations, told apart by a characteristic part
# of their body; group 1 is the randomized function name.
SETINNER = re_compile(r"var +([a-z_]+) *= *function\([a-z_]+, *[a-z_]+, *[a-z_]+\) *\{[^}]*\{[^}]*innerHTML *= *[a-z_]+ *;[^}]*\}[^}]*\} *;")
SPLICE   = re_compile(r"var +([a-z_]+) *= *function\([a-z_]+, *[a-z_]+, *[a-z_]+\) *\{[^}]*\{[^}]*splice[^}]*\}[^}]*\} *;")
SWAPLAST = re_compile(r"var +([a-z_]+) *= *function\([a-z_]+, *[a-z_]+, *[a-z_]+\) *\{[^}]*getElementsByTagName\('td'\)[^}]*\} *;")
# The snippet that looks an element up by class and schedules its reveal;
# group 1 is the variable name, which is passed to reveal() as the class.
REVEAL   = re_compile(r"var +([a-z_]+) *= *document\.getElementsByClassName[^}]*setTimeout\(function *\(\) *\{[^}]*\}[^)]*\) *;")
# A call F('..', arg, scope): groups are the function name, the first
# argument (quoted), the second argument (integer or quoted), and the scope.
CALL     = re_compile(r"([a-z_]*)\(('[^\']*'), *(-?[0-9]*|'[^\']*'), *([a-z_]*)\) *;")
# The end of `a` followed by its registration as a DOMContentLoaded handler.
QUIT     = re_compile(r"\} *; *document\.addEventListener\('DOMContentLoaded', *a\) *;")

# Randomized names of the three operations, filled in as their definitions
# are encountered.
setinner = splice = swaplast = None
# Current offset into the script text.
i = 0
while True:
	if m := IGNORE.match(js, i):
		i = m.end()
	elif m := SETINNER.match(js, i):
		i = m.end(); setinner = m[1]
	elif m := SPLICE.match(js, i):
		i = m.end(); splice = m[1]
	elif m := SWAPLAST.match(js, i):
		i = m.end(); swaplast = m[1]
	elif m := REVEAL.match(js, i):
		i = m.end(); reveal(*m.groups())
	elif m := CALL.match(js, i):
		i = m.end(); func = m[1]
		# Dispatch on the callee name; the remaining groups are the arguments.
		if func == setinner:
			dosetinner(*m.groups()[1:])
		elif func == splice:
			dosplice(*m.groups()[1:])
		elif func == swaplast:
			doswaplast(*m.groups()[1:])
		else:
			# Call to a function that is none of the known operations.
			assert not "possible"
	elif m := QUIT.match(js, i):
		i = m.end()
		# Nothing but whitespace may follow the handler registration.
		assert not js[i:].strip()
		break
	else:
		# Unrecognized script fragment at offset i.
		assert not "possible"

Style semantics

While a full implementation of CSS, even restricted to the properties listed above, would be unbearably complicated, the deobfuscation styles do not make use of any particularly tricky features like the interpretation of top and left with position: static, cascading or inheritance. Furthermore, either a class reference or an inline style can be used on any given element, but not both. A naïve quasi-CSS processor is thus sufficient.

The processor needs to run after JavaScript execution. It should add the generated content to the document (unless it is hidden by the accompanying styles), remove nodes which are hidden, and decipher all text under nodes which have the scrambled font applied to them. Generated content is never scrambled.

# Append visible, nonempty ::after content to the end of each element of the
# class.  Generated content is never scrambled.
for cls, after in afters.items():
	styles.setdefault(cls, None)
	if not (after.visible and after.content):
		continue
	assert not after.scramble
	for node in byclass.get(cls, ()):
		kids = list(node)
		if kids:
			# The end of the element is the tail of its last child.
			last = kids[-1]
			last.tail = (last.tail or '') + after.content
		else:
			node.text = (node.text or '') + after.content

def applystyle(node, style):
	"""Apply the effect of ``style`` to ``node`` in place.

	A hidden node is removed from the tree together with its content.
	Otherwise, if the scrambled font applies, all text inside the node is
	deciphered: its own leading text plus the text and tail of every
	descendant.  The node's own tail lies outside it and is left alone.
	"""
	if not style.visible:
		node.drop_tree()
	elif style.scramble:
		# The leading text is None when the node starts with a child element
		# (or is empty), exactly as for the descendants handled below.
		if node.text is not None:
			node.text = unscramble(node.text)
		for n in node.iterdescendants():
			if n.text is not None:
				n.text = unscramble(n.text)
			if n.tail is not None:
				n.tail = unscramble(n.tail)

# Inline styles.  An element has either an inline style or a styled class,
# never both.
for styled in container.xpath('.//*[@style]'):
	assert not any(styles.get(name) is not None for name in styled.classes)
	inline = parsestyle(styled.get('style'))
	if styled in revealed:
		# Revealed elements keep whatever remains of their inline style.
		assert not inline.scramble
	else:
		styled.attrib.pop('style')
		applystyle(styled, inline)

# Class styles.  Every known random class is stripped from its elements;
# classes recorded with None carry no style of their own.
for cls, style in styles.items():
	members = byclass.get(cls, ())
	for node in members:
		node.classes.remove(cls)
		if style is None:
			continue
		applystyle(node, style)

Cleanup

After both the JavaScript execution and the CSS processing are complete, the randomly-generated classes and the inline styles can be discarded for clarity, as they do not affect the user experience in any other way.

A lot of both empty and nonempty span elements will be left as well, which can also be deleted if their content is retained. The rest of the markup is significant.

# Unwrap every span that has no attributes left, keeping its content.
for span in container.xpath('.//span'):
	if len(span.attrib) == 0:
		span.drop_tag()

All done!

The example code writes (only) the deobfuscated data to standard output according to the system encoding and line ending convention.

# Serialize only the deobfuscated container, as HTML text.
html = tostring(container, encoding='unicode', method='html')
stdout.write(html)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment