Skip to content

Instantly share code, notes, and snippets.

@mjpieters
Last active December 15, 2023 11:32
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mjpieters/86b0d152bb51d5f5979346d11005588b to your computer and use it in GitHub Desktop.
Save mjpieters/86b0d152bb51d5f5979346d11005588b to your computer and use it in GitHub Desktop.
Python sourcemap parsing

Sourcemap processing in Python

This gist contains two Python modules:

  • sourcemap: a module to parse and generate JavaScript source maps
  • base64vlq: code to decode and encode base64 VLQ sequences, an encoding used in source maps.

License

The code is licensed under the terms of the MIT license, included in the gist.

"""Decode and encode Base64 VLQ encoded sequences
Base64 VLQ is used in source maps.
VLQ values consist of 6 bits (matching the 64 characters of the Base64
alphabet), with the most significant bit a *continuation* flag. If the
flag is set, then the next character in the input is part of the same
integer value. Multiple VLQ character sequences so form an unbounded
integer value, in little-endian order.
The *first* VLQ value consists of a continuation flag, 4 bits for the
value, and the last bit the *sign* of the integer:
+-----+-----+-----+-----+-----+-----+
| c | b3 | b2 | b1 | b0 | s |
+-----+-----+-----+-----+-----+-----+
while subsequent VLQ characters contain 5 bits of value:
+-----+-----+-----+-----+-----+-----+
| c | b4 | b3 | b2 | b1 | b0 |
+-----+-----+-----+-----+-----+-----+
For source maps, Base64 VLQ sequences can contain 1, 4 or 5 elements.
"""
from typing import Callable, Final, List, Optional, Tuple
B64CHARS: Final[bytes] = (
b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
)
B64TABLE: Final[list[Optional[int]]] = [None] * (max(B64CHARS) + 1)
for i, b in enumerate(B64CHARS):
B64TABLE[b] = i
B64ENC: Callable[[int], str] = B64CHARS.decode().__getitem__
SHIFTSIZE: Final[int] = 5
FLAG: Final[int] = 1 << 5
MASK: Final[int] = (1 << 5) - 1
def base64vlq_decode(vlqval: str) -> Tuple[int]:
"""Decode Base64 VLQ value"""
results: List[int] = []
add = results.append
shiftsize, flag, mask = SHIFTSIZE, FLAG, MASK
shift = value = 0
# use byte values and a table to go from base64 characters to integers
for v in map(B64TABLE.__getitem__, vlqval.encode("ascii")):
value += (v & mask) << shift # type: ignore # v is always int
if v & flag: # type: ignore # v is always int
shift += shiftsize
continue
# determine sign and add to results
add((value >> 1) * (-1 if value & 1 else 1))
shift = value = 0
return tuple(results)
def base64vlq_encode(*values: int) -> str:
"""Encode integers to a VLQ value"""
results: List[str] = []
add = results.append
shiftsize, flag, mask = SHIFTSIZE, FLAG, MASK
for v in values:
# add sign bit
v = (abs(v) << 1) | int(v < 0)
while True:
toencode, v = v & mask, v >> shiftsize
add(B64ENC(toencode | (v and flag)))
if not v:
break
return ''.join(results)
Copyright 2020-2022 Martijn Pieters
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""Extract generated -> source mappings"""
from bisect import bisect
from collections import defaultdict
from dataclasses import dataclass, field
from functools import partial
from itertools import count
from typing import List, Literal, Mapping, Optional, Tuple, TypedDict, Union
from base64vlq import base64vlq_decode, base64vlq_encode
class autoindex(defaultdict):
def __init__(self, *args, **kwargs) -> None:
super().__init__(partial(next, count()), *args, **kwargs)
class _JSONSourceMap(TypedDict):
version: Literal[3]
sources: List[str]
names: List[str]
mappings: str
class JSONSourceMap(_JSONSourceMap, total=False):
file: str
sourceRoot: str
sourcesContent: List[Optional[str]]
@dataclass(frozen=True)
class SourceMapping:
line: int
column: int
source: Optional[str] = None
source_line: Optional[int] = None
source_column: Optional[int] = None
name: Optional[str] = None
source_content: Optional[str] = None
def __post_init__(self) -> None:
if self.source is not None and (
self.source_line is None or self.source_column is None
):
raise TypeError(
"Invalid source mapping; missing line and column for source file"
)
if self.name is not None and self.source is None:
raise TypeError(
"Invalid source mapping; name entry without source location info"
)
@property
def content_line(self) -> Optional[str]:
try:
self.source_content.splitlines()[self.source_line] # type: ignore
except (TypeError, IndexError):
return None
@dataclass(frozen=True)
class SourceMap:
file: Optional[str]
source_root: Optional[str]
entries: Mapping[Tuple[int, int], SourceMapping]
_index: List[Tuple[int, ...]] = field(default_factory=list)
def __repr__(self) -> str:
parts = []
if self.file is not None:
parts += [f"file={self.file!r}"]
if self.source_root is not None:
parts += [f"source_root={self.source_root!r}"]
parts += [f"len={len(self.entries)}"]
return f"<SourceMap({', '.join(parts)})>"
@classmethod
def from_json(cls, smap: JSONSourceMap) -> "SourceMap":
if smap["version"] != 3:
raise ValueError("Only version 3 sourcemaps are supported")
entries, index = {}, []
spos = npos = sline = scol = 0
sources, names, contents = (
smap["sources"],
smap["names"],
smap.get("sourcesContent", []),
)
for gline, vlqs in enumerate(smap["mappings"].split(";")):
index += [[]]
if not vlqs:
continue
gcol = 0
for gcd, *ref in map(base64vlq_decode, vlqs.split(",")):
gcol += gcd
kwargs = {}
if len(ref) >= 3:
sd, sld, scd, *namedelta = ref
spos, sline, scol = spos + sd, sline + sld, scol + scd
scont = contents[spos] if len(contents) > spos else None
kwargs = {
"source": sources[spos],
"source_line": sline,
"source_column": scol,
"source_content": scont,
}
if namedelta:
npos += namedelta[0]
kwargs["name"] = names[npos]
entries[gline, gcol] = SourceMapping(line=gline, column=gcol, **kwargs)
index[gline].append(gcol)
return cls(
smap.get("file"),
smap.get("sourceRoot"),
entries,
[tuple(cs) for cs in index],
)
def to_json(self) -> JSONSourceMap:
content, mappings = [], []
sources, names = autoindex(), autoindex()
entries = self.entries
spos = sline = scol = npos = 0
for gline, cols in enumerate(self._index):
gcol = 0
mapping = []
for col in cols:
entry = entries[gline, col]
ds, gcol = [col - gcol], col
if entry.source is not None:
assert entry.source_line is not None
assert entry.source_column is not None
ds += (
sources[entry.source] - spos,
entry.source_line - sline,
entry.source_column - scol,
)
spos, sline, scol = (spos + ds[1], sline + ds[2], scol + ds[3])
if spos == len(content):
content.append(entry.source_content)
if entry.name is not None:
ds += (names[entry.name] - npos,)
npos += ds[-1]
mapping.append(base64vlq_encode(*ds))
mappings.append(",".join(mapping))
encoded: JSONSourceMap = {
"version": 3,
"sources": [s for s, _ in sorted(sources.items(), key=lambda si: si[1])],
"sourcesContent": content,
"names": [n for n, _ in sorted(names.items(), key=lambda ni: ni[1])],
"mappings": ";".join(mappings),
}
if self.file is not None:
encoded["file"] = self.file
if self.source_root is not None:
encoded["sourceRoot"] = self.source_root
return encoded
def __getitem__(self, idx: Union[int, Tuple[int, int]]) -> SourceMapping:
l: int
c: int
try:
l, c = idx # type: ignore # The exception handler deals with the int case
except TypeError:
l, c = idx, 0 # type: ignore # yes, idx is guaranteed to be an int
try:
return self.entries[l, c]
except KeyError:
# find the closest column
if not (cols := self._index[l]):
raise IndexError(idx) from None
cidx = bisect(cols, c)
return self.entries[l, cols[cidx and cidx - 1]]
@milahu
Copy link

milahu commented Mar 10, 2022

thanks for this : )

here is my version, to encode unsigned integers as base16 strings
https://github.com/milahu/random/blob/master/python/base16vlq.py

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment