Skip to content

Instantly share code, notes, and snippets.

@jjrv
Created April 9, 2024 05:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jjrv/ac616cea4c039b8c94b4fdc354e063bc to your computer and use it in GitHub Desktop.
Save jjrv/ac616cea4c039b8c94b4fdc354e063bc to your computer and use it in GitHub Desktop.
Histogram diff in TypeScript
import { LCS, Interned } from './lcs';
export class Histogram<Token> {
constructor(before: Token[], after: Token[]) {
let prefix = 0;
let beforeEnd = before.length;
let afterEnd = after.length;
// Skip common prefix.
while(prefix < beforeEnd && prefix < afterEnd && before[prefix] == after[prefix]) ++prefix;
// Skip common suffix.
while(beforeEnd > prefix && afterEnd > prefix) {
if(before[--beforeEnd] != after[--afterEnd]) {
++beforeEnd;
++afterEnd;
break;
}
}
const beforeCount = beforeEnd - prefix;
const afterCount = afterEnd - prefix;
this.before = new Uint32Array(beforeCount);
this.after = new Uint32Array(afterCount);
// Intern tokens.
const tokenMap = this.tokenMap;
let tokens = 1;
for(let pos = 0; pos < beforeCount; ++pos) {
const token = before[pos + prefix];
this.before[pos] = tokenMap.get(token) || (tokenMap.set(token, tokens), tokens++);
}
for(let pos = 0; pos < afterCount; ++pos) {
const token = after[pos + prefix];
this.after[pos] = tokenMap.get(token) || (tokenMap.set(token, tokens), tokens++);
}
// Allocate histogram buckets.
this.tokenBuckets = new Uint32Array(tokens);
this.tokenCounts = new Uint32Array(tokens + 2);
this.offsets = { data: new Uint32Array(beforeCount), pos: 0, end: 0 };
const lcs = new LCS(this);
lcs.beforePos = 0;
lcs.beforeEnd = beforeCount;
lcs.afterPos = 0;
lcs.afterEnd = afterCount;
this.prefix = prefix;
this.stack = [lcs];
}
countOffsets(token: Interned) {
const tokenCounts = this.tokenCounts;
const bucket = this.tokenBuckets[token];
return bucket && tokenCounts[bucket] - tokenCounts[bucket - 1];
}
getOffsets(token: Interned) {
const tokenCounts = this.tokenCounts;
const bucket = this.tokenBuckets[token];
const offsets = this.offsets;
offsets.pos = tokenCounts[bucket - 1];
offsets.end = tokenCounts[bucket];
return offsets;
}
populate(lcs: LCS) {
const tokens = this.before;
const tokenBuckets = this.tokenBuckets;
const tokenCounts = this.tokenCounts;
const tokenOffsets = this.offsets.data;
const prevEnd = this.end;
// Clear old bucket assignments.
for(let pos = this.start; pos < prevEnd; ++pos) {
const token = tokens[pos];
tokenBuckets[token] = 0;
}
const start = lcs.beforePos;
const end = lcs.beforeEnd;
this.start = start;
this.end = end;
let buckets = 1;
tokenCounts[0] = 0;
tokenCounts[1] = 0;
// Assign buckets for unique tokens.
for(let pos = start; pos < end; ++pos) {
const token = tokens[pos];
if(!tokenBuckets[token]) {
tokenBuckets[token] = buckets++;
tokenCounts[buckets] = 0;
}
}
// Count bucket sizes.
for(let pos = start; pos < end; ++pos) {
++tokenCounts[tokenBuckets[tokens[pos]] + 1];
}
// Turn bucket sizes into cumulative sums.
let count = 0;
for(let bucket = 2; bucket < buckets; ++bucket) {
tokenCounts[bucket] = count += tokenCounts[bucket];
}
// Store input offset of each bucket member.
for(let pos = start; pos < end; ++pos) {
tokenOffsets[tokenCounts[tokenBuckets[tokens[pos]]]++] = pos;
}
}
step(): LCS | null {
let stackPos = this.stackPos;
if(stackPos < 0) return null;
const stack = this.stack;
const lcs = stack[stackPos]!;
stack[stackPos] = this.lcs;
stackPos--;
while(lcs.beforePos != lcs.beforeEnd && lcs.afterPos != lcs.afterEnd) {
this.populate(lcs);
if(!lcs.find()) {
throw new Error('Should fall back to Myers');
}
if(lcs.len === 0) break;
++stackPos;
const next = stack[stackPos] || (stack[stackPos] = new LCS(this));
next.beforePos = lcs.beforeNext + lcs.len;
next.beforeEnd = lcs.beforeEnd;
next.afterPos = lcs.afterNext + lcs.len;
next.afterEnd = lcs.afterEnd;
lcs.beforeEnd = lcs.beforeNext;
lcs.afterEnd = lcs.afterNext;
}
this.lcs = lcs;
this.stackPos = stackPos;
const prefix = this.prefix;
lcs.beforePos += prefix;
lcs.beforeEnd += prefix;
lcs.afterPos += prefix;
lcs.afterEnd += prefix;
return lcs;
}
private lcs?: LCS;
private stack: (LCS | undefined)[];
private stackPos = 0;
private tokenMap = new Map<Token, Interned>();
private tokenBuckets: Uint32Array;
private tokenCounts: Uint32Array;
private start = 0;
private end = 0;
before: Uint32Array;
after: Uint32Array;
prefix = 0;
offsets: { data: Uint32Array, pos: number, end: number };
}
import type { Histogram } from './Histogram';
export const MAX_CHAIN_LEN = 63;
/** Interned tokens are numbers to use as array indices. */
export type Interned = number;
/** Longest common subsequence */
export class LCS {
constructor(private histogram: Histogram<unknown>) {
this.reset();
}
find(): boolean {
const histogram = this.histogram;
const after = histogram.after;
const afterEnd = this.afterEnd;
let found = false;
this.reset();
this.beforeNext = this.beforePos;
this.afterNext = this.afterPos;
for(let afterOffset = this.afterPos; afterOffset < afterEnd; ++afterOffset) {
const token = after[afterOffset];
const count = histogram.countOffsets(token);
if(count) {
found = true;
if(count <= this.minCount) afterOffset = this.update(afterOffset, token) - 1;
}
}
return !found || this.minCount <= MAX_CHAIN_LEN;
}
reset() {
this.len = 0;
this.minCount = MAX_CHAIN_LEN + 1;
}
update(afterOffset: number, token: Interned): number {
const { beforePos, beforeEnd, afterPos, afterEnd } = this;
const histogram = this.histogram;
const before = histogram.before;
const after = histogram.after;
let { data: offsets, pos: offsetPos, end: offsetEnd } = histogram.getOffsets(token);
const offsetCount = offsetEnd - offsetPos;
let beforeOffset = offsets[offsetPos++];
let posNext = afterOffset + 1;
while(offsetPos <= offsetEnd) {
let count = offsetCount;
let beforeNext = beforeOffset;
let afterNext = afterOffset;
while(beforeNext > beforePos && afterNext > afterPos && before[beforeNext - 1] == after[afterNext - 1]) {
--beforeNext;
--afterNext;
count = Math.min(count, histogram.countOffsets(before[beforeNext]));
}
let beforeEndNext = beforeOffset + 1;
let afterEndNext = afterOffset + 1;
while(beforeEndNext < beforeEnd && afterEndNext < afterEnd && before[beforeEndNext] == after[afterEndNext]) {
count = Math.min(count, histogram.countOffsets(before[beforeEndNext]));
++beforeEndNext;
++afterEndNext;
}
if(posNext < afterEndNext) posNext = afterEndNext;
const len = afterEndNext - afterNext; // Equals beforeEndNext - beforeNext
if(this.len < len || this.minCount > count) {
this.minCount = count;
this.beforeNext = beforeNext;
this.afterNext = afterNext;
this.len = len;
}
let offset: number;
while((offset = offsets[offsetPos++])) {
if(offset > afterEndNext) {
beforeOffset = offset;
break;
}
}
}
return posNext;
}
private minCount: number;
beforePos = 0;
beforeEnd = 0;
afterPos = 0;
afterEnd = 0;
beforeNext = 0;
afterNext = 0;
len: number;
}
@jjrv
Copy link
Author

jjrv commented Apr 9, 2024

This is based on imara-diff:
https://github.com/pascalkuthe/imara-diff

Any mistakes in the TypeScript version are mine, not very much tested yet.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment