Skip to content

Instantly share code, notes, and snippets.

@trvswgnr
Last active July 25, 2024 23:18
Show Gist options
  • Save trvswgnr/48e46dd6a60de1e9d8e8571d914ef2b0 to your computer and use it in GitHub Desktop.
Save trvswgnr/48e46dd6a60de1e9d8e8571d914ef2b0 to your computer and use it in GitHub Desktop.
check if a file contains a specified phrase, without loading the entire file into memory
import { describe, it, expect, beforeAll, afterAll } from "bun:test";
import { fileContains, BoyerMooreSearcher } from "./fileContains";
import fs from "fs";
import path from "path";
describe("fileContains", () => {
const testDir = path.join(__dirname, "test-files");
beforeAll(() => {
if (!fs.existsSync(testDir)) {
fs.mkdirSync(testDir);
}
});
afterAll(() => {
fs.rmSync(testDir, { recursive: true, force: true });
});
it("should find a phrase in a small file", async () => {
const filePath = path.join(testDir, "small.txt");
fs.writeFileSync(filePath, "Hello, world!");
expect(await fileContains(filePath, "world")).toBe(true);
expect(await fileContains(filePath, "universe")).toBe(false);
});
it("should find a phrase spanning multiple chunks", async () => {
const filePath = path.join(testDir, "multi-chunk.txt");
const content = "a".repeat(65536) + "target" + "b".repeat(65536);
fs.writeFileSync(filePath, content);
expect(await fileContains(filePath, "target")).toBe(true);
});
it("should handle multi-byte characters", async () => {
const filePath = path.join(testDir, "multi-byte.txt");
fs.writeFileSync(filePath, "你好,世界!");
expect(await fileContains(filePath, "世界")).toBe(true);
expect(await fileContains(filePath, "宇宙")).toBe(false);
});
it("should handle large files efficiently", async () => {
const filePath = path.join(testDir, "large.txt");
const content = "a".repeat(10 * 1024 * 1024) + "needle" + "b".repeat(10 * 1024 * 1024);
fs.writeFileSync(filePath, content);
const start = Date.now();
const result = await fileContains(filePath, "needle");
const end = Date.now();
expect(result).toBe(true);
expect(end - start).toBeLessThan(1000); // Should complete in less than 1 second
});
});
describe("StreamSearcher", () => {
it("should find a phrase in a single chunk", () => {
const searcher = new BoyerMooreSearcher("world");
expect(searcher.search("Hello, world!")).toBe(true);
expect(searcher.search("Hello, universe!")).toBe(false);
});
it("should find a phrase spanning multiple chunks", () => {
const searcher = new BoyerMooreSearcher("target");
expect(searcher.search("Some text before the tar")).toBe(false);
expect(searcher.search("get phrase")).toBe(true);
});
it("should handle multi-byte characters", () => {
const searcher = new BoyerMooreSearcher("世界");
expect(searcher.search("你好,")).toBe(false);
expect(searcher.search("世界!")).toBe(true);
});
});
import { createReadStream } from "fs";
/**
* the default chunk size to read from the file, in bytes
*/
const DEFAULT_CHUNK_SIZE = 16 * 1024;
/**
* searches for a specific phrase in a file using the Boyer-Moore algorithm
*
* reads the file in chunks and uses a stream-based approach to efficiently
* search for the given phrase without loading the entire file into memory. it
* utilizes the Boyer-Moore string searching algorithm for improved performance
*
* @param filePath - the path to the file to be searched
* @param phrase - the phrase to search for within the file
* @param chunkSize - the size of each chunk to read from the file, in bytes
* defaults to 16KB (16 * 1024 bytes)
*
* @returns a Promise that resolves to `true` if the phrase is found in the
* file, otherwise `false`
*
* @throws rejects with an error if there's an issue reading the file
*
* @example
* const found = await searchFileForPhrase('/path/to/file.txt', 'search phrase');
* if (found) {
* console.log('phrase found in file');
* } else {
* console.log('not found');
* }
*/
export async function fileContains(
filePath: string,
phrase: string,
chunkSize = DEFAULT_CHUNK_SIZE,
): Promise<boolean> {
return new Promise((resolve, reject) => {
const searcher = new BoyerMooreSearcher(phrase);
const stream = createReadStream(filePath, { highWaterMark: chunkSize });
stream.on("data", (chunk: Buffer) => {
if (searcher.search(chunk.toString("utf8"))) {
stream.destroy();
resolve(true);
}
});
stream.on("end", () => resolve(false));
stream.on("error", reject);
});
}
/**
* implements the Boyer-Moore string searching algorithm
*
* designed to efficiently search for a pattern within a stream of characters
*/
export class BoyerMooreSearcher {
private badCharMap: BadCharMap;
private pattern: string;
private buffer: string;
private patternLength: number;
/**
* initializes a new instance of the BooyerMooreSearcher
*
* @param pattern - the pattern to search for
* @constructor
*/
constructor(pattern: string) {
this.pattern = pattern;
this.badCharMap = new BadCharMap(pattern);
this.buffer = "";
this.patternLength = [...pattern].length; // handle multi-byte characters correctly
}
/**
* searches for the pattern in the given chunk.
*
* designed to be called repeatedly with each chunk in the stream
*
* @param chunk - the next chunk in the stream to search.
* @returns `true` if the pattern is found, otherwise `false`
* @note modifies the internal buffer and returns a boolean
*/
search(chunk: string): boolean {
this.buffer += chunk;
const r = this.findPattern();
// reset the buffer if a match is found or keep only the last
// `(patternLength - 1)` characters
this.buffer = r ? "" : this.buffer.slice(-this.patternLength + 1);
return r;
}
/**
* implements the core Boyer-Moore algorithm to find the pattern in the
* current buffer
*
* @returns `true` if the pattern is found in the current buffer, otherwise
* `false`
* @private
*/
private findPattern(): boolean {
const len = this.buffer.length;
let i = this.patternLength - 1;
while (i < len) {
let j = this.patternLength - 1;
// compare characters from right to left
while (j >= 0 && this.buffer[i - (this.patternLength - j - 1)] === this.pattern[j]) {
j--;
}
if (j < 0) {
return true; // pattern found
}
// use the bad character heuristic to skip characters
const skip = this.badCharMap.get(this.buffer[i]!) ?? this.patternLength;
i += Math.max(1, skip);
}
return false; // pattern not found in this chunk
}
}
/**
* implements a Bad Character Map for the Boyer-Moore string searching algorithm
*
* used to preprocess the search pattern and create a lookup table for the bad
* character heuristic. it allows for efficient skipping of characters during
* the search process, improving the overall performance of the algorithm
*
* the map uses hashed character codes as keys and their rightmost positions in
* the pattern as values, allowing for constant-time lookups
*/
export class BadCharMap extends Map<number, number> {
/**
* constructs a new BadCharMap for the given pattern
*
* @param pattern - the search pattern to create the map for
* @constructor
*/
constructor(pattern: string) {
super();
const chars = [...pattern]; // handle Unicode characters correctly
const len = chars.length;
let hash: number;
// iterate through the pattern from left to right
for (let i = 0; i < len; i++) {
// hash each character and store its rightmost position
hash = hashString(chars.at(i)!);
// the value is the distance from the rightmost occurrence to the
// end of the pattern
this.set(hash, len - i - 1);
}
}
/**
* retrieves the skip value for a given character
*
* overrides the default {@link Map.prototype.get} method to allow lookups
* using either a string (which will be hashed) or a pre-computed hash value
*
* @param i - the character (as a string) or its hash value (as a number) to
* look up
* @returns the skip value for the character, or `undefined` if not found in
* the map
*/
get(i: string | number): number | undefined {
if (typeof i === "string") {
return super.get(hashString(i));
}
return super.get(i);
}
}
/**
* generates a hash code for a string using the djb2 algorithm
*
* @param s - the input string to hash
* @returns a 32-bit integer hash code
*/
export function hashString(s: string): number {
let hash = 5381;
const len = s.length;
for (let i = 0; i < len; i++) {
// bitwise left shift by 5 is equivalent to multiplying by 32
hash = (hash << 5) + hash + s.charCodeAt(i);
// convert to 32-bit integer
hash |= 0;
}
return hash >>> 0; // make sure it's an unsigned 32-bit integer
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment