ikskuh/anagrams.zig

## anagrams.zig
const std = @import("std");

// debug switch to print all found anagrams
const print_all_results = false;

pub fn main() !void {
    // arena for fast allocations
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();

    var allocator = &arena.allocator;

    // the content of the file
    var dictionary_contents = try std.fs.cwd().readFileAlloc(allocator, "unixdict.txt", 1_000_000);
    defer allocator.free(dictionary_contents);

    // a set of words per anagram identifier
    // we store a anagram identifier as a key (sorted word, so length remains right, but characters are ordered lexically)
    // the value is a unmanaged array list (saves some memory compared to  anormal array list)
    // of all words for the anagram identifier
    var words = std.StringHashMap(std.ArrayListUnmanaged([]const u8)).init(allocator);
    defer {
        var iter = words.iterator();
        while (iter.next()) |kv| {
            kv.value.deinit(allocator);
        }
        words.deinit();
    }

    // Make sure we have enough space for some words already
    try words.ensureCapacity(100_000);

    var max_length: usize = 0;

    // iterate over all lines in dictionary_contents
    {
        // temporary buffer for sorting the word
        // this is mutable and will be overwritten per word
        var word_buffer: [256]u8 = undefined;

        var lines = std.mem.split(dictionary_contents, "\n");
        while (lines.next()) |word| {
            // copy the word to a temporary buffer so we can sort it without
            // modifying the original source, then store the sorted version
            std.mem.copy(u8, &word_buffer, word);
            const sorted_word = word_buffer[0..word.len];
            std.sort.sort(u8, sorted_word, {}, struct {
                fn lessThan(ctx: void, lhs: u8, rhs: u8) bool {
                    return lhs < rhs;
                }
            }.lessThan);

            // Get a existing or create a new list for this anagram identifier
            var list = if (words.getEntry(sorted_word)) |kv| &kv.value else blk: {
                var sorted_word_alloc = try allocator.dupe(u8, sorted_word);

                const kv = try words.getOrPut(sorted_word_alloc);
                std.debug.assert(kv.found_existing == false);
                kv.entry.value = std.ArrayListUnmanaged([]const u8){};

                break :blk &kv.entry.value;
            };

            // Push the word into the list, then check if the current list got longer than
            // the previously found longest list. If so: Keep the new length
            try list.append(allocator, word);

            if (max_length < list.items.len) {
                max_length = list.items.len;
            }
        }
    }

    // Print out all found results, in a unsorted manner
    {
        var stdout = std.io.getStdOut().writer();

        var buckets = words.iterator();
        while (buckets.next()) |bucket| {
            if (print_all_results) {
                if (bucket.value.items.len <= 1)
                    continue;
            } else {
                if (bucket.value.items.len != max_length)
                    continue;
            }

            for (bucket.value.items) |word, i| {
                if (i > 0) {
                    try stdout.print(", {}", .{word});
                } else {
                    try stdout.print("{}", .{word});
                }
            }
            try stdout.print("\n", .{});
        }
    }
}

// Output:
// evil, levi, live, veil, vile
// caret, carte, cater, crate, trace
// abel, able, bale, bela, elba
// alger, glare, lager, large, regal
// angel, angle, galen, glean, lange
// elan, lane, lean, lena, neal

## perf.stat
 Performance counter stats for './zig-cache/bin/anagrams':

             18,91 msec task-clock:u              #    0,946 CPUs utilized
                 0      context-switches:u        #    0,000 K/sec
                 0      cpu-migrations:u          #    0,000 K/sec
             2.114      page-faults:u             #    0,112 M/sec
        29.748.027      cycles:u                  #    1,573 GHz
         1.572.739      stalled-cycles-frontend:u #    5,29% frontend cycles idle
        12.228.844      stalled-cycles-backend:u  #   41,11% backend cycles idle
        20.443.025      instructions:u            #    0,69  insn per cycle
                                                  #    0,60  stalled cycles per insn
         4.178.738      branches:u                #  220,942 M/sec
           313.344      branch-misses:u           #    7,50% of all branches

       0,019999471 seconds time elapsed

       0,006689000 seconds user
       0,013379000 seconds sys
	const std = @import("std");

	// debug switch to print all found anagrams
	const print_all_results = false;

	pub fn main() !void {
	// arena for fast allocations
	var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
	defer arena.deinit();

	var allocator = &arena.allocator;

	// the content of the file
	var dictionary_contents = try std.fs.cwd().readFileAlloc(allocator, "unixdict.txt", 1_000_000);
	defer allocator.free(dictionary_contents);

	// a set of words per anagram identifier
	// we store a anagram identifier as a key (sorted word, so length remains right, but characters are ordered lexically)
	// the value is a unmanaged array list (saves some memory compared to anormal array list)
	// of all words for the anagram identifier
	var words = std.StringHashMap(std.ArrayListUnmanaged([]const u8)).init(allocator);
	defer {
	var iter = words.iterator();
	while (iter.next()) \|kv\| {
	kv.value.deinit(allocator);
	}
	words.deinit();
	}

	// Make sure we have enough space for some words already
	try words.ensureCapacity(100_000);

	var max_length: usize = 0;

	// iterate over all lines in dictionary_contents
	{
	// temporary buffer for sorting the word
	// this is mutable and will be overwritten per word
	var word_buffer: [256]u8 = undefined;

	var lines = std.mem.split(dictionary_contents, "\n");
	while (lines.next()) \|word\| {
	// copy the word to a temporary buffer so we can sort it without
	// modifying the original source, then store the sorted version
	std.mem.copy(u8, &word_buffer, word);
	const sorted_word = word_buffer[0..word.len];
	std.sort.sort(u8, sorted_word, {}, struct {
	fn lessThan(ctx: void, lhs: u8, rhs: u8) bool {
	return lhs < rhs;
	}
	}.lessThan);

	// Get a existing or create a new list for this anagram identifier
	var list = if (words.getEntry(sorted_word)) \|kv\| &kv.value else blk: {
	var sorted_word_alloc = try allocator.dupe(u8, sorted_word);

	const kv = try words.getOrPut(sorted_word_alloc);
	std.debug.assert(kv.found_existing == false);
	kv.entry.value = std.ArrayListUnmanaged([]const u8){};

	break :blk &kv.entry.value;
	};

	// Push the word into the list, then check if the current list got longer than
	// the previously found longest list. If so: Keep the new length
	try list.append(allocator, word);

	if (max_length < list.items.len) {
	max_length = list.items.len;
	}
	}
	}

	// Print out all found results, in a unsorted manner
	{
	var stdout = std.io.getStdOut().writer();

	var buckets = words.iterator();
	while (buckets.next()) \|bucket\| {
	if (print_all_results) {
	if (bucket.value.items.len <= 1)
	continue;
	} else {
	if (bucket.value.items.len != max_length)
	continue;
	}

	for (bucket.value.items) \|word, i\| {
	if (i > 0) {
	try stdout.print(", {}", .{word});
	} else {
	try stdout.print("{}", .{word});
	}
	}
	try stdout.print("\n", .{});
	}
	}
	}

	// Output:
	// evil, levi, live, veil, vile
	// caret, carte, cater, crate, trace
	// abel, able, bale, bela, elba
	// alger, glare, lager, large, regal
	// angel, angle, galen, glean, lange
	// elan, lane, lean, lena, neal
	Performance counter stats for './zig-cache/bin/anagrams':

	18,91 msec task-clock:u # 0,946 CPUs utilized
	0 context-switches:u # 0,000 K/sec
	0 cpu-migrations:u # 0,000 K/sec
	2.114 page-faults:u # 0,112 M/sec
	29.748.027 cycles:u # 1,573 GHz
	1.572.739 stalled-cycles-frontend:u # 5,29% frontend cycles idle
	12.228.844 stalled-cycles-backend:u # 41,11% backend cycles idle
	20.443.025 instructions:u # 0,69 insn per cycle
	# 0,60 stalled cycles per insn
	4.178.738 branches:u # 220,942 M/sec
	313.344 branch-misses:u # 7,50% of all branches

	0,019999471 seconds time elapsed

	0,006689000 seconds user
	0,013379000 seconds sys