feklee/parse_docurls

## parse_docurls
#!/usr/bin/perl

# All URLs of the format <http://f76.eu/a*> are for identifying my
# documents. See: http://f76.eu/a0
#
# This script finds all files when run from the root of the directory
# tree with my documents.

# Example confguration `~/.config/parse_docurls.conf`:
#
#     doc_dir = '~/windows/Dropbox/My Data'
#     max_file_size = 1 # KiB

# Felix E. Klee <felix.klee@inka.de>

use strict;
use feature "state";
use ConfigReader::Simple;

my $config_filename = glob "~/.config/parse_docurls.conf";
my $output_filename = glob "~/.local/share/docurl_cache";
my $doc_dir;
my $max_file_size;

sub read_config {
    my $config = ConfigReader::Simple->new($config_filename);
    die "Could not read config: $ConfigReader::Simple::ERROR\n"
        unless ref $config;

    my $doc_dir = $config->get('doc_dir') || die "Missing config";
    my $max_file_size = $config->get("max_file_size");

    $doc_dir = glob "'$doc_dir'";

    return ($doc_dir, $max_file_size);
}

# All IDs start with an "a", which is followed by the actual
# identifier.  Let's call that sub ID.
sub sub_id {
    my $id = shift;
    if (substr($id, 0, 1) ne 'a') {
        die('ID "' . $id . '" is in wrong format');
    }
    return substr($id, 1);
}

sub id_from_sub_id {
    my $sub_id = shift;
    return 'a' . $sub_id;
}

sub next_id {
    my $id = shift;
    my @chars = split(//, sub_id($id));
    my @rchars = reverse @chars;
    my $l = @rchars;

    while (my ($i, $char) = each @rchars) {
        if ($char =~ /[0-8a-yA-Y]/) {
            $rchars[$i] = chr(ord($char) + 1);
            last;
        } elsif ($char eq "9") {
            $rchars[$i] = "a";
            last;
        } elsif ($char eq "z") {
            $rchars[$i] = "A";
            last;
        } else {
            $rchars[$i] = "0";
            if ($i == ($l - 1)) {
                $rchars[$i] = "00";
            }
        }
    }

    my $new_sub_id = join('', reverse @rchars);
    return id_from_sub_id($new_sub_id);
}

sub format_line {
    my $line = shift;

    # When designing regular expressions, beware of carriage returns
    # in DOS style line endings.  They can cause weird effects in
    # substitutions.
    chomp($line);
    $line =~ s/\.\/(.*):DocURL:\s*http:\/\/f76.eu\/([[:alnum:]]*)/$2\t$1/g;

    return $line;
}

sub compare_ids {
    my ($x, $y) = @_;
    $x =~ tr/A-Za-z/a-zA-Z/;
    $y =~ tr/A-Za-z/a-zA-Z/;
    my $result = (length $x <=> length $y || $x cmp $y);
    return (length $x <=> length $y || $x cmp $y);
}

sub compare_lines {
    my @x = split(/\t/, $a);
    my @y = split(/\t/, $b);
    return compare_ids($x[0], $y[0]);
}

sub report_gap {
    my ($id, $expected_id) = @_;
    my $first_id = $$expected_id;

    my $last_missing_id;

    while (compare_ids($id, $$expected_id) > 0) {
        $last_missing_id = $$expected_id;
        $$expected_id = next_id($$expected_id);
    }

    my $gap_found = defined($last_missing_id);
    if ($gap_found) {
        print STDERR "Missing: ".$first_id;
        if ($last_missing_id ne $first_id) {
            print STDERR " - ".$last_missing_id;
        }
        print STDERR "\n";
    }

    my $expected_id_found = $id eq $$expected_id;
    return ($gap_found, $expected_id_found);
}

sub report_if_duplicate {
    my $id = shift;
    state $prev_id;
    my $duplicate_found = $id eq $prev_id;
    $prev_id = $id;

    print STDERR "Duplicate: ".$id."\n" if $duplicate_found;

    return $duplicate_found;
}

sub report_inconsistencies {
    my @lines = @_;
    my $expected_id = "a0";
    my @augmented_lines;
    my $errors_found;

    foreach (@lines) {
        my ($id) = split(/\t/, $_);
        my $duplicate_found = report_if_duplicate $id;
        if ($duplicate_found) {
            $_ = "# Duplicate: ".$_;
            $errors_found = 1;
        }
        my ($gap_found, $expected_id_found) = report_gap($id, \$expected_id);
        if ($expected_id_found) {
            $expected_id = next_id($expected_id);
        }
        if ($gap_found) {
            push(@augmented_lines, "# Gap");
            $errors_found = 1;
        }
        push(@augmented_lines, $_);
    }

    if ($errors_found) {
        push(@augmented_lines, "");
        push(@augmented_lines, "# Errors found, see comments above.");
    }

    return @augmented_lines;
}

sub timestamp {
    my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)=localtime(time);
    my $timestamp = sprintf("%04d-%02d-%02d %02d:%02d:%02d",
                            $year+1900,$mon+1,$mday,$hour,$min,$sec);
    return $timestamp;
}

sub append_timestamp {
    my @lines = @_;
    push(@lines, "");
    push(@lines, "# Last update: " . timestamp);
    return @lines;
}

sub clean_up {
    $_[0] =~ s/\r|\n//g;
    $_[0] =~ s/^\s+|\s+$//g;
}

sub summarize {
    print <<~EOF;

    Finished parsing: $doc_dir
    Wrote output to: $output_filename
    Maximum file size searched: $max_file_size KiB
    If there are missing IDs, consider increasing file size for searching.
    EOF
}

sub write_output {
    open(OUT, '>', $output_filename) or die $!;
    print OUT <<~EOF;
    # DocURL cache.  For information about f76.eu DocURLs and how to
    # rebuild the cache, see:
    #
    #   http://f76.eu/a0
    #
    # This file was generated by:
    #
    #   $0
    #
    # Felix E. Klee <felix.klee\@inka.de>

    EOF
    print OUT join("\n", @_);
    print OUT "\n";
    close OUT;
}

# See documentation of Gnu `find`: When an upper size limit is
# specified, it does not include the limit.  Instead the maximum is
# one unit below that.
sub find_size {
    return -($max_file_size + 1)."k";
}

sub parse_docs {
    my $find_cmd =
        'find -type f -size '.find_size.' -exec grep -Hi "^DocURL:" {} \;';
    my @lines;
    my @ids;
    open(FIND, "$find_cmd |");
    while (<FIND>) {
        my $line = format_line($_);
        clean_up $line;
        print "$line\n";
        push(@lines, $line);
    }
    return @lines;
}

($doc_dir, $max_file_size) = read_config;

chdir $doc_dir || die "Cannot enter " . $doc_dir;
my @lines = parse_docs;
my @sorted_lines = sort compare_lines @lines;
my @augmented_lines = report_inconsistencies(@sorted_lines);
@augmented_lines = append_timestamp(@augmented_lines);
write_output @augmented_lines;

summarize;
	#!/usr/bin/perl

	# All URLs of the format <http://f76.eu/a*> are for identifying my
	# documents. See: http://f76.eu/a0
	#
	# This script finds all files when run from the root of the directory
	# tree with my documents.

	# Example confguration `~/.config/parse_docurls.conf`:
	#
	# doc_dir = '~/windows/Dropbox/My Data'
	# max_file_size = 1 # KiB

	# Felix E. Klee <felix.klee@inka.de>

	use strict;
	use feature "state";
	use ConfigReader::Simple;

	my $config_filename = glob "~/.config/parse_docurls.conf";
	my $output_filename = glob "~/.local/share/docurl_cache";
	my $doc_dir;
	my $max_file_size;

	sub read_config {
	my $config = ConfigReader::Simple->new($config_filename);
	die "Could not read config: $ConfigReader::Simple::ERROR\n"
	unless ref $config;

	my $doc_dir = $config->get('doc_dir') \|\| die "Missing config";
	my $max_file_size = $config->get("max_file_size");

	$doc_dir = glob "'$doc_dir'";

	return ($doc_dir, $max_file_size);
	}

	# All IDs start with an "a", which is followed by the actual
	# identifier. Let's call that sub ID.
	sub sub_id {
	my $id = shift;
	if (substr($id, 0, 1) ne 'a') {
	die('ID "' . $id . '" is in wrong format');
	}
	return substr($id, 1);
	}

	sub id_from_sub_id {
	my $sub_id = shift;
	return 'a' . $sub_id;
	}

	sub next_id {
	my $id = shift;
	my @chars = split(//, sub_id($id));
	my @rchars = reverse @chars;
	my $l = @rchars;

	while (my ($i, $char) = each @rchars) {
	if ($char =~ /[0-8a-yA-Y]/) {
	$rchars[$i] = chr(ord($char) + 1);
	last;
	} elsif ($char eq "9") {
	$rchars[$i] = "a";
	last;
	} elsif ($char eq "z") {
	$rchars[$i] = "A";
	last;
	} else {
	$rchars[$i] = "0";
	if ($i == ($l - 1)) {
	$rchars[$i] = "00";
	}
	}
	}

	my $new_sub_id = join('', reverse @rchars);
	return id_from_sub_id($new_sub_id);
	}

	sub format_line {
	my $line = shift;

	# When designing regular expressions, beware of carriage returns
	# in DOS style line endings. They can cause weird effects in
	# substitutions.
	chomp($line);
	$line =~ s/\.\/(.):DocURL:\shttp:\/\/f76.eu\/([[:alnum:]]*)/$2\t$1/g;

	return $line;
	}

	sub compare_ids {
	my ($x, $y) = @_;
	$x =~ tr/A-Za-z/a-zA-Z/;
	$y =~ tr/A-Za-z/a-zA-Z/;
	my $result = (length $x <=> length $y \|\| $x cmp $y);
	return (length $x <=> length $y \|\| $x cmp $y);
	}

	sub compare_lines {
	my @x = split(/\t/, $a);
	my @y = split(/\t/, $b);
	return compare_ids($x[0], $y[0]);
	}

	sub report_gap {
	my ($id, $expected_id) = @_;
	my $first_id = $$expected_id;

	my $last_missing_id;

	while (compare_ids($id, $$expected_id) > 0) {
	$last_missing_id = $$expected_id;
	$$expected_id = next_id($$expected_id);
	}

	my $gap_found = defined($last_missing_id);
	if ($gap_found) {
	print STDERR "Missing: ".$first_id;
	if ($last_missing_id ne $first_id) {
	print STDERR " - ".$last_missing_id;
	}
	print STDERR "\n";
	}

	my $expected_id_found = $id eq $$expected_id;
	return ($gap_found, $expected_id_found);
	}

	sub report_if_duplicate {
	my $id = shift;
	state $prev_id;
	my $duplicate_found = $id eq $prev_id;
	$prev_id = $id;

	print STDERR "Duplicate: ".$id."\n" if $duplicate_found;

	return $duplicate_found;
	}

	sub report_inconsistencies {
	my @lines = @_;
	my $expected_id = "a0";
	my @augmented_lines;
	my $errors_found;

	foreach (@lines) {
	my ($id) = split(/\t/, $_);
	my $duplicate_found = report_if_duplicate $id;
	if ($duplicate_found) {
	$_ = "# Duplicate: ".$_;
	$errors_found = 1;
	}
	my ($gap_found, $expected_id_found) = report_gap($id, \$expected_id);
	if ($expected_id_found) {
	$expected_id = next_id($expected_id);
	}
	if ($gap_found) {
	push(@augmented_lines, "# Gap");
	$errors_found = 1;
	}
	push(@augmented_lines, $_);
	}

	if ($errors_found) {
	push(@augmented_lines, "");
	push(@augmented_lines, "# Errors found, see comments above.");
	}

	return @augmented_lines;
	}

	sub timestamp {
	my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)=localtime(time);
	my $timestamp = sprintf("%04d-%02d-%02d %02d:%02d:%02d",
	$year+1900,$mon+1,$mday,$hour,$min,$sec);
	return $timestamp;
	}

	sub append_timestamp {
	my @lines = @_;
	push(@lines, "");
	push(@lines, "# Last update: " . timestamp);
	return @lines;
	}

	sub clean_up {
	$_[0] =~ s/\r\|\n//g;
	$_[0] =~ s/^\s+\|\s+$//g;
	}

	sub summarize {
	print <<~EOF;

	Finished parsing: $doc_dir
	Wrote output to: $output_filename
	Maximum file size searched: $max_file_size KiB
	If there are missing IDs, consider increasing file size for searching.
	EOF
	}

	sub write_output {
	open(OUT, '>', $output_filename) or die $!;
	print OUT <<~EOF;
	# DocURL cache. For information about f76.eu DocURLs and how to
	# rebuild the cache, see:
	#
	# http://f76.eu/a0
	#
	# This file was generated by:
	#
	# $0
	#
	# Felix E. Klee <felix.klee\@inka.de>

	EOF
	print OUT join("\n", @_);
	print OUT "\n";
	close OUT;
	}

	# See documentation of Gnu `find`: When an upper size limit is
	# specified, it does not include the limit. Instead the maximum is
	# one unit below that.
	sub find_size {
	return -($max_file_size + 1)."k";
	}

	sub parse_docs {
	my $find_cmd =
	'find -type f -size '.find_size.' -exec grep -Hi "^DocURL:" {} \;';
	my @lines;
	my @ids;
	open(FIND, "$find_cmd \|");
	while (<FIND>) {
	my $line = format_line($_);
	clean_up $line;
	print "$line\n";
	push(@lines, $line);
	}
	return @lines;
	}

	($doc_dir, $max_file_size) = read_config;

	chdir $doc_dir \|\| die "Cannot enter " . $doc_dir;
	my @lines = parse_docs;
	my @sorted_lines = sort compare_lines @lines;
	my @augmented_lines = report_inconsistencies(@sorted_lines);
	@augmented_lines = append_timestamp(@augmented_lines);
	write_output @augmented_lines;

	summarize;