Skip to content

Instantly share code, notes, and snippets.

@feklee
Last active April 3, 2024 09:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save feklee/7fbdc380abd41fb5d75b5a35e7324463 to your computer and use it in GitHub Desktop.
Save feklee/7fbdc380abd41fb5d75b5a35e7324463 to your computer and use it in GitHub Desktop.
Finds URLs of files in my directory tree
#!/usr/bin/perl
# All URLs of the format <http://f76.eu/a*> are for identifying my
# documents. See: http://f76.eu/a0
#
# This script finds all files when run from the root of the directory
# tree with my documents.
# Example confguration `~/.config/parse_docurls.conf`:
#
# doc_dir = '~/windows/Dropbox/My Data'
# max_file_size = 1 # KiB
# Felix E. Klee <felix.klee@inka.de>
use strict;
use feature "state";
use ConfigReader::Simple;
my $config_filename = glob "~/.config/parse_docurls.conf";
my $output_filename = glob "~/.local/share/docurl_cache";
my $doc_dir;
my $max_file_size;
sub read_config {
my $config = ConfigReader::Simple->new($config_filename);
die "Could not read config: $ConfigReader::Simple::ERROR\n"
unless ref $config;
my $doc_dir = $config->get('doc_dir') || die "Missing config";
my $max_file_size = $config->get("max_file_size");
$doc_dir = glob "'$doc_dir'";
return ($doc_dir, $max_file_size);
}
# All IDs start with an "a", which is followed by the actual
# identifier. Let's call that sub ID.
sub sub_id {
my $id = shift;
if (substr($id, 0, 1) ne 'a') {
die('ID "' . $id . '" is in wrong format');
}
return substr($id, 1);
}
sub id_from_sub_id {
my $sub_id = shift;
return 'a' . $sub_id;
}
sub next_id {
my $id = shift;
my @chars = split(//, sub_id($id));
my @rchars = reverse @chars;
my $l = @rchars;
while (my ($i, $char) = each @rchars) {
if ($char =~ /[0-8a-yA-Y]/) {
$rchars[$i] = chr(ord($char) + 1);
last;
} elsif ($char eq "9") {
$rchars[$i] = "a";
last;
} elsif ($char eq "z") {
$rchars[$i] = "A";
last;
} else {
$rchars[$i] = "0";
if ($i == ($l - 1)) {
$rchars[$i] = "00";
}
}
}
my $new_sub_id = join('', reverse @rchars);
return id_from_sub_id($new_sub_id);
}
sub format_line {
my $line = shift;
# When designing regular expressions, beware of carriage returns
# in DOS style line endings. They can cause weird effects in
# substitutions.
chomp($line);
$line =~ s/\.\/(.*):DocURL:\s*http:\/\/f76.eu\/([[:alnum:]]*)/$2\t$1/g;
return $line;
}
sub compare_ids {
my ($x, $y) = @_;
$x =~ tr/A-Za-z/a-zA-Z/;
$y =~ tr/A-Za-z/a-zA-Z/;
my $result = (length $x <=> length $y || $x cmp $y);
return (length $x <=> length $y || $x cmp $y);
}
sub compare_lines {
my @x = split(/\t/, $a);
my @y = split(/\t/, $b);
return compare_ids($x[0], $y[0]);
}
sub report_gap {
my ($id, $expected_id) = @_;
my $first_id = $$expected_id;
my $last_missing_id;
while (compare_ids($id, $$expected_id) > 0) {
$last_missing_id = $$expected_id;
$$expected_id = next_id($$expected_id);
}
my $gap_found = defined($last_missing_id);
if ($gap_found) {
print STDERR "Missing: ".$first_id;
if ($last_missing_id ne $first_id) {
print STDERR " - ".$last_missing_id;
}
print STDERR "\n";
}
my $expected_id_found = $id eq $$expected_id;
return ($gap_found, $expected_id_found);
}
sub report_if_duplicate {
my $id = shift;
state $prev_id;
my $duplicate_found = $id eq $prev_id;
$prev_id = $id;
print STDERR "Duplicate: ".$id."\n" if $duplicate_found;
return $duplicate_found;
}
sub report_inconsistencies {
my @lines = @_;
my $expected_id = "a0";
my @augmented_lines;
my $errors_found;
foreach (@lines) {
my ($id) = split(/\t/, $_);
my $duplicate_found = report_if_duplicate $id;
if ($duplicate_found) {
$_ = "# Duplicate: ".$_;
$errors_found = 1;
}
my ($gap_found, $expected_id_found) = report_gap($id, \$expected_id);
if ($expected_id_found) {
$expected_id = next_id($expected_id);
}
if ($gap_found) {
push(@augmented_lines, "# Gap");
$errors_found = 1;
}
push(@augmented_lines, $_);
}
if ($errors_found) {
push(@augmented_lines, "");
push(@augmented_lines, "# Errors found, see comments above.");
}
return @augmented_lines;
}
sub timestamp {
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)=localtime(time);
my $timestamp = sprintf("%04d-%02d-%02d %02d:%02d:%02d",
$year+1900,$mon+1,$mday,$hour,$min,$sec);
return $timestamp;
}
sub append_timestamp {
my @lines = @_;
push(@lines, "");
push(@lines, "# Last update: " . timestamp);
return @lines;
}
sub clean_up {
$_[0] =~ s/\r|\n//g;
$_[0] =~ s/^\s+|\s+$//g;
}
sub summarize {
print <<~EOF;
Finished parsing: $doc_dir
Wrote output to: $output_filename
Maximum file size searched: $max_file_size KiB
If there are missing IDs, consider increasing file size for searching.
EOF
}
sub write_output {
open(OUT, '>', $output_filename) or die $!;
print OUT <<~EOF;
# DocURL cache. For information about f76.eu DocURLs and how to
# rebuild the cache, see:
#
# http://f76.eu/a0
#
# This file was generated by:
#
# $0
#
# Felix E. Klee <felix.klee\@inka.de>
EOF
print OUT join("\n", @_);
print OUT "\n";
close OUT;
}
# See documentation of Gnu `find`: When an upper size limit is
# specified, it does not include the limit. Instead the maximum is
# one unit below that.
sub find_size {
return -($max_file_size + 1)."k";
}
sub parse_docs {
my $find_cmd =
'find -type f -size '.find_size.' -exec grep -Hi "^DocURL:" {} \;';
my @lines;
my @ids;
open(FIND, "$find_cmd |");
while (<FIND>) {
my $line = format_line($_);
clean_up $line;
print "$line\n";
push(@lines, $line);
}
return @lines;
}
($doc_dir, $max_file_size) = read_config;
chdir $doc_dir || die "Cannot enter " . $doc_dir;
my @lines = parse_docs;
my @sorted_lines = sort compare_lines @lines;
my @augmented_lines = report_inconsistencies(@sorted_lines);
@augmented_lines = append_timestamp(@augmented_lines);
write_output @augmented_lines;
summarize;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment