Skip to content

Instantly share code, notes, and snippets.

@simonhf
Last active October 29, 2019 19:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save simonhf/6bb7f92824ab331e77eec70c0984340d to your computer and use it in GitHub Desktop.
Save simonhf/6bb7f92824ab331e77eec70c0984340d to your computer and use it in GitHub Desktop.

Experiments with libarchive read blocking: Part 2

  • Note: This is a continuation from Part 1 [1].

[1] Experiments with libarchive read blocking: Part 1

How to comprehend libarchive internally?

There are too many files and lines of code (LOC) to start reading file by file

  • Production and test files:
$ find . -type f | egrep "\.[ch]" | egrep -v test | wc -l
236
$ find . -type f | egrep "\.[ch]" | egrep    test | wc -l
444
  • Production and test file lines of code:
$ find . -type f | egrep "\.[ch]" | egrep -v test | xargs wc -l | tail -1
 197,302 total
$ find . -type f | egrep "\.[ch]" | egrep    test | xargs wc -l | tail -1
  80,548 total

How to auto instrument librachive

  • Ideally I'd like to auto instrument libarchive, so that when running the test code in part 1, I can see which internal functions need to be examined.
  • Once in the context of the internal functions, I can start manually instrumenting libarchive to discover more internal details.
  • Build libarchive using this experimental script [1] which auto instruments the C source code.

[1] https://gist.github.com/simonhf/35a5d45d1498c0e451b579a0edd46768

$ # copy cwrap.pl into libarchive folder
$ make clean ; find . -type f | egrep "\.cwrap" | xargs rm
$ make CC="perl cwrap.pl gcc"
$ find . -type f | egrep "cwrap.2.c" | xargs egrep "extern unsigned long long cwrap_count_call_" | wc -l # total functions auto instrumented
2,102
$ find . -type f | egrep "cwrap.2.c" | xargs egrep "extern unsigned long long cwrap_count_call_" | egrep call_archive_read_open # example API functions auto instrumented
./libarchive/archive_read_open_memory.cwrap.2.c:extern unsigned long long cwrap_count_call_archive_read_open_memory;
./libarchive/archive_read_open_memory.cwrap.2.c:extern unsigned long long cwrap_count_call_archive_read_open_memory2;
./libarchive/archive_read.cwrap.2.c:extern unsigned long long cwrap_count_call_archive_read_open;
./libarchive/archive_read.cwrap.2.c:extern unsigned long long cwrap_count_call_archive_read_open2;
./libarchive/archive_read.cwrap.2.c:extern unsigned long long cwrap_count_call_archive_read_open1;
./libarchive/archive_read_open_file.cwrap.2.c:extern unsigned long long cwrap_count_call_archive_read_open_FILE;
./libarchive/archive_read_open_fd.cwrap.2.c:extern unsigned long long cwrap_count_call_archive_read_open_fd;
./libarchive/archive_read_open_filename.cwrap.2.c:extern unsigned long long cwrap_count_call_archive_read_open_file;
./libarchive/archive_read_open_filename.cwrap.2.c:extern unsigned long long cwrap_count_call_archive_read_open_filename;
./libarchive/archive_read_open_filename.cwrap.2.c:extern unsigned long long cwrap_count_call_archive_read_open_filenames;
./libarchive/archive_read_open_filename.cwrap.2.c:extern unsigned long long cwrap_count_call_archive_read_open_filename_w;

Create a C version for example2.cpp because the auto instrument script does not yet work with CPP

  • Note after running example3.c then we get the same output as with example2.cpp:
$ cp example2.cpp example3.c
$ # hack cpp to c in order to work with cwrap
$ perl cwrap.pl gcc -DFROM_STREAM -Ilibarchive -O2 -fPIC -c -g -o example3.o example3.c
$ perl cwrap.pl gcc -DFROM_STREAM -Ilibarchive -fPIC -o example3 example3.o libarchive/*.o -lz -lexpat -lcrypto
$ strace -c ./example3 -r test_1.tar.gz
     - /* for each archive file (1): read off disk, and archive_read_open() */
0=id - attempting to open: test_1.tar.gz
0=id - read 1,689,852 bytes into id_buff_archive[0]
0=id - archive_read_new() {}
0=id - archive_read_open() {
0=id   - libarchiveRead() {} = 7=bytes_available // callback
0=id   - libarchiveRead() {} = 7=bytes_available // callback
0=id   - libarchiveRead() {} = 7=bytes_available // callback
0=id   - libarchiveRead() {} = 262,144=bytes_available // callback
0=id   } = ARCHIVE_OK
     - /* for each archive file: archive_read_next_header() and archive_read_extract() */
0=id - archive_read_next_header() {} = ARCHIVE_OK
0=id - archive_entry_pathname(entry) {} = .libs/libarchive_test_1a.a // file to extract
0=id - archive_read_extract() {
0=id   - libarchiveRead() {} = 262,144=bytes_available // callback
0=id   - libarchiveRead() {} = 262,144=bytes_available // callback
0=id   - libarchiveRead() {} = 262,144=bytes_available // callback
0=id   - libarchiveRead() {} = 262,144=bytes_available // callback
0=id   - libarchiveRead() {} = 262,144=bytes_available // callback
0=id   - libarchiveRead() {} = 116,967=bytes_available // callback
0=id   } = ARCHIVE_OK
     - /* for each archive file: archive_read_next_header() and archive_read_extract() */
0=id - archive_read_next_header() {} = ARCHIVE_OK
0=id - archive_entry_pathname(entry) {} = .libs/libarchive_test_1b.a // file to extract
0=id - archive_read_extract() {
0=id   - libarchiveRead() {} = 0=bytes_available // callback
0=id   } = ARCHIVE_OK
     - /* for each archive file: archive_read_next_header() and archive_read_extract() */
0=id - archive_read_next_header() {} = ARCHIVE_EOF
     - /* for each archive file: archive_read_close() and archive_read_free() */
0=id - archive_read_close() {}
0=id - archive_read_free() {}
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 97.86    0.000504           0      3818           write
  2.14    0.000011           0      3690           gettid
  0.00    0.000000           0         6           read
  0.00    0.000000           0        13         2 open
  0.00    0.000000           0        10           close
  0.00    0.000000           0         9           fstat
  0.00    0.000000           0         2           lstat
  0.00    0.000000           0        18           mmap
  0.00    0.000000           0        12           mprotect
  0.00    0.000000           0         1           munmap
  0.00    0.000000           0         4           brk
  0.00    0.000000           0         7         7 access
  0.00    0.000000           0         1           execve
  0.00    0.000000           0         2           fcntl
  0.00    0.000000           0         2           unlink
  0.00    0.000000           0         6           umask
  0.00    0.000000           0         1           geteuid
  0.00    0.000000           0         1           arch_prctl
  0.00    0.000000           0         2           utimensat
------ ----------- ----------- --------- --------- ----------------
100.00    0.000515                  7605         9 total
  • However the auto instrumentation code has written its own log:
$ cat example3.cwrap.out | wc -l
5,362
  • Example auto instrumentation from the start of execution for example3.c:
$ head example3.cwrap.out
T23429 0.000000s + archive_read_new() {
T23429 0.000427s   + archive_entry_new2() {} = ? // #1
T23429 0.000743s   } = ? // archive_read_new() #1
T23429 0.000854s + archive_read_support_format_all() {
T23429 0.000890s   + __archive_check_magic() {} = 0 // #1
T23429 0.001049s   + archive_read_support_format_ar() {
T23429 0.001086s     + __archive_check_magic() {} = 0 // #2
T23429 0.001243s     + __archive_read_register_format() {
T23429 0.001280s       + __archive_check_magic() {} = 0 // #3
T23429 0.001435s       } = 0 // __archive_read_register_format() #1
  • Example auto instrumentation from the first call to archive_read_open():
$ egrep "\+ archive_read_open" --after-context=50 example3.cwrap.out
T23429 0.122732s + archive_read_open() {
T23429 0.122745s   + archive_read_set_open_callback() {
T23429 0.122771s     + __archive_check_magic() {} = 0 // #46
T23429 0.122811s     } = 0 // archive_read_set_open_callback() #1
T23429 0.122837s   + archive_read_set_read_callback() {
T23429 0.122847s     + __archive_check_magic() {} = 0 // #47
T23429 0.122888s     } = 0 // archive_read_set_read_callback() #1
T23429 0.122913s   + archive_read_set_close_callback() {
T23429 0.122923s     + __archive_check_magic() {} = 0 // #48
T23429 0.122964s     } = 0 // archive_read_set_close_callback() #1
T23429 0.122989s   + archive_read_set_callback_data() {
T23429 0.122999s     + archive_read_set_callback_data2() {
T23429 0.123024s       + __archive_check_magic() {} = 0 // #49
T23429 0.123065s       } = 0 // archive_read_set_callback_data2() #1
T23429 0.123090s     } = 0 // archive_read_set_callback_data() #1
T23429 0.123115s   + archive_read_open1() {
T23429 0.123125s     + __archive_check_magic() {} = 0 // #50
T23429 0.123166s     + archive_clear_error() {} // #3
T23429 0.123193s     + choose_filters() {
T23429 0.123203s       + __archive_read_filter_ahead() {
T23429 0.123239s         - libarchive_read_callback(0=id) {} = 7=bytes_available
T23429 0.123295s         - libarchive_read_callback(0=id) {} = 7=bytes_available
T23429 0.123321s         } // __archive_read_filter_ahead() #1
T23429 0.123348s       + __archive_read_filter_ahead() {} // #2
T23429 0.123374s       + peek_at_header() {
T23429 0.123384s         + __archive_read_filter_ahead() {} // #3
T23429 0.123425s         } = ? // peek_at_header() #1
T23429 0.123450s       + lzip_has_member() {
T23429 0.123460s         + __archive_read_filter_ahead() {} // #4
T23429 0.123501s         } = 0 // lzip_has_member() #1
T23429 0.123526s       + __archive_read_filter_ahead() {} // #5
T23429 0.123551s       + __archive_read_filter_ahead() {} // #6
T23429 0.123576s       + __archive_read_filter_ahead() {} // #7
T23429 0.123602s       + bid_get_line() {
T23429 0.123612s         + get_line() {} = ? // #1
T23429 0.123653s         } = ? // bid_get_line() #1
T23429 0.123679s       + __archive_read_filter_ahead() {} // #8
T23429 0.123704s       + __archive_read_filter_ahead() {} // #9
T23429 0.123729s       + __archive_read_filter_ahead() {} // #10
T23429 0.123754s       + __archive_read_filter_ahead() {} // #11
T23429 0.123780s       + __archive_read_filter_ahead() {} // #12
T23429 0.123805s       + __archive_read_filter_ahead() {} // #13
T23429 0.123833s       + __archive_read_filter_ahead() {
T23429 0.123844s         + consume_header() {
T23429 0.123869s           + peek_at_header() {
T23429 0.123894s             + __archive_read_filter_ahead() {} // #15
T23429 0.123934s             } = ? // peek_at_header() #2
T23429 0.123960s           + __archive_read_filter_consume() {
T23429 0.123970s             + advance_file_pointer() {} = ? // #1
T23429 0.124019s             } = ? // __archive_read_filter_consume() #1
T23429 0.124045s           + __archive_read_filter_ahead() {} // #16
T23429 0.124074s           } = 0 // consume_header() #1
T23429 0.124100s         + __archive_read_filter_ahead() {} // #17
T23429 0.124130s         + __archive_read_filter_consume() {
T23429 0.124140s           + advance_file_pointer() {} = ? // #2
T23429 0.124181s           } = ? // __archive_read_filter_consume() #2
T23429 0.124206s         + __archive_read_filter_ahead() {
T23429 0.124226s           - libarchive_read_callback(0=id) {} = 7=bytes_available
T23429 0.124267s           } // __archive_read_filter_ahead() #18
T23429 0.124295s         + __archive_read_filter_consume() {
T23429 0.124305s           + advance_file_pointer() {} = ? // #3
T23429 0.124346s           } = ? // __archive_read_filter_consume() #3
T23429 0.124371s         + __archive_read_filter_ahead() {
T23429 0.124392s           - libarchive_read_callback(0=id) {} = 262,144=bytes_available
T23429 0.124433s           } // __archive_read_filter_ahead() #19
T23429 0.124707s         + __archive_read_filter_consume() {

Add more instrumentation manually to comprehend further

Track changes to source files with git status

  • Add *cwrap* to .gitignore to ignore the many auto intrumentated files:
$ git status
On branch master
Your branch is up-to-date with 'origin/master'.
Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git checkout -- <file>..." to discard changes in working directory)

        modified:   .gitignore

Untracked files:
  (use "git add <file>..." to include in what will be committed)

        example3
        example3.c
        test_1.tar.gz
        test_2.tar.gz

no changes added to commit (use "git add" and/or "git commit -a")

Adding example manual instrumentation the archive_read_open1() function

  • First figure out in which file the archive_read_open1() function is:
$ find . -type f | egrep "\.aux$" | xargs cat | egrep ":NF" | egrep "archive_read_open1 "
/* libarchive/archive_read.c:461:NF */ extern int archive_read_open1 (struct archive *_a); /* (_a) struct archive *_a; */
  • Edit the file and as an example, promote an existing source code comment to instrumentation:
$ git diff -w libarchive/archive_read.c
diff --git a/libarchive/archive_read.c b/libarchive/archive_read.c
index 5872601..967221f 100644
--- a/libarchive/archive_read.c
+++ b/libarchive/archive_read.c
@@ -510,7 +510,7 @@ archive_read_open1(struct archive *_a)
     if (!a->filter || !a->bypass_filter_bidding)
     {
         a->filter = filter;
-               /* Build out the input pipeline. */
+        // cwrap_log("- Build out the input pipeline.");
         e = choose_filters(a);
         if (e < ARCHIVE_WARN) {
             a->archive.state = ARCHIVE_STATE_FATAL;
  • Recompile and re-run:

  • Note: Makefile still functions as expected even though there are tons of extra files for the instrumentation kicking around.

$ make CC="perl cwrap.pl gcc"
make  all-am
make[1]: Entering directory '/home/simon/20191016-libarchive/libarchive'
  CC       libarchive/archive_read.lo
Sat Oct 26 17:34:26 2019 0.742719 cwrap: gcc .. >                                      libarchive/archive_read.o.cwrap.cc.out;   54 functions,    0 warnings
  CCLD     libarchive.la
ar: `u' modifier ignored since `D' is the default (see `U')
  CCLD     bsdtar
Sat Oct 26 17:34:28 2019 0.227931 cwrap: gcc .. >                                                               .cwrap.cc.out;    7 objects  ,    2 libs
  CCLD     bsdcpio
Sat Oct 26 17:34:29 2019 0.207650 cwrap: gcc .. >                                                               .cwrap.cc.out;    2 objects  ,    2 libs
  CCLD     bsdcat
Sat Oct 26 17:34:29 2019 0.087651 cwrap: gcc .. >                                                               .cwrap.cc.out;    2 objects  ,    2 libs
make[1]: Leaving directory '/home/simon/20191016-libarchive/libarchive'
$ perl cwrap.pl gcc -DFROM_STREAM -Ilibarchive -O2 -fPIC -c -g -o example3.o example3.c
$ perl cwrap.pl gcc -DFROM_STREAM -Ilibarchive -fPIC -o example3 example3.o libarchive/*.o -lz -lexpat -lcrypto
$ strace -c ./example3 -r test_1.tar.gz
$ egrep "\+ archive_read_open1" --after-context=10 example3.cwrap.out
T25514 0.085880s   + archive_read_open1() {
T25514 0.085890s     + __archive_check_magic() {} = 0 // #50
T25514 0.085930s     + archive_clear_error() {} // #3
T25514 0.085955s     - Build out the input pipeline.                 <-- freshly baked manual instrumentation
T25514 0.085981s     + choose_filters() {
T25514 0.085991s       + __archive_read_filter_ahead() {
T25514 0.086027s         - libarchive_read_callback(0=id) {} = 7=bytes_available
T25514 0.086082s         - libarchive_read_callback(0=id) {} = 7=bytes_available
T25514 0.086108s         } // __archive_read_filter_ahead() #1
T25514 0.086135s       + __archive_read_filter_ahead() {} // #2
T25514 0.086160s       + peek_at_header() {

Create a simple shell script to recompile and run after changes

$ cat doit.sh
make CC="perl cwrap.pl gcc"
perl cwrap.pl gcc -DFROM_STREAM -Ilibarchive -O2 -fPIC -c -g -o example3.o example3.c
perl cwrap.pl gcc -DFROM_STREAM -Ilibarchive -fPIC -o example3 example3.o libarchive/*.o -lz -lexpat -lcrypto
strace -c ./example3 -r test_1.tar.gz
egrep "\+ archive_read_open1" --after-context=50 example3.cwrap.out

Received very welcome feedback via libarchive github issue [1]

You're right that the libarchive API does not currently support asynchronous operation.
It would indeed be a pretty big change, since it would likely require major alterations
to every single format and filter handler.

If you want to pursue this, I would suggest first building a stripped-down libarchive
that only contains the tar format and no filter handlers. (You should be able to do
this simply by deleting all of the other format and filter source files.) That will let
you experiment with various approaches without having to understand every part of
libarchive. Then add the gzip filter and see if you can get that to work correctly.

Good luck!

[1] how to get non-blocking behaviour when reading larger archive streams?

Investigate where current read callback is called from inside archive_read_open1()

  • Figure out in which file the choose_filters() function is, because it calls our read callback function:
$ find . -type f | egrep "\.aux$" | xargs cat | egrep ":NF" | egrep "choose_filters "
/* libarchive/archive_read.c:557:NF */ static int choose_filters (struct archive_read *a); /* (a) struct archive_read *a; */
  • Figure out in which file the choose_format() function is, because it calls our read callback function:
$ find . -type f | egrep "\.aux$" | xargs cat | egrep ":NF" | egrep "choose_format "
/* libarchive/archive_read.c:712:NF */ static int choose_format (struct archive_read *a); /* (a) struct archive_read *a; */
  • Figure out in which file the __archive_read_filter_ahead() function is, because it calls our read callback function:
$ find . -type f | egrep "\.aux$" | xargs cat | egrep ":NF" | egrep "__archive_read_filter_ahead "
/* libarchive/archive_read.c:1321:NF */ extern const void *__archive_read_filter_ahead (struct archive_read_filter *filter, size_t min, ssize_t *avail); /* (filter, min, avail) struct archive_read_filter *filter; size_t min; ssize_t *avail; */
  • Finding the __archive_read_filter_ahead() function rewards us with a larger source comment about the read ahead mechanism in general [1].
  • Note: We probably want to update this comment if we ever end up change the code to be async.
  • Add in various manual instrumentation.
  • The function archive_read_open1() now looks like this:
  • Note: You can see that each choose_filters() iterates through many filter handlers.
  • Note: Each filter handlers determines itself if many client raw bytes need to be read via the callback.
$ ./doit.sh
...
T29590 0.054947s   + archive_read_open1() {
T29590 0.054957s     + __archive_check_magic() {} = 0 // #50
T29590 0.054997s     + archive_clear_error() {} // #3
T29590 0.055023s     - Open data source.
T29590 0.055048s     - Build out the input pipeline.
T29590 0.055073s     + choose_filters() {
T29590 0.055083s       - Allow each registered stream transform to bid on whether it wants to handle this stream.  Repeat until we've finished building the pipeline.
T29590 0.055123s       - for number_filters=0 of 25; for bidder i=0 of 16; calling bidder bzip2
T29590 0.055150s       + bzip2_reader_bid() {
T29590 0.055160s         + __archive_read_filter_ahead() {
T29590 0.055185s           - min=14 filter->avail=0 filter->buffer_size=0 filter->client_total=0; Come here if we've used up the client data, get more.; call(filter->read);
T29590 0.055211s           + client_read_proxy() {
T29590 0.055248s             - libarchive_read_callback(0=id) {} = 7=bytes_available
T29590 0.055289s             } = ? // client_read_proxy() #1
T29590 0.055314s           - bytes_read=7
T29590 0.055339s           - min=14 filter->avail=0 filter->buffer_size=0 filter->client_total=7; Come here if we can't satisfy the request from the copy buffer or the existing client data.
T29590 0.055357s           - min=14 filter->avail=7 filter->buffer_size=14 filter->client_total=7; Come here if we've used up the client data, get more.; call(filter->read);
T29590 0.055384s           + client_read_proxy() {
T29590 0.055420s             - libarchive_read_callback(0=id) {} = 7=bytes_available
T29590 0.055461s             } = ? // client_read_proxy() #2
T29590 0.055486s           - bytes_read=7
T29590 0.055511s           - min=14 filter->avail=7 filter->buffer_size=14 filter->client_total=7; Come here if we can't satisfy the request from the copy buffer or the existing client data.
T29590 0.055521s           - min=14 filter->avail=14 filter->buffer_size=14 filter->client_total=7; Come here if we can satisfy from the copy buffer.
T29590 0.055547s           } // __archive_read_filter_ahead() #1
T29590 0.055599s         } = 0 // bzip2_reader_bid() #1
T29590 0.055624s       - for number_filters=0 of 25; for bidder i=1 of 16; calling bidder compress (.Z)
T29590 0.055650s       + compress_bidder_bid() {
T29590 0.055660s         + __archive_read_filter_ahead() {
T29590 0.055685s           - min=3 filter->avail=14 filter->buffer_size=14 filter->client_total=7; Come here if we can satisfy from the copy buffer.
T29590 0.055710s           } // __archive_read_filter_ahead() #2
T29590 0.055751s         } = 0 // compress_bidder_bid() #1
T29590 0.055776s       - for number_filters=0 of 25; for bidder i=2 of 16; calling bidder gzip
T29590 0.055801s       + gzip_bidder_bid() {
T29590 0.055811s         + peek_at_header() {
T29590 0.055837s           + __archive_read_filter_ahead() {
T29590 0.055862s             - min=10 filter->avail=14 filter->buffer_size=14 filter->client_total=7; Come here if we can satisfy from the copy buffer.
T29590 0.055887s             } // __archive_read_filter_ahead() #3
T29590 0.055928s           } = ? // peek_at_header() #1
T29590 0.055953s         } = 27 // gzip_bidder_bid() #1
T29590 0.055978s       - new best bidder because bid=27 > best_bid=0
T29590 0.056004s       - for number_filters=0 of 25; for bidder i=3 of 16; calling bidder lzip
T29590 0.056029s       + lzip_bidder_bid() {
T29590 0.056039s         + lzip_has_member() {
T29590 0.056064s           + __archive_read_filter_ahead() {
T29590 0.056089s             - min=6 filter->avail=14 filter->buffer_size=14 filter->client_total=7; Come here if we can satisfy from the copy buffer.
T29590 0.056114s             } // __archive_read_filter_ahead() #4
T29590 0.056156s           } = 0 // lzip_has_member() #1
T29590 0.056181s         } = 0 // lzip_bidder_bid() #1
T29590 0.056206s       - for number_filters=0 of 25; for bidder i=4 of 16; calling bidder lzma
T29590 0.056231s       + lzma_bidder_bid() {
T29590 0.056241s         + __archive_read_filter_ahead() {
T29590 0.056266s           - min=14 filter->avail=14 filter->buffer_size=14 filter->client_total=7; Come here if we can satisfy from the copy buffer.
T29590 0.056295s           } // __archive_read_filter_ahead() #5
T29590 0.056338s         } = 0 // lzma_bidder_bid() #1
T29590 0.056363s       - for number_filters=0 of 25; for bidder i=5 of 16; calling bidder xz
T29590 0.056388s       + xz_bidder_bid() {
T29590 0.056398s         + __archive_read_filter_ahead() {
T29590 0.056422s           - min=6 filter->avail=14 filter->buffer_size=14 filter->client_total=7; Come here if we can satisfy from the copy buffer.
T29590 0.056448s           } // __archive_read_filter_ahead() #6
T29590 0.056488s         } = 0 // xz_bidder_bid() #1
T29590 0.056513s       - for number_filters=0 of 25; for bidder i=6 of 16; calling bidder uu
T29590 0.056539s       + uudecode_bidder_bid() {
T29590 0.056549s         + __archive_read_filter_ahead() {
T29590 0.056573s           - min=1 filter->avail=14 filter->buffer_size=14 filter->client_total=7; Come here if we can satisfy from the copy buffer.
T29590 0.056599s           } // __archive_read_filter_ahead() #7
T29590 0.056639s         + bid_get_line() {
T29590 0.056649s           + get_line() {} = ? // #1
T29590 0.056690s           } = ? // bid_get_line() #1
T29590 0.056715s         } = 0 // uudecode_bidder_bid() #1
T29590 0.056740s       - for number_filters=0 of 25; for bidder i=7 of 16; calling bidder rpm
T29590 0.056765s       + rpm_bidder_bid() {
T29590 0.056775s         + __archive_read_filter_ahead() {
T29590 0.056800s           - min=8 filter->avail=14 filter->buffer_size=14 filter->client_total=7; Come here if we can satisfy from the copy buffer.
T29590 0.056825s           } // __archive_read_filter_ahead() #8
T29590 0.056865s         } = 0 // rpm_bidder_bid() #1
T29590 0.056890s       - for number_filters=0 of 25; for bidder i=8 of 16; calling bidder lrzip
T29590 0.056916s       + lrzip_bidder_bid() {
T29590 0.056925s         + __archive_read_filter_ahead() {
T29590 0.056951s           - min=6 filter->avail=14 filter->buffer_size=14 filter->client_total=7; Come here if we can satisfy from the copy buffer.
T29590 0.056976s           } // __archive_read_filter_ahead() #9
T29590 0.057019s         } = 0 // lrzip_bidder_bid() #1
T29590 0.057044s       - for number_filters=0 of 25; for bidder i=9 of 16; calling bidder (null)
T29590 0.057070s       + lzop_bidder_bid() {
T29590 0.057079s         + __archive_read_filter_ahead() {
T29590 0.057104s           - min=9 filter->avail=14 filter->buffer_size=14 filter->client_total=7; Come here if we can satisfy from the copy buffer.
T29590 0.057130s           } // __archive_read_filter_ahead() #10
T29590 0.057170s         } = 0 // lzop_bidder_bid() #1
T29590 0.057195s       - for number_filters=0 of 25; for bidder i=10 of 16; calling bidder (null)
T29590 0.057220s       + grzip_bidder_bid() {
T29590 0.057230s         + __archive_read_filter_ahead() {
T29590 0.057256s           - min=12 filter->avail=14 filter->buffer_size=14 filter->client_total=7; Come here if we can satisfy from the copy buffer.
T29590 0.057281s           } // __archive_read_filter_ahead() #11
T29590 0.057321s         } = 0 // grzip_bidder_bid() #1
T29590 0.057346s       - for number_filters=0 of 25; for bidder i=11 of 16; calling bidder lz4
T29590 0.057372s       + lz4_reader_bid() {
T29590 0.057382s         + __archive_read_filter_ahead() {
T29590 0.057407s           - min=11 filter->avail=14 filter->buffer_size=14 filter->client_total=7; Come here if we can satisfy from the copy buffer.
T29590 0.057432s           } // __archive_read_filter_ahead() #12
T29590 0.057472s         } = 0 // lz4_reader_bid() #1
T29590 0.057497s       - for number_filters=0 of 25; for bidder i=12 of 16; calling bidder zstd
T29590 0.057523s       + zstd_bidder_bid() {
T29590 0.057533s         + __archive_read_filter_ahead() {
T29590 0.057558s           - min=4 filter->avail=14 filter->buffer_size=14 filter->client_total=7; Come here if we can satisfy from the copy buffer.
T29590 0.057583s           } // __archive_read_filter_ahead() #13
T29590 0.057624s         } = 0 // zstd_bidder_bid() #1
T29590 0.057649s       - for number_filters=0 of 25; for bidder i=13 of 16; no bidder available
T29590 0.057674s       - for number_filters=0 of 25; for bidder i=14 of 16; no bidder available
T29590 0.057699s       - for number_filters=0 of 25; for bidder i=15 of 16; no bidder available
T29590 0.057725s       + gzip_bidder_init() {} = 0 // #1
T29590 0.057754s       - for number_filters=1 of 25; for bidder i=0 of 16; calling bidder bzip2
T29590 0.057779s       + bzip2_reader_bid() {
T29590 0.057790s         + __archive_read_filter_ahead() {
T29590 0.057815s           - min=14 filter->avail=0 filter->buffer_size=0 filter->client_total=0; Come here if we've used up the client data, get more.; call(filter->read);
T29590 0.057841s           + gzip_filter_read() {
T29590 0.057866s             + consume_header() {
T29590 0.057891s               + peek_at_header() {
T29590 0.057916s                 + __archive_read_filter_ahead() {
T29590 0.057941s                   - min=10 filter->avail=14 filter->buffer_size=14 filter->client_total=7; Come here if we can satisfy from the copy buffer.
T29590 0.057966s                   } // __archive_read_filter_ahead() #15
T29590 0.058007s                 } = ? // peek_at_header() #2
T29590 0.058034s               + __archive_read_filter_consume() {
T29590 0.058043s                 + advance_file_pointer() {} = ? // #1
T29590 0.058084s                 } = ? // __archive_read_filter_consume() #1
T29590 0.058110s               + __archive_read_filter_ahead() {
T29590 0.058119s                 - min=1 filter->avail=4 filter->buffer_size=14 filter->client_total=7; Come here if we can satisfy from the copy buffer.
T29590 0.058145s                 } // __archive_read_filter_ahead() #16
T29590 0.058189s               } = 0 // consume_header() #1
T29590 0.058214s             + __archive_read_filter_ahead() {
T29590 0.058224s               - min=1 filter->avail=4 filter->buffer_size=14 filter->client_total=7; Come here if we can satisfy from the copy buffer.
T29590 0.058250s               } // __archive_read_filter_ahead() #17
T29590 0.058303s             + __archive_read_filter_consume() {
T29590 0.058313s               + advance_file_pointer() {} = ? // #2
T29590 0.058354s               } = ? // __archive_read_filter_consume() #2
T29590 0.058379s             + __archive_read_filter_ahead() {
T29590 0.058389s               - min=1 filter->avail=0 filter->buffer_size=14 filter->client_total=7; Come here if moving data forward in copy buffer.; Come here if we've used up the client data, get more.; call(filter->read);
T29590 0.058414s               + client_read_proxy() {
T29590 0.058451s                 - libarchive_read_callback(0=id) {} = 7=bytes_available
T29590 0.058492s                 } = ? // client_read_proxy() #3
T29590 0.058517s               - bytes_read=7
T29590 0.058542s               - min=1 filter->avail=0 filter->buffer_size=14 filter->client_total=7; Come here if we can satisfy from client buffer.
T29590 0.058552s               } // __archive_read_filter_ahead() #18
T29590 0.058596s             + __archive_read_filter_consume() {
T29590 0.058605s               + advance_file_pointer() {} = ? // #3
T29590 0.058646s               } = ? // __archive_read_filter_consume() #3
T29590 0.058671s             + __archive_read_filter_ahead() {
T29590 0.058681s               - min=1 filter->avail=0 filter->buffer_size=14 filter->client_total=7; Come here if we've used up the client data, get more.; call(filter->read);
T29590 0.058706s               + client_read_proxy() {
T29590 0.058742s                 - libarchive_read_callback(0=id) {} = 262,144=bytes_available
T29590 0.058783s                 } = ? // client_read_proxy() #4
T29590 0.058807s               - bytes_read=262144
T29590 0.058833s               - min=1 filter->avail=0 filter->buffer_size=14 filter->client_total=262144; Come here if we can satisfy from client buffer.
T29590 0.058843s               } // __archive_read_filter_ahead() #19
T29590 0.059137s             + __archive_read_filter_consume() {
T29590 0.059148s               + advance_file_pointer() {} = ? // #4
T29590 0.059191s               } = ? // __archive_read_filter_consume() #4
T29590 0.059216s             } = ? // gzip_filter_read() #1
T29590 0.059241s           - bytes_read=65536
T29590 0.059266s           - min=14 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.059277s           } // __archive_read_filter_ahead() #19
T29590 0.059318s         } = 0 // bzip2_reader_bid() #2
T29590 0.059343s       - for number_filters=1 of 25; for bidder i=1 of 16; calling bidder compress (.Z)
T29590 0.059368s       + compress_bidder_bid() {
T29590 0.059378s         + __archive_read_filter_ahead() {
T29590 0.059403s           - min=3 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.059428s           } // __archive_read_filter_ahead() #20
T29590 0.059469s         } = 0 // compress_bidder_bid() #2
T29590 0.059494s       - for number_filters=1 of 25; for bidder i=2 of 16; calling bidder gzip
T29590 0.059519s       + gzip_bidder_bid() {
T29590 0.059528s         + peek_at_header() {
T29590 0.059553s           + __archive_read_filter_ahead() {
T29590 0.059578s             - min=10 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.059604s             } // __archive_read_filter_ahead() #21
T29590 0.059644s           } = ? // peek_at_header() #3
T29590 0.059669s         } = 0 // gzip_bidder_bid() #2
T29590 0.059695s       - for number_filters=1 of 25; for bidder i=3 of 16; calling bidder lzip
T29590 0.059720s       + lzip_bidder_bid() {
T29590 0.059730s         + lzip_has_member() {
T29590 0.059754s           + __archive_read_filter_ahead() {
T29590 0.059779s             - min=6 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.059804s             } // __archive_read_filter_ahead() #22
T29590 0.059856s           } = 0 // lzip_has_member() #2
T29590 0.059881s         } = 0 // lzip_bidder_bid() #2
T29590 0.059906s       - for number_filters=1 of 25; for bidder i=4 of 16; calling bidder lzma
T29590 0.059931s       + lzma_bidder_bid() {
T29590 0.059941s         + __archive_read_filter_ahead() {
T29590 0.059966s           - min=14 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.059992s           } // __archive_read_filter_ahead() #23
T29590 0.060032s         } = 0 // lzma_bidder_bid() #2
T29590 0.060057s       - for number_filters=1 of 25; for bidder i=5 of 16; calling bidder xz
T29590 0.060082s       + xz_bidder_bid() {
T29590 0.060092s         + __archive_read_filter_ahead() {
T29590 0.060117s           - min=6 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.060143s           } // __archive_read_filter_ahead() #24
T29590 0.060183s         } = 0 // xz_bidder_bid() #2
T29590 0.060208s       - for number_filters=1 of 25; for bidder i=6 of 16; calling bidder uu
T29590 0.060234s       + uudecode_bidder_bid() {
T29590 0.060244s         + __archive_read_filter_ahead() {
T29590 0.060269s           - min=1 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.060295s           } // __archive_read_filter_ahead() #25
T29590 0.060337s         + bid_get_line() {
T29590 0.060347s           + get_line() {} = ? // #2
T29590 0.060388s           } = ? // bid_get_line() #2
T29590 0.060413s         } = 0 // uudecode_bidder_bid() #2
T29590 0.060438s       - for number_filters=1 of 25; for bidder i=7 of 16; calling bidder rpm
T29590 0.060463s       + rpm_bidder_bid() {
T29590 0.060473s         + __archive_read_filter_ahead() {
T29590 0.060498s           - min=8 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.060524s           } // __archive_read_filter_ahead() #26
T29590 0.060564s         } = 0 // rpm_bidder_bid() #2
T29590 0.060589s       - for number_filters=1 of 25; for bidder i=8 of 16; calling bidder lrzip
T29590 0.060614s       + lrzip_bidder_bid() {
T29590 0.060624s         + __archive_read_filter_ahead() {
T29590 0.060649s           - min=6 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.060674s           } // __archive_read_filter_ahead() #27
T29590 0.060714s         } = 0 // lrzip_bidder_bid() #2
T29590 0.060739s       - for number_filters=1 of 25; for bidder i=9 of 16; calling bidder (null)
T29590 0.060765s       + lzop_bidder_bid() {
T29590 0.060774s         + __archive_read_filter_ahead() {
T29590 0.060799s           - min=9 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.060824s           } // __archive_read_filter_ahead() #28
T29590 0.060865s         } = 0 // lzop_bidder_bid() #2
T29590 0.060890s       - for number_filters=1 of 25; for bidder i=10 of 16; calling bidder (null)
T29590 0.060916s       + grzip_bidder_bid() {
T29590 0.060925s         + __archive_read_filter_ahead() {
T29590 0.060950s           - min=12 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.060975s           } // __archive_read_filter_ahead() #29
T29590 0.061016s         } = 0 // grzip_bidder_bid() #2
T29590 0.061041s       - for number_filters=1 of 25; for bidder i=11 of 16; calling bidder lz4
T29590 0.061066s       + lz4_reader_bid() {
T29590 0.061076s         + __archive_read_filter_ahead() {
T29590 0.061101s           - min=11 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.061127s           } // __archive_read_filter_ahead() #30
T29590 0.061167s         } = 0 // lz4_reader_bid() #2
T29590 0.061200s       - for number_filters=1 of 25; for bidder i=12 of 16; calling bidder zstd
T29590 0.061226s       + zstd_bidder_bid() {
T29590 0.061236s         + __archive_read_filter_ahead() {
T29590 0.061261s           - min=4 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.061286s           } // __archive_read_filter_ahead() #31
T29590 0.061327s         } = 0 // zstd_bidder_bid() #2
T29590 0.061352s       - for number_filters=1 of 25; for bidder i=13 of 16; no bidder available
T29590 0.061377s       - for number_filters=1 of 25; for bidder i=14 of 16; no bidder available
T29590 0.061402s       - for number_filters=1 of 25; for bidder i=15 of 16; no bidder available
T29590 0.061427s       - Come here if no bidder; we're done. Verify the filter by asking it for some data.
T29590 0.061453s       + __archive_read_filter_ahead() {
T29590 0.061462s         - min=1 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.061488s         } // __archive_read_filter_ahead() #32
T29590 0.061528s       - a->filter->name=gzip
T29590 0.061554s       } = 0 // choose_filters() #1
T29590 0.061579s     + choose_format() {
T29590 0.061589s       - Allow each registered format to bid on whether it wants to handle the next entry. Return index of winning bidder.
T29590 0.061629s       - for i=0 of 16; calling format
T29590 0.061655s       + archive_read_format_ar_bid() {
T29590 0.061665s         + __archive_read_ahead() {
T29590 0.061690s           + __archive_read_filter_ahead() {
T29590 0.061715s             - min=8 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.061740s             } // __archive_read_filter_ahead() #33
T29590 0.061781s           } // __archive_read_ahead() #1
T29590 0.061806s         } = -1 // archive_read_format_ar_bid() #1
T29590 0.061831s       - for i=1 of 16; calling format
T29590 0.061856s       + archive_read_format_cpio_bid() {
T29590 0.061866s         + __archive_read_ahead() {
T29590 0.061892s           + __archive_read_filter_ahead() {
T29590 0.061917s             - min=6 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.061942s             } // __archive_read_filter_ahead() #34
T29590 0.061982s           } // __archive_read_ahead() #2
T29590 0.062007s         } = -20 // archive_read_format_cpio_bid() #1
T29590 0.062033s       - for i=2 of 16; calling format
T29590 0.062058s       + archive_read_format_empty_bid() {
T29590 0.062069s         + __archive_read_ahead() {
T29590 0.062094s           + __archive_read_filter_ahead() {
T29590 0.062119s             - min=1 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.062144s             } // __archive_read_filter_ahead() #35
T29590 0.062185s           } // __archive_read_ahead() #3
T29590 0.062210s         } = -1 // archive_read_format_empty_bid() #1
T29590 0.062235s       - for i=3 of 16; calling format
T29590 0.062260s       + archive_read_format_lha_bid() {
T29590 0.062270s         + __archive_read_ahead() {
T29590 0.062295s           + __archive_read_filter_ahead() {
T29590 0.062320s             - min=22 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.062345s             } // __archive_read_filter_ahead() #36
T29590 0.062385s           } // __archive_read_ahead() #4
T29590 0.062410s         + lha_check_header_format() {} = ? // #1
T29590 0.062436s         } = 0 // archive_read_format_lha_bid() #1
T29590 0.062461s       - for i=4 of 16; calling format
T29590 0.062486s       + mtree_bid() {
T29590 0.062496s         + __archive_read_ahead() {
T29590 0.062521s           + __archive_read_filter_ahead() {
T29590 0.062546s             - min=6 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.062571s             } // __archive_read_filter_ahead() #37
T29590 0.062620s           } // __archive_read_ahead() #5
T29590 0.062645s         + detect_form() {
T29590 0.062655s           + __archive_read_ahead() {
T29590 0.062680s             + __archive_read_filter_ahead() {
T29590 0.062705s               - min=1 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.062730s               } // __archive_read_filter_ahead() #38
T29590 0.062770s             } // __archive_read_ahead() #6
T29590 0.062795s           + next_line() {
T29590 0.062805s             + get_line_size() {} = ? // #1
T29590 0.062846s             } = ? // next_line() #1
T29590 0.062872s           } = 0 // detect_form() #1
T29590 0.062897s         } = 0 // mtree_bid() #1
T29590 0.062922s       - for i=5 of 16; calling format
T29590 0.062947s       + archive_read_format_tar_bid() {
T29590 0.062958s         + __archive_read_ahead() {
T29590 0.062983s           + __archive_read_filter_ahead() {
T29590 0.063008s             - min=512 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.063033s             } // __archive_read_filter_ahead() #39
T29590 0.063074s           } // __archive_read_ahead() #7
T29590 0.063099s         + checksum() {
T29590 0.063109s           + tar_atol() {
T29590 0.063134s             + tar_atol_base_n() {} = ? // #1
T29590 0.063175s             } = ? // tar_atol() #1
T29590 0.063200s           } = 1 // checksum() #1
T29590 0.063226s         + validate_number_field() {} = 1 // #1
T29590 0.063251s         + validate_number_field() {} = 1 // #2
T29590 0.063277s         + validate_number_field() {} = 1 // #3
T29590 0.063303s         + validate_number_field() {} = 1 // #4
T29590 0.063328s         + validate_number_field() {} = 1 // #5
T29590 0.063353s         + validate_number_field() {} = 1 // #6
T29590 0.063378s         + validate_number_field() {} = 1 // #7
T29590 0.063403s         } = 106 // archive_read_format_tar_bid() #1
T29590 0.063428s       - for i=6 of 16; calling format
T29590 0.063453s       + xar_bid() {
T29590 0.063463s         + __archive_read_ahead() {
T29590 0.063488s           + __archive_read_filter_ahead() {
T29590 0.063513s             - min=28 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.063538s             } // __archive_read_filter_ahead() #40
T29590 0.063579s           } // __archive_read_ahead() #8
T29590 0.063603s         } = 0 // xar_bid() #1
T29590 0.063628s       - for i=7 of 16; calling format
T29590 0.063654s       + _warc_bid() {
T29590 0.063663s         + __archive_read_ahead() {
T29590 0.063688s           + __archive_read_filter_ahead() {
T29590 0.063713s             - min=12 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.063739s             } // __archive_read_filter_ahead() #41
T29590 0.063779s           } // __archive_read_ahead() #9
T29590 0.063804s         + _warc_rdver() {} = 0 // #1
T29590 0.063830s         } = -1 // _warc_bid() #1
T29590 0.063855s       - for i=8 of 16; calling format
T29590 0.063880s       + archive_read_format_7zip_bid() {} = -1 // #1
T29590 0.063905s       - for i=9 of 16; calling format
T29590 0.063931s       + archive_read_format_cab_bid() {} = -1 // #1
T29590 0.063956s       - for i=10 of 16; calling format
T29590 0.063982s       + archive_read_format_rar_bid() {} = -1 // #1
T29590 0.064007s       - for i=11 of 16; calling format
T29590 0.064032s       + rar5_bid() {} = -1 // #1
T29590 0.064057s       - for i=12 of 16; calling format
T29590 0.064083s       + archive_read_format_iso9660_bid() {} = -1 // #1
T29590 0.064109s       - for i=13 of 16; calling format
T29590 0.064135s       + archive_read_format_zip_streamable_bid() {
T29590 0.064144s         + __archive_read_ahead() {
T29590 0.064169s           + __archive_read_filter_ahead() {
T29590 0.064194s             - min=4 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.064220s             } // __archive_read_filter_ahead() #42
T29590 0.064268s           } // __archive_read_ahead() #10
T29590 0.064298s         } = 0 // archive_read_format_zip_streamable_bid() #1
T29590 0.064324s       - for i=14 of 16; calling format
T29590 0.064349s       + archive_read_format_zip_seekable_bid() {} = -1 // #1
T29590 0.064374s       - for i=15 of 16; no format available
T29590 0.064400s       } = 5 // choose_format() #1
T29590 0.064425s     + client_switch_proxy() {} = 0 // #1
T29590 0.064450s     } = 0 // archive_read_open1() #1
  • You can see that each choose_filters() iterates through many filter handlers; 1st bzip2, 2nd compress (.Z), 3rd gzip, etc.
  • Each filter handlers determines itself if more client raw bytes need to be read via the callback.
  • We can see this clearly for the first filter handler bzip2 which calls libarchive_read_callback() twice (in this example) because it only returned 7 bytes each time.
  • Note: Going into hand-wavy imagination mode:
  • So what if the 2nd libarchive_read_callback() call returned a value which means 'no more raw bytes at the moment'?
  • Modification #1: archive_read_open1() should be able to return prematurely without finishing the current filter handler etc.
  • Modification #2: archive_read_open1() should be able to restart and finish the current filter handler.
  • This means archive_read_open1() and choose_filters() need to be able to keep state regarding the hunt for a filter handler.
  • However the github issue response said "it would likely require major alterations to every single format and filter handler."
  • At first glance it does not look necessary to modify "every single format and filter handler"... but we will continue and see.
  • Whatever, it still looks like pretty complicated refactoring will be necessary to choose filters().
$ ./doit.sh
...
T29590 0.054947s   + archive_read_open1() {
T29590 0.054957s     + __archive_check_magic() {} = 0 // #50
T29590 0.054997s     + archive_clear_error() {} // #3
T29590 0.055023s     - Open data source.
T29590 0.055048s     - Build out the input pipeline.
T29590 0.055073s     + choose_filters() {
T29590 0.055083s       - Allow each registered stream transform to bid on whether it wants to handle this stream.  Repeat until we've finished building the pipeline.
T29590 0.055123s       - for number_filters=0 of 25; for bidder i=0 of 16; calling bidder bzip2
T29590 0.055150s       + bzip2_reader_bid() {
T29590 0.055160s         + __archive_read_filter_ahead() {
T29590 0.055185s           - min=14 filter->avail=0 filter->buffer_size=0 filter->client_total=0; Come here if we've used up the client data, get more.; call(filter->read);
T29590 0.055211s           + client_read_proxy() {
T29590 0.055248s             - libarchive_read_callback(0=id) {} = 7=bytes_available

After 1st libarchive_read_callback().

T29590 0.055289s             } = ? // client_read_proxy() #1
T29590 0.055314s           - bytes_read=7
T29590 0.055339s           - min=14 filter->avail=0 filter->buffer_size=0 filter->client_total=7; Come here if we can't satisfy the request from the copy buffer or the existing client data.
T29590 0.055357s           - min=14 filter->avail=7 filter->buffer_size=14 filter->client_total=7; Come here if we've used up the client data, get more.; call(filter->read);
T29590 0.055384s           + client_read_proxy() {
T29590 0.055420s             - libarchive_read_callback(0=id) {} = 7=bytes_available

After 2nd libarchive_read_callback(). What if this 2nd libarchive_read_callback() returned a value telling us there are no bytes available at this time?

T29590 0.055461s             } = ? // client_read_proxy() #2
T29590 0.055486s           - bytes_read=7
T29590 0.055511s           - min=14 filter->avail=7 filter->buffer_size=14 filter->client_total=7; Come here if we can't satisfy the request from the copy buffer or the existing client data.
T29590 0.055521s           - min=14 filter->avail=14 filter->buffer_size=14 filter->client_total=7; Come here if we can satisfy from the copy buffer.
T29590 0.055547s           } // __archive_read_filter_ahead() #1
T29590 0.055599s         } = 0 // bzip2_reader_bid() #1

[1] Read ahead source comment

Investigate where current read callback is called from inside archive_read_extract()

  • Within archive_read_extract(), after fiddling with the disk, the two main functions called repeatedly are archive_read_data_block() and archive_write_data_block().
  • Below we can see the first two iterations of archive_read_data_block() and archive_write_data_block(), and then later a 3rd iteration where the user read callback is called for the first time.
  • Unfortunately, the user read callback is called from within gzip_filter_read() which means the github issue response is likely true: "it would likely require major alterations to every single format and filter handler."
T29590 0.068422s + archive_read_extract() {
...
T29590 0.068822s   + archive_read_extract2() {
T29590 0.068832s     + archive_write_header() {
...
T29590 0.073431s       } = 0 // archive_write_header() #1
T29590 0.073454s     + archive_entry_size_is_set() {} = 64 // #2
T29590 0.073478s     + archive_entry_size() {} = ? // #2
T29590 0.073501s     + copy_data() {
T29590 0.073510s       + __archive_read_get_extract() {} = ? // #2
T29590 0.073547s       + archive_read_data_block() {
T29590 0.073557s         + _archive_read_data_block() {
T29590 0.073580s           + __archive_check_magic() {} = 0 // #55
T29590 0.073619s           + archive_read_format_tar_read_data() {
T29590 0.073627s             + __archive_read_ahead() {
T29590 0.073650s               + __archive_read_filter_ahead() {
T29590 0.073674s                 - min=1 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.073697s                 } // __archive_read_filter_ahead() #44
T29590 0.073734s               } // __archive_read_ahead() #12
T29590 0.073758s             } = 0 // archive_read_format_tar_read_data() #1
T29590 0.073781s           } = 0 // _archive_read_data_block() #1
T29590 0.073804s         } = 0 // archive_read_data_block() #1
T29590 0.073827s       + archive_write_data_block() {
T29590 0.073836s         + _archive_write_disk_data_block() {
T29590 0.073860s           + __archive_check_magic() {} = 0 // #56
T29590 0.073897s           + write_data_block() {} = ? // #1
T29590 0.074052s           } = ? // _archive_write_disk_data_block() #1
T29590 0.074076s         } = ? // archive_write_data_block() #1
T29590 0.074099s       + archive_read_data_block() {
T29590 0.074108s         + _archive_read_data_block() {
T29590 0.074131s           + __archive_check_magic() {} = 0 // #57
T29590 0.074169s           + archive_read_format_tar_read_data() {
T29590 0.074178s             + __archive_read_consume() {
T29590 0.074202s               + __archive_read_filter_consume() {
T29590 0.074225s                 + advance_file_pointer() {} = ? // #6
T29590 0.074262s                 } = ? // __archive_read_filter_consume() #6
T29590 0.074286s               } = ? // __archive_read_consume() #2
T29590 0.074309s             + __archive_read_ahead() {
T29590 0.074318s               + __archive_read_filter_ahead() {
T29590 0.074342s                 - min=1 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we've used up the client data, get more.; call(filter->read);
T29590 0.074375s                 + gzip_filter_read() {
T29590 0.074399s                   + __archive_read_filter_ahead() {
T29590 0.074422s                     - min=1 filter->avail=0 filter->buffer_size=14 filter->client_total=262144; Come here if we can satisfy from client buffer.
T29590 0.074446s                     } // __archive_read_filter_ahead() #46
T29590 0.074659s                   + __archive_read_filter_consume() {
T29590 0.074669s                     + advance_file_pointer() {} = ? // #7
T29590 0.074708s                     } = ? // __archive_read_filter_consume() #7
T29590 0.074731s                   } = ? // gzip_filter_read() #2
T29590 0.074755s                 - bytes_read=65536
T29590 0.074778s                 - min=1 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.074787s                 } // __archive_read_filter_ahead() #46
T29590 0.074825s               } // __archive_read_ahead() #13
T29590 0.074849s             } = 0 // archive_read_format_tar_read_data() #2
T29590 0.074873s           } = 0 // _archive_read_data_block() #2
T29590 0.074896s         } = 0 // archive_read_data_block() #2
T29590 0.074920s       + archive_write_data_block() {
T29590 0.074928s         + _archive_write_disk_data_block() {
T29590 0.074951s           + __archive_check_magic() {} = 0 // #58
T29590 0.074989s           + write_data_block() {} = ? // #2
T29590 0.075151s           } = ? // _archive_write_disk_data_block() #2
T29590 0.075175s         } = ? // archive_write_data_block() #2
...
T29590 0.087642s       + archive_read_data_block() {
T29590 0.087650s         + _archive_read_data_block() {
T29590 0.087674s           + __archive_check_magic() {} = 0 // #81
T29590 0.087712s           + archive_read_format_tar_read_data() {
T29590 0.087721s             + __archive_read_consume() {
T29590 0.087744s               + __archive_read_filter_consume() {
T29590 0.087768s                 + advance_file_pointer() {} = ? // #30
T29590 0.087806s                 } = ? // __archive_read_filter_consume() #30
T29590 0.087829s               } = ? // __archive_read_consume() #14
T29590 0.087852s             + __archive_read_ahead() {
T29590 0.087861s               + __archive_read_filter_ahead() {
T29590 0.087884s                 - min=1 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we've used up the client data, get more.; call(filter->read);
T29590 0.087908s -->             + gzip_filter_read() {
T29590 0.087932s                   + __archive_read_filter_ahead() {
T29590 0.087955s                     - min=1 filter->avail=0 filter->buffer_size=14 filter->client_total=262144; Come here if we can satisfy from client buffer.
T29590 0.087979s                     } // __archive_read_filter_ahead() #70
T29590 0.088178s                   + __archive_read_filter_consume() {
T29590 0.088188s                     + advance_file_pointer() {} = ? // #31
T29590 0.088227s                     } = ? // __archive_read_filter_consume() #31
T29590 0.088250s                   + __archive_read_filter_ahead() {
T29590 0.088259s                     - min=1 filter->avail=0 filter->buffer_size=14 filter->client_total=262144; Come here if we've used up the client data, get more.; call(filter->read);
T29590 0.088281s                     + client_read_proxy() {
T29590 0.088338s -->                   - libarchive_read_callback(0=id) {} = 262,144=bytes_available
T29590 0.088378s                       } = ? // client_read_proxy() #5
T29590 0.088401s                     - bytes_read=262144
T29590 0.088425s                     - min=1 filter->avail=0 filter->buffer_size=14 filter->client_total=262144; Come here if we can satisfy from client buffer.
T29590 0.088434s                     } // __archive_read_filter_ahead() #71
T29590 0.088496s                   + __archive_read_filter_consume() {
T29590 0.088505s                     + advance_file_pointer() {} = ? // #32
T29590 0.088544s                     } = ? // __archive_read_filter_consume() #32
T29590 0.088567s                   } = ? // gzip_filter_read() #14
T29590 0.088590s                 - bytes_read=65536
T29590 0.088613s                 - min=1 filter->avail=0 filter->buffer_size=0 filter->client_total=65536; Come here if we can satisfy from client buffer.
T29590 0.088622s                 } // __archive_read_filter_ahead() #71
T29590 0.088660s               } // __archive_read_ahead() #25
T29590 0.088684s             } = 0 // archive_read_format_tar_read_data() #14
T29590 0.088708s           } = 0 // _archive_read_data_block() #14
T29590 0.088731s         } = 0 // archive_read_data_block() #14
T29590 0.088754s       + archive_write_data_block() {
T29590 0.088763s         + _archive_write_disk_data_block() {
T29590 0.088786s           + __archive_check_magic() {} = 0 // #82
T29590 0.088824s           + write_data_block() {} = ? // #14
T29590 0.088982s           } = ? // _archive_write_disk_data_block() #14
T29590 0.089005s         } = ? // archive_write_data_block() #14
...

Manually instrument further

  • Todo.
#include <string.h>
#include <stdlib.h>
#include <locale.h>
#include <unistd.h>
#include <fcntl.h>
// libarchive
#include <archive.h>
#include <archive_entry.h>
#define CAST(TYPE, PTR) ((TYPE)(uintptr_t)(PTR))
#define BUFF_ARCHIVE_LEN (2 * 1024 * 1024)
#define ID_MAX (2)
int id_buff_archive_used[ID_MAX];
char id_buff_archive[ID_MAX][BUFF_ARCHIVE_LEN];
char * id_chunk_addr[ID_MAX];
int id_chunk_left[ID_MAX];
int id_chunk_size[ID_MAX];
int id_chunk_num [ID_MAX];
int id_chunk_todo[ID_MAX];
struct archive * id_archive[ID_MAX];
struct archive_entry * id_entry[ID_MAX] = {NULL};
la_ssize_t libarchive_read_callback(struct archive* a, void* client_data, const void** chunk)
{
int id = CAST(int, client_data);
*chunk = id_chunk_addr[id];
la_ssize_t bytes_available = id_chunk_left[id] < id_chunk_size[id] ? id_chunk_left[id] : id_chunk_size[id];
if ((id_chunk_num[id] < 3) && (id_chunk_left[id] > 123)) { // special business logic to cause archive_read_open() to call this callback multiple times
bytes_available = 7;
}
id_chunk_left[id] -= bytes_available;
id_chunk_addr[id] += bytes_available;
id_chunk_num [id] ++;
printf("%d=id - libarchive_read_callback() {} = %'ld=bytes_available\n", id, bytes_available);
cwrap_log("- libarchive_read_callback(%d=id) {} = %'ld=bytes_available", id, bytes_available);
return bytes_available;
}
int main(int argc, char** argv) {
setlocale(LC_NUMERIC, "");
if(argc < 3) {
printf("%s {-r | -w} file[s]\n", argv[0]);
return 1;
}
char * mode = *++argv; // "-r" or "-w"
char ** filenames = ++argv;
int id_max = argc - 2;
// archive entry/file variables
struct stat st;
char buff_entry[2 * 1024 * 1024];
int fd;
if(0 == strcmp("-r", mode) /* read mode */) {
printf(" - /* for each archive file (%d): read off disk, and archive_read_open() */\n", id_max);
for (int id = 0; id < id_max; id ++) {
printf("%d=id - attempting to open: %s\n", id, filenames[id]);
int fd = open(filenames[id], O_RDONLY);
id_buff_archive_used[id] = read(fd, &id_buff_archive[id][0], sizeof(id_buff_archive[id]));
printf("%d=id - read %'d bytes into id_buff_archive[%d]\n", id, id_buff_archive_used[id], id);
close(fd);
printf("%d=id - archive_read_new() {}\n", id);
id_archive[id] = archive_read_new();
archive_read_support_format_all(id_archive[id]);
archive_read_support_filter_all(id_archive[id]);
#ifdef FROM_STREAM
id_chunk_addr[id] = &id_buff_archive[id][0];
id_chunk_left[id] = id_buff_archive_used[id];
id_chunk_size[id] = 256 * 1024;
id_chunk_num [id] = 0;
id_chunk_todo[id] = 1;
printf("%d=id - archive_read_open() {\n", id);
int res = archive_read_open(id_archive[id], CAST(void *, id), NULL, (archive_read_callback*)libarchive_read_callback, NULL);
printf("%d=id } = %s\n", id,
res == ARCHIVE_OK ? "ARCHIVE_OK" :
res == ARCHIVE_EOF ? "ARCHIVE_EOF" : "ERROR: UNKNOWN");
#else
std::cout << "calling archive_read_open_memory.." << std::endl;
int res = archive_read_open_memory(id_archive[id], &id_buff_archive[id][0], id_buff_archive_used[id]);
#endif
}
int flags = 0;
flags = ARCHIVE_EXTRACT_TIME;
flags |= ARCHIVE_EXTRACT_PERM;
flags |= ARCHIVE_EXTRACT_ACL;
flags |= ARCHIVE_EXTRACT_FFLAGS;
int filenames_unpacked = 0;
while (filenames_unpacked < id_max) {
printf(" - /* for each archive file: archive_read_next_header() and archive_read_extract() */\n");
for (int id = 0; id < id_max; id ++) {
if (id_chunk_todo[id]) {
int res = archive_read_next_header(id_archive[id], &(id_entry[id]));
printf("%d=id - archive_read_next_header() {} = %s\n", id,
res == ARCHIVE_OK ? "ARCHIVE_OK" :
res == ARCHIVE_EOF ? "ARCHIVE_EOF" : "ERROR: UNKNOWN");
if (ARCHIVE_EOF == res) {
id_chunk_todo[id] = 0;
filenames_unpacked ++;
}
else {
printf("%d=id - archive_entry_pathname(entry) {} = %s // file to extract\n", id, archive_entry_pathname(id_entry[id]));
printf("%d=id - archive_read_extract() {\n", id);
res = archive_read_extract(id_archive[id], id_entry[id], flags);
printf("%d=id } = %s\n", id,
res == ARCHIVE_OK ? "ARCHIVE_OK" :
res == ARCHIVE_EOF ? "ARCHIVE_EOF" : "ERROR: UNKNOWN");
}
}
}
}
printf(" - /* for each archive file: archive_read_close() and archive_read_free() */\n");
for (int id = 0; id < id_max; id ++) {
printf("%d=id - archive_read_close() {}\n", id);
archive_read_close(id_archive[id]);
printf("%d=id - archive_read_free() {}\n", id);
archive_read_free(id_archive[id]);
}
}
else {
int id = 0;
id_archive[id] = archive_write_new(); // initializes a new archive
archive_write_add_filter_gzip(id_archive[id]); // gzips incoming data
archive_write_set_format_pax_restricted(id_archive[id]); // only use extensions when necessary
archive_write_set_bytes_per_block(id_archive[id], 8192);
archive_write_open_filename(id_archive[id], "test.tar.gz");
for (int id = 0; id < id_max; id ++) {
printf("%d=id - attempting to open: %s\n", id, filenames[id]);
printf("- compressing: %s\n", filenames[id]);
stat(filenames[id], &st);
if(id_entry[id] == NULL) {
printf("- archive_entry_new() {}\n");
id_entry[id] = archive_entry_new();
}
else {
printf("- archive_entry_clear() {}\n");
archive_entry_clear(id_entry[id]);
}
// the following attributes are mandatory
archive_entry_set_pathname(id_entry[id], filenames[id]);
archive_entry_set_size(id_entry[id], st.st_size);
archive_entry_set_filetype(id_entry[id], st.st_mode);
archive_write_header(id_archive[id], id_entry[id]);
fd = open(filenames[id], O_RDONLY);
int len = read(fd, buff_entry, sizeof(buff_entry));
printf("- read() = %d\n", len);
while ( len > 0 ) {
archive_write_data(id_archive[id], buff_entry, len);
len = read(fd, buff_entry, sizeof(buff_entry));
printf("- read() = %d\n", len);
}
close(fd);
}
// done looping over files
archive_entry_free(id_entry[id]);
archive_write_close(id_archive[id]);
archive_write_free(id_archive[id]);
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment