Skip to content

Instantly share code, notes, and snippets.

@kevinkreiser
Last active September 21, 2016 18:40
Show Gist options
  • Save kevinkreiser/4cfcc5be0d4bcd8c437385983af66b34 to your computer and use it in GitHub Desktop.
Save kevinkreiser/4cfcc5be0d4bcd8c437385983af66b34 to your computer and use it in GitHub Desktop.
Memory Mapped Tar File Contents
/**
* build with: g++ -std=c++11 -O2 mem_map_tar.cpp -o list_tar
* run as: ./list_tar some_file.tar
*/
#include <unordered_map>
#include <string>
#include <utility>
#include <cmath>
#include <cstring>
#include <cstdint>
#include <cstdlib>
#include <cerrno>
#include <stdexcept>
#include <iostream>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
/**
* Owns a shared, read/write memory mapping of a file and exposes it as an array of T.
*
* The object is move-only. Moving transfers the mapping and leaves the source empty,
* so exactly one object ever unmaps a given region. The destructor never throws.
*/
template <class T>
class mem_map_t {
public:
  //movable: take over the other mapping and leave the source with nothing loaded
  mem_map_t(mem_map_t&& other) noexcept
    : ptr(other.ptr), count(other.count), file_name(std::move(other.file_name)) {
    other.forget();
  }
  mem_map_t& operator=(mem_map_t&& other) noexcept {
    if(this != &other) {
      //drop whatever we currently hold before adopting the other mapping
      release();
      ptr = other.ptr;
      count = other.count;
      file_name = std::move(other.file_name);
      other.forget();
    }
    return *this;
  }
  //non-copyable
  mem_map_t(const mem_map_t&) = delete;
  mem_map_t& operator=(const mem_map_t&) = delete;
  //default constructable to nothing loaded
  mem_map_t(): ptr(nullptr), count(0), file_name("") { }
  //construct with file, throws std::runtime_error if the file cannot be mapped
  mem_map_t(const std::string& file_name, size_t size): ptr(nullptr), count(0), file_name("") {
    map(file_name, size);
  }
  //unmap when done, errors are ignored here because a destructor must not throw
  ~mem_map_t() {
    release();
  }
  /**
  * Reset to another file or another size.
  * @param new_file_name  path of the file to map, opened read/write
  * @param new_count      number of T elements to map, 0 just drops the current mapping
  * @throws std::runtime_error if open, mmap or close fails; the object is then empty
  */
  void map(const std::string& new_file_name, size_t new_count) {
    //just in case there was already something
    unmap();
    //has to be something to map
    if(new_count == 0)
      return;
    const int fd = open(new_file_name.c_str(), O_RDWR, 0);
    if(fd == -1)
      throw std::runtime_error(new_file_name + "(open): " + strerror(errno));
    const size_t bytes = new_count * sizeof(T);
    void* mapped = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    const int mmap_errno = errno;
    //the descriptor is not needed once mmap returned, close it on success and on failure
    const int cl = close(fd);
    const int close_errno = errno;
    if(mapped == MAP_FAILED)
      throw std::runtime_error(new_file_name + "(mmap): " + strerror(mmap_errno));
    if(cl == -1) {
      //do not keep a mapping we are about to report as failed
      munmap(mapped, bytes);
      throw std::runtime_error(new_file_name + "(close): " + strerror(close_errno));
    }
    //only commit the new state once everything worked
    ptr = mapped;
    count = new_count;
    file_name = new_file_name;
  }
  /**
  * Drop the map, does nothing if nothing is mapped.
  * @throws std::runtime_error if munmap fails; the object is emptied either way
  */
  void unmap() {
    //has to be something to unmap
    if(ptr == nullptr)
      return;
    if(munmap(ptr, count * sizeof(T)) == -1) {
      const std::string message = file_name + "(munmap): " + strerror(errno);
      forget();
      throw std::runtime_error(message);
    }
    forget();
  }
  //pointer to the first mapped element or nullptr when nothing is mapped
  T* get() const {
    return static_cast<T*>(ptr);
  }
  operator T*() {
    return static_cast<T*>(ptr);
  }
  operator const T*() const {
    return static_cast<const T*>(ptr);
  }
  //true when something is mapped
  operator bool() const {
    return ptr != nullptr;
  }
  //number of mapped T elements
  size_t size() const {
    return count;
  }
protected:
  void* ptr;
  size_t count;
  std::string file_name;
private:
  //reset the bookkeeping without touching the mapping itself
  void forget() noexcept {
    ptr = nullptr;
    count = 0;
    file_name.clear();
  }
  //unmap without reporting errors, for contexts that must not throw
  void release() noexcept {
    if(ptr != nullptr)
      munmap(ptr, count * sizeof(T));
    forget();
  }
};
/**
* Index of the entries of a tar archive that is memory mapped from disk.
*
* After construction `contents` maps each recorded entry name to the location and size of
* its data inside the mapping and `corrupt_blocks` counts the non-blank blocks that could
* not be understood. If the file could not be loaded `mm` holds nothing and `contents`
* is empty. When a name occurs more than once only the first occurrence is recorded.
*/
struct tar_t {
  //one header block of the archive, every field is plain text and numbers are octal
  struct header_t {
    char name[100]; char mode[8]; char uid[8]; char gid[8]; char size[12]; char mtime[12]; char chksum[8];
    char typeflag; char linkname[100]; char magic[6]; char version[2]; char uname[32]; char gname[32];
    char devmajor[8]; char devminor[8]; char prefix[155]; char padding[12];
    /**
    * Parse an octal text field.
    * @param data  first character of the field
    * @param size  width of the field in bytes, nothing beyond it is read
    * @return the value of the characters between any leading spaces and the first
    *         NUL/space after them, 0 if there are none
    */
    static uint64_t octal_to_int(const char* data, size_t size = 12) {
      const unsigned char* field = static_cast<const unsigned char*>(static_cast<const void*>(data));
      size_t i = 0;
      //some writers pad numeric fields on the left with spaces
      while(i < size && field[i] == ' ')
        ++i;
      //the number stops at the first NUL/space, in some TAR archives the field has
      //non-trailing NULs/spaces so whatever follows them is ignored
      uint64_t sum = 0;
      for(; i < size && field[i] != 0 && field[i] != ' '; ++i)
        sum = sum * 8 + (field[i] - '0');
      return sum;
    }
    bool is_ustar() const { return (memcmp("ustar", magic, 5) == 0); }
    size_t get_file_size() const { return octal_to_int(size, sizeof(size)); }
    //the entry name, which is not NUL terminated when it fills the whole field
    std::string get_name() const {
      const void* nul = memchr(name, '\0', sizeof(name));
      const size_t length = nul ? static_cast<size_t>(static_cast<const char*>(nul) - name) : sizeof(name);
      return std::string(name, length);
    }
    //true if every byte of the block is zero
    bool blank() const { constexpr header_t BLANK{}; return !memcmp(this, &BLANK, sizeof(header_t)); }
    /**
    * Check the stored checksum against the block contents.
    * The sum is taken with the checksum field itself counted as spaces. Both the sum of
    * unsigned bytes and the sum of signed bytes are accepted.
    */
    bool verify() const {
      //make a copy and blank the checksum
      header_t temp = *this;
      memset(temp.chksum, ' ', sizeof(temp.chksum));
      //compute the checksum, signed char is spelled out so the result is the same
      //whether or not plain char is signed on this platform
      const unsigned char* bytes = static_cast<const unsigned char*>(static_cast<const void*>(&temp));
      int64_t usum = 0, sum = 0;
      for(size_t i = 0; i < sizeof(header_t); ++i) {
        usum += bytes[i];
        sum += static_cast<signed char>(bytes[i]);
      }
      //check if its right, only the checksum field itself is parsed
      const uint64_t rsum = octal_to_int(chksum, sizeof(chksum));
      return rsum == static_cast<uint64_t>(usum) || (sum >= 0 && rsum == static_cast<uint64_t>(sum));
    }
  };
  static_assert(sizeof(header_t) == 512, "a tar header must be exactly one 512 byte block");
  /**
  * Map the archive and index its entries.
  * @param tar_file            path of the archive
  * @param regular_files_only  when true only entries whose typeflag is '0' or NUL are recorded
  * Nothing is loaded if the file cannot be stat'ed, is empty, is not a whole number of
  * blocks or cannot be mapped. No exception is thrown in those cases.
  */
  tar_t(const std::string& tar_file, bool regular_files_only = true):tar_file(tar_file),corrupt_blocks(0) {
    //map the file
    struct stat s;
    if(stat(tar_file.c_str(), &s) || s.st_size <= 0 || (static_cast<size_t>(s.st_size) % sizeof(header_t)) != 0)
      return;
    try { mm.map(tar_file, static_cast<size_t>(s.st_size)); } catch (...) { return; }
    //rip through the tar to see whats in it noting that most tars end with 2 empty blocks
    //but we can concatenate tars and get empty blocks in between so we'll just be pretty
    //lax about it and we'll count the ones we cant make sense of
    const char* position = mm.get();
    const char* const end = position + mm.size();
    while(position < end) {
      //get the header for this file
      const header_t* h = static_cast<const header_t*>(static_cast<const void*>(position));
      position += sizeof(header_t);
      //if it doesnt checkout ignore it and move on one block at a time
      if(!h->verify()) { corrupt_blocks += !h->blank(); continue; }
      const size_t size = h->get_file_size();
      //an entry claiming more data than is left cannot be used, dont hand out a location
      //that reaches past the end of the mapping
      if(size > static_cast<size_t>(end - position)) { ++corrupt_blocks; break; }
      //do we record entry file or not
      if(!regular_files_only || h->typeflag == '0' || h->typeflag == '\0')
        contents.emplace(h->get_name(), entry_location_t(position, size));
      //every entry's data is rounded up to the nearest header_t sized "block"
      const size_t blocks = size / sizeof(header_t) + (size % sizeof(header_t) != 0 ? 1 : 0);
      position += blocks * sizeof(header_t);
    }
  }
  std::string tar_file;
  mem_map_t<char> mm;
  using entry_name_t = std::string;
  //where an entry's data starts inside the mapping and how many bytes it has
  using entry_location_t = std::pair<const char*, size_t>;
  std::unordered_map<entry_name_t, entry_location_t> contents;
  size_t corrupt_blocks;
};
/**
* List the entries of the tar archive given as the first argument.
* Prints one "name<TAB>size" line per recorded entry to stdout and reports the number
* of corrupt blocks, if any, to stderr.
* @return EXIT_SUCCESS if the archive was loaded, EXIT_FAILURE if no archive was given
*         or it could not be loaded
*/
int main(int argc, char *argv[]) {
  //need something to work on
  if(argc < 2) {
    std::cerr << "usage: " << (argc > 0 ? argv[0] : "list_tar") << " some_file.tar" << std::endl;
    return EXIT_FAILURE;
  }
  //try to load it
  tar_t tar(argv[1]);
  //look at whats in it
  for(const auto& entry : tar.contents)
    std::cout << entry.first << "\t" << entry.second.second << '\n';
  std::cout.flush();
  //was anything bad
  if(tar.corrupt_blocks)
    std::cerr << tar.corrupt_blocks << " corrupt blocks within archive" << std::endl;
  //did it load or not a tar, a loaded archive always has a non-empty mapping
  if(tar.mm.get() == nullptr || tar.mm.size() == 0) {
    std::cerr << argv[1] << ": could not be loaded as a tar archive" << std::endl;
    return EXIT_FAILURE;
  }
  return EXIT_SUCCESS;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment