Last active
September 21, 2016 18:40
-
-
Save kevinkreiser/4cfcc5be0d4bcd8c437385983af66b34 to your computer and use it in GitHub Desktop.
Memory Mapped Tar File Contents
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* build with: g++ -std=c++11 -O2 mem_map_tar.cpp -o list_tar | |
* run as: ./list_tar some_file.tar | |
*/ | |
#include <unordered_map> | |
#include <string> | |
#include <utility> | |
#include <cmath> | |
#include <cstring> | |
#include <cstdint> | |
#include <cstdlib> | |
#include <cerrno> | |
#include <stdexcept> | |
#include <iostream> | |
#include <sys/mman.h> | |
#include <sys/stat.h> | |
#include <fcntl.h> | |
#include <unistd.h> | |
/**
 * Owner of a shared, read/write memory mapping of a file, exposed as an array
 * of `count` elements of type T.
 *
 * The mapping is released when the object is destroyed. Objects are movable but
 * not copyable; a moved-from object is left empty (get() == nullptr, size() == 0).
 */
template <class T>
class mem_map_t {
 public:
  //non-copyable: exactly one object may own (and eventually unmap) a mapping
  mem_map_t(const mem_map_t&) = delete;
  mem_map_t& operator=(const mem_map_t&) = delete;

  //move construction takes over the mapping and empties the source so that the
  //two destructors never unmap the same region
  mem_map_t(mem_map_t&& other) noexcept
      : ptr(other.ptr), count(other.count), file_name(std::move(other.file_name)) {
    other.ptr = nullptr;
    other.count = 0;
    other.file_name.clear();
  }

  //move assignment drops whatever this object held before taking over the
  //mapping of the source
  mem_map_t& operator=(mem_map_t&& other) noexcept {
    if(this != &other) {
      release();
      ptr = other.ptr;
      count = other.count;
      file_name = std::move(other.file_name);
      other.ptr = nullptr;
      other.count = 0;
      other.file_name.clear();
    }
    return *this;
  }

  //default constructable to nothing loaded
  mem_map_t(): ptr(nullptr), count(0), file_name("") { }

  //construct with file, throws std::runtime_error if the file cannot be mapped
  mem_map_t(const std::string& file_name, size_t size): ptr(nullptr), count(0), file_name("") {
    map(file_name, size);
  }

  //unmap when done, a destructor must not throw so failures are ignored here
  ~mem_map_t() {
    release();
  }

  /**
   * Reset to another file or another size.
   *
   * Any existing mapping is dropped first. A new_count of 0 leaves the object
   * empty. On failure the object is left empty and no descriptor or mapping
   * is leaked.
   *
   * @param new_file_name  path of the file to map, opened read/write
   * @param new_count      number of T elements to map from the start of the file
   * @throws std::runtime_error if open, mmap or close fails (or if dropping the
   *         previous mapping fails)
   */
  void map(const std::string& new_file_name, size_t new_count) {
    //just in case there was already something
    unmap();
    //has to be something to map
    if(new_count == 0)
      return;
    const int fd = open(new_file_name.c_str(), O_RDWR, 0);
    if(fd == -1)
      throw std::runtime_error(new_file_name + "(open): " + strerror(errno));
    const size_t bytes = new_count * sizeof(T);
    void* mapped = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if(mapped == MAP_FAILED) {
      //remember why mmap failed, close may overwrite errno
      const int err = errno;
      close(fd);
      throw std::runtime_error(new_file_name + "(mmap): " + strerror(err));
    }
    //the mapping stays valid after the descriptor is closed
    if(close(fd) == -1) {
      const int err = errno;
      munmap(mapped, bytes);
      throw std::runtime_error(new_file_name + "(close): " + strerror(err));
    }
    //only record the mapping once everything has succeeded
    ptr = mapped;
    count = new_count;
    file_name = new_file_name;
  }

  /**
   * Drop the map, does nothing if nothing is mapped.
   *
   * @throws std::runtime_error if munmap fails, in which case the object still
   *         refers to the mapping
   */
  void unmap() {
    //has to be something to unmap
    if(ptr == nullptr)
      return;
    if(munmap(ptr, count * sizeof(T)) == -1)
      throw std::runtime_error(file_name + "(munmap): " + strerror(errno));
    //clear
    ptr = nullptr;
    count = 0;
    file_name.clear();
  }

  //first mapped element or nullptr when nothing is mapped
  T* get() const {
    return static_cast<T*>(ptr);
  }

  operator T*() {
    return static_cast<T*>(ptr);
  }

  operator const T*() const {
    return static_cast<const T*>(ptr);
  }

  //true when something is mapped
  operator bool() const {
    return ptr != nullptr;
  }

  //number of T elements mapped
  size_t size() const {
    return count;
  }

 protected:
  //best effort unmap that never throws, used where throwing is not an option
  void release() noexcept {
    if(ptr != nullptr)
      munmap(ptr, count * sizeof(T));
    ptr = nullptr;
    count = 0;
    file_name.clear();
  }

  void* ptr;              //start of the mapping, nullptr when empty
  size_t count;           //number of T elements in the mapping
  std::string file_name;  //file backing the mapping, used in error messages
};
struct tar_t { | |
struct header_t { | |
char name[100]; char mode[8]; char uid[8]; char gid[8]; char size[12]; char mtime[12]; char chksum[8]; | |
char typeflag; char linkname[100]; char magic[6]; char version[2]; char uname[32]; char gname[32]; | |
char devmajor[8]; char devminor[8]; char prefix[155]; char padding[12]; | |
static uint64_t octal_to_int(const char* data, size_t size = 12) { | |
const unsigned char* ptr = (const unsigned char*) data + size; | |
uint64_t sum = 0; | |
uint64_t multiplier = 1; | |
//Skip everything after the last NUL/space character | |
//In some TAR archives the size field has non-trailing NULs/spaces, so this is neccessary | |
const unsigned char* check = ptr; //This is used to check where the last NUL/space char is | |
for (; check >= (unsigned char*) data; check--) | |
if ((*check) == 0 || (*check) == ' ') | |
ptr = check - 1; | |
for (; ptr >= (unsigned char*) data; ptr--) { | |
sum += ((*ptr) - 48) * multiplier; | |
multiplier *= 8; | |
} | |
return sum; | |
} | |
bool is_ustar() const { return (memcmp("ustar", magic, 5) == 0); } | |
size_t get_file_size() const { return octal_to_int(size); } | |
bool blank() const { constexpr header_t BLANK{}; return !memcmp(this, &BLANK, sizeof(header_t)); } | |
bool verify() const { | |
//make a copy and blank the checksum | |
header_t temp = *this; memset(temp.chksum, ' ', 8); int64_t usum = 0, sum = 0; | |
//compute the checksum | |
for(int i = 0; i < sizeof(header_t); i++) { | |
usum += ((unsigned char*)&temp)[i]; | |
sum += ((char*)&temp)[i]; | |
} | |
//check if its right | |
uint64_t rsum = octal_to_int(chksum); | |
return rsum == usum || rsum == sum; | |
} | |
}; | |
tar_t(const std::string& tar_file, bool regular_files_only = true):tar_file(tar_file),corrupt_blocks(0) { | |
//map the file | |
struct stat s; | |
if(stat(tar_file.c_str(), &s) || s.st_size == 0 || (s.st_size % sizeof(header_t)) != 0) | |
return; | |
try { mm.map(tar_file, s.st_size); } catch (...) { return; } | |
//rip through the tar to see whats in it noting that most tars end with 2 empty blocks | |
//but we can concatenate tars and get empty blocks in between so we'll just be pretty | |
//lax about it and we'll count the ones we cant make sense of | |
constexpr header_t BLANK{}; | |
const char* position = mm.get(); | |
while(position < mm.get() + mm.size()) { | |
//get the header for this file | |
const header_t* h = static_cast<const header_t*>(static_cast<const void*>(position)); | |
position += sizeof(header_t); | |
//if it doesnt checkout ignore it and move on one block at a time | |
if(!h->verify()) { corrupt_blocks += !h->blank(); continue; } | |
auto size = h->get_file_size(); | |
//do we record entry file or not | |
if(!regular_files_only || (h->typeflag == '0' || h->typeflag == '\0')) | |
contents.emplace(std::piecewise_construct, std::forward_as_tuple(std::string{h->name}), std::forward_as_tuple(position, size)); | |
//every entry's data is rounded to the nearst header_t sized "block" | |
size_t blocks = std::ceil(static_cast<double>(size) / sizeof(header_t)); | |
position += blocks * sizeof(header_t); | |
} | |
} | |
std::string tar_file; | |
mem_map_t<char> mm; | |
using entry_name_t = std::string; | |
using entry_location_t = std::pair<const char*, size_t>; | |
std::unordered_map<entry_name_t, entry_location_t> contents; | |
size_t corrupt_blocks; | |
}; | |
int main(int argc, char *argv[]) { | |
if(argc > 1) { | |
//try to load it | |
tar_t tar(argv[1]); | |
//look at whats in it | |
for(const auto& e : tar.contents) | |
std::cout << e.first << "\t" << e.second.second << std::endl; | |
//was anything bad | |
if(tar.corrupt_blocks) | |
std::cerr << tar.corrupt_blocks << " corrupt blocks within archive" << std::endl; | |
//did it load or not a tar | |
return tar.mm.get() == nullptr ? EXIT_FAILURE : EXIT_SUCCESS; | |
} | |
return EXIT_FAILURE; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment