Skip to content

Instantly share code, notes, and snippets.

@JesseKPhillips
Last active November 30, 2017 19:55
Show Gist options
  • Save JesseKPhillips/6051600 to your computer and use it in GitHub Desktop.
Save JesseKPhillips/6051600 to your computer and use it in GitHub Desktop.
This gives a very basic parsing of osm.pbf files.
/**
* Explanation comments taken from:
* http://wiki.openstreetmap.org/wiki/PBF_Format#Design
*
* This gives a very basic parsing of osm.pbf files. It serves several
* purposes:
*
* - Read PBF files
* - Learn the file layout
* - Verify the bytes match
* - Test the Protocol Buffer compiler for D
*
* It uses a bremen file specified in the documentation (not tested with the
* same version). This should allow others to examine the data and specific
* bytes (through modification).
*
* This code does not provide any high level processing or logic. It only shows
* the low-level access to the data.
*
* Definitions:
* - Delta-encoding: consecutive nodes in a way or relation tend to have
* numerically close IDs, so each ID can be stored as the difference from its
* predecessor, which yields small integers. (I.e., instead of encoding
* x_1, x_2, x_3, ..., the encoding is x_1, x_2-x_1, x_3-x_2, ...)
*
* License: Boost 1.0
*/
import osmpbf;
import osmpbffile;
import std.algorithm;
import std.exception;
import std.file;
import std.range;
import std.stdio : writeln, writefln, File;
import std.string;
import std.typecons;
import std.zlib;
/**
 * Decodes a 4-byte big-endian (network byte order) integer into a native int.
 *
 * Params:
 *   num = exactly four bytes, most significant byte first
 *
 * Returns: the decoded value in the byte order of the host
 *
 * Throws: Exception if num does not hold exactly four bytes
 */
int toNative(ubyte[] num) {
    import std.bitmanip : bigEndianToNative;
    import std.exception : enforce;

    enforce(num.length == 4, "toNative expects exactly 4 bytes");

    // bigEndianToNative takes a fixed-size array, so copy the slice into one.
    ubyte[4] raw = num[0 .. 4];
    return bigEndianToNative!int(raw);
}
/**
 * Walks every fileblock of "bremen-latest.osm.pbf" and prints a low-level
 * summary of each BlobHeader, Blob and decoded OSMHeader / OSMData block.
 *
 * The number of header blocks, data blocks and consumed bytes is printed on
 * exit, including when an exception ends the run early.
 *
 * Throws: Exception when the file is truncated, a size field is outside the
 * limits of the format, or a data block has no string table.
 */
void main() {
    // A file contains a header followed by a sequence of fileblocks.
    auto datastream = FileRange(File("bremen-latest.osm.pbf"));

    size_t headerCount, dataCount, bytesCount;
    scope(exit) {
        writeln("Headers: ", headerCount);
        writeln("Datas: ", dataCount);
        writeln("Bytes: ", bytesCount);
    }

    // In order to robustly detect illegal or corrupt files, the maximum
    // size of BlobHeader and Blob messages is limited. The length of the
    // BlobHeader *should* be less than 32 KiB (32*1024 bytes) and *must*
    // be less than 64 KiB. The uncompressed length of a Blob *should* be
    // less than 16 MiB (16*1024*1024 bytes) and *must* be less than 32
    // MiB.
    enum maxBlobHeaderSize = 64 * 1024;
    enum maxBlobSize = 32 * 1024 * 1024;

    while(!datastream.empty) {
        // Fewer than four bytes cannot hold a length prefix; the data is
        // at fault here, so report it as an exception rather than an assert.
        enforce(datastream.bufferLength >= 4,
                "Trailing bytes: not enough data left for a BlobHeader length");

        // The format is a repeating sequence of:
        // * int4: length of the BlobHeader message in network byte order
        const headerSize = toNative(datastream[0..4]);
        datastream.popFrontN(4);
        bytesCount += 4;
        // A negative length would otherwise turn into a huge unsigned
        // slice bound below.
        enforce(headerSize >= 0 && headerSize < maxBlobHeaderSize,
                "BlobHeader length is outside the limits of the format");

        // serialized BlobHeader message
        auto osmData = datastream[0..headerSize];
        datastream.popFrontN(headerSize);
        bytesCount += headerSize;
        auto header = BlobHeader(osmData);

        // contains the type of data in this block message.
        writeln("Blob Type: ", header.type);
        // index may include metadata about the following blob
        writeln("Has Index Data: ", !header.indexdata.isNull);
        // contains the serialized size of the subsequent Blob message
        writeln("Blob Size: ", header.datasize);
        writeln();
        enforce(header.datasize >= 0, "Blob size must not be negative");

        // * serialized Blob message (size is given in the header)
        // Blob is currently used to store an arbitrary blob of data, either
        // uncompressed or in zlib/deflate compressed format.
        osmData = datastream[0..header.datasize];
        datastream.popFrontN(header.datasize);
        bytesCount += header.datasize;
        auto blob = Blob(osmData);

        // No compression
        writeln("Has raw data: ", !blob.raw.isNull);
        // Only set when compressed, to the uncompressed size
        writeln("Blob raw_size: ", blob.raw_size);
        writeln("Has zlib: ", !blob.zlib_data.isNull);
        writeln();

        // Obtain Blob data: See osmformat.proto
        if(!blob.zlib_data.isNull)
            osmData = cast(ubyte[]) uncompress(blob.zlib_data);
        else if(!blob.raw.isNull)
            osmData = blob.raw;
        else {
            // Without this branch osmData would still hold the serialized
            // Blob itself and be misparsed as a header or data block.
            writeln("Skipping blob: neither raw nor zlib data present");
            writeln();
            continue;
        }
        enforce(osmData.length < maxBlobSize,
                "Uncompressed Blob exceeds the limits of the format");

        // There are currently two fileblock types for OSM data. These textual
        // type strings are stored in the type field in the BlobHeader
        //
        // The design lets other software extend the format to include
        // fileblocks of additional types for their own purposes. Parsers
        // should ignore and skip fileblock types that they do not recognize.
        if(header.type == "OSMHeader") {
            headerCount++;
            // Contains a serialized HeaderBlock message (See osmformat.proto).
            // Every file must have one of these blocks before the first
            // 'OSMData' block.
            auto osmHeader = HeaderBlock(osmData);
            writeln("OSM bbox ", osmHeader.bbox);
            writeln("OSM author ", osmHeader.writingprogram);
            writeln("OSM required ", osmHeader.required_features);
            if(!osmHeader.source.isNull)
                writeln("OSM source ", osmHeader.source);
        } else if(header.type == "OSMData") {
            dataCount++;
            // Contains a serialized PrimitiveBlock message. (See
            // osmformat.proto). These contain the entities.
            auto osmDataBlock = PrimitiveBlock(osmData);
            writeln("OSM lat_off ", osmDataBlock.lat_offset);
            writeln("OSM lon_off ", osmDataBlock.lon_offset);
            writeln("OSM date ", !osmDataBlock.date_granularity.isNull);

            enforce(!osmDataBlock.stringtable.isNull);
            auto stringTable = osmDataBlock.stringtable.s.get.
                map!(x=>cast(char[])x);
            writeln("OSM String: ", stringTable.take(5), "...");
            writeln("OSM String Length: ", stringTable.length);

            // Only the first primitive group is sampled; a block without
            // any group has nothing to show.
            if(!osmDataBlock.primitivegroup.isNull
                    && !osmDataBlock.primitivegroup.get.empty) {
                auto group = osmDataBlock.primitivegroup.front;

                // Nodes can be encoded one of two ways, as a Node and a
                // special dense format.
                if(!group.nodes.isNull) {
                    Node[] nodes = group.nodes;
                    if(!nodes.empty && !nodes.front.keys.isNull)
                        writefln("Node Tags: %s...",
                            zip(nodes.front.keys.get, nodes.front.vals.get).
                            map!(x=>tuple(stringTable[x[0]], stringTable[x[1]])).
                            map!(x=> x[0] ~"="~ x[1]).take(2));
                }

                if(!group.dense.isNull) {
                    // Keys and values for all nodes are encoded as a single
                    // array of stringid's. Each node's tags are encoded in
                    // alternating <keyid> <valid>. We use a single stringid
                    // of 0 to delimit when the tags of a node ends and the
                    // tags of the next node begin. The storage pattern is:
                    // ((<keyid> <valid>)* '0' )*
                    DenseNodes nodes = group.dense;
                    if(!nodes.keys_vals.isNull) {
                        // Only the first node is shown, so take the front of
                        // the lazy splitter instead of materialising every
                        // node's tags; an empty array yields no pieces.
                        auto perNode = nodes.keys_vals.get().splitter(0);
                        if(!perNode.empty) {
                            auto nodeTags = perNode.front.chunks(2).
                                map!(x=>tuple(stringTable[x[0]],stringTable[x[1]])).
                                map!(x=> x[0] ~"="~ x[1]).take(2);
                            if(!nodeTags.empty)
                                writefln("Node Tags: %s...", nodeTags);
                        }
                    }
                }

                if(!group.ways.isNull) {
                    Way[] ways = group.ways;
                    if(!ways.empty && !ways.front.keys.isNull)
                        writefln("Way Tags: %s...",
                            zip(ways.front.keys.get, ways.front.vals.get).
                            map!(x=>tuple(stringTable[x[0]],stringTable[x[1]])).
                            map!(x=> x[0] ~"="~ x[1]).take(2));
                }

                if(!group.relations.isNull) {
                    Relation[] relations = group.relations;
                    if(!relations.empty && !relations.front.keys.isNull)
                        writefln("Relation Tags: %s...",
                            zip(relations.front.keys.get,
                                relations.front.vals.get).
                            map!(x=>tuple(stringTable[x[0]],stringTable[x[1]])).
                            map!(x=> x[0] ~"="~ x[1]).take(2));
                }
            }

            writeln("==============");
            writeln();
        }
    }
}
/// Number of bytes requested from the file per chunk.
enum size = 2048;

/**
 * Buffered input range of bytes over a File that additionally supports
 * look-ahead slicing relative to the current position.
 *
 * Data is pulled from the file in chunks of `size` bytes. `opSlice` grows
 * the buffer as far as needed, so a slice may span many chunks.
 */
struct FileRange {
private:
    size_t index;        // offset within buff of the next unread byte
    ubyte[] buff;        // buffered bytes; buff[0 .. index] is already consumed
    File.ByChunk chunks; // supplies further data from the file
public:
    /// Creates a range over file and buffers its first chunk.
    static auto opCall(File file) {
        FileRange fr;
        fr.chunks = file.byChunk(size);
        fr.prime();
        return fr;
    }

    /// True when every buffered byte has been consumed.
    bool empty() {
        return buff.length == index;
    }

    /// Consumes one byte, refilling the buffer when it runs low.
    void popFront() {
        index++;
        // Refill once fewer than size/2 bytes remain. Written as an addition
        // because `buff.length - size/2` is unsigned and wraps around for a
        // buffer shorter than size/2, which would disable refilling.
        if(index + size/2 > buff.length && !chunks.empty) {
            buff = buff[index..$];
            index = 0;
            prime();
        }
    }

    /// The next unread byte.
    ubyte front() {
        return buff[index];
    }

    /**
     * Appends chunks to the buffer until at least total further bytes have
     * been added or the file is exhausted. At least one chunk is read
     * whenever one is available.
     */
    private void prime(size_t total = size) {
        size_t appended = 0;
        while(!chunks.empty) {
            appended += chunks.front.length;
            // `~=` copies the bytes, so buff does not alias the chunk buffer.
            buff ~= chunks.front;
            chunks.popFront();
            if(appended >= total)
                break;
        }
    }

    /// Number of bytes currently buffered and not yet consumed.
    size_t bufferLength() {
        return buff.length - index;
    }

    /**
     * Returns the bytes [x, y) counted from the current position without
     * consuming them, reading more of the file when necessary.
     *
     * Throws: Exception if the bounds are reversed or the file ends before
     * byte y.
     */
    ubyte[] opSlice(size_t x, size_t y) {
        import std.exception : enforce;

        enforce(x <= y, "FileRange slice bounds are reversed");
        enforce(y <= size_t.max - index, "FileRange slice bound is too large");

        const wanted = index + y;
        // Read more only when the request really exceeds the buffer; a
        // request that ends exactly at the buffer end needs no extra chunk.
        if(wanted > buff.length)
            prime(wanted - buff.length);
        enforce(wanted <= buff.length,
                "Unexpected end of file: more bytes requested than remain");
        return buff[index + x .. wanted];
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment