sasairc/uri.md

## uri.md

      
    Raw
  

              uri.md
            
          
    C/C++でパーセントエンコーディング・デコーディングをする

この記事は車輪の再発明 Advent Calendar 2014の記事です。
「それ、 C/C++ だとワンライナーじゃできないよ」
コードものですが、単純でかつ簡素なのでアドベント向きだと思い書いてみました。
URI 一般的構文

言うまでもなく、 %XX のようにエスケープされた文字のことです。お馴染みですね。
それをC/C++で書くとどうなるか、そんなお話です。
参考

Uniform Resource Identifier (URI): 一般的構文: https://triple-underscore.github.io/RFC3986-ja.html
UTF-8 - Wikipedia / エンコード体系: https://ja.wikipedia.org/wiki/UTF-8#%E3%82%A8%E3%83%B3%E3%82%B3%E3%83%BC%E3%83%89%E4%BD%93%E7%B3%BB
source code

example.c

呼び出し側。
/*
 * example.c
 */
#include <stdio.h>          /* fprintf(), NULL */
#include <stdlib.h>         /* free() */
#include <benly/uri.h>      /* encode_uri(), decode_uri */
#include <benly/column.h>   /* tablize() */

/*
 * Round-trip demo: percent-encode a UTF-8 string, decode it again, and
 * print both conversions as a two-row table.
 *
 * Returns EXIT_SUCCESS when both conversions succeed, EXIT_FAILURE otherwise.
 */
int main(void)
{
    char*   str     = "やすな\0",
        *   sym     = "=>\0",
        *   enc     = NULL,
        *   dec     = NULL;

    int     status  = EXIT_FAILURE;

    /*
     * Encode, then decode the encoded text.
     * Both functions return 0 on success; only build and print the table
     * when both succeeded, so a NULL result is never handed to the next
     * step or to tablize().
     */
    if (encode_uri(str, &enc) == 0 && decode_uri(enc, &dec) == 0) {
        /* One NULL-terminated row per conversion: input, arrow, output */
        char*   encoded[]   = {
                    str, sym, enc, NULL,
                },
            *   decoded[]   = {
                    enc, sym, dec, NULL,
                },
            **  result[]    = {
                    encoded, decoded, NULL,
                };

        /* Format the rows as a table and write them to standard output */
        tablize(result);
        status = EXIT_SUCCESS;
    }

    /* Release the buffers allocated by the library; free(NULL) is a no-op */
    free(enc);
    free(dec);

    return status;
}
uri.c

ライブラリ側。
/*
 * uri.c
 */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>

#ifndef DEFAULT_BUFSIZE
#define DEFAULT_BUFSIZE 1024
/* DEFAULT_BUFSIZE */
#endif

/*
 * encode_uri
 *
 * Percent-encodes the NUL-terminated byte string src into a newly
 * allocated, NUL-terminated string stored in *dest.
 *
 * Copied verbatim: ASCII letters and digits, the other RFC 3986
 * "unreserved" characters '-', '.', '_' and '~', plus '/' and ':' so that
 * the structure of a whole URL survives. Every other byte (including
 * space, '%' and all non-ASCII bytes) is written as "%XX" with two
 * uppercase hexadecimal digits.
 *
 * Returns 0 on success. On failure *dest is NULL (when dest itself is
 * non-NULL), strerror(errno) is printed to stderr, and the result is:
 *   -1  the output buffer could not be allocated
 *   -3  src or dest is NULL
 *
 * The caller owns *dest and must release it with free().
 */
int encode_uri(char* const src, char** dest)
{
    static const char       hex[]   = "0123456789ABCDEF";

    int                     status  = 0;

    size_t                  len     = 0,
                            current = 0;    /* write offset into out */

    /* Walk src as unsigned bytes so values >= 0x80 never go negative */
    const unsigned char*    p       = (const unsigned char*)src;

    char*                   out     = NULL;

    if (dest == NULL) {
        errno = EINVAL;
        status = -3; goto ERR;
    }
    *dest = NULL;
    if (src == NULL) {
        errno = EINVAL;
        status = -3; goto ERR;
    }

    /*
     * Worst case every byte becomes "%XX", so 3 * len + 1 bytes always
     * suffice. Allocating once up front removes the need to grow the
     * buffer (and the leak/termination hazards that come with it).
     */
    len = strlen(src);
    if (len > ((size_t)-1 - 1) / 3) {
        errno = ENOMEM;
        status = -1; goto ERR;
    }
    /* The cast keeps the file compilable as C++ as well as C */
    if ((out = (char*)malloc(len * 3 + 1)) == NULL) {
        status = -1; goto ERR;
    }

    for (; *p != '\0'; p++) {
        if (
                (*p >= '0' && *p <= '9')    ||  /* [0-9] */
                (*p >= 'A' && *p <= 'Z')    ||  /* [A-Z] */
                (*p >= 'a' && *p <= 'z')    ||  /* [a-z] */
                (*p == '-')                 ||
                (*p == '.')                 ||
                (*p == '_')                 ||
                (*p == '~')                 ||
                (*p == '/')                 ||
                (*p == ':')
           ) {
            /* Safe character: copy as-is */
            out[current++] = (char)*p;
        } else {
            /* Everything else: '%' followed by the high and low nibble */
            out[current++] = '%';
            out[current++] = hex[*p >> 4];
            out[current++] = hex[*p & 0x0F];
        }
    }
    /* Terminate explicitly instead of relying on a pre-zeroed buffer */
    out[current] = '\0';

    *dest = out;

    return 0;

ERR:
    /* Error reporting */
    fprintf(stderr, "%s\n",
            strerror(errno));

    return status;
}

/*
 * Value of one hexadecimal digit (either case), or -1 if c is not one.
 */
static int uri_hex_value(unsigned char c)
{
    if (c >= '0' && c <= '9') {
        return c - '0';
    }
    if (c >= 'A' && c <= 'F') {
        return c - 'A' + 10;
    }
    if (c >= 'a' && c <= 'f') {
        return c - 'a' + 10;
    }

    return -1;
}

/*
 * decode_uri
 *
 * Decodes the percent-encoded, NUL-terminated string src into a newly
 * allocated, NUL-terminated string stored in *dest.
 *
 * Each "%XX" with two hexadecimal digits (upper or lower case) becomes
 * the single byte 0xXX. A '%' that is not followed by two hex digits is
 * copied literally, so malformed or truncated input is never read past
 * its terminator. All other bytes are copied unchanged. Note that "%00"
 * yields an embedded NUL byte, which ends the result when it is read as
 * a C string.
 *
 * Returns 0 on success. On failure *dest is NULL (when dest itself is
 * non-NULL), strerror(errno) is printed to stderr, and the result is:
 *   -1  the output buffer could not be allocated
 *   -3  src or dest is NULL
 *
 * The caller owns *dest and must release it with free().
 */
int decode_uri(char* const src, char** dest)
{
    int                     status  = 0;

    size_t                  len     = 0,
                            current = 0;    /* write offset into out */

    const unsigned char*    p       = (const unsigned char*)src;

    unsigned char*          out     = NULL;

    if (dest == NULL) {
        errno = EINVAL;
        status = -3; goto ERR;
    }
    *dest = NULL;
    if (src == NULL) {
        errno = EINVAL;
        status = -3; goto ERR;
    }

    /*
     * Decoding never lengthens the text, so strlen(src) + 1 bytes always
     * suffice and the buffer never has to grow.
     */
    len = strlen(src);
    if (len == (size_t)-1) {
        errno = ENOMEM;
        status = -1; goto ERR;
    }
    /* The cast keeps the file compilable as C++ as well as C */
    if ((out = (unsigned char*)malloc(len + 1)) == NULL) {
        status = -1; goto ERR;
    }

    while (*p != '\0') {
        int hi  = -1,
            lo  = -1;

        if (*p == '%') {
            /*
             * p[1] is at worst the terminator; p[2] is only inspected
             * once p[1] is known to be a hex digit (hence not '\0').
             */
            hi = uri_hex_value(p[1]);
            if (hi >= 0) {
                lo = uri_hex_value(p[2]);
            }
        }

        if (hi >= 0 && lo >= 0) {
            /* Valid escape: combine both nibbles and skip "%XX" */
            out[current++] = (unsigned char)((hi << 4) | lo);
            p += 3;
        } else {
            /* Ordinary byte, or a stray '%': copy as-is */
            out[current++] = *p;
            p++;
        }
    }
    /* Terminate explicitly instead of relying on a pre-zeroed buffer */
    out[current] = '\0';

    *dest = (char*)out;

    return 0;

ERR:
    /* Error reporting */
    fprintf(stderr, "%s\n",
            strerror(errno));

    return status;
}
result

% gcc example.c -lbenly_uri -lbenly_column -o example
% ./example
やすな                      => %E3%82%84%E3%81%99%E3%81%AA
%E3%82%84%E3%81%99%E3%81%AA => やすな
きちんとエンコード・デコード共にする事ができました。

今の時代の言語ではワンライナーで書く事ができますが、あえて古い言語で書くと一層理解の助けになるかと思います。