Last active
August 29, 2015 14:14
-
-
Save ubnt-intrepid/24ddd1690da0b34c6c34 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm> | |
#include <iostream> | |
#include <string> | |
#include <regex> | |
/// Extracts the first record from the character range [rstart, rend).
///
/// A record ends at the first line feed that is not enclosed in double
/// quotes, or at `rend` when no such line feed exists.
///
/// @param rstart  Beginning of the text to scan.
/// @param rend    End of the text to scan.
/// @return Pair {begin, end} delimiting the record. `end` either points at the
///         terminating '\n' (which is not part of the record) or equals `rend`.
std::pair<std::string::const_iterator, std::string::const_iterator>
extract_record(std::string::const_iterator rstart, std::string::const_iterator rend)
{
    const auto record_begin = rstart;
    bool in_quotes = false;
    for (; rstart != rend; ++rstart) {
        // Inspect the current character only: the quote state is updated before
        // the iterator advances, so `rend` is never dereferenced and a quote in
        // the very first position is taken into account.
        if (*rstart == '"') {
            in_quotes = !in_quotes;
        } else if (*rstart == '\n' && !in_quotes) {
            break;
        }
    }
    return std::make_pair(record_begin, rstart);
}
/// Splits one record into comma-separated fields and appends them to `fields`.
///
/// * Whitespace in front of a field and after a quoted field is skipped.
/// * A field that is completely enclosed in double quotes is stored without
///   the enclosing quotes; commas inside the quotes do not split the field.
///   Doubled quotes inside such a field are kept as they are (no unescaping).
/// * The number of appended fields is always "number of separators + 1", so an
///   empty record yields exactly one empty field.
///
/// @param fields  Output container; extracted fields are appended to it.
/// @param fstart  Beginning of the record.
/// @param fend    End of the record.
void extract_fields(std::vector<std::string>& fields, std::string::const_iterator fstart,
                    std::string::const_iterator fend)
{
    // Group 1: the field text. The quoted alternative is tried first; the
    //          unquoted fallback `[^,]*` would otherwise always win and cut a
    //          quoted field at its first embedded comma.
    // Group 2: the separating comma; it is unmatched when the field ran up to
    //          the end of the record.
    static const std::regex re("\\s*(\"(?:[^\"]|\"\")*\"|[^,]*)\\s*(?:(,)\\s*|$)");

    for (std::sregex_iterator it(fstart, fend, re), last; it != last; ++it) {
        const std::ssub_match& field = (*it)[1];

        // Only strip quotes when both of them are really there; this also keeps
        // empty matches and a lone '"' from forming an invalid iterator range.
        const bool quoted =
            field.length() >= 2 && *field.first == '"' && *(field.second - 1) == '"';
        if (quoted) {
            fields.emplace_back(field.first + 1, field.second - 1);
        } else {
            fields.emplace_back(field.first, field.second);
        }

        // No separator after this field: it was the last one. Stopping here
        // avoids the extra empty match the iterator would report at the end.
        if (!(*it)[2].matched) {
            break;
        }
    }
}
int main() | |
{ | |
std::string const s("a,b, \" c \",\"dd\nab\",\nabceddfea,\"\"\n\nofd"); | |
auto start = std::begin(s), end = std::end(s); | |
while (std::distance(start, end) > 0) { | |
/// レコードの抽出 | |
std::string::const_iterator fstart, fend; | |
std::tie(fstart, fend) = extract_record(start, end); | |
// フィールドの抽出 | |
std::vector<std::string> fields; | |
extract_fields(fields, fstart, fend); | |
for (auto& field : fields) | |
std::cout << " [" << field << "]"; | |
std::cout << "<\n"; | |
start = fend + 1; | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm> | |
#include <iostream> | |
#include <iterator> | |
#include <string> | |
#include <regex> | |
#include <map> | |
/// Converts a CSV token into the plain field value.
///
/// A token enclosed in double quotes loses the enclosing quotes and every
/// doubled quote ("") inside it becomes a single quote. Any other token is
/// returned unchanged; an unmatched or empty sub-match yields an empty string.
///
/// @param src  Token as reported by the regex tokenizer.
/// @return The unquoted / unescaped field text.
inline std::string normal_str(std::sub_match<std::string::const_iterator> src)
{
    // Check `matched` and the length first so that neither an empty token nor a
    // lone '"' is ever dereferenced or turned into an inverted iterator range.
    const bool quoted = src.matched && src.length() >= 2 && *src.first == '"'
        && *(src.second - 1) == '"';
    if (!quoted) {
        return src.str();
    }

    // Collapse "" -> " by hand; this avoids building a std::regex on each call.
    std::string out;
    out.reserve(static_cast<std::string::size_type>(src.length() - 2));
    const std::string::const_iterator stop = src.second - 1;
    for (std::string::const_iterator it = src.first + 1; it != stop; ++it) {
        out.push_back(*it);
        if (*it == '"' && it + 1 != stop && *(it + 1) == '"') {
            ++it; // skip the second quote of the escaped pair
        }
    }
    return out;
}
// CSVを解析(レコード単位) | |
auto parse_csv(std::string::const_iterator first, std::string::const_iterator last) | |
-> std::vector<std::vector<std::string>> | |
{ | |
using namespace std; | |
vector<vector<string>> result; | |
result.push_back(vector<string>()); | |
result.back().push_back(""); | |
static const regex re(",|\\r?\\n|[^,\"\\r\\n][^,\\r\\n]*|\"(?:[^\"]|\"\")*\""); | |
for (sregex_token_iterator rfirst(first, last, re), rlast; rfirst != rlast; ++rfirst) { | |
if (*rfirst == "\n" || *rfirst == "\r\n") { | |
// レコードを挿入する | |
result.push_back(vector<string>()); | |
result.back().push_back(""); | |
} else if (*rfirst == ",") { | |
// フィールドを挿入する | |
result.back().push_back(""); | |
} else { | |
result.back().back() = normal_str(*rfirst); | |
} | |
} | |
return result; | |
} | |
// CSVを解析(フィールド単位) | |
// * フィールド数は最初に読み込んだレコードで決定 | |
// * 最初のフィールド数以上の要素は以後無視 | |
// * 最初のフィールド数に満たない場合は空白で埋める | |
auto parse_csv_2(std::string::const_iterator first, std::string::const_iterator last) | |
-> std::vector<std::vector<std::string>> | |
{ | |
using namespace std; | |
vector<vector<string>> result; | |
static const regex re(",|\\r?\\n|[^,\"\\r\\n][^,\\r\\n]*|\"(?:[^\"]|\"\")*\""); | |
sregex_token_iterator rfirst(first, last, re), rlast; | |
vector<string> tmp(1); | |
// 1行目 | |
for (; rfirst != rlast; ++rfirst) { | |
if (*rfirst == "\n" || *rfirst == "\r\n") { | |
++rfirst; | |
break; | |
} else if (*rfirst == ",") | |
tmp.push_back(""); | |
else | |
tmp.back() = normal_str(*rfirst); | |
} | |
// フィールド数の決定 | |
static const size_t N = tmp.size(); | |
result.resize(N); | |
for (size_t i = 0; i < N; ++i) { | |
result[i].push_back(tmp[i]); | |
} | |
tmp.resize(1); | |
// 2行目以降 | |
for (; rfirst != rlast; ++rfirst) { | |
if (*rfirst == "\n" || *rfirst == "\r\n") { | |
for (size_t i = 0; i < N; ++i) { | |
result[i].push_back(i < tmp.size() ? tmp[i] : ""); | |
} | |
tmp.resize(1); | |
} else if (*rfirst == ",") { | |
tmp.push_back(""); | |
} else { | |
tmp.back() = normal_str(*rfirst); | |
} | |
} | |
return result; | |
} | |
// CSVを解析 std::map<> ver. | |
auto parse_csv_3(std::string::const_iterator first, std::string::const_iterator last) | |
-> std::map<std::string, std::vector<std::string>> | |
{ | |
using namespace std; | |
static const regex re(",|\\r?\\n|[^,\"\\r\\n][^,\\r\\n]*|\"(?:[^\"]|\"\")*\""); | |
sregex_token_iterator rfirst(first, last, re), rlast; | |
// 1行目 | |
vector<string> keys(1); | |
for (; rfirst != rlast; ++rfirst) { | |
if (*rfirst == "\n" || *rfirst == "\r\n") { | |
rfirst++; | |
break; | |
} else if (*rfirst == ",") | |
keys.push_back(""); | |
else | |
keys.back() = normal_str(*rfirst); | |
} | |
static const size_t N = keys.size(); | |
vector<vector<string>> values(N); | |
// 2行目以降 | |
vector<string> tmp(1); | |
for (; rfirst != rlast; ++rfirst) { | |
if (*rfirst == "\n" || *rfirst == "\r\n") { | |
for (size_t i = 0; i < N; ++i) { | |
values[i].push_back((i < tmp.size()) ? tmp[i] : ""); | |
} | |
tmp.resize(1); | |
tmp.reserve(N); | |
} else if (*rfirst == ",") { | |
tmp.push_back(""); | |
} else { | |
tmp.back() = normal_str(*rfirst); | |
} | |
} | |
map<string, vector<string>> result; | |
for (size_t i = 0; i < N; ++i) { | |
result.insert(std::make_pair(move(keys[i]), move(values[i]))); | |
} | |
return result; | |
} | |
int main() | |
{ | |
using namespace std; | |
string const content = "a,\"bc\"\" de\"\"\"\" e\",c\nd,e,f\ng,h,,i,j,k\nl,m,n,\no"; | |
cout << "[test1]\n"; | |
{ | |
auto result = parse_csv(content.begin(), content.end()); | |
for (auto& record : result) { | |
copy(begin(record), end(record), ostream_iterator<string>(cout, ":")); | |
cout << "\n"; | |
} | |
} | |
cout << endl; | |
cout << "[test2]\n"; | |
{ | |
auto result = parse_csv_2(content.begin(), content.end()); | |
for (auto& record : result) { | |
copy(begin(record), end(record), ostream_iterator<string>(cout, ":")); | |
cout << "\n"; | |
} | |
} | |
cout << endl; | |
cout << "[test3]\n"; | |
{ | |
auto result = parse_csv_3(content.begin(), content.end()); | |
for (auto& record : result) { | |
cout << "[" + record.first + "] "; | |
copy(begin(record.second), end(record.second), ostream_iterator<string>(cout, ":")); | |
cout << "\n"; | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm> | |
#include <iterator> | |
#include <string> | |
#include <utility> | |
#include <map> | |
#include <regex> | |
#include <fstream> | |
#include <iostream> | |
#include <sstream> | |
// memo: | |
// * フィールドの検索・置換処理に std::regex を使用しているため VS2010 | |
// より古いコンパイラでは使用不可 | |
// (boost::regexを使えば良いが未検証) | |
// * 現状では文字列として読み込むのみ | |
namespace csv { | |
using std::string; | |
using std::vector; | |
using std::map; | |
using std::pair; | |
using std::make_pair; | |
using std::sub_match; | |
using std::regex; | |
using std::regex_replace; | |
using std::sregex_token_iterator; | |
// Implementation of lexical_cast.
namespace detail {
/// Converts a string to T by stream extraction (operator>>).
template <typename T> struct lexical_cast_t {
    T operator()(std::string const& src) const
    {
        std::istringstream isstr(src);
        // Value-initialize: when extraction fails before anything is parsed
        // (e.g. an empty string) the stream leaves `result` untouched, and
        // returning an uninitialized value would be undefined behaviour.
        T result{};
        isstr >> result;
        return result;
    }
};
/// std::string is passed through unchanged; stream extraction would stop at
/// the first whitespace character.
template <> struct lexical_cast_t<std::string> {
    std::string operator()(std::string const& src) const { return src; }
};
} // namespace detail

/// Converts the text `src` to a value of type T.
///
/// @tparam T   Target type; must support `std::istream >> T` unless it is
///             std::string.
/// @param src  Text to convert.
/// @return The converted value, or a value-initialized T when nothing could be
///         extracted from `src`.
template <typename T> inline T lexical_cast(std::string const& src)
{
    return detail::lexical_cast_t<T>()(src);
}
/// Restores a field stored in CSV notation to its original text.
///
/// A token enclosed in double quotes loses the enclosing quotes and every
/// doubled quote ("") inside it becomes a single quote. Any other token is
/// returned unchanged; an unmatched or empty sub-match yields an empty string.
///
/// @param src  Token as reported by the regex tokenizer.
/// @return The unquoted / unescaped field text.
inline std::string normal_str(std::sub_match<std::string::const_iterator> src)
{
    // Check `matched` and the length first so that neither an empty token nor a
    // lone '"' is ever dereferenced or turned into an inverted iterator range.
    const bool quoted = src.matched && src.length() >= 2 && *src.first == '"'
        && *(src.second - 1) == '"';
    if (!quoted) {
        return src.str();
    }
    static const std::regex re("\"\"");
    static const std::string replacer("\"");
    return std::regex_replace(std::string(src.first + 1, src.second - 1), re, replacer);
}
// Tag type (and its single constant) used to request the index instead of a
// keyed column.
struct index_tag {
};
// Explicit initializer: a const object of class type without one is rejected
// by compilers that do not implement the resolution of CWG issue 253.
static const index_tag index = {};
// 時系列を表す | |
class data_frame { | |
typedef vector<string> index_t; | |
typedef vector<string> column_t; | |
typedef map<string, column_t> dict_t; | |
index_t index; | |
dict_t dict; | |
public: | |
data_frame() {} | |
column_t& operator[](std::string key) { return dict.at(key); } | |
column_t const& operator[](std::string key) const { return dict.at(key); } | |
index_t& operator[](index_tag) { return index; } | |
index_t const& operator[](index_tag) const { return index; } | |
public: | |
/// CSV文字列の解析 | |
/// ・1行目はキー文字列の取得に使用する | |
/// ・1行目のフィールド数以上は以後の行では無視される | |
/// ・1行目のフィールド数に満たない場合は空白が挿入される | |
/// [要件] | |
/// ・content は RFC 4180に従うこと | |
/// ・改行コードは'\n'あるいは'\r\n'のいずれかであること | |
/// ・content の終端はかならず改行コードで終了すること | |
/// ・1行目の2列目以降に空白のみのフィールドがないこと(キー文字列を取得できない) | |
/// [出力] | |
/// ・map< string, vector<string> > 各キーの値が格納された辞書型 | |
/// ・vector<string> 各インデックスの文字列が格納されたリスト | |
inline void read(std::string const& filepath) { read(std::ifstream(filepath)); } | |
inline void read(std::istream& is) | |
{ | |
// std::istream_iterator<>はBidirectionalIteratorに属さないため直接sregex_match()に渡せない | |
// そのため,一度stringに読み込んでから検索を行う | |
// BidirectionalIteratorが取得できるならばそれを使用したほうが良い | |
string content = string(std::istream_iterator<char>(is), std::istream_iterator<char>()); | |
read(content.begin(), content.end()); | |
} | |
template <typename BidirectionalIterator> | |
void read(BidirectionalIterator start, BidirectionalIterator end) | |
{ | |
static const regex re(",|\\r?\\n|[^,\"\\r\\n][^,\\r\\n]*|\"(?:[^\"]|\"\")*\""); | |
sregex_token_iterator it(start, end, re), last; | |
// 1行目はmapのキー取得に用いる | |
vector<string> keys(1, ""); | |
for (; it != last; ++it) { | |
if (*it == "\n" || *it == "\r\n") { | |
++it; | |
break; | |
} else if (*it == ",") | |
keys.push_back(""); | |
else | |
keys.back() = normal_str(*it); | |
} | |
// 2行目以降は各キーに入力される値を取得していく | |
// ・入力したフィールド数が指定したキー数未満の場合,空欄で埋める | |
// ・指定したキー数以上のフィールドは無視 | |
vector<vector<string> > values(keys.size()); | |
vector<string> buf(1, ""); | |
for (; it != last; ++it) { | |
if (*it == "\n" || *it == "\r\n") { | |
for (size_t i = 0, N = keys.size(); i < N; ++i) { | |
values[i].push_back(i < buf.size() ? buf[i] : ""); | |
} | |
buf.resize(1, ""); | |
} else if (*it == ",") { | |
buf.push_back(""); | |
} else { | |
buf.back() = normal_str(*it); | |
} | |
} | |
// 各キー毎に値を代入していく | |
map<string, vector<string> > result; | |
for (size_t i = 1, N = keys.size(); i < N; ++i) { | |
// 1列目はインデックスで使用するため無視 | |
result.insert(make_pair(move(keys[i]), move(values[i]))); | |
} | |
this->index = move(values[0]); | |
this->dict = move(result); | |
} | |
}; | |
} // namespace csv; | |
int main() | |
{ | |
std::string const content = ",\"bc\"\" de\"\"\"\" e\",c\nd,e,f\ng,h,,i,j,k\nl,m,n,\no"; | |
// 読み込み | |
csv::data_frame data; | |
data.read(content.begin(), content.end()); // BidirectionalIteratorが取得可能な場合 | |
// data.read(std::stringstream(content)); // streamから直接読み込む場合 | |
// data.read("result.csv"); // ファイルから直接読み込む場合 | |
// data.read("result2.csv", csv::append); // 既存のキーに重複しないものを追加で読み込む | |
// 要素へのアクセス | |
auto index = data[csv::index]; // [ "d", "g", "l" ] | |
auto value1 = data["bc\" de\"\" e"]; // [ "e", "h", "m" ] | |
auto value2 = data["c"]; // [ "f", "", "n" ] | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[test1] | |
a:bc" de"" e:c: | |
d:e:f: | |
g:h::i:j:k: | |
l:m:n:: | |
o: | |
[test2] | |
a:d:g:l: | |
bc" de"" e:e:h:m: | |
c:f::n: | |
[test3] | |
[a] d:g:l: | |
[bc" de"" e] e:h:m: | |
[c] f::n: |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment