Skip to content

Instantly share code, notes, and snippets.

@Starl1ght
Last active February 27, 2017 07:14
Show Gist options
  • Save Starl1ght/0c1effd58dc580d5bc389e9b0f13fb32 to your computer and use it in GitHub Desktop.
Save Starl1ght/0c1effd58dc580d5bc389e9b0f13fb32 to your computer and use it in GitHub Desktop.
unigine test
#include <fstream>
#include <iostream>
#include <map>
#include <vector>
#include <set>
// PREFACE
// I have never written text parsers like this before; it came out rather ugly.. ;(
// Known to work on MSVC2015 (should in theory work on MSVC2013, but that was not tested)
// Parses Russian UTF-8 text and attempts to compare words lexicographically.
// There should be no difference between Linux and Windows, since UTF-8 is the same everywhere.
// State of the byte-level parser: which byte of a character is expected next.
enum class FS {
FirstChar, // Expecting an ASCII letter, a 0xD0/0xD1 lead byte, or a separator byte
SecondChar, // A 0xD0/0xD1 lead byte was just buffered; expecting its second byte
};
// Current parser state; fsm() replaces it after every processed byte.
FS state = FS::FirstChar;
// Bytes of the word currently being accumulated (letters are stored lowercased).
std::vector<uint8_t> buf;
// Occurrence count per word. Each key is the word's bytes followed by a
// terminating 0 byte, so key.data() can be used as a C string.
std::map<std::vector<uint8_t>, uint32_t> wordMap;
// Lowercases the second byte of a two-byte UTF-8 Cyrillic capital letter.
//
// Precondition: buf.back() is the 0xD0 lead byte of the letter being processed
// and `ch` is its second byte, in [0x90, 0xAF] (U+0410 .. U+042F).
//
// Returns the second byte of the corresponding lowercase letter.
// Side effect: for U+0420 .. U+042F the lowercase form is encoded under the
// 0xD1 lead byte, so the lead byte already stored in buf is changed to 0xD1.
uint8_t ruLowerCase(uint8_t ch) {
    if (ch <= 0x9F) {
        // U+0410..U+041F (D0 90..9F) -> U+0430..U+043F (D0 B0..BF): same lead byte.
        return static_cast<uint8_t>(ch + 0x20);
    }
    // U+0420..U+042F (D0 A0..AF) -> U+0440..U+044F (D1 80..8F).
    ++buf.back(); // 0xD0 -> 0xD1
    // The offset is exactly 0x20; the previous 0x1f mapped every letter to the
    // following one (and the last letter outside of the alphabet).
    return static_cast<uint8_t>(ch - 0x20);
}
// Handles a byte while the parser is at a character boundary.
//
// ASCII letters are appended to buf (capitals lowercased). A 0xD0/0xD1 byte is
// buffered as the lead byte of a two-byte character and the parser switches to
// FS::SecondChar. Any other byte ends the current word: if buf is non-empty it
// is 0-terminated, counted in wordMap and cleared.
//
// Returns the state the parser must be in for the next byte.
FS ProcessFirstChar(uint8_t ch) {
    const bool asciiCapital = (0x41 <= ch) && (ch <= 0x5A);
    const bool asciiSmall = (0x61 <= ch) && (ch <= 0x7A);
    if (asciiCapital || asciiSmall) {
        // Capitals and small letters are 0x20 apart in ASCII.
        const uint8_t lowered = asciiCapital ? static_cast<uint8_t>(ch + 0x20) : ch;
        buf.push_back(lowered);
        return FS::FirstChar;
    }

    const bool twoByteLead = (ch == 0xD0) || (ch == 0xD1);
    if (twoByteLead) {
        buf.push_back(ch);
        return FS::SecondChar;
    }

    // Separator byte: nothing to do unless a word is pending.
    if (buf.empty()) {
        return FS::FirstChar;
    }
    buf.push_back(0);
    ++wordMap[buf];
    buf.clear();
    return FS::FirstChar;
}
// Handles the second byte of a two-byte UTF-8 character whose lead byte
// (0xD0 or 0xD1) is the last element of buf.
//
// Accepted letters (always stored in lowercase):
//   D0 90..AF  U+0410..U+042F  capitals, lowercased via ruLowerCase()
//   D0 B0..BF  U+0430..U+043F  small letters
//   D1 80..8F  U+0440..U+044F  small letters
//   D0 81      U+0401          capital IO, stored as U+0451 (D1 91)
//   D1 91      U+0451          small IO
// Anything else is not a supported letter: the buffered lead byte is dropped
// and the pending word (if any) is 0-terminated, counted in wordMap and
// cleared, i.e. the character acts as a word separator.
//
// Always returns FS::FirstChar, since the character is complete either way.
FS ProcessSecondChar(uint8_t ch) {
    const uint8_t lead = buf.back();

    if (lead == 0xD0) {
        if (ch >= 0x90 && ch <= 0xAF) {
            // ruLowerCase may rewrite the buffered lead byte to 0xD1.
            ch = ruLowerCase(ch);
            buf.push_back(ch);
            return FS::FirstChar;
        }
        if (ch >= 0xB0 && ch <= 0xBF) {
            buf.push_back(ch);
            return FS::FirstChar;
        }
        if (ch == 0x81) {
            // U+0401 lies outside the contiguous capital range, so it is
            // lowercased here: D0 81 -> D1 91. Previously it split words.
            buf.back() = 0xD1;
            buf.push_back(0x91);
            return FS::FirstChar;
        }
    } else if (lead == 0xD1) {
        if ((ch >= 0x80 && ch <= 0x8F) || ch == 0x91) {
            buf.push_back(ch);
            return FS::FirstChar;
        }
    }

    // Unsupported character: discard its lead byte and finish the word.
    buf.pop_back();
    if (!buf.empty()) {
        buf.push_back(0);
        ++wordMap[buf];
        buf.clear();
    }
    return FS::FirstChar;
}
// Feeds one input byte to the parser: dispatches to the handler for the
// current state and stores the state that handler returns.
void fsm(uint8_t ch) {
    if (state == FS::FirstChar) {
        state = ProcessFirstChar(ch);
    } else if (state == FS::SecondChar) {
        state = ProcessSecondChar(ch);
    }
}
// Returns the size of the stream in bytes and rewinds it to the beginning.
//
// If the position cannot be determined (stream not open, not seekable, or
// already in a failed state) tellg() reports -1; in that case 0 is returned
// instead of letting -1 wrap around to a huge unsigned value.
uint64_t GetSize(std::ifstream& ifs) {
    ifs.seekg(0, std::ios::end);
    const std::streamoff endOffset = ifs.tellg();
    ifs.seekg(0, std::ios::beg);
    if (endOffset < 0) {
        return 0;
    }
    return static_cast<uint64_t>(endOffset);
}
int main(int argc, char** argv) {
if (argc != 3) {
std::cout << "Specify I\O files.\n";
return -3;
}
std::ifstream ifs{ argv[1], std::ios::binary };
if (!ifs.is_open()) {
std::cout << "Input '" << argv[1] << "' not opened. Exiting.\n";
return -1;
}
const auto fsize = GetSize(ifs);
uint8_t ch;
for (uint64_t i = 0; i < fsize; ++i) {
ifs.read((char*)&ch, 1);
fsm(ch);
}
if (!buf.empty()) {
buf.push_back(0);
++wordMap[buf];
}
using wordPair = std::pair<std::vector<uint8_t>, uint32_t>;
struct wpcompare {
bool operator()(const wordPair& pr1, const wordPair& pr2) const {
if (pr1.second > pr2.second) {
return true;
}
if (pr1.second < pr2.second) {
return false;
}
if (strcmp((const char*)pr1.first.data(), (const char*)pr2.first.data()) < 0) {
return true;
}
return false;
}
};
std::set<wordPair, wpcompare> pairset;
for (const auto& pair : wordMap) {
pairset.emplace(pair.first, pair.second);
}
std::ofstream ofs{ argv[2], std::ios::trunc };
if (!ofs.is_open()) {
std::cout << "Output '" << argv[2] << "' not opened. Exiting.\n";
return -2;
}
for (const auto& pr : pairset) {
ofs << pr.first.data() << " " << pr.second << '\n';
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment