Skip to content

Instantly share code, notes, and snippets.

@JIghtuse
Created July 13, 2017 06:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JIghtuse/0ad3e3b876a0ad4bc48cef9fedff514b to your computer and use it in GitHub Desktop.
Save JIghtuse/0ad3e3b876a0ad4bc48cef9fedff514b to your computer and use it in GitHub Desktop.
HTTP log parser
#include <algorithm>
#include <cctype>
#include <cerrno>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <limits>
#include <queue>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
// Ranking size used when the command line does not supply one.
constexpr int kDefaultNumberOfResults = 100;

// Settings for a single run, as filled in from the command line.
struct Configuration {
    std::string in_path;               // file the log is read from
    std::string out_path;              // file the report is written to
    int n{kDefaultNumberOfResults};    // maximum number of rows per ranking
};
// Counters accumulated while scanning the input for URLs.
struct Statistics {
    std::unordered_map<std::string, int> domains;  // occurrences keyed by domain
    std::unordered_map<std::string, int> paths;    // occurrences keyed by path
    int urlCount = 0;                              // number of URLs counted
};
// Builds the run configuration from the command line.
//
// Two argument layouts are recognised:
//   argc == 5:  <program> <flag> <count> <input> <output>
//   argc == 3:  <program> <input> <output>
// The flag in argv[1] is not inspected. With any other argument count the
// returned configuration keeps its defaults (empty paths, default count).
//
// <count> must be a complete non-negative decimal integer that fits in an
// int. Anything else is reported on std::cerr and the default count is kept,
// instead of being silently turned into 0 (or overflowing) as std::atoi would.
Configuration parse_args(int argc, char **argv)
{
    Configuration config;
    if (argc == 5) {
        const char* const count_text = argv[2];
        char* parse_end = nullptr;
        errno = 0;  // strtol reports overflow only through errno
        const long requested = std::strtol(count_text, &parse_end, 10);
        const bool whole_token_parsed =
            parse_end != count_text && *parse_end == '\0' && errno != ERANGE;
        if (whole_token_parsed && requested >= 0
            && requested <= std::numeric_limits<int>::max()) {
            config.n = static_cast<int>(requested);
        } else {
            std::cerr << "Ignoring invalid result count '" << count_text
                      << "', using " << config.n << '\n';
        }
        config.in_path = argv[3];
        config.out_path = argv[4];
    } else if (argc == 3) {
        config.in_path = argv[1];
        config.out_path = argv[2];
    }
    return config;
}
// Returns true when `c` is accepted as part of a domain name: an alphanumeric
// character (as classified by std::isalnum), '.' or '-'.
bool is_domain_character(char c)
{
    // The <cctype> classifiers require a value representable as unsigned char;
    // passing a negative char (any byte >= 0x80 where char is signed) is
    // undefined behaviour, so convert first.
    const auto as_unsigned = static_cast<unsigned char>(c);
    return std::isalnum(as_unsigned) != 0 || c == '.' || c == '-';
}
// Returns true when `c` is accepted as part of a URL path: an alphanumeric
// character (as classified by std::isalnum) or one of '/', '.', '_', ',', '+'.
bool is_path_character(char c)
{
    // The <cctype> classifiers require a value representable as unsigned char;
    // passing a negative char (any byte >= 0x80 where char is signed) is
    // undefined behaviour, so convert first.
    const auto as_unsigned = static_cast<unsigned char>(c);
    return std::isalnum(as_unsigned) != 0
        || c == '/' || c == '.' || c == '_' || c == ',' || c == '+';
}
// Scans `in` line by line and counts every URL found.
//
// A URL is "http" or "https", followed by "://", followed by a non-empty run
// of domain characters (see is_domain_character). If the character right
// after the domain is '/', the following run of path characters (see
// is_path_character) is the path; otherwise the path is recorded as "/".
// Each URL increments its domain counter, its path counter and urlCount.
// Several URLs on one line are all counted.
Statistics gatherStats(std::ifstream& in)
{
    Statistics stat;
    const std::string scheme = "http";
    const std::string separator = "://";

    for (std::string line; std::getline(in, line);) {
        auto pos = line.find(scheme);
        while (pos != std::string::npos) {
            pos += scheme.length();
            if (pos < line.length() && line[pos] == 's') {
                ++pos;  // optional 's' of "https"
            }
            // compare() copes with fewer than three remaining characters, so
            // a candidate truncated by the end of the line is simply rejected.
            if (line.compare(pos, separator.length(), separator) != 0) {
                pos = line.find(scheme, pos);
                continue;
            }
            pos += separator.length();

            const auto domain_begin = pos;
            while (pos < line.length() && is_domain_character(line[pos])) {
                ++pos;
            }
            if (pos == domain_begin) {
                // "http://" with nothing usable after it: not a URL.
                pos = line.find(scheme, pos);
                continue;
            }
            const std::string domain = line.substr(domain_begin, pos - domain_begin);

            // A path exists only when it starts with '/'. Deciding this once,
            // before consuming anything, keeps `pos` moving forward: the
            // scanner can no longer get stuck re-finding the same "http"
            // when the domain is followed by '_', ',' or '+'.
            std::string path = "/";
            if (pos < line.length() && line[pos] == '/') {
                const auto path_begin = pos;
                while (pos < line.length() && is_path_character(line[pos])) {
                    ++pos;
                }
                path = line.substr(path_begin, pos - path_begin);
            }

            ++stat.domains[domain];
            ++stat.paths[path];
            ++stat.urlCount;
            pos = line.find(scheme, pos);
        }
    }
    return stat;
}
// Writes `title` on its own line, then up to `n` entries of `stat` as
// "<count> <key>" lines, highest count first; entries with equal counts are
// ordered by key ascending. Nothing but the title is written when `n` is
// zero or negative.
void printStat(std::ofstream& out, const char* title, int n, const std::unordered_map<std::string, int>& stat)
{
    out << title << '\n';

    // Rank pointers to the map's own entries so that no key string is copied.
    using Entry = std::unordered_map<std::string, int>::value_type;
    std::vector<const Entry*> ranking;
    ranking.reserve(stat.size());
    for (const Entry& entry : stat) {
        ranking.push_back(&entry);
    }

    const std::size_t limit =
        n > 0 ? std::min(static_cast<std::size_t>(n), ranking.size()) : 0;
    const auto ranking_end = ranking.begin() + static_cast<std::ptrdiff_t>(limit);

    // Only the first `limit` positions have to be in order.
    std::partial_sort(ranking.begin(), ranking_end, ranking.end(),
                      [](const Entry* lhs, const Entry* rhs) {
                          if (lhs->second != rhs->second) {
                              return lhs->second > rhs->second;
                          }
                          return lhs->first < rhs->first;
                      });

    for (auto it = ranking.begin(); it != ranking_end; ++it) {
        out << (*it)->second << ' ' << (*it)->first << '\n';
    }
}
// Writes the full report to `out`: a summary line with the URL total and the
// number of distinct domains and paths, a blank line, the domain ranking, a
// blank line, and the path ranking. Both rankings are limited to config.n rows.
void printStats(std::ofstream& out, const Configuration& config, const Statistics& stat)
{
    const auto distinctDomains = stat.domains.size();
    const auto distinctPaths = stat.paths.size();

    out << "total urls " << stat.urlCount;
    out << ", domains " << distinctDomains;
    out << ", paths " << distinctPaths;
    out << "\n\n";

    const int rows = config.n;
    printStat(out, "top domains", rows, stat.domains);
    out << '\n';
    printStat(out, "top paths", rows, stat.paths);
}
int main(int argc, char **argv)
{
auto configuration = parse_args(argc, argv);
auto in = std::ifstream{configuration.in_path};
if (!in) {
std::cerr << "Failed to open input file\n";
return 1;
}
auto out = std::ofstream{configuration.out_path};
if (!out) {
std::cerr << "Failed to open output file\n";
return 1;
}
printStats(out, configuration, gatherStats(in));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment