Created
July 12, 2017 22:36
-
-
Save pankkor/91deddf9ddcfe93738fb3885329a5e62 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Unigine C++ School 2 | |
// Homework 1 | |
// | |
// by griff | |
// | |
#include <fstream> | |
#include <iostream> | |
#include <iterator> | |
#include <sstream> | |
#include <string> | |
#include <regex> | |
#include <unordered_map> | |
#include <string.h> | |
#include <stddef.h> | |
#include <assert.h> | |
// optional optimisations | |
#define RESERVE // reserve buckets in hashmap +- 2% | |
#if _WIN32 || _WIN64 | |
#if _WIN64 | |
#define ENV_x64 | |
#else | |
#define ENV_x32 | |
#endif | |
#endif | |
#if __GNUC__ | |
#if __x86_64__ || __ppc64__ | |
#define ENV_x64 | |
#else | |
#define ENV_x32 | |
#endif | |
#endif | |
// MSVC12 hash copy-paste | |
#ifndef _M_X64 | |
#ifdef ENV_x64 | |
#define _M_X64 | |
#endif | |
#endif | |
#ifndef _HASH_SEQ_DEFINED | |
#define _HASH_SEQ_DEFINED | |
inline size_t _Hash_seq(const unsigned char *_First, size_t _Count) | |
{ // FNV-1a hash function for bytes in [_First, _First+_Count) | |
#ifdef _M_X64 | |
static_assert(sizeof(size_t) == 8, "This code is for 64-bit size_t."); | |
const size_t _FNV_offset_basis = 14695981039346656037ULL; | |
const size_t _FNV_prime = 1099511628211ULL; | |
#else /* _M_X64 */ | |
static_assert(sizeof(size_t) == 4, "This code is for 32-bit size_t."); | |
const size_t _FNV_offset_basis = 2166136261U; | |
const size_t _FNV_prime = 16777619U; | |
#endif /* _M_X64 */ | |
size_t _Val = _FNV_offset_basis; | |
for (size_t _Next = 0; _Next < _Count; ++_Next) | |
{ // fold in another byte | |
_Val ^= (size_t)_First[_Next]; | |
_Val *= _FNV_prime; | |
} | |
#ifdef _M_X64 | |
static_assert(sizeof(size_t) == 8, "This code is for 64-bit size_t."); | |
_Val ^= _Val >> 32; | |
#else /* _M_X64 */ | |
static_assert(sizeof(size_t) == 4, "This code is for 32-bit size_t."); | |
#endif /* _M_X64 */ | |
return (_Val); | |
} | |
#endif | |
// small & incomplete string_view impl | |
namespace my { | |
class string_view { | |
protected: | |
const char *data_; | |
size_t size_; | |
public: | |
string_view(const char *s, size_t size) : data_(s), size_(size) {} | |
string_view(const char *s) : data_(s), size_(strlen(s)) {} | |
const char *data() const { | |
return data_; | |
} | |
size_t size() const { | |
return size_; | |
} | |
int compare(const string_view v) const { | |
int r = std::char_traits<char>::compare(data_, v.data_, std::min(size_, v.size_)); | |
return r == 0 ? static_cast<int>(size_) - static_cast<int>(v.size_) : r; | |
} | |
string_view substr(size_t pos = 0, size_t count = std::string::npos) const { | |
if (count > size_ - pos) { | |
return substr(pos, size_ - pos); | |
} else { | |
return string_view(data_ + pos, count); | |
} | |
} | |
bool operator==(const string_view& rhs) const { | |
return compare(rhs) == 0; | |
} | |
bool operator!=(const string_view& rhs) const { | |
return compare(rhs) != 0; | |
} | |
bool operator<(const string_view& rhs) const { | |
return compare(rhs) < 0; | |
} | |
bool operator>(const string_view& rhs) const { | |
return rhs < *this; | |
} | |
bool operator<=(const string_view& rhs) const { | |
return !(*this > rhs); | |
} | |
bool operator>=(const string_view& rhs) const { | |
return !(*this < rhs); | |
} | |
const char operator[](size_t pos) const { | |
assert(pos < size_); | |
return *(data_ + pos); | |
} | |
}; | |
} | |
namespace std { | |
template<class _CharT, class _Traits> | |
std::basic_ostream<_CharT, _Traits>& | |
operator<<(std::basic_ostream<_CharT, _Traits>& __os, my::string_view __sv) { | |
__os.write(__sv.data(),__sv.size()); | |
return __os; | |
} | |
template<> | |
struct hash<my::string_view> : public unary_function<my::string_view, size_t> { | |
size_t operator()(const my::string_view& __val) const { | |
return _Hash_seq((const unsigned char *)__val.data(), __val.size()); | |
} | |
}; | |
} | |
using string_view = my::string_view; | |
// mmap file read only (defined later in this file) | |
// | |
// Return values: | |
// mmaped buffer if succeeds or NULL if fails | |
// mmaped size in out_size parameter | |
void *gr_mmap_file(const char *filepath, size_t *out_size); | |
// munmmap file | |
// Returns values: | |
// false if fails | |
bool gr_munmap_file(void *data, size_t size); | |
using namespace std; | |
template <typename T, typename U> | |
vector<pair<T, U> > vector_from_unordered_map(const unordered_map<T, U> &map) { | |
vector<pair<T, U> > out; | |
out.reserve(map.size()); | |
for (auto pair : map) { | |
out.push_back(pair); | |
} | |
return out; | |
} | |
template <typename T, typename U> | |
void print_freqs_vector(const vector<pair<T, U> > &vector, size_t top_n) { | |
for (size_t i = 0; i < min(vector.size(), top_n); ++i) { | |
auto pair = vector[i]; | |
cout << pair.second << " " << pair.first << endl; | |
} | |
} | |
static bool sort_by_freq(const pair<string_view, size_t> &a, const pair<string_view, size_t> &b) { | |
return ((a.second == b.second) && (a.first < b.first)) || (a.second > b.second); | |
} | |
static void usage_error(const char *ex) { | |
cerr << "Usage : " << ex << " [-n NNN] <in_file> <out_file>" << endl; | |
} | |
int main(int argc, char **argv) try { | |
// argument checks | |
if (argc != 3 && argc != 5) { | |
usage_error(argv[0]); | |
return 1; | |
} | |
size_t top_n = SIZE_MAX; | |
int arg_in_file = 1; | |
if (string(argv[1]) == "-n") { | |
istringstream iss(argv[2]); | |
if (!(iss >> top_n)) { | |
usage_error(argv[0]); return 1; | |
} | |
arg_in_file = 3; | |
} | |
ios_base::sync_with_stdio(false); | |
size_t len = 0; | |
const char *data = (char *)gr_mmap_file(argv[arg_in_file], &len); | |
if (!data) { | |
cerr << "mmap failed" << endl; | |
return 1; | |
} | |
const char *data_end = data + len; | |
ofstream outf; | |
outf.open(argv[arg_in_file + 1]), cout.rdbuf(outf.rdbuf()); | |
unordered_map<string_view, size_t> domains; | |
unordered_map<string_view, size_t> paths; | |
#ifdef RESERVE | |
domains.reserve(4756); // know your data, muahaha xD | |
paths.reserve(240352); | |
#endif | |
size_t url_count = 0; | |
char c; | |
size_t i = 0; | |
size_t dl, ds = 0; // domain left, size | |
size_t pl, ps = 0; // path left, size | |
size_t size; // s size | |
// iterate over URLs and fill domain and paths maps | |
for(; data < data_end;) { | |
const char *eol = strchr(data, '\n'); | |
if (!eol) { | |
eol = data_end; | |
} | |
size = eol - data; | |
string_view s = string_view(data, size); | |
data = eol + 1; | |
// find http | |
i = 0; | |
const char *p; | |
while (p = strstr(s.data() + i, "http"), p != NULL) { | |
i = p - s.data(); | |
// *http | |
// skip 's' in https | |
if ((i += 4) >= size) { | |
break; // end | |
} | |
if (s[i] == 's') { | |
++i; | |
} | |
// https*:// | |
// skip '://' | |
if (((i += 2) >= size) || | |
(s[i - 2] != ':' || | |
s[i - 1] != '/' || | |
s[i] != '/')) { | |
continue; | |
} | |
// https:/*/ | |
// domain | |
dl = i + 1; | |
ds = 0; | |
while (++i < size) { | |
c = s[i]; | |
if ((c >='a' && c <= 'z') || | |
(c >='A' && c <='Z') || | |
(c >= '0' && c <= '9') || | |
c == '.' || c == '-') { | |
++ds; | |
} else { | |
break; | |
} | |
} | |
if (ds == 0) { | |
break; | |
} | |
// https://domain*/ | |
ps = 0; | |
if (s[i] == '/') { | |
pl = i; | |
ps = 1; | |
while (++i < size) { | |
c = s[i]; | |
if ((c >='a' && c <= 'z') || | |
(c >='A' && c <='Z') || | |
(c >= '0' && c <= '9') || | |
c == '.' || c == '_' || c == ',' || c == '/' || c == '+') { | |
++ps; | |
} else { | |
break; | |
} | |
} | |
} | |
auto domain = ds ? s.substr(dl, ds) : ""; | |
auto path = ps ? s.substr(pl, ps) : "/"; | |
domains[domain] += 1; | |
paths[path] += 1; | |
++url_count; | |
} | |
} | |
// sort domains & paths | |
vector<pair<string_view, size_t> > domain_freqs = vector_from_unordered_map(domains); | |
sort(domain_freqs.begin(), domain_freqs.end(), sort_by_freq); | |
vector<pair<string_view, size_t> > path_freqs = vector_from_unordered_map(paths); | |
sort(path_freqs.begin(), path_freqs.end(), sort_by_freq); | |
// write to file | |
cout << "total urls " << url_count << ", domains " << domains.size() << ", paths " << paths.size() << endl; | |
cout << endl; | |
cout << "top domains" << endl; | |
print_freqs_vector(domain_freqs, top_n); | |
cout << endl; | |
cout << "top paths" << endl; | |
print_freqs_vector(path_freqs, top_n); | |
return 0; | |
} catch (exception& e) { | |
cerr << e.what(); | |
} | |
#if defined(unix) || defined(__unix__) || defined(__unix) || defined(__MACH__) | |
#include <unistd.h> | |
#if defined(_POSIX_MAPPED_FILES) | |
#define POSIX_MMAP | |
#endif | |
#endif | |
#if defined(_WIN32) | |
#define WIN32_LEAN_AND_MEAN | |
#include <windows.h> | |
#elif defined(POSIX_MMAP) | |
#include <fcntl.h> | |
#include <sys/types.h> | |
#include <sys/mman.h> | |
#else | |
#error "Your system doesn't have mmap support" | |
#endif | |
void *gr_mmap_file(const char *filepath, size_t *out_size) { | |
void *data = NULL; | |
size_t size = 0; | |
#if defined(_WIN32) | |
HANDLE map; | |
HANDLE file = CreateFileA(filepath, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); | |
if (file == INVALID_HANDLE_VALUE) { | |
return NULL; | |
} | |
size = GetFileSize(file, NULL); | |
if (size != INVALID_FILE_SIZE && size != 0) { | |
map = CreateFileMappingA(file, NULL, PAGE_READONLY, 0, size, NULL); | |
if (map) { | |
data = (char *)MapViewOfFile(map, FILE_MAP_READ, 0, 0, size); | |
CloseHandle(map); | |
} | |
} | |
CloseHandle(file); | |
#elif defined(POSIX_MMAP) | |
int fd = open(filepath, O_RDONLY); | |
if (fd == -1) { | |
return NULL; | |
} | |
size = lseek(fd, 0, SEEK_END); | |
if (size > 0) { | |
data = (char *)mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0); | |
if (data == MAP_FAILED) { | |
data = NULL; | |
} | |
} | |
close(fd); | |
#endif | |
if (out_size) { | |
*out_size = size; | |
} | |
return data; | |
} | |
bool gr_munmap_file(void *data, size_t size) { | |
#if defined(_WIN32) | |
// suppress warning | |
(void)size; | |
return UnmapViewOfFile(data); | |
#elif defined(POSIX_MMAP) | |
return munmap(data, size) != -1; | |
#endif | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment