Skip to content

Instantly share code, notes, and snippets.

@pankkor
Created July 12, 2017 22:36
Show Gist options
  • Save pankkor/91deddf9ddcfe93738fb3885329a5e62 to your computer and use it in GitHub Desktop.
Save pankkor/91deddf9ddcfe93738fb3885329a5e62 to your computer and use it in GitHub Desktop.
// Unigine C++ School 2
// Homework 1
//
// by griff
//
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <regex>
#include <unordered_map>
#include <string.h>
#include <stddef.h>
#include <assert.h>
// optional optimisations
#define RESERVE // reserve buckets in hashmap +- 2%
#if _WIN32 || _WIN64
#if _WIN64
#define ENV_x64
#else
#define ENV_x32
#endif
#endif
#if __GNUC__
#if __x86_64__ || __ppc64__
#define ENV_x64
#else
#define ENV_x32
#endif
#endif
// MSVC12 hash copy-paste
#ifndef _M_X64
#ifdef ENV_x64
#define _M_X64
#endif
#endif
#ifndef _HASH_SEQ_DEFINED
#define _HASH_SEQ_DEFINED
inline size_t _Hash_seq(const unsigned char *_First, size_t _Count)
{ // FNV-1a hash function for bytes in [_First, _First+_Count)
#ifdef _M_X64
static_assert(sizeof(size_t) == 8, "This code is for 64-bit size_t.");
const size_t _FNV_offset_basis = 14695981039346656037ULL;
const size_t _FNV_prime = 1099511628211ULL;
#else /* _M_X64 */
static_assert(sizeof(size_t) == 4, "This code is for 32-bit size_t.");
const size_t _FNV_offset_basis = 2166136261U;
const size_t _FNV_prime = 16777619U;
#endif /* _M_X64 */
size_t _Val = _FNV_offset_basis;
for (size_t _Next = 0; _Next < _Count; ++_Next)
{ // fold in another byte
_Val ^= (size_t)_First[_Next];
_Val *= _FNV_prime;
}
#ifdef _M_X64
static_assert(sizeof(size_t) == 8, "This code is for 64-bit size_t.");
_Val ^= _Val >> 32;
#else /* _M_X64 */
static_assert(sizeof(size_t) == 4, "This code is for 32-bit size_t.");
#endif /* _M_X64 */
return (_Val);
}
#endif
// small & incomplete string_view impl
namespace my {
class string_view {
protected:
const char *data_;
size_t size_;
public:
string_view(const char *s, size_t size) : data_(s), size_(size) {}
string_view(const char *s) : data_(s), size_(strlen(s)) {}
const char *data() const {
return data_;
}
size_t size() const {
return size_;
}
int compare(const string_view v) const {
int r = std::char_traits<char>::compare(data_, v.data_, std::min(size_, v.size_));
return r == 0 ? static_cast<int>(size_) - static_cast<int>(v.size_) : r;
}
string_view substr(size_t pos = 0, size_t count = std::string::npos) const {
if (count > size_ - pos) {
return substr(pos, size_ - pos);
} else {
return string_view(data_ + pos, count);
}
}
bool operator==(const string_view& rhs) const {
return compare(rhs) == 0;
}
bool operator!=(const string_view& rhs) const {
return compare(rhs) != 0;
}
bool operator<(const string_view& rhs) const {
return compare(rhs) < 0;
}
bool operator>(const string_view& rhs) const {
return rhs < *this;
}
bool operator<=(const string_view& rhs) const {
return !(*this > rhs);
}
bool operator>=(const string_view& rhs) const {
return !(*this < rhs);
}
const char operator[](size_t pos) const {
assert(pos < size_);
return *(data_ + pos);
}
};
}
namespace std {
template<class _CharT, class _Traits>
std::basic_ostream<_CharT, _Traits>&
operator<<(std::basic_ostream<_CharT, _Traits>& __os, my::string_view __sv) {
__os.write(__sv.data(),__sv.size());
return __os;
}
template<>
struct hash<my::string_view> : public unary_function<my::string_view, size_t> {
size_t operator()(const my::string_view& __val) const {
return _Hash_seq((const unsigned char *)__val.data(), __val.size());
}
};
}
using string_view = my::string_view;
// mmap file read only (defined later in this file)
//
// Return values:
// mmaped buffer if succeeds or NULL if fails
// mmaped size in out_size parameter
void *gr_mmap_file(const char *filepath, size_t *out_size);
// munmmap file
// Returns values:
// false if fails
bool gr_munmap_file(void *data, size_t size);
using namespace std;
template <typename T, typename U>
vector<pair<T, U> > vector_from_unordered_map(const unordered_map<T, U> &map) {
vector<pair<T, U> > out;
out.reserve(map.size());
for (auto pair : map) {
out.push_back(pair);
}
return out;
}
template <typename T, typename U>
void print_freqs_vector(const vector<pair<T, U> > &vector, size_t top_n) {
for (size_t i = 0; i < min(vector.size(), top_n); ++i) {
auto pair = vector[i];
cout << pair.second << " " << pair.first << endl;
}
}
static bool sort_by_freq(const pair<string_view, size_t> &a, const pair<string_view, size_t> &b) {
return ((a.second == b.second) && (a.first < b.first)) || (a.second > b.second);
}
static void usage_error(const char *ex) {
cerr << "Usage : " << ex << " [-n NNN] <in_file> <out_file>" << endl;
}
int main(int argc, char **argv) try {
// argument checks
if (argc != 3 && argc != 5) {
usage_error(argv[0]);
return 1;
}
size_t top_n = SIZE_MAX;
int arg_in_file = 1;
if (string(argv[1]) == "-n") {
istringstream iss(argv[2]);
if (!(iss >> top_n)) {
usage_error(argv[0]); return 1;
}
arg_in_file = 3;
}
ios_base::sync_with_stdio(false);
size_t len = 0;
const char *data = (char *)gr_mmap_file(argv[arg_in_file], &len);
if (!data) {
cerr << "mmap failed" << endl;
return 1;
}
const char *data_end = data + len;
ofstream outf;
outf.open(argv[arg_in_file + 1]), cout.rdbuf(outf.rdbuf());
unordered_map<string_view, size_t> domains;
unordered_map<string_view, size_t> paths;
#ifdef RESERVE
domains.reserve(4756); // know your data, muahaha xD
paths.reserve(240352);
#endif
size_t url_count = 0;
char c;
size_t i = 0;
size_t dl, ds = 0; // domain left, size
size_t pl, ps = 0; // path left, size
size_t size; // s size
// iterate over URLs and fill domain and paths maps
for(; data < data_end;) {
const char *eol = strchr(data, '\n');
if (!eol) {
eol = data_end;
}
size = eol - data;
string_view s = string_view(data, size);
data = eol + 1;
// find http
i = 0;
const char *p;
while (p = strstr(s.data() + i, "http"), p != NULL) {
i = p - s.data();
// *http
// skip 's' in https
if ((i += 4) >= size) {
break; // end
}
if (s[i] == 's') {
++i;
}
// https*://
// skip '://'
if (((i += 2) >= size) ||
(s[i - 2] != ':' ||
s[i - 1] != '/' ||
s[i] != '/')) {
continue;
}
// https:/*/
// domain
dl = i + 1;
ds = 0;
while (++i < size) {
c = s[i];
if ((c >='a' && c <= 'z') ||
(c >='A' && c <='Z') ||
(c >= '0' && c <= '9') ||
c == '.' || c == '-') {
++ds;
} else {
break;
}
}
if (ds == 0) {
break;
}
// https://domain*/
ps = 0;
if (s[i] == '/') {
pl = i;
ps = 1;
while (++i < size) {
c = s[i];
if ((c >='a' && c <= 'z') ||
(c >='A' && c <='Z') ||
(c >= '0' && c <= '9') ||
c == '.' || c == '_' || c == ',' || c == '/' || c == '+') {
++ps;
} else {
break;
}
}
}
auto domain = ds ? s.substr(dl, ds) : "";
auto path = ps ? s.substr(pl, ps) : "/";
domains[domain] += 1;
paths[path] += 1;
++url_count;
}
}
// sort domains & paths
vector<pair<string_view, size_t> > domain_freqs = vector_from_unordered_map(domains);
sort(domain_freqs.begin(), domain_freqs.end(), sort_by_freq);
vector<pair<string_view, size_t> > path_freqs = vector_from_unordered_map(paths);
sort(path_freqs.begin(), path_freqs.end(), sort_by_freq);
// write to file
cout << "total urls " << url_count << ", domains " << domains.size() << ", paths " << paths.size() << endl;
cout << endl;
cout << "top domains" << endl;
print_freqs_vector(domain_freqs, top_n);
cout << endl;
cout << "top paths" << endl;
print_freqs_vector(path_freqs, top_n);
return 0;
} catch (exception& e) {
cerr << e.what();
}
#if defined(unix) || defined(__unix__) || defined(__unix) || defined(__MACH__)
#include <unistd.h>
#if defined(_POSIX_MAPPED_FILES)
#define POSIX_MMAP
#endif
#endif
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#elif defined(POSIX_MMAP)
#include <fcntl.h>
#include <sys/types.h>
#include <sys/mman.h>
#else
#error "Your system doesn't have mmap support"
#endif
void *gr_mmap_file(const char *filepath, size_t *out_size) {
void *data = NULL;
size_t size = 0;
#if defined(_WIN32)
HANDLE map;
HANDLE file = CreateFileA(filepath, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
if (file == INVALID_HANDLE_VALUE) {
return NULL;
}
size = GetFileSize(file, NULL);
if (size != INVALID_FILE_SIZE && size != 0) {
map = CreateFileMappingA(file, NULL, PAGE_READONLY, 0, size, NULL);
if (map) {
data = (char *)MapViewOfFile(map, FILE_MAP_READ, 0, 0, size);
CloseHandle(map);
}
}
CloseHandle(file);
#elif defined(POSIX_MMAP)
int fd = open(filepath, O_RDONLY);
if (fd == -1) {
return NULL;
}
size = lseek(fd, 0, SEEK_END);
if (size > 0) {
data = (char *)mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
if (data == MAP_FAILED) {
data = NULL;
}
}
close(fd);
#endif
if (out_size) {
*out_size = size;
}
return data;
}
bool gr_munmap_file(void *data, size_t size) {
#if defined(_WIN32)
// suppress warning
(void)size;
return UnmapViewOfFile(data);
#elif defined(POSIX_MMAP)
return munmap(data, size) != -1;
#endif
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment