Skip to content

Instantly share code, notes, and snippets.

@temoto
Created August 7, 2009 10:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save temoto/163825 to your computer and use it in GitHub Desktop.
Save temoto/163825 to your computer and use it in GitHub Desktop.
crawler/crawler
*.o
*.pyc
*.pyo
[submodule "lib/evcom"]
path = lib/evcom
url = git://github.com/ry/evcom.git

What

HTTPS certificate checker.

# Load a set of domains into database # Set up workers # Worker gets a bunch of domains from DB, locks them and runs crawler # Crawler checks if each domain is running HTTPS, and if it does, retrieves its certificate info # Worker stores crawl data back into DB

You can poll certificates checking status with following SQL queries:

* `select count(*) from domain where status is not null;` number of processed domains
* `select count(*) from domain where locked is not null;` number of currently processing domains
* `select count(*) from domain where (now() - locked > interval '6 hours');` number of problematic domains. This happens if a worker dies and never reports status.

How

Worker is implemented in Python, crawler in C. Worker is configured to connect to some PostgreSQL instance. It tries to execute a simple transaction after connect:

# `begin;`
# `select * from domain where locked is null limit 1000 for update nowait;`
# (read data)
# `update domain set locked = now() where name in (name-list-from-data);`
# `commit;`

That is, bunch of not-yet-locked domains are locked and returned. Next step is spawning some number of crawlers to handle those domains. You may want to set number of crawlers to (number-of-CPU-cores * some coefficient) depending on your network latency. The faster the network - the more sense to spawn more crawlers. Good number to start with coefficient is 1.7. So, if you have 1 core - start experimenting with 2 crawlers. If you have 2 cores - start experimenting with 3 crawlers. Lucky owners of 4 cores - start experimenting with 7 crawlers. Each crawler is assigned a pack of domains to check. After doing its job, crawler returns either status:

* `domain<TAB>ok<TAB>certificate-info` - certificate info is JSON encoded
* `domain<TAB>timeout` - connection timeout. No answer on 443 and remote didn't refuse connection.
* `domain<TAB>no-https` - there is webserver, but no HTTPS support
* `domain<TAB>error<TAB>error description` - some unexpected error happened

After all domains (in chunk) are checked, worker connects to database to report results:

`update domain set status = 'ok/dead/no-https', locked = NULL, cert = 'cert-info'/NULL where name in (name-list-from-data);`

Troubles

If, for some reason, worker died and not reported locked domains back, they remain locked. Following query releases old locks (more than 6 hours):

`update domain set locked = null where locked is not null and (now() - locked > interval '6 hours');`

The query is safe to be run anytime. It won't release normal locks. So you could run it in cron, say, every 10 minutes and have very stable system.

Utilities

bin/ directory contains utilities, text-db-cut-column.sh for one. That's a simple wrapper around cut -f.

# Settings
# ========
CC := gcc
MAKE := make
PYTHON := python
CFLAGS := -I. -Wall -Wextra
LDFLAGS :=
#EVDIR = /usr
#GNUTLSDIR = /usr
DEBUG := 1
#include <assert.h>
#include <error.h>
#include <gnutls/gnutls.h>
#include <gnutls/x509.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <udns.h>
#include "evcom.h" // change to <> when evcom will be not inside cert_checker source tree
#include "crawler.h"
static /*@null@*/ struct addrinfo *
build_addrinfo (const char *str_addr, const char *str_port) {
int r = 0;
struct addrinfo *ai;
static const struct addrinfo hints = {
.ai_flags = AI_NUMERICHOST | AI_NUMERICSERV,
.ai_family = AF_UNSPEC,
.ai_socktype = 0, .ai_protocol = 0,
.ai_addrlen = 0, .ai_addr = NULL,
.ai_canonname = NULL, .ai_next = NULL,
};
assert(NULL != str_addr);
assert(NULL != str_port);
r = getaddrinfo(str_addr, str_port, &hints, &ai);
if (0 != r) {
error(0, 0, "crawl_item: error building addrinfo from %s:%s", str_addr, str_port);
return NULL;
}
return ai;
}
static int
check_socket_errors (evcom_socket *socket) {
crawl_item *self;
const char *domain;
assert(NULL != socket);
self = (crawl_item *)socket->data;
domain = (const char *)self->domain;
if (0 != socket->errorno) {
printf("%s\terror\tsocket\n", domain);
error(0, socket->errorno, "socket error");
evcom_socket_force_close(socket);
return socket->errorno;
}
if (GNUTLS_E_SUCCESS != socket->gnutls_errorno) {
printf("%s\terror\tGNU TLS error\n", domain);
if (GNUTLS_E_FATAL_ALERT_RECEIVED == socket->gnutls_errorno)
error(0, 0, "GNU TLS fatal alert: %s\n", gnutls_alert_get_name(gnutls_alert_get(socket->session)));
else
error(0, 0, "GNU TLS: %s\n", gnutls_strerror(socket->gnutls_errorno));
evcom_socket_force_close(socket);
return socket->gnutls_errorno;
}
return 0;
}
static void
on_connect (evcom_socket *socket) {
int r = 0;
crawl_item *self;
assert(NULL != socket);
self = (crawl_item *)socket->data;
r = check_socket_errors(socket);
if (0 == r)
crawl_item_process_cert(self);
evcom_socket_force_close(socket);
}
static void
on_close (evcom_socket *socket) {
crawl_item *self;
assert(NULL != socket);
self = (crawl_item *)socket->data;
(void)check_socket_errors(socket);
// important resource freeing is done here
crawl_item_free(self);
}
static void
on_timeout (evcom_socket *socket) {
const crawl_item *self;
const char *domain;
assert(NULL != socket);
self = (const crawl_item *)socket->data;
domain = (const char *)self->domain;
printf("%s\ttimeout\t\n", domain);
evcom_socket_force_close(socket);
}
int
crawl_item_init (/*@notnull@*/ crawl_item *self,
/*@notnull@*/ const char *domain,
float timeout) {
self->domain[0] = '\0';
strncat(self->domain, domain, sizeof(self->domain) - 1);
evcom_socket_init(&self->socket, timeout);
self->socket.on_connect = on_connect;
self->socket.on_close = on_close;
self->socket.on_timeout = on_timeout;
// self backpointer for callbacks
self->socket.data = self;
return 0;
}
int
crawl_item_start (/*@notnull@*/ crawl_item *self) {
int r = 0;
const char *domain;
assert(NULL != self);
domain = (const char *)self->domain;
assert(NULL != domain);
r = crawler_dns_resolve4_start(self);
if (0 != r) {
printf("%s\terror\tresolve start error\n", self->domain);
return r;
}
return 0;
}
int
crawl_item_connect (/*@notnull@*/ crawl_item *self) {
int r = 0;
size_t i = 0;
struct addrinfo *addrinfo;
assert(NULL != self);
for (i = 0; i < self->addr_count; i++) {
// some particular address may not fit into our static buffer
if ('\0' == self->str_addr[i][0])
continue;
addrinfo = build_addrinfo(self->str_addr[i], "443");
if (NULL == addrinfo)
continue;
tls_session_init(self);
r = evcom_socket_connect(&self->socket, addrinfo);
freeaddrinfo(addrinfo);
if (0 == r) {
evcom_socket_attach(EV_DEFAULT_ &self->socket);
return 0;
}
gnutls_deinit(self->socket.session);
self->socket.session = NULL;
}
// no successful connect happened. propagate fail
return r;
}
/*
* Reads certificate info, if any. Outputs crawling results.
*/
void
crawl_item_process_cert (/*@notnull@*/ crawl_item *self) {
// error-checking closure :) PC stands for process certificate
#define CRAWLER_PC_CHECK_GNUTLS_RESULT_DEINIT_CERT do { \
if (GNUTLS_E_SUCCESS != r) { \
printf("%s\terror\tin GNU TLS while processing cert", domain); \
fprintf(stderr, "crawl_item: process_cert:%d domain: %s GNU TLS error: ", \
__LINE__, domain); \
gnutls_perror(r); \
gnutls_x509_crt_deinit (cert); \
return; \
} \
} while(0)
int r;
const gnutls_datum_t *cert_list = NULL;
unsigned int cert_list_size = 0;
gnutls_x509_crt_t cert;
time_t expiration_time, activation_time;
const struct tm *expiration_time_tm = NULL, *activation_time_tm = NULL;
char expiration_time_str[64], activation_time_str[64];
gnutls_pk_algorithm_t algo = 0;
unsigned int bits = 0;
const char *algo_name = NULL;
size_t size = 0;
char dn[1024], issuer_dn[1024];
evcom_socket *client;
const char *domain;
assert(NULL != self);
client = &self->socket;
assert(NULL != client);
assert(NULL != client->session);
domain = (const char *)self->domain;
DPRINT1("crawl_item: processing certificate for domain: %s\n", domain);
if (GNUTLS_CRT_X509 != gnutls_certificate_type_get(client->session)) {
printf("%s\terror\tprocessing certificate\n", domain);
error(0, 0, "crawl_item: process_cert: domain: %s certificate type is not x.509\n", domain);
return;
}
cert_list = gnutls_certificate_get_peers(client->session, &cert_list_size);
if (NULL == cert_list) {
printf("%s\terror\tprocessing certificate\n", domain);
error(0, 0, "crawl_item: process_cert: domain: %s error getting certificates\n", domain);
return;
}
r = gnutls_x509_crt_init(&cert);
CRAWLER_PC_CHECK_GNUTLS_RESULT_DEINIT_CERT;
r = gnutls_x509_crt_import(cert, &cert_list[0], GNUTLS_X509_FMT_DER);
CRAWLER_PC_CHECK_GNUTLS_RESULT_DEINIT_CERT;
activation_time = gnutls_x509_crt_get_activation_time(cert);
activation_time_tm = gmtime(&activation_time);
size = strftime(activation_time_str, sizeof(activation_time_str) - 1, "%FT%T", activation_time_tm);
if (0 == size) {
printf("%s\terror\tprocessing certificate\n", domain);
error(0, 0, "crawl_item: process_cert: domain: %s error in strftime for activation time: %ld\n", domain, activation_time);
gnutls_x509_crt_deinit(cert);
return;
}
expiration_time = gnutls_x509_crt_get_expiration_time(cert);
expiration_time_tm = gmtime(&expiration_time);
size = strftime(expiration_time_str, sizeof(expiration_time_str) - 1, "%FT%T", expiration_time_tm);
if (0 == size) {
printf("%s\terror\tin strftime for cert expiration time: %ld\n", domain, expiration_time);
error(0, 0, "crawl_item: process_cert: domain: %s error in strftime for expiration time: %ld\n", domain, activation_time);
gnutls_x509_crt_deinit(cert);
return;
}
algo = gnutls_x509_crt_get_pk_algorithm (cert, &bits);
size = sizeof(dn);
memset(dn, 0, size);
r = gnutls_x509_crt_get_dn(cert, dn, &size);
CRAWLER_PC_CHECK_GNUTLS_RESULT_DEINIT_CERT;
size = sizeof(issuer_dn);
memset(issuer_dn, 0, size);
r = gnutls_x509_crt_get_issuer_dn(cert, issuer_dn, &size);
CRAWLER_PC_CHECK_GNUTLS_RESULT_DEINIT_CERT;
algo_name = gnutls_pk_algorithm_get_name(algo);
printf("%s\tok\tDN\t%s\tIDN\t%s\tSINCE\t%s\tEXPIRES\t%s\tPK-ALGO\t%s\n",
domain, dn, issuer_dn, activation_time_str, expiration_time_str, algo_name);
// important cleanup
gnutls_x509_crt_deinit(cert);
}
void
crawl_item_free (/*@notnull@*/ crawl_item *self) {
evcom_socket_detach(&self->socket);
if (NULL != self->socket.session) {
gnutls_deinit(self->socket.session);
self->socket.session = NULL;
}
}
#include <assert.h>
#include <error.h>
#include <ev.h>
#include <stdio.h>
#include <string.h>
#include "evcom.h"
#include "crawler.h"
int main (void) {
int r = 0;
char domain[1000], *line = NULL;
crawl_item items[CRAWLER_MAX_DOMAINS];
size_t num = 0, i = 0;
memset(domain, 0, (size_t)1000);
memset(items, 0, sizeof(crawl_item) * CRAWLER_MAX_DOMAINS);
tls_global_init(CRAWLER_GNUTLS_PRIORITY_STRING);
// initialize the default ev loop.
if (!ev_default_loop(EVFLAG_AUTO))
error(1, 0, "libev default loop init failed.");
// initialize DNS subsystem. ev loop must be initialized before dns_init
if (0 != crawler_dns_init())
error(1, 0, "DNS subsystem init failed.");
do {
domain[0] = '\0';
line = fgets(domain, 1000, stdin);
//strcpy(domain, "no-such-domain--------.ru\n");
//line = domain;
if (NULL == line)
break;
if (NULL == strchr(line, '\n')) {
error(0, 0, "main: expected list of \\n separated domain names on stdin.\n");
break;
}
// strip newline
line[strcspn(line, "\n")] = '\0';
// now \n is stripped, we refer to domain instead of line
if (0 == strlen(domain))
// domain list ended
break;
r = crawl_item_init(&items[num], domain, CRAWLER_TIMEOUT);
if (0 != r)
error(1, 0, "main: error initializing new crawl_item for domain: %s\n", domain);
r = crawl_item_start(&items[num]);
if (0 != r)
error(1, 0, "main: error starting crawl for domain: %s\n", domain);
num++;
if (num >= CRAWLER_MAX_DOMAINS) {
error(0, 0, "main: maximum number of domains: %d reached\n", num);
break;
}
} while (0 != strlen(line));
ev_loop(EV_DEFAULT_ 0);
for (i = 0; i < num; i++) {
// free as in Freedom
crawl_item_free(&items[i]);
}
if (NULL != ev_default_loop_uc())
ev_default_destroy();
tls_global_deinit();
return 0;
}
#ifndef CRAWLER_H
#define CRAWLER_H
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <stdbool.h>
#include "evcom.h"
#ifndef NDEBUG
#define DPRINT(s) printf("debug: " s)
#define DPRINT1(s, arg1) printf("debug: " s, arg1)
#define DPRINT2(s, arg1, arg2) printf("debug: " s, arg1, arg2)
#else
#define DPRINT(s)
#define DPRINT1(s, arg1)
#define DPRINT2(s, arg1, arg2)
#endif
// settings
#define CRAWLER_MAX_ADDR_COUNT 16 // 16 fails is pretty much definite domain failure.
#define CRAWLER_MAX_STRADDR_LEN 46 // largest string representation of address we can hold
// 46 was calculated as 1 + length of
// ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255
#define CRAWLER_TIMEOUT 7.0
#define CRAWLER_DNS_TIMEOUT 7
#define CRAWLER_MAX_DOMAINS 2000
#define CRAWLER_GNUTLS_PRIORITY_STRING "PERFORMANCE:NORMAL:EXPORT:-CTYPE-OPENPGP:%SSL3_RECORD_VERSION"
typedef struct crawl_item {
// 254 is max domain name length
char domain[255];
// resolved addresses array.
// crawler dns resolving will read max of MAX_ADDR_COUNT IP addresses per DNS A request.
// Then, we try each address.
char str_addr[CRAWLER_MAX_STRADDR_LEN][CRAWLER_MAX_ADDR_COUNT];
// real number of addresses resolved
size_t addr_count;
evcom_socket socket;
} crawl_item;
/*
* Initializes crawl_item.
* Copies zero-terminated domain into internal structure.
* Initializes socket.
*/
int
crawl_item_init (/*@notnull@*/ crawl_item *self,
/*@notnull@*/ const char *domain,
float timeout
);
/*
* Initiates crawling.
* Resolves domain, if necessary.
*/
int
crawl_item_start (/*@notnull@*/ crawl_item *self);
/*
* Connects, attaches socket to libev loop.
* crawl_item.addr must contain valid IP addresses before calling start.
* You may use crawl_item_resolve to fill it.
*/
int
crawl_item_connect (/*@notnull@*/ crawl_item *self);
/*
* Reads certificate info, if any. Outputs crawling results.
*/
void
crawl_item_process_cert (/*@notnull@*/ crawl_item *self);
void
crawl_item_free (/*@notnull@*/ crawl_item *self)
/*@releases self->socket.session@*/
/*@ensures isnull self->socket.session@*/
;
/*
* Global initializer of DNS resolving subsystem.
* Must be called once after initializing libev loop.
*/
int
crawler_dns_init (void);
int
crawler_dns_resolve4_start (/*@notnull@*/ const crawl_item *self);
void
tls_session_init (crawl_item *item);
void
tls_global_init (const char *priority_string);
void
tls_global_deinit (void);
#endif // CRAWLER_H
#include <assert.h>
#include <error.h>
#include <ev.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <udns.h>
#include "crawler.h"
#define MIN(a, b) (a < b ? a : b)
static ev_io dns_io_watcher;
static ev_timer dns_timer_watcher;
static void
set_timeout (void) {
int maxwait = CRAWLER_DNS_TIMEOUT;
int wait = dns_timeouts(NULL, maxwait, ev_now(EV_DEFAULT_UC));
ev_timer_stop(EV_DEFAULT_UC_ &dns_timer_watcher);
if (!dns_active(NULL)) {
ev_timer_stop(EV_DEFAULT_UC_ &dns_timer_watcher);
return;
}
if (wait >= 0) {
ev_timer_set(&dns_timer_watcher, (double)wait, 0.0);
ev_timer_start(EV_DEFAULT_UC_ &dns_timer_watcher);
}
}
static void
maybe_start () {
ev_io_start(EV_DEFAULT_UC_ &dns_io_watcher);
set_timeout();
}
static void
ioevent (EV_P_ ev_io *_watcher, int revents) {
assert(revents == EV_READ);
assert(_watcher == &dns_io_watcher);
dns_ioevent(NULL, ev_now(EV_DEFAULT_UC));
if (!dns_active(NULL))
ev_io_stop(EV_DEFAULT_UC_ &dns_io_watcher);
set_timeout();
}
static void
timeout (EV_P_ ev_timer *_watcher, int revents) {
assert(revents == EV_TIMEOUT);
assert(_watcher == &dns_timer_watcher);
set_timeout();
}
int
crawler_dns_init (void) {
int r, fd;
r = dns_init(NULL, 0);
if (r < 0) {
error(0, 0, "dns: error initializing UDNS context\n");
return r;
}
fd = dns_open(NULL);
ev_io_init(&dns_io_watcher, ioevent, fd, EV_READ);
ev_init(&dns_timer_watcher, timeout);
return 0;
}
static void
resolve_error (/*@notnull@*/ crawl_item *self) {
const char *msg;
int status = dns_status(NULL);
assert(status < 0);
msg = dns_strerror(status);
printf("%s\terror\tresolve: %s\n", self->domain, msg);
crawl_item_free(self);
}
static void
on_dns_resolve4 (struct dns_ctx *ctx, struct dns_rr_a4 *result, void *data) {
size_t i;
crawl_item *self = data;
char buf[CRAWLER_MAX_STRADDR_LEN];
const char *ntop_r;
assert(ctx == &dns_defctx);
if (result == NULL) {
resolve_error(self);
return;
}
// don't work with more than CRAWLER_MAX_ADDR_COUNT addresses
self->addr_count = MIN(result->dnsa4_nrr, CRAWLER_MAX_ADDR_COUNT);
// store string representation of all resolved addresses
for (i = 0; i < self->addr_count; i++) {
memset(buf, 0, CRAWLER_MAX_STRADDR_LEN);
ntop_r = dns_ntop(AF_INET, &(result->dnsa4_addr[i]),
buf, CRAWLER_MAX_STRADDR_LEN - 1);
if (NULL != ntop_r)
memcpy(self->str_addr[i], buf, CRAWLER_MAX_STRADDR_LEN);
}
free(result); // man 3 udns: it's the application who is responsible for freeing result memory
(void)crawl_item_connect(self);
}
int
crawler_dns_resolve4_start (/*@notnull@*/ const crawl_item *self) {
struct dns_query *query;
assert(self);
query = dns_submit_a4(NULL, self->domain, 0, on_dns_resolve4, (void*)self);
assert(NULL != query); // TODO better error handling.
maybe_start();
return 0;
}
#include <assert.h>
#include <error.h>
#include <stdio.h>
#include <gnutls/gnutls.h>
#include <gnutls/x509.h>
#include <unistd.h>
#include "crawler.h"
static const int kx_priority[] = {
GNUTLS_KX_RSA, GNUTLS_KX_DHE_DSS, GNUTLS_KX_DHE_RSA, GNUTLS_KX_RSA_EXPORT, 0
};
static gnutls_priority_t priorities_cache;
static gnutls_certificate_credentials_t xcred;
void
tls_global_init (const char *priority_string) {
int r = 0;
const char *err_pos = NULL;
r = gnutls_global_init();
if (GNUTLS_E_SUCCESS != r)
error(1, 0, "init_tls: global GNU TLS init error: %s\n", gnutls_strerror(r));
(void)fsync((int)stderr);
/* the most liberal priority */
r = gnutls_priority_init(&priorities_cache, priority_string, &err_pos);
if (GNUTLS_E_INVALID_REQUEST == r) {
error(0, 0, "%s", gnutls_strerror(r));
if (NULL != err_pos)
error(1, 0, "Syntax error in priority string:\n%s\n%*s error here\n",
priority_string, err_pos - priority_string + 1, "^");
else
error(1, 0, "bug in GNU TLS library: gnutls_priority_init returned GNUTLS_E_INVALID_REQUEST without setting error position\n");
}
r = gnutls_certificate_allocate_credentials(&xcred);
if (GNUTLS_E_SUCCESS != r)
error(1, 0, "anon_tls_client: error allocating certificate credentials: %s\n", gnutls_strerror(r));
}
void
tls_global_deinit (void) {
gnutls_certificate_free_credentials(xcred);
gnutls_priority_deinit(priorities_cache);
gnutls_global_deinit();
}
void
tls_session_init (crawl_item *self) /*@uses self@*/ {
int r = 0;
evcom_socket *socket;
assert(NULL != self);
socket = &self->socket;
assert(NULL != socket);
assert(NULL == socket->session);
r = gnutls_init(&socket->session, GNUTLS_CLIENT);
if (GNUTLS_E_SUCCESS != r)
error(1, 0, "anon_tls_client: GNU TLS session init: %s\n", gnutls_strerror(r));
r = gnutls_priority_set(socket->session, priorities_cache);
if (GNUTLS_E_SUCCESS != r)
error(1, 0, "anon_tls_client: error setting priority: %s\n", gnutls_strerror(r));
r = gnutls_kx_set_priority(socket->session, kx_priority);
if (GNUTLS_E_SUCCESS != r)
error(1, 0, "anon_tls_client: error setting kx priority: %s\n", gnutls_strerror(r));
/* Need to enable anonymous KX specifically. */
r = gnutls_credentials_set(socket->session, GNUTLS_CRD_CERTIFICATE, xcred);
if (GNUTLS_E_SUCCESS != r)
error(1, 0, "anon_tls_client: error setting credentials: %s\n", gnutls_strerror(r));
evcom_socket_set_secure_session(socket, socket->session);
assert(EVCOM_SECURE == (socket->flags & EVCOM_SECURE));
}
include config.mk
ifdef DEBUG
CFLAGS += -g
else
CFLAGS += -DNDEBUG=1 -O3
endif
# ======================
# crawler specific begin
# ======================
CRAWLER_OBJ := crawler/crawler.o \
crawler/crawl_item.o \
crawler/dns.o \
crawler/gnutls_helpers.o
# dependencies
LDFLAGS += -lev
ifdef EVDIR
CFLAGS += -I$(EVDIR)/include
LDFLAGS += -L$(EVDIR)/lib
endif
# evcom dependency is embedded in source tree
CFLAGS += -Ilib/evcom
CRAWLER_OBJ += lib/evcom/evcom.o
# evcom must be built with GNU TLS support
CFLAGS += -DEVCOM_HAVE_GNUTLS=1
LDFLAGS += -lgnutls
ifdef GNUTLSDIR
CFLAGS += -I$(GNUTLSDIR)/include
LDFLAGS += -L$(GNUTLSDIR)/lib
endif
LDFLAGS += -ludns
ifdef UDNSDIR
CFLAGS += -I$(UDNSDIR)/include
LDFLAGS += -L$(UDNSDIR)/lib
endif
# ====================
# crawler specific end
# ====================
.PHONY: all
all: crawler worker test
# === crawler rules begin ===
.PHONY: crawler
crawler: crawler/crawler
crawler/crawler.o crawler/crawl_item.o crawler/dns.o crawler/gnutls_helpers.o: crawler/crawler.h
lib/evcom/evcom.o:
$(MAKE) -C lib/evcom evcom.o
crawler/crawler: $(CRAWLER_OBJ)
$(CC) -o $@ $(CRAWLER_OBJ) $(CFLAGS) $(LDFLAGS)
.PHONY: crawler_clean
crawler_clean:
-rm -f crawler/crawler $(CRAWLER_OBJ)
$(MAKE) -C lib/evcom clean
.PHONY: crawler_test
crawler_test:
#$(PYTHON) crawler/test.py crawler/crawler
echo localhost | crawler/crawler
# === crawler rules end ===
# === worker rules begin ===
.PHONY: worker
worker:
echo "worker build stub"
#$(PYTHON)
.PHONY: worker_clean
worker_clean:
-rm -f worker/*.pyc worker/*.pyo
.PHONY: worker_test
worker_test:
-echo "Starting worker tests"
( cd worker && nosetests )
# === worker rules end ===
.PHONY: clean
clean: crawler_clean worker_clean
.PHONY: test
test: crawler_test #worker_test
import os
from subprocess import Popen, PIPE
import sys
# helper functions
def _run(exe, input_):
"""Run executable and feed `input_` to stdin.
Returns tuple(returncode, stdout, stderr)"""
p = Popen([exe], stdin=PIPE, stdout=PIPE, stderr=PIPE)
out, err = p.communicate(input_)
# returncode == None - process not terminated yet
assert p.returncode is not None
return p.returncode, out, err
def run(input_):
exe = sys.argv[1]
rc, out, err = _run(exe, input_)
assert rc == 0
assert err == ""
return out
# -----------
# tests begin
# -----------
def test_empty():
assert run("\n") == "\n"
def test_nonexistant():
d = "non-existant-domain.domain.tld"
assert run("%s\n" % d) == "%s\tdead\n" % d
# ---------
# tests end
# ---------
if __name__ == '__main__':
if len(sys.argv) < 2:
print "test.py: Error: Supply path of tested crawler executable"
exit(1)
exe = sys.argv[1]
expanded_path = os.path.expandvars(os.path.expanduser(exe))
if not os.path.exists(expanded_path):
print "test.py: Error: Crawler executable %s not found" % (expanded_path,)
exit(2)
G = globals()
for name in dir():
if name.startswith("test_"):
t = G[name]
if callable(t):
t()
else:
print "Warning: found test-like object that is not callable: %s" % name
#!/bin/bash
usage() {
echo "Text-database columns filtering tool. Deletes all columns but specified in second argument."
echo ""
echo "usage: $(basename $0) FILENAME COLUMN_INDEXES"
echo ""
echo "Modifies file inplace. (Via creating temporary file in /tmp and renaming)"
echo "Columns are TAB separated."
echo ""
echo "Example:"
echo " $(basename $0) domains-2009-03-05.txt 1,3,4 # to select 1,3 and 4 columns"
}
db_filename=$1
columns=$2
tmp_filename="/tmp/text-db-$db_filename"
if [[ -z "$db_filename" || -z "$columns" ]]; then
usage
exit 1
fi
if [[ "$db_filename" = "-" ]]; then
cut -f $columns
else
cat "$db_filename" | cut -f $columns > "$tmp_filename"
mv "$tmp_filename" "$db_filename"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment