tuxology/lttng-sched-filter.c

## lttng-sched-filter.c
/*
 * addons/lttng-sched-filter.c
 *
 * A filtered version of sched_switch
 *
 * Copyright (C) 2014 Suchakra Sharma <suchakrapani.sharma@polymtl.ca>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; only
 * version 2.1 of the License.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/uaccess.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <trace/bpf_trace.h>
#include <asm/syscall.h>
#include <linux/interrupt.h>
#include <linux/time.h>
#include <uapi/linux/time.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/string.h>
#include <linux/vmalloc.h>
//#include <linux/kallsyms.h>

#include <linux/sched.h>
#include <linux/binfmts.h>
#include <linux/version.h>
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0))
#include <linux/sched/rt.h>
#endif

#include "../wrapper/tracepoint.h"
#include "../instrumentation/events/lttng-module/addons.h"

/*
 * Compile-time selection of the filtering strategy. The sched_switch probe
 * checks NOFILT first, then SIMPLE, then BPF; module init checks SIMPLE
 * before BPF. Set exactly one of them to 1.
 */
#define BPF 1
#define SIMPLE 0
#define NOFILT 0

/* Tracepoint that the probe fires for events accepted by the filter. */
DEFINE_TRACE(sched_switch_filter);

/* Procfs stuff */
/* Size in bytes of the buffer allocated for accum_time at module init. */
#define MAX_LEN	16000000
/* Handle returned by proc_create() for the "eBPFsched" entry. */
static struct proc_dir_entry *proc_entry;
/* NUL-terminated text log; the probe appends one "<count>\t<nsec>\n" line per event. */
static char *accum_time;
/* NOTE(review): not referenced anywhere in the visible code. */
u64 len = 0;

/*
 * seq_file show callback for the "eBPFsched" procfs entry: emits the whole
 * accumulated timing log.
 *
 * @m: seq_file being filled.
 * @v: iterator cookie handed in by the seq_file core (unused).
 *
 * The log is data, not a format string, so it is written with seq_puts()
 * instead of being passed as the format argument of seq_printf().
 *
 * Returns 0.
 */
static int ebpf_proc_show(struct seq_file *m, void *v) {
    /* Nothing to show if the log buffer was never allocated. */
    if (accum_time)
        seq_puts(m, accum_time);
    return 0;
}

/*
 * open() handler for the "eBPFsched" procfs entry.
 *
 * Binds the file to ebpf_proc_show() through single_open(), with no private
 * data, and hands back whatever single_open() returns.
 */
static int ebpf_proc_open(struct inode *inode, struct  file *file) {
    int status = single_open(file, ebpf_proc_show, NULL);

    return status;
}

/* File operations installed on the "eBPFsched" procfs entry. */
static const struct file_operations ebpf_proc_fops = {
    .owner   = THIS_MODULE,
    /* opened via single_open() in ebpf_proc_open(), closed by its counterpart */
    .open    = ebpf_proc_open,
    .release = single_release,
    /* reading and seeking are delegated to the generic seq_file helpers */
    .read    = seq_read,
    .llseek  = seq_lseek,
};

/* Timing stuff */
/* Incremented once per probe invocation; its value prefixes each log line. */
atomic_t count = ATOMIC_INIT(0);

/* Global definitions */
/* Program allocated by init_ebpf_prog(), run by the probe, freed at module exit. */
struct bpf_prog *prog;

/*
 * The actual eBPF prog instructions.
 *
 * On entry r1 holds the context pointer given to BPF_PROG_RUN(). The probe
 * passes a struct bpf_context whose arg1/arg2/arg3 hold the address of its
 * pcomm buffer, the address of its comm buffer and prev->state. The loads
 * below assume those fields live at byte offsets 0, 8 and 16 (TODO confirm
 * against the struct bpf_context definition, which is not in this file).
 *
 * Result in r0: 1 when the first 8 bytes of both name buffers are equal and
 * the state word is 0; otherwise 0.
 *
 * BPF_LD_IMM64 occupies two instruction slots, which is why a jump offset of
 * 3 skips exactly one BPF_LD_IMM64 plus the BPF_EXIT_INSN that follows it.
 */
static struct bpf_insn insn_prog[] = {
    BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 0), /* r2 = ctx arg1: address of the copied prev->comm */
    BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_2, 0), /* r3 = first 8 bytes of the copied prev->comm */
    BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_1, 8), /* r4 = ctx arg2: address of the reference comm */
    BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_4, 0), /* r5 = first 8 bytes of the reference comm, which is "sshd" */
    BPF_JMP_REG(BPF_JEQ, BPF_REG_5, BPF_REG_3, 3), /* names match: jump over the FALSE exit */
    BPF_LD_IMM64(BPF_REG_0, 0), /* FALSE */
    BPF_EXIT_INSN(),
    BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, 16), /* r3 = ctx arg3: prev->state (a value, not a pointer) */
    BPF_LD_IMM64(BPF_REG_4, 0), /* r4 = 0 */
    BPF_JMP_REG(BPF_JEQ, BPF_REG_3, BPF_REG_4, 3), /* state == 0: jump to the TRUE exit */
    BPF_LD_IMM64(BPF_REG_0, 0), /* FALSE */
    BPF_EXIT_INSN(),
    BPF_LD_IMM64(BPF_REG_0, 1), /* TRUE */
    BPF_EXIT_INSN(),
};


/*
 * Turn a 64-bit integer that carries an address back into a pointer.
 * The value is narrowed to unsigned long before the pointer conversion.
 */
static void *u64_to_ptr(__u64 val)
{
    unsigned long addr = (unsigned long) val;

    return (void *) addr;
}

/*
 * Carry a pointer in a 64-bit integer.
 * The pointer is converted through unsigned long and then widened to __u64.
 */
static __u64 ptr_to_u64(void *ptr)
{
    unsigned long addr = (unsigned long) ptr;

    return (__u64) addr;
}

/*
 * Work handler that releases a bpf_map.
 *
 * @work: the work_struct embedded in the map being released.
 */
void bpf_map_free_deferred(struct work_struct *work)
{
    struct bpf_map *owner;

    /* recover the map that embeds this work item */
    owner = container_of(work, struct bpf_map, work);

    /* the actual freeing is delegated to the map's own map_free op */
    owner->ops->map_free(owner);
}

/*
 * Drop one reference on @map. When the reference count reaches zero, the
 * map is released from a work item running bpf_map_free_deferred().
 */
void bpf_map_put(struct bpf_map *map)
{
    /* other holders remain: nothing more to do */
    if (!atomic_dec_and_test(&map->refcnt))
        return;

    INIT_WORK(&map->work, bpf_map_free_deferred);
    schedule_work(&map->work);
}

/*
 * Drop the reference held on every map listed in @aux->used_maps, then free
 * the used_maps array itself.
 */
static void free_used_maps(struct bpf_prog_aux *aux)
{
    int idx = 0;

    while (idx < aux->used_map_cnt) {
        bpf_map_put(aux->used_maps[idx]);
        idx++;
    }

    kfree(aux->used_maps);
}

/*
 * Run @prog1 on @ctx inside an RCU read-side critical section.
 *
 * Returns the program's 64-bit result converted to unsigned int.
 */
unsigned int run_bpf_filter(struct bpf_prog *prog1, struct bpf_context *ctx){
    u64 verdict;

    rcu_read_lock();
    verdict = BPF_PROG_RUN(prog1, (void *) ctx);
    rcu_read_unlock();

    return (unsigned int) verdict;
}

/*
 * Initialize and prepare the eBPF prog.
 *
 * Allocates the global `prog`, copies insn_prog[] into it, marks it as GPL
 * compatible with a reference count of 1, and lets the kernel select the
 * runtime (interpreter or JIT) for it. The verifier is NOT run (see TODO).
 *
 * Returns 0 on success, or -ENOMEM (carried in the unsigned return type) when
 * the program cannot be allocated.
 */
unsigned int init_ebpf_prog(void)
{
    /* static so the 1 KiB log buffer does not live on the kernel stack */
    static char bpf_log_buf[1024];

    union bpf_attr attr = {
        .prog_type = BPF_PROG_TYPE_UNSPEC,
        .insns = ptr_to_u64((void*) insn_prog),
        .insn_cnt = ARRAY_SIZE(insn_prog),
        .license = ptr_to_u64((void *) "GPL"),
        .log_buf = ptr_to_u64(bpf_log_buf),
        .log_size = sizeof(bpf_log_buf),
        .log_level = 1,
    };

    prog = bpf_prog_alloc(bpf_prog_size(attr.insn_cnt), GFP_USER);
    if (!prog)
        return -ENOMEM;
    prog->jited = false;
    prog->orig_prog = NULL;
    prog->len = attr.insn_cnt;

    /*
     * The instructions come from kernel memory, so a plain memcpy() suffices.
     * It returns its destination, so there is no failure to test for.
     */
    memcpy(prog->insnsi, u64_to_ptr(attr.insns), prog->len * sizeof(struct bpf_insn));
    atomic_set(&prog->aux->refcnt, 1);
    prog->aux->is_gpl_compatible = true;

    /*
     * TODO eBPF verifier. Sketch of the intended approach:
     *   char *sym_name = "bpf_check";
     *   unsigned long sym_addr = kallsyms_lookup_name(sym_name);
     *   int (*bpf_check)(struct bpf_prog*, union bpf_attr*) =
     *       (int (*)(struct bpf_prog*, union bpf_attr*)) sym_addr;
     *   ret = bpf_check(prog, &attr);
     */

    /* ready for JIT */
    bpf_prog_select_runtime(prog);
    printk("prog jited? : %d\n", prog->jited);

    return 0;
}

unsigned int filter_dev_probe_handler(void* __data, struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
    struct timespec begin, end, diff;
    char comm[8] = {};
    strcpy(comm, "sshd");
    char pcomm[8] = {};
    strcpy(pcomm, prev->comm);

    struct bpf_context bctx = {};
    bctx.arg1 = (u64) pcomm;
    bctx.arg2 = (u64) comm;
    bctx.arg3 = (u64) prev->state;

    /* tick */
    getrawmonotonic(&begin);

#if (NOFILT)
    trace_sched_switch_filter(prev, next);

#elif (SIMPLE)
    if ((memcmp(prev->comm, comm, 4) == 0) && (prev->state == 0))
    {
        trace_sched_switch_filter(prev, next);
    }
#elif (BPF)
    unsigned int ret = 0;
    ret = run_bpf_filter(prog, &bctx);
    if (ret == 1){
        trace_sched_switch_filter(prev, next);
    }
#endif

    /* tock */
    getrawmonotonic(&end);
    diff = timespec_sub(end, begin);
    atomic_inc(&count);
    sprintf(accum_time + strlen(accum_time), "%d\t%lu\n", atomic_read(&count), diff.tv_nsec);

    return 0;
}


/*
 * Module entry point.
 *
 * Builds the eBPF program (BPF mode only), allocates the zeroed timing log,
 * creates the "eBPFsched" procfs entry and registers the sched_switch probe.
 * Each step is undone in reverse order if a later step fails, so the probe
 * is never left registered against freed memory.
 *
 * Returns 0 on success or a negative errno.
 */
static int __init sched_switch_filter_init(void)
{
    int ret = 0;

#if (SIMPLE)
    printk("SIMPLE RUN\n");

#elif (BPF)
    printk("BPF RUN\n");

    /* Prepare eBPF prog */
    ret = init_ebpf_prog();
    if (ret)
        return ret;
#endif

    /* Init procfs entry; the log must start out as an empty string */
    accum_time = vzalloc(MAX_LEN);
    if (!accum_time) {
        ret = -ENOMEM;
        goto err_free_prog;
    }

    proc_entry = proc_create("eBPFsched", 0, NULL, &ebpf_proc_fops);
    if (proc_entry == NULL) {
        printk(KERN_INFO "eBPFsched could not be created\n");
        ret = -ENOMEM;
        goto err_free_log;
    }
    printk(KERN_INFO "eBPFsched created.\n");

    (void) wrapper_lttng_fixup_sig(THIS_MODULE);

    ret = lttng_wrapper_tracepoint_probe_register("sched_switch",
            filter_dev_probe_handler, NULL);
    if (ret)
        goto err_remove_proc;

    printk("sched_switch_filter loaded\n");
    return 0;

err_remove_proc:
    remove_proc_entry("eBPFsched", NULL);
err_free_log:
    vfree(accum_time);
    accum_time = NULL;
err_free_prog:
#if (!SIMPLE) && (BPF)
    /* mirrors the condition under which init_ebpf_prog() ran above */
    free_used_maps(prog->aux);
    bpf_prog_free(prog);
    prog = NULL;
#endif
    return ret;
}

/*
 * Module exit point.
 *
 * The probe is unregistered first, and in-flight probe calls are waited for,
 * before the resources the probe uses (the timing log and the eBPF program)
 * are released. Only then are the procfs entry, the log and the program torn
 * down.
 */
static void __exit sched_switch_filter_exit(void)
{
    int ret;

    ret = lttng_wrapper_tracepoint_probe_unregister("sched_switch",
            filter_dev_probe_handler, NULL);
    if (ret)
        printk(KERN_WARNING "sched_switch_filter: probe unregister failed (%d)\n", ret);

    /* make sure no CPU is still executing the probe */
    tracepoint_synchronize_unregister();

    /* Remove procfs entry */
    remove_proc_entry("eBPFsched", NULL);
    printk(KERN_INFO "eBPFsched removed\n");
    vfree(accum_time);

#if (BPF)
    /* prog is never allocated when init took the SIMPLE branch */
    if (prog) {
        free_used_maps(prog->aux);
        printk("Freed maps\n");
        bpf_prog_free(prog);
        printk("Freed bpf prog\n");
    }
#endif

    printk("sched_switch_filter unloaded\n");
    return;
}

module_init(sched_switch_filter_init);
module_exit(sched_switch_filter_exit);

MODULE_LICENSE("GPL and additional rights");
MODULE_AUTHOR("Suchakra Sharma <suchakrapani.sharma@polymtl.ca>");
MODULE_DESCRIPTION("LTTng filtered sched_switch");
	/*
	* addons/lttng-sched-filter.c
	*
	* A filtered version of sched_switch
	*
	* Copyright (C) 2014 Suchakra Sharma <suchakrapani.sharma@polymtl.ca>
	*
	* This library is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Lesser General Public
	* License as published by the Free Software Foundation; only
	* version 2.1 of the License.
	*
	* This library is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* Lesser General Public License for more details.
	*
	* You should have received a copy of the GNU Lesser General Public
	* License along with this library; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	*/

	#include <linux/module.h>
	#include <linux/netdevice.h>
	#include <linux/skbuff.h>
	#include <linux/uaccess.h>
	#include <linux/bpf.h>
	#include <linux/filter.h>
	#include <trace/bpf_trace.h>
	#include <asm/syscall.h>
	#include <linux/interrupt.h>
	#include <linux/time.h>
	#include <uapi/linux/time.h>
	#include <linux/proc_fs.h>
	#include <linux/seq_file.h>
	#include <linux/string.h>
	#include <linux/vmalloc.h>
	//#include <linux/kallsyms.h>

	#include <linux/sched.h>
	#include <linux/binfmts.h>
	#include <linux/version.h>
	#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0))
	#include <linux/sched/rt.h>
	#endif

	#include "../wrapper/tracepoint.h"
	#include "../instrumentation/events/lttng-module/addons.h"

	/*
	 * Compile-time selection of the filtering strategy. The sched_switch probe
	 * checks NOFILT first, then SIMPLE, then BPF; module init checks SIMPLE
	 * before BPF. Set exactly one of them to 1.
	 */
	#define BPF 1
	#define SIMPLE 0
	#define NOFILT 0

	/* Tracepoint that the probe fires for events accepted by the filter. */
	DEFINE_TRACE(sched_switch_filter);

	/* Procfs stuff */
	/* Size in bytes of the buffer allocated for accum_time at module init. */
	#define MAX_LEN 16000000
	/* Handle returned by proc_create() for the "eBPFsched" entry. */
	static struct proc_dir_entry *proc_entry;
	/* NUL-terminated text log; the probe appends one "<count>\t<nsec>\n" line per event. */
	static char *accum_time;
	/* NOTE(review): not referenced anywhere in the visible code. */
	u64 len = 0;

	/*
	 * seq_file show callback for the "eBPFsched" procfs entry: emits the whole
	 * accumulated timing log.
	 *
	 * @m: seq_file being filled.
	 * @v: iterator cookie handed in by the seq_file core (unused).
	 *
	 * Both parameters are pointers, as the seq_file show prototype requires.
	 * The log is data, not a format string, so it is written with seq_puts()
	 * instead of being passed as the format argument of seq_printf().
	 *
	 * Returns 0.
	 */
	static int ebpf_proc_show(struct seq_file *m, void *v) {
	    /* Nothing to show if the log buffer was never allocated. */
	    if (accum_time)
	        seq_puts(m, accum_time);
	    return 0;
	}

	/*
	 * open() handler for the "eBPFsched" procfs entry.
	 *
	 * @inode: inode of the procfs entry (unused).
	 * @file:  file being opened; bound to ebpf_proc_show() via single_open().
	 *
	 * Both parameters are pointers, as file_operations.open requires.
	 *
	 * Returns whatever single_open() returns.
	 */
	static int ebpf_proc_open(struct inode *inode, struct file *file) {
	    return single_open(file, ebpf_proc_show, NULL);
	}

	/* File operations installed on the "eBPFsched" procfs entry. */
	static const struct file_operations ebpf_proc_fops = {
	    .owner   = THIS_MODULE,
	    /* opened via single_open() in ebpf_proc_open(), closed by its counterpart */
	    .open    = ebpf_proc_open,
	    .release = single_release,
	    /* reading and seeking are delegated to the generic seq_file helpers */
	    .read    = seq_read,
	    .llseek  = seq_lseek,
	};

	/* Timing stuff */
	/* Incremented once per probe invocation; its value prefixes each log line. */
	atomic_t count = ATOMIC_INIT(0);

	/* Global definitions */
	/* Program allocated by init_ebpf_prog(), run by the probe, freed at module exit. */
	struct bpf_prog *prog;

	/*
	 * The actual eBPF prog instructions.
	 *
	 * On entry r1 holds the context pointer given to BPF_PROG_RUN(). The probe
	 * passes a struct bpf_context whose arg1/arg2/arg3 hold the address of its
	 * pcomm buffer, the address of its comm buffer and prev->state. The loads
	 * below assume those fields live at byte offsets 0, 8 and 16 (TODO confirm
	 * against the struct bpf_context definition, which is not in this file).
	 *
	 * Result in r0: 1 when the first 8 bytes of both name buffers are equal and
	 * the state word is 0; otherwise 0.
	 *
	 * BPF_LD_IMM64 occupies two instruction slots, which is why a jump offset of
	 * 3 skips exactly one BPF_LD_IMM64 plus the BPF_EXIT_INSN that follows it.
	 *
	 * Every trailing comment must be closed on its own line: an unterminated
	 * comment would swallow the next instruction and break the jump offsets.
	 */
	static struct bpf_insn insn_prog[] = {
	    BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 0), /* r2 = ctx arg1: address of the copied prev->comm */
	    BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_2, 0), /* r3 = first 8 bytes of the copied prev->comm */
	    BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_1, 8), /* r4 = ctx arg2: address of the reference comm */
	    BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_4, 0), /* r5 = first 8 bytes of the reference comm, which is "sshd" */
	    BPF_JMP_REG(BPF_JEQ, BPF_REG_5, BPF_REG_3, 3), /* names match: jump over the FALSE exit */
	    BPF_LD_IMM64(BPF_REG_0, 0), /* FALSE */
	    BPF_EXIT_INSN(),
	    BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, 16), /* r3 = ctx arg3: prev->state (a value, not a pointer) */
	    BPF_LD_IMM64(BPF_REG_4, 0), /* r4 = 0 */
	    BPF_JMP_REG(BPF_JEQ, BPF_REG_3, BPF_REG_4, 3), /* state == 0: jump to the TRUE exit */
	    BPF_LD_IMM64(BPF_REG_0, 0), /* FALSE */
	    BPF_EXIT_INSN(),
	    BPF_LD_IMM64(BPF_REG_0, 1), /* TRUE */
	    BPF_EXIT_INSN(),
	};


	/*
	 * Turn a 64-bit integer that carries an address back into a pointer.
	 * The value is narrowed to unsigned long before the pointer conversion.
	 */
	static void *u64_to_ptr(__u64 val)
	{
	    unsigned long addr = (unsigned long) val;

	    return (void *) addr;
	}

	/*
	 * Carry a pointer in a 64-bit integer.
	 * The pointer is converted through unsigned long and then widened to __u64.
	 */
	static __u64 ptr_to_u64(void *ptr)
	{
	    unsigned long addr = (unsigned long) ptr;

	    return (__u64) addr;
	}

	/*
	 * Work handler that releases a bpf_map.
	 *
	 * @work: the work_struct embedded in the map being released.
	 */
	void bpf_map_free_deferred(struct work_struct *work)
	{
	    struct bpf_map *owner;

	    /* recover the map that embeds this work item */
	    owner = container_of(work, struct bpf_map, work);

	    /* the actual freeing is delegated to the map's own map_free op */
	    owner->ops->map_free(owner);
	}

	/*
	 * Drop one reference on @map. When the reference count reaches zero, the
	 * map is released from a work item running bpf_map_free_deferred().
	 */
	void bpf_map_put(struct bpf_map *map)
	{
	    /* other holders remain: nothing more to do */
	    if (!atomic_dec_and_test(&map->refcnt))
	        return;

	    INIT_WORK(&map->work, bpf_map_free_deferred);
	    schedule_work(&map->work);
	}

	/*
	 * Drop the reference held on every map listed in @aux->used_maps, then free
	 * the used_maps array itself.
	 */
	static void free_used_maps(struct bpf_prog_aux *aux)
	{
	    int idx = 0;

	    while (idx < aux->used_map_cnt) {
	        bpf_map_put(aux->used_maps[idx]);
	        idx++;
	    }

	    kfree(aux->used_maps);
	}

	/*
	 * Run @prog1 on @ctx inside an RCU read-side critical section.
	 *
	 * @prog1: program to execute (pointer, not a by-value struct).
	 * @ctx:   context handed to the program as its first argument (pointer).
	 *
	 * Returns the program's 64-bit result converted to unsigned int.
	 */
	unsigned int run_bpf_filter(struct bpf_prog *prog1, struct bpf_context *ctx){
	    u64 verdict;

	    rcu_read_lock();
	    verdict = BPF_PROG_RUN(prog1, (void *) ctx);
	    rcu_read_unlock();

	    return (unsigned int) verdict;
	}

	/*
	 * Initialize and prepare the eBPF prog.
	 *
	 * Allocates the global `prog`, copies insn_prog[] into it, marks it as GPL
	 * compatible with a reference count of 1, and lets the kernel select the
	 * runtime (interpreter or JIT) for it. The verifier is NOT run (see TODO).
	 *
	 * Returns 0 on success, or -ENOMEM (carried in the unsigned return type) when
	 * the program cannot be allocated.
	 */
	unsigned int init_ebpf_prog(void)
	{
	    /* static so the 1 KiB log buffer does not live on the kernel stack */
	    static char bpf_log_buf[1024];

	    union bpf_attr attr = {
	        .prog_type = BPF_PROG_TYPE_UNSPEC,
	        .insns = ptr_to_u64((void*) insn_prog),
	        .insn_cnt = ARRAY_SIZE(insn_prog),
	        .license = ptr_to_u64((void *) "GPL"),
	        .log_buf = ptr_to_u64(bpf_log_buf),
	        .log_size = sizeof(bpf_log_buf),
	        .log_level = 1,
	    };

	    prog = bpf_prog_alloc(bpf_prog_size(attr.insn_cnt), GFP_USER);
	    if (!prog)
	        return -ENOMEM;
	    prog->jited = false;
	    prog->orig_prog = NULL;
	    prog->len = attr.insn_cnt;

	    /*
	     * The instructions come from kernel memory, so a plain memcpy() suffices.
	     * It returns its destination, so there is no failure to test for.
	     */
	    memcpy(prog->insnsi, u64_to_ptr(attr.insns), prog->len * sizeof(struct bpf_insn));
	    atomic_set(&prog->aux->refcnt, 1);
	    prog->aux->is_gpl_compatible = true;

	    /*
	     * TODO eBPF verifier. Sketch of the intended approach:
	     *   char *sym_name = "bpf_check";
	     *   unsigned long sym_addr = kallsyms_lookup_name(sym_name);
	     *   int (*bpf_check)(struct bpf_prog*, union bpf_attr*) =
	     *       (int (*)(struct bpf_prog*, union bpf_attr*)) sym_addr;
	     *   ret = bpf_check(prog, &attr);
	     */

	    /* ready for JIT */
	    bpf_prog_select_runtime(prog);
	    printk("prog jited? : %d\n", prog->jited);

	    return 0;
	}

	unsigned int filter_dev_probe_handler(void* __data, struct rq rq, struct task_struct prev, struct task_struct *next)
	{
	struct timespec begin, end, diff;
	char comm[8] = {};
	strcpy(comm, "sshd");
	char pcomm[8] = {};
	strcpy(pcomm, prev->comm);

	struct bpf_context bctx = {};
	bctx.arg1 = (u64) pcomm;
	bctx.arg2 = (u64) comm;
	bctx.arg3 = (u64) prev->state;

	/* tick */
	getrawmonotonic(&begin);

	#if (NOFILT)
	trace_sched_switch_filter(prev, next);

	#elif (SIMPLE)
	if ((memcmp(prev->comm, comm, 4) == 0) && (prev->state == 0))
	{
	trace_sched_switch_filter(prev, next);
	}
	#elif (BPF)
	unsigned int ret = 0;
	ret = run_bpf_filter(prog, &bctx);
	if (ret == 1){
	trace_sched_switch_filter(prev, next);
	}
	#endif

	/* tock */
	getrawmonotonic(&end);
	diff = timespec_sub(end, begin);
	atomic_inc(&count);
	sprintf(accum_time + strlen(accum_time), "%d\t%lu\n", atomic_read(&count), diff.tv_nsec);

	return 0;
	}


	/*
	 * Module entry point.
	 *
	 * Builds the eBPF program (BPF mode only), allocates the zeroed timing log,
	 * creates the "eBPFsched" procfs entry and registers the sched_switch probe.
	 * Each step is undone in reverse order if a later step fails, so the probe
	 * is never left registered against freed memory.
	 *
	 * Returns 0 on success or a negative errno.
	 */
	static int __init sched_switch_filter_init(void)
	{
	    int ret = 0;

	#if (SIMPLE)
	    printk("SIMPLE RUN\n");

	#elif (BPF)
	    printk("BPF RUN\n");

	    /* Prepare eBPF prog */
	    ret = init_ebpf_prog();
	    if (ret)
	        return ret;
	#endif

	    /* Init procfs entry; the log must start out as an empty string */
	    accum_time = vzalloc(MAX_LEN);
	    if (!accum_time) {
	        ret = -ENOMEM;
	        goto err_free_prog;
	    }

	    proc_entry = proc_create("eBPFsched", 0, NULL, &ebpf_proc_fops);
	    if (proc_entry == NULL) {
	        printk(KERN_INFO "eBPFsched could not be created\n");
	        ret = -ENOMEM;
	        goto err_free_log;
	    }
	    printk(KERN_INFO "eBPFsched created.\n");

	    (void) wrapper_lttng_fixup_sig(THIS_MODULE);

	    ret = lttng_wrapper_tracepoint_probe_register("sched_switch",
	            filter_dev_probe_handler, NULL);
	    if (ret)
	        goto err_remove_proc;

	    printk("sched_switch_filter loaded\n");
	    return 0;

	err_remove_proc:
	    remove_proc_entry("eBPFsched", NULL);
	err_free_log:
	    vfree(accum_time);
	    accum_time = NULL;
	err_free_prog:
	#if (!SIMPLE) && (BPF)
	    /* mirrors the condition under which init_ebpf_prog() ran above */
	    free_used_maps(prog->aux);
	    bpf_prog_free(prog);
	    prog = NULL;
	#endif
	    return ret;
	}

	/*
	 * Module exit point.
	 *
	 * The probe is unregistered first, and in-flight probe calls are waited for,
	 * before the resources the probe uses (the timing log and the eBPF program)
	 * are released. Only then are the procfs entry, the log and the program torn
	 * down.
	 */
	static void __exit sched_switch_filter_exit(void)
	{
	    int ret;

	    ret = lttng_wrapper_tracepoint_probe_unregister("sched_switch",
	            filter_dev_probe_handler, NULL);
	    if (ret)
	        printk(KERN_WARNING "sched_switch_filter: probe unregister failed (%d)\n", ret);

	    /* make sure no CPU is still executing the probe */
	    tracepoint_synchronize_unregister();

	    /* Remove procfs entry */
	    remove_proc_entry("eBPFsched", NULL);
	    printk(KERN_INFO "eBPFsched removed\n");
	    vfree(accum_time);

	#if (BPF)
	    /* prog is never allocated when init took the SIMPLE branch */
	    if (prog) {
	        free_used_maps(prog->aux);
	        printk("Freed maps\n");
	        bpf_prog_free(prog);
	        printk("Freed bpf prog\n");
	    }
	#endif

	    printk("sched_switch_filter unloaded\n");
	    return;
	}

	module_init(sched_switch_filter_init);
	module_exit(sched_switch_filter_exit);

	MODULE_LICENSE("GPL and additional rights");
	MODULE_AUTHOR("Suchakra Sharma <suchakrapani.sharma@polymtl.ca>");
	MODULE_DESCRIPTION("LTTng filtered sched_switch");