Skip to content

Instantly share code, notes, and snippets.

@sgoedecke
Created March 8, 2022 02:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sgoedecke/f2234da40e4f5aaf016ee09510e36b8d to your computer and use it in GitHub Desktop.
Save sgoedecke/f2234da40e4f5aaf016ee09510e36b8d to your computer and use it in GitHub Desktop.
This gist exceeds the recommended number of files (~10). To access all files, please clone this gist.
// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/acct.c
*
* BSD Process Accounting for Linux
*
* Author: Marco van Wieringen <mvw@planets.elm.net>
*
* Some code based on ideas and code from:
* Thomas K. Dyas <tdyas@eden.rutgers.edu>
*
* This file implements BSD-style process accounting. Whenever any
* process exits, an accounting record of type "struct acct" is
* written to the file specified with the acct() system call. It is
* up to user-level programs to do useful things with the accounting
* log. The kernel just provides the raw accounting information.
*
* (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
*
* Plugged two leaks. 1) It didn't return acct_file into the free_filps if
* the file happened to be read-only. 2) If the accounting was suspended
* due to the lack of space it happily allowed to reopen it and completely
* lost the old acct_file. 3/10/98, Al Viro.
*
* Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
* XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
*
* Fixed a nasty interaction with sys_umount(). If the accounting
* was suspeneded we failed to stop it on umount(). Messy.
* Another one: remount to readonly didn't stop accounting.
* Question: what should we do if we have CAP_SYS_ADMIN but not
* CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
* unless we are messing with the root. In that case we are getting a
* real mess with do_remount_sb(). 9/11/98, AV.
*
* Fixed a bunch of races (and pair of leaks). Probably not the best way,
* but this one obviously doesn't introduce deadlocks. Later. BTW, found
* one race (and leak) in BSD implementation.
* OK, that's better. ANOTHER race and leak in BSD variant. There always
* is one more bug... 10/11/98, AV.
*
* Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
* ->mmap_lock to walk the vma list of current->mm. Nasty, since it leaks
* a struct file opened for write. Fixed. 2/6/2000, AV.
*/
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/acct.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/tty.h>
#include <linux/security.h>
#include <linux/vfs.h>
#include <linux/jiffies.h>
#include <linux/times.h>
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/uaccess.h>
#include <linux/sched/cputime.h>
#include <asm/div64.h>
#include <linux/pid_namespace.h>
#include <linux/fs_pin.h>
/*
* These constants control the amount of freespace that suspend and
* resume the process accounting system, and the time delay between
* each check.
* Turned into sysctl-controllable parameters. AV, 12/11/98
*/
int acct_parm[3] = {4, 2, 30};
#define RESUME (acct_parm[0]) /* >foo% free space - resume */
#define SUSPEND (acct_parm[1]) /* <foo% free space - suspend */
#define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */
/*
* External references and all of the globals.
*/
struct bsd_acct_struct {
struct fs_pin pin;
atomic_long_t count;
struct rcu_head rcu;
struct mutex lock;
int active;
unsigned long needcheck;
struct file *file;
struct pid_namespace *ns;
struct work_struct work;
struct completion done;
};
static void do_acct_process(struct bsd_acct_struct *acct);
/*
* Check the amount of free space and suspend/resume accordingly.
*/
static int check_free_space(struct bsd_acct_struct *acct)
{
struct kstatfs sbuf;
if (time_is_after_jiffies(acct->needcheck))
goto out;
/* May block */
if (vfs_statfs(&acct->file->f_path, &sbuf))
goto out;
if (acct->active) {
u64 suspend = sbuf.f_blocks * SUSPEND;
do_div(suspend, 100);
if (sbuf.f_bavail <= suspend) {
acct->active = 0;
pr_info("Process accounting paused\n");
}
} else {
u64 resume = sbuf.f_blocks * RESUME;
do_div(resume, 100);
if (sbuf.f_bavail >= resume) {
acct->active = 1;
pr_info("Process accounting resumed\n");
}
}
acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
out:
return acct->active;
}
static void acct_put(struct bsd_acct_struct *p)
{
if (atomic_long_dec_and_test(&p->count))
kfree_rcu(p, rcu);
}
static inline struct bsd_acct_struct *to_acct(struct fs_pin *p)
{
return p ? container_of(p, struct bsd_acct_struct, pin) : NULL;
}
static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
{
struct bsd_acct_struct *res;
again:
smp_rmb();
rcu_read_lock();
res = to_acct(READ_ONCE(ns->bacct));
if (!res) {
rcu_read_unlock();
return NULL;
}
if (!atomic_long_inc_not_zero(&res->count)) {
rcu_read_unlock();
cpu_relax();
goto again;
}
rcu_read_unlock();
mutex_lock(&res->lock);
if (res != to_acct(READ_ONCE(ns->bacct))) {
mutex_unlock(&res->lock);
acct_put(res);
goto again;
}
return res;
}
static void acct_pin_kill(struct fs_pin *pin)
{
struct bsd_acct_struct *acct = to_acct(pin);
mutex_lock(&acct->lock);
do_acct_process(acct);
schedule_work(&acct->work);
wait_for_completion(&acct->done);
cmpxchg(&acct->ns->bacct, pin, NULL);
mutex_unlock(&acct->lock);
pin_remove(pin);
acct_put(acct);
}
static void close_work(struct work_struct *work)
{
struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
struct file *file = acct->file;
if (file->f_op->flush)
file->f_op->flush(file, NULL);
__fput_sync(file);
complete(&acct->done);
}
static int acct_on(struct filename *pathname)
{
struct file *file;
struct vfsmount *mnt, *internal;
struct pid_namespace *ns = task_active_pid_ns(current);
struct bsd_acct_struct *acct;
struct fs_pin *old;
int err;
acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
if (!acct)
return -ENOMEM;
/* Difference from BSD - they don't do O_APPEND */
file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
if (IS_ERR(file)) {
kfree(acct);
return PTR_ERR(file);
}
if (!S_ISREG(file_inode(file)->i_mode)) {
kfree(acct);
filp_close(file, NULL);
return -EACCES;
}
if (!(file->f_mode & FMODE_CAN_WRITE)) {
kfree(acct);
filp_close(file, NULL);
return -EIO;
}
internal = mnt_clone_internal(&file->f_path);
if (IS_ERR(internal)) {
kfree(acct);
filp_close(file, NULL);
return PTR_ERR(internal);
}
err = __mnt_want_write(internal);
if (err) {
mntput(internal);
kfree(acct);
filp_close(file, NULL);
return err;
}
mnt = file->f_path.mnt;
file->f_path.mnt = internal;
atomic_long_set(&acct->count, 1);
init_fs_pin(&acct->pin, acct_pin_kill);
acct->file = file;
acct->needcheck = jiffies;
acct->ns = ns;
mutex_init(&acct->lock);
INIT_WORK(&acct->work, close_work);
init_completion(&acct->done);
mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
pin_insert(&acct->pin, mnt);
rcu_read_lock();
old = xchg(&ns->bacct, &acct->pin);
mutex_unlock(&acct->lock);
pin_kill(old);
__mnt_drop_write(mnt);
mntput(mnt);
return 0;
}
static DEFINE_MUTEX(acct_on_mutex);
/**
* sys_acct - enable/disable process accounting
* @name: file name for accounting records or NULL to shutdown accounting
*
* sys_acct() is the only system call needed to implement process
* accounting. It takes the name of the file where accounting records
* should be written. If the filename is NULL, accounting will be
* shutdown.
*
* Returns: 0 for success or negative errno values for failure.
*/
SYSCALL_DEFINE1(acct, const char __user *, name)
{
int error = 0;
if (!capable(CAP_SYS_PACCT))
return -EPERM;
if (name) {
struct filename *tmp = getname(name);
if (IS_ERR(tmp))
return PTR_ERR(tmp);
mutex_lock(&acct_on_mutex);
error = acct_on(tmp);
mutex_unlock(&acct_on_mutex);
putname(tmp);
} else {
rcu_read_lock();
pin_kill(task_active_pid_ns(current)->bacct);
}
return error;
}
void acct_exit_ns(struct pid_namespace *ns)
{
rcu_read_lock();
pin_kill(ns->bacct);
}
/*
* encode an unsigned long into a comp_t
*
* This routine has been adopted from the encode_comp_t() function in
* the kern_acct.c file of the FreeBSD operating system. The encoding
* is a 13-bit fraction with a 3-bit (base 8) exponent.
*/
#define MANTSIZE 13 /* 13 bit mantissa. */
#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
static comp_t encode_comp_t(unsigned long value)
{
int exp, rnd;
exp = rnd = 0;
while (value > MAXFRACT) {
rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */
value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */
exp++;
}
/*
* If we need to round up, do it (and handle overflow correctly).
*/
if (rnd && (++value > MAXFRACT)) {
value >>= EXPSIZE;
exp++;
}
/*
* Clean it up and polish it off.
*/
exp <<= MANTSIZE; /* Shift the exponent into place */
exp += value; /* and add on the mantissa. */
return exp;
}
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
/*
* encode an u64 into a comp2_t (24 bits)
*
* Format: 5 bit base 2 exponent, 20 bits mantissa.
* The leading bit of the mantissa is not stored, but implied for
* non-zero exponents.
* Largest encodable value is 50 bits.
*/
#define MANTSIZE2 20 /* 20 bit mantissa. */
#define EXPSIZE2 5 /* 5 bit base 2 exponent. */
#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
#define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */
static comp2_t encode_comp2_t(u64 value)
{
int exp, rnd;
exp = (value > (MAXFRACT2>>1));
rnd = 0;
while (value > MAXFRACT2) {
rnd = value & 1;
value >>= 1;
exp++;
}
/*
* If we need to round up, do it (and handle overflow correctly).
*/
if (rnd && (++value > MAXFRACT2)) {
value >>= 1;
exp++;
}
if (exp > MAXEXP2) {
/* Overflow. Return largest representable number instead. */
return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;
} else {
return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
}
}
#elif ACCT_VERSION == 3
/*
* encode an u64 into a 32 bit IEEE float
*/
static u32 encode_float(u64 value)
{
unsigned exp = 190;
unsigned u;
if (value == 0)
return 0;
while ((s64)value > 0) {
value <<= 1;
exp--;
}
u = (u32)(value >> 40) & 0x7fffffu;
return u | (exp << 23);
}
#endif
/*
* Write an accounting entry for an exiting process
*
* The acct_process() call is the workhorse of the process
* accounting system. The struct acct is built here and then written
* into the accounting file. This function should only be called from
* do_exit() or when switching to a different output file.
*/
static void fill_ac(acct_t *ac)
{
struct pacct_struct *pacct = &current->signal->pacct;
u64 elapsed, run_time;
time64_t btime;
struct tty_struct *tty;
/*
* Fill the accounting struct with the needed info as recorded
* by the different kernel functions.
*/
memset(ac, 0, sizeof(acct_t));
ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
/* calculate run_time in nsec*/
run_time = ktime_get_ns();
run_time -= current->group_leader->start_time;
/* convert nsec -> AHZ */
elapsed = nsec_to_AHZ(run_time);
#if ACCT_VERSION == 3
ac->ac_etime = encode_float(elapsed);
#else
ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
(unsigned long) elapsed : (unsigned long) -1l);
#endif
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
{
/* new enlarged etime field */
comp2_t etime = encode_comp2_t(elapsed);
ac->ac_etime_hi = etime >> 16;
ac->ac_etime_lo = (u16) etime;
}
#endif
do_div(elapsed, AHZ);
btime = ktime_get_real_seconds() - elapsed;
ac->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX);
#if ACCT_VERSION==2
ac->ac_ahz = AHZ;
#endif
spin_lock_irq(&current->sighand->siglock);
tty = current->signal->tty; /* Safe as we hold the siglock */
ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
ac->ac_utime = encode_comp_t(nsec_to_AHZ(pacct->ac_utime));
ac->ac_stime = encode_comp_t(nsec_to_AHZ(pacct->ac_stime));
ac->ac_flag = pacct->ac_flag;
ac->ac_mem = encode_comp_t(pacct->ac_mem);
ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
ac->ac_exitcode = pacct->ac_exitcode;
spin_unlock_irq(&current->sighand->siglock);
}
/*
* do_acct_process does all actual work. Caller holds the reference to file.
*/
static void do_acct_process(struct bsd_acct_struct *acct)
{
acct_t ac;
unsigned long flim;
const struct cred *orig_cred;
struct file *file = acct->file;
/*
* Accounting records are not subject to resource limits.
*/
flim = rlimit(RLIMIT_FSIZE);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
/* Perform file operations on behalf of whoever enabled accounting */
orig_cred = override_creds(file->f_cred);
/*
* First check to see if there is enough free_space to continue
* the process accounting system.
*/
if (!check_free_space(acct))
goto out;
fill_ac(&ac);
/* we really need to bite the bullet and change layout */
ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
/* backward-compatible 16 bit fields */
ac.ac_uid16 = ac.ac_uid;
ac.ac_gid16 = ac.ac_gid;
#elif ACCT_VERSION == 3
{
struct pid_namespace *ns = acct->ns;
ac.ac_pid = task_tgid_nr_ns(current, ns);
rcu_read_lock();
ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
ns);
rcu_read_unlock();
}
#endif
/*
* Get freeze protection. If the fs is frozen, just skip the write
* as we could deadlock the system otherwise.
*/
if (file_start_write_trylock(file)) {
/* it's been opened O_APPEND, so position is irrelevant */
loff_t pos = 0;
__kernel_write(file, &ac, sizeof(acct_t), &pos);
file_end_write(file);
}
out:
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
revert_creds(orig_cred);
}
/**
* acct_collect - collect accounting information into pacct_struct
* @exitcode: task exit code
* @group_dead: not 0, if this thread is the last one in the process.
*/
void acct_collect(long exitcode, int group_dead)
{
struct pacct_struct *pacct = &current->signal->pacct;
u64 utime, stime;
unsigned long vsize = 0;
if (group_dead && current->mm) {
struct vm_area_struct *vma;
mmap_read_lock(current->mm);
vma = current->mm->mmap;
while (vma) {
vsize += vma->vm_end - vma->vm_start;
vma = vma->vm_next;
}
mmap_read_unlock(current->mm);
}
spin_lock_irq(&current->sighand->siglock);
if (group_dead)
pacct->ac_mem = vsize / 1024;
if (thread_group_leader(current)) {
pacct->ac_exitcode = exitcode;
if (current->flags & PF_FORKNOEXEC)
pacct->ac_flag |= AFORK;
}
if (current->flags & PF_SUPERPRIV)
pacct->ac_flag |= ASU;
if (current->flags & PF_DUMPCORE)
pacct->ac_flag |= ACORE;
if (current->flags & PF_SIGNALED)
pacct->ac_flag |= AXSIG;
task_cputime(current, &utime, &stime);
pacct->ac_utime += utime;
pacct->ac_stime += stime;
pacct->ac_minflt += current->min_flt;
pacct->ac_majflt += current->maj_flt;
spin_unlock_irq(&current->sighand->siglock);
}
static void slow_acct_process(struct pid_namespace *ns)
{
for ( ; ns; ns = ns->parent) {
struct bsd_acct_struct *acct = acct_get(ns);
if (acct) {
do_acct_process(acct);
mutex_unlock(&acct->lock);
acct_put(acct);
}
}
}
/**
* acct_process - handles process accounting for an exiting task
*/
void acct_process(void)
{
struct pid_namespace *ns;
/*
* This loop is safe lockless, since current is still
* alive and holds its namespace, which in turn holds
* its parent.
*/
for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
if (ns->bacct)
break;
}
if (unlikely(ns))
slow_acct_process(ns);
}
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2016 Thomas Gleixner.
* Copyright (C) 2016-2017 Christoph Hellwig.
*/
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/sort.h>
static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
unsigned int cpus_per_vec)
{
const struct cpumask *siblmsk;
int cpu, sibl;
for ( ; cpus_per_vec > 0; ) {
cpu = cpumask_first(nmsk);
/* Should not happen, but I'm too lazy to think about it */
if (cpu >= nr_cpu_ids)
return;
cpumask_clear_cpu(cpu, nmsk);
cpumask_set_cpu(cpu, irqmsk);
cpus_per_vec--;
/* If the cpu has siblings, use them first */
siblmsk = topology_sibling_cpumask(cpu);
for (sibl = -1; cpus_per_vec > 0; ) {
sibl = cpumask_next(sibl, siblmsk);
if (sibl >= nr_cpu_ids)
break;
if (!cpumask_test_and_clear_cpu(sibl, nmsk))
continue;
cpumask_set_cpu(sibl, irqmsk);
cpus_per_vec--;
}
}
}
static cpumask_var_t *alloc_node_to_cpumask(void)
{
cpumask_var_t *masks;
int node;
masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
if (!masks)
return NULL;
for (node = 0; node < nr_node_ids; node++) {
if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
goto out_unwind;
}
return masks;
out_unwind:
while (--node >= 0)
free_cpumask_var(masks[node]);
kfree(masks);
return NULL;
}
static void free_node_to_cpumask(cpumask_var_t *masks)
{
int node;
for (node = 0; node < nr_node_ids; node++)
free_cpumask_var(masks[node]);
kfree(masks);
}
static void build_node_to_cpumask(cpumask_var_t *masks)
{
int cpu;
for_each_possible_cpu(cpu)
cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
}
static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
const struct cpumask *mask, nodemask_t *nodemsk)
{
int n, nodes = 0;
/* Calculate the number of nodes in the supplied affinity mask */
for_each_node(n) {
if (cpumask_intersects(mask, node_to_cpumask[n])) {
node_set(n, *nodemsk);
nodes++;
}
}
return nodes;
}
struct node_vectors {
unsigned id;
union {
unsigned nvectors;
unsigned ncpus;
};
};
static int ncpus_cmp_func(const void *l, const void *r)
{
const struct node_vectors *ln = l;
const struct node_vectors *rn = r;
return ln->ncpus - rn->ncpus;
}
/*
* Allocate vector number for each node, so that for each node:
*
* 1) the allocated number is >= 1
*
* 2) the allocated numbver is <= active CPU number of this node
*
* The actual allocated total vectors may be less than @numvecs when
* active total CPU number is less than @numvecs.
*
* Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
* for each node.
*/
static void alloc_nodes_vectors(unsigned int numvecs,
cpumask_var_t *node_to_cpumask,
const struct cpumask *cpu_mask,
const nodemask_t nodemsk,
struct cpumask *nmsk,
struct node_vectors *node_vectors)
{
unsigned n, remaining_ncpus = 0;
for (n = 0; n < nr_node_ids; n++) {
node_vectors[n].id = n;
node_vectors[n].ncpus = UINT_MAX;
}
for_each_node_mask(n, nodemsk) {
unsigned ncpus;
cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
ncpus = cpumask_weight(nmsk);
if (!ncpus)
continue;
remaining_ncpus += ncpus;
node_vectors[n].ncpus = ncpus;
}
numvecs = min_t(unsigned, remaining_ncpus, numvecs);
sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]),
ncpus_cmp_func, NULL);
/*
* Allocate vectors for each node according to the ratio of this
* node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is
* bigger than number of active numa nodes. Always start the
* allocation from the node with minimized nr_cpus.
*
* This way guarantees that each active node gets allocated at
* least one vector, and the theory is simple: over-allocation
* is only done when this node is assigned by one vector, so
* other nodes will be allocated >= 1 vector, since 'numvecs' is
* bigger than number of numa nodes.
*
* One perfect invariant is that number of allocated vectors for
* each node is <= CPU count of this node:
*
* 1) suppose there are two nodes: A and B
* ncpu(X) is CPU count of node X
* vecs(X) is the vector count allocated to node X via this
* algorithm
*
* ncpu(A) <= ncpu(B)
* ncpu(A) + ncpu(B) = N
* vecs(A) + vecs(B) = V
*
* vecs(A) = max(1, round_down(V * ncpu(A) / N))
* vecs(B) = V - vecs(A)
*
* both N and V are integer, and 2 <= V <= N, suppose
* V = N - delta, and 0 <= delta <= N - 2
*
* 2) obviously vecs(A) <= ncpu(A) because:
*
* if vecs(A) is 1, then vecs(A) <= ncpu(A) given
* ncpu(A) >= 1
*
* otherwise,
* vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N
*
* 3) prove how vecs(B) <= ncpu(B):
*
* if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be
* over-allocated, so vecs(B) <= ncpu(B),
*
* otherwise:
*
* vecs(A) =
* round_down(V * ncpu(A) / N) =
* round_down((N - delta) * ncpu(A) / N) =
* round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
* round_down((N * ncpu(A) - delta * N) / N) =
* cpu(A) - delta
*
* then:
*
* vecs(A) - V >= ncpu(A) - delta - V
* =>
* V - vecs(A) <= V + delta - ncpu(A)
* =>
* vecs(B) <= N - ncpu(A)
* =>
* vecs(B) <= cpu(B)
*
* For nodes >= 3, it can be thought as one node and another big
* node given that is exactly what this algorithm is implemented,
* and we always re-calculate 'remaining_ncpus' & 'numvecs', and
* finally for each node X: vecs(X) <= ncpu(X).
*
*/
for (n = 0; n < nr_node_ids; n++) {
unsigned nvectors, ncpus;
if (node_vectors[n].ncpus == UINT_MAX)
continue;
WARN_ON_ONCE(numvecs == 0);
ncpus = node_vectors[n].ncpus;
nvectors = max_t(unsigned, 1,
numvecs * ncpus / remaining_ncpus);
WARN_ON_ONCE(nvectors > ncpus);
node_vectors[n].nvectors = nvectors;
remaining_ncpus -= ncpus;
numvecs -= nvectors;
}
}
static int __irq_build_affinity_masks(unsigned int startvec,
unsigned int numvecs,
unsigned int firstvec,
cpumask_var_t *node_to_cpumask,
const struct cpumask *cpu_mask,
struct cpumask *nmsk,
struct irq_affinity_desc *masks)
{
unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
unsigned int last_affv = firstvec + numvecs;
unsigned int curvec = startvec;
nodemask_t nodemsk = NODE_MASK_NONE;
struct node_vectors *node_vectors;
if (!cpumask_weight(cpu_mask))
return 0;
nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
/*
* If the number of nodes in the mask is greater than or equal the
* number of vectors we just spread the vectors across the nodes.
*/
if (numvecs <= nodes) {
for_each_node_mask(n, nodemsk) {
cpumask_or(&masks[curvec].mask, &masks[curvec].mask,
node_to_cpumask[n]);
if (++curvec == last_affv)
curvec = firstvec;
}
return numvecs;
}
node_vectors = kcalloc(nr_node_ids,
sizeof(struct node_vectors),
GFP_KERNEL);
if (!node_vectors)
return -ENOMEM;
/* allocate vector number for each node */
alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
nodemsk, nmsk, node_vectors);
for (i = 0; i < nr_node_ids; i++) {
unsigned int ncpus, v;
struct node_vectors *nv = &node_vectors[i];
if (nv->nvectors == UINT_MAX)
continue;
/* Get the cpus on this node which are in the mask */
cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
ncpus = cpumask_weight(nmsk);
if (!ncpus)
continue;
WARN_ON_ONCE(nv->nvectors > ncpus);
/* Account for rounding errors */
extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors);
/* Spread allocated vectors on CPUs of the current node */
for (v = 0; v < nv->nvectors; v++, curvec++) {
cpus_per_vec = ncpus / nv->nvectors;
/* Account for extra vectors to compensate rounding errors */
if (extra_vecs) {
cpus_per_vec++;
--extra_vecs;
}
/*
* wrapping has to be considered given 'startvec'
* may start anywhere
*/
if (curvec >= last_affv)
curvec = firstvec;
irq_spread_init_one(&masks[curvec].mask, nmsk,
cpus_per_vec);
}
done += nv->nvectors;
}
kfree(node_vectors);
return done;
}
/*
* build affinity in two stages:
* 1) spread present CPU on these vectors
* 2) spread other possible CPUs on these vectors
*/
static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
unsigned int firstvec,
struct irq_affinity_desc *masks)
{
unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
cpumask_var_t *node_to_cpumask;
cpumask_var_t nmsk, npresmsk;
int ret = -ENOMEM;
if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
return ret;
if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
goto fail_nmsk;
node_to_cpumask = alloc_node_to_cpumask();
if (!node_to_cpumask)
goto fail_npresmsk;
/* Stabilize the cpumasks */
cpus_read_lock();
build_node_to_cpumask(node_to_cpumask);
/* Spread on present CPUs starting from affd->pre_vectors */
ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
node_to_cpumask, cpu_present_mask,
nmsk, masks);
if (ret < 0)
goto fail_build_affinity;
nr_present = ret;
/*
* Spread on non present CPUs starting from the next vector to be
* handled. If the spreading of present CPUs already exhausted the
* vector space, assign the non present CPUs to the already spread
* out vectors.
*/
if (nr_present >= numvecs)
curvec = firstvec;
else
curvec = firstvec + nr_present;
cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
node_to_cpumask, npresmsk, nmsk,
masks);
if (ret >= 0)
nr_others = ret;
fail_build_affinity:
cpus_read_unlock();
if (ret >= 0)
WARN_ON(nr_present + nr_others < numvecs);
free_node_to_cpumask(node_to_cpumask);
fail_npresmsk:
free_cpumask_var(npresmsk);
fail_nmsk:
free_cpumask_var(nmsk);
return ret < 0 ? ret : 0;
}
static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
{
affd->nr_sets = 1;
affd->set_size[0] = affvecs;
}
/**
* irq_create_affinity_masks - Create affinity masks for multiqueue spreading
* @nvecs: The total number of vectors
* @affd: Description of the affinity requirements
*
* Returns the irq_affinity_desc pointer or NULL if allocation failed.
*/
struct irq_affinity_desc *
irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
{
unsigned int affvecs, curvec, usedvecs, i;
struct irq_affinity_desc *masks = NULL;
/*
* Determine the number of vectors which need interrupt affinities
* assigned. If the pre/post request exhausts the available vectors
* then nothing to do here except for invoking the calc_sets()
* callback so the device driver can adjust to the situation.
*/
if (nvecs > affd->pre_vectors + affd->post_vectors)
affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
else
affvecs = 0;
/*
* Simple invocations do not provide a calc_sets() callback. Install
* the generic one.
*/
if (!affd->calc_sets)
affd->calc_sets = default_calc_sets;
/* Recalculate the sets */
affd->calc_sets(affd, affvecs);
if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS))
return NULL;
/* Nothing to assign? */
if (!affvecs)
return NULL;
masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
if (!masks)
return NULL;
/* Fill out vectors at the beginning that don't need affinity */
for (curvec = 0; curvec < affd->pre_vectors; curvec++)
cpumask_copy(&masks[curvec].mask, irq_default_affinity);
/*
* Spread on present CPUs starting from affd->pre_vectors. If we
* have multiple sets, build each sets affinity mask separately.
*/
for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
unsigned int this_vecs = affd->set_size[i];
int ret;
ret = irq_build_affinity_masks(curvec, this_vecs,
curvec, masks);
if (ret) {
kfree(masks);
return NULL;
}
curvec += this_vecs;
usedvecs += this_vecs;
}
/* Fill out vectors at the end that don't need affinity */
if (usedvecs >= affvecs)
curvec = affd->pre_vectors + affvecs;
else
curvec = affd->pre_vectors + usedvecs;
for (; curvec < nvecs; curvec++)
cpumask_copy(&masks[curvec].mask, irq_default_affinity);
/* Mark the managed interrupts */
for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++)
masks[i].is_managed = 1;
return masks;
}
/**
* irq_calc_affinity_vectors - Calculate the optimal number of vectors
* @minvec: The minimum number of vectors available
* @maxvec: The maximum number of vectors available
* @affd: Description of the affinity requirements
*/
unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
const struct irq_affinity *affd)
{
unsigned int resv = affd->pre_vectors + affd->post_vectors;
unsigned int set_vecs;
if (resv > minvec)
return 0;
if (affd->calc_sets) {
set_vecs = maxvec - resv;
} else {
cpus_read_lock();
set_vecs = cpumask_weight(cpu_possible_mask);
cpus_read_unlock();
}
return resv + min(set_vecs, maxvec - resv);
}
// SPDX-License-Identifier: GPL-2.0
/*
* Alarmtimer interface
*
* This interface provides a timer which is similar to hrtimers,
* but triggers a RTC alarm if the box is suspend.
*
* This interface is influenced by the Android RTC Alarm timer
* interface.
*
* Copyright (C) 2010 IBM Corporation
*
* Author: John Stultz <john.stultz@linaro.org>
*/
#include <linux/time.h>
#include <linux/hrtimer.h>
#include <linux/timerqueue.h>
#include <linux/rtc.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/alarmtimer.h>
#include <linux/mutex.h>
#include <linux/platform_device.h>
#include <linux/posix-timers.h>
#include <linux/workqueue.h>
#include <linux/freezer.h>
#include <linux/compat.h>
#include <linux/module.h>
#include <linux/time_namespace.h>
#include "posix-timers.h"
#define CREATE_TRACE_POINTS
#include <trace/events/alarmtimer.h>
/**
* struct alarm_base - Alarm timer bases
* @lock: Lock for syncrhonized access to the base
* @timerqueue: Timerqueue head managing the list of events
* @get_ktime: Function to read the time correlating to the base
* @get_timespec: Function to read the namespace time correlating to the base
* @base_clockid: clockid for the base
*/
static struct alarm_base {
spinlock_t lock;
struct timerqueue_head timerqueue;
ktime_t (*get_ktime)(void);
void (*get_timespec)(struct timespec64 *tp);
clockid_t base_clockid;
} alarm_bases[ALARM_NUMTYPE];
#if defined(CONFIG_POSIX_TIMERS) || defined(CONFIG_RTC_CLASS)
/* freezer information to handle clock_nanosleep triggered wakeups */
static enum alarmtimer_type freezer_alarmtype;
static ktime_t freezer_expires;
static ktime_t freezer_delta;
static DEFINE_SPINLOCK(freezer_delta_lock);
#endif
#ifdef CONFIG_RTC_CLASS
/* rtc timer and device for setting alarm wakeups at suspend */
static struct rtc_timer rtctimer;
static struct rtc_device *rtcdev;
static DEFINE_SPINLOCK(rtcdev_lock);
/**
* alarmtimer_get_rtcdev - Return selected rtcdevice
*
* This function returns the rtc device to use for wakealarms.
*/
struct rtc_device *alarmtimer_get_rtcdev(void)
{
unsigned long flags;
struct rtc_device *ret;
spin_lock_irqsave(&rtcdev_lock, flags);
ret = rtcdev;
spin_unlock_irqrestore(&rtcdev_lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev);
static int alarmtimer_rtc_add_device(struct device *dev,
struct class_interface *class_intf)
{
unsigned long flags;
struct rtc_device *rtc = to_rtc_device(dev);
struct platform_device *pdev;
int ret = 0;
if (rtcdev)
return -EBUSY;
if (!test_bit(RTC_FEATURE_ALARM, rtc->features))
return -1;
if (!device_may_wakeup(rtc->dev.parent))
return -1;
pdev = platform_device_register_data(dev, "alarmtimer",
PLATFORM_DEVID_AUTO, NULL, 0);
if (!IS_ERR(pdev))
device_init_wakeup(&pdev->dev, true);
spin_lock_irqsave(&rtcdev_lock, flags);
if (!IS_ERR(pdev) && !rtcdev) {
if (!try_module_get(rtc->owner)) {
ret = -1;
goto unlock;
}
rtcdev = rtc;
/* hold a reference so it doesn't go away */
get_device(dev);
pdev = NULL;
} else {
ret = -1;
}
unlock:
spin_unlock_irqrestore(&rtcdev_lock, flags);
platform_device_unregister(pdev);
return ret;
}
static inline void alarmtimer_rtc_timer_init(void)
{
rtc_timer_init(&rtctimer, NULL, NULL);
}
static struct class_interface alarmtimer_rtc_interface = {
.add_dev = &alarmtimer_rtc_add_device,
};
static int alarmtimer_rtc_interface_setup(void)
{
alarmtimer_rtc_interface.class = rtc_class;
return class_interface_register(&alarmtimer_rtc_interface);
}
static void alarmtimer_rtc_interface_remove(void)
{
class_interface_unregister(&alarmtimer_rtc_interface);
}
#else
static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
static inline void alarmtimer_rtc_interface_remove(void) { }
static inline void alarmtimer_rtc_timer_init(void) { }
#endif
/**
* alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
* @base: pointer to the base where the timer is being run
* @alarm: pointer to alarm being enqueued.
*
* Adds alarm to a alarm_base timerqueue
*
* Must hold base->lock when calling.
*/
static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
{
if (alarm->state & ALARMTIMER_STATE_ENQUEUED)
timerqueue_del(&base->timerqueue, &alarm->node);
timerqueue_add(&base->timerqueue, &alarm->node);
alarm->state |= ALARMTIMER_STATE_ENQUEUED;
}
/**
* alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue
* @base: pointer to the base where the timer is running
* @alarm: pointer to alarm being removed
*
* Removes alarm to a alarm_base timerqueue
*
* Must hold base->lock when calling.
*/
static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm)
{
if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED))
return;
timerqueue_del(&base->timerqueue, &alarm->node);
alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
}
/**
* alarmtimer_fired - Handles alarm hrtimer being fired.
* @timer: pointer to hrtimer being run
*
* When a alarm timer fires, this runs through the timerqueue to
* see which alarms expired, and runs those. If there are more alarm
* timers queued for the future, we set the hrtimer to fire when
* the next future alarm timer expires.
*/
static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
{
struct alarm *alarm = container_of(timer, struct alarm, timer);
struct alarm_base *base = &alarm_bases[alarm->type];
unsigned long flags;
int ret = HRTIMER_NORESTART;
int restart = ALARMTIMER_NORESTART;
spin_lock_irqsave(&base->lock, flags);
alarmtimer_dequeue(base, alarm);
spin_unlock_irqrestore(&base->lock, flags);
if (alarm->function)
restart = alarm->function(alarm, base->get_ktime());
spin_lock_irqsave(&base->lock, flags);
if (restart != ALARMTIMER_NORESTART) {
hrtimer_set_expires(&alarm->timer, alarm->node.expires);
alarmtimer_enqueue(base, alarm);
ret = HRTIMER_RESTART;
}
spin_unlock_irqrestore(&base->lock, flags);
trace_alarmtimer_fired(alarm, base->get_ktime());
return ret;
}
ktime_t alarm_expires_remaining(const struct alarm *alarm)
{
struct alarm_base *base = &alarm_bases[alarm->type];
return ktime_sub(alarm->node.expires, base->get_ktime());
}
EXPORT_SYMBOL_GPL(alarm_expires_remaining);
#ifdef CONFIG_RTC_CLASS
/**
* alarmtimer_suspend - Suspend time callback
* @dev: unused
*
* When we are going into suspend, we look through the bases
* to see which is the soonest timer to expire. We then
* set an rtc timer to fire that far into the future, which
* will wake us from suspend.
*/
static int alarmtimer_suspend(struct device *dev)
{
ktime_t min, now, expires;
int i, ret, type;
struct rtc_device *rtc;
unsigned long flags;
struct rtc_time tm;
spin_lock_irqsave(&freezer_delta_lock, flags);
min = freezer_delta;
expires = freezer_expires;
type = freezer_alarmtype;
freezer_delta = 0;
spin_unlock_irqrestore(&freezer_delta_lock, flags);
rtc = alarmtimer_get_rtcdev();
/* If we have no rtcdev, just return */
if (!rtc)
return 0;
/* Find the soonest timer to expire*/
for (i = 0; i < ALARM_NUMTYPE; i++) {
struct alarm_base *base = &alarm_bases[i];
struct timerqueue_node *next;
ktime_t delta;
spin_lock_irqsave(&base->lock, flags);
next = timerqueue_getnext(&base->timerqueue);
spin_unlock_irqrestore(&base->lock, flags);
if (!next)
continue;
delta = ktime_sub(next->expires, base->get_ktime());
if (!min || (delta < min)) {
expires = next->expires;
min = delta;
type = i;
}
}
if (min == 0)
return 0;
if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
pm_wakeup_event(dev, 2 * MSEC_PER_SEC);
return -EBUSY;
}
trace_alarmtimer_suspend(expires, type);
/* Setup an rtc timer to fire that far in the future */
rtc_timer_cancel(rtc, &rtctimer);
rtc_read_time(rtc, &tm);
now = rtc_tm_to_ktime(tm);
now = ktime_add(now, min);
/* Set alarm, if in the past reject suspend briefly to handle */
ret = rtc_timer_start(rtc, &rtctimer, now, 0);
if (ret < 0)
pm_wakeup_event(dev, MSEC_PER_SEC);
return ret;
}
static int alarmtimer_resume(struct device *dev)
{
struct rtc_device *rtc;
rtc = alarmtimer_get_rtcdev();
if (rtc)
rtc_timer_cancel(rtc, &rtctimer);
return 0;
}
#else
static int alarmtimer_suspend(struct device *dev)
{
return 0;
}
static int alarmtimer_resume(struct device *dev)
{
return 0;
}
#endif
static void
__alarm_init(struct alarm *alarm, enum alarmtimer_type type,
enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
{
timerqueue_init(&alarm->node);
alarm->timer.function = alarmtimer_fired;
alarm->function = function;
alarm->type = type;
alarm->state = ALARMTIMER_STATE_INACTIVE;
}
/**
* alarm_init - Initialize an alarm structure
* @alarm: ptr to alarm to be initialized
* @type: the type of the alarm
* @function: callback that is run when the alarm fires
*/
void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
{
hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid,
HRTIMER_MODE_ABS);
__alarm_init(alarm, type, function);
}
EXPORT_SYMBOL_GPL(alarm_init);
/**
* alarm_start - Sets an absolute alarm to fire
* @alarm: ptr to alarm to set
* @start: time to run the alarm
*/
void alarm_start(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
unsigned long flags;
spin_lock_irqsave(&base->lock, flags);
alarm->node.expires = start;
alarmtimer_enqueue(base, alarm);
hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
spin_unlock_irqrestore(&base->lock, flags);
trace_alarmtimer_start(alarm, base->get_ktime());
}
EXPORT_SYMBOL_GPL(alarm_start);
/**
* alarm_start_relative - Sets a relative alarm to fire
* @alarm: ptr to alarm to set
* @start: time relative to now to run the alarm
*/
void alarm_start_relative(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
start = ktime_add_safe(start, base->get_ktime());
alarm_start(alarm, start);
}
EXPORT_SYMBOL_GPL(alarm_start_relative);
void alarm_restart(struct alarm *alarm)
{
struct alarm_base *base = &alarm_bases[alarm->type];
unsigned long flags;
spin_lock_irqsave(&base->lock, flags);
hrtimer_set_expires(&alarm->timer, alarm->node.expires);
hrtimer_restart(&alarm->timer);
alarmtimer_enqueue(base, alarm);
spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(alarm_restart);
/**
* alarm_try_to_cancel - Tries to cancel an alarm timer
* @alarm: ptr to alarm to be canceled
*
* Returns 1 if the timer was canceled, 0 if it was not running,
* and -1 if the callback was running
*/
int alarm_try_to_cancel(struct alarm *alarm)
{
struct alarm_base *base = &alarm_bases[alarm->type];
unsigned long flags;
int ret;
spin_lock_irqsave(&base->lock, flags);
ret = hrtimer_try_to_cancel(&alarm->timer);
if (ret >= 0)
alarmtimer_dequeue(base, alarm);
spin_unlock_irqrestore(&base->lock, flags);
trace_alarmtimer_cancel(alarm, base->get_ktime());
return ret;
}
EXPORT_SYMBOL_GPL(alarm_try_to_cancel);
/**
* alarm_cancel - Spins trying to cancel an alarm timer until it is done
* @alarm: ptr to alarm to be canceled
*
* Returns 1 if the timer was canceled, 0 if it was not active.
*/
int alarm_cancel(struct alarm *alarm)
{
for (;;) {
int ret = alarm_try_to_cancel(alarm);
if (ret >= 0)
return ret;
hrtimer_cancel_wait_running(&alarm->timer);
}
}
EXPORT_SYMBOL_GPL(alarm_cancel);
u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
{
u64 overrun = 1;
ktime_t delta;
delta = ktime_sub(now, alarm->node.expires);
if (delta < 0)
return 0;
if (unlikely(delta >= interval)) {
s64 incr = ktime_to_ns(interval);
overrun = ktime_divns(delta, incr);
alarm->node.expires = ktime_add_ns(alarm->node.expires,
incr*overrun);
if (alarm->node.expires > now)
return overrun;
/*
* This (and the ktime_add() below) is the
* correction for exact:
*/
overrun++;
}
alarm->node.expires = ktime_add_safe(alarm->node.expires, interval);
return overrun;
}
EXPORT_SYMBOL_GPL(alarm_forward);
u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
{
struct alarm_base *base = &alarm_bases[alarm->type];
return alarm_forward(alarm, base->get_ktime(), interval);
}
EXPORT_SYMBOL_GPL(alarm_forward_now);
#ifdef CONFIG_POSIX_TIMERS
static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
{
struct alarm_base *base;
unsigned long flags;
ktime_t delta;
switch(type) {
case ALARM_REALTIME:
base = &alarm_bases[ALARM_REALTIME];
type = ALARM_REALTIME_FREEZER;
break;
case ALARM_BOOTTIME:
base = &alarm_bases[ALARM_BOOTTIME];
type = ALARM_BOOTTIME_FREEZER;
break;
default:
WARN_ONCE(1, "Invalid alarm type: %d\n", type);
return;
}
delta = ktime_sub(absexp, base->get_ktime());
spin_lock_irqsave(&freezer_delta_lock, flags);
if (!freezer_delta || (delta < freezer_delta)) {
freezer_delta = delta;
freezer_expires = absexp;
freezer_alarmtype = type;
}
spin_unlock_irqrestore(&freezer_delta_lock, flags);
}
/**
* clock2alarm - helper that converts from clockid to alarmtypes
* @clockid: clockid.
*/
static enum alarmtimer_type clock2alarm(clockid_t clockid)
{
if (clockid == CLOCK_REALTIME_ALARM)
return ALARM_REALTIME;
if (clockid == CLOCK_BOOTTIME_ALARM)
return ALARM_BOOTTIME;
return -1;
}
/**
* alarm_handle_timer - Callback for posix timers
* @alarm: alarm that fired
* @now: time at the timer expiration
*
* Posix timer callback for expired alarm timers.
*
* Return: whether the timer is to be restarted
*/
static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
ktime_t now)
{
struct k_itimer *ptr = container_of(alarm, struct k_itimer,
it.alarm.alarmtimer);
enum alarmtimer_restart result = ALARMTIMER_NORESTART;
unsigned long flags;
int si_private = 0;
spin_lock_irqsave(&ptr->it_lock, flags);
ptr->it_active = 0;
if (ptr->it_interval)
si_private = ++ptr->it_requeue_pending;
if (posix_timer_event(ptr, si_private) && ptr->it_interval) {
/*
* Handle ignored signals and rearm the timer. This will go
* away once we handle ignored signals proper.
*/
ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval);
++ptr->it_requeue_pending;
ptr->it_active = 1;
result = ALARMTIMER_RESTART;
}
spin_unlock_irqrestore(&ptr->it_lock, flags);
return result;
}
/**
* alarm_timer_rearm - Posix timer callback for rearming timer
* @timr: Pointer to the posixtimer data struct
*/
static void alarm_timer_rearm(struct k_itimer *timr)
{
struct alarm *alarm = &timr->it.alarm.alarmtimer;
timr->it_overrun += alarm_forward_now(alarm, timr->it_interval);
alarm_start(alarm, alarm->node.expires);
}
/**
* alarm_timer_forward - Posix timer callback for forwarding timer
* @timr: Pointer to the posixtimer data struct
* @now: Current time to forward the timer against
*/
static s64 alarm_timer_forward(struct k_itimer *timr, ktime_t now)
{
struct alarm *alarm = &timr->it.alarm.alarmtimer;
return alarm_forward(alarm, timr->it_interval, now);
}
/**
* alarm_timer_remaining - Posix timer callback to retrieve remaining time
* @timr: Pointer to the posixtimer data struct
* @now: Current time to calculate against
*/
static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now)
{
struct alarm *alarm = &timr->it.alarm.alarmtimer;
return ktime_sub(alarm->node.expires, now);
}
/**
* alarm_timer_try_to_cancel - Posix timer callback to cancel a timer
* @timr: Pointer to the posixtimer data struct
*/
static int alarm_timer_try_to_cancel(struct k_itimer *timr)
{
return alarm_try_to_cancel(&timr->it.alarm.alarmtimer);
}
/**
* alarm_timer_wait_running - Posix timer callback to wait for a timer
* @timr: Pointer to the posixtimer data struct
*
* Called from the core code when timer cancel detected that the callback
* is running. @timr is unlocked and rcu read lock is held to prevent it
* from being freed.
*/
static void alarm_timer_wait_running(struct k_itimer *timr)
{
hrtimer_cancel_wait_running(&timr->it.alarm.alarmtimer.timer);
}
/**
* alarm_timer_arm - Posix timer callback to arm a timer
* @timr: Pointer to the posixtimer data struct
* @expires: The new expiry time
* @absolute: Expiry value is absolute time
* @sigev_none: Posix timer does not deliver signals
*/
static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires,
bool absolute, bool sigev_none)
{
struct alarm *alarm = &timr->it.alarm.alarmtimer;
struct alarm_base *base = &alarm_bases[alarm->type];
if (!absolute)
expires = ktime_add_safe(expires, base->get_ktime());
if (sigev_none)
alarm->node.expires = expires;
else
alarm_start(&timr->it.alarm.alarmtimer, expires);
}
/**
* alarm_clock_getres - posix getres interface
* @which_clock: clockid
* @tp: timespec to fill
*
* Returns the granularity of underlying alarm base clock
*/
static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
{
if (!alarmtimer_get_rtcdev())
return -EINVAL;
tp->tv_sec = 0;
tp->tv_nsec = hrtimer_resolution;
return 0;
}
/**
* alarm_clock_get_timespec - posix clock_get_timespec interface
* @which_clock: clockid
* @tp: timespec to fill.
*
* Provides the underlying alarm base time in a tasks time namespace.
*/
static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp)
{
struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
if (!alarmtimer_get_rtcdev())
return -EINVAL;
base->get_timespec(tp);
return 0;
}
/**
* alarm_clock_get_ktime - posix clock_get_ktime interface
* @which_clock: clockid
*
* Provides the underlying alarm base time in the root namespace.
*/
static ktime_t alarm_clock_get_ktime(clockid_t which_clock)
{
struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
if (!alarmtimer_get_rtcdev())
return -EINVAL;
return base->get_ktime();
}
/**
* alarm_timer_create - posix timer_create interface
* @new_timer: k_itimer pointer to manage
*
* Initializes the k_itimer structure.
*/
static int alarm_timer_create(struct k_itimer *new_timer)
{
enum alarmtimer_type type;
if (!alarmtimer_get_rtcdev())
return -EOPNOTSUPP;
if (!capable(CAP_WAKE_ALARM))
return -EPERM;
type = clock2alarm(new_timer->it_clock);
alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer);
return 0;
}
/**
* alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep
* @alarm: ptr to alarm that fired
* @now: time at the timer expiration
*
* Wakes up the task that set the alarmtimer
*
* Return: ALARMTIMER_NORESTART
*/
static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
ktime_t now)
{
struct task_struct *task = (struct task_struct *)alarm->data;
alarm->data = NULL;
if (task)
wake_up_process(task);
return ALARMTIMER_NORESTART;
}
/**
* alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation
* @alarm: ptr to alarmtimer
* @absexp: absolute expiration time
* @type: alarm type (BOOTTIME/REALTIME).
*
* Sets the alarm timer and sleeps until it is fired or interrupted.
*/
static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
enum alarmtimer_type type)
{
struct restart_block *restart;
alarm->data = (void *)current;
do {
set_current_state(TASK_INTERRUPTIBLE);
alarm_start(alarm, absexp);
if (likely(alarm->data))
schedule();
alarm_cancel(alarm);
} while (alarm->data && !signal_pending(current));
__set_current_state(TASK_RUNNING);
destroy_hrtimer_on_stack(&alarm->timer);
if (!alarm->data)
return 0;
if (freezing(current))
alarmtimer_freezerset(absexp, type);
restart = &current->restart_block;
if (restart->nanosleep.type != TT_NONE) {
struct timespec64 rmt;
ktime_t rem;
rem = ktime_sub(absexp, alarm_bases[type].get_ktime());
if (rem <= 0)
return 0;
rmt = ktime_to_timespec64(rem);
return nanosleep_copyout(restart, &rmt);
}
return -ERESTART_RESTARTBLOCK;
}
static void
alarm_init_on_stack(struct alarm *alarm, enum alarmtimer_type type,
enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
{
hrtimer_init_on_stack(&alarm->timer, alarm_bases[type].base_clockid,
HRTIMER_MODE_ABS);
__alarm_init(alarm, type, function);
}
/**
* alarm_timer_nsleep_restart - restartblock alarmtimer nsleep
* @restart: ptr to restart block
*
* Handles restarted clock_nanosleep calls
*/
static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
{
enum alarmtimer_type type = restart->nanosleep.clockid;
ktime_t exp = restart->nanosleep.expires;
struct alarm alarm;
alarm_init_on_stack(&alarm, type, alarmtimer_nsleep_wakeup);
return alarmtimer_do_nsleep(&alarm, exp, type);
}
/**
* alarm_timer_nsleep - alarmtimer nanosleep
* @which_clock: clockid
* @flags: determines abstime or relative
* @tsreq: requested sleep time (abs or rel)
*
* Handles clock_nanosleep calls against _ALARM clockids
*/
static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
const struct timespec64 *tsreq)
{
enum alarmtimer_type type = clock2alarm(which_clock);
struct restart_block *restart = &current->restart_block;
struct alarm alarm;
ktime_t exp;
int ret = 0;
if (!alarmtimer_get_rtcdev())
return -EOPNOTSUPP;
if (flags & ~TIMER_ABSTIME)
return -EINVAL;
if (!capable(CAP_WAKE_ALARM))
return -EPERM;
alarm_init_on_stack(&alarm, type, alarmtimer_nsleep_wakeup);
exp = timespec64_to_ktime(*tsreq);
/* Convert (if necessary) to absolute time */
if (flags != TIMER_ABSTIME) {
ktime_t now = alarm_bases[type].get_ktime();
exp = ktime_add_safe(now, exp);
} else {
exp = timens_ktime_to_host(which_clock, exp);
}
ret = alarmtimer_do_nsleep(&alarm, exp, type);
if (ret != -ERESTART_RESTARTBLOCK)
return ret;
/* abs timers don't set remaining time or restart */
if (flags == TIMER_ABSTIME)
return -ERESTARTNOHAND;
restart->nanosleep.clockid = type;
restart->nanosleep.expires = exp;
set_restart_fn(restart, alarm_timer_nsleep_restart);
return ret;
}
const struct k_clock alarm_clock = {
.clock_getres = alarm_clock_getres,
.clock_get_ktime = alarm_clock_get_ktime,
.clock_get_timespec = alarm_clock_get_timespec,
.timer_create = alarm_timer_create,
.timer_set = common_timer_set,
.timer_del = common_timer_del,
.timer_get = common_timer_get,
.timer_arm = alarm_timer_arm,
.timer_rearm = alarm_timer_rearm,
.timer_forward = alarm_timer_forward,
.timer_remaining = alarm_timer_remaining,
.timer_try_to_cancel = alarm_timer_try_to_cancel,
.timer_wait_running = alarm_timer_wait_running,
.nsleep = alarm_timer_nsleep,
};
#endif /* CONFIG_POSIX_TIMERS */
/* Suspend hook structures */
static const struct dev_pm_ops alarmtimer_pm_ops = {
.suspend = alarmtimer_suspend,
.resume = alarmtimer_resume,
};
static struct platform_driver alarmtimer_driver = {
.driver = {
.name = "alarmtimer",
.pm = &alarmtimer_pm_ops,
}
};
static void get_boottime_timespec(struct timespec64 *tp)
{
ktime_get_boottime_ts64(tp);
timens_add_boottime(tp);
}
/**
* alarmtimer_init - Initialize alarm timer code
*
* This function initializes the alarm bases and registers
* the posix clock ids.
*/
static int __init alarmtimer_init(void)
{
int error;
int i;
alarmtimer_rtc_timer_init();
/* Initialize alarm bases */
alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
alarm_bases[ALARM_REALTIME].get_ktime = &ktime_get_real;
alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64;
alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime;
alarm_bases[ALARM_BOOTTIME].get_timespec = get_boottime_timespec;
for (i = 0; i < ALARM_NUMTYPE; i++) {
timerqueue_init_head(&alarm_bases[i].timerqueue);
spin_lock_init(&alarm_bases[i].lock);
}
error = alarmtimer_rtc_interface_setup();
if (error)
return error;
error = platform_driver_register(&alarmtimer_driver);
if (error)
goto out_if;
return 0;
out_if:
alarmtimer_rtc_interface_remove();
return error;
}
device_initcall(alarmtimer_init);
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016,2017 Facebook
*/
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
#include <linux/perf_event.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate_trace.h>
#include "map_in_map.h"
#define ARRAY_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \
BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP)
static void bpf_array_free_percpu(struct bpf_array *array)
{
int i;
for (i = 0; i < array->map.max_entries; i++) {
free_percpu(array->pptrs[i]);
cond_resched();
}
}
static int bpf_array_alloc_percpu(struct bpf_array *array)
{
void __percpu *ptr;
int i;
for (i = 0; i < array->map.max_entries; i++) {
ptr = bpf_map_alloc_percpu(&array->map, array->elem_size, 8,
GFP_USER | __GFP_NOWARN);
if (!ptr) {
bpf_array_free_percpu(array);
return -ENOMEM;
}
array->pptrs[i] = ptr;
cond_resched();
}
return 0;
}
/* Called from syscall */
int array_map_alloc_check(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int numa_node = bpf_map_attr_numa_node(attr);
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
attr->value_size == 0 ||
attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
!bpf_map_flags_access_ok(attr->map_flags) ||
(percpu && numa_node != NUMA_NO_NODE))
return -EINVAL;
if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP))
return -EINVAL;
if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
attr->map_flags & BPF_F_PRESERVE_ELEMS)
return -EINVAL;
if (attr->value_size > KMALLOC_MAX_SIZE)
/* if value_size is bigger, the user space won't be able to
* access the elements.
*/
return -E2BIG;
return 0;
}
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int numa_node = bpf_map_attr_numa_node(attr);
u32 elem_size, index_mask, max_entries;
bool bypass_spec_v1 = bpf_bypass_spec_v1();
u64 array_size, mask64;
struct bpf_array *array;
elem_size = round_up(attr->value_size, 8);
max_entries = attr->max_entries;
/* On 32 bit archs roundup_pow_of_two() with max_entries that has
* upper most bit set in u32 space is undefined behavior due to
* resulting 1U << 32, so do it manually here in u64 space.
*/
mask64 = fls_long(max_entries - 1);
mask64 = 1ULL << mask64;
mask64 -= 1;
index_mask = mask64;
if (!bypass_spec_v1) {
/* round up array size to nearest power of 2,
* since cpu will speculate within index_mask limits
*/
max_entries = index_mask + 1;
/* Check for overflows. */
if (max_entries < attr->max_entries)
return ERR_PTR(-E2BIG);
}
array_size = sizeof(*array);
if (percpu) {
array_size += (u64) max_entries * sizeof(void *);
} else {
/* rely on vmalloc() to return page-aligned memory and
* ensure array->value is exactly page-aligned
*/
if (attr->map_flags & BPF_F_MMAPABLE) {
array_size = PAGE_ALIGN(array_size);
array_size += PAGE_ALIGN((u64) max_entries * elem_size);
} else {
array_size += (u64) max_entries * elem_size;
}
}
/* allocate all map elements and zero-initialize them */
if (attr->map_flags & BPF_F_MMAPABLE) {
void *data;
/* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */
data = bpf_map_area_mmapable_alloc(array_size, numa_node);
if (!data)
return ERR_PTR(-ENOMEM);
array = data + PAGE_ALIGN(sizeof(struct bpf_array))
- offsetof(struct bpf_array, value);
} else {
array = bpf_map_area_alloc(array_size, numa_node);
}
if (!array)
return ERR_PTR(-ENOMEM);
array->index_mask = index_mask;
array->map.bypass_spec_v1 = bypass_spec_v1;
/* copy mandatory map attributes */
bpf_map_init_from_attr(&array->map, attr);
array->elem_size = elem_size;
if (percpu && bpf_array_alloc_percpu(array)) {
bpf_map_area_free(array);
return ERR_PTR(-ENOMEM);
}
return &array->map;
}
/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
if (unlikely(index >= array->map.max_entries))
return NULL;
return array->value + array->elem_size * (index & array->index_mask);
}
static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
u32 off)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
if (map->max_entries != 1)
return -ENOTSUPP;
if (off >= map->value_size)
return -EINVAL;
*imm = (unsigned long)array->value;
return 0;
}
static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
u32 *off)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u64 base = (unsigned long)array->value;
u64 range = array->elem_size;
if (map->max_entries != 1)
return -ENOTSUPP;
if (imm < base || imm >= base + range)
return -ENOENT;
*off = imm - base;
return 0;
}
/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_insn *insn = insn_buf;
u32 elem_size = round_up(map->value_size, 8);
const int ret = BPF_REG_0;
const int map_ptr = BPF_REG_1;
const int index = BPF_REG_2;
if (map->map_flags & BPF_F_INNER_MAP)
return -EOPNOTSUPP;
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
if (!map->bypass_spec_v1) {
*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
} else {
*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
}
if (is_power_of_2(elem_size)) {
*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
} else {
*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
}
*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
*insn++ = BPF_MOV64_IMM(ret, 0);
return insn - insn_buf;
}
/* Called from eBPF program */
static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
if (unlikely(index >= array->map.max_entries))
return NULL;
return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
void __percpu *pptr;
int cpu, off = 0;
u32 size;
if (unlikely(index >= array->map.max_entries))
return -ENOENT;
/* per_cpu areas are zero-filled and bpf programs can only
* access 'value_size' of them, so copying rounded areas
* will not leak any kernel data
*/
size = round_up(map->value_size, 8);
rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
off += size;
}
rcu_read_unlock();
return 0;
}
/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = key ? *(u32 *)key : U32_MAX;
u32 *next = (u32 *)next_key;
if (index >= array->map.max_entries) {
*next = 0;
return 0;
}
if (index == array->map.max_entries - 1)
return -ENOENT;
*next = index + 1;
return 0;
}
static void check_and_free_timer_in_array(struct bpf_array *arr, void *val)
{
if (unlikely(map_value_has_timer(&arr->map)))
bpf_timer_cancel_and_free(val + arr->map.timer_off);
}
/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
char *val;
if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
/* unknown flags */
return -EINVAL;
if (unlikely(index >= array->map.max_entries))
/* all elements were pre-allocated, cannot insert a new one */
return -E2BIG;
if (unlikely(map_flags & BPF_NOEXIST))
/* all elements already exist */
return -EEXIST;
if (unlikely((map_flags & BPF_F_LOCK) &&
!map_value_has_spin_lock(map)))
return -EINVAL;
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
value, map->value_size);
} else {
val = array->value +
array->elem_size * (index & array->index_mask);
if (map_flags & BPF_F_LOCK)
copy_map_value_locked(map, val, value, false);
else
copy_map_value(map, val, value);
check_and_free_timer_in_array(array, val);
}
return 0;
}
int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
void __percpu *pptr;
int cpu, off = 0;
u32 size;
if (unlikely(map_flags > BPF_EXIST))
/* unknown flags */
return -EINVAL;
if (unlikely(index >= array->map.max_entries))
/* all elements were pre-allocated, cannot insert a new one */
return -E2BIG;
if (unlikely(map_flags == BPF_NOEXIST))
/* all elements already exist */
return -EEXIST;
/* the user space will provide round_up(value_size, 8) bytes that
* will be copied into per-cpu area. bpf programs can only access
* value_size of it. During lookup the same extra bytes will be
* returned or zeros which were zero-filled by percpu_alloc,
* so no kernel data leaks possible
*/
size = round_up(map->value_size, 8);
rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
off += size;
}
rcu_read_unlock();
return 0;
}
/* Called from syscall or from eBPF program */
static int array_map_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
static void *array_map_vmalloc_addr(struct bpf_array *array)
{
return (void *)round_down((unsigned long)array, PAGE_SIZE);
}
static void array_map_free_timers(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
if (likely(!map_value_has_timer(map)))
return;
for (i = 0; i < array->map.max_entries; i++)
bpf_timer_cancel_and_free(array->value + array->elem_size * i +
map->timer_off);
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
bpf_array_free_percpu(array);
if (array->map.map_flags & BPF_F_MMAPABLE)
bpf_map_area_free(array_map_vmalloc_addr(array));
else
bpf_map_area_free(array);
}
static void array_map_seq_show_elem(struct bpf_map *map, void *key,
struct seq_file *m)
{
void *value;
rcu_read_lock();
value = array_map_lookup_elem(map, key);
if (!value) {
rcu_read_unlock();
return;
}
if (map->btf_key_type_id)
seq_printf(m, "%u: ", *(u32 *)key);
btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
seq_puts(m, "\n");
rcu_read_unlock();
}
static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
struct seq_file *m)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
void __percpu *pptr;
int cpu;
rcu_read_lock();
seq_printf(m, "%u: {\n", *(u32 *)key);
pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
seq_printf(m, "\tcpu%d: ", cpu);
btf_type_seq_show(map->btf, map->btf_value_type_id,
per_cpu_ptr(pptr, cpu), m);
seq_puts(m, "\n");
}
seq_puts(m, "}\n");
rcu_read_unlock();
}
static int array_map_check_btf(const struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
{
u32 int_data;
/* One exception for keyless BTF: .bss/.data/.rodata map */
if (btf_type_is_void(key_type)) {
if (map->map_type != BPF_MAP_TYPE_ARRAY ||
map->max_entries != 1)
return -EINVAL;
if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC)
return -EINVAL;
return 0;
}
if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
return -EINVAL;
int_data = *(u32 *)(key_type + 1);
/* bpf array can only take a u32 key. This check makes sure
* that the btf matches the attr used during map_create.
*/
if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
return -EINVAL;
return 0;
}
static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT;
if (!(map->map_flags & BPF_F_MMAPABLE))
return -EINVAL;
if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) >
PAGE_ALIGN((u64)array->map.max_entries * array->elem_size))
return -EINVAL;
return remap_vmalloc_range(vma, array_map_vmalloc_addr(array),
vma->vm_pgoff + pgoff);
}
static bool array_map_meta_equal(const struct bpf_map *meta0,
const struct bpf_map *meta1)
{
if (!bpf_map_meta_equal(meta0, meta1))
return false;
return meta0->map_flags & BPF_F_INNER_MAP ? true :
meta0->max_entries == meta1->max_entries;
}
struct bpf_iter_seq_array_map_info {
struct bpf_map *map;
void *percpu_value_buf;
u32 index;
};
static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos)
{
struct bpf_iter_seq_array_map_info *info = seq->private;
struct bpf_map *map = info->map;
struct bpf_array *array;
u32 index;
if (info->index >= map->max_entries)
return NULL;
if (*pos == 0)
++*pos;
array = container_of(map, struct bpf_array, map);
index = info->index & array->index_mask;
if (info->percpu_value_buf)
return array->pptrs[index];
return array->value + array->elem_size * index;
}
static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct bpf_iter_seq_array_map_info *info = seq->private;
struct bpf_map *map = info->map;
struct bpf_array *array;
u32 index;
++*pos;
++info->index;
if (info->index >= map->max_entries)
return NULL;
array = container_of(map, struct bpf_array, map);
index = info->index & array->index_mask;
if (info->percpu_value_buf)
return array->pptrs[index];
return array->value + array->elem_size * index;
}
static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
{
struct bpf_iter_seq_array_map_info *info = seq->private;
struct bpf_iter__bpf_map_elem ctx = {};
struct bpf_map *map = info->map;
struct bpf_iter_meta meta;
struct bpf_prog *prog;
int off = 0, cpu = 0;
void __percpu **pptr;
u32 size;
meta.seq = seq;
prog = bpf_iter_get_info(&meta, v == NULL);
if (!prog)
return 0;
ctx.meta = &meta;
ctx.map = info->map;
if (v) {
ctx.key = &info->index;
if (!info->percpu_value_buf) {
ctx.value = v;
} else {
pptr = v;
size = round_up(map->value_size, 8);
for_each_possible_cpu(cpu) {
bpf_long_memcpy(info->percpu_value_buf + off,
per_cpu_ptr(pptr, cpu),
size);
off += size;
}
ctx.value = info->percpu_value_buf;
}
}
return bpf_iter_run_prog(prog, &ctx);
}
static int bpf_array_map_seq_show(struct seq_file *seq, void *v)
{
return __bpf_array_map_seq_show(seq, v);
}
static void bpf_array_map_seq_stop(struct seq_file *seq, void *v)
{
if (!v)
(void)__bpf_array_map_seq_show(seq, NULL);
}
static int bpf_iter_init_array_map(void *priv_data,
struct bpf_iter_aux_info *aux)
{
struct bpf_iter_seq_array_map_info *seq_info = priv_data;
struct bpf_map *map = aux->map;
void *value_buf;
u32 buf_size;
if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
buf_size = round_up(map->value_size, 8) * num_possible_cpus();
value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
if (!value_buf)
return -ENOMEM;
seq_info->percpu_value_buf = value_buf;
}
seq_info->map = map;
return 0;
}
static void bpf_iter_fini_array_map(void *priv_data)
{
struct bpf_iter_seq_array_map_info *seq_info = priv_data;
kfree(seq_info->percpu_value_buf);
}
static const struct seq_operations bpf_array_map_seq_ops = {
.start = bpf_array_map_seq_start,
.next = bpf_array_map_seq_next,
.stop = bpf_array_map_seq_stop,
.show = bpf_array_map_seq_show,
};
static const struct bpf_iter_seq_info iter_seq_info = {
.seq_ops = &bpf_array_map_seq_ops,
.init_seq_private = bpf_iter_init_array_map,
.fini_seq_private = bpf_iter_fini_array_map,
.seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info),
};
static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
void *callback_ctx, u64 flags)
{
u32 i, key, num_elems = 0;
struct bpf_array *array;
bool is_percpu;
u64 ret = 0;
void *val;
if (flags != 0)
return -EINVAL;
is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
array = container_of(map, struct bpf_array, map);
if (is_percpu)
migrate_disable();
for (i = 0; i < map->max_entries; i++) {
if (is_percpu)
val = this_cpu_ptr(array->pptrs[i]);
else
val = array->value + array->elem_size * i;
num_elems++;
key = i;
ret = callback_fn((u64)(long)map, (u64)(long)&key,
(u64)(long)val, (u64)(long)callback_ctx, 0);
/* return value: 0 - continue, 1 - stop and return */
if (ret)
break;
}
if (is_percpu)
migrate_enable();
return num_elems;
}
static int array_map_btf_id;
const struct bpf_map_ops array_map_ops = {
.map_meta_equal = array_map_meta_equal,
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_release_uref = array_map_free_timers,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
.map_gen_lookup = array_map_gen_lookup,
.map_direct_value_addr = array_map_direct_value_addr,
.map_direct_value_meta = array_map_direct_value_meta,
.map_mmap = array_map_mmap,
.map_seq_show_elem = array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
.map_lookup_batch = generic_map_lookup_batch,
.map_update_batch = generic_map_update_batch,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_array_elem,
.map_btf_name = "bpf_array",
.map_btf_id = &array_map_btf_id,
.iter_seq_info = &iter_seq_info,
};
static int percpu_array_map_btf_id;
const struct bpf_map_ops percpu_array_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = percpu_array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
.map_seq_show_elem = percpu_array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
.map_lookup_batch = generic_map_lookup_batch,
.map_update_batch = generic_map_update_batch,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_array_elem,
.map_btf_name = "bpf_array",
.map_btf_id = &percpu_array_map_btf_id,
.iter_seq_info = &iter_seq_info,
};
static int fd_array_map_alloc_check(union bpf_attr *attr)
{
/* only file descriptors can be stored in this type of map */
if (attr->value_size != sizeof(u32))
return -EINVAL;
/* Program read-only/write-only not supported for special maps yet. */
if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG))
return -EINVAL;
return array_map_alloc_check(attr);
}
static void fd_array_map_free(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
/* make sure it's empty */
for (i = 0; i < array->map.max_entries; i++)
BUG_ON(array->ptrs[i] != NULL);
bpf_map_area_free(array);
}
static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
{
return ERR_PTR(-EOPNOTSUPP);
}
/* only called from syscall */
int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
{
void **elem, *ptr;
int ret = 0;
if (!map->ops->map_fd_sys_lookup_elem)
return -ENOTSUPP;
rcu_read_lock();
elem = array_map_lookup_elem(map, key);
if (elem && (ptr = READ_ONCE(*elem)))
*value = map->ops->map_fd_sys_lookup_elem(ptr);
else
ret = -ENOENT;
rcu_read_unlock();
return ret;
}
/* only called from syscall */
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
void *key, void *value, u64 map_flags)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
void *new_ptr, *old_ptr;
u32 index = *(u32 *)key, ufd;
if (map_flags != BPF_ANY)
return -EINVAL;
if (index >= array->map.max_entries)
return -E2BIG;
ufd = *(u32 *)value;
new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
if (IS_ERR(new_ptr))
return PTR_ERR(new_ptr);
if (map->ops->map_poke_run) {
mutex_lock(&array->aux->poke_mutex);
old_ptr = xchg(array->ptrs + index, new_ptr);
map->ops->map_poke_run(map, index, old_ptr, new_ptr);
mutex_unlock(&array->aux->poke_mutex);
} else {
old_ptr = xchg(array->ptrs + index, new_ptr);
}
if (old_ptr)
map->ops->map_fd_put_ptr(old_ptr);
return 0;
}
static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
void *old_ptr;
u32 index = *(u32 *)key;
if (index >= array->map.max_entries)
return -E2BIG;
if (map->ops->map_poke_run) {
mutex_lock(&array->aux->poke_mutex);
old_ptr = xchg(array->ptrs + index, NULL);
map->ops->map_poke_run(map, index, old_ptr, NULL);
mutex_unlock(&array->aux->poke_mutex);
} else {
old_ptr = xchg(array->ptrs + index, NULL);
}
if (old_ptr) {
map->ops->map_fd_put_ptr(old_ptr);
return 0;
} else {
return -ENOENT;
}
}
static void *prog_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file, int fd)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog = bpf_prog_get(fd);
if (IS_ERR(prog))
return prog;
if (!bpf_prog_array_compatible(array, prog)) {
bpf_prog_put(prog);
return ERR_PTR(-EINVAL);
}
return prog;
}
static void prog_fd_array_put_ptr(void *ptr)
{
bpf_prog_put(ptr);
}
static u32 prog_fd_array_sys_lookup_elem(void *ptr)
{
return ((struct bpf_prog *)ptr)->aux->id;
}
/* decrement refcnt of all bpf_progs that are stored in this map */
static void bpf_fd_array_map_clear(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
for (i = 0; i < array->map.max_entries; i++)
fd_array_map_delete_elem(map, &i);
}
static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
struct seq_file *m)
{
void **elem, *ptr;
u32 prog_id;
rcu_read_lock();
elem = array_map_lookup_elem(map, key);
if (elem) {
ptr = READ_ONCE(*elem);
if (ptr) {
seq_printf(m, "%u: ", *(u32 *)key);
prog_id = prog_fd_array_sys_lookup_elem(ptr);
btf_type_seq_show(map->btf, map->btf_value_type_id,
&prog_id, m);
seq_puts(m, "\n");
}
}
rcu_read_unlock();
}
struct prog_poke_elem {
struct list_head list;
struct bpf_prog_aux *aux;
};
static int prog_array_map_poke_track(struct bpf_map *map,
struct bpf_prog_aux *prog_aux)
{
struct prog_poke_elem *elem;
struct bpf_array_aux *aux;
int ret = 0;
aux = container_of(map, struct bpf_array, map)->aux;
mutex_lock(&aux->poke_mutex);
list_for_each_entry(elem, &aux->poke_progs, list) {
if (elem->aux == prog_aux)
goto out;
}
elem = kmalloc(sizeof(*elem), GFP_KERNEL);
if (!elem) {
ret = -ENOMEM;
goto out;
}
INIT_LIST_HEAD(&elem->list);
/* We must track the program's aux info at this point in time
* since the program pointer itself may not be stable yet, see
* also comment in prog_array_map_poke_run().
*/
elem->aux = prog_aux;
list_add_tail(&elem->list, &aux->poke_progs);
out:
mutex_unlock(&aux->poke_mutex);
return ret;
}
static void prog_array_map_poke_untrack(struct bpf_map *map,
struct bpf_prog_aux *prog_aux)
{
struct prog_poke_elem *elem, *tmp;
struct bpf_array_aux *aux;
aux = container_of(map, struct bpf_array, map)->aux;
mutex_lock(&aux->poke_mutex);
list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
if (elem->aux == prog_aux) {
list_del_init(&elem->list);
kfree(elem);
break;
}
}
mutex_unlock(&aux->poke_mutex);
}
static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
struct bpf_prog *old,
struct bpf_prog *new)
{
u8 *old_addr, *new_addr, *old_bypass_addr;
struct prog_poke_elem *elem;
struct bpf_array_aux *aux;
aux = container_of(map, struct bpf_array, map)->aux;
WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex));
list_for_each_entry(elem, &aux->poke_progs, list) {
struct bpf_jit_poke_descriptor *poke;
int i, ret;
for (i = 0; i < elem->aux->size_poke_tab; i++) {
poke = &elem->aux->poke_tab[i];
/* Few things to be aware of:
*
* 1) We can only ever access aux in this context, but
* not aux->prog since it might not be stable yet and
* there could be danger of use after free otherwise.
* 2) Initially when we start tracking aux, the program
* is not JITed yet and also does not have a kallsyms
* entry. We skip these as poke->tailcall_target_stable
* is not active yet. The JIT will do the final fixup
* before setting it stable. The various
* poke->tailcall_target_stable are successively
* activated, so tail call updates can arrive from here
* while JIT is still finishing its final fixup for
* non-activated poke entries.
* 3) On program teardown, the program's kallsym entry gets
* removed out of RCU callback, but we can only untrack
* from sleepable context, therefore bpf_arch_text_poke()
* might not see that this is in BPF text section and
* bails out with -EINVAL. As these are unreachable since
* RCU grace period already passed, we simply skip them.
* 4) Also programs reaching refcount of zero while patching
* is in progress is okay since we're protected under
* poke_mutex and untrack the programs before the JIT
* buffer is freed. When we're still in the middle of
* patching and suddenly kallsyms entry of the program
* gets evicted, we just skip the rest which is fine due
* to point 3).
* 5) Any other error happening below from bpf_arch_text_poke()
* is a unexpected bug.
*/
if (!READ_ONCE(poke->tailcall_target_stable))
continue;
if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
continue;
if (poke->tail_call.map != map ||
poke->tail_call.key != key)
continue;
old_bypass_addr = old ? NULL : poke->bypass_addr;
old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;
if (new) {
ret = bpf_arch_text_poke(poke->tailcall_target,
BPF_MOD_JUMP,
old_addr, new_addr);
BUG_ON(ret < 0 && ret != -EINVAL);
if (!old) {
ret = bpf_arch_text_poke(poke->tailcall_bypass,
BPF_MOD_JUMP,
poke->bypass_addr,
NULL);
BUG_ON(ret < 0 && ret != -EINVAL);
}
} else {
ret = bpf_arch_text_poke(poke->tailcall_bypass,
BPF_MOD_JUMP,
old_bypass_addr,
poke->bypass_addr);
BUG_ON(ret < 0 && ret != -EINVAL);
/* let other CPUs finish the execution of program
* so that it will not possible to expose them
* to invalid nop, stack unwind, nop state
*/
if (!ret)
synchronize_rcu();
ret = bpf_arch_text_poke(poke->tailcall_target,
BPF_MOD_JUMP,
old_addr, NULL);
BUG_ON(ret < 0 && ret != -EINVAL);
}
}
}
}
static void prog_array_map_clear_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_array_aux,
work)->map;
bpf_fd_array_map_clear(map);
bpf_map_put(map);
}
static void prog_array_map_clear(struct bpf_map *map)
{
struct bpf_array_aux *aux = container_of(map, struct bpf_array,
map)->aux;
bpf_map_inc(map);
schedule_work(&aux->work);
}
static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
{
struct bpf_array_aux *aux;
struct bpf_map *map;
aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT);
if (!aux)
return ERR_PTR(-ENOMEM);
INIT_WORK(&aux->work, prog_array_map_clear_deferred);
INIT_LIST_HEAD(&aux->poke_progs);
mutex_init(&aux->poke_mutex);
spin_lock_init(&aux->owner.lock);
map = array_map_alloc(attr);
if (IS_ERR(map)) {
kfree(aux);
return map;
}
container_of(map, struct bpf_array, map)->aux = aux;
aux->map = map;
return map;
}
static void prog_array_map_free(struct bpf_map *map)
{
struct prog_poke_elem *elem, *tmp;
struct bpf_array_aux *aux;
aux = container_of(map, struct bpf_array, map)->aux;
list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
list_del_init(&elem->list);
kfree(elem);
}
kfree(aux);
fd_array_map_free(map);
}
/* prog_array->aux->{type,jited} is a runtime binding.
* Doing static check alone in the verifier is not enough.
* Thus, prog_array_map cannot be used as an inner_map
* and map_meta_equal is not implemented.
*/
static int prog_array_map_btf_id;
const struct bpf_map_ops prog_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = prog_array_map_alloc,
.map_free = prog_array_map_free,
.map_poke_track = prog_array_map_poke_track,
.map_poke_untrack = prog_array_map_poke_untrack,
.map_poke_run = prog_array_map_poke_run,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = prog_fd_array_get_ptr,
.map_fd_put_ptr = prog_fd_array_put_ptr,
.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
.map_release_uref = prog_array_map_clear,
.map_seq_show_elem = prog_array_map_seq_show_elem,
.map_btf_name = "bpf_array",
.map_btf_id = &prog_array_map_btf_id,
};
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
struct file *map_file)
{
struct bpf_event_entry *ee;
ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
if (ee) {
ee->event = perf_file->private_data;
ee->perf_file = perf_file;
ee->map_file = map_file;
}
return ee;
}
static void __bpf_event_entry_free(struct rcu_head *rcu)
{
struct bpf_event_entry *ee;
ee = container_of(rcu, struct bpf_event_entry, rcu);
fput(ee->perf_file);
kfree(ee);
}
static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
{
call_rcu(&ee->rcu, __bpf_event_entry_free);
}
static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file, int fd)
{
struct bpf_event_entry *ee;
struct perf_event *event;
struct file *perf_file;
u64 value;
perf_file = perf_event_get(fd);
if (IS_ERR(perf_file))
return perf_file;
ee = ERR_PTR(-EOPNOTSUPP);
event = perf_file->private_data;
if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
goto err_out;
ee = bpf_event_entry_gen(perf_file, map_file);
if (ee)
return ee;
ee = ERR_PTR(-ENOMEM);
err_out:
fput(perf_file);
return ee;
}
static void perf_event_fd_array_put_ptr(void *ptr)
{
bpf_event_entry_free_rcu(ptr);
}
static void perf_event_fd_array_release(struct bpf_map *map,
struct file *map_file)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_event_entry *ee;
int i;
if (map->map_flags & BPF_F_PRESERVE_ELEMS)
return;
rcu_read_lock();
for (i = 0; i < array->map.max_entries; i++) {
ee = READ_ONCE(array->ptrs[i]);
if (ee && ee->map_file == map_file)
fd_array_map_delete_elem(map, &i);
}
rcu_read_unlock();
}
static void perf_event_fd_array_map_free(struct bpf_map *map)
{
if (map->map_flags & BPF_F_PRESERVE_ELEMS)
bpf_fd_array_map_clear(map);
fd_array_map_free(map);
}
static int perf_event_array_map_btf_id;
const struct bpf_map_ops perf_event_array_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = perf_event_fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = perf_event_fd_array_get_ptr,
.map_fd_put_ptr = perf_event_fd_array_put_ptr,
.map_release = perf_event_fd_array_release,
.map_check_btf = map_check_no_btf,
.map_btf_name = "bpf_array",
.map_btf_id = &perf_event_array_map_btf_id,
};
#ifdef CONFIG_CGROUPS
static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file /* not used */,
int fd)
{
return cgroup_get_from_fd(fd);
}
static void cgroup_fd_array_put_ptr(void *ptr)
{
/* cgroup_put free cgrp after a rcu grace period */
cgroup_put(ptr);
}
static void cgroup_fd_array_free(struct bpf_map *map)
{
bpf_fd_array_map_clear(map);
fd_array_map_free(map);
}
static int cgroup_array_map_btf_id;
const struct bpf_map_ops cgroup_array_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = cgroup_fd_array_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = cgroup_fd_array_get_ptr,
.map_fd_put_ptr = cgroup_fd_array_put_ptr,
.map_check_btf = map_check_no_btf,
.map_btf_name = "bpf_array",
.map_btf_id = &cgroup_array_map_btf_id,
};
#endif
static struct bpf_map *array_of_map_alloc(union bpf_attr *attr)
{
struct bpf_map *map, *inner_map_meta;
inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
if (IS_ERR(inner_map_meta))
return inner_map_meta;
map = array_map_alloc(attr);
if (IS_ERR(map)) {
bpf_map_meta_free(inner_map_meta);
return map;
}
map->inner_map_meta = inner_map_meta;
return map;
}
static void array_of_map_free(struct bpf_map *map)
{
/* map->inner_map_meta is only accessed by syscall which
* is protected by fdget/fdput.
*/
bpf_map_meta_free(map->inner_map_meta);
bpf_fd_array_map_clear(map);
fd_array_map_free(map);
}
static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_map **inner_map = array_map_lookup_elem(map, key);
if (!inner_map)
return NULL;
return READ_ONCE(*inner_map);
}
static int array_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 elem_size = round_up(map->value_size, 8);
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
const int map_ptr = BPF_REG_1;
const int index = BPF_REG_2;
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
if (!map->bypass_spec_v1) {
*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
} else {
*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
}
if (is_power_of_2(elem_size))
*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
else
*insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size);
*insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr);
*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
*insn++ = BPF_MOV64_IMM(ret, 0);
return insn - insn_buf;
}
static int array_of_maps_map_btf_id;
const struct bpf_map_ops array_of_maps_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_of_map_alloc,
.map_free = array_of_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = array_of_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = bpf_map_fd_get_ptr,
.map_fd_put_ptr = bpf_map_fd_put_ptr,
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
.map_gen_lookup = array_of_map_gen_lookup,
.map_check_btf = map_check_no_btf,
.map_btf_name = "bpf_array",
.map_btf_id = &array_of_maps_map_btf_id,
};
// SPDX-License-Identifier: GPL-2.0-only
/*
* async.c: Asynchronous function calls for boot performance
*
* (C) Copyright 2009 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
*/
/*
Goals and Theory of Operation
The primary goal of this feature is to reduce the kernel boot time,
by doing various independent hardware delays and discovery operations
decoupled and not strictly serialized.
More specifically, the asynchronous function call concept allows
certain operations (primarily during system boot) to happen
asynchronously, out of order, while these operations still
have their externally visible parts happen sequentially and in-order.
(not unlike how out-of-order CPUs retire their instructions in order)
Key to the asynchronous function call implementation is the concept of
a "sequence cookie" (which, although it has an abstracted type, can be
thought of as a monotonically incrementing number).
The async core will assign each scheduled event such a sequence cookie and
pass this to the called functions.
The asynchronously called function should before doing a globally visible
operation, such as registering device numbers, call the
async_synchronize_cookie() function and pass in its own cookie. The
async_synchronize_cookie() function will make sure that all asynchronous
operations that were scheduled prior to the operation corresponding with the
cookie have completed.
Subsystem/driver initialization code that scheduled asynchronous probe
functions, but which shares global resources with other drivers/subsystems
that do not use the asynchronous call feature, need to do a full
synchronization with the async_synchronize_full() function, before returning
from their init function. This is to maintain strict ordering between the
asynchronous and synchronous parts of the kernel.
*/
#include <linux/async.h>
#include <linux/atomic.h>
#include <linux/ktime.h>
#include <linux/export.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include "workqueue_internal.h"
static async_cookie_t next_cookie = 1;
#define MAX_WORK 32768
#define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */
static LIST_HEAD(async_global_pending); /* pending from all registered doms */
static ASYNC_DOMAIN(async_dfl_domain);
static DEFINE_SPINLOCK(async_lock);
struct async_entry {
struct list_head domain_list;
struct list_head global_list;
struct work_struct work;
async_cookie_t cookie;
async_func_t func;
void *data;
struct async_domain *domain;
};
static DECLARE_WAIT_QUEUE_HEAD(async_done);
static atomic_t entry_count;
static long long microseconds_since(ktime_t start)
{
ktime_t now = ktime_get();
return ktime_to_ns(ktime_sub(now, start)) >> 10;
}
static async_cookie_t lowest_in_progress(struct async_domain *domain)
{
struct async_entry *first = NULL;
async_cookie_t ret = ASYNC_COOKIE_MAX;
unsigned long flags;
spin_lock_irqsave(&async_lock, flags);
if (domain) {
if (!list_empty(&domain->pending))
first = list_first_entry(&domain->pending,
struct async_entry, domain_list);
} else {
if (!list_empty(&async_global_pending))
first = list_first_entry(&async_global_pending,
struct async_entry, global_list);
}
if (first)
ret = first->cookie;
spin_unlock_irqrestore(&async_lock, flags);
return ret;
}
/*
* pick the first pending entry and run it
*/
static void async_run_entry_fn(struct work_struct *work)
{
struct async_entry *entry =
container_of(work, struct async_entry, work);
unsigned long flags;
ktime_t calltime;
/* 1) run (and print duration) */
pr_debug("calling %lli_%pS @ %i\n", (long long)entry->cookie,
entry->func, task_pid_nr(current));
calltime = ktime_get();
entry->func(entry->data, entry->cookie);
pr_debug("initcall %lli_%pS returned after %lld usecs\n",
(long long)entry->cookie, entry->func,
microseconds_since(calltime));
/* 2) remove self from the pending queues */
spin_lock_irqsave(&async_lock, flags);
list_del_init(&entry->domain_list);
list_del_init(&entry->global_list);
/* 3) free the entry */
kfree(entry);
atomic_dec(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
/* 4) wake up any waiters */
wake_up(&async_done);
}
/**
* async_schedule_node_domain - NUMA specific version of async_schedule_domain
* @func: function to execute asynchronously
* @data: data pointer to pass to the function
* @node: NUMA node that we want to schedule this on or close to
* @domain: the domain
*
* Returns an async_cookie_t that may be used for checkpointing later.
* @domain may be used in the async_synchronize_*_domain() functions to
* wait within a certain synchronization domain rather than globally.
*
* Note: This function may be called from atomic or non-atomic contexts.
*
* The node requested will be honored on a best effort basis. If the node
* has no CPUs associated with it then the work is distributed among all
* available CPUs.
*/
async_cookie_t async_schedule_node_domain(async_func_t func, void *data,
int node, struct async_domain *domain)
{
struct async_entry *entry;
unsigned long flags;
async_cookie_t newcookie;
/* allow irq-off callers */
entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
/*
* If we're out of memory or if there's too much work
* pending already, we execute synchronously.
*/
if (!entry || atomic_read(&entry_count) > MAX_WORK) {
kfree(entry);
spin_lock_irqsave(&async_lock, flags);
newcookie = next_cookie++;
spin_unlock_irqrestore(&async_lock, flags);
/* low on memory.. run synchronously */
func(data, newcookie);
return newcookie;
}
INIT_LIST_HEAD(&entry->domain_list);
INIT_LIST_HEAD(&entry->global_list);
INIT_WORK(&entry->work, async_run_entry_fn);
entry->func = func;
entry->data = data;
entry->domain = domain;
spin_lock_irqsave(&async_lock, flags);
/* allocate cookie and queue */
newcookie = entry->cookie = next_cookie++;
list_add_tail(&entry->domain_list, &domain->pending);
if (domain->registered)
list_add_tail(&entry->global_list, &async_global_pending);
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
/* schedule for execution */
queue_work_node(node, system_unbound_wq, &entry->work);
return newcookie;
}
EXPORT_SYMBOL_GPL(async_schedule_node_domain);
/**
* async_schedule_node - NUMA specific version of async_schedule
* @func: function to execute asynchronously
* @data: data pointer to pass to the function
* @node: NUMA node that we want to schedule this on or close to
*
* Returns an async_cookie_t that may be used for checkpointing later.
* Note: This function may be called from atomic or non-atomic contexts.
*
* The node requested will be honored on a best effort basis. If the node
* has no CPUs associated with it then the work is distributed among all
* available CPUs.
*/
async_cookie_t async_schedule_node(async_func_t func, void *data, int node)
{
return async_schedule_node_domain(func, data, node, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_schedule_node);
/**
* async_synchronize_full - synchronize all asynchronous function calls
*
* This function waits until all asynchronous function calls have been done.
*/
void async_synchronize_full(void)
{
async_synchronize_full_domain(NULL);
}
EXPORT_SYMBOL_GPL(async_synchronize_full);
/**
* async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
* @domain: the domain to synchronize
*
* This function waits until all asynchronous function calls for the
* synchronization domain specified by @domain have been done.
*/
void async_synchronize_full_domain(struct async_domain *domain)
{
async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
/**
* async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
* @cookie: async_cookie_t to use as checkpoint
* @domain: the domain to synchronize (%NULL for all registered domains)
*
* This function waits until all asynchronous function calls for the
* synchronization domain specified by @domain submitted prior to @cookie
* have been done.
*/
void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
{
ktime_t starttime;
pr_debug("async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
wait_event(async_done, lowest_in_progress(domain) >= cookie);
pr_debug("async_continuing @ %i after %lli usec\n", task_pid_nr(current),
microseconds_since(starttime));
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
/**
* async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing
* @cookie: async_cookie_t to use as checkpoint
*
* This function waits until all asynchronous function calls prior to @cookie
* have been done.
*/
void async_synchronize_cookie(async_cookie_t cookie)
{
async_synchronize_cookie_domain(cookie, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie);
/**
* current_is_async - is %current an async worker task?
*
* Returns %true if %current is an async worker task.
*/
bool current_is_async(void)
{
struct worker *worker = current_wq_worker();
return worker && worker->current_func == async_run_entry_fn;
}
EXPORT_SYMBOL_GPL(current_is_async);
// SPDX-License-Identifier: GPL-2.0-or-later
/* audit.c -- Auditing support
* Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
* System-call specific features have moved to auditsc.c
*
* Copyright 2003-2007 Red Hat Inc., Durham, North Carolina.
* All Rights Reserved.
*
* Written by Rickard E. (Rik) Faith <faith@redhat.com>
*
* Goals: 1) Integrate fully with Security Modules.
* 2) Minimal run-time overhead:
* a) Minimal when syscall auditing is disabled (audit_enable=0).
* b) Small when syscall auditing is enabled and no audit record
* is generated (defer as much work as possible to record
* generation time):
* i) context is allocated,
* ii) names from getname are stored without a copy, and
* iii) inode information stored from path_lookup.
* 3) Ability to disable syscall auditing at boot time (audit=0).
* 4) Usable by other parts of the kernel (if audit_log* is called,
* then a syscall record will be generated automatically for the
* current syscall).
* 5) Netlink interface to user-space.
* 6) Support low-overhead kernel-based filtering to minimize the
* information that must be passed to user-space.
*
* Audit userspace, documentation, tests, and bug/issue trackers:
* https://github.com/linux-audit
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/file.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/pid.h>
#include <linux/audit.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
#ifdef CONFIG_SECURITY
#include <linux/security.h>
#endif
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <net/netns/generic.h>
#include "audit.h"
/* No auditing will take place until audit_initialized == AUDIT_INITIALIZED.
* (Initialization happens after skb_init is called.) */
#define AUDIT_DISABLED -1
#define AUDIT_UNINITIALIZED 0
#define AUDIT_INITIALIZED 1
static int audit_initialized = AUDIT_UNINITIALIZED;
u32 audit_enabled = AUDIT_OFF;
bool audit_ever_enabled = !!AUDIT_OFF;
EXPORT_SYMBOL_GPL(audit_enabled);
/* Default state when kernel boots without any parameters. */
static u32 audit_default = AUDIT_OFF;
/* If auditing cannot proceed, audit_failure selects what happens. */
static u32 audit_failure = AUDIT_FAIL_PRINTK;
/* private audit network namespace index */
static unsigned int audit_net_id;
/**
* struct audit_net - audit private network namespace data
* @sk: communication socket
*/
struct audit_net {
struct sock *sk;
};
/**
* struct auditd_connection - kernel/auditd connection state
* @pid: auditd PID
* @portid: netlink portid
* @net: the associated network namespace
* @rcu: RCU head
*
* Description:
* This struct is RCU protected; you must either hold the RCU lock for reading
* or the associated spinlock for writing.
*/
struct auditd_connection {
struct pid *pid;
u32 portid;
struct net *net;
struct rcu_head rcu;
};
static struct auditd_connection __rcu *auditd_conn;
static DEFINE_SPINLOCK(auditd_conn_lock);
/* If audit_rate_limit is non-zero, limit the rate of sending audit records
* to that number per second. This prevents DoS attacks, but results in
* audit records being dropped. */
static u32 audit_rate_limit;
/* Number of outstanding audit_buffers allowed.
* When set to zero, this means unlimited. */
static u32 audit_backlog_limit = 64;
#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
/* The identity of the user shutting down the audit system. */
static kuid_t audit_sig_uid = INVALID_UID;
static pid_t audit_sig_pid = -1;
static u32 audit_sig_sid;
/* Records can be lost in several ways:
0) [suppressed in audit_alloc]
1) out of memory in audit_log_start [kmalloc of struct audit_buffer]
2) out of memory in audit_log_move [alloc_skb]
3) suppressed due to audit_rate_limit
4) suppressed due to audit_backlog_limit
*/
static atomic_t audit_lost = ATOMIC_INIT(0);
/* Monotonically increasing sum of time the kernel has spent
* waiting while the backlog limit is exceeded.
*/
static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0);
/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
static struct kmem_cache *audit_buffer_cache;
/* queue msgs to send via kauditd_task */
static struct sk_buff_head audit_queue;
/* queue msgs due to temporary unicast send problems */
static struct sk_buff_head audit_retry_queue;
/* queue msgs waiting for new auditd connection */
static struct sk_buff_head audit_hold_queue;
/* queue servicing thread */
static struct task_struct *kauditd_task;
static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
/* waitqueue for callers who are blocked on the audit backlog */
static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION,
.mask = -1,
.features = 0,
.lock = 0,};
static char *audit_feature_names[2] = {
"only_unset_loginuid",
"loginuid_immutable",
};
/**
* struct audit_ctl_mutex - serialize requests from userspace
* @lock: the mutex used for locking
* @owner: the task which owns the lock
*
* Description:
* This is the lock struct used to ensure we only process userspace requests
* in an orderly fashion. We can't simply use a mutex/lock here because we
* need to track lock ownership so we don't end up blocking the lock owner in
* audit_log_start() or similar.
*/
static struct audit_ctl_mutex {
struct mutex lock;
void *owner;
} audit_cmd_mutex;
/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
* audit records. Since printk uses a 1024 byte buffer, this buffer
* should be at least that large. */
#define AUDIT_BUFSIZ 1024
/* The audit_buffer is used when formatting an audit record. The caller
* locks briefly to get the record off the freelist or to allocate the
* buffer, and locks briefly to send the buffer to the netlink layer or
* to place it on a transmit queue. Multiple audit_buffers can be in
* use simultaneously. */
struct audit_buffer {
struct sk_buff *skb; /* formatted skb ready to send */
struct audit_context *ctx; /* NULL or associated context */
gfp_t gfp_mask;
};
struct audit_reply {
__u32 portid;
struct net *net;
struct sk_buff *skb;
};
/**
* auditd_test_task - Check to see if a given task is an audit daemon
* @task: the task to check
*
* Description:
* Return 1 if the task is a registered audit daemon, 0 otherwise.
*/
int auditd_test_task(struct task_struct *task)
{
int rc;
struct auditd_connection *ac;
rcu_read_lock();
ac = rcu_dereference(auditd_conn);
rc = (ac && ac->pid == task_tgid(task) ? 1 : 0);
rcu_read_unlock();
return rc;
}
/**
* audit_ctl_lock - Take the audit control lock
*/
void audit_ctl_lock(void)
{
mutex_lock(&audit_cmd_mutex.lock);
audit_cmd_mutex.owner = current;
}
/**
* audit_ctl_unlock - Drop the audit control lock
*/
void audit_ctl_unlock(void)
{
audit_cmd_mutex.owner = NULL;
mutex_unlock(&audit_cmd_mutex.lock);
}
/**
* audit_ctl_owner_current - Test to see if the current task owns the lock
*
* Description:
* Return true if the current task owns the audit control lock, false if it
* doesn't own the lock.
*/
static bool audit_ctl_owner_current(void)
{
return (current == audit_cmd_mutex.owner);
}
/**
* auditd_pid_vnr - Return the auditd PID relative to the namespace
*
* Description:
* Returns the PID in relation to the namespace, 0 on failure.
*/
static pid_t auditd_pid_vnr(void)
{
pid_t pid;
const struct auditd_connection *ac;
rcu_read_lock();
ac = rcu_dereference(auditd_conn);
if (!ac || !ac->pid)
pid = 0;
else
pid = pid_vnr(ac->pid);
rcu_read_unlock();
return pid;
}
/**
* audit_get_sk - Return the audit socket for the given network namespace
* @net: the destination network namespace
*
* Description:
* Returns the sock pointer if valid, NULL otherwise. The caller must ensure
* that a reference is held for the network namespace while the sock is in use.
*/
static struct sock *audit_get_sk(const struct net *net)
{
struct audit_net *aunet;
if (!net)
return NULL;
aunet = net_generic(net, audit_net_id);
return aunet->sk;
}
void audit_panic(const char *message)
{
switch (audit_failure) {
case AUDIT_FAIL_SILENT:
break;
case AUDIT_FAIL_PRINTK:
if (printk_ratelimit())
pr_err("%s\n", message);
break;
case AUDIT_FAIL_PANIC:
panic("audit: %s\n", message);
break;
}
}
static inline int audit_rate_check(void)
{
static unsigned long last_check = 0;
static int messages = 0;
static DEFINE_SPINLOCK(lock);
unsigned long flags;
unsigned long now;
unsigned long elapsed;
int retval = 0;
if (!audit_rate_limit) return 1;
spin_lock_irqsave(&lock, flags);
if (++messages < audit_rate_limit) {
retval = 1;
} else {
now = jiffies;
elapsed = now - last_check;
if (elapsed > HZ) {
last_check = now;
messages = 0;
retval = 1;
}
}
spin_unlock_irqrestore(&lock, flags);
return retval;
}
/**
* audit_log_lost - conditionally log lost audit message event
* @message: the message stating reason for lost audit message
*
* Emit at least 1 message per second, even if audit_rate_check is
* throttling.
* Always increment the lost messages counter.
*/
void audit_log_lost(const char *message)
{
static unsigned long last_msg = 0;
static DEFINE_SPINLOCK(lock);
unsigned long flags;
unsigned long now;
int print;
atomic_inc(&audit_lost);
print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit);
if (!print) {
spin_lock_irqsave(&lock, flags);
now = jiffies;
if (now - last_msg > HZ) {
print = 1;
last_msg = now;
}
spin_unlock_irqrestore(&lock, flags);
}
if (print) {
if (printk_ratelimit())
pr_warn("audit_lost=%u audit_rate_limit=%u audit_backlog_limit=%u\n",
atomic_read(&audit_lost),
audit_rate_limit,
audit_backlog_limit);
audit_panic(message);
}
}
static int audit_log_config_change(char *function_name, u32 new, u32 old,
int allow_changes)
{
struct audit_buffer *ab;
int rc = 0;
ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return rc;
audit_log_format(ab, "op=set %s=%u old=%u ", function_name, new, old);
audit_log_session_info(ab);
rc = audit_log_task_context(ab);
if (rc)
allow_changes = 0; /* Something weird, deny request */
audit_log_format(ab, " res=%d", allow_changes);
audit_log_end(ab);
return rc;
}
static int audit_do_config_change(char *function_name, u32 *to_change, u32 new)
{
int allow_changes, rc = 0;
u32 old = *to_change;
/* check if we are locked */
if (audit_enabled == AUDIT_LOCKED)
allow_changes = 0;
else
allow_changes = 1;
if (audit_enabled != AUDIT_OFF) {
rc = audit_log_config_change(function_name, new, old, allow_changes);
if (rc)
allow_changes = 0;
}
/* If we are allowed, make the change */
if (allow_changes == 1)
*to_change = new;
/* Not allowed, update reason */
else if (rc == 0)
rc = -EPERM;
return rc;
}
static int audit_set_rate_limit(u32 limit)
{
return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit);
}
static int audit_set_backlog_limit(u32 limit)
{
return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit);
}
static int audit_set_backlog_wait_time(u32 timeout)
{
return audit_do_config_change("audit_backlog_wait_time",
&audit_backlog_wait_time, timeout);
}
static int audit_set_enabled(u32 state)
{
int rc;
if (state > AUDIT_LOCKED)
return -EINVAL;
rc = audit_do_config_change("audit_enabled", &audit_enabled, state);
if (!rc)
audit_ever_enabled |= !!state;
return rc;
}
static int audit_set_failure(u32 state)
{
if (state != AUDIT_FAIL_SILENT
&& state != AUDIT_FAIL_PRINTK
&& state != AUDIT_FAIL_PANIC)
return -EINVAL;
return audit_do_config_change("audit_failure", &audit_failure, state);
}
/**
* auditd_conn_free - RCU helper to release an auditd connection struct
* @rcu: RCU head
*
* Description:
* Drop any references inside the auditd connection tracking struct and free
* the memory.
*/
static void auditd_conn_free(struct rcu_head *rcu)
{
struct auditd_connection *ac;
ac = container_of(rcu, struct auditd_connection, rcu);
put_pid(ac->pid);
put_net(ac->net);
kfree(ac);
}
/**
* auditd_set - Set/Reset the auditd connection state
* @pid: auditd PID
* @portid: auditd netlink portid
* @net: auditd network namespace pointer
*
* Description:
* This function will obtain and drop network namespace references as
* necessary. Returns zero on success, negative values on failure.
*/
static int auditd_set(struct pid *pid, u32 portid, struct net *net)
{
unsigned long flags;
struct auditd_connection *ac_old, *ac_new;
if (!pid || !net)
return -EINVAL;
ac_new = kzalloc(sizeof(*ac_new), GFP_KERNEL);
if (!ac_new)
return -ENOMEM;
ac_new->pid = get_pid(pid);
ac_new->portid = portid;
ac_new->net = get_net(net);
spin_lock_irqsave(&auditd_conn_lock, flags);
ac_old = rcu_dereference_protected(auditd_conn,
lockdep_is_held(&auditd_conn_lock));
rcu_assign_pointer(auditd_conn, ac_new);
spin_unlock_irqrestore(&auditd_conn_lock, flags);
if (ac_old)
call_rcu(&ac_old->rcu, auditd_conn_free);
return 0;
}
/**
* kauditd_printk_skb - Print the audit record to the ring buffer
* @skb: audit record
*
* Whatever the reason, this packet may not make it to the auditd connection
* so write it via printk so the information isn't completely lost.
*/
static void kauditd_printk_skb(struct sk_buff *skb)
{
struct nlmsghdr *nlh = nlmsg_hdr(skb);
char *data = nlmsg_data(nlh);
if (nlh->nlmsg_type != AUDIT_EOE && printk_ratelimit())
pr_notice("type=%d %s\n", nlh->nlmsg_type, data);
}
/**
* kauditd_rehold_skb - Handle a audit record send failure in the hold queue
* @skb: audit record
* @error: error code (unused)
*
* Description:
* This should only be used by the kauditd_thread when it fails to flush the
* hold queue.
*/
static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error)
{
/* put the record back in the queue */
skb_queue_tail(&audit_hold_queue, skb);
}
/**
* kauditd_hold_skb - Queue an audit record, waiting for auditd
* @skb: audit record
* @error: error code
*
* Description:
* Queue the audit record, waiting for an instance of auditd. When this
* function is called we haven't given up yet on sending the record, but things
* are not looking good. The first thing we want to do is try to write the
* record via printk and then see if we want to try and hold on to the record
* and queue it, if we have room. If we want to hold on to the record, but we
* don't have room, record a record lost message.
*/
static void kauditd_hold_skb(struct sk_buff *skb, int error)
{
/* at this point it is uncertain if we will ever send this to auditd so
* try to send the message via printk before we go any further */
kauditd_printk_skb(skb);
/* can we just silently drop the message? */
if (!audit_default)
goto drop;
/* the hold queue is only for when the daemon goes away completely,
* not -EAGAIN failures; if we are in a -EAGAIN state requeue the
* record on the retry queue unless it's full, in which case drop it
*/
if (error == -EAGAIN) {
if (!audit_backlog_limit ||
skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
skb_queue_tail(&audit_retry_queue, skb);
return;
}
audit_log_lost("kauditd retry queue overflow");
goto drop;
}
/* if we have room in the hold queue, queue the message */
if (!audit_backlog_limit ||
skb_queue_len(&audit_hold_queue) < audit_backlog_limit) {
skb_queue_tail(&audit_hold_queue, skb);
return;
}
/* we have no other options - drop the message */
audit_log_lost("kauditd hold queue overflow");
drop:
kfree_skb(skb);
}
/**
* kauditd_retry_skb - Queue an audit record, attempt to send again to auditd
* @skb: audit record
* @error: error code (unused)
*
* Description:
* Not as serious as kauditd_hold_skb() as we still have a connected auditd,
* but for some reason we are having problems sending it audit records so
* queue the given record and attempt to resend.
*/
static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error)
{
if (!audit_backlog_limit ||
skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
skb_queue_tail(&audit_retry_queue, skb);
return;
}
/* we have to drop the record, send it via printk as a last effort */
kauditd_printk_skb(skb);
audit_log_lost("kauditd retry queue overflow");
kfree_skb(skb);
}
/**
* auditd_reset - Disconnect the auditd connection
* @ac: auditd connection state
*
* Description:
* Break the auditd/kauditd connection and move all the queued records into the
* hold queue in case auditd reconnects. It is important to note that the @ac
* pointer should never be dereferenced inside this function as it may be NULL
* or invalid, you can only compare the memory address! If @ac is NULL then
* the connection will always be reset.
*/
static void auditd_reset(const struct auditd_connection *ac)
{
unsigned long flags;
struct sk_buff *skb;
struct auditd_connection *ac_old;
/* if it isn't already broken, break the connection */
spin_lock_irqsave(&auditd_conn_lock, flags);
ac_old = rcu_dereference_protected(auditd_conn,
lockdep_is_held(&auditd_conn_lock));
if (ac && ac != ac_old) {
/* someone already registered a new auditd connection */
spin_unlock_irqrestore(&auditd_conn_lock, flags);
return;
}
rcu_assign_pointer(auditd_conn, NULL);
spin_unlock_irqrestore(&auditd_conn_lock, flags);
if (ac_old)
call_rcu(&ac_old->rcu, auditd_conn_free);
/* flush the retry queue to the hold queue, but don't touch the main
* queue since we need to process that normally for multicast */
while ((skb = skb_dequeue(&audit_retry_queue)))
kauditd_hold_skb(skb, -ECONNREFUSED);
}
/**
* auditd_send_unicast_skb - Send a record via unicast to auditd
* @skb: audit record
*
* Description:
* Send a skb to the audit daemon, returns positive/zero values on success and
* negative values on failure; in all cases the skb will be consumed by this
* function. If the send results in -ECONNREFUSED the connection with auditd
* will be reset. This function may sleep so callers should not hold any locks
* where this would cause a problem.
*/
static int auditd_send_unicast_skb(struct sk_buff *skb)
{
int rc;
u32 portid;
struct net *net;
struct sock *sk;
struct auditd_connection *ac;
/* NOTE: we can't call netlink_unicast while in the RCU section so
* take a reference to the network namespace and grab local
* copies of the namespace, the sock, and the portid; the
* namespace and sock aren't going to go away while we hold a
* reference and if the portid does become invalid after the RCU
* section netlink_unicast() should safely return an error */
rcu_read_lock();
ac = rcu_dereference(auditd_conn);
if (!ac) {
rcu_read_unlock();
kfree_skb(skb);
rc = -ECONNREFUSED;
goto err;
}
net = get_net(ac->net);
sk = audit_get_sk(net);
portid = ac->portid;
rcu_read_unlock();
rc = netlink_unicast(sk, skb, portid, 0);
put_net(net);
if (rc < 0)
goto err;
return rc;
err:
if (ac && rc == -ECONNREFUSED)
auditd_reset(ac);
return rc;
}
/**
* kauditd_send_queue - Helper for kauditd_thread to flush skb queues
* @sk: the sending sock
* @portid: the netlink destination
* @queue: the skb queue to process
* @retry_limit: limit on number of netlink unicast failures
* @skb_hook: per-skb hook for additional processing
* @err_hook: hook called if the skb fails the netlink unicast send
*
* Description:
* Run through the given queue and attempt to send the audit records to auditd,
* returns zero on success, negative values on failure. It is up to the caller
* to ensure that the @sk is valid for the duration of this function.
*
*/
static int kauditd_send_queue(struct sock *sk, u32 portid,
struct sk_buff_head *queue,
unsigned int retry_limit,
void (*skb_hook)(struct sk_buff *skb),
void (*err_hook)(struct sk_buff *skb, int error))
{
int rc = 0;
struct sk_buff *skb = NULL;
struct sk_buff *skb_tail;
unsigned int failed = 0;
/* NOTE: kauditd_thread takes care of all our locking, we just use
* the netlink info passed to us (e.g. sk and portid) */
skb_tail = skb_peek_tail(queue);
while ((skb != skb_tail) && (skb = skb_dequeue(queue))) {
/* call the skb_hook for each skb we touch */
if (skb_hook)
(*skb_hook)(skb);
/* can we send to anyone via unicast? */
if (!sk) {
if (err_hook)
(*err_hook)(skb, -ECONNREFUSED);
continue;
}
retry:
/* grab an extra skb reference in case of error */
skb_get(skb);
rc = netlink_unicast(sk, skb, portid, 0);
if (rc < 0) {
/* send failed - try a few times unless fatal error */
if (++failed >= retry_limit ||
rc == -ECONNREFUSED || rc == -EPERM) {
sk = NULL;
if (err_hook)
(*err_hook)(skb, rc);
if (rc == -EAGAIN)
rc = 0;
/* continue to drain the queue */
continue;
} else
goto retry;
} else {
/* skb sent - drop the extra reference and continue */
consume_skb(skb);
failed = 0;
}
}
return (rc >= 0 ? 0 : rc);
}
/*
* kauditd_send_multicast_skb - Send a record to any multicast listeners
* @skb: audit record
*
* Description:
* Write a multicast message to anyone listening in the initial network
* namespace. This function doesn't consume an skb as might be expected since
* it has to copy it anyways.
*/
static void kauditd_send_multicast_skb(struct sk_buff *skb)
{
struct sk_buff *copy;
struct sock *sock = audit_get_sk(&init_net);
struct nlmsghdr *nlh;
/* NOTE: we are not taking an additional reference for init_net since
* we don't have to worry about it going away */
if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
return;
/*
* The seemingly wasteful skb_copy() rather than bumping the refcount
* using skb_get() is necessary because non-standard mods are made to
* the skb by the original kaudit unicast socket send routine. The
* existing auditd daemon assumes this breakage. Fixing this would
* require co-ordinating a change in the established protocol between
* the kaudit kernel subsystem and the auditd userspace code. There is
* no reason for new multicast clients to continue with this
* non-compliance.
*/
copy = skb_copy(skb, GFP_KERNEL);
if (!copy)
return;
nlh = nlmsg_hdr(copy);
nlh->nlmsg_len = skb->len;
nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
}
/**
* kauditd_thread - Worker thread to send audit records to userspace
* @dummy: unused
*/
static int kauditd_thread(void *dummy)
{
int rc;
u32 portid = 0;
struct net *net = NULL;
struct sock *sk = NULL;
struct auditd_connection *ac;
#define UNICAST_RETRIES 5
set_freezable();
while (!kthread_should_stop()) {
/* NOTE: see the lock comments in auditd_send_unicast_skb() */
rcu_read_lock();
ac = rcu_dereference(auditd_conn);
if (!ac) {
rcu_read_unlock();
goto main_queue;
}
net = get_net(ac->net);
sk = audit_get_sk(net);
portid = ac->portid;
rcu_read_unlock();
/* attempt to flush the hold queue */
rc = kauditd_send_queue(sk, portid,
&audit_hold_queue, UNICAST_RETRIES,
NULL, kauditd_rehold_skb);
if (rc < 0) {
sk = NULL;
auditd_reset(ac);
goto main_queue;
}
/* attempt to flush the retry queue */
rc = kauditd_send_queue(sk, portid,
&audit_retry_queue, UNICAST_RETRIES,
NULL, kauditd_hold_skb);
if (rc < 0) {
sk = NULL;
auditd_reset(ac);
goto main_queue;
}
main_queue:
/* process the main queue - do the multicast send and attempt
* unicast, dump failed record sends to the retry queue; if
* sk == NULL due to previous failures we will just do the
* multicast send and move the record to the hold queue */
rc = kauditd_send_queue(sk, portid, &audit_queue, 1,
kauditd_send_multicast_skb,
(sk ?
kauditd_retry_skb : kauditd_hold_skb));
if (ac && rc < 0)
auditd_reset(ac);
sk = NULL;
/* drop our netns reference, no auditd sends past this line */
if (net) {
put_net(net);
net = NULL;
}
/* we have processed all the queues so wake everyone */
wake_up(&audit_backlog_wait);
/* NOTE: we want to wake up if there is anything on the queue,
* regardless of if an auditd is connected, as we need to
* do the multicast send and rotate records from the
* main queue to the retry/hold queues */
wait_event_freezable(kauditd_wait,
(skb_queue_len(&audit_queue) ? 1 : 0));
}
return 0;
}
int audit_send_list_thread(void *_dest)
{
struct audit_netlink_list *dest = _dest;
struct sk_buff *skb;
struct sock *sk = audit_get_sk(dest->net);
/* wait for parent to finish and send an ACK */
audit_ctl_lock();
audit_ctl_unlock();
while ((skb = __skb_dequeue(&dest->q)) != NULL)
netlink_unicast(sk, skb, dest->portid, 0);
put_net(dest->net);
kfree(dest);
return 0;
}
struct sk_buff *audit_make_reply(int seq, int type, int done,
int multi, const void *payload, int size)
{
struct sk_buff *skb;
struct nlmsghdr *nlh;
void *data;
int flags = multi ? NLM_F_MULTI : 0;
int t = done ? NLMSG_DONE : type;
skb = nlmsg_new(size, GFP_KERNEL);
if (!skb)
return NULL;
nlh = nlmsg_put(skb, 0, seq, t, size, flags);
if (!nlh)
goto out_kfree_skb;
data = nlmsg_data(nlh);
memcpy(data, payload, size);
return skb;
out_kfree_skb:
kfree_skb(skb);
return NULL;
}
static void audit_free_reply(struct audit_reply *reply)
{
if (!reply)
return;
kfree_skb(reply->skb);
if (reply->net)
put_net(reply->net);
kfree(reply);
}
static int audit_send_reply_thread(void *arg)
{
struct audit_reply *reply = (struct audit_reply *)arg;
audit_ctl_lock();
audit_ctl_unlock();
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
netlink_unicast(audit_get_sk(reply->net), reply->skb, reply->portid, 0);
reply->skb = NULL;
audit_free_reply(reply);
return 0;
}
/**
* audit_send_reply - send an audit reply message via netlink
* @request_skb: skb of request we are replying to (used to target the reply)
* @seq: sequence number
* @type: audit message type
* @done: done (last) flag
* @multi: multi-part message flag
* @payload: payload data
* @size: payload size
*
* Allocates a skb, builds the netlink message, and sends it to the port id.
*/
static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
int multi, const void *payload, int size)
{
struct task_struct *tsk;
struct audit_reply *reply;
reply = kzalloc(sizeof(*reply), GFP_KERNEL);
if (!reply)
return;
reply->skb = audit_make_reply(seq, type, done, multi, payload, size);
if (!reply->skb)
goto err;
reply->net = get_net(sock_net(NETLINK_CB(request_skb).sk));
reply->portid = NETLINK_CB(request_skb).portid;
tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
if (IS_ERR(tsk))
goto err;
return;
err:
audit_free_reply(reply);
}
/*
* Check for appropriate CAP_AUDIT_ capabilities on incoming audit
* control messages.
*/
static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
{
int err = 0;
/* Only support initial user namespace for now. */
/*
* We return ECONNREFUSED because it tricks userspace into thinking
* that audit was not configured into the kernel. Lots of users
* configure their PAM stack (because that's what the distro does)
* to reject login if unable to send messages to audit. If we return
* ECONNREFUSED the PAM stack thinks the kernel does not have audit
* configured in and will let login proceed. If we return EPERM
* userspace will reject all logins. This should be removed when we
* support non init namespaces!!
*/
if (current_user_ns() != &init_user_ns)
return -ECONNREFUSED;
switch (msg_type) {
case AUDIT_LIST:
case AUDIT_ADD:
case AUDIT_DEL:
return -EOPNOTSUPP;
case AUDIT_GET:
case AUDIT_SET:
case AUDIT_GET_FEATURE:
case AUDIT_SET_FEATURE:
case AUDIT_LIST_RULES:
case AUDIT_ADD_RULE:
case AUDIT_DEL_RULE:
case AUDIT_SIGNAL_INFO:
case AUDIT_TTY_GET:
case AUDIT_TTY_SET:
case AUDIT_TRIM:
case AUDIT_MAKE_EQUIV:
/* Only support auditd and auditctl in initial pid namespace
* for now. */
if (task_active_pid_ns(current) != &init_pid_ns)
return -EPERM;
if (!netlink_capable(skb, CAP_AUDIT_CONTROL))
err = -EPERM;
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
if (!netlink_capable(skb, CAP_AUDIT_WRITE))
err = -EPERM;
break;
default: /* bad msg */
err = -EINVAL;
}
return err;
}
static void audit_log_common_recv_msg(struct audit_context *context,
struct audit_buffer **ab, u16 msg_type)
{
uid_t uid = from_kuid(&init_user_ns, current_uid());
pid_t pid = task_tgid_nr(current);
if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
*ab = NULL;
return;
}
*ab = audit_log_start(context, GFP_KERNEL, msg_type);
if (unlikely(!*ab))
return;
audit_log_format(*ab, "pid=%d uid=%u ", pid, uid);
audit_log_session_info(*ab);
audit_log_task_context(*ab);
}
static inline void audit_log_user_recv_msg(struct audit_buffer **ab,
u16 msg_type)
{
audit_log_common_recv_msg(NULL, ab, msg_type);
}
int is_audit_feature_set(int i)
{
return af.features & AUDIT_FEATURE_TO_MASK(i);
}
static int audit_get_feature(struct sk_buff *skb)
{
u32 seq;
seq = nlmsg_hdr(skb)->nlmsg_seq;
audit_send_reply(skb, seq, AUDIT_GET_FEATURE, 0, 0, &af, sizeof(af));
return 0;
}
static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature,
u32 old_lock, u32 new_lock, int res)
{
struct audit_buffer *ab;
if (audit_enabled == AUDIT_OFF)
return;
ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE);
if (!ab)
return;
audit_log_task_info(ab);
audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
audit_feature_names[which], !!old_feature, !!new_feature,
!!old_lock, !!new_lock, res);
audit_log_end(ab);
}
static int audit_set_feature(struct audit_features *uaf)
{
int i;
BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names));
/* if there is ever a version 2 we should handle that here */
for (i = 0; i <= AUDIT_LAST_FEATURE; i++) {
u32 feature = AUDIT_FEATURE_TO_MASK(i);
u32 old_feature, new_feature, old_lock, new_lock;
/* if we are not changing this feature, move along */
if (!(feature & uaf->mask))
continue;
old_feature = af.features & feature;
new_feature = uaf->features & feature;
new_lock = (uaf->lock | af.lock) & feature;
old_lock = af.lock & feature;
/* are we changing a locked feature? */
if (old_lock && (new_feature != old_feature)) {
audit_log_feature_change(i, old_feature, new_feature,
old_lock, new_lock, 0);
return -EPERM;
}
}
/* nothing invalid, do the changes */
for (i = 0; i <= AUDIT_LAST_FEATURE; i++) {
u32 feature = AUDIT_FEATURE_TO_MASK(i);
u32 old_feature, new_feature, old_lock, new_lock;
/* if we are not changing this feature, move along */
if (!(feature & uaf->mask))
continue;
old_feature = af.features & feature;
new_feature = uaf->features & feature;
old_lock = af.lock & feature;
new_lock = (uaf->lock | af.lock) & feature;
if (new_feature != old_feature)
audit_log_feature_change(i, old_feature, new_feature,
old_lock, new_lock, 1);
if (new_feature)
af.features |= feature;
else
af.features &= ~feature;
af.lock |= new_lock;
}
return 0;
}
static int audit_replace(struct pid *pid)
{
pid_t pvnr;
struct sk_buff *skb;
pvnr = pid_vnr(pid);
skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr));
if (!skb)
return -ENOMEM;
return auditd_send_unicast_skb(skb);
}
static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
u32 seq;
void *data;
int data_len;
int err;
struct audit_buffer *ab;
u16 msg_type = nlh->nlmsg_type;
struct audit_sig_info *sig_data;
char *ctx = NULL;
u32 len;
err = audit_netlink_ok(skb, msg_type);
if (err)
return err;
seq = nlh->nlmsg_seq;
data = nlmsg_data(nlh);
data_len = nlmsg_len(nlh);
switch (msg_type) {
case AUDIT_GET: {
struct audit_status s;
memset(&s, 0, sizeof(s));
s.enabled = audit_enabled;
s.failure = audit_failure;
/* NOTE: use pid_vnr() so the PID is relative to the current
* namespace */
s.pid = auditd_pid_vnr();
s.rate_limit = audit_rate_limit;
s.backlog_limit = audit_backlog_limit;
s.lost = atomic_read(&audit_lost);
s.backlog = skb_queue_len(&audit_queue);
s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
s.backlog_wait_time = audit_backlog_wait_time;
s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual);
audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_SET: {
struct audit_status s;
memset(&s, 0, sizeof(s));
/* guard against past and future API changes */
memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
if (s.mask & AUDIT_STATUS_ENABLED) {
err = audit_set_enabled(s.enabled);
if (err < 0)
return err;
}
if (s.mask & AUDIT_STATUS_FAILURE) {
err = audit_set_failure(s.failure);
if (err < 0)
return err;
}
if (s.mask & AUDIT_STATUS_PID) {
/* NOTE: we are using the vnr PID functions below
* because the s.pid value is relative to the
* namespace of the caller; at present this
* doesn't matter much since you can really only
* run auditd from the initial pid namespace, but
* something to keep in mind if this changes */
pid_t new_pid = s.pid;
pid_t auditd_pid;
struct pid *req_pid = task_tgid(current);
/* Sanity check - PID values must match. Setting
* pid to 0 is how auditd ends auditing. */
if (new_pid && (new_pid != pid_vnr(req_pid)))
return -EINVAL;
/* test the auditd connection */
audit_replace(req_pid);
auditd_pid = auditd_pid_vnr();
if (auditd_pid) {
/* replacing a healthy auditd is not allowed */
if (new_pid) {
audit_log_config_change("audit_pid",
new_pid, auditd_pid, 0);
return -EEXIST;
}
/* only current auditd can unregister itself */
if (pid_vnr(req_pid) != auditd_pid) {
audit_log_config_change("audit_pid",
new_pid, auditd_pid, 0);
return -EACCES;
}
}
if (new_pid) {
/* register a new auditd connection */
err = auditd_set(req_pid,
NETLINK_CB(skb).portid,
sock_net(NETLINK_CB(skb).sk));
if (audit_enabled != AUDIT_OFF)
audit_log_config_change("audit_pid",
new_pid,
auditd_pid,
err ? 0 : 1);
if (err)
return err;
/* try to process any backlog */
wake_up_interruptible(&kauditd_wait);
} else {
if (audit_enabled != AUDIT_OFF)
audit_log_config_change("audit_pid",
new_pid,
auditd_pid, 1);
/* unregister the auditd connection */
auditd_reset(NULL);
}
}
if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
err = audit_set_rate_limit(s.rate_limit);
if (err < 0)
return err;
}
if (s.mask & AUDIT_STATUS_BACKLOG_LIMIT) {
err = audit_set_backlog_limit(s.backlog_limit);
if (err < 0)
return err;
}
if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) {
if (sizeof(s) > (size_t)nlh->nlmsg_len)
return -EINVAL;
if (s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME)
return -EINVAL;
err = audit_set_backlog_wait_time(s.backlog_wait_time);
if (err < 0)
return err;
}
if (s.mask == AUDIT_STATUS_LOST) {
u32 lost = atomic_xchg(&audit_lost, 0);
audit_log_config_change("lost", 0, lost, 1);
return lost;
}
if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) {
u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0);
audit_log_config_change("backlog_wait_time_actual", 0, actual, 1);
return actual;
}
break;
}
case AUDIT_GET_FEATURE:
err = audit_get_feature(skb);
if (err)
return err;
break;
case AUDIT_SET_FEATURE:
if (data_len < sizeof(struct audit_features))
return -EINVAL;
err = audit_set_feature(data);
if (err)
return err;
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
/* exit early if there isn't at least one character to print */
if (data_len < 2)
return -EINVAL;
err = audit_filter(msg_type, AUDIT_FILTER_USER);
if (err == 1) { /* match or error */
char *str = data;
err = 0;
if (msg_type == AUDIT_USER_TTY) {
err = tty_audit_push();
if (err)
break;
}
audit_log_user_recv_msg(&ab, msg_type);
if (msg_type != AUDIT_USER_TTY) {
/* ensure NULL termination */
str[data_len - 1] = '\0';
audit_log_format(ab, " msg='%.*s'",
AUDIT_MESSAGE_TEXT_MAX,
str);
} else {
audit_log_format(ab, " data=");
if (data_len > 0 && str[data_len - 1] == '\0')
data_len--;
audit_log_n_untrustedstring(ab, str, data_len);
}
audit_log_end(ab);
}
break;
case AUDIT_ADD_RULE:
case AUDIT_DEL_RULE:
if (data_len < sizeof(struct audit_rule_data))
return -EINVAL;
if (audit_enabled == AUDIT_LOCKED) {
audit_log_common_recv_msg(audit_context(), &ab,
AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=%s audit_enabled=%d res=0",
msg_type == AUDIT_ADD_RULE ?
"add_rule" : "remove_rule",
audit_enabled);
audit_log_end(ab);
return -EPERM;
}
err = audit_rule_change(msg_type, seq, data, data_len);
break;
case AUDIT_LIST_RULES:
err = audit_list_rules_send(skb, seq);
break;
case AUDIT_TRIM:
audit_trim_trees();
audit_log_common_recv_msg(audit_context(), &ab,
AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=trim res=1");
audit_log_end(ab);
break;
case AUDIT_MAKE_EQUIV: {
void *bufp = data;
u32 sizes[2];
size_t msglen = data_len;
char *old, *new;
err = -EINVAL;
if (msglen < 2 * sizeof(u32))
break;
memcpy(sizes, bufp, 2 * sizeof(u32));
bufp += 2 * sizeof(u32);
msglen -= 2 * sizeof(u32);
old = audit_unpack_string(&bufp, &msglen, sizes[0]);
if (IS_ERR(old)) {
err = PTR_ERR(old);
break;
}
new = audit_unpack_string(&bufp, &msglen, sizes[1]);
if (IS_ERR(new)) {
err = PTR_ERR(new);
kfree(old);
break;
}
/* OK, here comes... */
err = audit_tag_tree(old, new);
audit_log_common_recv_msg(audit_context(), &ab,
AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=make_equiv old=");
audit_log_untrustedstring(ab, old);
audit_log_format(ab, " new=");
audit_log_untrustedstring(ab, new);
audit_log_format(ab, " res=%d", !err);
audit_log_end(ab);
kfree(old);
kfree(new);
break;
}
case AUDIT_SIGNAL_INFO:
len = 0;
if (audit_sig_sid) {
err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
if (err)
return err;
}
sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL);
if (!sig_data) {
if (audit_sig_sid)
security_release_secctx(ctx, len);
return -ENOMEM;
}
sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid);
sig_data->pid = audit_sig_pid;
if (audit_sig_sid) {
memcpy(sig_data->ctx, ctx, len);
security_release_secctx(ctx, len);
}
audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
sig_data, struct_size(sig_data, ctx, len));
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
struct audit_tty_status s;
unsigned int t;
t = READ_ONCE(current->signal->audit_tty);
s.enabled = t & AUDIT_TTY_ENABLE;
s.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD);
audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_TTY_SET: {
struct audit_tty_status s, old;
struct audit_buffer *ab;
unsigned int t;
memset(&s, 0, sizeof(s));
/* guard against past and future API changes */
memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
/* check if new data is valid */
if ((s.enabled != 0 && s.enabled != 1) ||
(s.log_passwd != 0 && s.log_passwd != 1))
err = -EINVAL;
if (err)
t = READ_ONCE(current->signal->audit_tty);
else {
t = s.enabled | (-s.log_passwd & AUDIT_TTY_LOG_PASSWD);
t = xchg(&current->signal->audit_tty, t);
}
old.enabled = t & AUDIT_TTY_ENABLE;
old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD);
audit_log_common_recv_msg(audit_context(), &ab,
AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d"
" old-log_passwd=%d new-log_passwd=%d res=%d",
old.enabled, s.enabled, old.log_passwd,
s.log_passwd, !err);
audit_log_end(ab);
break;
}
default:
err = -EINVAL;
break;
}
return err < 0 ? err : 0;
}
/**
* audit_receive - receive messages from a netlink control socket
* @skb: the message buffer
*
* Parse the provided skb and deal with any messages that may be present,
* malformed skbs are discarded.
*/
static void audit_receive(struct sk_buff *skb)
{
struct nlmsghdr *nlh;
/*
* len MUST be signed for nlmsg_next to be able to dec it below 0
* if the nlmsg_len was not aligned
*/
int len;
int err;
nlh = nlmsg_hdr(skb);
len = skb->len;
audit_ctl_lock();
while (nlmsg_ok(nlh, len)) {
err = audit_receive_msg(skb, nlh);
/* if err or if this message says it wants a response */
if (err || (nlh->nlmsg_flags & NLM_F_ACK))
netlink_ack(skb, nlh, err, NULL);
nlh = nlmsg_next(nlh, &len);
}
audit_ctl_unlock();
/* can't block with the ctrl lock, so penalize the sender now */
if (audit_backlog_limit &&
(skb_queue_len(&audit_queue) > audit_backlog_limit)) {
DECLARE_WAITQUEUE(wait, current);
/* wake kauditd to try and flush the queue */
wake_up_interruptible(&kauditd_wait);
add_wait_queue_exclusive(&audit_backlog_wait, &wait);
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(audit_backlog_wait_time);
remove_wait_queue(&audit_backlog_wait, &wait);
}
}
/* Log information about who is connecting to the audit multicast socket */
static void audit_log_multicast(int group, const char *op, int err)
{
const struct cred *cred;
struct tty_struct *tty;
char comm[sizeof(current->comm)];
struct audit_buffer *ab;
if (!audit_enabled)
return;
ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_EVENT_LISTENER);
if (!ab)
return;
cred = current_cred();
tty = audit_get_tty();
audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u",
task_pid_nr(current),
from_kuid(&init_user_ns, cred->uid),
from_kuid(&init_user_ns, audit_get_loginuid(current)),
tty ? tty_name(tty) : "(none)",
audit_get_sessionid(current));
audit_put_tty(tty);
audit_log_task_context(ab); /* subj= */
audit_log_format(ab, " comm=");
audit_log_untrustedstring(ab, get_task_comm(comm, current));
audit_log_d_path_exe(ab, current->mm); /* exe= */
audit_log_format(ab, " nl-mcgrp=%d op=%s res=%d", group, op, !err);
audit_log_end(ab);
}
/* Run custom bind function on netlink socket group connect or bind requests. */
static int audit_multicast_bind(struct net *net, int group)
{
int err = 0;
if (!capable(CAP_AUDIT_READ))
err = -EPERM;
audit_log_multicast(group, "connect", err);
return err;
}
static void audit_multicast_unbind(struct net *net, int group)
{
audit_log_multicast(group, "disconnect", 0);
}
static int __net_init audit_net_init(struct net *net)
{
struct netlink_kernel_cfg cfg = {
.input = audit_receive,
.bind = audit_multicast_bind,
.unbind = audit_multicast_unbind,
.flags = NL_CFG_F_NONROOT_RECV,
.groups = AUDIT_NLGRP_MAX,
};
struct audit_net *aunet = net_generic(net, audit_net_id);
aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
if (aunet->sk == NULL) {
audit_panic("cannot initialize netlink socket in namespace");
return -ENOMEM;
}
/* limit the timeout in case auditd is blocked/stopped */
aunet->sk->sk_sndtimeo = HZ / 10;
return 0;
}
static void __net_exit audit_net_exit(struct net *net)
{
struct audit_net *aunet = net_generic(net, audit_net_id);
/* NOTE: you would think that we would want to check the auditd
* connection and potentially reset it here if it lives in this
* namespace, but since the auditd connection tracking struct holds a
* reference to this namespace (see auditd_set()) we are only ever
* going to get here after that connection has been released */
netlink_kernel_release(aunet->sk);
}
static struct pernet_operations audit_net_ops __net_initdata = {
.init = audit_net_init,
.exit = audit_net_exit,
.id = &audit_net_id,
.size = sizeof(struct audit_net),
};
/* Initialize audit support at boot time. */
static int __init audit_init(void)
{
int i;
if (audit_initialized == AUDIT_DISABLED)
return 0;
audit_buffer_cache = kmem_cache_create("audit_buffer",
sizeof(struct audit_buffer),
0, SLAB_PANIC, NULL);
skb_queue_head_init(&audit_queue);
skb_queue_head_init(&audit_retry_queue);
skb_queue_head_init(&audit_hold_queue);
for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
INIT_LIST_HEAD(&audit_inode_hash[i]);
mutex_init(&audit_cmd_mutex.lock);
audit_cmd_mutex.owner = NULL;
pr_info("initializing netlink subsys (%s)\n",
audit_default ? "enabled" : "disabled");
register_pernet_subsys(&audit_net_ops);
audit_initialized = AUDIT_INITIALIZED;
kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
if (IS_ERR(kauditd_task)) {
int err = PTR_ERR(kauditd_task);
panic("audit: failed to start the kauditd thread (%d)\n", err);
}
audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL,
"state=initialized audit_enabled=%u res=1",
audit_enabled);
return 0;
}
postcore_initcall(audit_init);
/*
* Process kernel command-line parameter at boot time.
* audit={0|off} or audit={1|on}.
*/
static int __init audit_enable(char *str)
{
if (!strcasecmp(str, "off") || !strcmp(str, "0"))
audit_default = AUDIT_OFF;
else if (!strcasecmp(str, "on") || !strcmp(str, "1"))
audit_default = AUDIT_ON;
else {
pr_err("audit: invalid 'audit' parameter value (%s)\n", str);
audit_default = AUDIT_ON;
}
if (audit_default == AUDIT_OFF)
audit_initialized = AUDIT_DISABLED;
if (audit_set_enabled(audit_default))
pr_err("audit: error setting audit state (%d)\n",
audit_default);
pr_info("%s\n", audit_default ?
"enabled (after initialization)" : "disabled (until reboot)");
return 1;
}
__setup("audit=", audit_enable);
/* Process kernel command-line parameter at boot time.
* audit_backlog_limit=<n> */
static int __init audit_backlog_limit_set(char *str)
{
u32 audit_backlog_limit_arg;
pr_info("audit_backlog_limit: ");
if (kstrtouint(str, 0, &audit_backlog_limit_arg)) {
pr_cont("using default of %u, unable to parse %s\n",
audit_backlog_limit, str);
return 1;
}
audit_backlog_limit = audit_backlog_limit_arg;
pr_cont("%d\n", audit_backlog_limit);
return 1;
}
__setup("audit_backlog_limit=", audit_backlog_limit_set);
static void audit_buffer_free(struct audit_buffer *ab)
{
if (!ab)
return;
kfree_skb(ab->skb);
kmem_cache_free(audit_buffer_cache, ab);
}
static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx,
gfp_t gfp_mask, int type)
{
struct audit_buffer *ab;
ab = kmem_cache_alloc(audit_buffer_cache, gfp_mask);
if (!ab)
return NULL;
ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
if (!ab->skb)
goto err;
if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0))
goto err;
ab->ctx = ctx;
ab->gfp_mask = gfp_mask;
return ab;
err:
audit_buffer_free(ab);
return NULL;
}
/**
* audit_serial - compute a serial number for the audit record
*
* Compute a serial number for the audit record. Audit records are
* written to user-space as soon as they are generated, so a complete
* audit record may be written in several pieces. The timestamp of the
* record and this serial number are used by the user-space tools to
* determine which pieces belong to the same audit record. The
* (timestamp,serial) tuple is unique for each syscall and is live from
* syscall entry to syscall exit.
*
* NOTE: Another possibility is to store the formatted records off the
* audit context (for those records that have a context), and emit them
* all at syscall exit. However, this could delay the reporting of
* significant errors until syscall exit (or never, if the system
* halts).
*/
unsigned int audit_serial(void)
{
static atomic_t serial = ATOMIC_INIT(0);
return atomic_inc_return(&serial);
}
static inline void audit_get_stamp(struct audit_context *ctx,
struct timespec64 *t, unsigned int *serial)
{
if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
ktime_get_coarse_real_ts64(t);
*serial = audit_serial();
}
}
/**
* audit_log_start - obtain an audit buffer
* @ctx: audit_context (may be NULL)
* @gfp_mask: type of allocation
* @type: audit message type
*
* Returns audit_buffer pointer on success or NULL on error.
*
* Obtain an audit buffer. This routine does locking to obtain the
* audit buffer, but then no locking is required for calls to
* audit_log_*format. If the task (ctx) is a task that is currently in a
* syscall, then the syscall is marked as auditable and an audit record
* will be written at syscall exit. If there is no associated task, then
* task context (ctx) should be NULL.
*/
struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
int type)
{
struct audit_buffer *ab;
struct timespec64 t;
unsigned int serial;
if (audit_initialized != AUDIT_INITIALIZED)
return NULL;
if (unlikely(!audit_filter(type, AUDIT_FILTER_EXCLUDE)))
return NULL;
/* NOTE: don't ever fail/sleep on these two conditions:
* 1. auditd generated record - since we need auditd to drain the
* queue; also, when we are checking for auditd, compare PIDs using
* task_tgid_vnr() since auditd_pid is set in audit_receive_msg()
* using a PID anchored in the caller's namespace
* 2. generator holding the audit_cmd_mutex - we don't want to block
* while holding the mutex, although we do penalize the sender
* later in audit_receive() when it is safe to block
*/
if (!(auditd_test_task(current) || audit_ctl_owner_current())) {
long stime = audit_backlog_wait_time;
while (audit_backlog_limit &&
(skb_queue_len(&audit_queue) > audit_backlog_limit)) {
/* wake kauditd to try and flush the queue */
wake_up_interruptible(&kauditd_wait);
/* sleep if we are allowed and we haven't exhausted our
* backlog wait limit */
if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) {
long rtime = stime;
DECLARE_WAITQUEUE(wait, current);
add_wait_queue_exclusive(&audit_backlog_wait,
&wait);
set_current_state(TASK_UNINTERRUPTIBLE);
stime = schedule_timeout(rtime);
atomic_add(rtime - stime, &audit_backlog_wait_time_actual);
remove_wait_queue(&audit_backlog_wait, &wait);
} else {
if (audit_rate_check() && printk_ratelimit())
pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
skb_queue_len(&audit_queue),
audit_backlog_limit);
audit_log_lost("backlog limit exceeded");
return NULL;
}
}
}
ab = audit_buffer_alloc(ctx, gfp_mask, type);
if (!ab) {
audit_log_lost("out of memory in audit_log_start");
return NULL;
}
audit_get_stamp(ab->ctx, &t, &serial);
/* cancel dummy context to enable supporting records */
if (ctx)
ctx->dummy = 0;
audit_log_format(ab, "audit(%llu.%03lu:%u): ",
(unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial);
return ab;
}
/**
* audit_expand - expand skb in the audit buffer
* @ab: audit_buffer
* @extra: space to add at tail of the skb
*
* Returns 0 (no space) on failed expansion, or available space if
* successful.
*/
static inline int audit_expand(struct audit_buffer *ab, int extra)
{
struct sk_buff *skb = ab->skb;
int oldtail = skb_tailroom(skb);
int ret = pskb_expand_head(skb, 0, extra, ab->gfp_mask);
int newtail = skb_tailroom(skb);
if (ret < 0) {
audit_log_lost("out of memory in audit_expand");
return 0;
}
skb->truesize += newtail - oldtail;
return newtail;
}
/*
* Format an audit message into the audit buffer. If there isn't enough
* room in the audit buffer, more room will be allocated and vsnprint
* will be called a second time. Currently, we assume that a printk
* can't format message larger than 1024 bytes, so we don't either.
*/
static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
va_list args)
{
int len, avail;
struct sk_buff *skb;
va_list args2;
if (!ab)
return;
BUG_ON(!ab->skb);
skb = ab->skb;
avail = skb_tailroom(skb);
if (avail == 0) {
avail = audit_expand(ab, AUDIT_BUFSIZ);
if (!avail)
goto out;
}
va_copy(args2, args);
len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args);
if (len >= avail) {
/* The printk buffer is 1024 bytes long, so if we get
* here and AUDIT_BUFSIZ is at least 1024, then we can
* log everything that printk could have logged. */
avail = audit_expand(ab,
max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
if (!avail)
goto out_va_end;
len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
}
if (len > 0)
skb_put(skb, len);
out_va_end:
va_end(args2);
out:
return;
}
/**
* audit_log_format - format a message into the audit buffer.
* @ab: audit_buffer
* @fmt: format string
* @...: optional parameters matching @fmt string
*
* All the work is done in audit_log_vformat.
*/
void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
{
va_list args;
if (!ab)
return;
va_start(args, fmt);
audit_log_vformat(ab, fmt, args);
va_end(args);
}
/**
* audit_log_n_hex - convert a buffer to hex and append it to the audit skb
* @ab: the audit_buffer
* @buf: buffer to convert to hex
* @len: length of @buf to be converted
*
* No return value; failure to expand is silently ignored.
*
* This function will take the passed buf and convert it into a string of
* ascii hex digits. The new string is placed onto the skb.
*/
void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
size_t len)
{
int i, avail, new_len;
unsigned char *ptr;
struct sk_buff *skb;
if (!ab)
return;
BUG_ON(!ab->skb);
skb = ab->skb;
avail = skb_tailroom(skb);
new_len = len<<1;
if (new_len >= avail) {
/* Round the buffer request up to the next multiple */
new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1);
avail = audit_expand(ab, new_len);
if (!avail)
return;
}
ptr = skb_tail_pointer(skb);
for (i = 0; i < len; i++)
ptr = hex_byte_pack_upper(ptr, buf[i]);
*ptr = 0;
skb_put(skb, len << 1); /* new string is twice the old string */
}
/*
* Format a string of no more than slen characters into the audit buffer,
* enclosed in quote marks.
*/
void audit_log_n_string(struct audit_buffer *ab, const char *string,
size_t slen)
{
int avail, new_len;
unsigned char *ptr;
struct sk_buff *skb;
if (!ab)
return;
BUG_ON(!ab->skb);
skb = ab->skb;
avail = skb_tailroom(skb);
new_len = slen + 3; /* enclosing quotes + null terminator */
if (new_len > avail) {
avail = audit_expand(ab, new_len);
if (!avail)
return;
}
ptr = skb_tail_pointer(skb);
*ptr++ = '"';
memcpy(ptr, string, slen);
ptr += slen;
*ptr++ = '"';
*ptr = 0;
skb_put(skb, slen + 2); /* don't include null terminator */
}
/**
* audit_string_contains_control - does a string need to be logged in hex
* @string: string to be checked
* @len: max length of the string to check
*/
bool audit_string_contains_control(const char *string, size_t len)
{
const unsigned char *p;
for (p = string; p < (const unsigned char *)string + len; p++) {
if (*p == '"' || *p < 0x21 || *p > 0x7e)
return true;
}
return false;
}
/**
* audit_log_n_untrustedstring - log a string that may contain random characters
* @ab: audit_buffer
* @len: length of string (not including trailing null)
* @string: string to be logged
*
* This code will escape a string that is passed to it if the string
* contains a control character, unprintable character, double quote mark,
* or a space. Unescaped strings will start and end with a double quote mark.
* Strings that are escaped are printed in hex (2 digits per char).
*
* The caller specifies the number of characters in the string to log, which may
* or may not be the entire string.
*/
void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string,
size_t len)
{
if (audit_string_contains_control(string, len))
audit_log_n_hex(ab, string, len);
else
audit_log_n_string(ab, string, len);
}
/**
* audit_log_untrustedstring - log a string that may contain random characters
* @ab: audit_buffer
* @string: string to be logged
*
* Same as audit_log_n_untrustedstring(), except that strlen is used to
* determine string length.
*/
void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
{
audit_log_n_untrustedstring(ab, string, strlen(string));
}
/* This is a helper-function to print the escaped d_path */
void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
const struct path *path)
{
char *p, *pathname;
if (prefix)
audit_log_format(ab, "%s", prefix);
/* We will allow 11 spaces for ' (deleted)' to be appended */
pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
if (!pathname) {
audit_log_format(ab, "\"<no_memory>\"");
return;
}
p = d_path(path, pathname, PATH_MAX+11);
if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
/* FIXME: can we save some information here? */
audit_log_format(ab, "\"<too_long>\"");
} else
audit_log_untrustedstring(ab, p);
kfree(pathname);
}
void audit_log_session_info(struct audit_buffer *ab)
{
unsigned int sessionid = audit_get_sessionid(current);
uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
audit_log_format(ab, "auid=%u ses=%u", auid, sessionid);
}
void audit_log_key(struct audit_buffer *ab, char *key)
{
audit_log_format(ab, " key=");
if (key)
audit_log_untrustedstring(ab, key);
else
audit_log_format(ab, "(null)");
}
int audit_log_task_context(struct audit_buffer *ab)
{
char *ctx = NULL;
unsigned len;
int error;
u32 sid;
security_current_getsecid_subj(&sid);
if (!sid)
return 0;
error = security_secid_to_secctx(sid, &ctx, &len);
if (error) {
if (error != -EINVAL)
goto error_path;
return 0;
}
audit_log_format(ab, " subj=%s", ctx);
security_release_secctx(ctx, len);
return 0;
error_path:
audit_panic("error in audit_log_task_context");
return error;
}
EXPORT_SYMBOL(audit_log_task_context);
void audit_log_d_path_exe(struct audit_buffer *ab,
struct mm_struct *mm)
{
struct file *exe_file;
if (!mm)
goto out_null;
exe_file = get_mm_exe_file(mm);
if (!exe_file)
goto out_null;
audit_log_d_path(ab, " exe=", &exe_file->f_path);
fput(exe_file);
return;
out_null:
audit_log_format(ab, " exe=(null)");
}
struct tty_struct *audit_get_tty(void)
{
struct tty_struct *tty = NULL;
unsigned long flags;
spin_lock_irqsave(&current->sighand->siglock, flags);
if (current->signal)
tty = tty_kref_get(current->signal->tty);
spin_unlock_irqrestore(&current->sighand->siglock, flags);
return tty;
}
void audit_put_tty(struct tty_struct *tty)
{
tty_kref_put(tty);
}
void audit_log_task_info(struct audit_buffer *ab)
{
const struct cred *cred;
char comm[sizeof(current->comm)];
struct tty_struct *tty;
if (!ab)
return;
cred = current_cred();
tty = audit_get_tty();
audit_log_format(ab,
" ppid=%d pid=%d auid=%u uid=%u gid=%u"
" euid=%u suid=%u fsuid=%u"
" egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
task_ppid_nr(current),
task_tgid_nr(current),
from_kuid(&init_user_ns, audit_get_loginuid(current)),
from_kuid(&init_user_ns, cred->uid),
from_kgid(&init_user_ns, cred->gid),
from_kuid(&init_user_ns, cred->euid),
from_kuid(&init_user_ns, cred->suid),
from_kuid(&init_user_ns, cred->fsuid),
from_kgid(&init_user_ns, cred->egid),
from_kgid(&init_user_ns, cred->sgid),
from_kgid(&init_user_ns, cred->fsgid),
tty ? tty_name(tty) : "(none)",
audit_get_sessionid(current));
audit_put_tty(tty);
audit_log_format(ab, " comm=");
audit_log_untrustedstring(ab, get_task_comm(comm, current));
audit_log_d_path_exe(ab, current->mm);
audit_log_task_context(ab);
}
EXPORT_SYMBOL(audit_log_task_info);
/**
* audit_log_path_denied - report a path restriction denial
* @type: audit message type (AUDIT_ANOM_LINK, AUDIT_ANOM_CREAT, etc)
* @operation: specific operation name
*/
void audit_log_path_denied(int type, const char *operation)
{
struct audit_buffer *ab;
if (!audit_enabled || audit_dummy_context())
return;
/* Generate log with subject, operation, outcome. */
ab = audit_log_start(audit_context(), GFP_KERNEL, type);
if (!ab)
return;
audit_log_format(ab, "op=%s", operation);
audit_log_task_info(ab);
audit_log_format(ab, " res=0");
audit_log_end(ab);
}
/* global counter which is incremented every time something logs in */
static atomic_t session_id = ATOMIC_INIT(0);
static int audit_set_loginuid_perm(kuid_t loginuid)
{
/* if we are unset, we don't need privs */
if (!audit_loginuid_set(current))
return 0;
/* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
return -EPERM;
/* it is set, you need permission */
if (!capable(CAP_AUDIT_CONTROL))
return -EPERM;
/* reject if this is not an unset and we don't allow that */
if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID)
&& uid_valid(loginuid))
return -EPERM;
return 0;
}
static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
unsigned int oldsessionid,
unsigned int sessionid, int rc)
{
struct audit_buffer *ab;
uid_t uid, oldloginuid, loginuid;
struct tty_struct *tty;
if (!audit_enabled)
return;
ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_LOGIN);
if (!ab)
return;
uid = from_kuid(&init_user_ns, task_uid(current));
oldloginuid = from_kuid(&init_user_ns, koldloginuid);
loginuid = from_kuid(&init_user_ns, kloginuid);
tty = audit_get_tty();
audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
audit_log_task_context(ab);
audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
oldsessionid, sessionid, !rc);
audit_put_tty(tty);
audit_log_end(ab);
}
/**
* audit_set_loginuid - set current task's loginuid
* @loginuid: loginuid value
*
* Returns 0.
*
* Called (set) from fs/proc/base.c::proc_loginuid_write().
*/
int audit_set_loginuid(kuid_t loginuid)
{
unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET;
kuid_t oldloginuid;
int rc;
oldloginuid = audit_get_loginuid(current);
oldsessionid = audit_get_sessionid(current);
rc = audit_set_loginuid_perm(loginuid);
if (rc)
goto out;
/* are we setting or clearing? */
if (uid_valid(loginuid)) {
sessionid = (unsigned int)atomic_inc_return(&session_id);
if (unlikely(sessionid == AUDIT_SID_UNSET))
sessionid = (unsigned int)atomic_inc_return(&session_id);
}
current->sessionid = sessionid;
current->loginuid = loginuid;
out:
audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
return rc;
}
/**
* audit_signal_info - record signal info for shutting down audit subsystem
* @sig: signal value
* @t: task being signaled
*
* If the audit subsystem is being terminated, record the task (pid)
* and uid that is doing that.
*/
int audit_signal_info(int sig, struct task_struct *t)
{
kuid_t uid = current_uid(), auid;
if (auditd_test_task(t) &&
(sig == SIGTERM || sig == SIGHUP ||
sig == SIGUSR1 || sig == SIGUSR2)) {
audit_sig_pid = task_tgid_nr(current);
auid = audit_get_loginuid(current);
if (uid_valid(auid))
audit_sig_uid = auid;
else
audit_sig_uid = uid;
security_current_getsecid_subj(&audit_sig_sid);
}
return audit_signal_info_syscall(t);
}
/**
* audit_log_end - end one audit record
* @ab: the audit_buffer
*
* We can not do a netlink send inside an irq context because it blocks (last
* arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a
* queue and a kthread is scheduled to remove them from the queue outside the
* irq context. May be called in any context.
*/
void audit_log_end(struct audit_buffer *ab)
{
struct sk_buff *skb;
struct nlmsghdr *nlh;
if (!ab)
return;
if (audit_rate_check()) {
skb = ab->skb;
ab->skb = NULL;
/* setup the netlink header, see the comments in
* kauditd_send_multicast_skb() for length quirks */
nlh = nlmsg_hdr(skb);
nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;
/* queue the netlink packet and poke the kauditd thread */
skb_queue_tail(&audit_queue, skb);
wake_up_interruptible(&kauditd_wait);
} else
audit_log_lost("rate limit exceeded");
audit_buffer_free(ab);
}
/**
* audit_log - Log an audit record
* @ctx: audit context
* @gfp_mask: type of allocation
* @type: audit message type
* @fmt: format string to use
* @...: variable parameters matching the format string
*
* This is a convenience function that calls audit_log_start,
* audit_log_vformat, and audit_log_end. It may be called
* in any context.
*/
void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
const char *fmt, ...)
{
struct audit_buffer *ab;
va_list args;
ab = audit_log_start(ctx, gfp_mask, type);
if (ab) {
va_start(args, fmt);
audit_log_vformat(ab, fmt, args);
va_end(args);
audit_log_end(ab);
}
}
EXPORT_SYMBOL(audit_log_start);
EXPORT_SYMBOL(audit_log_end);
EXPORT_SYMBOL(audit_log_format);
EXPORT_SYMBOL(audit_log);
// SPDX-License-Identifier: GPL-2.0-or-later
/* audit_fsnotify.c -- tracking inodes
*
* Copyright 2003-2009,2014-2015 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
*/
#include <linux/kernel.h>
#include <linux/audit.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/fsnotify_backend.h>
#include <linux/namei.h>
#include <linux/netlink.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/security.h>
#include "audit.h"
/*
* this mark lives on the parent directory of the inode in question.
* but dev, ino, and path are about the child
*/
struct audit_fsnotify_mark {
dev_t dev; /* associated superblock device */
unsigned long ino; /* associated inode number */
char *path; /* insertion path */
struct fsnotify_mark mark; /* fsnotify mark on the inode */
struct audit_krule *rule;
};
/* fsnotify handle. */
static struct fsnotify_group *audit_fsnotify_group;
/* fsnotify events we care about. */
#define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
FS_MOVE_SELF)
static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark)
{
kfree(audit_mark->path);
kfree(audit_mark);
}
static void audit_fsnotify_free_mark(struct fsnotify_mark *mark)
{
struct audit_fsnotify_mark *audit_mark;
audit_mark = container_of(mark, struct audit_fsnotify_mark, mark);
audit_fsnotify_mark_free(audit_mark);
}
char *audit_mark_path(struct audit_fsnotify_mark *mark)
{
return mark->path;
}
int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev)
{
if (mark->ino == AUDIT_INO_UNSET)
return 0;
return (mark->ino == ino) && (mark->dev == dev);
}
static void audit_update_mark(struct audit_fsnotify_mark *audit_mark,
const struct inode *inode)
{
audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET;
audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET;
}
struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len)
{
struct audit_fsnotify_mark *audit_mark;
struct path path;
struct dentry *dentry;
struct inode *inode;
int ret;
if (pathname[0] != '/' || pathname[len-1] == '/')
return ERR_PTR(-EINVAL);
dentry = kern_path_locked(pathname, &path);
if (IS_ERR(dentry))
return ERR_CAST(dentry); /* returning an error */
inode = path.dentry->d_inode;
inode_unlock(inode);
audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL);
if (unlikely(!audit_mark)) {
audit_mark = ERR_PTR(-ENOMEM);
goto out;
}
fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_group);
audit_mark->mark.mask = AUDIT_FS_EVENTS;
audit_mark->path = pathname;
audit_update_mark(audit_mark, dentry->d_inode);
audit_mark->rule = krule;
ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, true);
if (ret < 0) {
fsnotify_put_mark(&audit_mark->mark);
audit_mark = ERR_PTR(ret);
}
out:
dput(dentry);
path_put(&path);
return audit_mark;
}
static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, char *op)
{
struct audit_buffer *ab;
struct audit_krule *rule = audit_mark->rule;
if (!audit_enabled)
return;
ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return;
audit_log_session_info(ab);
audit_log_format(ab, " op=%s path=", op);
audit_log_untrustedstring(ab, audit_mark->path);
audit_log_key(ab, rule->filterkey);
audit_log_format(ab, " list=%d res=1", rule->listnr);
audit_log_end(ab);
}
void audit_remove_mark(struct audit_fsnotify_mark *audit_mark)
{
fsnotify_destroy_mark(&audit_mark->mark, audit_fsnotify_group);
fsnotify_put_mark(&audit_mark->mark);
}
void audit_remove_mark_rule(struct audit_krule *krule)
{
struct audit_fsnotify_mark *mark = krule->exe;
audit_remove_mark(mark);
}
static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark)
{
struct audit_krule *rule = audit_mark->rule;
struct audit_entry *entry = container_of(rule, struct audit_entry, rule);
audit_mark_log_rule_change(audit_mark, "autoremove_rule");
audit_del_rule(entry);
}
/* Update mark data in audit rules based on fsnotify events. */
static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
struct inode *inode, struct inode *dir,
const struct qstr *dname, u32 cookie)
{
struct audit_fsnotify_mark *audit_mark;
audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group))
return 0;
if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL))
return 0;
audit_update_mark(audit_mark, inode);
} else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) {
audit_autoremove_mark_rule(audit_mark);
}
return 0;
}
static const struct fsnotify_ops audit_mark_fsnotify_ops = {
.handle_inode_event = audit_mark_handle_event,
.free_mark = audit_fsnotify_free_mark,
};
static int __init audit_fsnotify_init(void)
{
audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops);
if (IS_ERR(audit_fsnotify_group)) {
audit_fsnotify_group = NULL;
audit_panic("cannot create audit fsnotify group");
}
return 0;
}
device_initcall(audit_fsnotify_init);
// SPDX-License-Identifier: GPL-2.0
#include "audit.h"
#include <linux/fsnotify_backend.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/kthread.h>
#include <linux/refcount.h>
#include <linux/slab.h>
struct audit_tree;
struct audit_chunk;
struct audit_tree {
refcount_t count;
int goner;
struct audit_chunk *root;
struct list_head chunks;
struct list_head rules;
struct list_head list;
struct list_head same_root;
struct rcu_head head;
char pathname[];
};
struct audit_chunk {
struct list_head hash;
unsigned long key;
struct fsnotify_mark *mark;
struct list_head trees; /* with root here */
int count;
atomic_long_t refs;
struct rcu_head head;
struct audit_node {
struct list_head list;
struct audit_tree *owner;
unsigned index; /* index; upper bit indicates 'will prune' */
} owners[];
};
struct audit_tree_mark {
struct fsnotify_mark mark;
struct audit_chunk *chunk;
};
static LIST_HEAD(tree_list);
static LIST_HEAD(prune_list);
static struct task_struct *prune_thread;
/*
* One struct chunk is attached to each inode of interest through
* audit_tree_mark (fsnotify mark). We replace struct chunk on tagging /
* untagging, the mark is stable as long as there is chunk attached. The
* association between mark and chunk is protected by hash_lock and
* audit_tree_group->mark_mutex. Thus as long as we hold
* audit_tree_group->mark_mutex and check that the mark is alive by
* FSNOTIFY_MARK_FLAG_ATTACHED flag check, we are sure the mark points to
* the current chunk.
*
* Rules have pointer to struct audit_tree.
* Rules have struct list_head rlist forming a list of rules over
* the same tree.
* References to struct chunk are collected at audit_inode{,_child}()
* time and used in AUDIT_TREE rule matching.
* These references are dropped at the same time we are calling
* audit_free_names(), etc.
*
* Cyclic lists galore:
* tree.chunks anchors chunk.owners[].list hash_lock
* tree.rules anchors rule.rlist audit_filter_mutex
* chunk.trees anchors tree.same_root hash_lock
* chunk.hash is a hash with middle bits of watch.inode as
* a hash function. RCU, hash_lock
*
* tree is refcounted; one reference for "some rules on rules_list refer to
* it", one for each chunk with pointer to it.
*
* chunk is refcounted by embedded .refs. Mark associated with the chunk holds
* one chunk reference. This reference is dropped either when a mark is going
* to be freed (corresponding inode goes away) or when chunk attached to the
* mark gets replaced. This reference must be dropped using
* audit_mark_put_chunk() to make sure the reference is dropped only after RCU
* grace period as it protects RCU readers of the hash table.
*
* node.index allows to get from node.list to containing chunk.
* MSB of that sucker is stolen to mark taggings that we might have to
* revert - several operations have very unpleasant cleanup logics and
* that makes a difference. Some.
*/
static struct fsnotify_group *audit_tree_group;
static struct kmem_cache *audit_tree_mark_cachep __read_mostly;
static struct audit_tree *alloc_tree(const char *s)
{
struct audit_tree *tree;
tree = kmalloc(struct_size(tree, pathname, strlen(s) + 1), GFP_KERNEL);
if (tree) {
refcount_set(&tree->count, 1);
tree->goner = 0;
INIT_LIST_HEAD(&tree->chunks);
INIT_LIST_HEAD(&tree->rules);
INIT_LIST_HEAD(&tree->list);
INIT_LIST_HEAD(&tree->same_root);
tree->root = NULL;
strcpy(tree->pathname, s);
}
return tree;
}
static inline void get_tree(struct audit_tree *tree)
{
refcount_inc(&tree->count);
}
static inline void put_tree(struct audit_tree *tree)
{
if (refcount_dec_and_test(&tree->count))
kfree_rcu(tree, head);
}
/* to avoid bringing the entire thing in audit.h */
const char *audit_tree_path(struct audit_tree *tree)
{
return tree->pathname;
}
static void free_chunk(struct audit_chunk *chunk)
{
int i;
for (i = 0; i < chunk->count; i++) {
if (chunk->owners[i].owner)
put_tree(chunk->owners[i].owner);
}
kfree(chunk);
}
void audit_put_chunk(struct audit_chunk *chunk)
{
if (atomic_long_dec_and_test(&chunk->refs))
free_chunk(chunk);
}
static void __put_chunk(struct rcu_head *rcu)
{
struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
audit_put_chunk(chunk);
}
/*
* Drop reference to the chunk that was held by the mark. This is the reference
* that gets dropped after we've removed the chunk from the hash table and we
* use it to make sure chunk cannot be freed before RCU grace period expires.
*/
static void audit_mark_put_chunk(struct audit_chunk *chunk)
{
call_rcu(&chunk->head, __put_chunk);
}
static inline struct audit_tree_mark *audit_mark(struct fsnotify_mark *mark)
{
return container_of(mark, struct audit_tree_mark, mark);
}
static struct audit_chunk *mark_chunk(struct fsnotify_mark *mark)
{
return audit_mark(mark)->chunk;
}
static void audit_tree_destroy_watch(struct fsnotify_mark *mark)
{
kmem_cache_free(audit_tree_mark_cachep, audit_mark(mark));
}
static struct fsnotify_mark *alloc_mark(void)
{
struct audit_tree_mark *amark;
amark = kmem_cache_zalloc(audit_tree_mark_cachep, GFP_KERNEL);
if (!amark)
return NULL;
fsnotify_init_mark(&amark->mark, audit_tree_group);
amark->mark.mask = FS_IN_IGNORED;
return &amark->mark;
}
static struct audit_chunk *alloc_chunk(int count)
{
struct audit_chunk *chunk;
int i;
chunk = kzalloc(struct_size(chunk, owners, count), GFP_KERNEL);
if (!chunk)
return NULL;
INIT_LIST_HEAD(&chunk->hash);
INIT_LIST_HEAD(&chunk->trees);
chunk->count = count;
atomic_long_set(&chunk->refs, 1);
for (i = 0; i < count; i++) {
INIT_LIST_HEAD(&chunk->owners[i].list);
chunk->owners[i].index = i;
}
return chunk;
}
enum {HASH_SIZE = 128};
static struct list_head chunk_hash_heads[HASH_SIZE];
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
/* Function to return search key in our hash from inode. */
static unsigned long inode_to_key(const struct inode *inode)
{
/* Use address pointed to by connector->obj as the key */
return (unsigned long)&inode->i_fsnotify_marks;
}
static inline struct list_head *chunk_hash(unsigned long key)
{
unsigned long n = key / L1_CACHE_BYTES;
return chunk_hash_heads + n % HASH_SIZE;
}
/* hash_lock & mark->group->mark_mutex is held by caller */
static void insert_hash(struct audit_chunk *chunk)
{
struct list_head *list;
/*
* Make sure chunk is fully initialized before making it visible in the
* hash. Pairs with a data dependency barrier in READ_ONCE() in
* audit_tree_lookup().
*/
smp_wmb();
WARN_ON_ONCE(!chunk->key);
list = chunk_hash(chunk->key);
list_add_rcu(&chunk->hash, list);
}
/* called under rcu_read_lock */
struct audit_chunk *audit_tree_lookup(const struct inode *inode)
{
unsigned long key = inode_to_key(inode);
struct list_head *list = chunk_hash(key);
struct audit_chunk *p;
list_for_each_entry_rcu(p, list, hash) {
/*
* We use a data dependency barrier in READ_ONCE() to make sure
* the chunk we see is fully initialized.
*/
if (READ_ONCE(p->key) == key) {
atomic_long_inc(&p->refs);
return p;
}
}
return NULL;
}
bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
{
int n;
for (n = 0; n < chunk->count; n++)
if (chunk->owners[n].owner == tree)
return true;
return false;
}
/* tagging and untagging inodes with trees */
static struct audit_chunk *find_chunk(struct audit_node *p)
{
int index = p->index & ~(1U<<31);
p -= index;
return container_of(p, struct audit_chunk, owners[0]);
}
static void replace_mark_chunk(struct fsnotify_mark *mark,
struct audit_chunk *chunk)
{
struct audit_chunk *old;
assert_spin_locked(&hash_lock);
old = mark_chunk(mark);
audit_mark(mark)->chunk = chunk;
if (chunk)
chunk->mark = mark;
if (old)
old->mark = NULL;
}
static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old)
{
struct audit_tree *owner;
int i, j;
new->key = old->key;
list_splice_init(&old->trees, &new->trees);
list_for_each_entry(owner, &new->trees, same_root)
owner->root = new;
for (i = j = 0; j < old->count; i++, j++) {
if (!old->owners[j].owner) {
i--;
continue;
}
owner = old->owners[j].owner;
new->owners[i].owner = owner;
new->owners[i].index = old->owners[j].index - j + i;
if (!owner) /* result of earlier fallback */
continue;
get_tree(owner);
list_replace_init(&old->owners[j].list, &new->owners[i].list);
}
replace_mark_chunk(old->mark, new);
/*
* Make sure chunk is fully initialized before making it visible in the
* hash. Pairs with a data dependency barrier in READ_ONCE() in
* audit_tree_lookup().
*/
smp_wmb();
list_replace_rcu(&old->hash, &new->hash);
}
static void remove_chunk_node(struct audit_chunk *chunk, struct audit_node *p)
{
struct audit_tree *owner = p->owner;
if (owner->root == chunk) {
list_del_init(&owner->same_root);
owner->root = NULL;
}
list_del_init(&p->list);
p->owner = NULL;
put_tree(owner);
}
static int chunk_count_trees(struct audit_chunk *chunk)
{
int i;
int ret = 0;
for (i = 0; i < chunk->count; i++)
if (chunk->owners[i].owner)
ret++;
return ret;
}
static void untag_chunk(struct audit_chunk *chunk, struct fsnotify_mark *mark)
{
struct audit_chunk *new;
int size;
mutex_lock(&audit_tree_group->mark_mutex);
/*
* mark_mutex stabilizes chunk attached to the mark so we can check
* whether it didn't change while we've dropped hash_lock.
*/
if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) ||
mark_chunk(mark) != chunk)
goto out_mutex;
size = chunk_count_trees(chunk);
if (!size) {
spin_lock(&hash_lock);
list_del_init(&chunk->trees);
list_del_rcu(&chunk->hash);
replace_mark_chunk(mark, NULL);
spin_unlock(&hash_lock);
fsnotify_detach_mark(mark);
mutex_unlock(&audit_tree_group->mark_mutex);
audit_mark_put_chunk(chunk);
fsnotify_free_mark(mark);
return;
}
new = alloc_chunk(size);
if (!new)
goto out_mutex;
spin_lock(&hash_lock);
/*
* This has to go last when updating chunk as once replace_chunk() is
* called, new RCU readers can see the new chunk.
*/
replace_chunk(new, chunk);
spin_unlock(&hash_lock);
mutex_unlock(&audit_tree_group->mark_mutex);
audit_mark_put_chunk(chunk);
return;
out_mutex:
mutex_unlock(&audit_tree_group->mark_mutex);
}
/* Call with group->mark_mutex held, releases it */
static int create_chunk(struct inode *inode, struct audit_tree *tree)
{
struct fsnotify_mark *mark;
struct audit_chunk *chunk = alloc_chunk(1);
if (!chunk) {
mutex_unlock(&audit_tree_group->mark_mutex);
return -ENOMEM;
}
mark = alloc_mark();
if (!mark) {
mutex_unlock(&audit_tree_group->mark_mutex);
kfree(chunk);
return -ENOMEM;
}
if (fsnotify_add_inode_mark_locked(mark, inode, 0)) {
mutex_unlock(&audit_tree_group->mark_mutex);
fsnotify_put_mark(mark);
kfree(chunk);
return -ENOSPC;
}
spin_lock(&hash_lock);
if (tree->goner) {
spin_unlock(&hash_lock);
fsnotify_detach_mark(mark);
mutex_unlock(&audit_tree_group->mark_mutex);
fsnotify_free_mark(mark);
fsnotify_put_mark(mark);
kfree(chunk);
return 0;
}
replace_mark_chunk(mark, chunk);
chunk->owners[0].index = (1U << 31);
chunk->owners[0].owner = tree;
get_tree(tree);
list_add(&chunk->owners[0].list, &tree->chunks);
if (!tree->root) {
tree->root = chunk;
list_add(&tree->same_root, &chunk->trees);
}
chunk->key = inode_to_key(inode);
/*
* Inserting into the hash table has to go last as once we do that RCU
* readers can see the chunk.
*/
insert_hash(chunk);
spin_unlock(&hash_lock);
mutex_unlock(&audit_tree_group->mark_mutex);
/*
* Drop our initial reference. When mark we point to is getting freed,
* we get notification through ->freeing_mark callback and cleanup
* chunk pointing to this mark.
*/
fsnotify_put_mark(mark);
return 0;
}
/* the first tagged inode becomes root of tree */
static int tag_chunk(struct inode *inode, struct audit_tree *tree)
{
struct fsnotify_mark *mark;
struct audit_chunk *chunk, *old;
struct audit_node *p;
int n;
mutex_lock(&audit_tree_group->mark_mutex);
mark = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group);
if (!mark)
return create_chunk(inode, tree);
/*
* Found mark is guaranteed to be attached and mark_mutex protects mark
* from getting detached and thus it makes sure there is chunk attached
* to the mark.
*/
/* are we already there? */
spin_lock(&hash_lock);
old = mark_chunk(mark);
for (n = 0; n < old->count; n++) {
if (old->owners[n].owner == tree) {
spin_unlock(&hash_lock);
mutex_unlock(&audit_tree_group->mark_mutex);
fsnotify_put_mark(mark);
return 0;
}
}
spin_unlock(&hash_lock);
chunk = alloc_chunk(old->count + 1);
if (!chunk) {
mutex_unlock(&audit_tree_group->mark_mutex);
fsnotify_put_mark(mark);
return -ENOMEM;
}
spin_lock(&hash_lock);
if (tree->goner) {
spin_unlock(&hash_lock);
mutex_unlock(&audit_tree_group->mark_mutex);
fsnotify_put_mark(mark);
kfree(chunk);
return 0;
}
p = &chunk->owners[chunk->count - 1];
p->index = (chunk->count - 1) | (1U<<31);
p->owner = tree;
get_tree(tree);
list_add(&p->list, &tree->chunks);
if (!tree->root) {
tree->root = chunk;
list_add(&tree->same_root, &chunk->trees);
}
/*
* This has to go last when updating chunk as once replace_chunk() is
* called, new RCU readers can see the new chunk.
*/
replace_chunk(chunk, old);
spin_unlock(&hash_lock);
mutex_unlock(&audit_tree_group->mark_mutex);
fsnotify_put_mark(mark); /* pair to fsnotify_find_mark */
audit_mark_put_chunk(old);
return 0;
}
static void audit_tree_log_remove_rule(struct audit_context *context,
struct audit_krule *rule)
{
struct audit_buffer *ab;
if (!audit_enabled)
return;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return;
audit_log_format(ab, "op=remove_rule dir=");
audit_log_untrustedstring(ab, rule->tree->pathname);
audit_log_key(ab, rule->filterkey);
audit_log_format(ab, " list=%d res=1", rule->listnr);
audit_log_end(ab);
}
static void kill_rules(struct audit_context *context, struct audit_tree *tree)
{
struct audit_krule *rule, *next;
struct audit_entry *entry;
list_for_each_entry_safe(rule, next, &tree->rules, rlist) {
entry = container_of(rule, struct audit_entry, rule);
list_del_init(&rule->rlist);
if (rule->tree) {
/* not a half-baked one */
audit_tree_log_remove_rule(context, rule);
if (entry->rule.exe)
audit_remove_mark(entry->rule.exe);
rule->tree = NULL;
list_del_rcu(&entry->list);
list_del(&entry->rule.list);
call_rcu(&entry->rcu, audit_free_rule_rcu);
}
}
}
/*
* Remove tree from chunks. If 'tagged' is set, remove tree only from tagged
* chunks. The function expects tagged chunks are all at the beginning of the
* chunks list.
*/
static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
{
spin_lock(&hash_lock);
while (!list_empty(&victim->chunks)) {
struct audit_node *p;
struct audit_chunk *chunk;
struct fsnotify_mark *mark;
p = list_first_entry(&victim->chunks, struct audit_node, list);
/* have we run out of marked? */
if (tagged && !(p->index & (1U<<31)))
break;
chunk = find_chunk(p);
mark = chunk->mark;
remove_chunk_node(chunk, p);
/* Racing with audit_tree_freeing_mark()? */
if (!mark)
continue;
fsnotify_get_mark(mark);
spin_unlock(&hash_lock);
untag_chunk(chunk, mark);
fsnotify_put_mark(mark);
spin_lock(&hash_lock);
}
spin_unlock(&hash_lock);
}
/*
* finish killing struct audit_tree
*/
static void prune_one(struct audit_tree *victim)
{
prune_tree_chunks(victim, false);
put_tree(victim);
}
/* trim the uncommitted chunks from tree */
static void trim_marked(struct audit_tree *tree)
{
struct list_head *p, *q;
spin_lock(&hash_lock);
if (tree->goner) {
spin_unlock(&hash_lock);
return;
}
/* reorder */
for (p = tree->chunks.next; p != &tree->chunks; p = q) {
struct audit_node *node = list_entry(p, struct audit_node, list);
q = p->next;
if (node->index & (1U<<31)) {
list_del_init(p);
list_add(p, &tree->chunks);
}
}
spin_unlock(&hash_lock);
prune_tree_chunks(tree, true);
spin_lock(&hash_lock);
if (!tree->root && !tree->goner) {
tree->goner = 1;
spin_unlock(&hash_lock);
mutex_lock(&audit_filter_mutex);
kill_rules(audit_context(), tree);
list_del_init(&tree->list);
mutex_unlock(&audit_filter_mutex);
prune_one(tree);
} else {
spin_unlock(&hash_lock);
}
}
static void audit_schedule_prune(void);
/* called with audit_filter_mutex */
int audit_remove_tree_rule(struct audit_krule *rule)
{
struct audit_tree *tree;
tree = rule->tree;
if (tree) {
spin_lock(&hash_lock);
list_del_init(&rule->rlist);
if (list_empty(&tree->rules) && !tree->goner) {
tree->root = NULL;
list_del_init(&tree->same_root);
tree->goner = 1;
list_move(&tree->list, &prune_list);
rule->tree = NULL;
spin_unlock(&hash_lock);
audit_schedule_prune();
return 1;
}
rule->tree = NULL;
spin_unlock(&hash_lock);
return 1;
}
return 0;
}
static int compare_root(struct vfsmount *mnt, void *arg)
{
return inode_to_key(d_backing_inode(mnt->mnt_root)) ==
(unsigned long)arg;
}
void audit_trim_trees(void)
{
struct list_head cursor;
mutex_lock(&audit_filter_mutex);
list_add(&cursor, &tree_list);
while (cursor.next != &tree_list) {
struct audit_tree *tree;
struct path path;
struct vfsmount *root_mnt;
struct audit_node *node;
int err;
tree = container_of(cursor.next, struct audit_tree, list);
get_tree(tree);
list_move(&cursor, &tree->list);
mutex_unlock(&audit_filter_mutex);
err = kern_path(tree->pathname, 0, &path);
if (err)
goto skip_it;
root_mnt = collect_mounts(&path);
path_put(&path);
if (IS_ERR(root_mnt))
goto skip_it;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list) {
struct audit_chunk *chunk = find_chunk(node);
/* this could be NULL if the watch is dying else where... */
node->index |= 1U<<31;
if (iterate_mounts(compare_root,
(void *)(chunk->key),
root_mnt))
node->index &= ~(1U<<31);
}
spin_unlock(&hash_lock);
trim_marked(tree);
drop_collected_mounts(root_mnt);
skip_it:
put_tree(tree);
mutex_lock(&audit_filter_mutex);
}
list_del(&cursor);
mutex_unlock(&audit_filter_mutex);
}
int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
{
if (pathname[0] != '/' ||
(rule->listnr != AUDIT_FILTER_EXIT &&
rule->listnr != AUDIT_FILTER_URING_EXIT) ||
op != Audit_equal ||
rule->inode_f || rule->watch || rule->tree)
return -EINVAL;
rule->tree = alloc_tree(pathname);
if (!rule->tree)
return -ENOMEM;
return 0;
}
void audit_put_tree(struct audit_tree *tree)
{
put_tree(tree);
}
static int tag_mount(struct vfsmount *mnt, void *arg)
{
return tag_chunk(d_backing_inode(mnt->mnt_root), arg);
}
/*
* That gets run when evict_chunk() ends up needing to kill audit_tree.
* Runs from a separate thread.
*/
static int prune_tree_thread(void *unused)
{
for (;;) {
if (list_empty(&prune_list)) {
set_current_state(TASK_INTERRUPTIBLE);
schedule();
}
audit_ctl_lock();
mutex_lock(&audit_filter_mutex);
while (!list_empty(&prune_list)) {
struct audit_tree *victim;
victim = list_entry(prune_list.next,
struct audit_tree, list);
list_del_init(&victim->list);
mutex_unlock(&audit_filter_mutex);
prune_one(victim);
mutex_lock(&audit_filter_mutex);
}
mutex_unlock(&audit_filter_mutex);
audit_ctl_unlock();
}
return 0;
}
static int audit_launch_prune(void)
{
if (prune_thread)
return 0;
prune_thread = kthread_run(prune_tree_thread, NULL,
"audit_prune_tree");
if (IS_ERR(prune_thread)) {
pr_err("cannot start thread audit_prune_tree");
prune_thread = NULL;
return -ENOMEM;
}
return 0;
}
/* called with audit_filter_mutex */
int audit_add_tree_rule(struct audit_krule *rule)
{
struct audit_tree *seed = rule->tree, *tree;
struct path path;
struct vfsmount *mnt;
int err;
rule->tree = NULL;
list_for_each_entry(tree, &tree_list, list) {
if (!strcmp(seed->pathname, tree->pathname)) {
put_tree(seed);
rule->tree = tree;
list_add(&rule->rlist, &tree->rules);
return 0;
}
}
tree = seed;
list_add(&tree->list, &tree_list);
list_add(&rule->rlist, &tree->rules);
/* do not set rule->tree yet */
mutex_unlock(&audit_filter_mutex);
if (unlikely(!prune_thread)) {
err = audit_launch_prune();
if (err)
goto Err;
}
err = kern_path(tree->pathname, 0, &path);
if (err)
goto Err;
mnt = collect_mounts(&path);
path_put(&path);
if (IS_ERR(mnt)) {
err = PTR_ERR(mnt);
goto Err;
}
get_tree(tree);
err = iterate_mounts(tag_mount, tree, mnt);
drop_collected_mounts(mnt);
if (!err) {
struct audit_node *node;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list)
node->index &= ~(1U<<31);
spin_unlock(&hash_lock);
} else {
trim_marked(tree);
goto Err;
}
mutex_lock(&audit_filter_mutex);
if (list_empty(&rule->rlist)) {
put_tree(tree);
return -ENOENT;
}
rule->tree = tree;
put_tree(tree);
return 0;
Err:
mutex_lock(&audit_filter_mutex);
list_del_init(&tree->list);
list_del_init(&tree->rules);
put_tree(tree);
return err;
}
int audit_tag_tree(char *old, char *new)
{
struct list_head cursor, barrier;
int failed = 0;
struct path path1, path2;
struct vfsmount *tagged;
int err;
err = kern_path(new, 0, &path2);
if (err)
return err;
tagged = collect_mounts(&path2);
path_put(&path2);
if (IS_ERR(tagged))
return PTR_ERR(tagged);
err = kern_path(old, 0, &path1);
if (err) {
drop_collected_mounts(tagged);
return err;
}
mutex_lock(&audit_filter_mutex);
list_add(&barrier, &tree_list);
list_add(&cursor, &barrier);
while (cursor.next != &tree_list) {
struct audit_tree *tree;
int good_one = 0;
tree = container_of(cursor.next, struct audit_tree, list);
get_tree(tree);
list_move(&cursor, &tree->list);
mutex_unlock(&audit_filter_mutex);
err = kern_path(tree->pathname, 0, &path2);
if (!err) {
good_one = path_is_under(&path1, &path2);
path_put(&path2);
}
if (!good_one) {
put_tree(tree);
mutex_lock(&audit_filter_mutex);
continue;
}
failed = iterate_mounts(tag_mount, tree, tagged);
if (failed) {
put_tree(tree);
mutex_lock(&audit_filter_mutex);
break;
}
mutex_lock(&audit_filter_mutex);
spin_lock(&hash_lock);
if (!tree->goner) {
list_move(&tree->list, &tree_list);
}
spin_unlock(&hash_lock);
put_tree(tree);
}
while (barrier.prev != &tree_list) {
struct audit_tree *tree;
tree = container_of(barrier.prev, struct audit_tree, list);
get_tree(tree);
list_move(&tree->list, &barrier);
mutex_unlock(&audit_filter_mutex);
if (!failed) {
struct audit_node *node;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list)
node->index &= ~(1U<<31);
spin_unlock(&hash_lock);
} else {
trim_marked(tree);
}
put_tree(tree);
mutex_lock(&audit_filter_mutex);
}
list_del(&barrier);
list_del(&cursor);
mutex_unlock(&audit_filter_mutex);
path_put(&path1);
drop_collected_mounts(tagged);
return failed;
}
static void audit_schedule_prune(void)
{
wake_up_process(prune_thread);
}
/*
* ... and that one is done if evict_chunk() decides to delay until the end
* of syscall. Runs synchronously.
*/
void audit_kill_trees(struct audit_context *context)
{
struct list_head *list = &context->killed_trees;
audit_ctl_lock();
mutex_lock(&audit_filter_mutex);
while (!list_empty(list)) {
struct audit_tree *victim;
victim = list_entry(list->next, struct audit_tree, list);
kill_rules(context, victim);
list_del_init(&victim->list);
mutex_unlock(&audit_filter_mutex);
prune_one(victim);
mutex_lock(&audit_filter_mutex);
}
mutex_unlock(&audit_filter_mutex);
audit_ctl_unlock();
}
/*
* Here comes the stuff asynchronous to auditctl operations
*/
static void evict_chunk(struct audit_chunk *chunk)
{
struct audit_tree *owner;
struct list_head *postponed = audit_killed_trees();
int need_prune = 0;
int n;
mutex_lock(&audit_filter_mutex);
spin_lock(&hash_lock);
while (!list_empty(&chunk->trees)) {
owner = list_entry(chunk->trees.next,
struct audit_tree, same_root);
owner->goner = 1;
owner->root = NULL;
list_del_init(&owner->same_root);
spin_unlock(&hash_lock);
if (!postponed) {
kill_rules(audit_context(), owner);
list_move(&owner->list, &prune_list);
need_prune = 1;
} else {
list_move(&owner->list, postponed);
}
spin_lock(&hash_lock);
}
list_del_rcu(&chunk->hash);
for (n = 0; n < chunk->count; n++)
list_del_init(&chunk->owners[n].list);
spin_unlock(&hash_lock);
mutex_unlock(&audit_filter_mutex);
if (need_prune)
audit_schedule_prune();
}
static int audit_tree_handle_event(struct fsnotify_mark *mark, u32 mask,
struct inode *inode, struct inode *dir,
const struct qstr *file_name, u32 cookie)
{
return 0;
}
static void audit_tree_freeing_mark(struct fsnotify_mark *mark,
struct fsnotify_group *group)
{
struct audit_chunk *chunk;
mutex_lock(&mark->group->mark_mutex);
spin_lock(&hash_lock);
chunk = mark_chunk(mark);
replace_mark_chunk(mark, NULL);
spin_unlock(&hash_lock);
mutex_unlock(&mark->group->mark_mutex);
if (chunk) {
evict_chunk(chunk);
audit_mark_put_chunk(chunk);
}
/*
* We are guaranteed to have at least one reference to the mark from
* either the inode or the caller of fsnotify_destroy_mark().
*/
BUG_ON(refcount_read(&mark->refcnt) < 1);
}
static const struct fsnotify_ops audit_tree_ops = {
.handle_inode_event = audit_tree_handle_event,
.freeing_mark = audit_tree_freeing_mark,
.free_mark = audit_tree_destroy_watch,
};
static int __init audit_tree_init(void)
{
int i;
audit_tree_mark_cachep = KMEM_CACHE(audit_tree_mark, SLAB_PANIC);
audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
if (IS_ERR(audit_tree_group))
audit_panic("cannot initialize fsnotify group for rectree watches");
for (i = 0; i < HASH_SIZE; i++)
INIT_LIST_HEAD(&chunk_hash_heads[i]);
return 0;
}
__initcall(audit_tree_init);
// SPDX-License-Identifier: GPL-2.0-or-later
/* audit_watch.c -- watching inodes
*
* Copyright 2003-2009 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
*/
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/audit.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/fsnotify_backend.h>
#include <linux/namei.h>
#include <linux/netlink.h>
#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/security.h>
#include "audit.h"
/*
* Reference counting:
*
* audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED
* event. Each audit_watch holds a reference to its associated parent.
*
* audit_watch: if added to lists, lifetime is from audit_init_watch() to
* audit_remove_watch(). Additionally, an audit_watch may exist
* temporarily to assist in searching existing filter data. Each
* audit_krule holds a reference to its associated watch.
*/
struct audit_watch {
refcount_t count; /* reference count */
dev_t dev; /* associated superblock device */
char *path; /* insertion path */
unsigned long ino; /* associated inode number */
struct audit_parent *parent; /* associated parent */
struct list_head wlist; /* entry in parent->watches list */
struct list_head rules; /* anchor for krule->rlist */
};
struct audit_parent {
struct list_head watches; /* anchor for audit_watch->wlist */
struct fsnotify_mark mark; /* fsnotify mark on the inode */
};
/* fsnotify handle. */
static struct fsnotify_group *audit_watch_group;
/* fsnotify events we care about. */
#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
FS_MOVE_SELF | FS_UNMOUNT)
static void audit_free_parent(struct audit_parent *parent)
{
WARN_ON(!list_empty(&parent->watches));
kfree(parent);
}
static void audit_watch_free_mark(struct fsnotify_mark *entry)
{
struct audit_parent *parent;
parent = container_of(entry, struct audit_parent, mark);
audit_free_parent(parent);
}
static void audit_get_parent(struct audit_parent *parent)
{
if (likely(parent))
fsnotify_get_mark(&parent->mark);
}
static void audit_put_parent(struct audit_parent *parent)
{
if (likely(parent))
fsnotify_put_mark(&parent->mark);
}
/*
* Find and return the audit_parent on the given inode. If found a reference
* is taken on this parent.
*/
static inline struct audit_parent *audit_find_parent(struct inode *inode)
{
struct audit_parent *parent = NULL;
struct fsnotify_mark *entry;
entry = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_watch_group);
if (entry)
parent = container_of(entry, struct audit_parent, mark);
return parent;
}
void audit_get_watch(struct audit_watch *watch)
{
refcount_inc(&watch->count);
}
void audit_put_watch(struct audit_watch *watch)
{
if (refcount_dec_and_test(&watch->count)) {
WARN_ON(watch->parent);
WARN_ON(!list_empty(&watch->rules));
kfree(watch->path);
kfree(watch);
}
}
static void audit_remove_watch(struct audit_watch *watch)
{
list_del(&watch->wlist);
audit_put_parent(watch->parent);
watch->parent = NULL;
audit_put_watch(watch); /* match initial get */
}
char *audit_watch_path(struct audit_watch *watch)
{
return watch->path;
}
int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
{
return (watch->ino != AUDIT_INO_UNSET) &&
(watch->ino == ino) &&
(watch->dev == dev);
}
/* Initialize a parent watch entry. */
static struct audit_parent *audit_init_parent(struct path *path)
{
struct inode *inode = d_backing_inode(path->dentry);
struct audit_parent *parent;
int ret;
parent = kzalloc(sizeof(*parent), GFP_KERNEL);
if (unlikely(!parent))
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&parent->watches);
fsnotify_init_mark(&parent->mark, audit_watch_group);
parent->mark.mask = AUDIT_FS_WATCH;
ret = fsnotify_add_inode_mark(&parent->mark, inode, 0);
if (ret < 0) {
audit_free_parent(parent);
return ERR_PTR(ret);
}
return parent;
}
/* Initialize a watch entry. */
static struct audit_watch *audit_init_watch(char *path)
{
struct audit_watch *watch;
watch = kzalloc(sizeof(*watch), GFP_KERNEL);
if (unlikely(!watch))
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&watch->rules);
refcount_set(&watch->count, 1);
watch->path = path;
watch->dev = AUDIT_DEV_UNSET;
watch->ino = AUDIT_INO_UNSET;
return watch;
}
/* Translate a watch string to kernel representation. */
int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
{
struct audit_watch *watch;
if (!audit_watch_group)
return -EOPNOTSUPP;
if (path[0] != '/' || path[len-1] == '/' ||
(krule->listnr != AUDIT_FILTER_EXIT &&
krule->listnr != AUDIT_FILTER_URING_EXIT) ||
op != Audit_equal ||
krule->inode_f || krule->watch || krule->tree)
return -EINVAL;
watch = audit_init_watch(path);
if (IS_ERR(watch))
return PTR_ERR(watch);
krule->watch = watch;
return 0;
}
/* Duplicate the given audit watch. The new watch's rules list is initialized
* to an empty list and wlist is undefined. */
static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
{
char *path;
struct audit_watch *new;
path = kstrdup(old->path, GFP_KERNEL);
if (unlikely(!path))
return ERR_PTR(-ENOMEM);
new = audit_init_watch(path);
if (IS_ERR(new)) {
kfree(path);
goto out;
}
new->dev = old->dev;
new->ino = old->ino;
audit_get_parent(old->parent);
new->parent = old->parent;
out:
return new;
}
static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
{
struct audit_buffer *ab;
if (!audit_enabled)
return;
ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE);
if (!ab)
return;
audit_log_session_info(ab);
audit_log_format(ab, "op=%s path=", op);
audit_log_untrustedstring(ab, w->path);
audit_log_key(ab, r->filterkey);
audit_log_format(ab, " list=%d res=1", r->listnr);
audit_log_end(ab);
}
/* Update inode info in audit rules based on filesystem event. */
static void audit_update_watch(struct audit_parent *parent,
const struct qstr *dname, dev_t dev,
unsigned long ino, unsigned invalidating)
{
struct audit_watch *owatch, *nwatch, *nextw;
struct audit_krule *r, *nextr;
struct audit_entry *oentry, *nentry;
mutex_lock(&audit_filter_mutex);
/* Run all of the watches on this parent looking for the one that
* matches the given dname */
list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
if (audit_compare_dname_path(dname, owatch->path,
AUDIT_NAME_FULL))
continue;
/* If the update involves invalidating rules, do the inode-based
* filtering now, so we don't omit records. */
if (invalidating && !audit_dummy_context())
audit_filter_inodes(current, audit_context());
/* updating ino will likely change which audit_hash_list we
* are on so we need a new watch for the new list */
nwatch = audit_dupe_watch(owatch);
if (IS_ERR(nwatch)) {
mutex_unlock(&audit_filter_mutex);
audit_panic("error updating watch, skipping");
return;
}
nwatch->dev = dev;
nwatch->ino = ino;
list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
oentry = container_of(r, struct audit_entry, rule);
list_del(&oentry->rule.rlist);
list_del_rcu(&oentry->list);
nentry = audit_dupe_rule(&oentry->rule);
if (IS_ERR(nentry)) {
list_del(&oentry->rule.list);
audit_panic("error updating watch, removing");
} else {
int h = audit_hash_ino((u32)ino);
/*
* nentry->rule.watch == oentry->rule.watch so
* we must drop that reference and set it to our
* new watch.
*/
audit_put_watch(nentry->rule.watch);
audit_get_watch(nwatch);
nentry->rule.watch = nwatch;
list_add(&nentry->rule.rlist, &nwatch->rules);
list_add_rcu(&nentry->list, &audit_inode_hash[h]);
list_replace(&oentry->rule.list,
&nentry->rule.list);
}
if (oentry->rule.exe)
audit_remove_mark(oentry->rule.exe);
call_rcu(&oentry->rcu, audit_free_rule_rcu);
}
audit_remove_watch(owatch);
goto add_watch_to_parent; /* event applies to a single watch */
}
mutex_unlock(&audit_filter_mutex);
return;
add_watch_to_parent:
list_add(&nwatch->wlist, &parent->watches);
mutex_unlock(&audit_filter_mutex);
return;
}
/* Remove all watches & rules associated with a parent that is going away. */
static void audit_remove_parent_watches(struct audit_parent *parent)
{
struct audit_watch *w, *nextw;
struct audit_krule *r, *nextr;
struct audit_entry *e;
mutex_lock(&audit_filter_mutex);
list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
e = container_of(r, struct audit_entry, rule);
audit_watch_log_rule_change(r, w, "remove_rule");
if (e->rule.exe)
audit_remove_mark(e->rule.exe);
list_del(&r->rlist);
list_del(&r->list);
list_del_rcu(&e->list);
call_rcu(&e->rcu, audit_free_rule_rcu);
}
audit_remove_watch(w);
}
mutex_unlock(&audit_filter_mutex);
fsnotify_destroy_mark(&parent->mark, audit_watch_group);
}
/* Get path information necessary for adding watches. */
static int audit_get_nd(struct audit_watch *watch, struct path *parent)
{
struct dentry *d = kern_path_locked(watch->path, parent);
if (IS_ERR(d))
return PTR_ERR(d);
if (d_is_positive(d)) {
/* update watch filter fields */
watch->dev = d->d_sb->s_dev;
watch->ino = d_backing_inode(d)->i_ino;
}
inode_unlock(d_backing_inode(parent->dentry));
dput(d);
return 0;
}
/* Associate the given rule with an existing parent.
* Caller must hold audit_filter_mutex. */
static void audit_add_to_parent(struct audit_krule *krule,
struct audit_parent *parent)
{
struct audit_watch *w, *watch = krule->watch;
int watch_found = 0;
BUG_ON(!mutex_is_locked(&audit_filter_mutex));
list_for_each_entry(w, &parent->watches, wlist) {
if (strcmp(watch->path, w->path))
continue;
watch_found = 1;
/* put krule's ref to temporary watch */
audit_put_watch(watch);
audit_get_watch(w);
krule->watch = watch = w;
audit_put_parent(parent);
break;
}
if (!watch_found) {
watch->parent = parent;
audit_get_watch(watch);
list_add(&watch->wlist, &parent->watches);
}
list_add(&krule->rlist, &watch->rules);
}
/* Find a matching watch entry, or add this one.
* Caller must hold audit_filter_mutex. */
int audit_add_watch(struct audit_krule *krule, struct list_head **list)
{
struct audit_watch *watch = krule->watch;
struct audit_parent *parent;
struct path parent_path;
int h, ret = 0;
/*
* When we will be calling audit_add_to_parent, krule->watch might have
* been updated and watch might have been freed.
* So we need to keep a reference of watch.
*/
audit_get_watch(watch);
mutex_unlock(&audit_filter_mutex);
/* Avoid calling path_lookup under audit_filter_mutex. */
ret = audit_get_nd(watch, &parent_path);
/* caller expects mutex locked */
mutex_lock(&audit_filter_mutex);
if (ret) {
audit_put_watch(watch);
return ret;
}
/* either find an old parent or attach a new one */
parent = audit_find_parent(d_backing_inode(parent_path.dentry));
if (!parent) {
parent = audit_init_parent(&parent_path);
if (IS_ERR(parent)) {
ret = PTR_ERR(parent);
goto error;
}
}
audit_add_to_parent(krule, parent);
h = audit_hash_ino((u32)watch->ino);
*list = &audit_inode_hash[h];
error:
path_put(&parent_path);
audit_put_watch(watch);
return ret;
}
void audit_remove_watch_rule(struct audit_krule *krule)
{
struct audit_watch *watch = krule->watch;
struct audit_parent *parent = watch->parent;
list_del(&krule->rlist);
if (list_empty(&watch->rules)) {
/*
* audit_remove_watch() drops our reference to 'parent' which
* can get freed. Grab our own reference to be safe.
*/
audit_get_parent(parent);
audit_remove_watch(watch);
if (list_empty(&parent->watches))
fsnotify_destroy_mark(&parent->mark, audit_watch_group);
audit_put_parent(parent);
}
}
/* Update watch data in audit rules based on fsnotify events. */
static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
struct inode *inode, struct inode *dir,
const struct qstr *dname, u32 cookie)
{
struct audit_parent *parent;
parent = container_of(inode_mark, struct audit_parent, mark);
if (WARN_ON_ONCE(inode_mark->group != audit_watch_group))
return 0;
if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
else if (mask & (FS_DELETE|FS_MOVED_FROM))
audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1);
else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
audit_remove_parent_watches(parent);
return 0;
}
static const struct fsnotify_ops audit_watch_fsnotify_ops = {
.handle_inode_event = audit_watch_handle_event,
.free_mark = audit_watch_free_mark,
};
static int __init audit_watch_init(void)
{
audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
if (IS_ERR(audit_watch_group)) {
audit_watch_group = NULL;
audit_panic("cannot create audit fsnotify group");
}
return 0;
}
device_initcall(audit_watch_init);
int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old)
{
struct audit_fsnotify_mark *audit_mark;
char *pathname;
pathname = kstrdup(audit_mark_path(old->exe), GFP_KERNEL);
if (!pathname)
return -ENOMEM;
audit_mark = audit_alloc_mark(new, pathname, strlen(pathname));
if (IS_ERR(audit_mark)) {
kfree(pathname);
return PTR_ERR(audit_mark);
}
new->exe = audit_mark;
return 0;
}
int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
{
struct file *exe_file;
unsigned long ino;
dev_t dev;
exe_file = get_task_exe_file(tsk);
if (!exe_file)
return 0;
ino = file_inode(exe_file)->i_ino;
dev = file_inode(exe_file)->i_sb->s_dev;
fput(exe_file);
return audit_mark_compare(mark, ino, dev);
}
// SPDX-License-Identifier: GPL-2.0-or-later
/* auditfilter.c -- filtering of audit events
*
* Copyright 2003-2004 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
#include <linux/audit.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/netlink.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include "audit.h"
/*
* Locking model:
*
* audit_filter_mutex:
* Synchronizes writes and blocking reads of audit's filterlist
* data. Rcu is used to traverse the filterlist and access
* contents of structs audit_entry, audit_watch and opaque
* LSM rules during filtering. If modified, these structures
* must be copied and replace their counterparts in the filterlist.
* An audit_parent struct is not accessed during filtering, so may
* be written directly provided audit_filter_mutex is held.
*/
/* Audit filter lists, defined in <linux/audit.h> */
struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_filter_list[0]),
LIST_HEAD_INIT(audit_filter_list[1]),
LIST_HEAD_INIT(audit_filter_list[2]),
LIST_HEAD_INIT(audit_filter_list[3]),
LIST_HEAD_INIT(audit_filter_list[4]),
LIST_HEAD_INIT(audit_filter_list[5]),
LIST_HEAD_INIT(audit_filter_list[6]),
LIST_HEAD_INIT(audit_filter_list[7]),
#if AUDIT_NR_FILTERS != 8
#error Fix audit_filter_list initialiser
#endif
};
static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_rules_list[0]),
LIST_HEAD_INIT(audit_rules_list[1]),
LIST_HEAD_INIT(audit_rules_list[2]),
LIST_HEAD_INIT(audit_rules_list[3]),
LIST_HEAD_INIT(audit_rules_list[4]),
LIST_HEAD_INIT(audit_rules_list[5]),
LIST_HEAD_INIT(audit_rules_list[6]),
LIST_HEAD_INIT(audit_rules_list[7]),
};
DEFINE_MUTEX(audit_filter_mutex);
static void audit_free_lsm_field(struct audit_field *f)
{
switch (f->type) {
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
case AUDIT_OBJ_USER:
case AUDIT_OBJ_ROLE:
case AUDIT_OBJ_TYPE:
case AUDIT_OBJ_LEV_LOW:
case AUDIT_OBJ_LEV_HIGH:
kfree(f->lsm_str);
security_audit_rule_free(f->lsm_rule);
}
}
static inline void audit_free_rule(struct audit_entry *e)
{
int i;
struct audit_krule *erule = &e->rule;
/* some rules don't have associated watches */
if (erule->watch)
audit_put_watch(erule->watch);
if (erule->fields)
for (i = 0; i < erule->field_count; i++)
audit_free_lsm_field(&erule->fields[i]);
kfree(erule->fields);
kfree(erule->filterkey);
kfree(e);
}
void audit_free_rule_rcu(struct rcu_head *head)
{
struct audit_entry *e = container_of(head, struct audit_entry, rcu);
audit_free_rule(e);
}
/* Initialize an audit filterlist entry. */
static inline struct audit_entry *audit_init_entry(u32 field_count)
{
struct audit_entry *entry;
struct audit_field *fields;
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (unlikely(!entry))
return NULL;
fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL);
if (unlikely(!fields)) {
kfree(entry);
return NULL;
}
entry->rule.fields = fields;
return entry;
}
/* Unpack a filter field's string representation from user-space
* buffer. */
char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
{
char *str;
if (!*bufp || (len == 0) || (len > *remain))
return ERR_PTR(-EINVAL);
/* Of the currently implemented string fields, PATH_MAX
* defines the longest valid length.
*/
if (len > PATH_MAX)
return ERR_PTR(-ENAMETOOLONG);
str = kmalloc(len + 1, GFP_KERNEL);
if (unlikely(!str))
return ERR_PTR(-ENOMEM);
memcpy(str, *bufp, len);
str[len] = 0;
*bufp += len;
*remain -= len;
return str;
}
/* Translate an inode field to kernel representation. */
static inline int audit_to_inode(struct audit_krule *krule,
struct audit_field *f)
{
if ((krule->listnr != AUDIT_FILTER_EXIT &&
krule->listnr != AUDIT_FILTER_URING_EXIT) ||
krule->inode_f || krule->watch || krule->tree ||
(f->op != Audit_equal && f->op != Audit_not_equal))
return -EINVAL;
krule->inode_f = f;
return 0;
}
static __u32 *classes[AUDIT_SYSCALL_CLASSES];
int __init audit_register_class(int class, unsigned *list)
{
__u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL);
if (!p)
return -ENOMEM;
while (*list != ~0U) {
unsigned n = *list++;
if (n >= AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES) {
kfree(p);
return -EINVAL;
}
p[AUDIT_WORD(n)] |= AUDIT_BIT(n);
}
if (class >= AUDIT_SYSCALL_CLASSES || classes[class]) {
kfree(p);
return -EINVAL;
}
classes[class] = p;
return 0;
}
int audit_match_class(int class, unsigned syscall)
{
if (unlikely(syscall >= AUDIT_BITMASK_SIZE * 32))
return 0;
if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class]))
return 0;
return classes[class][AUDIT_WORD(syscall)] & AUDIT_BIT(syscall);
}
#ifdef CONFIG_AUDITSYSCALL
static inline int audit_match_class_bits(int class, u32 *mask)
{
int i;
if (classes[class]) {
for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
if (mask[i] & classes[class][i])
return 0;
}
return 1;
}
static int audit_match_signal(struct audit_entry *entry)
{
struct audit_field *arch = entry->rule.arch_f;
if (!arch) {
/* When arch is unspecified, we must check both masks on biarch
* as syscall number alone is ambiguous. */
return (audit_match_class_bits(AUDIT_CLASS_SIGNAL,
entry->rule.mask) &&
audit_match_class_bits(AUDIT_CLASS_SIGNAL_32,
entry->rule.mask));
}
switch(audit_classify_arch(arch->val)) {
case 0: /* native */
return (audit_match_class_bits(AUDIT_CLASS_SIGNAL,
entry->rule.mask));
case 1: /* 32bit on biarch */
return (audit_match_class_bits(AUDIT_CLASS_SIGNAL_32,
entry->rule.mask));
default:
return 1;
}
}
#endif
/* Common user-space to kernel rule translation. */
static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *rule)
{
unsigned listnr;
struct audit_entry *entry;
int i, err;
err = -EINVAL;
listnr = rule->flags & ~AUDIT_FILTER_PREPEND;
switch(listnr) {
default:
goto exit_err;
#ifdef CONFIG_AUDITSYSCALL
case AUDIT_FILTER_ENTRY:
pr_err("AUDIT_FILTER_ENTRY is deprecated\n");
goto exit_err;
case AUDIT_FILTER_EXIT:
case AUDIT_FILTER_URING_EXIT:
case AUDIT_FILTER_TASK:
#endif
case AUDIT_FILTER_USER:
case AUDIT_FILTER_EXCLUDE:
case AUDIT_FILTER_FS:
;
}
if (unlikely(rule->action == AUDIT_POSSIBLE)) {
pr_err("AUDIT_POSSIBLE is deprecated\n");
goto exit_err;
}
if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS)
goto exit_err;
if (rule->field_count > AUDIT_MAX_FIELDS)
goto exit_err;
err = -ENOMEM;
entry = audit_init_entry(rule->field_count);
if (!entry)
goto exit_err;
entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND;
entry->rule.listnr = listnr;
entry->rule.action = rule->action;
entry->rule.field_count = rule->field_count;
for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
entry->rule.mask[i] = rule->mask[i];
for (i = 0; i < AUDIT_SYSCALL_CLASSES; i++) {
int bit = AUDIT_BITMASK_SIZE * 32 - i - 1;
__u32 *p = &entry->rule.mask[AUDIT_WORD(bit)];
__u32 *class;
if (!(*p & AUDIT_BIT(bit)))
continue;
*p &= ~AUDIT_BIT(bit);
class = classes[i];
if (class) {
int j;
for (j = 0; j < AUDIT_BITMASK_SIZE; j++)
entry->rule.mask[j] |= class[j];
}
}
return entry;
exit_err:
return ERR_PTR(err);
}
static u32 audit_ops[] =
{
[Audit_equal] = AUDIT_EQUAL,
[Audit_not_equal] = AUDIT_NOT_EQUAL,
[Audit_bitmask] = AUDIT_BIT_MASK,
[Audit_bittest] = AUDIT_BIT_TEST,
[Audit_lt] = AUDIT_LESS_THAN,
[Audit_gt] = AUDIT_GREATER_THAN,
[Audit_le] = AUDIT_LESS_THAN_OR_EQUAL,
[Audit_ge] = AUDIT_GREATER_THAN_OR_EQUAL,
};
static u32 audit_to_op(u32 op)
{
u32 n;
for (n = Audit_equal; n < Audit_bad && audit_ops[n] != op; n++)
;
return n;
}
/* check if an audit field is valid */
static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
{
switch (f->type) {
case AUDIT_MSGTYPE:
if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE &&
entry->rule.listnr != AUDIT_FILTER_USER)
return -EINVAL;
break;
case AUDIT_FSTYPE:
if (entry->rule.listnr != AUDIT_FILTER_FS)
return -EINVAL;
break;
case AUDIT_PERM:
if (entry->rule.listnr == AUDIT_FILTER_URING_EXIT)
return -EINVAL;
break;
}
switch (entry->rule.listnr) {
case AUDIT_FILTER_FS:
switch(f->type) {
case AUDIT_FSTYPE:
case AUDIT_FILTERKEY:
break;
default:
return -EINVAL;
}
}
/* Check for valid field type and op */
switch (f->type) {
case AUDIT_ARG0:
case AUDIT_ARG1:
case AUDIT_ARG2:
case AUDIT_ARG3:
case AUDIT_PERS: /* <uapi/linux/personality.h> */
case AUDIT_DEVMINOR:
/* all ops are valid */
break;
case AUDIT_UID:
case AUDIT_EUID:
case AUDIT_SUID:
case AUDIT_FSUID:
case AUDIT_LOGINUID:
case AUDIT_OBJ_UID:
case AUDIT_GID:
case AUDIT_EGID:
case AUDIT_SGID:
case AUDIT_FSGID:
case AUDIT_OBJ_GID:
case AUDIT_PID:
case AUDIT_MSGTYPE:
case AUDIT_PPID:
case AUDIT_DEVMAJOR:
case AUDIT_EXIT:
case AUDIT_SUCCESS:
case AUDIT_INODE:
case AUDIT_SESSIONID:
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
case AUDIT_OBJ_LEV_LOW:
case AUDIT_OBJ_LEV_HIGH:
case AUDIT_SADDR_FAM:
/* bit ops are only useful on syscall args */
if (f->op == Audit_bitmask || f->op == Audit_bittest)
return -EINVAL;
break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
case AUDIT_OBJ_USER:
case AUDIT_OBJ_ROLE:
case AUDIT_OBJ_TYPE:
case AUDIT_WATCH:
case AUDIT_DIR:
case AUDIT_FILTERKEY:
case AUDIT_LOGINUID_SET:
case AUDIT_ARCH:
case AUDIT_FSTYPE:
case AUDIT_PERM:
case AUDIT_FILETYPE:
case AUDIT_FIELD_COMPARE:
case AUDIT_EXE:
/* only equal and not equal valid ops */
if (f->op != Audit_not_equal && f->op != Audit_equal)
return -EINVAL;
break;
default:
/* field not recognized */
return -EINVAL;
}
/* Check for select valid field values */
switch (f->type) {
case AUDIT_LOGINUID_SET:
if ((f->val != 0) && (f->val != 1))
return -EINVAL;
break;
case AUDIT_PERM:
if (f->val & ~15)
return -EINVAL;
break;
case AUDIT_FILETYPE:
if (f->val & ~S_IFMT)
return -EINVAL;
break;
case AUDIT_FIELD_COMPARE:
if (f->val > AUDIT_MAX_FIELD_COMPARE)
return -EINVAL;
break;
case AUDIT_SADDR_FAM:
if (f->val >= AF_MAX)
return -EINVAL;
break;
default:
break;
}
return 0;
}
/* Translate struct audit_rule_data to kernel's rule representation. */
static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
size_t datasz)
{
int err = 0;
struct audit_entry *entry;
void *bufp;
size_t remain = datasz - sizeof(struct audit_rule_data);
int i;
char *str;
struct audit_fsnotify_mark *audit_mark;
entry = audit_to_entry_common(data);
if (IS_ERR(entry))
goto exit_nofree;
bufp = data->buf;
for (i = 0; i < data->field_count; i++) {
struct audit_field *f = &entry->rule.fields[i];
u32 f_val;
err = -EINVAL;
f->op = audit_to_op(data->fieldflags[i]);
if (f->op == Audit_bad)
goto exit_free;
f->type = data->fields[i];
f_val = data->values[i];
/* Support legacy tests for a valid loginuid */
if ((f->type == AUDIT_LOGINUID) && (f_val == AUDIT_UID_UNSET)) {
f->type = AUDIT_LOGINUID_SET;
f_val = 0;
entry->rule.pflags |= AUDIT_LOGINUID_LEGACY;
}
err = audit_field_valid(entry, f);
if (err)
goto exit_free;
err = -EINVAL;
switch (f->type) {
case AUDIT_LOGINUID:
case AUDIT_UID:
case AUDIT_EUID:
case AUDIT_SUID:
case AUDIT_FSUID:
case AUDIT_OBJ_UID:
f->uid = make_kuid(current_user_ns(), f_val);
if (!uid_valid(f->uid))
goto exit_free;
break;
case AUDIT_GID:
case AUDIT_EGID:
case AUDIT_SGID:
case AUDIT_FSGID:
case AUDIT_OBJ_GID:
f->gid = make_kgid(current_user_ns(), f_val);
if (!gid_valid(f->gid))
goto exit_free;
break;
case AUDIT_ARCH:
f->val = f_val;
entry->rule.arch_f = f;
break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
case AUDIT_OBJ_USER:
case AUDIT_OBJ_ROLE:
case AUDIT_OBJ_TYPE:
case AUDIT_OBJ_LEV_LOW:
case AUDIT_OBJ_LEV_HIGH:
str = audit_unpack_string(&bufp, &remain, f_val);
if (IS_ERR(str)) {
err = PTR_ERR(str);
goto exit_free;
}
entry->rule.buflen += f_val;
f->lsm_str = str;
err = security_audit_rule_init(f->type, f->op, str,
(void **)&f->lsm_rule);
/* Keep currently invalid fields around in case they
* become valid after a policy reload. */
if (err == -EINVAL) {
pr_warn("audit rule for LSM \'%s\' is invalid\n",
str);
err = 0;
} else if (err)
goto exit_free;
break;
case AUDIT_WATCH:
str = audit_unpack_string(&bufp, &remain, f_val);
if (IS_ERR(str)) {
err = PTR_ERR(str);
goto exit_free;
}
err = audit_to_watch(&entry->rule, str, f_val, f->op);
if (err) {
kfree(str);
goto exit_free;
}
entry->rule.buflen += f_val;
break;
case AUDIT_DIR:
str = audit_unpack_string(&bufp, &remain, f_val);
if (IS_ERR(str)) {
err = PTR_ERR(str);
goto exit_free;
}
err = audit_make_tree(&entry->rule, str, f->op);
kfree(str);
if (err)
goto exit_free;
entry->rule.buflen += f_val;
break;
case AUDIT_INODE:
f->val = f_val;
err = audit_to_inode(&entry->rule, f);
if (err)
goto exit_free;
break;
case AUDIT_FILTERKEY:
if (entry->rule.filterkey || f_val > AUDIT_MAX_KEY_LEN)
goto exit_free;
str = audit_unpack_string(&bufp, &remain, f_val);
if (IS_ERR(str)) {
err = PTR_ERR(str);
goto exit_free;
}
entry->rule.buflen += f_val;
entry->rule.filterkey = str;
break;
case AUDIT_EXE:
if (entry->rule.exe || f_val > PATH_MAX)
goto exit_free;
str = audit_unpack_string(&bufp, &remain, f_val);
if (IS_ERR(str)) {
err = PTR_ERR(str);
goto exit_free;
}
audit_mark = audit_alloc_mark(&entry->rule, str, f_val);
if (IS_ERR(audit_mark)) {
kfree(str);
err = PTR_ERR(audit_mark);
goto exit_free;
}
entry->rule.buflen += f_val;
entry->rule.exe = audit_mark;
break;
default:
f->val = f_val;
break;
}
}
if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
entry->rule.inode_f = NULL;
exit_nofree:
return entry;
exit_free:
if (entry->rule.tree)
audit_put_tree(entry->rule.tree); /* that's the temporary one */
if (entry->rule.exe)
audit_remove_mark(entry->rule.exe); /* that's the template one */
audit_free_rule(entry);
return ERR_PTR(err);
}
/* Pack a filter field's string representation into data block. */
static inline size_t audit_pack_string(void **bufp, const char *str)
{
size_t len = strlen(str);
memcpy(*bufp, str, len);
*bufp += len;
return len;
}
/* Translate kernel rule representation to struct audit_rule_data. */
static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
{
struct audit_rule_data *data;
void *bufp;
int i;
data = kmalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL);
if (unlikely(!data))
return NULL;
memset(data, 0, sizeof(*data));
data->flags = krule->flags | krule->listnr;
data->action = krule->action;
data->field_count = krule->field_count;
bufp = data->buf;
for (i = 0; i < data->field_count; i++) {
struct audit_field *f = &krule->fields[i];
data->fields[i] = f->type;
data->fieldflags[i] = audit_ops[f->op];
switch(f->type) {
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
case AUDIT_OBJ_USER:
case AUDIT_OBJ_ROLE:
case AUDIT_OBJ_TYPE:
case AUDIT_OBJ_LEV_LOW:
case AUDIT_OBJ_LEV_HIGH:
data->buflen += data->values[i] =
audit_pack_string(&bufp, f->lsm_str);
break;
case AUDIT_WATCH:
data->buflen += data->values[i] =
audit_pack_string(&bufp,
audit_watch_path(krule->watch));
break;
case AUDIT_DIR:
data->buflen += data->values[i] =
audit_pack_string(&bufp,
audit_tree_path(krule->tree));
break;
case AUDIT_FILTERKEY:
data->buflen += data->values[i] =
audit_pack_string(&bufp, krule->filterkey);
break;
case AUDIT_EXE:
data->buflen += data->values[i] =
audit_pack_string(&bufp, audit_mark_path(krule->exe));
break;
case AUDIT_LOGINUID_SET:
if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) {
data->fields[i] = AUDIT_LOGINUID;
data->values[i] = AUDIT_UID_UNSET;
break;
}
fallthrough; /* if set */
default:
data->values[i] = f->val;
}
}
for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i];
return data;
}
/* Compare two rules in kernel format. Considered success if rules
* don't match. */
static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
{
int i;
if (a->flags != b->flags ||
a->pflags != b->pflags ||
a->listnr != b->listnr ||
a->action != b->action ||
a->field_count != b->field_count)
return 1;
for (i = 0; i < a->field_count; i++) {
if (a->fields[i].type != b->fields[i].type ||
a->fields[i].op != b->fields[i].op)
return 1;
switch(a->fields[i].type) {
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
case AUDIT_OBJ_USER:
case AUDIT_OBJ_ROLE:
case AUDIT_OBJ_TYPE:
case AUDIT_OBJ_LEV_LOW:
case AUDIT_OBJ_LEV_HIGH:
if (strcmp(a->fields[i].lsm_str, b->fields[i].lsm_str))
return 1;
break;
case AUDIT_WATCH:
if (strcmp(audit_watch_path(a->watch),
audit_watch_path(b->watch)))
return 1;
break;
case AUDIT_DIR:
if (strcmp(audit_tree_path(a->tree),
audit_tree_path(b->tree)))
return 1;
break;
case AUDIT_FILTERKEY:
/* both filterkeys exist based on above type compare */
if (strcmp(a->filterkey, b->filterkey))
return 1;
break;
case AUDIT_EXE:
/* both paths exist based on above type compare */
if (strcmp(audit_mark_path(a->exe),
audit_mark_path(b->exe)))
return 1;
break;
case AUDIT_UID:
case AUDIT_EUID:
case AUDIT_SUID:
case AUDIT_FSUID:
case AUDIT_LOGINUID:
case AUDIT_OBJ_UID:
if (!uid_eq(a->fields[i].uid, b->fields[i].uid))
return 1;
break;
case AUDIT_GID:
case AUDIT_EGID:
case AUDIT_SGID:
case AUDIT_FSGID:
case AUDIT_OBJ_GID:
if (!gid_eq(a->fields[i].gid, b->fields[i].gid))
return 1;
break;
default:
if (a->fields[i].val != b->fields[i].val)
return 1;
}
}
for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
if (a->mask[i] != b->mask[i])
return 1;
return 0;
}
/* Duplicate LSM field information. The lsm_rule is opaque, so must be
* re-initialized. */
static inline int audit_dupe_lsm_field(struct audit_field *df,
struct audit_field *sf)
{
int ret = 0;
char *lsm_str;
/* our own copy of lsm_str */
lsm_str = kstrdup(sf->lsm_str, GFP_KERNEL);
if (unlikely(!lsm_str))
return -ENOMEM;
df->lsm_str = lsm_str;
/* our own (refreshed) copy of lsm_rule */
ret = security_audit_rule_init(df->type, df->op, df->lsm_str,
(void **)&df->lsm_rule);
/* Keep currently invalid fields around in case they
* become valid after a policy reload. */
if (ret == -EINVAL) {
pr_warn("audit rule for LSM \'%s\' is invalid\n",
df->lsm_str);
ret = 0;
}
return ret;
}
/* Duplicate an audit rule. This will be a deep copy with the exception
* of the watch - that pointer is carried over. The LSM specific fields
* will be updated in the copy. The point is to be able to replace the old
* rule with the new rule in the filterlist, then free the old rule.
* The rlist element is undefined; list manipulations are handled apart from
* the initial copy. */
struct audit_entry *audit_dupe_rule(struct audit_krule *old)
{
u32 fcount = old->field_count;
struct audit_entry *entry;
struct audit_krule *new;
char *fk;
int i, err = 0;
entry = audit_init_entry(fcount);
if (unlikely(!entry))
return ERR_PTR(-ENOMEM);
new = &entry->rule;
new->flags = old->flags;
new->pflags = old->pflags;
new->listnr = old->listnr;
new->action = old->action;
for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
new->mask[i] = old->mask[i];
new->prio = old->prio;
new->buflen = old->buflen;
new->inode_f = old->inode_f;
new->field_count = old->field_count;
/*
* note that we are OK with not refcounting here; audit_match_tree()
* never dereferences tree and we can't get false positives there
* since we'd have to have rule gone from the list *and* removed
* before the chunks found by lookup had been allocated, i.e. before
* the beginning of list scan.
*/
new->tree = old->tree;
memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount);
/* deep copy this information, updating the lsm_rule fields, because
* the originals will all be freed when the old rule is freed. */
for (i = 0; i < fcount; i++) {
switch (new->fields[i].type) {
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
case AUDIT_OBJ_USER:
case AUDIT_OBJ_ROLE:
case AUDIT_OBJ_TYPE:
case AUDIT_OBJ_LEV_LOW:
case AUDIT_OBJ_LEV_HIGH:
err = audit_dupe_lsm_field(&new->fields[i],
&old->fields[i]);
break;
case AUDIT_FILTERKEY:
fk = kstrdup(old->filterkey, GFP_KERNEL);
if (unlikely(!fk))
err = -ENOMEM;
else
new->filterkey = fk;
break;
case AUDIT_EXE:
err = audit_dupe_exe(new, old);
break;
}
if (err) {
if (new->exe)
audit_remove_mark(new->exe);
audit_free_rule(entry);
return ERR_PTR(err);
}
}
if (old->watch) {
audit_get_watch(old->watch);
new->watch = old->watch;
}
return entry;
}
/* Find an existing audit rule.
* Caller must hold audit_filter_mutex to prevent stale rule data. */
static struct audit_entry *audit_find_rule(struct audit_entry *entry,
struct list_head **p)
{
struct audit_entry *e, *found = NULL;
struct list_head *list;
int h;
if (entry->rule.inode_f) {
h = audit_hash_ino(entry->rule.inode_f->val);
*p = list = &audit_inode_hash[h];
} else if (entry->rule.watch) {
/* we don't know the inode number, so must walk entire hash */
for (h = 0; h < AUDIT_INODE_BUCKETS; h++) {
list = &audit_inode_hash[h];
list_for_each_entry(e, list, list)
if (!audit_compare_rule(&entry->rule, &e->rule)) {
found = e;
goto out;
}
}
goto out;
} else {
*p = list = &audit_filter_list[entry->rule.listnr];
}
list_for_each_entry(e, list, list)
if (!audit_compare_rule(&entry->rule, &e->rule)) {
found = e;
goto out;
}
out:
return found;
}
static u64 prio_low = ~0ULL/2;
static u64 prio_high = ~0ULL/2 - 1;
/* Add rule to given filterlist if not a duplicate. */
static inline int audit_add_rule(struct audit_entry *entry)
{
struct audit_entry *e;
struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
struct list_head *list;
int err = 0;
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
/* If any of these, don't count towards total */
switch(entry->rule.listnr) {
case AUDIT_FILTER_USER:
case AUDIT_FILTER_EXCLUDE:
case AUDIT_FILTER_FS:
dont_count = 1;
}
#endif
mutex_lock(&audit_filter_mutex);
e = audit_find_rule(entry, &list);
if (e) {
mutex_unlock(&audit_filter_mutex);
err = -EEXIST;
/* normally audit_add_tree_rule() will free it on failure */
if (tree)
audit_put_tree(tree);
return err;
}
if (watch) {
/* audit_filter_mutex is dropped and re-taken during this call */
err = audit_add_watch(&entry->rule, &list);
if (err) {
mutex_unlock(&audit_filter_mutex);
/*
* normally audit_add_tree_rule() will free it
* on failure
*/
if (tree)
audit_put_tree(tree);
return err;
}
}
if (tree) {
err = audit_add_tree_rule(&entry->rule);
if (err) {
mutex_unlock(&audit_filter_mutex);
return err;
}
}
entry->rule.prio = ~0ULL;
if (entry->rule.listnr == AUDIT_FILTER_EXIT ||
entry->rule.listnr == AUDIT_FILTER_URING_EXIT) {
if (entry->rule.flags & AUDIT_FILTER_PREPEND)
entry->rule.prio = ++prio_high;
else
entry->rule.prio = --prio_low;
}
if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
list_add(&entry->rule.list,
&audit_rules_list[entry->rule.listnr]);
list_add_rcu(&entry->list, list);
entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
} else {
list_add_tail(&entry->rule.list,
&audit_rules_list[entry->rule.listnr]);
list_add_tail_rcu(&entry->list, list);
}
#ifdef CONFIG_AUDITSYSCALL
if (!dont_count)
audit_n_rules++;
if (!audit_match_signal(entry))
audit_signals++;
#endif
mutex_unlock(&audit_filter_mutex);
return err;
}
/* Remove an existing rule from filterlist. */
int audit_del_rule(struct audit_entry *entry)
{
struct audit_entry *e;
struct audit_tree *tree = entry->rule.tree;
struct list_head *list;
int ret = 0;
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
/* If any of these, don't count towards total */
switch(entry->rule.listnr) {
case AUDIT_FILTER_USER:
case AUDIT_FILTER_EXCLUDE:
case AUDIT_FILTER_FS:
dont_count = 1;
}
#endif
mutex_lock(&audit_filter_mutex);
e = audit_find_rule(entry, &list);
if (!e) {
ret = -ENOENT;
goto out;
}
if (e->rule.watch)
audit_remove_watch_rule(&e->rule);
if (e->rule.tree)
audit_remove_tree_rule(&e->rule);
if (e->rule.exe)
audit_remove_mark_rule(&e->rule);
#ifdef CONFIG_AUDITSYSCALL
if (!dont_count)
audit_n_rules--;
if (!audit_match_signal(entry))
audit_signals--;
#endif
list_del_rcu(&e->list);
list_del(&e->rule.list);
call_rcu(&e->rcu, audit_free_rule_rcu);
out:
mutex_unlock(&audit_filter_mutex);
if (tree)
audit_put_tree(tree); /* that's the temporary one */
return ret;
}
/* List rules using struct audit_rule_data. */
static void audit_list_rules(int seq, struct sk_buff_head *q)
{
struct sk_buff *skb;
struct audit_krule *r;
int i;
/* This is a blocking read, so use audit_filter_mutex instead of rcu
* iterator to sync with list writers. */
for (i=0; i<AUDIT_NR_FILTERS; i++) {
list_for_each_entry(r, &audit_rules_list[i], list) {
struct audit_rule_data *data;
data = audit_krule_to_data(r);
if (unlikely(!data))
break;
skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1,
data,
struct_size(data, buf, data->buflen));
if (skb)
skb_queue_tail(q, skb);
kfree(data);
}
}
skb = audit_make_reply(seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
if (skb)
skb_queue_tail(q, skb);
}
/* Log rule additions and removals */
static void audit_log_rule_change(char *action, struct audit_krule *rule, int res)
{
struct audit_buffer *ab;
if (!audit_enabled)
return;
ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (!ab)
return;
audit_log_session_info(ab);
audit_log_task_context(ab);
audit_log_format(ab, " op=%s", action);
audit_log_key(ab, rule->filterkey);
audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
audit_log_end(ab);
}
/**
* audit_rule_change - apply all rules to the specified message type
* @type: audit message type
* @seq: netlink audit message sequence (serial) number
* @data: payload data
* @datasz: size of payload data
*/
int audit_rule_change(int type, int seq, void *data, size_t datasz)
{
int err = 0;
struct audit_entry *entry;
switch (type) {
case AUDIT_ADD_RULE:
entry = audit_data_to_entry(data, datasz);
if (IS_ERR(entry))
return PTR_ERR(entry);
err = audit_add_rule(entry);
audit_log_rule_change("add_rule", &entry->rule, !err);
break;
case AUDIT_DEL_RULE:
entry = audit_data_to_entry(data, datasz);
if (IS_ERR(entry))
return PTR_ERR(entry);
err = audit_del_rule(entry);
audit_log_rule_change("remove_rule", &entry->rule, !err);
break;
default:
WARN_ON(1);
return -EINVAL;
}
if (err || type == AUDIT_DEL_RULE) {
if (entry->rule.exe)
audit_remove_mark(entry->rule.exe);
audit_free_rule(entry);
}
return err;
}
/**
* audit_list_rules_send - list the audit rules
* @request_skb: skb of request we are replying to (used to target the reply)
* @seq: netlink audit message sequence (serial) number
*/
int audit_list_rules_send(struct sk_buff *request_skb, int seq)
{
struct task_struct *tsk;
struct audit_netlink_list *dest;
/* We can't just spew out the rules here because we might fill
* the available socket buffer space and deadlock waiting for
* auditctl to read from it... which isn't ever going to
* happen if we're actually running in the context of auditctl
* trying to _send_ the stuff */
dest = kmalloc(sizeof(*dest), GFP_KERNEL);
if (!dest)
return -ENOMEM;
dest->net = get_net(sock_net(NETLINK_CB(request_skb).sk));
dest->portid = NETLINK_CB(request_skb).portid;
skb_queue_head_init(&dest->q);
mutex_lock(&audit_filter_mutex);
audit_list_rules(seq, &dest->q);
mutex_unlock(&audit_filter_mutex);
tsk = kthread_run(audit_send_list_thread, dest, "audit_send_list");
if (IS_ERR(tsk)) {
skb_queue_purge(&dest->q);
put_net(dest->net);
kfree(dest);
return PTR_ERR(tsk);
}
return 0;
}
int audit_comparator(u32 left, u32 op, u32 right)
{
switch (op) {
case Audit_equal:
return (left == right);
case Audit_not_equal:
return (left != right);
case Audit_lt:
return (left < right);
case Audit_le:
return (left <= right);
case Audit_gt:
return (left > right);
case Audit_ge:
return (left >= right);
case Audit_bitmask:
return (left & right);
case Audit_bittest:
return ((left & right) == right);
default:
return 0;
}
}
int audit_uid_comparator(kuid_t left, u32 op, kuid_t right)
{
switch (op) {
case Audit_equal:
return uid_eq(left, right);
case Audit_not_equal:
return !uid_eq(left, right);
case Audit_lt:
return uid_lt(left, right);
case Audit_le:
return uid_lte(left, right);
case Audit_gt:
return uid_gt(left, right);
case Audit_ge:
return uid_gte(left, right);
case Audit_bitmask:
case Audit_bittest:
default:
return 0;
}
}
int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
{
switch (op) {
case Audit_equal:
return gid_eq(left, right);
case Audit_not_equal:
return !gid_eq(left, right);
case Audit_lt:
return gid_lt(left, right);
case Audit_le:
return gid_lte(left, right);
case Audit_gt:
return gid_gt(left, right);
case Audit_ge:
return gid_gte(left, right);
case Audit_bitmask:
case Audit_bittest:
default:
return 0;
}
}
/**
* parent_len - find the length of the parent portion of a pathname
* @path: pathname of which to determine length
*/
int parent_len(const char *path)
{
int plen;
const char *p;
plen = strlen(path);
if (plen == 0)
return plen;
/* disregard trailing slashes */
p = path + plen - 1;
while ((*p == '/') && (p > path))
p--;
/* walk backward until we find the next slash or hit beginning */
while ((*p != '/') && (p > path))
p--;
/* did we find a slash? Then increment to include it in path */
if (*p == '/')
p++;
return p - path;
}
/**
* audit_compare_dname_path - compare given dentry name with last component in
* given path. Return of 0 indicates a match.
* @dname: dentry name that we're comparing
* @path: full pathname that we're comparing
* @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL
* here indicates that we must compute this value.
*/
int audit_compare_dname_path(const struct qstr *dname, const char *path, int parentlen)
{
int dlen, pathlen;
const char *p;
dlen = dname->len;
pathlen = strlen(path);
if (pathlen < dlen)
return 1;
parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen;
if (pathlen - parentlen != dlen)
return 1;
p = path + parentlen;
return strncmp(p, dname->name, dlen);
}
int audit_filter(int msgtype, unsigned int listtype)
{
struct audit_entry *e;
int ret = 1; /* Audit by default */
rcu_read_lock();
list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) {
int i, result = 0;
for (i = 0; i < e->rule.field_count; i++) {
struct audit_field *f = &e->rule.fields[i];
pid_t pid;
u32 sid;
switch (f->type) {
case AUDIT_PID:
pid = task_pid_nr(current);
result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_UID:
result = audit_uid_comparator(current_uid(), f->op, f->uid);
break;
case AUDIT_GID:
result = audit_gid_comparator(current_gid(), f->op, f->gid);
break;
case AUDIT_LOGINUID:
result = audit_uid_comparator(audit_get_loginuid(current),
f->op, f->uid);
break;
case AUDIT_LOGINUID_SET:
result = audit_comparator(audit_loginuid_set(current),
f->op, f->val);
break;
case AUDIT_MSGTYPE:
result = audit_comparator(msgtype, f->op, f->val);
break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
if (f->lsm_rule) {
security_current_getsecid_subj(&sid);
result = security_audit_rule_match(sid,
f->type, f->op, f->lsm_rule);
}
break;
case AUDIT_EXE:
result = audit_exe_compare(current, e->rule.exe);
if (f->op == Audit_not_equal)
result = !result;
break;
default:
goto unlock_and_return;
}
if (result < 0) /* error */
goto unlock_and_return;
if (!result)
break;
}
if (result > 0) {
if (e->rule.action == AUDIT_NEVER || listtype == AUDIT_FILTER_EXCLUDE)
ret = 0;
break;
}
}
unlock_and_return:
rcu_read_unlock();
return ret;
}
static int update_lsm_rule(struct audit_krule *r)
{
struct audit_entry *entry = container_of(r, struct audit_entry, rule);
struct audit_entry *nentry;
int err = 0;
if (!security_audit_rule_known(r))
return 0;
nentry = audit_dupe_rule(r);
if (entry->rule.exe)
audit_remove_mark(entry->rule.exe);
if (IS_ERR(nentry)) {
/* save the first error encountered for the
* return value */
err = PTR_ERR(nentry);
audit_panic("error updating LSM filters");
if (r->watch)
list_del(&r->rlist);
list_del_rcu(&entry->list);
list_del(&r->list);
} else {
if (r->watch || r->tree)
list_replace_init(&r->rlist, &nentry->rule.rlist);
list_replace_rcu(&entry->list, &nentry->list);
list_replace(&r->list, &nentry->rule.list);
}
call_rcu(&entry->rcu, audit_free_rule_rcu);
return err;
}
/* This function will re-initialize the lsm_rule field of all applicable rules.
* It will traverse the filter lists serarching for rules that contain LSM
* specific filter fields. When such a rule is found, it is copied, the
* LSM field is re-initialized, and the old rule is replaced with the
* updated rule. */
int audit_update_lsm_rules(void)
{
struct audit_krule *r, *n;
int i, err = 0;
/* audit_filter_mutex synchronizes the writers */
mutex_lock(&audit_filter_mutex);
for (i = 0; i < AUDIT_NR_FILTERS; i++) {
list_for_each_entry_safe(r, n, &audit_rules_list[i], list) {
int res = update_lsm_rule(r);
if (!err)
err = res;
}
}
mutex_unlock(&audit_filter_mutex);
return err;
}
// SPDX-License-Identifier: GPL-2.0-or-later
/* auditsc.c -- System-call auditing support
* Handles all system-call specific auditing features.
*
* Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright (C) 2005, 2006 IBM Corporation
* All Rights Reserved.
*
* Written by Rickard E. (Rik) Faith <faith@redhat.com>
*
* Many of the ideas implemented here are from Stephen C. Tweedie,
* especially the idea of avoiding a copy by using getname.
*
* The method for actual interception of syscall entry and exit (not in
* this file -- see entry.S) is based on a GPL'd patch written by
* okir@suse.de and Copyright 2003 SuSE Linux AG.
*
* POSIX message queue support added by George Wilson <ltcgcw@us.ibm.com>,
* 2006.
*
* The support of additional filter rules compares (>, <, >=, <=) was
* added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005.
*
* Modified by Amy Griffis <amy.griffis@hp.com> to collect additional
* filesystem information.
*
* Subject and object context labeling support added by <danjones@us.ibm.com>
* and <dustin.kirkland@us.ibm.com> for LSPP certification compliance.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h>
#include <asm/types.h>
#include <linux/atomic.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/mount.h>
#include <linux/socket.h>
#include <linux/mqueue.h>
#include <linux/audit.h>
#include <linux/personality.h>
#include <linux/time.h>
#include <linux/netlink.h>
#include <linux/compiler.h>
#include <asm/unistd.h>
#include <linux/security.h>
#include <linux/list.h>
#include <linux/binfmts.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <asm/syscall.h>
#include <linux/capability.h>
#include <linux/fs_struct.h>
#include <linux/compat.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/fsnotify_backend.h>
#include <uapi/linux/limits.h>
#include <uapi/linux/netfilter/nf_tables.h>
#include <uapi/linux/openat2.h> // struct open_how
#include "audit.h"
/* flags stating the success for a syscall */
#define AUDITSC_INVALID 0
#define AUDITSC_SUCCESS 1
#define AUDITSC_FAILURE 2
/* no execve audit message should be longer than this (userspace limits),
* see the note near the top of audit_log_execve_info() about this value */
#define MAX_EXECVE_AUDIT_LEN 7500
/* max length to print of cmdline/proctitle value during audit */
#define MAX_PROCTITLE_AUDIT_LEN 128
/* number of audit rules */
int audit_n_rules;
/* determines whether we collect data for signals sent */
int audit_signals;
struct audit_aux_data {
struct audit_aux_data *next;
int type;
};
/* Number of target pids per aux struct. */
#define AUDIT_AUX_PIDS 16
struct audit_aux_data_pids {
struct audit_aux_data d;
pid_t target_pid[AUDIT_AUX_PIDS];
kuid_t target_auid[AUDIT_AUX_PIDS];
kuid_t target_uid[AUDIT_AUX_PIDS];
unsigned int target_sessionid[AUDIT_AUX_PIDS];
u32 target_sid[AUDIT_AUX_PIDS];
char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
int pid_count;
};
struct audit_aux_data_bprm_fcaps {
struct audit_aux_data d;
struct audit_cap_data fcap;
unsigned int fcap_ver;
struct audit_cap_data old_pcap;
struct audit_cap_data new_pcap;
};
struct audit_tree_refs {
struct audit_tree_refs *next;
struct audit_chunk *c[31];
};
struct audit_nfcfgop_tab {
enum audit_nfcfgop op;
const char *s;
};
static const struct audit_nfcfgop_tab audit_nfcfgs[] = {
{ AUDIT_XT_OP_REGISTER, "xt_register" },
{ AUDIT_XT_OP_REPLACE, "xt_replace" },
{ AUDIT_XT_OP_UNREGISTER, "xt_unregister" },
{ AUDIT_NFT_OP_TABLE_REGISTER, "nft_register_table" },
{ AUDIT_NFT_OP_TABLE_UNREGISTER, "nft_unregister_table" },
{ AUDIT_NFT_OP_CHAIN_REGISTER, "nft_register_chain" },
{ AUDIT_NFT_OP_CHAIN_UNREGISTER, "nft_unregister_chain" },
{ AUDIT_NFT_OP_RULE_REGISTER, "nft_register_rule" },
{ AUDIT_NFT_OP_RULE_UNREGISTER, "nft_unregister_rule" },
{ AUDIT_NFT_OP_SET_REGISTER, "nft_register_set" },
{ AUDIT_NFT_OP_SET_UNREGISTER, "nft_unregister_set" },
{ AUDIT_NFT_OP_SETELEM_REGISTER, "nft_register_setelem" },
{ AUDIT_NFT_OP_SETELEM_UNREGISTER, "nft_unregister_setelem" },
{ AUDIT_NFT_OP_GEN_REGISTER, "nft_register_gen" },
{ AUDIT_NFT_OP_OBJ_REGISTER, "nft_register_obj" },
{ AUDIT_NFT_OP_OBJ_UNREGISTER, "nft_unregister_obj" },
{ AUDIT_NFT_OP_OBJ_RESET, "nft_reset_obj" },
{ AUDIT_NFT_OP_FLOWTABLE_REGISTER, "nft_register_flowtable" },
{ AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, "nft_unregister_flowtable" },
{ AUDIT_NFT_OP_INVALID, "nft_invalid" },
};
static int audit_match_perm(struct audit_context *ctx, int mask)
{
unsigned n;
if (unlikely(!ctx))
return 0;
n = ctx->major;
switch (audit_classify_syscall(ctx->arch, n)) {
case AUDITSC_NATIVE:
if ((mask & AUDIT_PERM_WRITE) &&
audit_match_class(AUDIT_CLASS_WRITE, n))
return 1;
if ((mask & AUDIT_PERM_READ) &&
audit_match_class(AUDIT_CLASS_READ, n))
return 1;
if ((mask & AUDIT_PERM_ATTR) &&
audit_match_class(AUDIT_CLASS_CHATTR, n))
return 1;
return 0;
case AUDITSC_COMPAT: /* 32bit on biarch */
if ((mask & AUDIT_PERM_WRITE) &&
audit_match_class(AUDIT_CLASS_WRITE_32, n))
return 1;
if ((mask & AUDIT_PERM_READ) &&
audit_match_class(AUDIT_CLASS_READ_32, n))
return 1;
if ((mask & AUDIT_PERM_ATTR) &&
audit_match_class(AUDIT_CLASS_CHATTR_32, n))
return 1;
return 0;
case AUDITSC_OPEN:
return mask & ACC_MODE(ctx->argv[1]);
case AUDITSC_OPENAT:
return mask & ACC_MODE(ctx->argv[2]);
case AUDITSC_SOCKETCALL:
return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND);
case AUDITSC_EXECVE:
return mask & AUDIT_PERM_EXEC;
case AUDITSC_OPENAT2:
return mask & ACC_MODE((u32)ctx->openat2.flags);
default:
return 0;
}
}
static int audit_match_filetype(struct audit_context *ctx, int val)
{
struct audit_names *n;
umode_t mode = (umode_t)val;
if (unlikely(!ctx))
return 0;
list_for_each_entry(n, &ctx->names_list, list) {
if ((n->ino != AUDIT_INO_UNSET) &&
((n->mode & S_IFMT) == mode))
return 1;
}
return 0;
}
/*
* We keep a linked list of fixed-sized (31 pointer) arrays of audit_chunk *;
* ->first_trees points to its beginning, ->trees - to the current end of data.
* ->tree_count is the number of free entries in array pointed to by ->trees.
* Original condition is (NULL, NULL, 0); as soon as it grows we never revert to NULL,
* "empty" becomes (p, p, 31) afterwards. We don't shrink the list (and seriously,
* it's going to remain 1-element for almost any setup) until we free context itself.
* References in it _are_ dropped - at the same time we free/drop aux stuff.
*/
static void audit_set_auditable(struct audit_context *ctx)
{
if (!ctx->prio) {
ctx->prio = 1;
ctx->current_state = AUDIT_STATE_RECORD;
}
}
static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
{
struct audit_tree_refs *p = ctx->trees;
int left = ctx->tree_count;
if (likely(left)) {
p->c[--left] = chunk;
ctx->tree_count = left;
return 1;
}
if (!p)
return 0;
p = p->next;
if (p) {
p->c[30] = chunk;
ctx->trees = p;
ctx->tree_count = 30;
return 1;
}
return 0;
}
static int grow_tree_refs(struct audit_context *ctx)
{
struct audit_tree_refs *p = ctx->trees;
ctx->trees = kzalloc(sizeof(struct audit_tree_refs), GFP_KERNEL);
if (!ctx->trees) {
ctx->trees = p;
return 0;
}
if (p)
p->next = ctx->trees;
else
ctx->first_trees = ctx->trees;
ctx->tree_count = 31;
return 1;
}
static void unroll_tree_refs(struct audit_context *ctx,
struct audit_tree_refs *p, int count)
{
struct audit_tree_refs *q;
int n;
if (!p) {
/* we started with empty chain */
p = ctx->first_trees;
count = 31;
/* if the very first allocation has failed, nothing to do */
if (!p)
return;
}
n = count;
for (q = p; q != ctx->trees; q = q->next, n = 31) {
while (n--) {
audit_put_chunk(q->c[n]);
q->c[n] = NULL;
}
}
while (n-- > ctx->tree_count) {
audit_put_chunk(q->c[n]);
q->c[n] = NULL;
}
ctx->trees = p;
ctx->tree_count = count;
}
static void free_tree_refs(struct audit_context *ctx)
{
struct audit_tree_refs *p, *q;
for (p = ctx->first_trees; p; p = q) {
q = p->next;
kfree(p);
}
}
static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
{
struct audit_tree_refs *p;
int n;
if (!tree)
return 0;
/* full ones */
for (p = ctx->first_trees; p != ctx->trees; p = p->next) {
for (n = 0; n < 31; n++)
if (audit_tree_match(p->c[n], tree))
return 1;
}
/* partial */
if (p) {
for (n = ctx->tree_count; n < 31; n++)
if (audit_tree_match(p->c[n], tree))
return 1;
}
return 0;
}
static int audit_compare_uid(kuid_t uid,
struct audit_names *name,
struct audit_field *f,
struct audit_context *ctx)
{
struct audit_names *n;
int rc;
if (name) {
rc = audit_uid_comparator(uid, f->op, name->uid);
if (rc)
return rc;
}
if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
rc = audit_uid_comparator(uid, f->op, n->uid);
if (rc)
return rc;
}
}
return 0;
}
static int audit_compare_gid(kgid_t gid,
struct audit_names *name,
struct audit_field *f,
struct audit_context *ctx)
{
struct audit_names *n;
int rc;
if (name) {
rc = audit_gid_comparator(gid, f->op, name->gid);
if (rc)
return rc;
}
if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
rc = audit_gid_comparator(gid, f->op, n->gid);
if (rc)
return rc;
}
}
return 0;
}
static int audit_field_compare(struct task_struct *tsk,
const struct cred *cred,
struct audit_field *f,
struct audit_context *ctx,
struct audit_names *name)
{
switch (f->val) {
/* process to file object comparisons */
case AUDIT_COMPARE_UID_TO_OBJ_UID:
return audit_compare_uid(cred->uid, name, f, ctx);
case AUDIT_COMPARE_GID_TO_OBJ_GID:
return audit_compare_gid(cred->gid, name, f, ctx);
case AUDIT_COMPARE_EUID_TO_OBJ_UID:
return audit_compare_uid(cred->euid, name, f, ctx);
case AUDIT_COMPARE_EGID_TO_OBJ_GID:
return audit_compare_gid(cred->egid, name, f, ctx);
case AUDIT_COMPARE_AUID_TO_OBJ_UID:
return audit_compare_uid(audit_get_loginuid(tsk), name, f, ctx);
case AUDIT_COMPARE_SUID_TO_OBJ_UID:
return audit_compare_uid(cred->suid, name, f, ctx);
case AUDIT_COMPARE_SGID_TO_OBJ_GID:
return audit_compare_gid(cred->sgid, name, f, ctx);
case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
return audit_compare_uid(cred->fsuid, name, f, ctx);
case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
return audit_compare_gid(cred->fsgid, name, f, ctx);
/* uid comparisons */
case AUDIT_COMPARE_UID_TO_AUID:
return audit_uid_comparator(cred->uid, f->op,
audit_get_loginuid(tsk));
case AUDIT_COMPARE_UID_TO_EUID:
return audit_uid_comparator(cred->uid, f->op, cred->euid);
case AUDIT_COMPARE_UID_TO_SUID:
return audit_uid_comparator(cred->uid, f->op, cred->suid);
case AUDIT_COMPARE_UID_TO_FSUID:
return audit_uid_comparator(cred->uid, f->op, cred->fsuid);
/* auid comparisons */
case AUDIT_COMPARE_AUID_TO_EUID:
return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
cred->euid);
case AUDIT_COMPARE_AUID_TO_SUID:
return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
cred->suid);
case AUDIT_COMPARE_AUID_TO_FSUID:
return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
cred->fsuid);
/* euid comparisons */
case AUDIT_COMPARE_EUID_TO_SUID:
return audit_uid_comparator(cred->euid, f->op, cred->suid);
case AUDIT_COMPARE_EUID_TO_FSUID:
return audit_uid_comparator(cred->euid, f->op, cred->fsuid);
/* suid comparisons */
case AUDIT_COMPARE_SUID_TO_FSUID:
return audit_uid_comparator(cred->suid, f->op, cred->fsuid);
/* gid comparisons */
case AUDIT_COMPARE_GID_TO_EGID:
return audit_gid_comparator(cred->gid, f->op, cred->egid);
case AUDIT_COMPARE_GID_TO_SGID:
return audit_gid_comparator(cred->gid, f->op, cred->sgid);
case AUDIT_COMPARE_GID_TO_FSGID:
return audit_gid_comparator(cred->gid, f->op, cred->fsgid);
/* egid comparisons */
case AUDIT_COMPARE_EGID_TO_SGID:
return audit_gid_comparator(cred->egid, f->op, cred->sgid);
case AUDIT_COMPARE_EGID_TO_FSGID:
return audit_gid_comparator(cred->egid, f->op, cred->fsgid);
/* sgid comparison */
case AUDIT_COMPARE_SGID_TO_FSGID:
return audit_gid_comparator(cred->sgid, f->op, cred->fsgid);
default:
WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n");
return 0;
}
return 0;
}
/* Determine if any context name data matches a rule's watch data */
/* Compare a task_struct with an audit_rule. Return 1 on match, 0
* otherwise.
*
* If task_creation is true, this is an explicit indication that we are
* filtering a task rule at task creation time. This and tsk == current are
* the only situations where tsk->cred may be accessed without an rcu read lock.
*/
static int audit_filter_rules(struct task_struct *tsk,
struct audit_krule *rule,
struct audit_context *ctx,
struct audit_names *name,
enum audit_state *state,
bool task_creation)
{
const struct cred *cred;
int i, need_sid = 1;
u32 sid;
unsigned int sessionid;
if (ctx && rule->prio <= ctx->prio)
return 0;
cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
for (i = 0; i < rule->field_count; i++) {
struct audit_field *f = &rule->fields[i];
struct audit_names *n;
int result = 0;
pid_t pid;
switch (f->type) {
case AUDIT_PID:
pid = task_tgid_nr(tsk);
result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_PPID:
if (ctx) {
if (!ctx->ppid)
ctx->ppid = task_ppid_nr(tsk);
result = audit_comparator(ctx->ppid, f->op, f->val);
}
break;
case AUDIT_EXE:
result = audit_exe_compare(tsk, rule->exe);
if (f->op == Audit_not_equal)
result = !result;
break;
case AUDIT_UID:
result = audit_uid_comparator(cred->uid, f->op, f->uid);
break;
case AUDIT_EUID:
result = audit_uid_comparator(cred->euid, f->op, f->uid);
break;
case AUDIT_SUID:
result = audit_uid_comparator(cred->suid, f->op, f->uid);
break;
case AUDIT_FSUID:
result = audit_uid_comparator(cred->fsuid, f->op, f->uid);
break;
case AUDIT_GID:
result = audit_gid_comparator(cred->gid, f->op, f->gid);
if (f->op == Audit_equal) {
if (!result)
result = groups_search(cred->group_info, f->gid);
} else if (f->op == Audit_not_equal) {
if (result)
result = !groups_search(cred->group_info, f->gid);
}
break;
case AUDIT_EGID:
result = audit_gid_comparator(cred->egid, f->op, f->gid);
if (f->op == Audit_equal) {
if (!result)
result = groups_search(cred->group_info, f->gid);
} else if (f->op == Audit_not_equal) {
if (result)
result = !groups_search(cred->group_info, f->gid);
}
break;
case AUDIT_SGID:
result = audit_gid_comparator(cred->sgid, f->op, f->gid);
break;
case AUDIT_FSGID:
result = audit_gid_comparator(cred->fsgid, f->op, f->gid);
break;
case AUDIT_SESSIONID:
sessionid = audit_get_sessionid(tsk);
result = audit_comparator(sessionid, f->op, f->val);
break;
case AUDIT_PERS:
result = audit_comparator(tsk->personality, f->op, f->val);
break;
case AUDIT_ARCH:
if (ctx)
result = audit_comparator(ctx->arch, f->op, f->val);
break;
case AUDIT_EXIT:
if (ctx && ctx->return_valid != AUDITSC_INVALID)
result = audit_comparator(ctx->return_code, f->op, f->val);
break;
case AUDIT_SUCCESS:
if (ctx && ctx->return_valid != AUDITSC_INVALID) {
if (f->val)
result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS);
else
result = audit_comparator(ctx->return_valid, f->op, AUDITSC_FAILURE);
}
break;
case AUDIT_DEVMAJOR:
if (name) {
if (audit_comparator(MAJOR(name->dev), f->op, f->val) ||
audit_comparator(MAJOR(name->rdev), f->op, f->val))
++result;
} else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
if (audit_comparator(MAJOR(n->dev), f->op, f->val) ||
audit_comparator(MAJOR(n->rdev), f->op, f->val)) {
++result;
break;
}
}
}
break;
case AUDIT_DEVMINOR:
if (name) {
if (audit_comparator(MINOR(name->dev), f->op, f->val) ||
audit_comparator(MINOR(name->rdev), f->op, f->val))
++result;
} else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
if (audit_comparator(MINOR(n->dev), f->op, f->val) ||
audit_comparator(MINOR(n->rdev), f->op, f->val)) {
++result;
break;
}
}
}
break;
case AUDIT_INODE:
if (name)
result = audit_comparator(name->ino, f->op, f->val);
else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
if (audit_comparator(n->ino, f->op, f->val)) {
++result;
break;
}
}
}
break;
case AUDIT_OBJ_UID:
if (name) {
result = audit_uid_comparator(name->uid, f->op, f->uid);
} else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
if (audit_uid_comparator(n->uid, f->op, f->uid)) {
++result;
break;
}
}
}
break;
case AUDIT_OBJ_GID:
if (name) {
result = audit_gid_comparator(name->gid, f->op, f->gid);
} else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
if (audit_gid_comparator(n->gid, f->op, f->gid)) {
++result;
break;
}
}
}
break;
case AUDIT_WATCH:
if (name) {
result = audit_watch_compare(rule->watch,
name->ino,
name->dev);
if (f->op == Audit_not_equal)
result = !result;
}
break;
case AUDIT_DIR:
if (ctx) {
result = match_tree_refs(ctx, rule->tree);
if (f->op == Audit_not_equal)
result = !result;
}
break;
case AUDIT_LOGINUID:
result = audit_uid_comparator(audit_get_loginuid(tsk),
f->op, f->uid);
break;
case AUDIT_LOGINUID_SET:
result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
break;
case AUDIT_SADDR_FAM:
if (ctx && ctx->sockaddr)
result = audit_comparator(ctx->sockaddr->ss_family,
f->op, f->val);
break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
/* NOTE: this may return negative values indicating
a temporary error. We simply treat this as a
match for now to avoid losing information that
may be wanted. An error message will also be
logged upon error */
if (f->lsm_rule) {
if (need_sid) {
/* @tsk should always be equal to
* @current with the exception of
* fork()/copy_process() in which case
* the new @tsk creds are still a dup
* of @current's creds so we can still
* use security_current_getsecid_subj()
* here even though it always refs
* @current's creds
*/
security_current_getsecid_subj(&sid);
need_sid = 0;
}
result = security_audit_rule_match(sid, f->type,
f->op,
f->lsm_rule);
}
break;
case AUDIT_OBJ_USER:
case AUDIT_OBJ_ROLE:
case AUDIT_OBJ_TYPE:
case AUDIT_OBJ_LEV_LOW:
case AUDIT_OBJ_LEV_HIGH:
/* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR
also applies here */
if (f->lsm_rule) {
/* Find files that match */
if (name) {
result = security_audit_rule_match(
name->osid,
f->type,
f->op,
f->lsm_rule);
} else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
if (security_audit_rule_match(
n->osid,
f->type,
f->op,
f->lsm_rule)) {
++result;
break;
}
}
}
/* Find ipc objects that match */
if (!ctx || ctx->type != AUDIT_IPC)
break;
if (security_audit_rule_match(ctx->ipc.osid,
f->type, f->op,
f->lsm_rule))
++result;
}
break;
case AUDIT_ARG0:
case AUDIT_ARG1:
case AUDIT_ARG2:
case AUDIT_ARG3:
if (ctx)
result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val);
break;
case AUDIT_FILTERKEY:
/* ignore this field for filtering */
result = 1;
break;
case AUDIT_PERM:
result = audit_match_perm(ctx, f->val);
if (f->op == Audit_not_equal)
result = !result;
break;
case AUDIT_FILETYPE:
result = audit_match_filetype(ctx, f->val);
if (f->op == Audit_not_equal)
result = !result;
break;
case AUDIT_FIELD_COMPARE:
result = audit_field_compare(tsk, cred, f, ctx, name);
break;
}
if (!result)
return 0;
}
if (ctx) {
if (rule->filterkey) {
kfree(ctx->filterkey);
ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
}
ctx->prio = rule->prio;
}
switch (rule->action) {
case AUDIT_NEVER:
*state = AUDIT_STATE_DISABLED;
break;
case AUDIT_ALWAYS:
*state = AUDIT_STATE_RECORD;
break;
}
return 1;
}
/* At process creation time, we can determine if system-call auditing is
* completely disabled for this task. Since we only have the task
* structure at this point, we can only check uid and gid.
*/
static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
{
struct audit_entry *e;
enum audit_state state;
rcu_read_lock();
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
&state, true)) {
if (state == AUDIT_STATE_RECORD)
*key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
rcu_read_unlock();
return state;
}
}
rcu_read_unlock();
return AUDIT_STATE_BUILD;
}
static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
{
int word, bit;
if (val > 0xffffffff)
return false;
word = AUDIT_WORD(val);
if (word >= AUDIT_BITMASK_SIZE)
return false;
bit = AUDIT_BIT(val);
return rule->mask[word] & bit;
}
/**
* audit_filter_uring - apply filters to an io_uring operation
* @tsk: associated task
* @ctx: audit context
*/
static void audit_filter_uring(struct task_struct *tsk,
struct audit_context *ctx)
{
struct audit_entry *e;
enum audit_state state;
if (auditd_test_task(tsk))
return;
rcu_read_lock();
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_URING_EXIT],
list) {
if (audit_in_mask(&e->rule, ctx->uring_op) &&
audit_filter_rules(tsk, &e->rule, ctx, NULL, &state,
false)) {
rcu_read_unlock();
ctx->current_state = state;
return;
}
}
rcu_read_unlock();
}
/* At syscall exit time, this filter is called if the audit_state is
* not low enough that auditing cannot take place, but is also not
* high enough that we already know we have to write an audit record
* (i.e., the state is AUDIT_STATE_BUILD).
*/
static void audit_filter_syscall(struct task_struct *tsk,
struct audit_context *ctx)
{
struct audit_entry *e;
enum audit_state state;
if (auditd_test_task(tsk))
return;
rcu_read_lock();
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_EXIT], list) {
if (audit_in_mask(&e->rule, ctx->major) &&
audit_filter_rules(tsk, &e->rule, ctx, NULL,
&state, false)) {
rcu_read_unlock();
ctx->current_state = state;
return;
}
}
rcu_read_unlock();
return;
}
/*
* Given an audit_name check the inode hash table to see if they match.
* Called holding the rcu read lock to protect the use of audit_inode_hash
*/
static int audit_filter_inode_name(struct task_struct *tsk,
struct audit_names *n,
struct audit_context *ctx) {
int h = audit_hash_ino((u32)n->ino);
struct list_head *list = &audit_inode_hash[h];
struct audit_entry *e;
enum audit_state state;
list_for_each_entry_rcu(e, list, list) {
if (audit_in_mask(&e->rule, ctx->major) &&
audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
ctx->current_state = state;
return 1;
}
}
return 0;
}
/* At syscall exit time, this filter is called if any audit_names have been
* collected during syscall processing. We only check rules in sublists at hash
* buckets applicable to the inode numbers in audit_names.
* Regarding audit_state, same rules apply as for audit_filter_syscall().
*/
void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
{
struct audit_names *n;
if (auditd_test_task(tsk))
return;
rcu_read_lock();
list_for_each_entry(n, &ctx->names_list, list) {
if (audit_filter_inode_name(tsk, n, ctx))
break;
}
rcu_read_unlock();
}
static inline void audit_proctitle_free(struct audit_context *context)
{
kfree(context->proctitle.value);
context->proctitle.value = NULL;
context->proctitle.len = 0;
}
static inline void audit_free_module(struct audit_context *context)
{
if (context->type == AUDIT_KERN_MODULE) {
kfree(context->module.name);
context->module.name = NULL;
}
}
static inline void audit_free_names(struct audit_context *context)
{
struct audit_names *n, *next;
list_for_each_entry_safe(n, next, &context->names_list, list) {
list_del(&n->list);
if (n->name)
putname(n->name);
if (n->should_free)
kfree(n);
}
context->name_count = 0;
path_put(&context->pwd);
context->pwd.dentry = NULL;
context->pwd.mnt = NULL;
}
static inline void audit_free_aux(struct audit_context *context)
{
struct audit_aux_data *aux;
while ((aux = context->aux)) {
context->aux = aux->next;
kfree(aux);
}
context->aux = NULL;
while ((aux = context->aux_pids)) {
context->aux_pids = aux->next;
kfree(aux);
}
context->aux_pids = NULL;
}
/**
* audit_reset_context - reset a audit_context structure
* @ctx: the audit_context to reset
*
* All fields in the audit_context will be reset to an initial state, all
* references held by fields will be dropped, and private memory will be
* released. When this function returns the audit_context will be suitable
* for reuse, so long as the passed context is not NULL or a dummy context.
*/
static void audit_reset_context(struct audit_context *ctx)
{
if (!ctx)
return;
/* if ctx is non-null, reset the "ctx->state" regardless */
ctx->context = AUDIT_CTX_UNUSED;
if (ctx->dummy)
return;
/*
* NOTE: It shouldn't matter in what order we release the fields, so
* release them in the order in which they appear in the struct;
* this gives us some hope of quickly making sure we are
* resetting the audit_context properly.
*
* Other things worth mentioning:
* - we don't reset "dummy"
* - we don't reset "state", we do reset "current_state"
* - we preserve "filterkey" if "state" is AUDIT_STATE_RECORD
* - much of this is likely overkill, but play it safe for now
* - we really need to work on improving the audit_context struct
*/
ctx->current_state = ctx->state;
ctx->serial = 0;
ctx->major = 0;
ctx->uring_op = 0;
ctx->ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 };
memset(ctx->argv, 0, sizeof(ctx->argv));
ctx->return_code = 0;
ctx->prio = (ctx->state == AUDIT_STATE_RECORD ? ~0ULL : 0);
ctx->return_valid = AUDITSC_INVALID;
audit_free_names(ctx);
if (ctx->state != AUDIT_STATE_RECORD) {
kfree(ctx->filterkey);
ctx->filterkey = NULL;
}
audit_free_aux(ctx);
kfree(ctx->sockaddr);
ctx->sockaddr = NULL;
ctx->sockaddr_len = 0;
ctx->pid = ctx->ppid = 0;
ctx->uid = ctx->euid = ctx->suid = ctx->fsuid = KUIDT_INIT(0);
ctx->gid = ctx->egid = ctx->sgid = ctx->fsgid = KGIDT_INIT(0);
ctx->personality = 0;
ctx->arch = 0;
ctx->target_pid = 0;
ctx->target_auid = ctx->target_uid = KUIDT_INIT(0);
ctx->target_sessionid = 0;
ctx->target_sid = 0;
ctx->target_comm[0] = '\0';
unroll_tree_refs(ctx, NULL, 0);
WARN_ON(!list_empty(&ctx->killed_trees));
ctx->type = 0;
audit_free_module(ctx);
ctx->fds[0] = -1;
audit_proctitle_free(ctx);
}
static inline struct audit_context *audit_alloc_context(enum audit_state state)
{
struct audit_context *context;
context = kzalloc(sizeof(*context), GFP_KERNEL);
if (!context)
return NULL;
context->context = AUDIT_CTX_UNUSED;
context->state = state;
context->prio = state == AUDIT_STATE_RECORD ? ~0ULL : 0;
INIT_LIST_HEAD(&context->killed_trees);
INIT_LIST_HEAD(&context->names_list);
context->fds[0] = -1;
context->return_valid = AUDITSC_INVALID;
return context;
}
/**
* audit_alloc - allocate an audit context block for a task
* @tsk: task
*
* Filter on the task information and allocate a per-task audit context
* if necessary. Doing so turns on system call auditing for the
* specified task. This is called from copy_process, so no lock is
* needed.
*/
int audit_alloc(struct task_struct *tsk)
{
struct audit_context *context;
enum audit_state state;
char *key = NULL;
if (likely(!audit_ever_enabled))
return 0;
state = audit_filter_task(tsk, &key);
if (state == AUDIT_STATE_DISABLED) {
clear_task_syscall_work(tsk, SYSCALL_AUDIT);
return 0;
}
if (!(context = audit_alloc_context(state))) {
kfree(key);
audit_log_lost("out of memory in audit_alloc");
return -ENOMEM;
}
context->filterkey = key;
audit_set_context(tsk, context);
set_task_syscall_work(tsk, SYSCALL_AUDIT);
return 0;
}
/**
* audit_alloc_kernel - allocate an audit_context for a kernel task
* @tsk: the kernel task
*
* Similar to the audit_alloc() function, but intended for kernel private
* threads. Returns zero on success, negative values on failure.
*/
int audit_alloc_kernel(struct task_struct *tsk)
{
/*
* At the moment we are just going to call into audit_alloc() to
* simplify the code, but there two things to keep in mind with this
* approach:
*
* 1. Filtering internal kernel tasks is a bit laughable in almost all
* cases, but there is at least one case where there is a benefit:
* the '-a task,never' case allows the admin to effectively disable
* task auditing at runtime.
*
* 2. The {set,clear}_task_syscall_work() ops likely have zero effect
* on these internal kernel tasks, but they probably don't hurt either.
*/
return audit_alloc(tsk);
}
static inline void audit_free_context(struct audit_context *context)
{
/* resetting is extra work, but it is likely just noise */
audit_reset_context(context);
free_tree_refs(context);
kfree(context->filterkey);
kfree(context);
}
static int audit_log_pid_context(struct audit_context *context, pid_t pid,
kuid_t auid, kuid_t uid, unsigned int sessionid,
u32 sid, char *comm)
{
struct audit_buffer *ab;
char *ctx = NULL;
u32 len;
int rc = 0;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID);
if (!ab)
return rc;
audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid,
from_kuid(&init_user_ns, auid),
from_kuid(&init_user_ns, uid), sessionid);
if (sid) {
if (security_secid_to_secctx(sid, &ctx, &len)) {
audit_log_format(ab, " obj=(none)");
rc = 1;
} else {
audit_log_format(ab, " obj=%s", ctx);
security_release_secctx(ctx, len);
}
}
audit_log_format(ab, " ocomm=");
audit_log_untrustedstring(ab, comm);
audit_log_end(ab);
return rc;
}
static void audit_log_execve_info(struct audit_context *context,
struct audit_buffer **ab)
{
long len_max;
long len_rem;
long len_full;
long len_buf;
long len_abuf = 0;
long len_tmp;
bool require_data;
bool encode;
unsigned int iter;
unsigned int arg;
char *buf_head;
char *buf;
const char __user *p = (const char __user *)current->mm->arg_start;
/* NOTE: this buffer needs to be large enough to hold all the non-arg
* data we put in the audit record for this argument (see the
* code below) ... at this point in time 96 is plenty */
char abuf[96];
/* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the
* current value of 7500 is not as important as the fact that it
* is less than 8k, a setting of 7500 gives us plenty of wiggle
* room if we go over a little bit in the logging below */
WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500);
len_max = MAX_EXECVE_AUDIT_LEN;
/* scratch buffer to hold the userspace args */
buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
if (!buf_head) {
audit_panic("out of memory for argv string");
return;
}
buf = buf_head;
audit_log_format(*ab, "argc=%d", context->execve.argc);
len_rem = len_max;
len_buf = 0;
len_full = 0;
require_data = true;
encode = false;
iter = 0;
arg = 0;
do {
/* NOTE: we don't ever want to trust this value for anything
* serious, but the audit record format insists we
* provide an argument length for really long arguments,
* e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but
* to use strncpy_from_user() to obtain this value for
* recording in the log, although we don't use it
* anywhere here to avoid a double-fetch problem */
if (len_full == 0)
len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1;
/* read more data from userspace */
if (require_data) {
/* can we make more room in the buffer? */
if (buf != buf_head) {
memmove(buf_head, buf, len_buf);
buf = buf_head;
}
/* fetch as much as we can of the argument */
len_tmp = strncpy_from_user(&buf_head[len_buf], p,
len_max - len_buf);
if (len_tmp == -EFAULT) {
/* unable to copy from userspace */
send_sig(SIGKILL, current, 0);
goto out;
} else if (len_tmp == (len_max - len_buf)) {
/* buffer is not large enough */
require_data = true;
/* NOTE: if we are going to span multiple
* buffers force the encoding so we stand
* a chance at a sane len_full value and
* consistent record encoding */
encode = true;
len_full = len_full * 2;
p += len_tmp;
} else {
require_data = false;
if (!encode)
encode = audit_string_contains_control(
buf, len_tmp);
/* try to use a trusted value for len_full */
if (len_full < len_max)
len_full = (encode ?
len_tmp * 2 : len_tmp);
p += len_tmp + 1;
}
len_buf += len_tmp;
buf_head[len_buf] = '\0';
/* length of the buffer in the audit record? */
len_abuf = (encode ? len_buf * 2 : len_buf + 2);
}
/* write as much as we can to the audit log */
if (len_buf >= 0) {
/* NOTE: some magic numbers here - basically if we
* can't fit a reasonable amount of data into the
* existing audit buffer, flush it and start with
* a new buffer */
if ((sizeof(abuf) + 8) > len_rem) {
len_rem = len_max;
audit_log_end(*ab);
*ab = audit_log_start(context,
GFP_KERNEL, AUDIT_EXECVE);
if (!*ab)
goto out;
}
/* create the non-arg portion of the arg record */
len_tmp = 0;
if (require_data || (iter > 0) ||
((len_abuf + sizeof(abuf)) > len_rem)) {
if (iter == 0) {
len_tmp += snprintf(&abuf[len_tmp],
sizeof(abuf) - len_tmp,
" a%d_len=%lu",
arg, len_full);
}
len_tmp += snprintf(&abuf[len_tmp],
sizeof(abuf) - len_tmp,
" a%d[%d]=", arg, iter++);
} else
len_tmp += snprintf(&abuf[len_tmp],
sizeof(abuf) - len_tmp,
" a%d=", arg);
WARN_ON(len_tmp >= sizeof(abuf));
abuf[sizeof(abuf) - 1] = '\0';
/* log the arg in the audit record */
audit_log_format(*ab, "%s", abuf);
len_rem -= len_tmp;
len_tmp = len_buf;
if (encode) {
if (len_abuf > len_rem)
len_tmp = len_rem / 2; /* encoding */
audit_log_n_hex(*ab, buf, len_tmp);
len_rem -= len_tmp * 2;
len_abuf -= len_tmp * 2;
} else {
if (len_abuf > len_rem)
len_tmp = len_rem - 2; /* quotes */
audit_log_n_string(*ab, buf, len_tmp);
len_rem -= len_tmp + 2;
/* don't subtract the "2" because we still need
* to add quotes to the remaining string */
len_abuf -= len_tmp;
}
len_buf -= len_tmp;
buf += len_tmp;
}
/* ready to move to the next argument? */
if ((len_buf == 0) && !require_data) {
arg++;
iter = 0;
len_full = 0;
require_data = true;
encode = false;
}
} while (arg < context->execve.argc);
/* NOTE: the caller handles the final audit_log_end() call */
out:
kfree(buf_head);
}
static void audit_log_cap(struct audit_buffer *ab, char *prefix,
kernel_cap_t *cap)
{
int i;
if (cap_isclear(*cap)) {
audit_log_format(ab, " %s=0", prefix);
return;
}
audit_log_format(ab, " %s=", prefix);
CAP_FOR_EACH_U32(i)
audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]);
}
static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
{
if (name->fcap_ver == -1) {
audit_log_format(ab, " cap_fe=? cap_fver=? cap_fp=? cap_fi=?");
return;
}
audit_log_cap(ab, "cap_fp", &name->fcap.permitted);
audit_log_cap(ab, "cap_fi", &name->fcap.inheritable);
audit_log_format(ab, " cap_fe=%d cap_fver=%x cap_frootid=%d",
name->fcap.fE, name->fcap_ver,
from_kuid(&init_user_ns, name->fcap.rootid));
}
static void show_special(struct audit_context *context, int *call_panic)
{
struct audit_buffer *ab;
int i;
ab = audit_log_start(context, GFP_KERNEL, context->type);
if (!ab)
return;
switch (context->type) {
case AUDIT_SOCKETCALL: {
int nargs = context->socketcall.nargs;
audit_log_format(ab, "nargs=%d", nargs);
for (i = 0; i < nargs; i++)
audit_log_format(ab, " a%d=%lx", i,
context->socketcall.args[i]);
break; }
case AUDIT_IPC: {
u32 osid = context->ipc.osid;
audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
from_kuid(&init_user_ns, context->ipc.uid),
from_kgid(&init_user_ns, context->ipc.gid),
context->ipc.mode);
if (osid) {
char *ctx = NULL;
u32 len;
if (security_secid_to_secctx(osid, &ctx, &len)) {
audit_log_format(ab, " osid=%u", osid);
*call_panic = 1;
} else {
audit_log_format(ab, " obj=%s", ctx);
security_release_secctx(ctx, len);
}
}
if (context->ipc.has_perm) {
audit_log_end(ab);
ab = audit_log_start(context, GFP_KERNEL,
AUDIT_IPC_SET_PERM);
if (unlikely(!ab))
return;
audit_log_format(ab,
"qbytes=%lx ouid=%u ogid=%u mode=%#ho",
context->ipc.qbytes,
context->ipc.perm_uid,
context->ipc.perm_gid,
context->ipc.perm_mode);
}
break; }
case AUDIT_MQ_OPEN:
audit_log_format(ab,
"oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "
"mq_msgsize=%ld mq_curmsgs=%ld",
context->mq_open.oflag, context->mq_open.mode,
context->mq_open.attr.mq_flags,
context->mq_open.attr.mq_maxmsg,
context->mq_open.attr.mq_msgsize,
context->mq_open.attr.mq_curmsgs);
break;
case AUDIT_MQ_SENDRECV:
audit_log_format(ab,
"mqdes=%d msg_len=%zd msg_prio=%u "
"abs_timeout_sec=%lld abs_timeout_nsec=%ld",
context->mq_sendrecv.mqdes,
context->mq_sendrecv.msg_len,
context->mq_sendrecv.msg_prio,
(long long) context->mq_sendrecv.abs_timeout.tv_sec,
context->mq_sendrecv.abs_timeout.tv_nsec);
break;
case AUDIT_MQ_NOTIFY:
audit_log_format(ab, "mqdes=%d sigev_signo=%d",
context->mq_notify.mqdes,
context->mq_notify.sigev_signo);
break;
case AUDIT_MQ_GETSETATTR: {
struct mq_attr *attr = &context->mq_getsetattr.mqstat;
audit_log_format(ab,
"mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
"mq_curmsgs=%ld ",
context->mq_getsetattr.mqdes,
attr->mq_flags, attr->mq_maxmsg,
attr->mq_msgsize, attr->mq_curmsgs);
break; }
case AUDIT_CAPSET:
audit_log_format(ab, "pid=%d", context->capset.pid);
audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable);
audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
audit_log_cap(ab, "cap_pa", &context->capset.cap.ambient);
break;
case AUDIT_MMAP:
audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
context->mmap.flags);
break;
case AUDIT_OPENAT2:
audit_log_format(ab, "oflag=0%llo mode=0%llo resolve=0x%llx",
context->openat2.flags,
context->openat2.mode,
context->openat2.resolve);
break;
case AUDIT_EXECVE:
audit_log_execve_info(context, &ab);
break;
case AUDIT_KERN_MODULE:
audit_log_format(ab, "name=");
if (context->module.name) {
audit_log_untrustedstring(ab, context->module.name);
} else
audit_log_format(ab, "(null)");
break;
}
audit_log_end(ab);
}
static inline int audit_proctitle_rtrim(char *proctitle, int len)
{
char *end = proctitle + len - 1;
while (end > proctitle && !isprint(*end))
end--;
/* catch the case where proctitle is only 1 non-print character */
len = end - proctitle + 1;
len -= isprint(proctitle[len-1]) == 0;
return len;
}
/*
* audit_log_name - produce AUDIT_PATH record from struct audit_names
* @context: audit_context for the task
* @n: audit_names structure with reportable details
* @path: optional path to report instead of audit_names->name
* @record_num: record number to report when handling a list of names
* @call_panic: optional pointer to int that will be updated if secid fails
*/
static void audit_log_name(struct audit_context *context, struct audit_names *n,
const struct path *path, int record_num, int *call_panic)
{
struct audit_buffer *ab;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
if (!ab)
return;
audit_log_format(ab, "item=%d", record_num);
if (path)
audit_log_d_path(ab, " name=", path);
else if (n->name) {
switch (n->name_len) {
case AUDIT_NAME_FULL:
/* log the full path */
audit_log_format(ab, " name=");
audit_log_untrustedstring(ab, n->name->name);
break;
case 0:
/* name was specified as a relative path and the
* directory component is the cwd
*/
if (context->pwd.dentry && context->pwd.mnt)
audit_log_d_path(ab, " name=", &context->pwd);
else
audit_log_format(ab, " name=(null)");
break;
default:
/* log the name's directory component */
audit_log_format(ab, " name=");
audit_log_n_untrustedstring(ab, n->name->name,
n->name_len);
}
} else
audit_log_format(ab, " name=(null)");
if (n->ino != AUDIT_INO_UNSET)
audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x",
n->ino,
MAJOR(n->dev),
MINOR(n->dev),
n->mode,
from_kuid(&init_user_ns, n->uid),
from_kgid(&init_user_ns, n->gid),
MAJOR(n->rdev),
MINOR(n->rdev));
if (n->osid != 0) {
char *ctx = NULL;
u32 len;
if (security_secid_to_secctx(
n->osid, &ctx, &len)) {
audit_log_format(ab, " osid=%u", n->osid);
if (call_panic)
*call_panic = 2;
} else {
audit_log_format(ab, " obj=%s", ctx);
security_release_secctx(ctx, len);
}
}
/* log the audit_names record type */
switch (n->type) {
case AUDIT_TYPE_NORMAL:
audit_log_format(ab, " nametype=NORMAL");
break;
case AUDIT_TYPE_PARENT:
audit_log_format(ab, " nametype=PARENT");
break;
case AUDIT_TYPE_CHILD_DELETE:
audit_log_format(ab, " nametype=DELETE");
break;
case AUDIT_TYPE_CHILD_CREATE:
audit_log_format(ab, " nametype=CREATE");
break;
default:
audit_log_format(ab, " nametype=UNKNOWN");
break;
}
audit_log_fcaps(ab, n);
audit_log_end(ab);
}
static void audit_log_proctitle(void)
{
int res;
char *buf;
char *msg = "(null)";
int len = strlen(msg);
struct audit_context *context = audit_context();
struct audit_buffer *ab;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE);
if (!ab)
return; /* audit_panic or being filtered */
audit_log_format(ab, "proctitle=");
/* Not cached */
if (!context->proctitle.value) {
buf = kmalloc(MAX_PROCTITLE_AUDIT_LEN, GFP_KERNEL);
if (!buf)
goto out;
/* Historically called this from procfs naming */
res = get_cmdline(current, buf, MAX_PROCTITLE_AUDIT_LEN);
if (res == 0) {
kfree(buf);
goto out;
}
res = audit_proctitle_rtrim(buf, res);
if (res == 0) {
kfree(buf);
goto out;
}
context->proctitle.value = buf;
context->proctitle.len = res;
}
msg = context->proctitle.value;
len = context->proctitle.len;
out:
audit_log_n_untrustedstring(ab, msg, len);
audit_log_end(ab);
}
/**
* audit_log_uring - generate a AUDIT_URINGOP record
* @ctx: the audit context
*/
static void audit_log_uring(struct audit_context *ctx)
{
struct audit_buffer *ab;
const struct cred *cred;
ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_URINGOP);
if (!ab)
return;
cred = current_cred();
audit_log_format(ab, "uring_op=%d", ctx->uring_op);
if (ctx->return_valid != AUDITSC_INVALID)
audit_log_format(ab, " success=%s exit=%ld",
(ctx->return_valid == AUDITSC_SUCCESS ?
"yes" : "no"),
ctx->return_code);
audit_log_format(ab,
" items=%d"
" ppid=%d pid=%d uid=%u gid=%u euid=%u suid=%u"
" fsuid=%u egid=%u sgid=%u fsgid=%u",
ctx->name_count,
task_ppid_nr(current), task_tgid_nr(current),
from_kuid(&init_user_ns, cred->uid),
from_kgid(&init_user_ns, cred->gid),
from_kuid(&init_user_ns, cred->euid),
from_kuid(&init_user_ns, cred->suid),
from_kuid(&init_user_ns, cred->fsuid),
from_kgid(&init_user_ns, cred->egid),
from_kgid(&init_user_ns, cred->sgid),
from_kgid(&init_user_ns, cred->fsgid));
audit_log_task_context(ab);
audit_log_key(ab, ctx->filterkey);
audit_log_end(ab);
}
static void audit_log_exit(void)
{
int i, call_panic = 0;
struct audit_context *context = audit_context();
struct audit_buffer *ab;
struct audit_aux_data *aux;
struct audit_names *n;
context->personality = current->personality;
switch (context->context) {
case AUDIT_CTX_SYSCALL:
ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
if (!ab)
return;
audit_log_format(ab, "arch=%x syscall=%d",
context->arch, context->major);
if (context->personality != PER_LINUX)
audit_log_format(ab, " per=%lx", context->personality);
if (context->return_valid != AUDITSC_INVALID)
audit_log_format(ab, " success=%s exit=%ld",
(context->return_valid == AUDITSC_SUCCESS ?
"yes" : "no"),
context->return_code);
audit_log_format(ab,
" a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
context->argv[0],
context->argv[1],
context->argv[2],
context->argv[3],
context->name_count);
audit_log_task_info(ab);
audit_log_key(ab, context->filterkey);
audit_log_end(ab);
break;
case AUDIT_CTX_URING:
audit_log_uring(context);
break;
default:
BUG();
break;
}
for (aux = context->aux; aux; aux = aux->next) {
ab = audit_log_start(context, GFP_KERNEL, aux->type);
if (!ab)
continue; /* audit_panic has been called */
switch (aux->type) {
case AUDIT_BPRM_FCAPS: {
struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
audit_log_format(ab, "fver=%x", axs->fcap_ver);
audit_log_cap(ab, "fp", &axs->fcap.permitted);
audit_log_cap(ab, "fi", &axs->fcap.inheritable);
audit_log_format(ab, " fe=%d", axs->fcap.fE);
audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted);
audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable);
audit_log_cap(ab, "old_pe", &axs->old_pcap.effective);
audit_log_cap(ab, "old_pa", &axs->old_pcap.ambient);
audit_log_cap(ab, "pp", &axs->new_pcap.permitted);
audit_log_cap(ab, "pi", &axs->new_pcap.inheritable);
audit_log_cap(ab, "pe", &axs->new_pcap.effective);
audit_log_cap(ab, "pa", &axs->new_pcap.ambient);
audit_log_format(ab, " frootid=%d",
from_kuid(&init_user_ns,
axs->fcap.rootid));
break; }
}
audit_log_end(ab);
}
if (context->type)
show_special(context, &call_panic);
if (context->fds[0] >= 0) {
ab = audit_log_start(context, GFP_KERNEL, AUDIT_FD_PAIR);
if (ab) {
audit_log_format(ab, "fd0=%d fd1=%d",
context->fds[0], context->fds[1]);
audit_log_end(ab);
}
}
if (context->sockaddr_len) {
ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR);
if (ab) {
audit_log_format(ab, "saddr=");
audit_log_n_hex(ab, (void *)context->sockaddr,
context->sockaddr_len);
audit_log_end(ab);
}
}
for (aux = context->aux_pids; aux; aux = aux->next) {
struct audit_aux_data_pids *axs = (void *)aux;
for (i = 0; i < axs->pid_count; i++)
if (audit_log_pid_context(context, axs->target_pid[i],
axs->target_auid[i],
axs->target_uid[i],
axs->target_sessionid[i],
axs->target_sid[i],
axs->target_comm[i]))
call_panic = 1;
}
if (context->target_pid &&
audit_log_pid_context(context, context->target_pid,
context->target_auid, context->target_uid,
context->target_sessionid,
context->target_sid, context->target_comm))
call_panic = 1;
if (context->pwd.dentry && context->pwd.mnt) {
ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
if (ab) {
audit_log_d_path(ab, "cwd=", &context->pwd);
audit_log_end(ab);
}
}
i = 0;
list_for_each_entry(n, &context->names_list, list) {
if (n->hidden)
continue;
audit_log_name(context, n, NULL, i++, &call_panic);
}
if (context->context == AUDIT_CTX_SYSCALL)
audit_log_proctitle();
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
if (ab)
audit_log_end(ab);
if (call_panic)
audit_panic("error in audit_log_exit()");
}
/**
* __audit_free - free a per-task audit context
* @tsk: task whose audit context block to free
*
* Called from copy_process, do_exit, and the io_uring code
*/
void __audit_free(struct task_struct *tsk)
{
struct audit_context *context = tsk->audit_context;
if (!context)
return;
/* this may generate CONFIG_CHANGE records */
if (!list_empty(&context->killed_trees))
audit_kill_trees(context);
/* We are called either by do_exit() or the fork() error handling code;
* in the former case tsk == current and in the latter tsk is a
* random task_struct that doesn't doesn't have any meaningful data we
* need to log via audit_log_exit().
*/
if (tsk == current && !context->dummy) {
context->return_valid = AUDITSC_INVALID;
context->return_code = 0;
if (context->context == AUDIT_CTX_SYSCALL) {
audit_filter_syscall(tsk, context);
audit_filter_inodes(tsk, context);
if (context->current_state == AUDIT_STATE_RECORD)
audit_log_exit();
} else if (context->context == AUDIT_CTX_URING) {
/* TODO: verify this case is real and valid */
audit_filter_uring(tsk, context);
audit_filter_inodes(tsk, context);
if (context->current_state == AUDIT_STATE_RECORD)
audit_log_uring(context);
}
}
audit_set_context(tsk, NULL);
audit_free_context(context);
}
/**
* audit_return_fixup - fixup the return codes in the audit_context
* @ctx: the audit_context
* @success: true/false value to indicate if the operation succeeded or not
* @code: operation return code
*
* We need to fixup the return code in the audit logs if the actual return
* codes are later going to be fixed by the arch specific signal handlers.
*/
static void audit_return_fixup(struct audit_context *ctx,
int success, long code)
{
/*
* This is actually a test for:
* (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
* (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
*
* but is faster than a bunch of ||
*/
if (unlikely(code <= -ERESTARTSYS) &&
(code >= -ERESTART_RESTARTBLOCK) &&
(code != -ENOIOCTLCMD))
ctx->return_code = -EINTR;
else
ctx->return_code = code;
ctx->return_valid = (success ? AUDITSC_SUCCESS : AUDITSC_FAILURE);
}
/**
* __audit_uring_entry - prepare the kernel task's audit context for io_uring
* @op: the io_uring opcode
*
* This is similar to audit_syscall_entry() but is intended for use by io_uring
* operations. This function should only ever be called from
* audit_uring_entry() as we rely on the audit context checking present in that
* function.
*/
void __audit_uring_entry(u8 op)
{
struct audit_context *ctx = audit_context();
if (ctx->state == AUDIT_STATE_DISABLED)
return;
/*
* NOTE: It's possible that we can be called from the process' context
* before it returns to userspace, and before audit_syscall_exit()
* is called. In this case there is not much to do, just record
* the io_uring details and return.
*/
ctx->uring_op = op;
if (ctx->context == AUDIT_CTX_SYSCALL)
return;
ctx->dummy = !audit_n_rules;
if (!ctx->dummy && ctx->state == AUDIT_STATE_BUILD)
ctx->prio = 0;
ctx->context = AUDIT_CTX_URING;
ctx->current_state = ctx->state;
ktime_get_coarse_real_ts64(&ctx->ctime);
}
/**
* __audit_uring_exit - wrap up the kernel task's audit context after io_uring
* @success: true/false value to indicate if the operation succeeded or not
* @code: operation return code
*
* This is similar to audit_syscall_exit() but is intended for use by io_uring
* operations. This function should only ever be called from
* audit_uring_exit() as we rely on the audit context checking present in that
* function.
*/
void __audit_uring_exit(int success, long code)
{
struct audit_context *ctx = audit_context();
if (ctx->context == AUDIT_CTX_SYSCALL) {
/*
* NOTE: See the note in __audit_uring_entry() about the case
* where we may be called from process context before we
* return to userspace via audit_syscall_exit(). In this
* case we simply emit a URINGOP record and bail, the
* normal syscall exit handling will take care of
* everything else.
* It is also worth mentioning that when we are called,
* the current process creds may differ from the creds
* used during the normal syscall processing; keep that
* in mind if/when we move the record generation code.
*/
/*
* We need to filter on the syscall info here to decide if we
* should emit a URINGOP record. I know it seems odd but this
* solves the problem where users have a filter to block *all*
* syscall records in the "exit" filter; we want to preserve
* the behavior here.
*/
audit_filter_syscall(current, ctx);
if (ctx->current_state != AUDIT_STATE_RECORD)
audit_filter_uring(current, ctx);
audit_filter_inodes(current, ctx);
if (ctx->current_state != AUDIT_STATE_RECORD)
return;
audit_log_uring(ctx);
return;
}
/* this may generate CONFIG_CHANGE records */
if (!list_empty(&ctx->killed_trees))
audit_kill_trees(ctx);
/* run through both filters to ensure we set the filterkey properly */
audit_filter_uring(current, ctx);
audit_filter_inodes(current, ctx);
if (ctx->current_state != AUDIT_STATE_RECORD)
goto out;
audit_return_fixup(ctx, success, code);
audit_log_exit();
out:
audit_reset_context(ctx);
}
/**
* __audit_syscall_entry - fill in an audit record at syscall entry
* @major: major syscall type (function)
* @a1: additional syscall register 1
* @a2: additional syscall register 2
* @a3: additional syscall register 3
* @a4: additional syscall register 4
*
* Fill in audit context at syscall entry. This only happens if the
* audit context was created when the task was created and the state or
* filters demand the audit context be built. If the state from the
* per-task filter or from the per-syscall filter is AUDIT_STATE_RECORD,
* then the record will be written at syscall exit time (otherwise, it
* will only be written if another part of the kernel requests that it
* be written).
*/
void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
unsigned long a3, unsigned long a4)
{
struct audit_context *context = audit_context();
enum audit_state state;
if (!audit_enabled || !context)
return;
WARN_ON(context->context != AUDIT_CTX_UNUSED);
WARN_ON(context->name_count);
if (context->context != AUDIT_CTX_UNUSED || context->name_count) {
audit_panic("unrecoverable error in audit_syscall_entry()");
return;
}
state = context->state;
if (state == AUDIT_STATE_DISABLED)
return;
context->dummy = !audit_n_rules;
if (!context->dummy && state == AUDIT_STATE_BUILD) {
context->prio = 0;
if (auditd_test_task(current))
return;
}
context->arch = syscall_get_arch(current);
context->major = major;
context->argv[0] = a1;
context->argv[1] = a2;
context->argv[2] = a3;
context->argv[3] = a4;
context->context = AUDIT_CTX_SYSCALL;
context->current_state = state;
ktime_get_coarse_real_ts64(&context->ctime);
}
/**
* __audit_syscall_exit - deallocate audit context after a system call
* @success: success value of the syscall
* @return_code: return value of the syscall
*
* Tear down after system call. If the audit context has been marked as
* auditable (either because of the AUDIT_STATE_RECORD state from
* filtering, or because some other part of the kernel wrote an audit
* message), then write out the syscall information. In call cases,
* free the names stored from getname().
*/
void __audit_syscall_exit(int success, long return_code)
{
struct audit_context *context = audit_context();
if (!context || context->dummy ||
context->context != AUDIT_CTX_SYSCALL)
goto out;
/* this may generate CONFIG_CHANGE records */
if (!list_empty(&context->killed_trees))
audit_kill_trees(context);
/* run through both filters to ensure we set the filterkey properly */
audit_filter_syscall(current, context);
audit_filter_inodes(current, context);
if (context->current_state < AUDIT_STATE_RECORD)
goto out;
audit_return_fixup(context, success, return_code);
audit_log_exit();
out:
audit_reset_context(context);
}
static inline void handle_one(const struct inode *inode)
{
struct audit_context *context;
struct audit_tree_refs *p;
struct audit_chunk *chunk;
int count;
if (likely(!inode->i_fsnotify_marks))
return;
context = audit_context();
p = context->trees;
count = context->tree_count;
rcu_read_lock();
chunk = audit_tree_lookup(inode);
rcu_read_unlock();
if (!chunk)
return;
if (likely(put_tree_ref(context, chunk)))
return;
if (unlikely(!grow_tree_refs(context))) {
pr_warn("out of memory, audit has lost a tree reference\n");
audit_set_auditable(context);
audit_put_chunk(chunk);
unroll_tree_refs(context, p, count);
return;
}
put_tree_ref(context, chunk);
}
static void handle_path(const struct dentry *dentry)
{
struct audit_context *context;
struct audit_tree_refs *p;
const struct dentry *d, *parent;
struct audit_chunk *drop;
unsigned long seq;
int count;
context = audit_context();
p = context->trees;
count = context->tree_count;
retry:
drop = NULL;
d = dentry;
rcu_read_lock();
seq = read_seqbegin(&rename_lock);
for(;;) {
struct inode *inode = d_backing_inode(d);
if (inode && unlikely(inode->i_fsnotify_marks)) {
struct audit_chunk *chunk;
chunk = audit_tree_lookup(inode);
if (chunk) {
if (unlikely(!put_tree_ref(context, chunk))) {
drop = chunk;
break;
}
}
}
parent = d->d_parent;
if (parent == d)
break;
d = parent;
}
if (unlikely(read_seqretry(&rename_lock, seq) || drop)) { /* in this order */
rcu_read_unlock();
if (!drop) {
/* just a race with rename */
unroll_tree_refs(context, p, count);
goto retry;
}
audit_put_chunk(drop);
if (grow_tree_refs(context)) {
/* OK, got more space */
unroll_tree_refs(context, p, count);
goto retry;
}
/* too bad */
pr_warn("out of memory, audit has lost a tree reference\n");
unroll_tree_refs(context, p, count);
audit_set_auditable(context);
return;
}
rcu_read_unlock();
}
static struct audit_names *audit_alloc_name(struct audit_context *context,
unsigned char type)
{
struct audit_names *aname;
if (context->name_count < AUDIT_NAMES) {
aname = &context->preallocated_names[context->name_count];
memset(aname, 0, sizeof(*aname));
} else {
aname = kzalloc(sizeof(*aname), GFP_NOFS);
if (!aname)
return NULL;
aname->should_free = true;
}
aname->ino = AUDIT_INO_UNSET;
aname->type = type;
list_add_tail(&aname->list, &context->names_list);
context->name_count++;
if (!context->pwd.dentry)
get_fs_pwd(current->fs, &context->pwd);
return aname;
}
/**
* __audit_reusename - fill out filename with info from existing entry
* @uptr: userland ptr to pathname
*
* Search the audit_names list for the current audit context. If there is an
* existing entry with a matching "uptr" then return the filename
* associated with that audit_name. If not, return NULL.
*/
struct filename *
__audit_reusename(const __user char *uptr)
{
struct audit_context *context = audit_context();
struct audit_names *n;
list_for_each_entry(n, &context->names_list, list) {
if (!n->name)
continue;
if (n->name->uptr == uptr) {
n->name->refcnt++;
return n->name;
}
}
return NULL;
}
/**
* __audit_getname - add a name to the list
* @name: name to add
*
* Add a name to the list of audit names for this context.
* Called from fs/namei.c:getname().
*/
void __audit_getname(struct filename *name)
{
struct audit_context *context = audit_context();
struct audit_names *n;
if (context->context == AUDIT_CTX_UNUSED)
return;
n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
if (!n)
return;
n->name = name;
n->name_len = AUDIT_NAME_FULL;
name->aname = n;
name->refcnt++;
}
static inline int audit_copy_fcaps(struct audit_names *name,
const struct dentry *dentry)
{
struct cpu_vfs_cap_data caps;
int rc;
if (!dentry)
return 0;
rc = get_vfs_caps_from_disk(&init_user_ns, dentry, &caps);
if (rc)
return rc;
name->fcap.permitted = caps.permitted;
name->fcap.inheritable = caps.inheritable;
name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
name->fcap.rootid = caps.rootid;
name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
VFS_CAP_REVISION_SHIFT;
return 0;
}
/* Copy inode data into an audit_names. */
static void audit_copy_inode(struct audit_names *name,
const struct dentry *dentry,
struct inode *inode, unsigned int flags)
{
name->ino = inode->i_ino;
name->dev = inode->i_sb->s_dev;
name->mode = inode->i_mode;
name->uid = inode->i_uid;
name->gid = inode->i_gid;
name->rdev = inode->i_rdev;
security_inode_getsecid(inode, &name->osid);
if (flags & AUDIT_INODE_NOEVAL) {
name->fcap_ver = -1;
return;
}
audit_copy_fcaps(name, dentry);
}
/**
* __audit_inode - store the inode and device from a lookup
* @name: name being audited
* @dentry: dentry being audited
* @flags: attributes for this particular entry
*/
void __audit_inode(struct filename *name, const struct dentry *dentry,
unsigned int flags)
{
struct audit_context *context = audit_context();
struct inode *inode = d_backing_inode(dentry);
struct audit_names *n;
bool parent = flags & AUDIT_INODE_PARENT;
struct audit_entry *e;
struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
int i;
if (context->context == AUDIT_CTX_UNUSED)
return;
rcu_read_lock();
list_for_each_entry_rcu(e, list, list) {
for (i = 0; i < e->rule.field_count; i++) {
struct audit_field *f = &e->rule.fields[i];
if (f->type == AUDIT_FSTYPE
&& audit_comparator(inode->i_sb->s_magic,
f->op, f->val)
&& e->rule.action == AUDIT_NEVER) {
rcu_read_unlock();
return;
}
}
}
rcu_read_unlock();
if (!name)
goto out_alloc;
/*
* If we have a pointer to an audit_names entry already, then we can
* just use it directly if the type is correct.
*/
n = name->aname;
if (n) {
if (parent) {
if (n->type == AUDIT_TYPE_PARENT ||
n->type == AUDIT_TYPE_UNKNOWN)
goto out;
} else {
if (n->type != AUDIT_TYPE_PARENT)
goto out;
}
}
list_for_each_entry_reverse(n, &context->names_list, list) {
if (n->ino) {
/* valid inode number, use that for the comparison */
if (n->ino != inode->i_ino ||
n->dev != inode->i_sb->s_dev)
continue;
} else if (n->name) {
/* inode number has not been set, check the name */
if (strcmp(n->name->name, name->name))
continue;
} else
/* no inode and no name (?!) ... this is odd ... */
continue;
/* match the correct record type */
if (parent) {
if (n->type == AUDIT_TYPE_PARENT ||
n->type == AUDIT_TYPE_UNKNOWN)
goto out;
} else {
if (n->type != AUDIT_TYPE_PARENT)
goto out;
}
}
out_alloc:
/* unable to find an entry with both a matching name and type */
n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
if (!n)
return;
if (name) {
n->name = name;
name->refcnt++;
}
out:
if (parent) {
n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
n->type = AUDIT_TYPE_PARENT;
if (flags & AUDIT_INODE_HIDDEN)
n->hidden = true;
} else {
n->name_len = AUDIT_NAME_FULL;
n->type = AUDIT_TYPE_NORMAL;
}
handle_path(dentry);
audit_copy_inode(n, dentry, inode, flags & AUDIT_INODE_NOEVAL);
}
void __audit_file(const struct file *file)
{
__audit_inode(NULL, file->f_path.dentry, 0);
}
/**
* __audit_inode_child - collect inode info for created/removed objects
* @parent: inode of dentry parent
* @dentry: dentry being audited
* @type: AUDIT_TYPE_* value that we're looking for
*
* For syscalls that create or remove filesystem objects, audit_inode
* can only collect information for the filesystem object's parent.
* This call updates the audit context with the child's information.
* Syscalls that create a new filesystem object must be hooked after
* the object is created. Syscalls that remove a filesystem object
* must be hooked prior, in order to capture the target inode during
* unsuccessful attempts.
*/
void __audit_inode_child(struct inode *parent,
const struct dentry *dentry,
const unsigned char type)
{
struct audit_context *context = audit_context();
struct inode *inode = d_backing_inode(dentry);
const struct qstr *dname = &dentry->d_name;
struct audit_names *n, *found_parent = NULL, *found_child = NULL;
struct audit_entry *e;
struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
int i;
if (context->context == AUDIT_CTX_UNUSED)
return;
rcu_read_lock();
list_for_each_entry_rcu(e, list, list) {
for (i = 0; i < e->rule.field_count; i++) {
struct audit_field *f = &e->rule.fields[i];
if (f->type == AUDIT_FSTYPE
&& audit_comparator(parent->i_sb->s_magic,
f->op, f->val)
&& e->rule.action == AUDIT_NEVER) {
rcu_read_unlock();
return;
}
}
}
rcu_read_unlock();
if (inode)
handle_one(inode);
/* look for a parent entry first */
list_for_each_entry(n, &context->names_list, list) {
if (!n->name ||
(n->type != AUDIT_TYPE_PARENT &&
n->type != AUDIT_TYPE_UNKNOWN))
continue;
if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev &&
!audit_compare_dname_path(dname,
n->name->name, n->name_len)) {
if (n->type == AUDIT_TYPE_UNKNOWN)
n->type = AUDIT_TYPE_PARENT;
found_parent = n;
break;
}
}
/* is there a matching child entry? */
list_for_each_entry(n, &context->names_list, list) {
/* can only match entries that have a name */
if (!n->name ||
(n->type != type && n->type != AUDIT_TYPE_UNKNOWN))
continue;
if (!strcmp(dname->name, n->name->name) ||
!audit_compare_dname_path(dname, n->name->name,
found_parent ?
found_parent->name_len :
AUDIT_NAME_FULL)) {
if (n->type == AUDIT_TYPE_UNKNOWN)
n->type = type;
found_child = n;
break;
}
}
if (!found_parent) {
/* create a new, "anonymous" parent record */
n = audit_alloc_name(context, AUDIT_TYPE_PARENT);
if (!n)
return;
audit_copy_inode(n, NULL, parent, 0);
}
if (!found_child) {
found_child = audit_alloc_name(context, type);
if (!found_child)
return;
/* Re-use the name belonging to the slot for a matching parent
* directory. All names for this context are relinquished in
* audit_free_names() */
if (found_parent) {
found_child->name = found_parent->name;
found_child->name_len = AUDIT_NAME_FULL;
found_child->name->refcnt++;
}
}
if (inode)
audit_copy_inode(found_child, dentry, inode, 0);
else
found_child->ino = AUDIT_INO_UNSET;
}
EXPORT_SYMBOL_GPL(__audit_inode_child);
/**
* auditsc_get_stamp - get local copies of audit_context values
* @ctx: audit_context for the task
* @t: timespec64 to store time recorded in the audit_context
* @serial: serial value that is recorded in the audit_context
*
* Also sets the context as auditable.
*/
int auditsc_get_stamp(struct audit_context *ctx,
struct timespec64 *t, unsigned int *serial)
{
if (ctx->context == AUDIT_CTX_UNUSED)
return 0;
if (!ctx->serial)
ctx->serial = audit_serial();
t->tv_sec = ctx->ctime.tv_sec;
t->tv_nsec = ctx->ctime.tv_nsec;
*serial = ctx->serial;
if (!ctx->prio) {
ctx->prio = 1;
ctx->current_state = AUDIT_STATE_RECORD;
}
return 1;
}
/**
* __audit_mq_open - record audit data for a POSIX MQ open
* @oflag: open flag
* @mode: mode bits
* @attr: queue attributes
*
*/
void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
{
struct audit_context *context = audit_context();
if (attr)
memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr));
else
memset(&context->mq_open.attr, 0, sizeof(struct mq_attr));
context->mq_open.oflag = oflag;
context->mq_open.mode = mode;
context->type = AUDIT_MQ_OPEN;
}
/**
* __audit_mq_sendrecv - record audit data for a POSIX MQ timed send/receive
* @mqdes: MQ descriptor
* @msg_len: Message length
* @msg_prio: Message priority
* @abs_timeout: Message timeout in absolute time
*
*/
void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
const struct timespec64 *abs_timeout)
{
struct audit_context *context = audit_context();
struct timespec64 *p = &context->mq_sendrecv.abs_timeout;
if (abs_timeout)
memcpy(p, abs_timeout, sizeof(*p));
else
memset(p, 0, sizeof(*p));
context->mq_sendrecv.mqdes = mqdes;
context->mq_sendrecv.msg_len = msg_len;
context->mq_sendrecv.msg_prio = msg_prio;
context->type = AUDIT_MQ_SENDRECV;
}
/**
* __audit_mq_notify - record audit data for a POSIX MQ notify
* @mqdes: MQ descriptor
* @notification: Notification event
*
*/
void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
{
struct audit_context *context = audit_context();
if (notification)
context->mq_notify.sigev_signo = notification->sigev_signo;
else
context->mq_notify.sigev_signo = 0;
context->mq_notify.mqdes = mqdes;
context->type = AUDIT_MQ_NOTIFY;
}
/**
* __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute
* @mqdes: MQ descriptor
* @mqstat: MQ flags
*
*/
void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
{
struct audit_context *context = audit_context();
context->mq_getsetattr.mqdes = mqdes;
context->mq_getsetattr.mqstat = *mqstat;
context->type = AUDIT_MQ_GETSETATTR;
}
/**
* __audit_ipc_obj - record audit data for ipc object
* @ipcp: ipc permissions
*
*/
void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
{
struct audit_context *context = audit_context();
context->ipc.uid = ipcp->uid;
context->ipc.gid = ipcp->gid;
context->ipc.mode = ipcp->mode;
context->ipc.has_perm = 0;
security_ipc_getsecid(ipcp, &context->ipc.osid);
context->type = AUDIT_IPC;
}
/**
* __audit_ipc_set_perm - record audit data for new ipc permissions
* @qbytes: msgq bytes
* @uid: msgq user id
* @gid: msgq group id
* @mode: msgq mode (permissions)
*
* Called only after audit_ipc_obj().
*/
void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
{
struct audit_context *context = audit_context();
context->ipc.qbytes = qbytes;
context->ipc.perm_uid = uid;
context->ipc.perm_gid = gid;
context->ipc.perm_mode = mode;
context->ipc.has_perm = 1;
}
void __audit_bprm(struct linux_binprm *bprm)
{
struct audit_context *context = audit_context();
context->type = AUDIT_EXECVE;
context->execve.argc = bprm->argc;
}
/**
* __audit_socketcall - record audit data for sys_socketcall
* @nargs: number of args, which should not be more than AUDITSC_ARGS.
* @args: args array
*
*/
int __audit_socketcall(int nargs, unsigned long *args)
{
struct audit_context *context = audit_context();
if (nargs <= 0 || nargs > AUDITSC_ARGS || !args)
return -EINVAL;
context->type = AUDIT_SOCKETCALL;
context->socketcall.nargs = nargs;
memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
return 0;
}
/**
* __audit_fd_pair - record audit data for pipe and socketpair
* @fd1: the first file descriptor
* @fd2: the second file descriptor
*
*/
void __audit_fd_pair(int fd1, int fd2)
{
struct audit_context *context = audit_context();
context->fds[0] = fd1;
context->fds[1] = fd2;
}
/**
* __audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
* @len: data length in user space
* @a: data address in kernel space
*
* Returns 0 for success or NULL context or < 0 on error.
*/
int __audit_sockaddr(int len, void *a)
{
struct audit_context *context = audit_context();
if (!context->sockaddr) {
void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
if (!p)
return -ENOMEM;
context->sockaddr = p;
}
context->sockaddr_len = len;
memcpy(context->sockaddr, a, len);
return 0;
}
void __audit_ptrace(struct task_struct *t)
{
struct audit_context *context = audit_context();
context->target_pid = task_tgid_nr(t);
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
security_task_getsecid_obj(t, &context->target_sid);
memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
}
/**
* audit_signal_info_syscall - record signal info for syscalls
* @t: task being signaled
*
* If the audit subsystem is being terminated, record the task (pid)
* and uid that is doing that.
*/
int audit_signal_info_syscall(struct task_struct *t)
{
struct audit_aux_data_pids *axp;
struct audit_context *ctx = audit_context();
kuid_t t_uid = task_uid(t);
if (!audit_signals || audit_dummy_context())
return 0;
/* optimize the common case by putting first signal recipient directly
* in audit_context */
if (!ctx->target_pid) {
ctx->target_pid = task_tgid_nr(t);
ctx->target_auid = audit_get_loginuid(t);
ctx->target_uid = t_uid;
ctx->target_sessionid = audit_get_sessionid(t);
security_task_getsecid_obj(t, &ctx->target_sid);
memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
return 0;
}
axp = (void *)ctx->aux_pids;
if (!axp || axp->pid_count == AUDIT_AUX_PIDS) {
axp = kzalloc(sizeof(*axp), GFP_ATOMIC);
if (!axp)
return -ENOMEM;
axp->d.type = AUDIT_OBJ_PID;
axp->d.next = ctx->aux_pids;
ctx->aux_pids = (void *)axp;
}
BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS);
axp->target_pid[axp->pid_count] = task_tgid_nr(t);
axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
axp->target_uid[axp->pid_count] = t_uid;
axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
security_task_getsecid_obj(t, &axp->target_sid[axp->pid_count]);
memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
axp->pid_count++;
return 0;
}
/**
* __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps
* @bprm: pointer to the bprm being processed
* @new: the proposed new credentials
* @old: the old credentials
*
* Simply check if the proc already has the caps given by the file and if not
* store the priv escalation info for later auditing at the end of the syscall
*
* -Eric
*/
int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
const struct cred *new, const struct cred *old)
{
struct audit_aux_data_bprm_fcaps *ax;
struct audit_context *context = audit_context();
struct cpu_vfs_cap_data vcaps;
ax = kmalloc(sizeof(*ax), GFP_KERNEL);
if (!ax)
return -ENOMEM;
ax->d.type = AUDIT_BPRM_FCAPS;
ax->d.next = context->aux;
context->aux = (void *)ax;
get_vfs_caps_from_disk(&init_user_ns,
bprm->file->f_path.dentry, &vcaps);
ax->fcap.permitted = vcaps.permitted;
ax->fcap.inheritable = vcaps.inheritable;
ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
ax->fcap.rootid = vcaps.rootid;
ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
ax->old_pcap.permitted = old->cap_permitted;
ax->old_pcap.inheritable = old->cap_inheritable;
ax->old_pcap.effective = old->cap_effective;
ax->old_pcap.ambient = old->cap_ambient;
ax->new_pcap.permitted = new->cap_permitted;
ax->new_pcap.inheritable = new->cap_inheritable;
ax->new_pcap.effective = new->cap_effective;
ax->new_pcap.ambient = new->cap_ambient;
return 0;
}
/**
* __audit_log_capset - store information about the arguments to the capset syscall
* @new: the new credentials
* @old: the old (current) credentials
*
* Record the arguments userspace sent to sys_capset for later printing by the
* audit system if applicable
*/
void __audit_log_capset(const struct cred *new, const struct cred *old)
{
struct audit_context *context = audit_context();
context->capset.pid = task_tgid_nr(current);
context->capset.cap.effective = new->cap_effective;
context->capset.cap.inheritable = new->cap_effective;
context->capset.cap.permitted = new->cap_permitted;
context->capset.cap.ambient = new->cap_ambient;
context->type = AUDIT_CAPSET;
}
void __audit_mmap_fd(int fd, int flags)
{
struct audit_context *context = audit_context();
context->mmap.fd = fd;
context->mmap.flags = flags;
context->type = AUDIT_MMAP;
}
void __audit_openat2_how(struct open_how *how)
{
struct audit_context *context = audit_context();
context->openat2.flags = how->flags;
context->openat2.mode = how->mode;
context->openat2.resolve = how->resolve;
context->type = AUDIT_OPENAT2;
}
void __audit_log_kern_module(char *name)
{
struct audit_context *context = audit_context();
context->module.name = kstrdup(name, GFP_KERNEL);
if (!context->module.name)
audit_log_lost("out of memory in __audit_log_kern_module");
context->type = AUDIT_KERN_MODULE;
}
void __audit_fanotify(unsigned int response)
{
audit_log(audit_context(), GFP_KERNEL,
AUDIT_FANOTIFY, "resp=%u", response);
}
void __audit_tk_injoffset(struct timespec64 offset)
{
audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET,
"sec=%lli nsec=%li",
(long long)offset.tv_sec, offset.tv_nsec);
}
static void audit_log_ntp_val(const struct audit_ntp_data *ad,
const char *op, enum audit_ntp_type type)
{
const struct audit_ntp_val *val = &ad->vals[type];
if (val->newval == val->oldval)
return;
audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL,
"op=%s old=%lli new=%lli", op, val->oldval, val->newval);
}
void __audit_ntp_log(const struct audit_ntp_data *ad)
{
audit_log_ntp_val(ad, "offset", AUDIT_NTP_OFFSET);
audit_log_ntp_val(ad, "freq", AUDIT_NTP_FREQ);
audit_log_ntp_val(ad, "status", AUDIT_NTP_STATUS);
audit_log_ntp_val(ad, "tai", AUDIT_NTP_TAI);
audit_log_ntp_val(ad, "tick", AUDIT_NTP_TICK);
audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST);
}
void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
enum audit_nfcfgop op, gfp_t gfp)
{
struct audit_buffer *ab;
char comm[sizeof(current->comm)];
ab = audit_log_start(audit_context(), gfp, AUDIT_NETFILTER_CFG);
if (!ab)
return;
audit_log_format(ab, "table=%s family=%u entries=%u op=%s",
name, af, nentries, audit_nfcfgs[op].s);
audit_log_format(ab, " pid=%u", task_pid_nr(current));
audit_log_task_context(ab); /* subj= */
audit_log_format(ab, " comm=");
audit_log_untrustedstring(ab, get_task_comm(comm, current));
audit_log_end(ab);
}
EXPORT_SYMBOL_GPL(__audit_log_nfcfg);
static void audit_log_task(struct audit_buffer *ab)
{
kuid_t auid, uid;
kgid_t gid;
unsigned int sessionid;
char comm[sizeof(current->comm)];
auid = audit_get_loginuid(current);
sessionid = audit_get_sessionid(current);
current_uid_gid(&uid, &gid);
audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
from_kuid(&init_user_ns, auid),
from_kuid(&init_user_ns, uid),
from_kgid(&init_user_ns, gid),
sessionid);
audit_log_task_context(ab);
audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current));
audit_log_untrustedstring(ab, get_task_comm(comm, current));
audit_log_d_path_exe(ab, current->mm);
}
/**
* audit_core_dumps - record information about processes that end abnormally
* @signr: signal value
*
* If a process ends with a core dump, something fishy is going on and we
* should record the event for investigation.
*/
void audit_core_dumps(long signr)
{
struct audit_buffer *ab;
if (!audit_enabled)
return;
if (signr == SIGQUIT) /* don't care for those */
return;
ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_ABEND);
if (unlikely(!ab))
return;
audit_log_task(ab);
audit_log_format(ab, " sig=%ld res=1", signr);
audit_log_end(ab);
}
/**
* audit_seccomp - record information about a seccomp action
* @syscall: syscall number
* @signr: signal value
* @code: the seccomp action
*
* Record the information associated with a seccomp action. Event filtering for
* seccomp actions that are not to be logged is done in seccomp_log().
* Therefore, this function forces auditing independent of the audit_enabled
* and dummy context state because seccomp actions should be logged even when
* audit is not in use.
*/
void audit_seccomp(unsigned long syscall, long signr, int code)
{
struct audit_buffer *ab;
ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_SECCOMP);
if (unlikely(!ab))
return;
audit_log_task(ab);
audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x",
signr, syscall_get_arch(current), syscall,
in_compat_syscall(), KSTK_EIP(current), code);
audit_log_end(ab);
}
void audit_seccomp_actions_logged(const char *names, const char *old_names,
int res)
{
struct audit_buffer *ab;
if (!audit_enabled)
return;
ab = audit_log_start(audit_context(), GFP_KERNEL,
AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return;
audit_log_format(ab,
"op=seccomp-logging actions=%s old-actions=%s res=%d",
names, old_names, res);
audit_log_end(ab);
}
struct list_head *audit_killed_trees(void)
{
struct audit_context *ctx = audit_context();
if (likely(!ctx || ctx->context == AUDIT_CTX_UNUSED))
return NULL;
return &ctx->killed_trees;
}
// SPDX-License-Identifier: GPL-2.0
/*
* Auto-group scheduling implementation:
*/
#include <linux/nospec.h>
#include "sched.h"
unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
static struct autogroup autogroup_default;
static atomic_t autogroup_seq_nr;
void __init autogroup_init(struct task_struct *init_task)
{
autogroup_default.tg = &root_task_group;
kref_init(&autogroup_default.kref);
init_rwsem(&autogroup_default.lock);
init_task->signal->autogroup = &autogroup_default;
}
void autogroup_free(struct task_group *tg)
{
kfree(tg->autogroup);
}
static inline void autogroup_destroy(struct kref *kref)
{
struct autogroup *ag = container_of(kref, struct autogroup, kref);
#ifdef CONFIG_RT_GROUP_SCHED
/* We've redirected RT tasks to the root task group... */
ag->tg->rt_se = NULL;
ag->tg->rt_rq = NULL;
#endif
sched_release_group(ag->tg);
sched_destroy_group(ag->tg);
}
static inline void autogroup_kref_put(struct autogroup *ag)
{
kref_put(&ag->kref, autogroup_destroy);
}
static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
{
kref_get(&ag->kref);
return ag;
}
static inline struct autogroup *autogroup_task_get(struct task_struct *p)
{
struct autogroup *ag;
unsigned long flags;
if (!lock_task_sighand(p, &flags))
return autogroup_kref_get(&autogroup_default);
ag = autogroup_kref_get(p->signal->autogroup);
unlock_task_sighand(p, &flags);
return ag;
}
static inline struct autogroup *autogroup_create(void)
{
struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
struct task_group *tg;
if (!ag)
goto out_fail;
tg = sched_create_group(&root_task_group);
if (IS_ERR(tg))
goto out_free;
kref_init(&ag->kref);
init_rwsem(&ag->lock);
ag->id = atomic_inc_return(&autogroup_seq_nr);
ag->tg = tg;
#ifdef CONFIG_RT_GROUP_SCHED
/*
* Autogroup RT tasks are redirected to the root task group
* so we don't have to move tasks around upon policy change,
* or flail around trying to allocate bandwidth on the fly.
* A bandwidth exception in __sched_setscheduler() allows
* the policy change to proceed.
*/
free_rt_sched_group(tg);
tg->rt_se = root_task_group.rt_se;
tg->rt_rq = root_task_group.rt_rq;
#endif
tg->autogroup = ag;
sched_online_group(tg, &root_task_group);
return ag;
out_free:
kfree(ag);
out_fail:
if (printk_ratelimit()) {
printk(KERN_WARNING "autogroup_create: %s failure.\n",
ag ? "sched_create_group()" : "kzalloc()");
}
return autogroup_kref_get(&autogroup_default);
}
bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
{
if (tg != &root_task_group)
return false;
/*
* If we race with autogroup_move_group() the caller can use the old
* value of signal->autogroup but in this case sched_move_task() will
* be called again before autogroup_kref_put().
*
* However, there is no way sched_autogroup_exit_task() could tell us
* to avoid autogroup->tg, so we abuse PF_EXITING flag for this case.
*/
if (p->flags & PF_EXITING)
return false;
return true;
}
void sched_autogroup_exit_task(struct task_struct *p)
{
/*
* We are going to call exit_notify() and autogroup_move_group() can't
* see this thread after that: we can no longer use signal->autogroup.
* See the PF_EXITING check in task_wants_autogroup().
*/
sched_move_task(p);
}
static void
autogroup_move_group(struct task_struct *p, struct autogroup *ag)
{
struct autogroup *prev;
struct task_struct *t;
unsigned long flags;
BUG_ON(!lock_task_sighand(p, &flags));
prev = p->signal->autogroup;
if (prev == ag) {
unlock_task_sighand(p, &flags);
return;
}
p->signal->autogroup = autogroup_kref_get(ag);
/*
* We can't avoid sched_move_task() after we changed signal->autogroup,
* this process can already run with task_group() == prev->tg or we can
* race with cgroup code which can read autogroup = prev under rq->lock.
* In the latter case for_each_thread() can not miss a migrating thread,
* cpu_cgroup_attach() must not be possible after cgroup_exit() and it
* can't be removed from thread list, we hold ->siglock.
*
* If an exiting thread was already removed from thread list we rely on
* sched_autogroup_exit_task().
*/
for_each_thread(p, t)
sched_move_task(t);
unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
}
/* Allocates GFP_KERNEL, cannot be called under any spinlock: */
void sched_autogroup_create_attach(struct task_struct *p)
{
struct autogroup *ag = autogroup_create();
autogroup_move_group(p, ag);
/* Drop extra reference added by autogroup_create(): */
autogroup_kref_put(ag);
}
EXPORT_SYMBOL(sched_autogroup_create_attach);
/* Cannot be called under siglock. Currently has no users: */
void sched_autogroup_detach(struct task_struct *p)
{
autogroup_move_group(p, &autogroup_default);
}
EXPORT_SYMBOL(sched_autogroup_detach);
void sched_autogroup_fork(struct signal_struct *sig)
{
sig->autogroup = autogroup_task_get(current);
}
void sched_autogroup_exit(struct signal_struct *sig)
{
autogroup_kref_put(sig->autogroup);
}
static int __init setup_autogroup(char *str)
{
sysctl_sched_autogroup_enabled = 0;
return 1;
}
__setup("noautogroup", setup_autogroup);
#ifdef CONFIG_PROC_FS
int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
{
static unsigned long next = INITIAL_JIFFIES;
struct autogroup *ag;
unsigned long shares;
int err, idx;
if (nice < MIN_NICE || nice > MAX_NICE)
return -EINVAL;
err = security_task_setnice(current, nice);
if (err)
return err;
if (nice < 0 && !can_nice(current, nice))
return -EPERM;
/* This is a heavy operation, taking global locks.. */
if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
return -EAGAIN;
next = HZ / 10 + jiffies;
ag = autogroup_task_get(p);
idx = array_index_nospec(nice + 20, 40);
shares = scale_load(sched_prio_to_weight[idx]);
down_write(&ag->lock);
err = sched_group_set_shares(ag->tg, shares);
if (!err)
ag->nice = nice;
up_write(&ag->lock);
autogroup_kref_put(ag);
return err;
}
void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
{
struct autogroup *ag = autogroup_task_get(p);
if (!task_group_is_autogroup(ag->tg))
goto out;
down_read(&ag->lock);
seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
up_read(&ag->lock);
out:
autogroup_kref_put(ag);
}
#endif /* CONFIG_PROC_FS */
int autogroup_path(struct task_group *tg, char *buf, int buflen)
{
if (!task_group_is_autogroup(tg))
return 0;
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
}
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
*
* This file contains the interrupt probing code and driver APIs.
*/
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/async.h>
#include "internals.h"
/*
* Autodetection depends on the fact that any interrupt that
* comes in on to an unassigned handler will get stuck with
* "IRQS_WAITING" cleared and the interrupt disabled.
*/
static DEFINE_MUTEX(probing_active);
/**
* probe_irq_on - begin an interrupt autodetect
*
* Commence probing for an interrupt. The interrupts are scanned
* and a mask of potential interrupt lines is returned.
*
*/
unsigned long probe_irq_on(void)
{
struct irq_desc *desc;
unsigned long mask = 0;
int i;
/*
* quiesce the kernel, or at least the asynchronous portion
*/
async_synchronize_full();
mutex_lock(&probing_active);
/*
* something may have generated an irq long ago and we want to
* flush such a longstanding irq before considering it as spurious.
*/
for_each_irq_desc_reverse(i, desc) {
raw_spin_lock_irq(&desc->lock);
if (!desc->action && irq_settings_can_probe(desc)) {
/*
* Some chips need to know about probing in
* progress:
*/
if (desc->irq_data.chip->irq_set_type)
desc->irq_data.chip->irq_set_type(&desc->irq_data,
IRQ_TYPE_PROBE);
irq_activate_and_startup(desc, IRQ_NORESEND);
}
raw_spin_unlock_irq(&desc->lock);
}
/* Wait for longstanding interrupts to trigger. */
msleep(20);
/*
* enable any unassigned irqs
* (we must startup again here because if a longstanding irq
* happened in the previous stage, it may have masked itself)
*/
for_each_irq_desc_reverse(i, desc) {
raw_spin_lock_irq(&desc->lock);
if (!desc->action && irq_settings_can_probe(desc)) {
desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
if (irq_activate_and_startup(desc, IRQ_NORESEND))
desc->istate |= IRQS_PENDING;
}
raw_spin_unlock_irq(&desc->lock);
}
/*
* Wait for spurious interrupts to trigger
*/
msleep(100);
/*
* Now filter out any obviously spurious interrupts
*/
for_each_irq_desc(i, desc) {
raw_spin_lock_irq(&desc->lock);
if (desc->istate & IRQS_AUTODETECT) {
/* It triggered already - consider it spurious. */
if (!(desc->istate & IRQS_WAITING)) {
desc->istate &= ~IRQS_AUTODETECT;
irq_shutdown_and_deactivate(desc);
} else
if (i < 32)
mask |= 1 << i;
}
raw_spin_unlock_irq(&desc->lock);
}
return mask;
}
EXPORT_SYMBOL(probe_irq_on);
/**
* probe_irq_mask - scan a bitmap of interrupt lines
* @val: mask of interrupts to consider
*
* Scan the interrupt lines and return a bitmap of active
* autodetect interrupts. The interrupt probe logic state
* is then returned to its previous value.
*
* Note: we need to scan all the irq's even though we will
* only return autodetect irq numbers - just so that we reset
* them all to a known state.
*/
unsigned int probe_irq_mask(unsigned long val)
{
unsigned int mask = 0;
struct irq_desc *desc;
int i;
for_each_irq_desc(i, desc) {
raw_spin_lock_irq(&desc->lock);
if (desc->istate & IRQS_AUTODETECT) {
if (i < 16 && !(desc->istate & IRQS_WAITING))
mask |= 1 << i;
desc->istate &= ~IRQS_AUTODETECT;
irq_shutdown_and_deactivate(desc);
}
raw_spin_unlock_irq(&desc->lock);
}
mutex_unlock(&probing_active);
return mask & val;
}
EXPORT_SYMBOL(probe_irq_mask);
/**
* probe_irq_off - end an interrupt autodetect
* @val: mask of potential interrupts (unused)
*
* Scans the unused interrupt lines and returns the line which
* appears to have triggered the interrupt. If no interrupt was
* found then zero is returned. If more than one interrupt is
* found then minus the first candidate is returned to indicate
* their is doubt.
*
* The interrupt probe logic state is returned to its previous
* value.
*
* BUGS: When used in a module (which arguably shouldn't happen)
* nothing prevents two IRQ probe callers from overlapping. The
* results of this are non-optimal.
*/
int probe_irq_off(unsigned long val)
{
int i, irq_found = 0, nr_of_irqs = 0;
struct irq_desc *desc;
for_each_irq_desc(i, desc) {
raw_spin_lock_irq(&desc->lock);
if (desc->istate & IRQS_AUTODETECT) {
if (!(desc->istate & IRQS_WAITING)) {
if (!nr_of_irqs)
irq_found = i;
nr_of_irqs++;
}
desc->istate &= ~IRQS_AUTODETECT;
irq_shutdown_and_deactivate(desc);
}
raw_spin_unlock_irq(&desc->lock);
}
mutex_unlock(&probing_active);
if (nr_of_irqs > 1)
irq_found = -irq_found;
return irq_found;
}
EXPORT_SYMBOL(probe_irq_off);
// SPDX-License-Identifier: GPL-2.0
/*
* kernel/power/autosleep.c
*
* Opportunistic sleep support.
*
* Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl>
*/
#include <linux/device.h>
#include <linux/mutex.h>
#include <linux/pm_wakeup.h>
#include "power.h"
static suspend_state_t autosleep_state;
static struct workqueue_struct *autosleep_wq;
/*
* Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source
* is active, otherwise a deadlock with try_to_suspend() is possible.
* Alternatively mutex_lock_interruptible() can be used. This will then fail
* if an auto_sleep cycle tries to freeze processes.
*/
static DEFINE_MUTEX(autosleep_lock);
static struct wakeup_source *autosleep_ws;
static void try_to_suspend(struct work_struct *work)
{
unsigned int initial_count, final_count;
if (!pm_get_wakeup_count(&initial_count, true))
goto out;
mutex_lock(&autosleep_lock);
if (!pm_save_wakeup_count(initial_count) ||
system_state != SYSTEM_RUNNING) {
mutex_unlock(&autosleep_lock);
goto out;
}
if (autosleep_state == PM_SUSPEND_ON) {
mutex_unlock(&autosleep_lock);
return;
}
if (autosleep_state >= PM_SUSPEND_MAX)
hibernate();
else
pm_suspend(autosleep_state);
mutex_unlock(&autosleep_lock);
if (!pm_get_wakeup_count(&final_count, false))
goto out;
/*
* If the wakeup occurred for an unknown reason, wait to prevent the
* system from trying to suspend and waking up in a tight loop.
*/
if (final_count == initial_count)
schedule_timeout_uninterruptible(HZ / 2);
out:
queue_up_suspend_work();
}
static DECLARE_WORK(suspend_work, try_to_suspend);
void queue_up_suspend_work(void)
{
if (autosleep_state > PM_SUSPEND_ON)
queue_work(autosleep_wq, &suspend_work);
}
suspend_state_t pm_autosleep_state(void)
{
return autosleep_state;
}
int pm_autosleep_lock(void)
{
return mutex_lock_interruptible(&autosleep_lock);
}
void pm_autosleep_unlock(void)
{
mutex_unlock(&autosleep_lock);
}
int pm_autosleep_set_state(suspend_state_t state)
{
#ifndef CONFIG_HIBERNATION
if (state >= PM_SUSPEND_MAX)
return -EINVAL;
#endif
__pm_stay_awake(autosleep_ws);
mutex_lock(&autosleep_lock);
autosleep_state = state;
__pm_relax(autosleep_ws);
if (state > PM_SUSPEND_ON) {
pm_wakep_autosleep_enabled(true);
queue_up_suspend_work();
} else {
pm_wakep_autosleep_enabled(false);
}
mutex_unlock(&autosleep_lock);
return 0;
}
int __init pm_autosleep_init(void)
{
autosleep_ws = wakeup_source_register(NULL, "autosleep");
if (!autosleep_ws)
return -ENOMEM;
autosleep_wq = alloc_ordered_workqueue("autosleep", 0);
if (autosleep_wq)
return 0;
wakeup_source_unregister(autosleep_ws);
return -ENOMEM;
}
// SPDX-License-Identifier: GPL-2.0-only
/*
* Simple stack backtrace regression test module
*
* (C) Copyright 2008 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
*/
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/stacktrace.h>
static void backtrace_test_normal(void)
{
pr_info("Testing a backtrace from process context.\n");
pr_info("The following trace is a kernel self test and not a bug!\n");
dump_stack();
}
static DECLARE_COMPLETION(backtrace_work);
static void backtrace_test_irq_callback(unsigned long data)
{
dump_stack();
complete(&backtrace_work);
}
static DECLARE_TASKLET_OLD(backtrace_tasklet, &backtrace_test_irq_callback);
static void backtrace_test_irq(void)
{
pr_info("Testing a backtrace from irq context.\n");
pr_info("The following trace is a kernel self test and not a bug!\n");
init_completion(&backtrace_work);
tasklet_schedule(&backtrace_tasklet);
wait_for_completion(&backtrace_work);
}
#ifdef CONFIG_STACKTRACE
static void backtrace_test_saved(void)
{
unsigned long entries[8];
unsigned int nr_entries;
pr_info("Testing a saved backtrace.\n");
pr_info("The following trace is a kernel self test and not a bug!\n");
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
stack_trace_print(entries, nr_entries, 0);
}
#else
static void backtrace_test_saved(void)
{
pr_info("Saved backtrace test skipped.\n");
}
#endif
static int backtrace_regression_test(void)
{
pr_info("====[ backtrace testing ]===========\n");
backtrace_test_normal();
backtrace_test_irq();
backtrace_test_saved();
pr_info("====[ end of backtrace testing ]====\n");
return 0;
}
static void exitf(void)
{
}
module_init(backtrace_regression_test);
module_exit(exitf);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
// SPDX-License-Identifier: GPL-2.0
/*
* This code maintains a list of active profiling data structures.
*
* Copyright IBM Corp. 2009
* Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
*
* Uses gcc-internal data definitions.
* Based on the gcov-kernel patch by:
* Hubertus Franke <frankeh@us.ibm.com>
* Nigel Hinds <nhinds@us.ibm.com>
* Rajan Ravindran <rajancr@us.ibm.com>
* Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
* Paul Larson
*/
#define pr_fmt(fmt) "gcov: " fmt
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/sched.h>
#include "gcov.h"
int gcov_events_enabled;
DEFINE_MUTEX(gcov_lock);
/**
* gcov_enable_events - enable event reporting through gcov_event()
*
* Turn on reporting of profiling data load/unload-events through the
* gcov_event() callback. Also replay all previous events once. This function
* is needed because some events are potentially generated too early for the
* callback implementation to handle them initially.
*/
void gcov_enable_events(void)
{
struct gcov_info *info = NULL;
mutex_lock(&gcov_lock);
gcov_events_enabled = 1;
/* Perform event callback for previously registered entries. */
while ((info = gcov_info_next(info))) {
gcov_event(GCOV_ADD, info);
cond_resched();
}
mutex_unlock(&gcov_lock);
}
/**
* store_gcov_u32 - store 32 bit number in gcov format to buffer
* @buffer: target buffer or NULL
* @off: offset into the buffer
* @v: value to be stored
*
* Number format defined by gcc: numbers are recorded in the 32 bit
* unsigned binary form of the endianness of the machine generating the
* file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
* store anything.
*/
size_t store_gcov_u32(void *buffer, size_t off, u32 v)
{
u32 *data;
if (buffer) {
data = buffer + off;
*data = v;
}
return sizeof(*data);
}
/**
* store_gcov_u64 - store 64 bit number in gcov format to buffer
* @buffer: target buffer or NULL
* @off: offset into the buffer
* @v: value to be stored
*
* Number format defined by gcc: numbers are recorded in the 32 bit
* unsigned binary form of the endianness of the machine generating the
* file. 64 bit numbers are stored as two 32 bit numbers, the low part
* first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
* anything.
*/
size_t store_gcov_u64(void *buffer, size_t off, u64 v)
{
u32 *data;
if (buffer) {
data = buffer + off;
data[0] = (v & 0xffffffffUL);
data[1] = (v >> 32);
}
return sizeof(*data) * 2;
}
#ifdef CONFIG_MODULES
/* Update list and generate events when modules are unloaded. */
static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
void *data)
{
struct module *mod = data;
struct gcov_info *info = NULL;
struct gcov_info *prev = NULL;
if (event != MODULE_STATE_GOING)
return NOTIFY_OK;
mutex_lock(&gcov_lock);
/* Remove entries located in module from linked list. */
while ((info = gcov_info_next(info))) {
if (gcov_info_within_module(info, mod)) {
gcov_info_unlink(prev, info);
if (gcov_events_enabled)
gcov_event(GCOV_REMOVE, info);
} else
prev = info;
}
mutex_unlock(&gcov_lock);
return NOTIFY_OK;
}
static struct notifier_block gcov_nb = {
.notifier_call = gcov_module_notifier,
};
static int __init gcov_init(void)
{
return register_module_notifier(&gcov_nb);
}
device_initcall(gcov_init);
#endif /* CONFIG_MODULES */
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
*
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blktrace_api.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/time.h>
#include <linux/uaccess.h>
#include <linux/list.h>
#include <linux/blk-cgroup.h>
#include "../../block/blk.h"
#include <trace/events/block.h>
#include "trace_output.h"
#ifdef CONFIG_BLK_DEV_IO_TRACE
static unsigned int blktrace_seq __read_mostly = 1;
static struct trace_array *blk_tr;
static bool blk_tracer_enabled __read_mostly;
static LIST_HEAD(running_trace_list);
static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(running_trace_lock);
/* Select an alternative, minimalistic output than the original one */
#define TRACE_BLK_OPT_CLASSIC 0x1
#define TRACE_BLK_OPT_CGROUP 0x2
#define TRACE_BLK_OPT_CGNAME 0x4
static struct tracer_opt blk_tracer_opts[] = {
/* Default disable the minimalistic output */
{ TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
#ifdef CONFIG_BLK_CGROUP
{ TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) },
{ TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) },
#endif
{ }
};
static struct tracer_flags blk_tracer_flags = {
.val = 0,
.opts = blk_tracer_opts,
};
/* Global reference count of probes */
static DEFINE_MUTEX(blk_probe_mutex);
static int blk_probes_ref;
static void blk_register_tracepoints(void);
static void blk_unregister_tracepoints(void);
/*
* Send out a notify message.
*/
static void trace_note(struct blk_trace *bt, pid_t pid, int action,
const void *data, size_t len, u64 cgid)
{
struct blk_io_trace *t;
struct ring_buffer_event *event = NULL;
struct trace_buffer *buffer = NULL;
unsigned int trace_ctx = 0;
int cpu = smp_processor_id();
bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
if (blk_tracer) {
buffer = blk_tr->array_buffer.buffer;
trace_ctx = tracing_gen_ctx_flags(0);
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + len + cgid_len,
trace_ctx);
if (!event)
return;
t = ring_buffer_event_data(event);
goto record_it;
}
if (!bt->rchan)
return;
t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len);
if (t) {
t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
t->time = ktime_to_ns(ktime_get());
record_it:
t->device = bt->dev;
t->action = action | (cgid ? __BLK_TN_CGROUP : 0);
t->pid = pid;
t->cpu = cpu;
t->pdu_len = len + cgid_len;
if (cgid_len)
memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
if (blk_tracer)
trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
}
}
/*
* Send out a notify for this process, if we haven't done so since a trace
* started
*/
static void trace_note_tsk(struct task_struct *tsk)
{
unsigned long flags;
struct blk_trace *bt;
tsk->btrace_seq = blktrace_seq;
raw_spin_lock_irqsave(&running_trace_lock, flags);
list_for_each_entry(bt, &running_trace_list, running_list) {
trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm,
sizeof(tsk->comm), 0);
}
raw_spin_unlock_irqrestore(&running_trace_lock, flags);
}
static void trace_note_time(struct blk_trace *bt)
{
struct timespec64 now;
unsigned long flags;
u32 words[2];
/* need to check user space to see if this breaks in y2038 or y2106 */
ktime_get_real_ts64(&now);
words[0] = (u32)now.tv_sec;
words[1] = now.tv_nsec;
local_irq_save(flags);
trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), 0);
local_irq_restore(flags);
}
void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
const char *fmt, ...)
{
int n;
va_list args;
unsigned long flags;
char *buf;
if (unlikely(bt->trace_state != Blktrace_running &&
!blk_tracer_enabled))
return;
/*
* If the BLK_TC_NOTIFY action mask isn't set, don't send any note
* message to the trace.
*/
if (!(bt->act_mask & BLK_TC_NOTIFY))
return;
local_irq_save(flags);
buf = this_cpu_ptr(bt->msg_data);
va_start(args, fmt);
n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
va_end(args);
if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
blkcg = NULL;
#ifdef CONFIG_BLK_CGROUP
trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n,
blkcg ? cgroup_id(blkcg->css.cgroup) : 1);
#else
trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, 0);
#endif
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(__trace_note_message);
static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
pid_t pid)
{
if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
return 1;
if (sector && (sector < bt->start_lba || sector > bt->end_lba))
return 1;
if (bt->pid && pid != bt->pid)
return 1;
return 0;
}
/*
* Data direction bit lookup
*/
static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
BLK_TC_ACT(BLK_TC_WRITE) };
#define BLK_TC_RAHEAD BLK_TC_AHEAD
#define BLK_TC_PREFLUSH BLK_TC_FLUSH
/* The ilog2() calls fall out because they're constant */
#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
(ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))
/*
* The worker for the various blk_add_trace*() types. Fills out a
* blk_io_trace structure and places it in a per-cpu subbuffer.
*/
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
int op, int op_flags, u32 what, int error, int pdu_len,
void *pdu_data, u64 cgid)
{
struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
struct trace_buffer *buffer = NULL;
struct blk_io_trace *t;
unsigned long flags = 0;
unsigned long *sequence;
unsigned int trace_ctx = 0;
pid_t pid;
int cpu;
bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
return;
what |= ddir_act[op_is_write(op) ? WRITE : READ];
what |= MASK_TC_BIT(op_flags, SYNC);
what |= MASK_TC_BIT(op_flags, RAHEAD);
what |= MASK_TC_BIT(op_flags, META);
what |= MASK_TC_BIT(op_flags, PREFLUSH);
what |= MASK_TC_BIT(op_flags, FUA);
if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
what |= BLK_TC_ACT(BLK_TC_DISCARD);
if (op == REQ_OP_FLUSH)
what |= BLK_TC_ACT(BLK_TC_FLUSH);
if (cgid)
what |= __BLK_TA_CGROUP;
pid = tsk->pid;
if (act_log_check(bt, what, sector, pid))
return;
cpu = raw_smp_processor_id();
if (blk_tracer) {
tracing_record_cmdline(current);
buffer = blk_tr->array_buffer.buffer;
trace_ctx = tracing_gen_ctx_flags(0);
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + pdu_len + cgid_len,
trace_ctx);
if (!event)
return;
t = ring_buffer_event_data(event);
goto record_it;
}
if (unlikely(tsk->btrace_seq != blktrace_seq))
trace_note_tsk(tsk);
/*
* A word about the locking here - we disable interrupts to reserve
* some space in the relay per-cpu buffer, to prevent an irq
* from coming in and stepping on our toes.
*/
local_irq_save(flags);
t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len);
if (t) {
sequence = per_cpu_ptr(bt->sequence, cpu);
t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
t->sequence = ++(*sequence);
t->time = ktime_to_ns(ktime_get());
record_it:
/*
* These two are not needed in ftrace as they are in the
* generic trace_entry, filled by tracing_generic_entry_update,
* but for the trace_event->bin() synthesizer benefit we do it
* here too.
*/
t->cpu = cpu;
t->pid = pid;
t->sector = sector;
t->bytes = bytes;
t->action = what;
t->device = bt->dev;
t->error = error;
t->pdu_len = pdu_len + cgid_len;
if (cgid_len)
memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
if (pdu_len)
memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
if (blk_tracer) {
trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
return;
}
}
local_irq_restore(flags);
}
static void blk_trace_free(struct request_queue *q, struct blk_trace *bt)
{
relay_close(bt->rchan);
/*
* If 'bt->dir' is not set, then both 'dropped' and 'msg' are created
* under 'q->debugfs_dir', thus lookup and remove them.
*/
if (!bt->dir) {
debugfs_remove(debugfs_lookup("dropped", q->debugfs_dir));
debugfs_remove(debugfs_lookup("msg", q->debugfs_dir));
} else {
debugfs_remove(bt->dir);
}
free_percpu(bt->sequence);
free_percpu(bt->msg_data);
kfree(bt);
}
static void get_probe_ref(void)
{
mutex_lock(&blk_probe_mutex);
if (++blk_probes_ref == 1)
blk_register_tracepoints();
mutex_unlock(&blk_probe_mutex);
}
static void put_probe_ref(void)
{
mutex_lock(&blk_probe_mutex);
if (!--blk_probes_ref)
blk_unregister_tracepoints();
mutex_unlock(&blk_probe_mutex);
}
static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt)
{
synchronize_rcu();
blk_trace_free(q, bt);
put_probe_ref();
}
static int __blk_trace_remove(struct request_queue *q)
{
struct blk_trace *bt;
bt = rcu_replace_pointer(q->blk_trace, NULL,
lockdep_is_held(&q->debugfs_mutex));
if (!bt)
return -EINVAL;
if (bt->trace_state != Blktrace_running)
blk_trace_cleanup(q, bt);
return 0;
}
int blk_trace_remove(struct request_queue *q)
{
int ret;
mutex_lock(&q->debugfs_mutex);
ret = __blk_trace_remove(q);
mutex_unlock(&q->debugfs_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(blk_trace_remove);
static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
size_t count, loff_t *ppos)
{
struct blk_trace *bt = filp->private_data;
char buf[16];
snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
}
static const struct file_operations blk_dropped_fops = {
.owner = THIS_MODULE,
.open = simple_open,
.read = blk_dropped_read,
.llseek = default_llseek,
};
static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
size_t count, loff_t *ppos)
{
char *msg;
struct blk_trace *bt;
if (count >= BLK_TN_MAX_MSG)
return -EINVAL;
msg = memdup_user_nul(buffer, count);
if (IS_ERR(msg))
return PTR_ERR(msg);
bt = filp->private_data;
__trace_note_message(bt, NULL, "%s", msg);
kfree(msg);
return count;
}
static const struct file_operations blk_msg_fops = {
.owner = THIS_MODULE,
.open = simple_open,
.write = blk_msg_write,
.llseek = noop_llseek,
};
/*
* Keep track of how many times we encountered a full subbuffer, to aid
* the user space app in telling how many lost events there were.
*/
static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
void *prev_subbuf, size_t prev_padding)
{
struct blk_trace *bt;
if (!relay_buf_full(buf))
return 1;
bt = buf->chan->private_data;
atomic_inc(&bt->dropped);
return 0;
}
static int blk_remove_buf_file_callback(struct dentry *dentry)
{
debugfs_remove(dentry);
return 0;
}
static struct dentry *blk_create_buf_file_callback(const char *filename,
struct dentry *parent,
umode_t mode,
struct rchan_buf *buf,
int *is_global)
{
return debugfs_create_file(filename, mode, parent, buf,
&relay_file_operations);
}
static const struct rchan_callbacks blk_relay_callbacks = {
.subbuf_start = blk_subbuf_start_callback,
.create_buf_file = blk_create_buf_file_callback,
.remove_buf_file = blk_remove_buf_file_callback,
};
static void blk_trace_setup_lba(struct blk_trace *bt,
struct block_device *bdev)
{
if (bdev) {
bt->start_lba = bdev->bd_start_sect;
bt->end_lba = bdev->bd_start_sect + bdev_nr_sectors(bdev);
} else {
bt->start_lba = 0;
bt->end_lba = -1ULL;
}
}
/*
* Setup everything required to start tracing
*/
static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
struct block_device *bdev,
struct blk_user_trace_setup *buts)
{
struct blk_trace *bt = NULL;
struct dentry *dir = NULL;
int ret;
lockdep_assert_held(&q->debugfs_mutex);
if (!buts->buf_size || !buts->buf_nr)
return -EINVAL;
strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
/*
* some device names have larger paths - convert the slashes
* to underscores for this to work as expected
*/
strreplace(buts->name, '/', '_');
/*
* bdev can be NULL, as with scsi-generic, this is a helpful as
* we can be.
*/
if (rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->debugfs_mutex))) {
pr_warn("Concurrent blktraces are not allowed on %s\n",
buts->name);
return -EBUSY;
}
bt = kzalloc(sizeof(*bt), GFP_KERNEL);
if (!bt)
return -ENOMEM;
ret = -ENOMEM;
bt->sequence = alloc_percpu(unsigned long);
if (!bt->sequence)
goto err;
bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
if (!bt->msg_data)
goto err;
/*
* When tracing the whole disk reuse the existing debugfs directory
* created by the block layer on init. For partitions block devices,
* and scsi-generic block devices we create a temporary new debugfs
* directory that will be removed once the trace ends.
*/
if (bdev && !bdev_is_partition(bdev))
dir = q->debugfs_dir;
else
bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
/*
* As blktrace relies on debugfs for its interface the debugfs directory
* is required, contrary to the usual mantra of not checking for debugfs
* files or directories.
*/
if (IS_ERR_OR_NULL(dir)) {
pr_warn("debugfs_dir not present for %s so skipping\n",
buts->name);
ret = -ENOENT;
goto err;
}
bt->dev = dev;
atomic_set(&bt->dropped, 0);
INIT_LIST_HEAD(&bt->running_list);
ret = -EIO;
debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
bt->rchan = relay_open("trace", dir, buts->buf_size,
buts->buf_nr, &blk_relay_callbacks, bt);
if (!bt->rchan)
goto err;
bt->act_mask = buts->act_mask;
if (!bt->act_mask)
bt->act_mask = (u16) -1;
blk_trace_setup_lba(bt, bdev);
/* overwrite with user settings */
if (buts->start_lba)
bt->start_lba = buts->start_lba;
if (buts->end_lba)
bt->end_lba = buts->end_lba;
bt->pid = buts->pid;
bt->trace_state = Blktrace_setup;
rcu_assign_pointer(q->blk_trace, bt);
get_probe_ref();
ret = 0;
err:
if (ret)
blk_trace_free(q, bt);
return ret;
}
static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
struct block_device *bdev, char __user *arg)
{
struct blk_user_trace_setup buts;
int ret;
ret = copy_from_user(&buts, arg, sizeof(buts));
if (ret)
return -EFAULT;
ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
if (ret)
return ret;
if (copy_to_user(arg, &buts, sizeof(buts))) {
__blk_trace_remove(q);
return -EFAULT;
}
return 0;
}
int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
struct block_device *bdev,
char __user *arg)
{
int ret;
mutex_lock(&q->debugfs_mutex);
ret = __blk_trace_setup(q, name, dev, bdev, arg);
mutex_unlock(&q->debugfs_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(blk_trace_setup);
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
static int compat_blk_trace_setup(struct request_queue *q, char *name,
dev_t dev, struct block_device *bdev,
char __user *arg)
{
struct blk_user_trace_setup buts;
struct compat_blk_user_trace_setup cbuts;
int ret;
if (copy_from_user(&cbuts, arg, sizeof(cbuts)))
return -EFAULT;
buts = (struct blk_user_trace_setup) {
.act_mask = cbuts.act_mask,
.buf_size = cbuts.buf_size,
.buf_nr = cbuts.buf_nr,
.start_lba = cbuts.start_lba,
.end_lba = cbuts.end_lba,
.pid = cbuts.pid,
};
ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
if (ret)
return ret;
if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
__blk_trace_remove(q);
return -EFAULT;
}
return 0;
}
#endif
static int __blk_trace_startstop(struct request_queue *q, int start)
{
int ret;
struct blk_trace *bt;
bt = rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->debugfs_mutex));
if (bt == NULL)
return -EINVAL;
/*
* For starting a trace, we can transition from a setup or stopped
* trace. For stopping a trace, the state must be running
*/
ret = -EINVAL;
if (start) {
if (bt->trace_state == Blktrace_setup ||
bt->trace_state == Blktrace_stopped) {
blktrace_seq++;
smp_mb();
bt->trace_state = Blktrace_running;
raw_spin_lock_irq(&running_trace_lock);
list_add(&bt->running_list, &running_trace_list);
raw_spin_unlock_irq(&running_trace_lock);
trace_note_time(bt);
ret = 0;
}
} else {
if (bt->trace_state == Blktrace_running) {
bt->trace_state = Blktrace_stopped;
raw_spin_lock_irq(&running_trace_lock);
list_del_init(&bt->running_list);
raw_spin_unlock_irq(&running_trace_lock);
relay_flush(bt->rchan);
ret = 0;
}
}
return ret;
}
int blk_trace_startstop(struct request_queue *q, int start)
{
int ret;
mutex_lock(&q->debugfs_mutex);
ret = __blk_trace_startstop(q, start);
mutex_unlock(&q->debugfs_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(blk_trace_startstop);
/*
* When reading or writing the blktrace sysfs files, the references to the
* opened sysfs or device files should prevent the underlying block device
* from being removed. So no further delete protection is really needed.
*/
/**
* blk_trace_ioctl: - handle the ioctls associated with tracing
* @bdev: the block device
* @cmd: the ioctl cmd
* @arg: the argument data, if any
*
**/
int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
{
struct request_queue *q;
int ret, start = 0;
char b[BDEVNAME_SIZE];
q = bdev_get_queue(bdev);
if (!q)
return -ENXIO;
mutex_lock(&q->debugfs_mutex);
switch (cmd) {
case BLKTRACESETUP:
bdevname(bdev, b);
ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
break;
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
case BLKTRACESETUP32:
bdevname(bdev, b);
ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
break;
#endif
case BLKTRACESTART:
start = 1;
fallthrough;
case BLKTRACESTOP:
ret = __blk_trace_startstop(q, start);
break;
case BLKTRACETEARDOWN:
ret = __blk_trace_remove(q);
break;
default:
ret = -ENOTTY;
break;
}
mutex_unlock(&q->debugfs_mutex);
return ret;
}
/**
* blk_trace_shutdown: - stop and cleanup trace structures
* @q: the request queue associated with the device
*
**/
void blk_trace_shutdown(struct request_queue *q)
{
mutex_lock(&q->debugfs_mutex);
if (rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->debugfs_mutex))) {
__blk_trace_startstop(q, 0);
__blk_trace_remove(q);
}
mutex_unlock(&q->debugfs_mutex);
}
#ifdef CONFIG_BLK_CGROUP
static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
{
struct blk_trace *bt;
/* We don't use the 'bt' value here except as an optimization... */
bt = rcu_dereference_protected(q->blk_trace, 1);
if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
return 0;
if (!bio->bi_blkg)
return 0;
return cgroup_id(bio_blkcg(bio)->css.cgroup);
}
#else
static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
{
return 0;
}
#endif
static u64
blk_trace_request_get_cgid(struct request *rq)
{
if (!rq->bio)
return 0;
/* Use the first bio */
return blk_trace_bio_get_cgid(rq->q, rq->bio);
}
/*
* blktrace probes
*/
/**
* blk_add_trace_rq - Add a trace for a request oriented action
* @rq: the source request
* @error: return status to log
* @nr_bytes: number of completed bytes
* @what: the action
* @cgid: the cgroup info
*
* Description:
* Records an action against a request. Will log the bio offset + size.
*
**/
static void blk_add_trace_rq(struct request *rq, blk_status_t error,
unsigned int nr_bytes, u32 what, u64 cgid)
{
struct blk_trace *bt;
rcu_read_lock();
bt = rcu_dereference(rq->q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return;
}
if (blk_rq_is_passthrough(rq))
what |= BLK_TC_ACT(BLK_TC_PC);
else
what |= BLK_TC_ACT(BLK_TC_FS);
__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
rq->cmd_flags, what, blk_status_to_errno(error), 0,
NULL, cgid);
rcu_read_unlock();
}
static void blk_add_trace_rq_insert(void *ignore, struct request *rq)
{
blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT,
blk_trace_request_get_cgid(rq));
}
static void blk_add_trace_rq_issue(void *ignore, struct request *rq)
{
blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE,
blk_trace_request_get_cgid(rq));
}
static void blk_add_trace_rq_merge(void *ignore, struct request *rq)
{
blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE,
blk_trace_request_get_cgid(rq));
}
static void blk_add_trace_rq_requeue(void *ignore, struct request *rq)
{
blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE,
blk_trace_request_get_cgid(rq));
}
static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
blk_status_t error, unsigned int nr_bytes)
{
blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE,
blk_trace_request_get_cgid(rq));
}
/**
* blk_add_trace_bio - Add a trace for a bio oriented action
* @q: queue the io is for
* @bio: the source bio
* @what: the action
* @error: error, if any
*
* Description:
* Records an action against a bio. Will log the bio offset + size.
*
**/
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
u32 what, int error)
{
struct blk_trace *bt;
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return;
}
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio_op(bio), bio->bi_opf, what, error, 0, NULL,
blk_trace_bio_get_cgid(q, bio));
rcu_read_unlock();
}
static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio)
{
blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0);
}
static void blk_add_trace_bio_complete(void *ignore,
struct request_queue *q, struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_COMPLETE,
blk_status_to_errno(bio->bi_status));
}
static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio)
{
blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BACKMERGE,
0);
}
static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio)
{
blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_FRONTMERGE,
0);
}
static void blk_add_trace_bio_queue(void *ignore, struct bio *bio)
{
blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_QUEUE, 0);
}
static void blk_add_trace_getrq(void *ignore, struct bio *bio)
{
blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_GETRQ, 0);
}
static void blk_add_trace_plug(void *ignore, struct request_queue *q)
{
struct blk_trace *bt;
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (bt)
__blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0);
rcu_read_unlock();
}
static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
unsigned int depth, bool explicit)
{
struct blk_trace *bt;
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (bt) {
__be64 rpdu = cpu_to_be64(depth);
u32 what;
if (explicit)
what = BLK_TA_UNPLUG_IO;
else
what = BLK_TA_UNPLUG_TIMER;
__blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0);
}
rcu_read_unlock();
}
static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
{
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct blk_trace *bt;
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (bt) {
__be64 rpdu = cpu_to_be64(pdu);
__blk_add_trace(bt, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
BLK_TA_SPLIT,
blk_status_to_errno(bio->bi_status),
sizeof(rpdu), &rpdu,
blk_trace_bio_get_cgid(q, bio));
}
rcu_read_unlock();
}
/**
* blk_add_trace_bio_remap - Add a trace for a bio-remap operation
* @ignore: trace callback data parameter (not used)
* @bio: the source bio
* @dev: source device
* @from: source sector
*
* Called after a bio is remapped to a different device and/or sector.
**/
static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev,
sector_t from)
{
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct blk_trace *bt;
struct blk_io_trace_remap r;
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return;
}
r.device_from = cpu_to_be32(dev);
r.device_to = cpu_to_be32(bio_dev(bio));
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio_op(bio), bio->bi_opf, BLK_TA_REMAP,
blk_status_to_errno(bio->bi_status),
sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
rcu_read_unlock();
}
/**
* blk_add_trace_rq_remap - Add a trace for a request-remap operation
* @ignore: trace callback data parameter (not used)
* @rq: the source request
* @dev: target device
* @from: source sector
*
* Description:
* Device mapper remaps request to other devices.
* Add a trace for that action.
*
**/
static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev,
sector_t from)
{
struct blk_trace *bt;
struct blk_io_trace_remap r;
rcu_read_lock();
bt = rcu_dereference(rq->q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return;
}
r.device_from = cpu_to_be32(dev);
r.device_to = cpu_to_be32(disk_devt(rq->q->disk));
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
sizeof(r), &r, blk_trace_request_get_cgid(rq));
rcu_read_unlock();
}
/**
* blk_add_driver_data - Add binary message with driver-specific data
* @rq: io request
* @data: driver-specific data
* @len: length of driver-specific data
*
* Description:
* Some drivers might want to write driver-specific data per request.
*
**/
void blk_add_driver_data(struct request *rq, void *data, size_t len)
{
struct blk_trace *bt;
rcu_read_lock();
bt = rcu_dereference(rq->q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return;
}
__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
BLK_TA_DRV_DATA, 0, len, data,
blk_trace_request_get_cgid(rq));
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
static void blk_register_tracepoints(void)
{
int ret;
ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
WARN_ON(ret);
ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
WARN_ON(ret);
ret = register_trace_block_rq_merge(blk_add_trace_rq_merge, NULL);
WARN_ON(ret);
ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
WARN_ON(ret);
ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
WARN_ON(ret);
ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
WARN_ON(ret);
ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
WARN_ON(ret);
ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
WARN_ON(ret);
ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
WARN_ON(ret);
ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
WARN_ON(ret);
ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
WARN_ON(ret);
ret = register_trace_block_plug(blk_add_trace_plug, NULL);
WARN_ON(ret);
ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
WARN_ON(ret);
ret = register_trace_block_split(blk_add_trace_split, NULL);
WARN_ON(ret);
ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
WARN_ON(ret);
ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
WARN_ON(ret);
}
static void blk_unregister_tracepoints(void)
{
unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
unregister_trace_block_split(blk_add_trace_split, NULL);
unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
unregister_trace_block_plug(blk_add_trace_plug, NULL);
unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL);
unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
tracepoint_synchronize_unregister();
}
/*
* struct blk_io_tracer formatting routines
*/
static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
{
int i = 0;
int tc = t->action >> BLK_TC_SHIFT;
if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
rwbs[i++] = 'N';
goto out;
}
if (tc & BLK_TC_FLUSH)
rwbs[i++] = 'F';
if (tc & BLK_TC_DISCARD)
rwbs[i++] = 'D';
else if (tc & BLK_TC_WRITE)
rwbs[i++] = 'W';
else if (t->bytes)
rwbs[i++] = 'R';
else
rwbs[i++] = 'N';
if (tc & BLK_TC_FUA)
rwbs[i++] = 'F';
if (tc & BLK_TC_AHEAD)
rwbs[i++] = 'A';
if (tc & BLK_TC_SYNC)
rwbs[i++] = 'S';
if (tc & BLK_TC_META)
rwbs[i++] = 'M';
out:
rwbs[i] = '\0';
}
static inline
const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
{
return (const struct blk_io_trace *)ent;
}
static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg)
{
return (void *)(te_blk_io_trace(ent) + 1) + (has_cg ? sizeof(u64) : 0);
}
static inline u64 t_cgid(const struct trace_entry *ent)
{
return *(u64 *)(te_blk_io_trace(ent) + 1);
}
static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg)
{
return te_blk_io_trace(ent)->pdu_len - (has_cg ? sizeof(u64) : 0);
}
static inline u32 t_action(const struct trace_entry *ent)
{
return te_blk_io_trace(ent)->action;
}
static inline u32 t_bytes(const struct trace_entry *ent)
{
return te_blk_io_trace(ent)->bytes;
}
static inline u32 t_sec(const struct trace_entry *ent)
{
return te_blk_io_trace(ent)->bytes >> 9;
}
static inline unsigned long long t_sector(const struct trace_entry *ent)
{
return te_blk_io_trace(ent)->sector;
}
static inline __u16 t_error(const struct trace_entry *ent)
{
return te_blk_io_trace(ent)->error;
}
static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg)
{
const __be64 *val = pdu_start(ent, has_cg);
return be64_to_cpu(*val);
}
typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act,
bool has_cg);
static void blk_log_action_classic(struct trace_iterator *iter, const char *act,
bool has_cg)
{
char rwbs[RWBS_LEN];
unsigned long long ts = iter->ts;
unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
unsigned secs = (unsigned long)ts;
const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
fill_rwbs(rwbs, t);
trace_seq_printf(&iter->seq,
"%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
MAJOR(t->device), MINOR(t->device), iter->cpu,
secs, nsec_rem, iter->ent->pid, act, rwbs);
}
static void blk_log_action(struct trace_iterator *iter, const char *act,
bool has_cg)
{
char rwbs[RWBS_LEN];
const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
fill_rwbs(rwbs, t);
if (has_cg) {
u64 id = t_cgid(iter->ent);
if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) {
char blkcg_name_buf[NAME_MAX + 1] = "<...>";
cgroup_path_from_kernfs_id(id, blkcg_name_buf,
sizeof(blkcg_name_buf));
trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ",
MAJOR(t->device), MINOR(t->device),
blkcg_name_buf, act, rwbs);
} else {
/*
* The cgid portion used to be "INO,GEN". Userland
* builds a FILEID_INO32_GEN fid out of them and
* opens the cgroup using open_by_handle_at(2).
* While 32bit ino setups are still the same, 64bit
* ones now use the 64bit ino as the whole ID and
* no longer use generation.
*
* Regardless of the content, always output
* "LOW32,HIGH32" so that FILEID_INO32_GEN fid can
* be mapped back to @id on both 64 and 32bit ino
* setups. See __kernfs_fh_to_dentry().
*/
trace_seq_printf(&iter->seq,
"%3d,%-3d %llx,%-llx %2s %3s ",
MAJOR(t->device), MINOR(t->device),
id & U32_MAX, id >> 32, act, rwbs);
}
} else
trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
MAJOR(t->device), MINOR(t->device), act, rwbs);
}
static void blk_log_dump_pdu(struct trace_seq *s,
const struct trace_entry *ent, bool has_cg)
{
const unsigned char *pdu_buf;
int pdu_len;
int i, end;
pdu_buf = pdu_start(ent, has_cg);
pdu_len = pdu_real_len(ent, has_cg);
if (!pdu_len)
return;
/* find the last zero that needs to be printed */
for (end = pdu_len - 1; end >= 0; end--)
if (pdu_buf[end])
break;
end++;
trace_seq_putc(s, '(');
for (i = 0; i < pdu_len; i++) {
trace_seq_printf(s, "%s%02x",
i == 0 ? "" : " ", pdu_buf[i]);
/*
* stop when the rest is just zeros and indicate so
* with a ".." appended
*/
if (i == end && end != pdu_len - 1) {
trace_seq_puts(s, " ..) ");
return;
}
}
trace_seq_puts(s, ") ");
}
static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
trace_seq_printf(s, "%u ", t_bytes(ent));
blk_log_dump_pdu(s, ent, has_cg);
trace_seq_printf(s, "[%s]\n", cmd);
} else {
if (t_sec(ent))
trace_seq_printf(s, "%llu + %u [%s]\n",
t_sector(ent), t_sec(ent), cmd);
else
trace_seq_printf(s, "[%s]\n", cmd);
}
}
static void blk_log_with_error(struct trace_seq *s,
const struct trace_entry *ent, bool has_cg)
{
if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
blk_log_dump_pdu(s, ent, has_cg);
trace_seq_printf(s, "[%d]\n", t_error(ent));
} else {
if (t_sec(ent))
trace_seq_printf(s, "%llu + %u [%d]\n",
t_sector(ent),
t_sec(ent), t_error(ent));
else
trace_seq_printf(s, "%llu [%d]\n",
t_sector(ent), t_error(ent));
}
}
static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
t_sector(ent), t_sec(ent),
MAJOR(be32_to_cpu(__r->device_from)),
MINOR(be32_to_cpu(__r->device_from)),
be64_to_cpu(__r->sector_from));
}
static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
trace_seq_printf(s, "[%s]\n", cmd);
}
static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg));
}
static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
get_pdu_int(ent, has_cg), cmd);
}
static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent,
bool has_cg)
{
trace_seq_putmem(s, pdu_start(ent, has_cg),
pdu_real_len(ent, has_cg));
trace_seq_putc(s, '\n');
}
/*
* struct tracer operations
*/
static void blk_tracer_print_header(struct seq_file *m)
{
if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
return;
seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n"
"# | | | | | |\n");
}
static void blk_tracer_start(struct trace_array *tr)
{
blk_tracer_enabled = true;
}
static int blk_tracer_init(struct trace_array *tr)
{
blk_tr = tr;
blk_tracer_start(tr);
return 0;
}
static void blk_tracer_stop(struct trace_array *tr)
{
blk_tracer_enabled = false;
}
static void blk_tracer_reset(struct trace_array *tr)
{
blk_tracer_stop(tr);
}
static const struct {
const char *act[2];
void (*print)(struct trace_seq *s, const struct trace_entry *ent,
bool has_cg);
} what2act[] = {
[__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
[__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
[__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic },
[__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic },
[__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic },
[__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error },
[__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic },
[__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error },
[__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug },
[__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug },
[__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug },
[__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic },
[__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split },
[__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic },
[__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap },
};
static enum print_line_t print_one_line(struct trace_iterator *iter,
bool classic)
{
struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
const struct blk_io_trace *t;
u16 what;
bool long_act;
blk_log_action_t *log_action;
bool has_cg;
t = te_blk_io_trace(iter->ent);
what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP;
long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE);
log_action = classic ? &blk_log_action_classic : &blk_log_action;
has_cg = t->action & __BLK_TA_CGROUP;
if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) {
log_action(iter, long_act ? "message" : "m", has_cg);
blk_log_msg(s, iter->ent, has_cg);
return trace_handle_return(s);
}
if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
trace_seq_printf(s, "Unknown action %x\n", what);
else {
log_action(iter, what2act[what].act[long_act], has_cg);
what2act[what].print(s, iter->ent, has_cg);
}
return trace_handle_return(s);
}
static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
int flags, struct trace_event *event)
{
return print_one_line(iter, false);
}
static void blk_trace_synthesize_old_trace(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
const int offset = offsetof(struct blk_io_trace, sector);
struct blk_io_trace old = {
.magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
.time = iter->ts,
};
trace_seq_putmem(s, &old, offset);
trace_seq_putmem(s, &t->sector,
sizeof(old) - offset + t->pdu_len);
}
static enum print_line_t
blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
blk_trace_synthesize_old_trace(iter);
return trace_handle_return(&iter->seq);
}
static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
{
if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
return TRACE_TYPE_UNHANDLED;
return print_one_line(iter, true);
}
static int
blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
/* don't output context-info for blk_classic output */
if (bit == TRACE_BLK_OPT_CLASSIC) {
if (set)
tr->trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
else
tr->trace_flags |= TRACE_ITER_CONTEXT_INFO;
}
return 0;
}
static struct tracer blk_tracer __read_mostly = {
.name = "blk",
.init = blk_tracer_init,
.reset = blk_tracer_reset,
.start = blk_tracer_start,
.stop = blk_tracer_stop,
.print_header = blk_tracer_print_header,
.print_line = blk_tracer_print_line,
.flags = &blk_tracer_flags,
.set_flag = blk_tracer_set_flag,
};
static struct trace_event_functions trace_blk_event_funcs = {
.trace = blk_trace_event_print,
.binary = blk_trace_event_print_binary,
};
static struct trace_event trace_blk_event = {
.type = TRACE_BLK,
.funcs = &trace_blk_event_funcs,
};
static int __init init_blk_tracer(void)
{
if (!register_trace_event(&trace_blk_event)) {
pr_warn("Warning: could not register block events\n");
return 1;
}
if (register_tracer(&blk_tracer) != 0) {
pr_warn("Warning: could not register the block tracer\n");
unregister_trace_event(&trace_blk_event);
return 1;
}
return 0;
}
device_initcall(init_blk_tracer);
static int blk_trace_remove_queue(struct request_queue *q)
{
struct blk_trace *bt;
bt = rcu_replace_pointer(q->blk_trace, NULL,
lockdep_is_held(&q->debugfs_mutex));
if (bt == NULL)
return -EINVAL;
if (bt->trace_state == Blktrace_running) {
bt->trace_state = Blktrace_stopped;
raw_spin_lock_irq(&running_trace_lock);
list_del_init(&bt->running_list);
raw_spin_unlock_irq(&running_trace_lock);
relay_flush(bt->rchan);
}
put_probe_ref();
synchronize_rcu();
blk_trace_free(q, bt);
return 0;
}
/*
* Setup everything required to start tracing
*/
static int blk_trace_setup_queue(struct request_queue *q,
struct block_device *bdev)
{
struct blk_trace *bt = NULL;
int ret = -ENOMEM;
bt = kzalloc(sizeof(*bt), GFP_KERNEL);
if (!bt)
return -ENOMEM;
bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
if (!bt->msg_data)
goto free_bt;
bt->dev = bdev->bd_dev;
bt->act_mask = (u16)-1;
blk_trace_setup_lba(bt, bdev);
rcu_assign_pointer(q->blk_trace, bt);
get_probe_ref();
return 0;
free_bt:
blk_trace_free(q, bt);
return ret;
}
/*
* sysfs interface to enable and configure tracing
*/
static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
struct device_attribute *attr,
char *buf);
static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count);
#define BLK_TRACE_DEVICE_ATTR(_name) \
DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \
sysfs_blk_trace_attr_show, \
sysfs_blk_trace_attr_store)
static BLK_TRACE_DEVICE_ATTR(enable);
static BLK_TRACE_DEVICE_ATTR(act_mask);
static BLK_TRACE_DEVICE_ATTR(pid);
static BLK_TRACE_DEVICE_ATTR(start_lba);
static BLK_TRACE_DEVICE_ATTR(end_lba);
static struct attribute *blk_trace_attrs[] = {
&dev_attr_enable.attr,
&dev_attr_act_mask.attr,
&dev_attr_pid.attr,
&dev_attr_start_lba.attr,
&dev_attr_end_lba.attr,
NULL
};
struct attribute_group blk_trace_attr_group = {
.name = "trace",
.attrs = blk_trace_attrs,
};
static const struct {
int mask;
const char *str;
} mask_maps[] = {
{ BLK_TC_READ, "read" },
{ BLK_TC_WRITE, "write" },
{ BLK_TC_FLUSH, "flush" },
{ BLK_TC_SYNC, "sync" },
{ BLK_TC_QUEUE, "queue" },
{ BLK_TC_REQUEUE, "requeue" },
{ BLK_TC_ISSUE, "issue" },
{ BLK_TC_COMPLETE, "complete" },
{ BLK_TC_FS, "fs" },
{ BLK_TC_PC, "pc" },
{ BLK_TC_NOTIFY, "notify" },
{ BLK_TC_AHEAD, "ahead" },
{ BLK_TC_META, "meta" },
{ BLK_TC_DISCARD, "discard" },
{ BLK_TC_DRV_DATA, "drv_data" },
{ BLK_TC_FUA, "fua" },
};
static int blk_trace_str2mask(const char *str)
{
int i;
int mask = 0;
char *buf, *s, *token;
buf = kstrdup(str, GFP_KERNEL);
if (buf == NULL)
return -ENOMEM;
s = strstrip(buf);
while (1) {
token = strsep(&s, ",");
if (token == NULL)
break;
if (*token == '\0')
continue;
for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
if (strcasecmp(token, mask_maps[i].str) == 0) {
mask |= mask_maps[i].mask;
break;
}
}
if (i == ARRAY_SIZE(mask_maps)) {
mask = -EINVAL;
break;
}
}
kfree(buf);
return mask;
}
static ssize_t blk_trace_mask2str(char *buf, int mask)
{
int i;
char *p = buf;
for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
if (mask & mask_maps[i].mask) {
p += sprintf(p, "%s%s",
(p == buf) ? "" : ",", mask_maps[i].str);
}
}
*p++ = '\n';
return p - buf;
}
static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct block_device *bdev = dev_to_bdev(dev);
struct request_queue *q = bdev_get_queue(bdev);
struct blk_trace *bt;
ssize_t ret = -ENXIO;
mutex_lock(&q->debugfs_mutex);
bt = rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->debugfs_mutex));
if (attr == &dev_attr_enable) {
ret = sprintf(buf, "%u\n", !!bt);
goto out_unlock_bdev;
}
if (bt == NULL)
ret = sprintf(buf, "disabled\n");
else if (attr == &dev_attr_act_mask)
ret = blk_trace_mask2str(buf, bt->act_mask);
else if (attr == &dev_attr_pid)
ret = sprintf(buf, "%u\n", bt->pid);
else if (attr == &dev_attr_start_lba)
ret = sprintf(buf, "%llu\n", bt->start_lba);
else if (attr == &dev_attr_end_lba)
ret = sprintf(buf, "%llu\n", bt->end_lba);
out_unlock_bdev:
mutex_unlock(&q->debugfs_mutex);
return ret;
}
static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct block_device *bdev = dev_to_bdev(dev);
struct request_queue *q = bdev_get_queue(bdev);
struct blk_trace *bt;
u64 value;
ssize_t ret = -EINVAL;
if (count == 0)
goto out;
if (attr == &dev_attr_act_mask) {
if (kstrtoull(buf, 0, &value)) {
/* Assume it is a list of trace category names */
ret = blk_trace_str2mask(buf);
if (ret < 0)
goto out;
value = ret;
}
} else {
if (kstrtoull(buf, 0, &value))
goto out;
}
mutex_lock(&q->debugfs_mutex);
bt = rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->debugfs_mutex));
if (attr == &dev_attr_enable) {
if (!!value == !!bt) {
ret = 0;
goto out_unlock_bdev;
}
if (value)
ret = blk_trace_setup_queue(q, bdev);
else
ret = blk_trace_remove_queue(q);
goto out_unlock_bdev;
}
ret = 0;
if (bt == NULL) {
ret = blk_trace_setup_queue(q, bdev);
bt = rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->debugfs_mutex));
}
if (ret == 0) {
if (attr == &dev_attr_act_mask)
bt->act_mask = value;
else if (attr == &dev_attr_pid)
bt->pid = value;
else if (attr == &dev_attr_start_lba)
bt->start_lba = value;
else if (attr == &dev_attr_end_lba)
bt->end_lba = value;
}
out_unlock_bdev:
mutex_unlock(&q->debugfs_mutex);
out:
return ret ? ret : count;
}
int blk_trace_init_sysfs(struct device *dev)
{
return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
}
void blk_trace_remove_sysfs(struct device *dev)
{
sysfs_remove_group(&dev->kobj, &blk_trace_attr_group);
}
#endif /* CONFIG_BLK_DEV_IO_TRACE */
#ifdef CONFIG_EVENT_TRACING
/**
* blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string.
* @rwbs: buffer to be filled
* @op: REQ_OP_XXX for the tracepoint
*
* Description:
* Maps the REQ_OP_XXX to character and fills the buffer provided by the
* caller with resulting string.
*
**/
void blk_fill_rwbs(char *rwbs, unsigned int op)
{
int i = 0;
if (op & REQ_PREFLUSH)
rwbs[i++] = 'F';
switch (op & REQ_OP_MASK) {
case REQ_OP_WRITE:
case REQ_OP_WRITE_SAME:
rwbs[i++] = 'W';
break;
case REQ_OP_DISCARD:
rwbs[i++] = 'D';
break;
case REQ_OP_SECURE_ERASE:
rwbs[i++] = 'D';
rwbs[i++] = 'E';
break;
case REQ_OP_FLUSH:
rwbs[i++] = 'F';
break;
case REQ_OP_READ:
rwbs[i++] = 'R';
break;
default:
rwbs[i++] = 'N';
}
if (op & REQ_FUA)
rwbs[i++] = 'F';
if (op & REQ_RAHEAD)
rwbs[i++] = 'A';
if (op & REQ_SYNC)
rwbs[i++] = 'S';
if (op & REQ_META)
rwbs[i++] = 'M';
rwbs[i] = '\0';
}
EXPORT_SYMBOL_GPL(blk_fill_rwbs);
#endif /* CONFIG_EVENT_TRACING */
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021 Facebook */
#include <linux/bitmap.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/jhash.h>
#include <linux/random.h>
#define BLOOM_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_ZERO_SEED | BPF_F_ACCESS_MASK)
struct bpf_bloom_filter {
struct bpf_map map;
u32 bitset_mask;
u32 hash_seed;
/* If the size of the values in the bloom filter is u32 aligned,
* then it is more performant to use jhash2 as the underlying hash
* function, else we use jhash. This tracks the number of u32s
* in an u32-aligned value size. If the value size is not u32 aligned,
* this will be 0.
*/
u32 aligned_u32_count;
u32 nr_hash_funcs;
unsigned long bitset[];
};
static u32 hash(struct bpf_bloom_filter *bloom, void *value,
u32 value_size, u32 index)
{
u32 h;
if (bloom->aligned_u32_count)
h = jhash2(value, bloom->aligned_u32_count,
bloom->hash_seed + index);
else
h = jhash(value, value_size, bloom->hash_seed + index);
return h & bloom->bitset_mask;
}
static int bloom_map_peek_elem(struct bpf_map *map, void *value)
{
struct bpf_bloom_filter *bloom =
container_of(map, struct bpf_bloom_filter, map);
u32 i, h;
for (i = 0; i < bloom->nr_hash_funcs; i++) {
h = hash(bloom, value, map->value_size, i);
if (!test_bit(h, bloom->bitset))
return -ENOENT;
}
return 0;
}
static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
{
struct bpf_bloom_filter *bloom =
container_of(map, struct bpf_bloom_filter, map);
u32 i, h;
if (flags != BPF_ANY)
return -EINVAL;
for (i = 0; i < bloom->nr_hash_funcs; i++) {
h = hash(bloom, value, map->value_size, i);
set_bit(h, bloom->bitset);
}
return 0;
}
static int bloom_map_pop_elem(struct bpf_map *map, void *value)
{
return -EOPNOTSUPP;
}
static int bloom_map_delete_elem(struct bpf_map *map, void *value)
{
return -EOPNOTSUPP;
}
static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
return -EOPNOTSUPP;
}
static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
{
u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits;
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_bloom_filter *bloom;
if (!bpf_capable())
return ERR_PTR(-EPERM);
if (attr->key_size != 0 || attr->value_size == 0 ||
attr->max_entries == 0 ||
attr->map_flags & ~BLOOM_CREATE_FLAG_MASK ||
!bpf_map_flags_access_ok(attr->map_flags) ||
/* The lower 4 bits of map_extra (0xF) specify the number
* of hash functions
*/
(attr->map_extra & ~0xF))
return ERR_PTR(-EINVAL);
nr_hash_funcs = attr->map_extra;
if (nr_hash_funcs == 0)
/* Default to using 5 hash functions if unspecified */
nr_hash_funcs = 5;
/* For the bloom filter, the optimal bit array size that minimizes the
* false positive probability is n * k / ln(2) where n is the number of
* expected entries in the bloom filter and k is the number of hash
* functions. We use 7 / 5 to approximate 1 / ln(2).
*
* We round this up to the nearest power of two to enable more efficient
* hashing using bitmasks. The bitmask will be the bit array size - 1.
*
* If this overflows a u32, the bit array size will have 2^32 (4
* GB) bits.
*/
if (check_mul_overflow(attr->max_entries, nr_hash_funcs, &nr_bits) ||
check_mul_overflow(nr_bits / 5, (u32)7, &nr_bits) ||
nr_bits > (1UL << 31)) {
/* The bit array size is 2^32 bits but to avoid overflowing the
* u32, we use U32_MAX, which will round up to the equivalent
* number of bytes
*/
bitset_bytes = BITS_TO_BYTES(U32_MAX);
bitset_mask = U32_MAX;
} else {
if (nr_bits <= BITS_PER_LONG)
nr_bits = BITS_PER_LONG;
else
nr_bits = roundup_pow_of_two(nr_bits);
bitset_bytes = BITS_TO_BYTES(nr_bits);
bitset_mask = nr_bits - 1;
}
bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long));
bloom = bpf_map_area_alloc(sizeof(*bloom) + bitset_bytes, numa_node);
if (!bloom)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&bloom->map, attr);
bloom->nr_hash_funcs = nr_hash_funcs;
bloom->bitset_mask = bitset_mask;
/* Check whether the value size is u32-aligned */
if ((attr->value_size & (sizeof(u32) - 1)) == 0)
bloom->aligned_u32_count =
attr->value_size / sizeof(u32);
if (!(attr->map_flags & BPF_F_ZERO_SEED))
bloom->hash_seed = get_random_int();
return &bloom->map;
}
static void bloom_map_free(struct bpf_map *map)
{
struct bpf_bloom_filter *bloom =
container_of(map, struct bpf_bloom_filter, map);
bpf_map_area_free(bloom);
}
static void *bloom_map_lookup_elem(struct bpf_map *map, void *key)
{
/* The eBPF program should use map_peek_elem instead */
return ERR_PTR(-EINVAL);
}
static int bloom_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 flags)
{
/* The eBPF program should use map_push_elem instead */
return -EINVAL;
}
static int bloom_map_check_btf(const struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
{
/* Bloom filter maps are keyless */
return btf_type_is_void(key_type) ? 0 : -EINVAL;
}
static int bpf_bloom_map_btf_id;
const struct bpf_map_ops bloom_filter_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = bloom_map_alloc,
.map_free = bloom_map_free,
.map_get_next_key = bloom_map_get_next_key,
.map_push_elem = bloom_map_push_elem,
.map_peek_elem = bloom_map_peek_elem,
.map_pop_elem = bloom_map_pop_elem,
.map_lookup_elem = bloom_map_lookup_elem,
.map_update_elem = bloom_map_update_elem,
.map_delete_elem = bloom_map_delete_elem,
.map_check_btf = bloom_map_check_btf,
.map_btf_name = "bpf_bloom_filter",
.map_btf_id = &bpf_bloom_map_btf_id,
};
// SPDX-License-Identifier: GPL-2.0
/*
* Generate definitions needed by the preprocessor.
* This code generates raw asm output which is post-processed
* to extract and format the required data.
*/
#define __GENERATING_BOUNDS_H
/* Include headers that define the enum constants of interest */
#include <linux/page-flags.h>
#include <linux/mmzone.h>
#include <linux/kbuild.h>
#include <linux/log2.h>
#include <linux/spinlock_types.h>
int main(void)
{
/* The enum constants to put into include/generated/bounds.h */
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
#ifdef CONFIG_SMP
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
/* End of constants */
return 0;
}
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2019 Facebook
* Copyright 2020 Google LLC.
*/
#include <linux/rculist.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/bpf.h>
#include <linux/bpf_local_storage.h>
#include <net/sock.h>
#include <uapi/linux/sock_diag.h>
#include <uapi/linux/btf.h>
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
#include <linux/fdtable.h>
#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(inode_cache);
static struct bpf_local_storage __rcu **
inode_storage_ptr(void *owner)
{
struct inode *inode = owner;
struct bpf_storage_blob *bsb;
bsb = bpf_inode(inode);
if (!bsb)
return NULL;
return &bsb->storage;
}
static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
struct bpf_map *map,
bool cacheit_lockit)
{
struct bpf_local_storage *inode_storage;
struct bpf_local_storage_map *smap;
struct bpf_storage_blob *bsb;
bsb = bpf_inode(inode);
if (!bsb)
return NULL;
inode_storage =
rcu_dereference_check(bsb->storage, bpf_rcu_lock_held());
if (!inode_storage)
return NULL;
smap = (struct bpf_local_storage_map *)map;
return bpf_local_storage_lookup(inode_storage, smap, cacheit_lockit);
}
void bpf_inode_storage_free(struct inode *inode)
{
struct bpf_local_storage_elem *selem;
struct bpf_local_storage *local_storage;
bool free_inode_storage = false;
struct bpf_storage_blob *bsb;
struct hlist_node *n;
bsb = bpf_inode(inode);
if (!bsb)
return;
rcu_read_lock();
local_storage = rcu_dereference(bsb->storage);
if (!local_storage) {
rcu_read_unlock();
return;
}
/* Neither the bpf_prog nor the bpf-map's syscall
* could be modifying the local_storage->list now.
* Thus, no elem can be added-to or deleted-from the
* local_storage->list by the bpf_prog or by the bpf-map's syscall.
*
* It is racing with bpf_local_storage_map_free() alone
* when unlinking elem from the local_storage->list and
* the map's bucket->list.
*/
raw_spin_lock_bh(&local_storage->lock);
hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
/* Always unlink from map before unlinking from
* local_storage.
*/
bpf_selem_unlink_map(selem);
free_inode_storage = bpf_selem_unlink_storage_nolock(
local_storage, selem, false);
}
raw_spin_unlock_bh(&local_storage->lock);
rcu_read_unlock();
/* free_inoode_storage should always be true as long as
* local_storage->list was non-empty.
*/
if (free_inode_storage)
kfree_rcu(local_storage, rcu);
}
static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_local_storage_data *sdata;
struct file *f;
int fd;
fd = *(int *)key;
f = fget_raw(fd);
if (!f)
return ERR_PTR(-EBADF);
sdata = inode_storage_lookup(f->f_inode, map, true);
fput(f);
return sdata ? sdata->data : NULL;
}
static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags)
{
struct bpf_local_storage_data *sdata;
struct file *f;
int fd;
fd = *(int *)key;
f = fget_raw(fd);
if (!f)
return -EBADF;
if (!inode_storage_ptr(f->f_inode)) {
fput(f);
return -EBADF;
}
sdata = bpf_local_storage_update(f->f_inode,
(struct bpf_local_storage_map *)map,
value, map_flags);
fput(f);
return PTR_ERR_OR_ZERO(sdata);
}
static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
{
struct bpf_local_storage_data *sdata;
sdata = inode_storage_lookup(inode, map, false);
if (!sdata)
return -ENOENT;
bpf_selem_unlink(SELEM(sdata));
return 0;
}
static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
{
struct file *f;
int fd, err;
fd = *(int *)key;
f = fget_raw(fd);
if (!f)
return -EBADF;
err = inode_storage_delete(f->f_inode, map);
fput(f);
return err;
}
BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
void *, value, u64, flags)
{
struct bpf_local_storage_data *sdata;
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
return (unsigned long)NULL;
/* explicitly check that the inode_storage_ptr is not
* NULL as inode_storage_lookup returns NULL in this case and
* bpf_local_storage_update expects the owner to have a
* valid storage pointer.
*/
if (!inode || !inode_storage_ptr(inode))
return (unsigned long)NULL;
sdata = inode_storage_lookup(inode, map, true);
if (sdata)
return (unsigned long)sdata->data;
/* This helper must only called from where the inode is guaranteed
* to have a refcount and cannot be freed.
*/
if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
sdata = bpf_local_storage_update(
inode, (struct bpf_local_storage_map *)map, value,
BPF_NOEXIST);
return IS_ERR(sdata) ? (unsigned long)NULL :
(unsigned long)sdata->data;
}
return (unsigned long)NULL;
}
BPF_CALL_2(bpf_inode_storage_delete,
struct bpf_map *, map, struct inode *, inode)
{
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!inode)
return -EINVAL;
/* This helper must only called from where the inode is guaranteed
* to have a refcount and cannot be freed.
*/
return inode_storage_delete(inode, map);
}
static int notsupp_get_next_key(struct bpf_map *map, void *key,
void *next_key)
{
return -ENOTSUPP;
}
static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
{
struct bpf_local_storage_map *smap;
smap = bpf_local_storage_map_alloc(attr);
if (IS_ERR(smap))
return ERR_CAST(smap);
smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache);
return &smap->map;
}
static void inode_storage_map_free(struct bpf_map *map)
{
struct bpf_local_storage_map *smap;
smap = (struct bpf_local_storage_map *)map;
bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx);
bpf_local_storage_map_free(smap, NULL);
}
static int inode_storage_map_btf_id;
const struct bpf_map_ops inode_storage_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = bpf_local_storage_map_alloc_check,
.map_alloc = inode_storage_map_alloc,
.map_free = inode_storage_map_free,
.map_get_next_key = notsupp_get_next_key,
.map_lookup_elem = bpf_fd_inode_storage_lookup_elem,
.map_update_elem = bpf_fd_inode_storage_update_elem,
.map_delete_elem = bpf_fd_inode_storage_delete_elem,
.map_check_btf = bpf_local_storage_map_check_btf,
.map_btf_name = "bpf_local_storage_map",
.map_btf_id = &inode_storage_map_btf_id,
.map_owner_storage_ptr = inode_storage_ptr,
};
BTF_ID_LIST_SINGLE(bpf_inode_storage_btf_ids, struct, inode)
const struct bpf_func_proto bpf_inode_storage_get_proto = {
.func = bpf_inode_storage_get,
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_BTF_ID,
.arg2_btf_id = &bpf_inode_storage_btf_ids[0],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
const struct bpf_func_proto bpf_inode_storage_delete_proto = {
.func = bpf_inode_storage_delete,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_BTF_ID,
.arg2_btf_id = &bpf_inode_storage_btf_ids[0],
};
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020 Facebook */
#include <linux/fs.h>
#include <linux/anon_inodes.h>
#include <linux/filter.h>
#include <linux/bpf.h>
struct bpf_iter_target_info {
struct list_head list;
const struct bpf_iter_reg *reg_info;
u32 btf_id; /* cached value */
};
struct bpf_iter_link {
struct bpf_link link;
struct bpf_iter_aux_info aux;
struct bpf_iter_target_info *tinfo;
};
struct bpf_iter_priv_data {
struct bpf_iter_target_info *tinfo;
const struct bpf_iter_seq_info *seq_info;
struct bpf_prog *prog;
u64 session_id;
u64 seq_num;
bool done_stop;
u8 target_private[] __aligned(8);
};
static struct list_head targets = LIST_HEAD_INIT(targets);
static DEFINE_MUTEX(targets_mutex);
/* protect bpf_iter_link changes */
static DEFINE_MUTEX(link_mutex);
/* incremented on every opened seq_file */
static atomic64_t session_id;
static int prepare_seq_file(struct file *file, struct bpf_iter_link *link,
const struct bpf_iter_seq_info *seq_info);
static void bpf_iter_inc_seq_num(struct seq_file *seq)
{
struct bpf_iter_priv_data *iter_priv;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
iter_priv->seq_num++;
}
static void bpf_iter_dec_seq_num(struct seq_file *seq)
{
struct bpf_iter_priv_data *iter_priv;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
iter_priv->seq_num--;
}
static void bpf_iter_done_stop(struct seq_file *seq)
{
struct bpf_iter_priv_data *iter_priv;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
iter_priv->done_stop = true;
}
static bool bpf_iter_support_resched(struct seq_file *seq)
{
struct bpf_iter_priv_data *iter_priv;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
return iter_priv->tinfo->reg_info->feature & BPF_ITER_RESCHED;
}
/* maximum visited objects before bailing out */
#define MAX_ITER_OBJECTS 1000000
/* bpf_seq_read, a customized and simpler version for bpf iterator.
* no_llseek is assumed for this file.
* The following are differences from seq_read():
* . fixed buffer size (PAGE_SIZE)
* . assuming no_llseek
* . stop() may call bpf program, handling potential overflow there
*/
static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
loff_t *ppos)
{
struct seq_file *seq = file->private_data;
size_t n, offs, copied = 0;
int err = 0, num_objs = 0;
bool can_resched;
void *p;
mutex_lock(&seq->lock);
if (!seq->buf) {
seq->size = PAGE_SIZE << 3;
seq->buf = kvmalloc(seq->size, GFP_KERNEL);
if (!seq->buf) {
err = -ENOMEM;
goto done;
}
}
if (seq->count) {
n = min(seq->count, size);
err = copy_to_user(buf, seq->buf + seq->from, n);
if (err) {
err = -EFAULT;
goto done;
}
seq->count -= n;
seq->from += n;
copied = n;
goto done;
}
seq->from = 0;
p = seq->op->start(seq, &seq->index);
if (!p)
goto stop;
if (IS_ERR(p)) {
err = PTR_ERR(p);
seq->op->stop(seq, p);
seq->count = 0;
goto done;
}
err = seq->op->show(seq, p);
if (err > 0) {
/* object is skipped, decrease seq_num, so next
* valid object can reuse the same seq_num.
*/
bpf_iter_dec_seq_num(seq);
seq->count = 0;
} else if (err < 0 || seq_has_overflowed(seq)) {
if (!err)
err = -E2BIG;
seq->op->stop(seq, p);
seq->count = 0;
goto done;
}
can_resched = bpf_iter_support_resched(seq);
while (1) {
loff_t pos = seq->index;
num_objs++;
offs = seq->count;
p = seq->op->next(seq, p, &seq->index);
if (pos == seq->index) {
pr_info_ratelimited("buggy seq_file .next function %ps "
"did not updated position index\n",
seq->op->next);
seq->index++;
}
if (IS_ERR_OR_NULL(p))
break;
/* got a valid next object, increase seq_num */
bpf_iter_inc_seq_num(seq);
if (seq->count >= size)
break;
if (num_objs >= MAX_ITER_OBJECTS) {
if (offs == 0) {
err = -EAGAIN;
seq->op->stop(seq, p);
goto done;
}
break;
}
err = seq->op->show(seq, p);
if (err > 0) {
bpf_iter_dec_seq_num(seq);
seq->count = offs;
} else if (err < 0 || seq_has_overflowed(seq)) {
seq->count = offs;
if (offs == 0) {
if (!err)
err = -E2BIG;
seq->op->stop(seq, p);
goto done;
}
break;
}
if (can_resched)
cond_resched();
}
stop:
offs = seq->count;
/* bpf program called if !p */
seq->op->stop(seq, p);
if (!p) {
if (!seq_has_overflowed(seq)) {
bpf_iter_done_stop(seq);
} else {
seq->count = offs;
if (offs == 0) {
err = -E2BIG;
goto done;
}
}
}
n = min(seq->count, size);
err = copy_to_user(buf, seq->buf, n);
if (err) {
err = -EFAULT;
goto done;
}
copied = n;
seq->count -= n;
seq->from = n;
done:
if (!copied)
copied = err;
else
*ppos += copied;
mutex_unlock(&seq->lock);
return copied;
}
static const struct bpf_iter_seq_info *
__get_seq_info(struct bpf_iter_link *link)
{
const struct bpf_iter_seq_info *seq_info;
if (link->aux.map) {
seq_info = link->aux.map->ops->iter_seq_info;
if (seq_info)
return seq_info;
}
return link->tinfo->reg_info->seq_info;
}
static int iter_open(struct inode *inode, struct file *file)
{
struct bpf_iter_link *link = inode->i_private;
return prepare_seq_file(file, link, __get_seq_info(link));
}
static int iter_release(struct inode *inode, struct file *file)
{
struct bpf_iter_priv_data *iter_priv;
struct seq_file *seq;
seq = file->private_data;
if (!seq)
return 0;
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
target_private);
if (iter_priv->seq_info->fini_seq_private)
iter_priv->seq_info->fini_seq_private(seq->private);
bpf_prog_put(iter_priv->prog);
seq->private = iter_priv;
return seq_release_private(inode, file);
}
const struct file_operations bpf_iter_fops = {
.open = iter_open,
.llseek = no_llseek,
.read = bpf_seq_read,
.release = iter_release,
};
/* The argument reg_info will be cached in bpf_iter_target_info.
* The common practice is to declare target reg_info as
* a const static variable and passed as an argument to
* bpf_iter_reg_target().
*/
int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info)
{
struct bpf_iter_target_info *tinfo;
tinfo = kzalloc(sizeof(*tinfo), GFP_KERNEL);
if (!tinfo)
return -ENOMEM;
tinfo->reg_info = reg_info;
INIT_LIST_HEAD(&tinfo->list);
mutex_lock(&targets_mutex);
list_add(&tinfo->list, &targets);
mutex_unlock(&targets_mutex);
return 0;
}
void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info)
{
struct bpf_iter_target_info *tinfo;
bool found = false;
mutex_lock(&targets_mutex);
list_for_each_entry(tinfo, &targets, list) {
if (reg_info == tinfo->reg_info) {
list_del(&tinfo->list);
kfree(tinfo);
found = true;
break;
}
}
mutex_unlock(&targets_mutex);
WARN_ON(found == false);
}
static void cache_btf_id(struct bpf_iter_target_info *tinfo,
struct bpf_prog *prog)
{
tinfo->btf_id = prog->aux->attach_btf_id;
}
bool bpf_iter_prog_supported(struct bpf_prog *prog)
{
const char *attach_fname = prog->aux->attach_func_name;
u32 prog_btf_id = prog->aux->attach_btf_id;
const char *prefix = BPF_ITER_FUNC_PREFIX;
struct bpf_iter_target_info *tinfo;
int prefix_len = strlen(prefix);
bool supported = false;
if (strncmp(attach_fname, prefix, prefix_len))
return false;
mutex_lock(&targets_mutex);
list_for_each_entry(tinfo, &targets, list) {
if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) {
supported = true;
break;
}
if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) {
cache_btf_id(tinfo, prog);
supported = true;
break;
}
}
mutex_unlock(&targets_mutex);
if (supported) {
prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size;
prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info;
}
return supported;
}
const struct bpf_func_proto *
bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
const struct bpf_iter_target_info *tinfo;
const struct bpf_func_proto *fn = NULL;
mutex_lock(&targets_mutex);
list_for_each_entry(tinfo, &targets, list) {
if (tinfo->btf_id == prog->aux->attach_btf_id) {
const struct bpf_iter_reg *reg_info;
reg_info = tinfo->reg_info;
if (reg_info->get_func_proto)
fn = reg_info->get_func_proto(func_id, prog);
break;
}
}
mutex_unlock(&targets_mutex);
return fn;
}
static void bpf_iter_link_release(struct bpf_link *link)
{
struct bpf_iter_link *iter_link =
container_of(link, struct bpf_iter_link, link);
if (iter_link->tinfo->reg_info->detach_target)
iter_link->tinfo->reg_info->detach_target(&iter_link->aux);
}
static void bpf_iter_link_dealloc(struct bpf_link *link)
{
struct bpf_iter_link *iter_link =
container_of(link, struct bpf_iter_link, link);
kfree(iter_link);
}
static int bpf_iter_link_replace(struct bpf_link *link,
struct bpf_prog *new_prog,
struct bpf_prog *old_prog)
{
int ret = 0;
mutex_lock(&link_mutex);
if (old_prog && link->prog != old_prog) {
ret = -EPERM;
goto out_unlock;
}
if (link->prog->type != new_prog->type ||
link->prog->expected_attach_type != new_prog->expected_attach_type ||
link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) {
ret = -EINVAL;
goto out_unlock;
}
old_prog = xchg(&link->prog, new_prog);
bpf_prog_put(old_prog);
out_unlock:
mutex_unlock(&link_mutex);
return ret;
}
static void bpf_iter_link_show_fdinfo(const struct bpf_link *link,
struct seq_file *seq)
{
struct bpf_iter_link *iter_link =
container_of(link, struct bpf_iter_link, link);
bpf_iter_show_fdinfo_t show_fdinfo;
seq_printf(seq,
"target_name:\t%s\n",
iter_link->tinfo->reg_info->target);
show_fdinfo = iter_link->tinfo->reg_info->show_fdinfo;
if (show_fdinfo)
show_fdinfo(&iter_link->aux, seq);
}
static int bpf_iter_link_fill_link_info(const struct bpf_link *link,
struct bpf_link_info *info)
{
struct bpf_iter_link *iter_link =
container_of(link, struct bpf_iter_link, link);
char __user *ubuf = u64_to_user_ptr(info->iter.target_name);
bpf_iter_fill_link_info_t fill_link_info;
u32 ulen = info->iter.target_name_len;
const char *target_name;
u32 target_len;
if (!ulen ^ !ubuf)
return -EINVAL;
target_name = iter_link->tinfo->reg_info->target;
target_len = strlen(target_name);
info->iter.target_name_len = target_len + 1;
if (ubuf) {
if (ulen >= target_len + 1) {
if (copy_to_user(ubuf, target_name, target_len + 1))
return -EFAULT;
} else {
char zero = '\0';
if (copy_to_user(ubuf, target_name, ulen - 1))
return -EFAULT;
if (put_user(zero, ubuf + ulen - 1))
return -EFAULT;
return -ENOSPC;
}
}
fill_link_info = iter_link->tinfo->reg_info->fill_link_info;
if (fill_link_info)
return fill_link_info(&iter_link->aux, info);
return 0;
}
static const struct bpf_link_ops bpf_iter_link_lops = {
.release = bpf_iter_link_release,
.dealloc = bpf_iter_link_dealloc,
.update_prog = bpf_iter_link_replace,
.show_fdinfo = bpf_iter_link_show_fdinfo,
.fill_link_info = bpf_iter_link_fill_link_info,
};
bool bpf_link_is_iter(struct bpf_link *link)
{
return link->ops == &bpf_iter_link_lops;
}
int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
struct bpf_prog *prog)
{
struct bpf_link_primer link_primer;
struct bpf_iter_target_info *tinfo;
union bpf_iter_link_info linfo;
struct bpf_iter_link *link;
u32 prog_btf_id, linfo_len;
bool existed = false;
bpfptr_t ulinfo;
int err;
if (attr->link_create.target_fd || attr->link_create.flags)
return -EINVAL;
memset(&linfo, 0, sizeof(union bpf_iter_link_info));
ulinfo = make_bpfptr(attr->link_create.iter_info, uattr.is_kernel);
linfo_len = attr->link_create.iter_info_len;
if (bpfptr_is_null(ulinfo) ^ !linfo_len)
return -EINVAL;
if (!bpfptr_is_null(ulinfo)) {
err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo),
linfo_len);
if (err)
return err;
linfo_len = min_t(u32, linfo_len, sizeof(linfo));
if (copy_from_bpfptr(&linfo, ulinfo, linfo_len))
return -EFAULT;
}
prog_btf_id = prog->aux->attach_btf_id;
mutex_lock(&targets_mutex);
list_for_each_entry(tinfo, &targets, list) {
if (tinfo->btf_id == prog_btf_id) {
existed = true;
break;
}
}
mutex_unlock(&targets_mutex);
if (!existed)
return -ENOENT;
link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
if (!link)
return -ENOMEM;
bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
link->tinfo = tinfo;
err = bpf_link_prime(&link->link, &link_primer);
if (err) {
kfree(link);
return err;
}
if (tinfo->reg_info->attach_target) {
err = tinfo->reg_info->attach_target(prog, &linfo, &link->aux);
if (err) {
bpf_link_cleanup(&link_primer);
return err;
}
}
return bpf_link_settle(&link_primer);
}
static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
struct bpf_iter_target_info *tinfo,
const struct bpf_iter_seq_info *seq_info,
struct bpf_prog *prog)
{
priv_data->tinfo = tinfo;
priv_data->seq_info = seq_info;
priv_data->prog = prog;
priv_data->session_id = atomic64_inc_return(&session_id);
priv_data->seq_num = 0;
priv_data->done_stop = false;
}
static int prepare_seq_file(struct file *file, struct bpf_iter_link *link,
const struct bpf_iter_seq_info *seq_info)
{
struct bpf_iter_priv_data *priv_data;
struct bpf_iter_target_info *tinfo;
struct bpf_prog *prog;
u32 total_priv_dsize;
struct seq_file *seq;
int err = 0;
mutex_lock(&link_mutex);
prog = link->link.prog;
bpf_prog_inc(prog);
mutex_unlock(&link_mutex);
tinfo = link->tinfo;
total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) +
seq_info->seq_priv_size;
priv_data = __seq_open_private(file, seq_info->seq_ops,
total_priv_dsize);
if (!priv_data) {
err = -ENOMEM;
goto release_prog;
}
if (seq_info->init_seq_private) {
err = seq_info->init_seq_private(priv_data->target_private, &link->aux);
if (err)
goto release_seq_file;
}
init_seq_meta(priv_data, tinfo, seq_info, prog);
seq = file->private_data;
seq->private = priv_data->target_private;
return 0;
release_seq_file:
seq_release_private(file->f_inode, file);
file->private_data = NULL;
release_prog:
bpf_prog_put(prog);
return err;
}
int bpf_iter_new_fd(struct bpf_link *link)
{
struct bpf_iter_link *iter_link;
struct file *file;
unsigned int flags;
int err, fd;
if (link->ops != &bpf_iter_link_lops)
return -EINVAL;
flags = O_RDONLY | O_CLOEXEC;
fd = get_unused_fd_flags(flags);
if (fd < 0)
return fd;
file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags);
if (IS_ERR(file)) {
err = PTR_ERR(file);
goto free_fd;
}
iter_link = container_of(link, struct bpf_iter_link, link);
err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link));
if (err)
goto free_file;
fd_install(fd, file);
return fd;
free_file:
fput(file);
free_fd:
put_unused_fd(fd);
return err;
}
struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
{
struct bpf_iter_priv_data *iter_priv;
struct seq_file *seq;
void *seq_priv;
seq = meta->seq;
if (seq->file->f_op != &bpf_iter_fops)
return NULL;
seq_priv = seq->private;
iter_priv = container_of(seq_priv, struct bpf_iter_priv_data,
target_private);
if (in_stop && iter_priv->done_stop)
return NULL;
meta->session_id = iter_priv->session_id;
meta->seq_num = iter_priv->seq_num;
return iter_priv->prog;
}
int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
{
int ret;
rcu_read_lock();
migrate_disable();
ret = bpf_prog_run(prog, ctx);
migrate_enable();
rcu_read_unlock();
/* bpf program can only return 0 or 1:
* 0 : okay
* 1 : retry the same object
* The bpf_iter_run_prog() return value
* will be seq_ops->show() return value.
*/
return ret == 0 ? 0 : -EAGAIN;
}
BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn,
void *, callback_ctx, u64, flags)
{
return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags);
}
const struct bpf_func_proto bpf_for_each_map_elem_proto = {
.func = bpf_for_each_map_elem,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_FUNC,
.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
/* maximum number of loops */
#define MAX_LOOPS BIT(23)
BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
u64, flags)
{
bpf_callback_t callback = (bpf_callback_t)callback_fn;
u64 ret;
u32 i;
if (flags)
return -EINVAL;
if (nr_loops > MAX_LOOPS)
return -E2BIG;
for (i = 0; i < nr_loops; i++) {
ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0);
/* return value: 0 - continue, 1 - stop and return */
if (ret)
return i + 1;
}
return i;
}
const struct bpf_func_proto bpf_loop_proto = {
.func = bpf_loop,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
.arg2_type = ARG_PTR_TO_FUNC,
.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */
#include <linux/rculist.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/bpf.h>
#include <linux/btf_ids.h>
#include <linux/bpf_local_storage.h>
#include <net/sock.h>
#include <uapi/linux/sock_diag.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate.h>
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>
#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
static struct bpf_local_storage_map_bucket *
select_bucket(struct bpf_local_storage_map *smap,
struct bpf_local_storage_elem *selem)
{
return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
}
static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
{
struct bpf_map *map = &smap->map;
if (!map->ops->map_local_storage_charge)
return 0;
return map->ops->map_local_storage_charge(smap, owner, size);
}
static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner,
u32 size)
{
struct bpf_map *map = &smap->map;
if (map->ops->map_local_storage_uncharge)
map->ops->map_local_storage_uncharge(smap, owner, size);
}
static struct bpf_local_storage __rcu **
owner_storage(struct bpf_local_storage_map *smap, void *owner)
{
struct bpf_map *map = &smap->map;
return map->ops->map_owner_storage_ptr(owner);
}
static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
{
return !hlist_unhashed(&selem->snode);
}
static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
{
return !hlist_unhashed(&selem->map_node);
}
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
void *value, bool charge_mem)
{
struct bpf_local_storage_elem *selem;
if (charge_mem && mem_charge(smap, owner, smap->elem_size))
return NULL;
selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
GFP_ATOMIC | __GFP_NOWARN);
if (selem) {
if (value)
memcpy(SDATA(selem)->data, value, smap->map.value_size);
return selem;
}
if (charge_mem)
mem_uncharge(smap, owner, smap->elem_size);
return NULL;
}
void bpf_local_storage_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
local_storage = container_of(rcu, struct bpf_local_storage, rcu);
kfree_rcu(local_storage, rcu);
}
static void bpf_selem_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
kfree_rcu(selem, rcu);
}
/* local_storage->lock must be held and selem->local_storage == local_storage.
* The caller must ensure selem->smap is still valid to be
* dereferenced for its smap->elem_size and smap->cache_idx.
*/
bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem,
bool uncharge_mem)
{
struct bpf_local_storage_map *smap;
bool free_local_storage;
void *owner;
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
owner = local_storage->owner;
/* All uncharging on the owner must be done first.
* The owner may be freed once the last selem is unlinked
* from local_storage.
*/
if (uncharge_mem)
mem_uncharge(smap, owner, smap->elem_size);
free_local_storage = hlist_is_singular_node(&selem->snode,
&local_storage->list);
if (free_local_storage) {
mem_uncharge(smap, owner, sizeof(struct bpf_local_storage));
local_storage->owner = NULL;
/* After this RCU_INIT, owner may be freed and cannot be used */
RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
/* local_storage is not freed now. local_storage->lock is
* still held and raw_spin_unlock_bh(&local_storage->lock)
* will be done by the caller.
*
* Although the unlock will be done under
* rcu_read_lock(), it is more intutivie to
* read if the freeing of the storage is done
* after the raw_spin_unlock_bh(&local_storage->lock).
*
* Hence, a "bool free_local_storage" is returned
* to the caller which then calls then frees the storage after
* all the RCU grace periods have expired.
*/
}
hlist_del_init_rcu(&selem->snode);
if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
SDATA(selem))
RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
return free_local_storage;
}
static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
{
struct bpf_local_storage *local_storage;
bool free_local_storage = false;
unsigned long flags;
if (unlikely(!selem_linked_to_storage(selem)))
/* selem has already been unlinked from sk */
return;
local_storage = rcu_dereference_check(selem->local_storage,
bpf_rcu_lock_held());
raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
local_storage, selem, true);
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
if (free_local_storage)
call_rcu_tasks_trace(&local_storage->rcu,
bpf_local_storage_free_rcu);
}
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem)
{
RCU_INIT_POINTER(selem->local_storage, local_storage);
hlist_add_head_rcu(&selem->snode, &local_storage->list);
}
void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
{
struct bpf_local_storage_map *smap;
struct bpf_local_storage_map_bucket *b;
unsigned long flags;
if (unlikely(!selem_linked_to_map(selem)))
/* selem has already be unlinked from smap */
return;
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
b = select_bucket(smap, selem);
raw_spin_lock_irqsave(&b->lock, flags);
if (likely(selem_linked_to_map(selem)))
hlist_del_init_rcu(&selem->map_node);
raw_spin_unlock_irqrestore(&b->lock, flags);
}
void bpf_selem_link_map(struct bpf_local_storage_map *smap,
struct bpf_local_storage_elem *selem)
{
struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem);
unsigned long flags;
raw_spin_lock_irqsave(&b->lock, flags);
RCU_INIT_POINTER(SDATA(selem)->smap, smap);
hlist_add_head_rcu(&selem->map_node, &b->list);
raw_spin_unlock_irqrestore(&b->lock, flags);
}
void bpf_selem_unlink(struct bpf_local_storage_elem *selem)
{
/* Always unlink from map before unlinking from local_storage
* because selem will be freed after successfully unlinked from
* the local_storage.
*/
bpf_selem_unlink_map(selem);
__bpf_selem_unlink_storage(selem);
}
struct bpf_local_storage_data *
bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
struct bpf_local_storage_map *smap,
bool cacheit_lockit)
{
struct bpf_local_storage_data *sdata;
struct bpf_local_storage_elem *selem;
/* Fast path (cache hit) */
sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx],
bpf_rcu_lock_held());
if (sdata && rcu_access_pointer(sdata->smap) == smap)
return sdata;
/* Slow path (cache miss) */
hlist_for_each_entry_rcu(selem, &local_storage->list, snode,
rcu_read_lock_trace_held())
if (rcu_access_pointer(SDATA(selem)->smap) == smap)
break;
if (!selem)
return NULL;
sdata = SDATA(selem);
if (cacheit_lockit) {
unsigned long flags;
/* spinlock is needed to avoid racing with the
* parallel delete. Otherwise, publishing an already
* deleted sdata to the cache will become a use-after-free
* problem in the next bpf_local_storage_lookup().
*/
raw_spin_lock_irqsave(&local_storage->lock, flags);
if (selem_linked_to_storage(selem))
rcu_assign_pointer(local_storage->cache[smap->cache_idx],
sdata);
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
}
return sdata;
}
static int check_flags(const struct bpf_local_storage_data *old_sdata,
u64 map_flags)
{
if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
/* elem already exists */
return -EEXIST;
if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
/* elem doesn't exist, cannot update it */
return -ENOENT;
return 0;
}
int bpf_local_storage_alloc(void *owner,
struct bpf_local_storage_map *smap,
struct bpf_local_storage_elem *first_selem)
{
struct bpf_local_storage *prev_storage, *storage;
struct bpf_local_storage **owner_storage_ptr;
int err;
err = mem_charge(smap, owner, sizeof(*storage));
if (err)
return err;
storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
GFP_ATOMIC | __GFP_NOWARN);
if (!storage) {
err = -ENOMEM;
goto uncharge;
}
INIT_HLIST_HEAD(&storage->list);
raw_spin_lock_init(&storage->lock);
storage->owner = owner;
bpf_selem_link_storage_nolock(storage, first_selem);
bpf_selem_link_map(smap, first_selem);
owner_storage_ptr =
(struct bpf_local_storage **)owner_storage(smap, owner);
/* Publish storage to the owner.
* Instead of using any lock of the kernel object (i.e. owner),
* cmpxchg will work with any kernel object regardless what
* the running context is, bh, irq...etc.
*
* From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage)
* is protected by the storage->lock. Hence, when freeing
* the owner->storage, the storage->lock must be held before
* setting owner->storage ptr to NULL.
*/
prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
if (unlikely(prev_storage)) {
bpf_selem_unlink_map(first_selem);
err = -EAGAIN;
goto uncharge;
/* Note that even first_selem was linked to smap's
* bucket->list, first_selem can be freed immediately
* (instead of kfree_rcu) because
* bpf_local_storage_map_free() does a
* synchronize_rcu_mult (waiting for both sleepable and
* normal programs) before walking the bucket->list.
* Hence, no one is accessing selem from the
* bucket->list under rcu_read_lock().
*/
}
return 0;
uncharge:
kfree(storage);
mem_uncharge(smap, owner, sizeof(*storage));
return err;
}
/* sk cannot be going away because it is linking new elem
* to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0).
* Otherwise, it will become a leak (and other memory issues
* during map destruction).
*/
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
void *value, u64 map_flags)
{
struct bpf_local_storage_data *old_sdata = NULL;
struct bpf_local_storage_elem *selem;
struct bpf_local_storage *local_storage;
unsigned long flags;
int err;
/* BPF_EXIST and BPF_NOEXIST cannot be both set */
if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
/* BPF_F_LOCK can only be used in a value with spin_lock */
unlikely((map_flags & BPF_F_LOCK) &&
!map_value_has_spin_lock(&smap->map)))
return ERR_PTR(-EINVAL);
local_storage = rcu_dereference_check(*owner_storage(smap, owner),
bpf_rcu_lock_held());
if (!local_storage || hlist_empty(&local_storage->list)) {
/* Very first elem for the owner */
err = check_flags(NULL, map_flags);
if (err)
return ERR_PTR(err);
selem = bpf_selem_alloc(smap, owner, value, true);
if (!selem)
return ERR_PTR(-ENOMEM);
err = bpf_local_storage_alloc(owner, smap, selem);
if (err) {
kfree(selem);
mem_uncharge(smap, owner, smap->elem_size);
return ERR_PTR(err);
}
return SDATA(selem);
}
if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
/* Hoping to find an old_sdata to do inline update
* such that it can avoid taking the local_storage->lock
* and changing the lists.
*/
old_sdata =
bpf_local_storage_lookup(local_storage, smap, false);
err = check_flags(old_sdata, map_flags);
if (err)
return ERR_PTR(err);
if (old_sdata && selem_linked_to_storage(SELEM(old_sdata))) {
copy_map_value_locked(&smap->map, old_sdata->data,
value, false);
return old_sdata;
}
}
raw_spin_lock_irqsave(&local_storage->lock, flags);
/* Recheck local_storage->list under local_storage->lock */
if (unlikely(hlist_empty(&local_storage->list))) {
/* A parallel del is happening and local_storage is going
* away. It has just been checked before, so very
* unlikely. Return instead of retry to keep things
* simple.
*/
err = -EAGAIN;
goto unlock_err;
}
old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
err = check_flags(old_sdata, map_flags);
if (err)
goto unlock_err;
if (old_sdata && (map_flags & BPF_F_LOCK)) {
copy_map_value_locked(&smap->map, old_sdata->data, value,
false);
selem = SELEM(old_sdata);
goto unlock;
}
/* local_storage->lock is held. Hence, we are sure
* we can unlink and uncharge the old_sdata successfully
* later. Hence, instead of charging the new selem now
* and then uncharge the old selem later (which may cause
* a potential but unnecessary charge failure), avoid taking
* a charge at all here (the "!old_sdata" check) and the
* old_sdata will not be uncharged later during
* bpf_selem_unlink_storage_nolock().
*/
selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
if (!selem) {
err = -ENOMEM;
goto unlock_err;
}
/* First, link the new selem to the map */
bpf_selem_link_map(smap, selem);
/* Second, link (and publish) the new selem to local_storage */
bpf_selem_link_storage_nolock(local_storage, selem);
/* Third, remove old selem, SELEM(old_sdata) */
if (old_sdata) {
bpf_selem_unlink_map(SELEM(old_sdata));
bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
false);
}
unlock:
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
return SDATA(selem);
unlock_err:
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
return ERR_PTR(err);
}
u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
{
u64 min_usage = U64_MAX;
u16 i, res = 0;
spin_lock(&cache->idx_lock);
for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) {
if (cache->idx_usage_counts[i] < min_usage) {
min_usage = cache->idx_usage_counts[i];
res = i;
/* Found a free cache_idx */
if (!min_usage)
break;
}
}
cache->idx_usage_counts[res]++;
spin_unlock(&cache->idx_lock);
return res;
}
void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
u16 idx)
{
spin_lock(&cache->idx_lock);
cache->idx_usage_counts[idx]--;
spin_unlock(&cache->idx_lock);
}
void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
int __percpu *busy_counter)
{
struct bpf_local_storage_elem *selem;
struct bpf_local_storage_map_bucket *b;
unsigned int i;
/* Note that this map might be concurrently cloned from
* bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
* RCU read section to finish before proceeding. New RCU
* read sections should be prevented via bpf_map_inc_not_zero.
*/
synchronize_rcu();
/* bpf prog and the userspace can no longer access this map
* now. No new selem (of this map) can be added
* to the owner->storage or to the map bucket's list.
*
* The elem of this map can be cleaned up here
* or when the storage is freed e.g.
* by bpf_sk_storage_free() during __sk_destruct().
*/
for (i = 0; i < (1U << smap->bucket_log); i++) {
b = &smap->buckets[i];
rcu_read_lock();
/* No one is adding to b->list now */
while ((selem = hlist_entry_safe(
rcu_dereference_raw(hlist_first_rcu(&b->list)),
struct bpf_local_storage_elem, map_node))) {
if (busy_counter) {
migrate_disable();
__this_cpu_inc(*busy_counter);
}
bpf_selem_unlink(selem);
if (busy_counter) {
__this_cpu_dec(*busy_counter);
migrate_enable();
}
cond_resched_rcu();
}
rcu_read_unlock();
}
/* While freeing the storage we may still need to access the map.
*
* e.g. when bpf_sk_storage_free() has unlinked selem from the map
* which then made the above while((selem = ...)) loop
* exit immediately.
*
* However, while freeing the storage one still needs to access the
* smap->elem_size to do the uncharging in
* bpf_selem_unlink_storage_nolock().
*
* Hence, wait another rcu grace period for the storage to be freed.
*/
synchronize_rcu();
kvfree(smap->buckets);
kfree(smap);
}
int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
{
if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
!(attr->map_flags & BPF_F_NO_PREALLOC) ||
attr->max_entries ||
attr->key_size != sizeof(int) || !attr->value_size ||
/* Enforce BTF for userspace sk dumping */
!attr->btf_key_type_id || !attr->btf_value_type_id)
return -EINVAL;
if (!bpf_capable())
return -EPERM;
if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
return -E2BIG;
return 0;
}
struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
{
struct bpf_local_storage_map *smap;
unsigned int i;
u32 nbuckets;
smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
if (!smap)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&smap->map, attr);
nbuckets = roundup_pow_of_two(num_possible_cpus());
/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
nbuckets = max_t(u32, 2, nbuckets);
smap->bucket_log = ilog2(nbuckets);
smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
if (!smap->buckets) {
kfree(smap);
return ERR_PTR(-ENOMEM);
}
for (i = 0; i < nbuckets; i++) {
INIT_HLIST_HEAD(&smap->buckets[i].list);
raw_spin_lock_init(&smap->buckets[i].lock);
}
smap->elem_size =
sizeof(struct bpf_local_storage_elem) + attr->value_size;
return smap;
}
int bpf_local_storage_map_check_btf(const struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
{
u32 int_data;
if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
return -EINVAL;
int_data = *(u32 *)(key_type + 1);
if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
return -EINVAL;
return 0;
}
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Facebook
*/
#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <linux/percpu.h>
#include "bpf_lru_list.h"
#define LOCAL_FREE_TARGET (128)
#define LOCAL_NR_SCANS LOCAL_FREE_TARGET
#define PERCPU_FREE_TARGET (4)
#define PERCPU_NR_SCANS PERCPU_FREE_TARGET
/* Helpers to get the local list index */
#define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET)
#define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE)
#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING)
#define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET)
static int get_next_cpu(int cpu)
{
cpu = cpumask_next(cpu, cpu_possible_mask);
if (cpu >= nr_cpu_ids)
cpu = cpumask_first(cpu_possible_mask);
return cpu;
}
/* Local list helpers */
static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l)
{
return &loc_l->lists[LOCAL_FREE_LIST_IDX];
}
static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l)
{
return &loc_l->lists[LOCAL_PENDING_LIST_IDX];
}
/* bpf_lru_node helpers */
static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node)
{
return node->ref;
}
static void bpf_lru_list_count_inc(struct bpf_lru_list *l,
enum bpf_lru_list_type type)
{
if (type < NR_BPF_LRU_LIST_COUNT)
l->counts[type]++;
}
static void bpf_lru_list_count_dec(struct bpf_lru_list *l,
enum bpf_lru_list_type type)
{
if (type < NR_BPF_LRU_LIST_COUNT)
l->counts[type]--;
}
static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l,
struct bpf_lru_node *node,
struct list_head *free_list,
enum bpf_lru_list_type tgt_free_type)
{
if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
return;
/* If the removing node is the next_inactive_rotation candidate,
* move the next_inactive_rotation pointer also.
*/
if (&node->list == l->next_inactive_rotation)
l->next_inactive_rotation = l->next_inactive_rotation->prev;
bpf_lru_list_count_dec(l, node->type);
node->type = tgt_free_type;
list_move(&node->list, free_list);
}
/* Move nodes from local list to the LRU list */
static void __bpf_lru_node_move_in(struct bpf_lru_list *l,
struct bpf_lru_node *node,
enum bpf_lru_list_type tgt_type)
{
if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) ||
WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
return;
bpf_lru_list_count_inc(l, tgt_type);
node->type = tgt_type;
node->ref = 0;
list_move(&node->list, &l->lists[tgt_type]);
}
/* Move nodes between or within active and inactive list (like
* active to inactive, inactive to active or tail of active back to
* the head of active).
*/
static void __bpf_lru_node_move(struct bpf_lru_list *l,
struct bpf_lru_node *node,
enum bpf_lru_list_type tgt_type)
{
if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) ||
WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
return;
if (node->type != tgt_type) {
bpf_lru_list_count_dec(l, node->type);
bpf_lru_list_count_inc(l, tgt_type);
node->type = tgt_type;
}
node->ref = 0;
/* If the moving node is the next_inactive_rotation candidate,
* move the next_inactive_rotation pointer also.
*/
if (&node->list == l->next_inactive_rotation)
l->next_inactive_rotation = l->next_inactive_rotation->prev;
list_move(&node->list, &l->lists[tgt_type]);
}
static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l)
{
return l->counts[BPF_LRU_LIST_T_INACTIVE] <
l->counts[BPF_LRU_LIST_T_ACTIVE];
}
/* Rotate the active list:
* 1. Start from tail
* 2. If the node has the ref bit set, it will be rotated
* back to the head of active list with the ref bit cleared.
* Give this node one more chance to survive in the active list.
* 3. If the ref bit is not set, move it to the head of the
* inactive list.
* 4. It will at most scan nr_scans nodes
*/
static void __bpf_lru_list_rotate_active(struct bpf_lru *lru,
struct bpf_lru_list *l)
{
struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE];
struct bpf_lru_node *node, *tmp_node, *first_node;
unsigned int i = 0;
first_node = list_first_entry(active, struct bpf_lru_node, list);
list_for_each_entry_safe_reverse(node, tmp_node, active, list) {
if (bpf_lru_node_is_ref(node))
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
else
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
if (++i == lru->nr_scans || node == first_node)
break;
}
}
/* Rotate the inactive list. It starts from the next_inactive_rotation
* 1. If the node has ref bit set, it will be moved to the head
* of active list with the ref bit cleared.
* 2. If the node does not have ref bit set, it will leave it
* at its current location (i.e. do nothing) so that it can
* be considered during the next inactive_shrink.
* 3. It will at most scan nr_scans nodes
*/
static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru,
struct bpf_lru_list *l)
{
struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
struct list_head *cur, *last, *next = inactive;
struct bpf_lru_node *node;
unsigned int i = 0;
if (list_empty(inactive))
return;
last = l->next_inactive_rotation->next;
if (last == inactive)
last = last->next;
cur = l->next_inactive_rotation;
while (i < lru->nr_scans) {
if (cur == inactive) {
cur = cur->prev;
continue;
}
node = list_entry(cur, struct bpf_lru_node, list);
next = cur->prev;
if (bpf_lru_node_is_ref(node))
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
if (cur == last)
break;
cur = next;
i++;
}
l->next_inactive_rotation = next;
}
/* Shrink the inactive list. It starts from the tail of the
* inactive list and only move the nodes without the ref bit
* set to the designated free list.
*/
static unsigned int
__bpf_lru_list_shrink_inactive(struct bpf_lru *lru,
struct bpf_lru_list *l,
unsigned int tgt_nshrink,
struct list_head *free_list,
enum bpf_lru_list_type tgt_free_type)
{
struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
struct bpf_lru_node *node, *tmp_node;
unsigned int nshrinked = 0;
unsigned int i = 0;
list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) {
if (bpf_lru_node_is_ref(node)) {
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
} else if (lru->del_from_htab(lru->del_arg, node)) {
__bpf_lru_node_move_to_free(l, node, free_list,
tgt_free_type);
if (++nshrinked == tgt_nshrink)
break;
}
if (++i == lru->nr_scans)
break;
}
return nshrinked;
}
/* 1. Rotate the active list (if needed)
* 2. Always rotate the inactive list
*/
static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l)
{
if (bpf_lru_list_inactive_low(l))
__bpf_lru_list_rotate_active(lru, l);
__bpf_lru_list_rotate_inactive(lru, l);
}
/* Calls __bpf_lru_list_shrink_inactive() to shrink some
* ref-bit-cleared nodes and move them to the designated
* free list.
*
* If it cannot get a free node after calling
* __bpf_lru_list_shrink_inactive(). It will just remove
* one node from either inactive or active list without
* honoring the ref-bit. It prefers inactive list to active
* list in this situation.
*/
static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru,
struct bpf_lru_list *l,
unsigned int tgt_nshrink,
struct list_head *free_list,
enum bpf_lru_list_type tgt_free_type)
{
struct bpf_lru_node *node, *tmp_node;
struct list_head *force_shrink_list;
unsigned int nshrinked;
nshrinked = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink,
free_list, tgt_free_type);
if (nshrinked)
return nshrinked;
/* Do a force shrink by ignoring the reference bit */
if (!list_empty(&l->lists[BPF_LRU_LIST_T_INACTIVE]))
force_shrink_list = &l->lists[BPF_LRU_LIST_T_INACTIVE];
else
force_shrink_list = &l->lists[BPF_LRU_LIST_T_ACTIVE];
list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list,
list) {
if (lru->del_from_htab(lru->del_arg, node)) {
__bpf_lru_node_move_to_free(l, node, free_list,
tgt_free_type);
return 1;
}
}
return 0;
}
/* Flush the nodes from the local pending list to the LRU list */
static void __local_list_flush(struct bpf_lru_list *l,
struct bpf_lru_locallist *loc_l)
{
struct bpf_lru_node *node, *tmp_node;
list_for_each_entry_safe_reverse(node, tmp_node,
local_pending_list(loc_l), list) {
if (bpf_lru_node_is_ref(node))
__bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE);
else
__bpf_lru_node_move_in(l, node,
BPF_LRU_LIST_T_INACTIVE);
}
}
static void bpf_lru_list_push_free(struct bpf_lru_list *l,
struct bpf_lru_node *node)
{
unsigned long flags;
if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
return;
raw_spin_lock_irqsave(&l->lock, flags);
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
raw_spin_unlock_irqrestore(&l->lock, flags);
}
static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
struct bpf_lru_locallist *loc_l)
{
struct bpf_lru_list *l = &lru->common_lru.lru_list;
struct bpf_lru_node *node, *tmp_node;
unsigned int nfree = 0;
raw_spin_lock(&l->lock);
__local_list_flush(l, loc_l);
__bpf_lru_list_rotate(lru, l);
list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE],
list) {
__bpf_lru_node_move_to_free(l, node, local_free_list(loc_l),
BPF_LRU_LOCAL_LIST_T_FREE);
if (++nfree == LOCAL_FREE_TARGET)
break;
}
if (nfree < LOCAL_FREE_TARGET)
__bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree,
local_free_list(loc_l),
BPF_LRU_LOCAL_LIST_T_FREE);
raw_spin_unlock(&l->lock);
}
static void __local_list_add_pending(struct bpf_lru *lru,
struct bpf_lru_locallist *loc_l,
int cpu,
struct bpf_lru_node *node,
u32 hash)
{
*(u32 *)((void *)node + lru->hash_offset) = hash;
node->cpu = cpu;
node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
node->ref = 0;
list_add(&node->list, local_pending_list(loc_l));
}
static struct bpf_lru_node *
__local_list_pop_free(struct bpf_lru_locallist *loc_l)
{
struct bpf_lru_node *node;
node = list_first_entry_or_null(local_free_list(loc_l),
struct bpf_lru_node,
list);
if (node)
list_del(&node->list);
return node;
}
static struct bpf_lru_node *
__local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l)
{
struct bpf_lru_node *node;
bool force = false;
ignore_ref:
/* Get from the tail (i.e. older element) of the pending list. */
list_for_each_entry_reverse(node, local_pending_list(loc_l),
list) {
if ((!bpf_lru_node_is_ref(node) || force) &&
lru->del_from_htab(lru->del_arg, node)) {
list_del(&node->list);
return node;
}
}
if (!force) {
force = true;
goto ignore_ref;
}
return NULL;
}
static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
u32 hash)
{
struct list_head *free_list;
struct bpf_lru_node *node = NULL;
struct bpf_lru_list *l;
unsigned long flags;
int cpu = raw_smp_processor_id();
l = per_cpu_ptr(lru->percpu_lru, cpu);
raw_spin_lock_irqsave(&l->lock, flags);
__bpf_lru_list_rotate(lru, l);
free_list = &l->lists[BPF_LRU_LIST_T_FREE];
if (list_empty(free_list))
__bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list,
BPF_LRU_LIST_T_FREE);
if (!list_empty(free_list)) {
node = list_first_entry(free_list, struct bpf_lru_node, list);
*(u32 *)((void *)node + lru->hash_offset) = hash;
node->ref = 0;
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
}
raw_spin_unlock_irqrestore(&l->lock, flags);
return node;
}
static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
u32 hash)
{
struct bpf_lru_locallist *loc_l, *steal_loc_l;
struct bpf_common_lru *clru = &lru->common_lru;
struct bpf_lru_node *node;
int steal, first_steal;
unsigned long flags;
int cpu = raw_smp_processor_id();
loc_l = per_cpu_ptr(clru->local_list, cpu);
raw_spin_lock_irqsave(&loc_l->lock, flags);
node = __local_list_pop_free(loc_l);
if (!node) {
bpf_lru_list_pop_free_to_local(lru, loc_l);
node = __local_list_pop_free(loc_l);
}
if (node)
__local_list_add_pending(lru, loc_l, cpu, node, hash);
raw_spin_unlock_irqrestore(&loc_l->lock, flags);
if (node)
return node;
/* No free nodes found from the local free list and
* the global LRU list.
*
* Steal from the local free/pending list of the
* current CPU and remote CPU in RR. It starts
* with the loc_l->next_steal CPU.
*/
first_steal = loc_l->next_steal;
steal = first_steal;
do {
steal_loc_l = per_cpu_ptr(clru->local_list, steal);
raw_spin_lock_irqsave(&steal_loc_l->lock, flags);
node = __local_list_pop_free(steal_loc_l);
if (!node)
node = __local_list_pop_pending(lru, steal_loc_l);
raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
steal = get_next_cpu(steal);
} while (!node && steal != first_steal);
loc_l->next_steal = steal;
if (node) {
raw_spin_lock_irqsave(&loc_l->lock, flags);
__local_list_add_pending(lru, loc_l, cpu, node, hash);
raw_spin_unlock_irqrestore(&loc_l->lock, flags);
}
return node;
}
struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash)
{
if (lru->percpu)
return bpf_percpu_lru_pop_free(lru, hash);
else
return bpf_common_lru_pop_free(lru, hash);
}
static void bpf_common_lru_push_free(struct bpf_lru *lru,
struct bpf_lru_node *node)
{
u8 node_type = READ_ONCE(node->type);
unsigned long flags;
if (WARN_ON_ONCE(node_type == BPF_LRU_LIST_T_FREE) ||
WARN_ON_ONCE(node_type == BPF_LRU_LOCAL_LIST_T_FREE))
return;
if (node_type == BPF_LRU_LOCAL_LIST_T_PENDING) {
struct bpf_lru_locallist *loc_l;
loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu);
raw_spin_lock_irqsave(&loc_l->lock, flags);
if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) {
raw_spin_unlock_irqrestore(&loc_l->lock, flags);
goto check_lru_list;
}
node->type = BPF_LRU_LOCAL_LIST_T_FREE;
node->ref = 0;
list_move(&node->list, local_free_list(loc_l));
raw_spin_unlock_irqrestore(&loc_l->lock, flags);
return;
}
check_lru_list:
bpf_lru_list_push_free(&lru->common_lru.lru_list, node);
}
static void bpf_percpu_lru_push_free(struct bpf_lru *lru,
struct bpf_lru_node *node)
{
struct bpf_lru_list *l;
unsigned long flags;
l = per_cpu_ptr(lru->percpu_lru, node->cpu);
raw_spin_lock_irqsave(&l->lock, flags);
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
raw_spin_unlock_irqrestore(&l->lock, flags);
}
void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
{
if (lru->percpu)
bpf_percpu_lru_push_free(lru, node);
else
bpf_common_lru_push_free(lru, node);
}
static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
u32 node_offset, u32 elem_size,
u32 nr_elems)
{
struct bpf_lru_list *l = &lru->common_lru.lru_list;
u32 i;
for (i = 0; i < nr_elems; i++) {
struct bpf_lru_node *node;
node = (struct bpf_lru_node *)(buf + node_offset);
node->type = BPF_LRU_LIST_T_FREE;
node->ref = 0;
list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
buf += elem_size;
}
}
static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
u32 node_offset, u32 elem_size,
u32 nr_elems)
{
u32 i, pcpu_entries;
int cpu;
struct bpf_lru_list *l;
pcpu_entries = nr_elems / num_possible_cpus();
i = 0;
for_each_possible_cpu(cpu) {
struct bpf_lru_node *node;
l = per_cpu_ptr(lru->percpu_lru, cpu);
again:
node = (struct bpf_lru_node *)(buf + node_offset);
node->cpu = cpu;
node->type = BPF_LRU_LIST_T_FREE;
node->ref = 0;
list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
i++;
buf += elem_size;
if (i == nr_elems)
break;
if (i % pcpu_entries)
goto again;
}
}
void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
u32 elem_size, u32 nr_elems)
{
if (lru->percpu)
bpf_percpu_lru_populate(lru, buf, node_offset, elem_size,
nr_elems);
else
bpf_common_lru_populate(lru, buf, node_offset, elem_size,
nr_elems);
}
static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu)
{
int i;
for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++)
INIT_LIST_HEAD(&loc_l->lists[i]);
loc_l->next_steal = cpu;
raw_spin_lock_init(&loc_l->lock);
}
static void bpf_lru_list_init(struct bpf_lru_list *l)
{
int i;
for (i = 0; i < NR_BPF_LRU_LIST_T; i++)
INIT_LIST_HEAD(&l->lists[i]);
for (i = 0; i < NR_BPF_LRU_LIST_COUNT; i++)
l->counts[i] = 0;
l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];
raw_spin_lock_init(&l->lock);
}
int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
del_from_htab_func del_from_htab, void *del_arg)
{
int cpu;
if (percpu) {
lru->percpu_lru = alloc_percpu(struct bpf_lru_list);
if (!lru->percpu_lru)
return -ENOMEM;
for_each_possible_cpu(cpu) {
struct bpf_lru_list *l;
l = per_cpu_ptr(lru->percpu_lru, cpu);
bpf_lru_list_init(l);
}
lru->nr_scans = PERCPU_NR_SCANS;
} else {
struct bpf_common_lru *clru = &lru->common_lru;
clru->local_list = alloc_percpu(struct bpf_lru_locallist);
if (!clru->local_list)
return -ENOMEM;
for_each_possible_cpu(cpu) {
struct bpf_lru_locallist *loc_l;
loc_l = per_cpu_ptr(clru->local_list, cpu);
bpf_lru_locallist_init(loc_l, cpu);
}
bpf_lru_list_init(&clru->lru_list);
lru->nr_scans = LOCAL_NR_SCANS;
}
lru->percpu = percpu;
lru->del_from_htab = del_from_htab;
lru->del_arg = del_arg;
lru->hash_offset = hash_offset;
return 0;
}
void bpf_lru_destroy(struct bpf_lru *lru)
{
if (lru->percpu)
free_percpu(lru->percpu_lru);
else
free_percpu(lru->common_lru.local_list);
}
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2020 Google LLC.
*/
#include <linux/filter.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/binfmts.h>
#include <linux/lsm_hooks.h>
#include <linux/bpf_lsm.h>
#include <linux/kallsyms.h>
#include <linux/bpf_verifier.h>
#include <net/bpf_sk_storage.h>
#include <linux/bpf_local_storage.h>
#include <linux/btf_ids.h>
#include <linux/ima.h>
/* For every LSM hook that allows attachment of BPF programs, declare a nop
* function where a BPF program can be attached.
*/
#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
{ \
return DEFAULT; \
}
#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK
#define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME)
BTF_SET_START(bpf_lsm_hooks)
#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK
BTF_SET_END(bpf_lsm_hooks)
int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
const struct bpf_prog *prog)
{
if (!prog->gpl_compatible) {
bpf_log(vlog,
"LSM programs must have a GPL compatible license\n");
return -EINVAL;
}
if (!btf_id_set_contains(&bpf_lsm_hooks, prog->aux->attach_btf_id)) {
bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n",
prog->aux->attach_btf_id, prog->aux->attach_func_name);
return -EINVAL;
}
return 0;
}
/* Mask for all the currently supported BPRM option flags */
#define BPF_F_BRPM_OPTS_MASK BPF_F_BPRM_SECUREEXEC
BPF_CALL_2(bpf_bprm_opts_set, struct linux_binprm *, bprm, u64, flags)
{
if (flags & ~BPF_F_BRPM_OPTS_MASK)
return -EINVAL;
bprm->secureexec = (flags & BPF_F_BPRM_SECUREEXEC);
return 0;
}
BTF_ID_LIST_SINGLE(bpf_bprm_opts_set_btf_ids, struct, linux_binprm)
static const struct bpf_func_proto bpf_bprm_opts_set_proto = {
.func = bpf_bprm_opts_set,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &bpf_bprm_opts_set_btf_ids[0],
.arg2_type = ARG_ANYTHING,
};
BPF_CALL_3(bpf_ima_inode_hash, struct inode *, inode, void *, dst, u32, size)
{
return ima_inode_hash(inode, dst, size);
}
static bool bpf_ima_inode_hash_allowed(const struct bpf_prog *prog)
{
return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
}
BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode)
static const struct bpf_func_proto bpf_ima_inode_hash_proto = {
.func = bpf_ima_inode_hash,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &bpf_ima_inode_hash_btf_ids[0],
.arg2_type = ARG_PTR_TO_UNINIT_MEM,
.arg3_type = ARG_CONST_SIZE,
.allowed = bpf_ima_inode_hash_allowed,
};
static const struct bpf_func_proto *
bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_inode_storage_get:
return &bpf_inode_storage_get_proto;
case BPF_FUNC_inode_storage_delete:
return &bpf_inode_storage_delete_proto;
#ifdef CONFIG_NET
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
#endif /* CONFIG_NET */
case BPF_FUNC_spin_lock:
return &bpf_spin_lock_proto;
case BPF_FUNC_spin_unlock:
return &bpf_spin_unlock_proto;
case BPF_FUNC_bprm_opts_set:
return &bpf_bprm_opts_set_proto;
case BPF_FUNC_ima_inode_hash:
return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL;
default:
return tracing_prog_func_proto(func_id, prog);
}
}
/* The set of hooks which are called without pagefaults disabled and are allowed
* to "sleep" and thus can be used for sleepable BPF programs.
*/
BTF_SET_START(sleepable_lsm_hooks)
BTF_ID(func, bpf_lsm_bpf)
BTF_ID(func, bpf_lsm_bpf_map)
BTF_ID(func, bpf_lsm_bpf_map_alloc_security)
BTF_ID(func, bpf_lsm_bpf_map_free_security)
BTF_ID(func, bpf_lsm_bpf_prog)
BTF_ID(func, bpf_lsm_bprm_check_security)
BTF_ID(func, bpf_lsm_bprm_committed_creds)
BTF_ID(func, bpf_lsm_bprm_committing_creds)
BTF_ID(func, bpf_lsm_bprm_creds_for_exec)
BTF_ID(func, bpf_lsm_bprm_creds_from_file)
BTF_ID(func, bpf_lsm_capget)
BTF_ID(func, bpf_lsm_capset)
BTF_ID(func, bpf_lsm_cred_prepare)
BTF_ID(func, bpf_lsm_file_ioctl)
BTF_ID(func, bpf_lsm_file_lock)
BTF_ID(func, bpf_lsm_file_open)
BTF_ID(func, bpf_lsm_file_receive)
#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_inet_conn_established)
#endif /* CONFIG_SECURITY_NETWORK */
BTF_ID(func, bpf_lsm_inode_create)
BTF_ID(func, bpf_lsm_inode_free_security)
BTF_ID(func, bpf_lsm_inode_getattr)
BTF_ID(func, bpf_lsm_inode_getxattr)
BTF_ID(func, bpf_lsm_inode_mknod)
BTF_ID(func, bpf_lsm_inode_need_killpriv)
BTF_ID(func, bpf_lsm_inode_post_setxattr)
BTF_ID(func, bpf_lsm_inode_readlink)
BTF_ID(func, bpf_lsm_inode_rename)
BTF_ID(func, bpf_lsm_inode_rmdir)
BTF_ID(func, bpf_lsm_inode_setattr)
BTF_ID(func, bpf_lsm_inode_setxattr)
BTF_ID(func, bpf_lsm_inode_symlink)
BTF_ID(func, bpf_lsm_inode_unlink)
BTF_ID(func, bpf_lsm_kernel_module_request)
BTF_ID(func, bpf_lsm_kernfs_init_security)
#ifdef CONFIG_KEYS
BTF_ID(func, bpf_lsm_key_free)
#endif /* CONFIG_KEYS */
BTF_ID(func, bpf_lsm_mmap_file)
BTF_ID(func, bpf_lsm_netlink_send)
BTF_ID(func, bpf_lsm_path_notify)
BTF_ID(func, bpf_lsm_release_secctx)
BTF_ID(func, bpf_lsm_sb_alloc_security)
BTF_ID(func, bpf_lsm_sb_eat_lsm_opts)
BTF_ID(func, bpf_lsm_sb_kern_mount)
BTF_ID(func, bpf_lsm_sb_mount)
BTF_ID(func, bpf_lsm_sb_remount)
BTF_ID(func, bpf_lsm_sb_set_mnt_opts)
BTF_ID(func, bpf_lsm_sb_show_options)
BTF_ID(func, bpf_lsm_sb_statfs)
BTF_ID(func, bpf_lsm_sb_umount)
BTF_ID(func, bpf_lsm_settime)
#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_socket_accept)
BTF_ID(func, bpf_lsm_socket_bind)
BTF_ID(func, bpf_lsm_socket_connect)
BTF_ID(func, bpf_lsm_socket_create)
BTF_ID(func, bpf_lsm_socket_getpeername)
BTF_ID(func, bpf_lsm_socket_getpeersec_dgram)
BTF_ID(func, bpf_lsm_socket_getsockname)
BTF_ID(func, bpf_lsm_socket_getsockopt)
BTF_ID(func, bpf_lsm_socket_listen)
BTF_ID(func, bpf_lsm_socket_post_create)
BTF_ID(func, bpf_lsm_socket_recvmsg)
BTF_ID(func, bpf_lsm_socket_sendmsg)
BTF_ID(func, bpf_lsm_socket_shutdown)
BTF_ID(func, bpf_lsm_socket_socketpair)
#endif /* CONFIG_SECURITY_NETWORK */
BTF_ID(func, bpf_lsm_syslog)
BTF_ID(func, bpf_lsm_task_alloc)
BTF_ID(func, bpf_lsm_current_getsecid_subj)
BTF_ID(func, bpf_lsm_task_getsecid_obj)
BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)
BTF_ID(func, bpf_lsm_task_to_inode)
BTF_SET_END(sleepable_lsm_hooks)
bool bpf_lsm_is_sleepable_hook(u32 btf_id)
{
return btf_id_set_contains(&sleepable_lsm_hooks, btf_id);
}
const struct bpf_prog_ops lsm_prog_ops = {
};
const struct bpf_verifier_ops lsm_verifier_ops = {
.get_func_proto = bpf_lsm_func_proto,
.is_valid_access = btf_ctx_access,
};
// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h>
#include <linux/module.h>
#include <linux/pid.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include "bpf_preload.h"
extern char bpf_preload_umd_start;
extern char bpf_preload_umd_end;
static int preload(struct bpf_preload_info *obj);
static int finish(void);
static struct bpf_preload_ops umd_ops = {
.info.driver_name = "bpf_preload",
.preload = preload,
.finish = finish,
.owner = THIS_MODULE,
};
static int preload(struct bpf_preload_info *obj)
{
int magic = BPF_PRELOAD_START;
loff_t pos = 0;
int i, err;
ssize_t n;
err = fork_usermode_driver(&umd_ops.info);
if (err)
return err;
/* send the start magic to let UMD proceed with loading BPF progs */
n = kernel_write(umd_ops.info.pipe_to_umh,
&magic, sizeof(magic), &pos);
if (n != sizeof(magic))
return -EPIPE;
/* receive bpf_link IDs and names from UMD */
pos = 0;
for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
n = kernel_read(umd_ops.info.pipe_from_umh,
&obj[i], sizeof(*obj), &pos);
if (n != sizeof(*obj))
return -EPIPE;
}
return 0;
}
static int finish(void)
{
int magic = BPF_PRELOAD_END;
struct pid *tgid;
loff_t pos = 0;
ssize_t n;
/* send the last magic to UMD. It will do a normal exit. */
n = kernel_write(umd_ops.info.pipe_to_umh,
&magic, sizeof(magic), &pos);
if (n != sizeof(magic))
return -EPIPE;
tgid = umd_ops.info.tgid;
if (tgid) {
wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
umd_cleanup_helper(&umd_ops.info);
}
return 0;
}
static int __init load_umd(void)
{
int err;
err = umd_load_blob(&umd_ops.info, &bpf_preload_umd_start,
&bpf_preload_umd_end - &bpf_preload_umd_start);
if (err)
return err;
bpf_preload_ops = &umd_ops;
return err;
}
static void __exit fini_umd(void)
{
struct pid *tgid;
bpf_preload_ops = NULL;
/* kill UMD in case it's still there due to earlier error */
tgid = umd_ops.info.tgid;
if (tgid) {
kill_pid(tgid, SIGKILL, 1);
wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
umd_cleanup_helper(&umd_ops.info);
}
umd_unload_blob(&umd_ops.info);
}
late_initcall(load_umd);
module_exit(fini_umd);
MODULE_LICENSE("GPL");
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2019 Facebook */
#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
#include <linux/filter.h>
#include <linux/slab.h>
#include <linux/numa.h>
#include <linux/seq_file.h>
#include <linux/refcount.h>
#include <linux/mutex.h>
enum bpf_struct_ops_state {
BPF_STRUCT_OPS_STATE_INIT,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE,
};
#define BPF_STRUCT_OPS_COMMON_VALUE \
refcount_t refcnt; \
enum bpf_struct_ops_state state
struct bpf_struct_ops_value {
BPF_STRUCT_OPS_COMMON_VALUE;
char data[] ____cacheline_aligned_in_smp;
};
struct bpf_struct_ops_map {
struct bpf_map map;
struct rcu_head rcu;
const struct bpf_struct_ops *st_ops;
/* protect map_update */
struct mutex lock;
/* progs has all the bpf_prog that is populated
* to the func ptr of the kernel's struct
* (in kvalue.data).
*/
struct bpf_prog **progs;
/* image is a page that has all the trampolines
* that stores the func args before calling the bpf_prog.
* A PAGE_SIZE "image" is enough to store all trampoline for
* "progs[]".
*/
void *image;
/* uvalue->data stores the kernel struct
* (e.g. tcp_congestion_ops) that is more useful
* to userspace than the kvalue. For example,
* the bpf_prog's id is stored instead of the kernel
* address of a func ptr.
*/
struct bpf_struct_ops_value *uvalue;
/* kvalue.data stores the actual kernel's struct
* (e.g. tcp_congestion_ops) that will be
* registered to the kernel subsystem.
*/
struct bpf_struct_ops_value kvalue;
};
#define VALUE_PREFIX "bpf_struct_ops_"
#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is
* the map's value exposed to the userspace and its btf-type-id is
* stored at the map->btf_vmlinux_value_type_id.
*
*/
#define BPF_STRUCT_OPS_TYPE(_name) \
extern struct bpf_struct_ops bpf_##_name; \
\
struct bpf_struct_ops_##_name { \
BPF_STRUCT_OPS_COMMON_VALUE; \
struct _name data ____cacheline_aligned_in_smp; \
};
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE
enum {
#define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name,
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE
__NR_BPF_STRUCT_OPS_TYPE,
};
static struct bpf_struct_ops * const bpf_struct_ops[] = {
#define BPF_STRUCT_OPS_TYPE(_name) \
[BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name,
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE
};
const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
};
const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
#ifdef CONFIG_NET
.test_run = bpf_struct_ops_test_run,
#endif
};
static const struct btf_type *module_type;
void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
{
s32 type_id, value_id, module_id;
const struct btf_member *member;
struct bpf_struct_ops *st_ops;
const struct btf_type *t;
char value_name[128];
const char *mname;
u32 i, j;
/* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */
#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name);
#include "bpf_struct_ops_types.h"
#undef BPF_STRUCT_OPS_TYPE
module_id = btf_find_by_name_kind(btf, "module", BTF_KIND_STRUCT);
if (module_id < 0) {
pr_warn("Cannot find struct module in btf_vmlinux\n");
return;
}
module_type = btf_type_by_id(btf, module_id);
for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
st_ops = bpf_struct_ops[i];
if (strlen(st_ops->name) + VALUE_PREFIX_LEN >=
sizeof(value_name)) {
pr_warn("struct_ops name %s is too long\n",
st_ops->name);
continue;
}
sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);
value_id = btf_find_by_name_kind(btf, value_name,
BTF_KIND_STRUCT);
if (value_id < 0) {
pr_warn("Cannot find struct %s in btf_vmlinux\n",
value_name);
continue;
}
type_id = btf_find_by_name_kind(btf, st_ops->name,
BTF_KIND_STRUCT);
if (type_id < 0) {
pr_warn("Cannot find struct %s in btf_vmlinux\n",
st_ops->name);
continue;
}
t = btf_type_by_id(btf, type_id);
if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
pr_warn("Cannot support #%u members in struct %s\n",
btf_type_vlen(t), st_ops->name);
continue;
}
for_each_member(j, t, member) {
const struct btf_type *func_proto;
mname = btf_name_by_offset(btf, member->name_off);
if (!*mname) {
pr_warn("anon member in struct %s is not supported\n",
st_ops->name);
break;
}
if (__btf_member_bitfield_size(t, member)) {
pr_warn("bit field member %s in struct %s is not supported\n",
mname, st_ops->name);
break;
}
func_proto = btf_type_resolve_func_ptr(btf,
member->type,
NULL);
if (func_proto &&
btf_distill_func_proto(log, btf,
func_proto, mname,
&st_ops->func_models[j])) {
pr_warn("Error in parsing func ptr %s in struct %s\n",
mname, st_ops->name);
break;
}
}
if (j == btf_type_vlen(t)) {
if (st_ops->init(btf)) {
pr_warn("Error in init bpf_struct_ops %s\n",
st_ops->name);
} else {
st_ops->type_id = type_id;
st_ops->type = t;
st_ops->value_id = value_id;
st_ops->value_type = btf_type_by_id(btf,
value_id);
}
}
}
}
extern struct btf *btf_vmlinux;
static const struct bpf_struct_ops *
bpf_struct_ops_find_value(u32 value_id)
{
unsigned int i;
if (!value_id || !btf_vmlinux)
return NULL;
for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
if (bpf_struct_ops[i]->value_id == value_id)
return bpf_struct_ops[i];
}
return NULL;
}
const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
{
unsigned int i;
if (!type_id || !btf_vmlinux)
return NULL;
for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
if (bpf_struct_ops[i]->type_id == type_id)
return bpf_struct_ops[i];
}
return NULL;
}
static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key,
void *next_key)
{
if (key && *(u32 *)key == 0)
return -ENOENT;
*(u32 *)next_key = 0;
return 0;
}
int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
void *value)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
struct bpf_struct_ops_value *uvalue, *kvalue;
enum bpf_struct_ops_state state;
if (unlikely(*(u32 *)key != 0))
return -ENOENT;
kvalue = &st_map->kvalue;
/* Pair with smp_store_release() during map_update */
state = smp_load_acquire(&kvalue->state);
if (state == BPF_STRUCT_OPS_STATE_INIT) {
memset(value, 0, map->value_size);
return 0;
}
/* No lock is needed. state and refcnt do not need
* to be updated together under atomic context.
*/
uvalue = (struct bpf_struct_ops_value *)value;
memcpy(uvalue, st_map->uvalue, map->value_size);
uvalue->state = state;
refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
return 0;
}
static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key)
{
return ERR_PTR(-EINVAL);
}
static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
{
const struct btf_type *t = st_map->st_ops->type;
u32 i;
for (i = 0; i < btf_type_vlen(t); i++) {
if (st_map->progs[i]) {
bpf_prog_put(st_map->progs[i]);
st_map->progs[i] = NULL;
}
}
}
static int check_zero_holes(const struct btf_type *t, void *data)
{
const struct btf_member *member;
u32 i, moff, msize, prev_mend = 0;
const struct btf_type *mtype;
for_each_member(i, t, member) {
moff = __btf_member_bit_offset(t, member) / 8;
if (moff > prev_mend &&
memchr_inv(data + prev_mend, 0, moff - prev_mend))
return -EINVAL;
mtype = btf_type_by_id(btf_vmlinux, member->type);
mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
if (IS_ERR(mtype))
return PTR_ERR(mtype);
prev_mend = moff + msize;
}
if (t->size > prev_mend &&
memchr_inv(data + prev_mend, 0, t->size - prev_mend))
return -EINVAL;
return 0;
}
int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs,
struct bpf_prog *prog,
const struct btf_func_model *model,
void *image, void *image_end)
{
u32 flags;
tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;
return arch_prepare_bpf_trampoline(NULL, image, image_end,
model, flags, tprogs, NULL);
}
static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 flags)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
const struct bpf_struct_ops *st_ops = st_map->st_ops;
struct bpf_struct_ops_value *uvalue, *kvalue;
const struct btf_member *member;
const struct btf_type *t = st_ops->type;
struct bpf_tramp_progs *tprogs = NULL;
void *udata, *kdata;
int prog_fd, err = 0;
void *image, *image_end;
u32 i;
if (flags)
return -EINVAL;
if (*(u32 *)key != 0)
return -E2BIG;
err = check_zero_holes(st_ops->value_type, value);
if (err)
return err;
uvalue = (struct bpf_struct_ops_value *)value;
err = check_zero_holes(t, uvalue->data);
if (err)
return err;
if (uvalue->state || refcount_read(&uvalue->refcnt))
return -EINVAL;
tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL);
if (!tprogs)
return -ENOMEM;
uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;
mutex_lock(&st_map->lock);
if (kvalue->state != BPF_STRUCT_OPS_STATE_INIT) {
err = -EBUSY;
goto unlock;
}
memcpy(uvalue, value, map->value_size);
udata = &uvalue->data;
kdata = &kvalue->data;
image = st_map->image;
image_end = st_map->image + PAGE_SIZE;
for_each_member(i, t, member) {
const struct btf_type *mtype, *ptype;
struct bpf_prog *prog;
u32 moff;
moff = __btf_member_bit_offset(t, member) / 8;
ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
if (ptype == module_type) {
if (*(void **)(udata + moff))
goto reset_unlock;
*(void **)(kdata + moff) = BPF_MODULE_OWNER;
continue;
}
err = st_ops->init_member(t, member, kdata, udata);
if (err < 0)
goto reset_unlock;
/* The ->init_member() has handled this member */
if (err > 0)
continue;
/* If st_ops->init_member does not handle it,
* we will only handle func ptrs and zero-ed members
* here. Reject everything else.
*/
/* All non func ptr member must be 0 */
if (!ptype || !btf_type_is_func_proto(ptype)) {
u32 msize;
mtype = btf_type_by_id(btf_vmlinux, member->type);
mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
if (IS_ERR(mtype)) {
err = PTR_ERR(mtype);
goto reset_unlock;
}
if (memchr_inv(udata + moff, 0, msize)) {
err = -EINVAL;
goto reset_unlock;
}
continue;
}
prog_fd = (int)(*(unsigned long *)(udata + moff));
/* Similar check as the attr->attach_prog_fd */
if (!prog_fd)
continue;
prog = bpf_prog_get(prog_fd);
if (IS_ERR(prog)) {
err = PTR_ERR(prog);
goto reset_unlock;
}
st_map->progs[i] = prog;
if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
prog->aux->attach_btf_id != st_ops->type_id ||
prog->expected_attach_type != i) {
err = -EINVAL;
goto reset_unlock;
}
err = bpf_struct_ops_prepare_trampoline(tprogs, prog,
&st_ops->func_models[i],
image, image_end);
if (err < 0)
goto reset_unlock;
*(void **)(kdata + moff) = image;
image += err;
/* put prog_id to udata */
*(unsigned long *)(udata + moff) = prog->aux->id;
}
refcount_set(&kvalue->refcnt, 1);
bpf_map_inc(map);
set_memory_ro((long)st_map->image, 1);
set_memory_x((long)st_map->image, 1);
err = st_ops->reg(kdata);
if (likely(!err)) {
/* Pair with smp_load_acquire() during lookup_elem().
* It ensures the above udata updates (e.g. prog->aux->id)
* can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
*/
smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_INUSE);
goto unlock;
}
/* Error during st_ops->reg(). It is very unlikely since
* the above init_member() should have caught it earlier
* before reg(). The only possibility is if there was a race
* in registering the struct_ops (under the same name) to
* a sub-system through different struct_ops's maps.
*/
set_memory_nx((long)st_map->image, 1);
set_memory_rw((long)st_map->image, 1);
bpf_map_put(map);
reset_unlock:
bpf_struct_ops_map_put_progs(st_map);
memset(uvalue, 0, map->value_size);
memset(kvalue, 0, map->value_size);
unlock:
kfree(tprogs);
mutex_unlock(&st_map->lock);
return err;
}
static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
{
enum bpf_struct_ops_state prev_state;
struct bpf_struct_ops_map *st_map;
st_map = (struct bpf_struct_ops_map *)map;
prev_state = cmpxchg(&st_map->kvalue.state,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE);
switch (prev_state) {
case BPF_STRUCT_OPS_STATE_INUSE:
st_map->st_ops->unreg(&st_map->kvalue.data);
if (refcount_dec_and_test(&st_map->kvalue.refcnt))
bpf_map_put(map);
return 0;
case BPF_STRUCT_OPS_STATE_TOBEFREE:
return -EINPROGRESS;
case BPF_STRUCT_OPS_STATE_INIT:
return -ENOENT;
default:
WARN_ON_ONCE(1);
/* Should never happen. Treat it as not found. */
return -ENOENT;
}
}
static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
struct seq_file *m)
{
void *value;
int err;
value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
if (!value)
return;
err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
if (!err) {
btf_type_seq_show(btf_vmlinux, map->btf_vmlinux_value_type_id,
value, m);
seq_puts(m, "\n");
}
kfree(value);
}
static void bpf_struct_ops_map_free(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
if (st_map->progs)
bpf_struct_ops_map_put_progs(st_map);
bpf_map_area_free(st_map->progs);
bpf_jit_free_exec(st_map->image);
bpf_map_area_free(st_map->uvalue);
bpf_map_area_free(st_map);
}
static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
{
if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
attr->map_flags || !attr->btf_vmlinux_value_type_id)
return -EINVAL;
return 0;
}
static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
{
const struct bpf_struct_ops *st_ops;
size_t st_map_size;
struct bpf_struct_ops_map *st_map;
const struct btf_type *t, *vt;
struct bpf_map *map;
if (!bpf_capable())
return ERR_PTR(-EPERM);
st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
if (!st_ops)
return ERR_PTR(-ENOTSUPP);
vt = st_ops->value_type;
if (attr->value_size != vt->size)
return ERR_PTR(-EINVAL);
t = st_ops->type;
st_map_size = sizeof(*st_map) +
/* kvalue stores the
* struct bpf_struct_ops_tcp_congestions_ops
*/
(vt->size - sizeof(struct bpf_struct_ops_value));
st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
if (!st_map)
return ERR_PTR(-ENOMEM);
st_map->st_ops = st_ops;
map = &st_map->map;
st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
st_map->progs =
bpf_map_area_alloc(btf_type_vlen(t) * sizeof(struct bpf_prog *),
NUMA_NO_NODE);
st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
if (!st_map->uvalue || !st_map->progs || !st_map->image) {
bpf_struct_ops_map_free(map);
return ERR_PTR(-ENOMEM);
}
mutex_init(&st_map->lock);
set_vm_flush_reset_perms(st_map->image);
bpf_map_init_from_attr(map, attr);
return map;
}
static int bpf_struct_ops_map_btf_id;
const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_alloc_check = bpf_struct_ops_map_alloc_check,
.map_alloc = bpf_struct_ops_map_alloc,
.map_free = bpf_struct_ops_map_free,
.map_get_next_key = bpf_struct_ops_map_get_next_key,
.map_lookup_elem = bpf_struct_ops_map_lookup_elem,
.map_delete_elem = bpf_struct_ops_map_delete_elem,
.map_update_elem = bpf_struct_ops_map_update_elem,
.map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
.map_btf_name = "bpf_struct_ops_map",
.map_btf_id = &bpf_struct_ops_map_btf_id,
};
/* "const void *" because some subsystem is
* passing a const (e.g. const struct tcp_congestion_ops *)
*/
bool bpf_struct_ops_get(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
return refcount_inc_not_zero(&kvalue->refcnt);
}
static void bpf_struct_ops_put_rcu(struct rcu_head *head)
{
struct bpf_struct_ops_map *st_map;
st_map = container_of(head, struct bpf_struct_ops_map, rcu);
bpf_map_put(&st_map->map);
}
void bpf_struct_ops_put(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
if (refcount_dec_and_test(&kvalue->refcnt)) {
struct bpf_struct_ops_map *st_map;
st_map = container_of(kvalue, struct bpf_struct_ops_map,
kvalue);
/* The struct_ops's function may switch to another struct_ops.
*
* For example, bpf_tcp_cc_x->init() may switch to
* another tcp_cc_y by calling
* setsockopt(TCP_CONGESTION, "tcp_cc_y").
* During the switch, bpf_struct_ops_put(tcp_cc_x) is called
* and its map->refcnt may reach 0 which then free its
* trampoline image while tcp_cc_x is still running.
*
* Thus, a rcu grace period is needed here.
*/
call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
}
}
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2020 Facebook
* Copyright 2020 Google LLC.
*/
#include <linux/pid.h>
#include <linux/sched.h>
#include <linux/rculist.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/bpf.h>
#include <linux/bpf_local_storage.h>
#include <linux/filter.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/fdtable.h>
#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(task_cache);
static DEFINE_PER_CPU(int, bpf_task_storage_busy);
static void bpf_task_storage_lock(void)
{
migrate_disable();
__this_cpu_inc(bpf_task_storage_busy);
}
static void bpf_task_storage_unlock(void)
{
__this_cpu_dec(bpf_task_storage_busy);
migrate_enable();
}
static bool bpf_task_storage_trylock(void)
{
migrate_disable();
if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
__this_cpu_dec(bpf_task_storage_busy);
migrate_enable();
return false;
}
return true;
}
static struct bpf_local_storage __rcu **task_storage_ptr(void *owner)
{
struct task_struct *task = owner;
return &task->bpf_storage;
}
static struct bpf_local_storage_data *
task_storage_lookup(struct task_struct *task, struct bpf_map *map,
bool cacheit_lockit)
{
struct bpf_local_storage *task_storage;
struct bpf_local_storage_map *smap;
task_storage =
rcu_dereference_check(task->bpf_storage, bpf_rcu_lock_held());
if (!task_storage)
return NULL;
smap = (struct bpf_local_storage_map *)map;
return bpf_local_storage_lookup(task_storage, smap, cacheit_lockit);
}
void bpf_task_storage_free(struct task_struct *task)
{
struct bpf_local_storage_elem *selem;
struct bpf_local_storage *local_storage;
bool free_task_storage = false;
struct hlist_node *n;
unsigned long flags;
rcu_read_lock();
local_storage = rcu_dereference(task->bpf_storage);
if (!local_storage) {
rcu_read_unlock();
return;
}
/* Neither the bpf_prog nor the bpf-map's syscall
* could be modifying the local_storage->list now.
* Thus, no elem can be added-to or deleted-from the
* local_storage->list by the bpf_prog or by the bpf-map's syscall.
*
* It is racing with bpf_local_storage_map_free() alone
* when unlinking elem from the local_storage->list and
* the map's bucket->list.
*/
bpf_task_storage_lock();
raw_spin_lock_irqsave(&local_storage->lock, flags);
hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
/* Always unlink from map before unlinking from
* local_storage.
*/
bpf_selem_unlink_map(selem);
free_task_storage = bpf_selem_unlink_storage_nolock(
local_storage, selem, false);
}
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
bpf_task_storage_unlock();
rcu_read_unlock();
/* free_task_storage should always be true as long as
* local_storage->list was non-empty.
*/
if (free_task_storage)
kfree_rcu(local_storage, rcu);
}
static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_local_storage_data *sdata;
struct task_struct *task;
unsigned int f_flags;
struct pid *pid;
int fd, err;
fd = *(int *)key;
pid = pidfd_get_pid(fd, &f_flags);
if (IS_ERR(pid))
return ERR_CAST(pid);
/* We should be in an RCU read side critical section, it should be safe
* to call pid_task.
*/
WARN_ON_ONCE(!rcu_read_lock_held());
task = pid_task(pid, PIDTYPE_PID);
if (!task) {
err = -ENOENT;
goto out;
}
bpf_task_storage_lock();
sdata = task_storage_lookup(task, map, true);
bpf_task_storage_unlock();
put_pid(pid);
return sdata ? sdata->data : NULL;
out:
put_pid(pid);
return ERR_PTR(err);
}
static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags)
{
struct bpf_local_storage_data *sdata;
struct task_struct *task;
unsigned int f_flags;
struct pid *pid;
int fd, err;
fd = *(int *)key;
pid = pidfd_get_pid(fd, &f_flags);
if (IS_ERR(pid))
return PTR_ERR(pid);
/* We should be in an RCU read side critical section, it should be safe
* to call pid_task.
*/
WARN_ON_ONCE(!rcu_read_lock_held());
task = pid_task(pid, PIDTYPE_PID);
if (!task) {
err = -ENOENT;
goto out;
}
bpf_task_storage_lock();
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value, map_flags);
bpf_task_storage_unlock();
err = PTR_ERR_OR_ZERO(sdata);
out:
put_pid(pid);
return err;
}
static int task_storage_delete(struct task_struct *task, struct bpf_map *map)
{
struct bpf_local_storage_data *sdata;
sdata = task_storage_lookup(task, map, false);
if (!sdata)
return -ENOENT;
bpf_selem_unlink(SELEM(sdata));
return 0;
}
static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
{
struct task_struct *task;
unsigned int f_flags;
struct pid *pid;
int fd, err;
fd = *(int *)key;
pid = pidfd_get_pid(fd, &f_flags);
if (IS_ERR(pid))
return PTR_ERR(pid);
/* We should be in an RCU read side critical section, it should be safe
* to call pid_task.
*/
WARN_ON_ONCE(!rcu_read_lock_held());
task = pid_task(pid, PIDTYPE_PID);
if (!task) {
err = -ENOENT;
goto out;
}
bpf_task_storage_lock();
err = task_storage_delete(task, map);
bpf_task_storage_unlock();
out:
put_pid(pid);
return err;
}
BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
task, void *, value, u64, flags)
{
struct bpf_local_storage_data *sdata;
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
return (unsigned long)NULL;
if (!task)
return (unsigned long)NULL;
if (!bpf_task_storage_trylock())
return (unsigned long)NULL;
sdata = task_storage_lookup(task, map, true);
if (sdata)
goto unlock;
/* only allocate new storage, when the task is refcounted */
if (refcount_read(&task->usage) &&
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
BPF_NOEXIST);
unlock:
bpf_task_storage_unlock();
return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL :
(unsigned long)sdata->data;
}
BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
task)
{
int ret;
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!task)
return -EINVAL;
if (!bpf_task_storage_trylock())
return -EBUSY;
/* This helper must only be called from places where the lifetime of the task
* is guaranteed. Either by being refcounted or by being protected
* by an RCU read-side critical section.
*/
ret = task_storage_delete(task, map);
bpf_task_storage_unlock();
return ret;
}
static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
return -ENOTSUPP;
}
static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
{
struct bpf_local_storage_map *smap;
smap = bpf_local_storage_map_alloc(attr);
if (IS_ERR(smap))
return ERR_CAST(smap);
smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache);
return &smap->map;
}
static void task_storage_map_free(struct bpf_map *map)
{
struct bpf_local_storage_map *smap;
smap = (struct bpf_local_storage_map *)map;
bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx);
bpf_local_storage_map_free(smap, &bpf_task_storage_busy);
}
static int task_storage_map_btf_id;
const struct bpf_map_ops task_storage_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = bpf_local_storage_map_alloc_check,
.map_alloc = task_storage_map_alloc,
.map_free = task_storage_map_free,
.map_get_next_key = notsupp_get_next_key,
.map_lookup_elem = bpf_pid_task_storage_lookup_elem,
.map_update_elem = bpf_pid_task_storage_update_elem,
.map_delete_elem = bpf_pid_task_storage_delete_elem,
.map_check_btf = bpf_local_storage_map_check_btf,
.map_btf_name = "bpf_local_storage_map",
.map_btf_id = &task_storage_map_btf_id,
.map_owner_storage_ptr = task_storage_ptr,
};
const struct bpf_func_proto bpf_task_storage_get_proto = {
.func = bpf_task_storage_get,
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_BTF_ID,
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
const struct bpf_func_proto bpf_task_storage_delete_proto = {
.func = bpf_task_storage_delete,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_BTF_ID,
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
};
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
*/
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/bpf_perf_event.h>
#include <linux/btf.h>
#include <linux/filter.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/kprobes.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/error-injection.h>
#include <linux/btf_ids.h>
#include <linux/bpf_lsm.h>
#include <net/bpf_sk_storage.h>
#include <uapi/linux/bpf.h>
#include <uapi/linux/btf.h>
#include <asm/tlb.h>
#include "trace_probe.h"
#include "trace.h"
#define CREATE_TRACE_POINTS
#include "bpf_trace.h"
#define bpf_event_rcu_dereference(p) \
rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex))
#ifdef CONFIG_MODULES
struct bpf_trace_module {
struct module *module;
struct list_head list;
};
static LIST_HEAD(bpf_trace_modules);
static DEFINE_MUTEX(bpf_module_mutex);
static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
{
struct bpf_raw_event_map *btp, *ret = NULL;
struct bpf_trace_module *btm;
unsigned int i;
mutex_lock(&bpf_module_mutex);
list_for_each_entry(btm, &bpf_trace_modules, list) {
for (i = 0; i < btm->module->num_bpf_raw_events; ++i) {
btp = &btm->module->bpf_raw_events[i];
if (!strcmp(btp->tp->name, name)) {
if (try_module_get(btm->module))
ret = btp;
goto out;
}
}
}
out:
mutex_unlock(&bpf_module_mutex);
return ret;
}
#else
static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
{
return NULL;
}
#endif /* CONFIG_MODULES */
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,
u64 flags, const struct btf **btf,
s32 *btf_id);
/**
* trace_call_bpf - invoke BPF program
* @call: tracepoint event
* @ctx: opaque context pointer
*
* kprobe handlers execute BPF programs via this helper.
* Can be used from static tracepoints in the future.
*
* Return: BPF programs always return an integer which is interpreted by
* kprobe handler as:
* 0 - return from kprobe (event is filtered out)
* 1 - store kprobe event into ring buffer
* Other values are reserved and currently alias to 1
*/
unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
unsigned int ret;
cant_sleep();
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
/*
* since some bpf program is already running on this cpu,
* don't call into another bpf program (same or different)
* and don't send kprobe event into ring-buffer,
* so return zero here
*/
ret = 0;
goto out;
}
/*
* Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
* to all call sites, we did a bpf_prog_array_valid() there to check
* whether call->prog_array is empty or not, which is
* a heuristic to speed up execution.
*
* If bpf_prog_array_valid() fetched prog_array was
* non-NULL, we go into trace_call_bpf() and do the actual
* proper rcu_dereference() under RCU lock.
* If it turns out that prog_array is NULL then, we bail out.
* For the opposite, if the bpf_prog_array_valid() fetched pointer
* was NULL, you'll skip the prog_array with the risk of missing
* out of events when it was updated in between this and the
* rcu_dereference() which is accepted risk.
*/
ret = BPF_PROG_RUN_ARRAY(call->prog_array, ctx, bpf_prog_run);
out:
__this_cpu_dec(bpf_prog_active);
return ret;
}
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
{
regs_set_return_value(regs, rc);
override_function_with_return(regs);
return 0;
}
static const struct bpf_func_proto bpf_override_return_proto = {
.func = bpf_override_return,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
};
#endif
static __always_inline int
bpf_probe_read_user_common(void *dst, u32 size, const void __user *unsafe_ptr)
{
int ret;
ret = copy_from_user_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
memset(dst, 0, size);
return ret;
}
BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size,
const void __user *, unsafe_ptr)
{
return bpf_probe_read_user_common(dst, size, unsafe_ptr);
}
const struct bpf_func_proto bpf_probe_read_user_proto = {
.func = bpf_probe_read_user,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
};
static __always_inline int
bpf_probe_read_user_str_common(void *dst, u32 size,
const void __user *unsafe_ptr)
{
int ret;
/*
* NB: We rely on strncpy_from_user() not copying junk past the NUL
* terminator into `dst`.
*
* strncpy_from_user() does long-sized strides in the fast path. If the
* strncpy does not mask out the bytes after the NUL in `unsafe_ptr`,
* then there could be junk after the NUL in `dst`. If user takes `dst`
* and keys a hash map with it, then semantically identical strings can
* occupy multiple entries in the map.
*/
ret = strncpy_from_user_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
memset(dst, 0, size);
return ret;
}
BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size,
const void __user *, unsafe_ptr)
{
return bpf_probe_read_user_str_common(dst, size, unsafe_ptr);
}
const struct bpf_func_proto bpf_probe_read_user_str_proto = {
.func = bpf_probe_read_user_str,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
};
static __always_inline int
bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr)
{
int ret;
ret = copy_from_kernel_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
memset(dst, 0, size);
return ret;
}
BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size,
const void *, unsafe_ptr)
{
return bpf_probe_read_kernel_common(dst, size, unsafe_ptr);
}
const struct bpf_func_proto bpf_probe_read_kernel_proto = {
.func = bpf_probe_read_kernel,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
};
static __always_inline int
bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr)
{
int ret;
/*
* The strncpy_from_kernel_nofault() call will likely not fill the
* entire buffer, but that's okay in this circumstance as we're probing
* arbitrary memory anyway similar to bpf_probe_read_*() and might
* as well probe the stack. Thus, memory is explicitly cleared
* only in error case, so that improper users ignoring return
* code altogether don't copy garbage; otherwise length of string
* is returned that can be used for bpf_perf_event_output() et al.
*/
ret = strncpy_from_kernel_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
memset(dst, 0, size);
return ret;
}
BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size,
const void *, unsafe_ptr)
{
return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr);
}
const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
.func = bpf_probe_read_kernel_str,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
};
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size,
const void *, unsafe_ptr)
{
if ((unsigned long)unsafe_ptr < TASK_SIZE) {
return bpf_probe_read_user_common(dst, size,
(__force void __user *)unsafe_ptr);
}
return bpf_probe_read_kernel_common(dst, size, unsafe_ptr);
}
static const struct bpf_func_proto bpf_probe_read_compat_proto = {
.func = bpf_probe_read_compat,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
};
BPF_CALL_3(bpf_probe_read_compat_str, void *, dst, u32, size,
const void *, unsafe_ptr)
{
if ((unsigned long)unsafe_ptr < TASK_SIZE) {
return bpf_probe_read_user_str_common(dst, size,
(__force void __user *)unsafe_ptr);
}
return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr);
}
static const struct bpf_func_proto bpf_probe_read_compat_str_proto = {
.func = bpf_probe_read_compat_str,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
};
#endif /* CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE */
BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
u32, size)
{
/*
* Ensure we're in user context which is safe for the helper to
* run. This helper has no business in a kthread.
*
* access_ok() should prevent writing to non-user memory, but in
* some situations (nommu, temporary switch, etc) access_ok() does
* not provide enough validation, hence the check on KERNEL_DS.
*
* nmi_uaccess_okay() ensures the probe is not run in an interim
* state, when the task or mm are switched. This is specifically
* required to prevent the use of temporary mm.
*/
if (unlikely(in_interrupt() ||
current->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
if (unlikely(uaccess_kernel()))
return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
return copy_to_user_nofault(unsafe_ptr, src, size);
}
static const struct bpf_func_proto bpf_probe_write_user_proto = {
.func = bpf_probe_write_user,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
{
if (!capable(CAP_SYS_ADMIN))
return NULL;
pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
current->comm, task_pid_nr(current));
return &bpf_probe_write_user_proto;
}
static DEFINE_RAW_SPINLOCK(trace_printk_lock);
#define MAX_TRACE_PRINTK_VARARGS 3
#define BPF_TRACE_PRINTK_SIZE 1024
BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
u64, arg2, u64, arg3)
{
u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 };
u32 *bin_args;
static char buf[BPF_TRACE_PRINTK_SIZE];
unsigned long flags;
int ret;
ret = bpf_bprintf_prepare(fmt, fmt_size, args, &bin_args,
MAX_TRACE_PRINTK_VARARGS);
if (ret < 0)
return ret;
raw_spin_lock_irqsave(&trace_printk_lock, flags);
ret = bstr_printf(buf, sizeof(buf), fmt, bin_args);
trace_bpf_trace_printk(buf);
raw_spin_unlock_irqrestore(&trace_printk_lock, flags);
bpf_bprintf_cleanup();
return ret;
}
static const struct bpf_func_proto bpf_trace_printk_proto = {
.func = bpf_trace_printk,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
};
static void __set_printk_clr_event(void)
{
/*
* This program might be calling bpf_trace_printk,
* so enable the associated bpf_trace/bpf_trace_printk event.
* Repeat this each time as it is possible a user has
* disabled bpf_trace_printk events. By loading a program
* calling bpf_trace_printk() however the user has expressed
* the intent to see such events.
*/
if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1))
pr_warn_ratelimited("could not enable bpf_trace_printk events");
}
const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
{
__set_printk_clr_event();
return &bpf_trace_printk_proto;
}
BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, data,
u32, data_len)
{
static char buf[BPF_TRACE_PRINTK_SIZE];
unsigned long flags;
int ret, num_args;
u32 *bin_args;
if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
(data_len && !data))
return -EINVAL;
num_args = data_len / 8;
ret = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args);
if (ret < 0)
return ret;
raw_spin_lock_irqsave(&trace_printk_lock, flags);
ret = bstr_printf(buf, sizeof(buf), fmt, bin_args);
trace_bpf_trace_printk(buf);
raw_spin_unlock_irqrestore(&trace_printk_lock, flags);
bpf_bprintf_cleanup();
return ret;
}
static const struct bpf_func_proto bpf_trace_vprintk_proto = {
.func = bpf_trace_vprintk,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE_OR_ZERO,
};
const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void)
{
__set_printk_clr_event();
return &bpf_trace_vprintk_proto;
}
BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
const void *, data, u32, data_len)
{
int err, num_args;
u32 *bin_args;
if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
(data_len && !data))
return -EINVAL;
num_args = data_len / 8;
err = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args);
if (err < 0)
return err;
seq_bprintf(m, fmt, bin_args);
bpf_bprintf_cleanup();
return seq_has_overflowed(m) ? -EOVERFLOW : 0;
}
BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file)
static const struct bpf_func_proto bpf_seq_printf_proto = {
.func = bpf_seq_printf,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &btf_seq_file_ids[0],
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
{
return seq_write(m, data, len) ? -EOVERFLOW : 0;
}
static const struct bpf_func_proto bpf_seq_write_proto = {
.func = bpf_seq_write,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &btf_seq_file_ids[0],
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
};
BPF_CALL_4(bpf_seq_printf_btf, struct seq_file *, m, struct btf_ptr *, ptr,
u32, btf_ptr_size, u64, flags)
{
const struct btf *btf;
s32 btf_id;
int ret;
ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id);
if (ret)
return ret;
return btf_type_seq_show_flags(btf, btf_id, ptr->ptr, m, flags);
}
static const struct bpf_func_proto bpf_seq_printf_btf_proto = {
.func = bpf_seq_printf_btf,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &btf_seq_file_ids[0],
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
static __always_inline int
get_map_perf_counter(struct bpf_map *map, u64 flags,
u64 *value, u64 *enabled, u64 *running)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
struct bpf_event_entry *ee;
if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
return -EINVAL;
if (index == BPF_F_CURRENT_CPU)
index = cpu;
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
ee = READ_ONCE(array->ptrs[index]);
if (!ee)
return -ENOENT;
return perf_event_read_local(ee->event, value, enabled, running);
}
BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
{
u64 value = 0;
int err;
err = get_map_perf_counter(map, flags, &value, NULL, NULL);
/*
* this api is ugly since we miss [-22..-2] range of valid
* counter values, but that's uapi
*/
if (err)
return err;
return value;
}
static const struct bpf_func_proto bpf_perf_event_read_proto = {
.func = bpf_perf_event_read,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_ANYTHING,
};
BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags,
struct bpf_perf_event_value *, buf, u32, size)
{
int err = -EINVAL;
if (unlikely(size != sizeof(struct bpf_perf_event_value)))
goto clear;
err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled,
&buf->running);
if (unlikely(err))
goto clear;
return 0;
clear:
memset(buf, 0, size);
return err;
}
static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
.func = bpf_perf_event_read_value,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_PTR_TO_UNINIT_MEM,
.arg4_type = ARG_CONST_SIZE,
};
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
u64 flags, struct perf_sample_data *sd)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
struct bpf_event_entry *ee;
struct perf_event *event;
if (index == BPF_F_CURRENT_CPU)
index = cpu;
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
ee = READ_ONCE(array->ptrs[index]);
if (!ee)
return -ENOENT;
event = ee->event;
if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
return -EINVAL;
if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
return perf_event_output(event, sd, regs);
}
/*
* Support executing tracepoints in normal, irq, and nmi context that each call
* bpf_perf_event_output
*/
struct bpf_trace_sample_data {
struct perf_sample_data sds[3];
};
static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
static DEFINE_PER_CPU(int, bpf_trace_nest_level);
BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags, void *, data, u64, size)
{
struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
struct perf_raw_record raw = {
.frag = {
.size = size,
.data = data,
},
};
struct perf_sample_data *sd;
int err;
if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
err = -EBUSY;
goto out;
}
sd = &sds->sds[nest_level - 1];
if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
err = -EINVAL;
goto out;
}
perf_sample_data_init(sd, 0, 0);
sd->raw = &raw;
err = __bpf_perf_event_output(regs, map, flags, sd);
out:
this_cpu_dec(bpf_trace_nest_level);
return err;
}
static const struct bpf_func_proto bpf_perf_event_output_proto = {
.func = bpf_perf_event_output,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
static DEFINE_PER_CPU(int, bpf_event_output_nest_level);
struct bpf_nested_pt_regs {
struct pt_regs regs[3];
};
static DEFINE_PER_CPU(struct bpf_nested_pt_regs, bpf_pt_regs);
static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds);
u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
int nest_level = this_cpu_inc_return(bpf_event_output_nest_level);
struct perf_raw_frag frag = {
.copy = ctx_copy,
.size = ctx_size,
.data = ctx,
};
struct perf_raw_record raw = {
.frag = {
{
.next = ctx_size ? &frag : NULL,
},
.size = meta_size,
.data = meta,
},
};
struct perf_sample_data *sd;
struct pt_regs *regs;
u64 ret;
if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) {
ret = -EBUSY;
goto out;
}
sd = this_cpu_ptr(&bpf_misc_sds.sds[nest_level - 1]);
regs = this_cpu_ptr(&bpf_pt_regs.regs[nest_level - 1]);
perf_fetch_caller_regs(regs);
perf_sample_data_init(sd, 0, 0);
sd->raw = &raw;
ret = __bpf_perf_event_output(regs, map, flags, sd);
out:
this_cpu_dec(bpf_event_output_nest_level);
return ret;
}
BPF_CALL_0(bpf_get_current_task)
{
return (long) current;
}
const struct bpf_func_proto bpf_get_current_task_proto = {
.func = bpf_get_current_task,
.gpl_only = true,
.ret_type = RET_INTEGER,
};
BPF_CALL_0(bpf_get_current_task_btf)
{
return (unsigned long) current;
}
const struct bpf_func_proto bpf_get_current_task_btf_proto = {
.func = bpf_get_current_task_btf,
.gpl_only = true,
.ret_type = RET_PTR_TO_BTF_ID,
.ret_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
};
BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task)
{
return (unsigned long) task_pt_regs(task);
}
BTF_ID_LIST(bpf_task_pt_regs_ids)
BTF_ID(struct, pt_regs)
const struct bpf_func_proto bpf_task_pt_regs_proto = {
.func = bpf_task_pt_regs,
.gpl_only = true,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.ret_type = RET_PTR_TO_BTF_ID,
.ret_btf_id = &bpf_task_pt_regs_ids[0],
};
BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct cgroup *cgrp;
if (unlikely(idx >= array->map.max_entries))
return -E2BIG;
cgrp = READ_ONCE(array->ptrs[idx]);
if (unlikely(!cgrp))
return -EAGAIN;
return task_under_cgroup_hierarchy(current, cgrp);
}
static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
.func = bpf_current_task_under_cgroup,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_ANYTHING,
};
struct send_signal_irq_work {
struct irq_work irq_work;
struct task_struct *task;
u32 sig;
enum pid_type type;
};
static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
static void do_bpf_send_signal(struct irq_work *entry)
{
struct send_signal_irq_work *work;
work = container_of(entry, struct send_signal_irq_work, irq_work);
group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type);
}
static int bpf_send_signal_common(u32 sig, enum pid_type type)
{
struct send_signal_irq_work *work = NULL;
/* Similar to bpf_probe_write_user, task needs to be
* in a sound condition and kernel memory access be
* permitted in order to send signal to the current
* task.
*/
if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
if (unlikely(uaccess_kernel()))
return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
if (irqs_disabled()) {
/* Do an early check on signal validity. Otherwise,
* the error is lost in deferred irq_work.
*/
if (unlikely(!valid_signal(sig)))
return -EINVAL;
work = this_cpu_ptr(&send_signal_work);
if (irq_work_is_busy(&work->irq_work))
return -EBUSY;
/* Add the current task, which is the target of sending signal,
* to the irq_work. The current task may change when queued
* irq works get executed.
*/
work->task = current;
work->sig = sig;
work->type = type;
irq_work_queue(&work->irq_work);
return 0;
}
return group_send_sig_info(sig, SEND_SIG_PRIV, current, type);
}
BPF_CALL_1(bpf_send_signal, u32, sig)
{
return bpf_send_signal_common(sig, PIDTYPE_TGID);
}
static const struct bpf_func_proto bpf_send_signal_proto = {
.func = bpf_send_signal,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
};
BPF_CALL_1(bpf_send_signal_thread, u32, sig)
{
return bpf_send_signal_common(sig, PIDTYPE_PID);
}
static const struct bpf_func_proto bpf_send_signal_thread_proto = {
.func = bpf_send_signal_thread,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
};
BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz)
{
long len;
char *p;
if (!sz)
return 0;
p = d_path(path, buf, sz);
if (IS_ERR(p)) {
len = PTR_ERR(p);
} else {
len = buf + sz - p;
memmove(buf, p, len);
}
return len;
}
BTF_SET_START(btf_allowlist_d_path)
#ifdef CONFIG_SECURITY
BTF_ID(func, security_file_permission)
BTF_ID(func, security_inode_getattr)
BTF_ID(func, security_file_open)
#endif
#ifdef CONFIG_SECURITY_PATH
BTF_ID(func, security_path_truncate)
#endif
BTF_ID(func, vfs_truncate)
BTF_ID(func, vfs_fallocate)
BTF_ID(func, dentry_open)
BTF_ID(func, vfs_getattr)
BTF_ID(func, filp_close)
BTF_SET_END(btf_allowlist_d_path)
static bool bpf_d_path_allowed(const struct bpf_prog *prog)
{
if (prog->type == BPF_PROG_TYPE_TRACING &&
prog->expected_attach_type == BPF_TRACE_ITER)
return true;
if (prog->type == BPF_PROG_TYPE_LSM)
return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
return btf_id_set_contains(&btf_allowlist_d_path,
prog->aux->attach_btf_id);
}
BTF_ID_LIST_SINGLE(bpf_d_path_btf_ids, struct, path)
static const struct bpf_func_proto bpf_d_path_proto = {
.func = bpf_d_path,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &bpf_d_path_btf_ids[0],
.arg2_type = ARG_PTR_TO_MEM,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.allowed = bpf_d_path_allowed,
};
#define BTF_F_ALL (BTF_F_COMPACT | BTF_F_NONAME | \
BTF_F_PTR_RAW | BTF_F_ZERO)
static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,
u64 flags, const struct btf **btf,
s32 *btf_id)
{
const struct btf_type *t;
if (unlikely(flags & ~(BTF_F_ALL)))
return -EINVAL;
if (btf_ptr_size != sizeof(struct btf_ptr))
return -EINVAL;
*btf = bpf_get_btf_vmlinux();
if (IS_ERR_OR_NULL(*btf))
return IS_ERR(*btf) ? PTR_ERR(*btf) : -EINVAL;
if (ptr->type_id > 0)
*btf_id = ptr->type_id;
else
return -EINVAL;
if (*btf_id > 0)
t = btf_type_by_id(*btf, *btf_id);
if (*btf_id <= 0 || !t)
return -ENOENT;
return 0;
}
BPF_CALL_5(bpf_snprintf_btf, char *, str, u32, str_size, struct btf_ptr *, ptr,
u32, btf_ptr_size, u64, flags)
{
const struct btf *btf;
s32 btf_id;
int ret;
ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id);
if (ret)
return ret;
return btf_type_snprintf_show(btf, btf_id, ptr->ptr, str, str_size,
flags);
}
const struct bpf_func_proto bpf_snprintf_btf_proto = {
.func = bpf_snprintf_btf,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_MEM,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE,
.arg5_type = ARG_ANYTHING,
};
BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx)
{
/* This helper call is inlined by verifier. */
return ((u64 *)ctx)[-2];
}
static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = {
.func = bpf_get_func_ip_tracing,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
};
BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs)
{
struct kprobe *kp = kprobe_running();
return kp ? (uintptr_t)kp->addr : 0;
}
static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
.func = bpf_get_func_ip_kprobe,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
};
BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx)
{
struct bpf_trace_run_ctx *run_ctx;
run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx);
return run_ctx->bpf_cookie;
}
static const struct bpf_func_proto bpf_get_attach_cookie_proto_trace = {
.func = bpf_get_attach_cookie_trace,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
};
BPF_CALL_1(bpf_get_attach_cookie_pe, struct bpf_perf_event_data_kern *, ctx)
{
return ctx->event->bpf_cookie;
}
static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = {
.func = bpf_get_attach_cookie_pe,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
};
BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
{
#ifndef CONFIG_X86
return -ENOENT;
#else
static const u32 br_entry_size = sizeof(struct perf_branch_entry);
u32 entry_cnt = size / br_entry_size;
entry_cnt = static_call(perf_snapshot_branch_stack)(buf, entry_cnt);
if (unlikely(flags))
return -EINVAL;
if (!entry_cnt)
return -ENOENT;
return entry_cnt * br_entry_size;
#endif
}
static const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
.func = bpf_get_branch_snapshot,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
};
BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value)
{
/* This helper call is inlined by verifier. */
u64 nr_args = ((u64 *)ctx)[-1];
if ((u64) n >= nr_args)
return -EINVAL;
*value = ((u64 *)ctx)[n];
return 0;
}
static const struct bpf_func_proto bpf_get_func_arg_proto = {
.func = get_func_arg,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_PTR_TO_LONG,
};
BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value)
{
/* This helper call is inlined by verifier. */
u64 nr_args = ((u64 *)ctx)[-1];
*value = ((u64 *)ctx)[nr_args];
return 0;
}
static const struct bpf_func_proto bpf_get_func_ret_proto = {
.func = get_func_ret,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_LONG,
};
BPF_CALL_1(get_func_arg_cnt, void *, ctx)
{
/* This helper call is inlined by verifier. */
return ((u64 *)ctx)[-1];
}
static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
.func = get_func_arg_cnt,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
};
static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_map_lookup_elem:
return &bpf_map_lookup_elem_proto;
case BPF_FUNC_map_update_elem:
return &bpf_map_update_elem_proto;
case BPF_FUNC_map_delete_elem:
return &bpf_map_delete_elem_proto;
case BPF_FUNC_map_push_elem:
return &bpf_map_push_elem_proto;
case BPF_FUNC_map_pop_elem:
return &bpf_map_pop_elem_proto;
case BPF_FUNC_map_peek_elem:
return &bpf_map_peek_elem_proto;
case BPF_FUNC_ktime_get_ns:
return &bpf_ktime_get_ns_proto;
case BPF_FUNC_ktime_get_boot_ns:
return &bpf_ktime_get_boot_ns_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
case BPF_FUNC_get_current_pid_tgid:
return &bpf_get_current_pid_tgid_proto;
case BPF_FUNC_get_current_task:
return &bpf_get_current_task_proto;
case BPF_FUNC_get_current_task_btf:
return &bpf_get_current_task_btf_proto;
case BPF_FUNC_task_pt_regs:
return &bpf_task_pt_regs_proto;
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_current_comm:
return &bpf_get_current_comm_proto;
case BPF_FUNC_trace_printk:
return bpf_get_trace_printk_proto();
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
case BPF_FUNC_get_numa_node_id:
return &bpf_get_numa_node_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
case BPF_FUNC_current_task_under_cgroup:
return &bpf_current_task_under_cgroup_proto;
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_probe_write_user:
return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
NULL : bpf_get_probe_write_proto();
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_proto;
case BPF_FUNC_probe_read_user_str:
return &bpf_probe_read_user_str_proto;
case BPF_FUNC_probe_read_kernel_str:
return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_str_proto;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
case BPF_FUNC_probe_read:
return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_compat_proto;
case BPF_FUNC_probe_read_str:
return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_compat_str_proto;
#endif
#ifdef CONFIG_CGROUPS
case BPF_FUNC_get_current_cgroup_id:
return &bpf_get_current_cgroup_id_proto;
case BPF_FUNC_get_current_ancestor_cgroup_id:
return &bpf_get_current_ancestor_cgroup_id_proto;
#endif
case BPF_FUNC_send_signal:
return &bpf_send_signal_proto;
case BPF_FUNC_send_signal_thread:
return &bpf_send_signal_thread_proto;
case BPF_FUNC_perf_event_read_value:
return &bpf_perf_event_read_value_proto;
case BPF_FUNC_get_ns_current_pid_tgid:
return &bpf_get_ns_current_pid_tgid_proto;
case BPF_FUNC_ringbuf_output:
return &bpf_ringbuf_output_proto;
case BPF_FUNC_ringbuf_reserve:
return &bpf_ringbuf_reserve_proto;
case BPF_FUNC_ringbuf_submit:
return &bpf_ringbuf_submit_proto;
case BPF_FUNC_ringbuf_discard:
return &bpf_ringbuf_discard_proto;
case BPF_FUNC_ringbuf_query:
return &bpf_ringbuf_query_proto;
case BPF_FUNC_jiffies64:
return &bpf_jiffies64_proto;
case BPF_FUNC_get_task_stack:
return &bpf_get_task_stack_proto;
case BPF_FUNC_copy_from_user:
return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
case BPF_FUNC_per_cpu_ptr:
return &bpf_per_cpu_ptr_proto;
case BPF_FUNC_this_cpu_ptr:
return &bpf_this_cpu_ptr_proto;
case BPF_FUNC_task_storage_get:
return &bpf_task_storage_get_proto;
case BPF_FUNC_task_storage_delete:
return &bpf_task_storage_delete_proto;
case BPF_FUNC_for_each_map_elem:
return &bpf_for_each_map_elem_proto;
case BPF_FUNC_snprintf:
return &bpf_snprintf_proto;
case BPF_FUNC_get_func_ip:
return &bpf_get_func_ip_proto_tracing;
case BPF_FUNC_get_branch_snapshot:
return &bpf_get_branch_snapshot_proto;
case BPF_FUNC_find_vma:
return &bpf_find_vma_proto;
case BPF_FUNC_trace_vprintk:
return bpf_get_trace_vprintk_proto();
default:
return bpf_base_func_proto(func_id);
}
}
static const struct bpf_func_proto *
kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_perf_event_output:
return &bpf_perf_event_output_proto;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto;
case BPF_FUNC_get_stack:
return &bpf_get_stack_proto;
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
case BPF_FUNC_override_return:
return &bpf_override_return_proto;
#endif
case BPF_FUNC_get_func_ip:
return &bpf_get_func_ip_proto_kprobe;
case BPF_FUNC_get_attach_cookie:
return &bpf_get_attach_cookie_proto_trace;
default:
return bpf_tracing_func_proto(func_id, prog);
}
}
/* bpf+kprobe programs can access fields of 'struct pt_regs' */
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
if (off < 0 || off >= sizeof(struct pt_regs))
return false;
if (type != BPF_READ)
return false;
if (off % size != 0)
return false;
/*
* Assertion for 32 bit to make sure last 8 byte access
* (BPF_DW) to the last 4 byte member is disallowed.
*/
if (off + size > sizeof(struct pt_regs))
return false;
return true;
}
const struct bpf_verifier_ops kprobe_verifier_ops = {
.get_func_proto = kprobe_prog_func_proto,
.is_valid_access = kprobe_prog_is_valid_access,
};
const struct bpf_prog_ops kprobe_prog_ops = {
};
BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
u64, flags, void *, data, u64, size)
{
struct pt_regs *regs = *(struct pt_regs **)tp_buff;
/*
* r1 points to perf tracepoint buffer where first 8 bytes are hidden
* from bpf program and contain a pointer to 'struct pt_regs'. Fetch it
* from there and call the same bpf_perf_event_output() helper inline.
*/
return ____bpf_perf_event_output(regs, map, flags, data, size);
}
static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
.func = bpf_perf_event_output_tp,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
u64, flags)
{
struct pt_regs *regs = *(struct pt_regs **)tp_buff;
/*
* Same comment as in bpf_perf_event_output_tp(), only that this time
* the other helper's function body cannot be inlined due to being
* external, thus we need to call raw helper function.
*/
return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
flags, 0, 0);
}
static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
.func = bpf_get_stackid_tp,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
};
BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size,
u64, flags)
{
struct pt_regs *regs = *(struct pt_regs **)tp_buff;
return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
(unsigned long) size, flags, 0);
}
static const struct bpf_func_proto bpf_get_stack_proto_tp = {
.func = bpf_get_stack_tp,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_UNINIT_MEM,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
static const struct bpf_func_proto *
tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_perf_event_output:
return &bpf_perf_event_output_proto_tp;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto_tp;
case BPF_FUNC_get_stack:
return &bpf_get_stack_proto_tp;
case BPF_FUNC_get_attach_cookie:
return &bpf_get_attach_cookie_proto_trace;
default:
return bpf_tracing_func_proto(func_id, prog);
}
}
static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
return false;
if (type != BPF_READ)
return false;
if (off % size != 0)
return false;
BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64));
return true;
}
const struct bpf_verifier_ops tracepoint_verifier_ops = {
.get_func_proto = tp_prog_func_proto,
.is_valid_access = tp_prog_is_valid_access,
};
const struct bpf_prog_ops tracepoint_prog_ops = {
};
BPF_CALL_3(bpf_perf_prog_read_value, struct bpf_perf_event_data_kern *, ctx,
struct bpf_perf_event_value *, buf, u32, size)
{
int err = -EINVAL;
if (unlikely(size != sizeof(struct bpf_perf_event_value)))
goto clear;
err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled,
&buf->running);
if (unlikely(err))
goto clear;
return 0;
clear:
memset(buf, 0, size);
return err;
}
static const struct bpf_func_proto bpf_perf_prog_read_value_proto = {
.func = bpf_perf_prog_read_value,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_UNINIT_MEM,
.arg3_type = ARG_CONST_SIZE,
};
BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx,
void *, buf, u32, size, u64, flags)
{
static const u32 br_entry_size = sizeof(struct perf_branch_entry);
struct perf_branch_stack *br_stack = ctx->data->br_stack;
u32 to_copy;
if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE))
return -EINVAL;
if (unlikely(!br_stack))
return -ENOENT;
if (flags & BPF_F_GET_BRANCH_RECORDS_SIZE)
return br_stack->nr * br_entry_size;
if (!buf || (size % br_entry_size != 0))
return -EINVAL;
to_copy = min_t(u32, br_stack->nr * br_entry_size, size);
memcpy(buf, br_stack->entries, to_copy);
return to_copy;
}
static const struct bpf_func_proto bpf_read_branch_records_proto = {
.func = bpf_read_branch_records,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM_OR_NULL,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
static const struct bpf_func_proto *
pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_perf_event_output:
return &bpf_perf_event_output_proto_tp;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto_pe;
case BPF_FUNC_get_stack:
return &bpf_get_stack_proto_pe;
case BPF_FUNC_perf_prog_read_value:
return &bpf_perf_prog_read_value_proto;
case BPF_FUNC_read_branch_records:
return &bpf_read_branch_records_proto;
case BPF_FUNC_get_attach_cookie:
return &bpf_get_attach_cookie_proto_pe;
default:
return bpf_tracing_func_proto(func_id, prog);
}
}
/*
* bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
* to avoid potential recursive reuse issue when/if tracepoints are added
* inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack.
*
* Since raw tracepoints run despite bpf_prog_active, support concurrent usage
* in normal, irq, and nmi context.
*/
struct bpf_raw_tp_regs {
struct pt_regs regs[3];
};
static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs);
static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level);
static struct pt_regs *get_bpf_raw_tp_regs(void)
{
struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);
if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
this_cpu_dec(bpf_raw_tp_nest_level);
return ERR_PTR(-EBUSY);
}
return &tp_regs->regs[nest_level - 1];
}
static void put_bpf_raw_tp_regs(void)
{
this_cpu_dec(bpf_raw_tp_nest_level);
}
BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
struct bpf_map *, map, u64, flags, void *, data, u64, size)
{
struct pt_regs *regs = get_bpf_raw_tp_regs();
int ret;
if (IS_ERR(regs))
return PTR_ERR(regs);
perf_fetch_caller_regs(regs);
ret = ____bpf_perf_event_output(regs, map, flags, data, size);
put_bpf_raw_tp_regs();
return ret;
}
static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
.func = bpf_perf_event_output_raw_tp,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
extern const struct bpf_func_proto bpf_skb_output_proto;
extern const struct bpf_func_proto bpf_xdp_output_proto;
BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
struct bpf_map *, map, u64, flags)
{
struct pt_regs *regs = get_bpf_raw_tp_regs();
int ret;
if (IS_ERR(regs))
return PTR_ERR(regs);
perf_fetch_caller_regs(regs);
/* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map,
flags, 0, 0);
put_bpf_raw_tp_regs();
return ret;
}
static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
.func = bpf_get_stackid_raw_tp,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
};
BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
void *, buf, u32, size, u64, flags)
{
struct pt_regs *regs = get_bpf_raw_tp_regs();
int ret;
if (IS_ERR(regs))
return PTR_ERR(regs);
perf_fetch_caller_regs(regs);
ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf,
(unsigned long) size, flags, 0);
put_bpf_raw_tp_regs();
return ret;
}
static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
.func = bpf_get_stack_raw_tp,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
static const struct bpf_func_proto *
raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_perf_event_output:
return &bpf_perf_event_output_proto_raw_tp;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto_raw_tp;
case BPF_FUNC_get_stack:
return &bpf_get_stack_proto_raw_tp;
default:
return bpf_tracing_func_proto(func_id, prog);
}
}
const struct bpf_func_proto *
tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
const struct bpf_func_proto *fn;
switch (func_id) {
#ifdef CONFIG_NET
case BPF_FUNC_skb_output:
return &bpf_skb_output_proto;
case BPF_FUNC_xdp_output:
return &bpf_xdp_output_proto;
case BPF_FUNC_skc_to_tcp6_sock:
return &bpf_skc_to_tcp6_sock_proto;
case BPF_FUNC_skc_to_tcp_sock:
return &bpf_skc_to_tcp_sock_proto;
case BPF_FUNC_skc_to_tcp_timewait_sock:
return &bpf_skc_to_tcp_timewait_sock_proto;
case BPF_FUNC_skc_to_tcp_request_sock:
return &bpf_skc_to_tcp_request_sock_proto;
case BPF_FUNC_skc_to_udp6_sock:
return &bpf_skc_to_udp6_sock_proto;
case BPF_FUNC_skc_to_unix_sock:
return &bpf_skc_to_unix_sock_proto;
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_tracing_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_tracing_proto;
case BPF_FUNC_sock_from_file:
return &bpf_sock_from_file_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_ptr_cookie_proto;
#endif
case BPF_FUNC_seq_printf:
return prog->expected_attach_type == BPF_TRACE_ITER ?
&bpf_seq_printf_proto :
NULL;
case BPF_FUNC_seq_write:
return prog->expected_attach_type == BPF_TRACE_ITER ?
&bpf_seq_write_proto :
NULL;
case BPF_FUNC_seq_printf_btf:
return prog->expected_attach_type == BPF_TRACE_ITER ?
&bpf_seq_printf_btf_proto :
NULL;
case BPF_FUNC_d_path:
return &bpf_d_path_proto;
case BPF_FUNC_get_func_arg:
return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_proto : NULL;
case BPF_FUNC_get_func_ret:
return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL;
case BPF_FUNC_get_func_arg_cnt:
return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL;
default:
fn = raw_tp_prog_func_proto(func_id, prog);
if (!fn && prog->expected_attach_type == BPF_TRACE_ITER)
fn = bpf_iter_get_func_proto(func_id, prog);
return fn;
}
}
static bool raw_tp_prog_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
return bpf_tracing_ctx_access(off, size, type);
}
static bool tracing_prog_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
}
int __weak bpf_prog_test_run_tracing(struct bpf_prog *prog,
const union bpf_attr *kattr,
union bpf_attr __user *uattr)
{
return -ENOTSUPP;
}
const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
.get_func_proto = raw_tp_prog_func_proto,
.is_valid_access = raw_tp_prog_is_valid_access,
};
const struct bpf_prog_ops raw_tracepoint_prog_ops = {
#ifdef CONFIG_NET
.test_run = bpf_prog_test_run_raw_tp,
#endif
};
const struct bpf_verifier_ops tracing_verifier_ops = {
.get_func_proto = tracing_prog_func_proto,
.is_valid_access = tracing_prog_is_valid_access,
};
const struct bpf_prog_ops tracing_prog_ops = {
.test_run = bpf_prog_test_run_tracing,
};
static bool raw_tp_writable_prog_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
if (off == 0) {
if (size != sizeof(u64) || type != BPF_READ)
return false;
info->reg_type = PTR_TO_TP_BUFFER;
}
return raw_tp_prog_is_valid_access(off, size, type, prog, info);
}
const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = {
.get_func_proto = raw_tp_prog_func_proto,
.is_valid_access = raw_tp_writable_prog_is_valid_access,
};
const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = {
};
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
const int size_u64 = sizeof(u64);
if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
return false;
if (type != BPF_READ)
return false;
if (off % size != 0) {
if (sizeof(unsigned long) != 4)
return false;
if (size != 8)
return false;
if (off % size != 4)
return false;
}
switch (off) {
case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
bpf_ctx_record_field_size(info, size_u64);
if (!bpf_ctx_narrow_access_ok(off, size, size_u64))
return false;
break;
case bpf_ctx_range(struct bpf_perf_event_data, addr):
bpf_ctx_record_field_size(info, size_u64);
if (!bpf_ctx_narrow_access_ok(off, size, size_u64))
return false;
break;
default:
if (size != sizeof(long))
return false;
}
return true;
}
static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
switch (si->off) {
case offsetof(struct bpf_perf_event_data, sample_period):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
data), si->dst_reg, si->src_reg,
offsetof(struct bpf_perf_event_data_kern, data));
*insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
bpf_target_off(struct perf_sample_data, period, 8,
target_size));
break;
case offsetof(struct bpf_perf_event_data, addr):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
data), si->dst_reg, si->src_reg,
offsetof(struct bpf_perf_event_data_kern, data));
*insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
bpf_target_off(struct perf_sample_data, addr, 8,
target_size));
break;
default:
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
regs), si->dst_reg, si->src_reg,
offsetof(struct bpf_perf_event_data_kern, regs));
*insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg,
si->off);
break;
}
return insn - insn_buf;
}
const struct bpf_verifier_ops perf_event_verifier_ops = {
.get_func_proto = pe_prog_func_proto,
.is_valid_access = pe_prog_is_valid_access,
.convert_ctx_access = pe_prog_convert_ctx_access,
};
const struct bpf_prog_ops perf_event_prog_ops = {
};
static DEFINE_MUTEX(bpf_event_mutex);
#define BPF_TRACE_MAX_PROGS 64
int perf_event_attach_bpf_prog(struct perf_event *event,
struct bpf_prog *prog,
u64 bpf_cookie)
{
struct bpf_prog_array *old_array;
struct bpf_prog_array *new_array;
int ret = -EEXIST;
/*
* Kprobe override only works if they are on the function entry,
* and only if they are on the opt-in list.
*/
if (prog->kprobe_override &&
(!trace_kprobe_on_func_entry(event->tp_event) ||
!trace_kprobe_error_injectable(event->tp_event)))
return -EINVAL;
mutex_lock(&bpf_event_mutex);
if (event->prog)
goto unlock;
old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
if (old_array &&
bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) {
ret = -E2BIG;
goto unlock;
}
ret = bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array);
if (ret < 0)
goto unlock;
/* set the new array to event->tp_event and set event->prog */
event->prog = prog;
event->bpf_cookie = bpf_cookie;
rcu_assign_pointer(event->tp_event->prog_array, new_array);
bpf_prog_array_free(old_array);
unlock:
mutex_unlock(&bpf_event_mutex);
return ret;
}
void perf_event_detach_bpf_prog(struct perf_event *event)
{
struct bpf_prog_array *old_array;
struct bpf_prog_array *new_array;
int ret;
mutex_lock(&bpf_event_mutex);
if (!event->prog)
goto unlock;
old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array);
if (ret == -ENOENT)
goto unlock;
if (ret < 0) {
bpf_prog_array_delete_safe(old_array, event->prog);
} else {
rcu_assign_pointer(event->tp_event->prog_array, new_array);
bpf_prog_array_free(old_array);
}
bpf_prog_put(event->prog);
event->prog = NULL;
unlock:
mutex_unlock(&bpf_event_mutex);
}
int perf_event_query_prog_array(struct perf_event *event, void __user *info)
{
struct perf_event_query_bpf __user *uquery = info;
struct perf_event_query_bpf query = {};
struct bpf_prog_array *progs;
u32 *ids, prog_cnt, ids_len;
int ret;
if (!perfmon_capable())
return -EPERM;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -EINVAL;
if (copy_from_user(&query, uquery, sizeof(query)))
return -EFAULT;
ids_len = query.ids_len;
if (ids_len > BPF_TRACE_MAX_PROGS)
return -E2BIG;
ids = kcalloc(ids_len, sizeof(u32), GFP_USER | __GFP_NOWARN);
if (!ids)
return -ENOMEM;
/*
* The above kcalloc returns ZERO_SIZE_PTR when ids_len = 0, which
* is required when user only wants to check for uquery->prog_cnt.
* There is no need to check for it since the case is handled
* gracefully in bpf_prog_array_copy_info.
*/
mutex_lock(&bpf_event_mutex);
progs = bpf_event_rcu_dereference(event->tp_event->prog_array);
ret = bpf_prog_array_copy_info(progs, ids, ids_len, &prog_cnt);
mutex_unlock(&bpf_event_mutex);
if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) ||
copy_to_user(uquery->ids, ids, ids_len * sizeof(u32)))
ret = -EFAULT;
kfree(ids);
return ret;
}
extern struct bpf_raw_event_map __start__bpf_raw_tp[];
extern struct bpf_raw_event_map __stop__bpf_raw_tp[];
struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name)
{
struct bpf_raw_event_map *btp = __start__bpf_raw_tp;
for (; btp < __stop__bpf_raw_tp; btp++) {
if (!strcmp(btp->tp->name, name))
return btp;
}
return bpf_get_raw_tracepoint_module(name);
}
void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
{
struct module *mod;
preempt_disable();
mod = __module_address((unsigned long)btp);
module_put(mod);
preempt_enable();
}
static __always_inline
void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
{
cant_sleep();
rcu_read_lock();
(void) bpf_prog_run(prog, args);
rcu_read_unlock();
}
#define UNPACK(...) __VA_ARGS__
#define REPEAT_1(FN, DL, X, ...) FN(X)
#define REPEAT_2(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__)
#define REPEAT_3(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_2(FN, DL, __VA_ARGS__)
#define REPEAT_4(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_3(FN, DL, __VA_ARGS__)
#define REPEAT_5(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_4(FN, DL, __VA_ARGS__)
#define REPEAT_6(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_5(FN, DL, __VA_ARGS__)
#define REPEAT_7(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_6(FN, DL, __VA_ARGS__)
#define REPEAT_8(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_7(FN, DL, __VA_ARGS__)
#define REPEAT_9(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_8(FN, DL, __VA_ARGS__)
#define REPEAT_10(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_9(FN, DL, __VA_ARGS__)
#define REPEAT_11(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_10(FN, DL, __VA_ARGS__)
#define REPEAT_12(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_11(FN, DL, __VA_ARGS__)
#define REPEAT(X, FN, DL, ...) REPEAT_##X(FN, DL, __VA_ARGS__)
#define SARG(X) u64 arg##X
#define COPY(X) args[X] = arg##X
#define __DL_COM (,)
#define __DL_SEM (;)
#define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
#define BPF_TRACE_DEFN_x(x) \
void bpf_trace_run##x(struct bpf_prog *prog, \
REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \
{ \
u64 args[x]; \
REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \
__bpf_trace_run(prog, args); \
} \
EXPORT_SYMBOL_GPL(bpf_trace_run##x)
BPF_TRACE_DEFN_x(1);
BPF_TRACE_DEFN_x(2);
BPF_TRACE_DEFN_x(3);
BPF_TRACE_DEFN_x(4);
BPF_TRACE_DEFN_x(5);
BPF_TRACE_DEFN_x(6);
BPF_TRACE_DEFN_x(7);
BPF_TRACE_DEFN_x(8);
BPF_TRACE_DEFN_x(9);
BPF_TRACE_DEFN_x(10);
BPF_TRACE_DEFN_x(11);
BPF_TRACE_DEFN_x(12);
static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
{
struct tracepoint *tp = btp->tp;
/*
* check that program doesn't access arguments beyond what's
* available in this tracepoint
*/
if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64))
return -EINVAL;
if (prog->aux->max_tp_access > btp->writable_size)
return -EINVAL;
return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func,
prog);
}
int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
{
return __bpf_probe_register(btp, prog);
}
int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
{
return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
}
int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
u32 *fd_type, const char **buf,
u64 *probe_offset, u64 *probe_addr)
{
bool is_tracepoint, is_syscall_tp;
struct bpf_prog *prog;
int flags, err = 0;
prog = event->prog;
if (!prog)
return -ENOENT;
/* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
return -EOPNOTSUPP;
*prog_id = prog->aux->id;
flags = event->tp_event->flags;
is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
is_syscall_tp = is_syscall_trace_event(event->tp_event);
if (is_tracepoint || is_syscall_tp) {
*buf = is_tracepoint ? event->tp_event->tp->name
: event->tp_event->name;
*fd_type = BPF_FD_TYPE_TRACEPOINT;
*probe_offset = 0x0;
*probe_addr = 0x0;
} else {
/* kprobe/uprobe */
err = -EOPNOTSUPP;
#ifdef CONFIG_KPROBE_EVENTS
if (flags & TRACE_EVENT_FL_KPROBE)
err = bpf_get_kprobe_info(event, fd_type, buf,
probe_offset, probe_addr,
event->attr.type == PERF_TYPE_TRACEPOINT);
#endif
#ifdef CONFIG_UPROBE_EVENTS
if (flags & TRACE_EVENT_FL_UPROBE)
err = bpf_get_uprobe_info(event, fd_type, buf,
probe_offset,
event->attr.type == PERF_TYPE_TRACEPOINT);
#endif
}
return err;
}
static int __init send_signal_irq_work_init(void)
{
int cpu;
struct send_signal_irq_work *work;
for_each_possible_cpu(cpu) {
work = per_cpu_ptr(&send_signal_work, cpu);
init_irq_work(&work->irq_work, do_bpf_send_signal);
}
return 0;
}
subsys_initcall(send_signal_irq_work_init);
#ifdef CONFIG_MODULES
static int bpf_event_notify(struct notifier_block *nb, unsigned long op,
void *module)
{
struct bpf_trace_module *btm, *tmp;
struct module *mod = module;
int ret = 0;
if (mod->num_bpf_raw_events == 0 ||
(op != MODULE_STATE_COMING && op != MODULE_STATE_GOING))
goto out;
mutex_lock(&bpf_module_mutex);
switch (op) {
case MODULE_STATE_COMING:
btm = kzalloc(sizeof(*btm), GFP_KERNEL);
if (btm) {
btm->module = module;
list_add(&btm->list, &bpf_trace_modules);
} else {
ret = -ENOMEM;
}
break;
case MODULE_STATE_GOING:
list_for_each_entry_safe(btm, tmp, &bpf_trace_modules, list) {
if (btm->module == module) {
list_del(&btm->list);
kfree(btm);
break;
}
}
break;
}
mutex_unlock(&bpf_module_mutex);
out:
return notifier_from_errno(ret);
}
static struct notifier_block bpf_module_nb = {
.notifier_call = bpf_event_notify,
};
static int __init bpf_event_init(void)
{
register_module_notifier(&bpf_module_nb);
return 0;
}
fs_initcall(bpf_event_init);
#endif /* CONFIG_MODULES */
// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
#include <linux/console.h>
#include <linux/errno.h>
#include <linux/string.h>
#include "console_cmdline.h"
#include "braille.h"
int _braille_console_setup(char **str, char **brl_options)
{
size_t len;
len = str_has_prefix(*str, "brl,");
if (len) {
*brl_options = "";
*str += len;
return 0;
}
len = str_has_prefix(*str, "brl=");
if (len) {
*brl_options = *str + len;
*str = strchr(*brl_options, ',');
if (!*str) {
pr_err("need port name after brl=\n");
return -EINVAL;
}
*((*str)++) = 0;
}
return 0;
}
int
_braille_register_console(struct console *console, struct console_cmdline *c)
{
int rtn = 0;
if (c->brl_options) {
console->flags |= CON_BRL;
rtn = braille_register_console(console, c->index, c->options,
c->brl_options);
}
return rtn;
}
int
_braille_unregister_console(struct console *console)
{
if (console->flags & CON_BRL)
return braille_unregister_console(console);
return 0;
}
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2018 Facebook */
#include <uapi/linux/btf.h>
#include <uapi/linux/bpf.h>
#include <uapi/linux/bpf_perf_event.h>
#include <uapi/linux/types.h>
#include <linux/seq_file.h>
#include <linux/compiler.h>
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/idr.h>
#include <linux/sort.h>
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/skmsg.h>
#include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <net/sock.h>
#include "../tools/lib/bpf/relo_core.h"
/* BTF (BPF Type Format) is the meta data format which describes
* the data types of BPF program/map. Hence, it basically focus
* on the C programming language which the modern BPF is primary
* using.
*
* ELF Section:
* ~~~~~~~~~~~
* The BTF data is stored under the ".BTF" ELF section
*
* struct btf_type:
* ~~~~~~~~~~~~~~~
* Each 'struct btf_type' object describes a C data type.
* Depending on the type it is describing, a 'struct btf_type'
* object may be followed by more data. F.e.
* To describe an array, 'struct btf_type' is followed by
* 'struct btf_array'.
*
* 'struct btf_type' and any extra data following it are
* 4 bytes aligned.
*
* Type section:
* ~~~~~~~~~~~~~
* The BTF type section contains a list of 'struct btf_type' objects.
* Each one describes a C type. Recall from the above section
* that a 'struct btf_type' object could be immediately followed by extra
* data in order to describe some particular C types.
*
* type_id:
* ~~~~~~~
* Each btf_type object is identified by a type_id. The type_id
* is implicitly implied by the location of the btf_type object in
* the BTF type section. The first one has type_id 1. The second
* one has type_id 2...etc. Hence, an earlier btf_type has
* a smaller type_id.
*
* A btf_type object may refer to another btf_type object by using
* type_id (i.e. the "type" in the "struct btf_type").
*
* NOTE that we cannot assume any reference-order.
* A btf_type object can refer to an earlier btf_type object
* but it can also refer to a later btf_type object.
*
* For example, to describe "const void *". A btf_type
* object describing "const" may refer to another btf_type
* object describing "void *". This type-reference is done
* by specifying type_id:
*
* [1] CONST (anon) type_id=2
* [2] PTR (anon) type_id=0
*
* The above is the btf_verifier debug log:
* - Each line started with "[?]" is a btf_type object
* - [?] is the type_id of the btf_type object.
* - CONST/PTR is the BTF_KIND_XXX
* - "(anon)" is the name of the type. It just
* happens that CONST and PTR has no name.
* - type_id=XXX is the 'u32 type' in btf_type
*
* NOTE: "void" has type_id 0
*
* String section:
* ~~~~~~~~~~~~~~
* The BTF string section contains the names used by the type section.
* Each string is referred by an "offset" from the beginning of the
* string section.
*
* Each string is '\0' terminated.
*
* The first character in the string section must be '\0'
* which is used to mean 'anonymous'. Some btf_type may not
* have a name.
*/
/* BTF verification:
*
* To verify BTF data, two passes are needed.
*
* Pass #1
* ~~~~~~~
* The first pass is to collect all btf_type objects to
* an array: "btf->types".
*
* Depending on the C type that a btf_type is describing,
* a btf_type may be followed by extra data. We don't know
* how many btf_type is there, and more importantly we don't
* know where each btf_type is located in the type section.
*
* Without knowing the location of each type_id, most verifications
* cannot be done. e.g. an earlier btf_type may refer to a later
* btf_type (recall the "const void *" above), so we cannot
* check this type-reference in the first pass.
*
* In the first pass, it still does some verifications (e.g.
* checking the name is a valid offset to the string section).
*
* Pass #2
* ~~~~~~~
* The main focus is to resolve a btf_type that is referring
* to another type.
*
* We have to ensure the referring type:
* 1) does exist in the BTF (i.e. in btf->types[])
* 2) does not cause a loop:
* struct A {
* struct B b;
* };
*
* struct B {
* struct A a;
* };
*
* btf_type_needs_resolve() decides if a btf_type needs
* to be resolved.
*
* The needs_resolve type implements the "resolve()" ops which
* essentially does a DFS and detects backedge.
*
* During resolve (or DFS), different C types have different
* "RESOLVED" conditions.
*
* When resolving a BTF_KIND_STRUCT, we need to resolve all its
* members because a member is always referring to another
* type. A struct's member can be treated as "RESOLVED" if
* it is referring to a BTF_KIND_PTR. Otherwise, the
* following valid C struct would be rejected:
*
* struct A {
* int m;
* struct A *a;
* };
*
* When resolving a BTF_KIND_PTR, it needs to keep resolving if
* it is referring to another BTF_KIND_PTR. Otherwise, we cannot
* detect a pointer loop, e.g.:
* BTF_KIND_CONST -> BTF_KIND_PTR -> BTF_KIND_CONST -> BTF_KIND_PTR +
* ^ |
* +-----------------------------------------+
*
*/
#define BITS_PER_U128 (sizeof(u64) * BITS_PER_BYTE * 2)
#define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1)
#define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK)
#define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3)
#define BITS_ROUNDUP_BYTES(bits) \
(BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
#define BTF_INFO_MASK 0x9f00ffff
#define BTF_INT_MASK 0x0fffffff
#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE)
#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET)
/* 16MB for 64k structs and each has 16 members and
* a few MB spaces for the string section.
* The hard limit is S32_MAX.
*/
#define BTF_MAX_SIZE (16 * 1024 * 1024)
#define for_each_member_from(i, from, struct_type, member) \
for (i = from, member = btf_type_member(struct_type) + from; \
i < btf_type_vlen(struct_type); \
i++, member++)
#define for_each_vsi_from(i, from, struct_type, member) \
for (i = from, member = btf_type_var_secinfo(struct_type) + from; \
i < btf_type_vlen(struct_type); \
i++, member++)
DEFINE_IDR(btf_idr);
DEFINE_SPINLOCK(btf_idr_lock);
struct btf {
void *data;
struct btf_type **types;
u32 *resolved_ids;
u32 *resolved_sizes;
const char *strings;
void *nohdr_data;
struct btf_header hdr;
u32 nr_types; /* includes VOID for base BTF */
u32 types_size;
u32 data_size;
refcount_t refcnt;
u32 id;
struct rcu_head rcu;
/* split BTF support */
struct btf *base_btf;
u32 start_id; /* first type ID in this BTF (0 for base BTF) */
u32 start_str_off; /* first string offset (0 for base BTF) */
char name[MODULE_NAME_LEN];
bool kernel_btf;
};
enum verifier_phase {
CHECK_META,
CHECK_TYPE,
};
struct resolve_vertex {
const struct btf_type *t;
u32 type_id;
u16 next_member;
};
enum visit_state {
NOT_VISITED,
VISITED,
RESOLVED,
};
enum resolve_mode {
RESOLVE_TBD, /* To Be Determined */
RESOLVE_PTR, /* Resolving for Pointer */
RESOLVE_STRUCT_OR_ARRAY, /* Resolving for struct/union
* or array
*/
};
#define MAX_RESOLVE_DEPTH 32
struct btf_sec_info {
u32 off;
u32 len;
};
struct btf_verifier_env {
struct btf *btf;
u8 *visit_states;
struct resolve_vertex stack[MAX_RESOLVE_DEPTH];
struct bpf_verifier_log log;
u32 log_type_id;
u32 top_stack;
enum verifier_phase phase;
enum resolve_mode resolve_mode;
};
static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_UNKN] = "UNKNOWN",
[BTF_KIND_INT] = "INT",
[BTF_KIND_PTR] = "PTR",
[BTF_KIND_ARRAY] = "ARRAY",
[BTF_KIND_STRUCT] = "STRUCT",
[BTF_KIND_UNION] = "UNION",
[BTF_KIND_ENUM] = "ENUM",
[BTF_KIND_FWD] = "FWD",
[BTF_KIND_TYPEDEF] = "TYPEDEF",
[BTF_KIND_VOLATILE] = "VOLATILE",
[BTF_KIND_CONST] = "CONST",
[BTF_KIND_RESTRICT] = "RESTRICT",
[BTF_KIND_FUNC] = "FUNC",
[BTF_KIND_FUNC_PROTO] = "FUNC_PROTO",
[BTF_KIND_VAR] = "VAR",
[BTF_KIND_DATASEC] = "DATASEC",
[BTF_KIND_FLOAT] = "FLOAT",
[BTF_KIND_DECL_TAG] = "DECL_TAG",
[BTF_KIND_TYPE_TAG] = "TYPE_TAG",
};
const char *btf_type_str(const struct btf_type *t)
{
return btf_kind_str[BTF_INFO_KIND(t->info)];
}
/* Chunk size we use in safe copy of data to be shown. */
#define BTF_SHOW_OBJ_SAFE_SIZE 32
/*
* This is the maximum size of a base type value (equivalent to a
* 128-bit int); if we are at the end of our safe buffer and have
* less than 16 bytes space we can't be assured of being able
* to copy the next type safely, so in such cases we will initiate
* a new copy.
*/
#define BTF_SHOW_OBJ_BASE_TYPE_SIZE 16
/* Type name size */
#define BTF_SHOW_NAME_SIZE 80
/*
* Common data to all BTF show operations. Private show functions can add
* their own data to a structure containing a struct btf_show and consult it
* in the show callback. See btf_type_show() below.
*
* One challenge with showing nested data is we want to skip 0-valued
* data, but in order to figure out whether a nested object is all zeros
* we need to walk through it. As a result, we need to make two passes
* when handling structs, unions and arrays; the first path simply looks
* for nonzero data, while the second actually does the display. The first
* pass is signalled by show->state.depth_check being set, and if we
* encounter a non-zero value we set show->state.depth_to_show to
* the depth at which we encountered it. When we have completed the
* first pass, we will know if anything needs to be displayed if
* depth_to_show > depth. See btf_[struct,array]_show() for the
* implementation of this.
*
* Another problem is we want to ensure the data for display is safe to
* access. To support this, the anonymous "struct {} obj" tracks the data
* object and our safe copy of it. We copy portions of the data needed
* to the object "copy" buffer, but because its size is limited to
* BTF_SHOW_OBJ_COPY_LEN bytes, multiple copies may be required as we
* traverse larger objects for display.
*
* The various data type show functions all start with a call to
* btf_show_start_type() which returns a pointer to the safe copy
* of the data needed (or if BTF_SHOW_UNSAFE is specified, to the
* raw data itself). btf_show_obj_safe() is responsible for
* using copy_from_kernel_nofault() to update the safe data if necessary
* as we traverse the object's data. skbuff-like semantics are
* used:
*
* - obj.head points to the start of the toplevel object for display
* - obj.size is the size of the toplevel object
* - obj.data points to the current point in the original data at
* which our safe data starts. obj.data will advance as we copy
* portions of the data.
*
* In most cases a single copy will suffice, but larger data structures
* such as "struct task_struct" will require many copies. The logic in
* btf_show_obj_safe() handles the logic that determines if a new
* copy_from_kernel_nofault() is needed.
*/
struct btf_show {
u64 flags;
void *target; /* target of show operation (seq file, buffer) */
void (*showfn)(struct btf_show *show, const char *fmt, va_list args);
const struct btf *btf;
/* below are used during iteration */
struct {
u8 depth;
u8 depth_to_show;
u8 depth_check;
u8 array_member:1,
array_terminated:1;
u16 array_encoding;
u32 type_id;
int status; /* non-zero for error */
const struct btf_type *type;
const struct btf_member *member;
char name[BTF_SHOW_NAME_SIZE]; /* space for member name/type */
} state;
struct {
u32 size;
void *head;
void *data;
u8 safe[BTF_SHOW_OBJ_SAFE_SIZE];
} obj;
};
struct btf_kind_operations {
s32 (*check_meta)(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left);
int (*resolve)(struct btf_verifier_env *env,
const struct resolve_vertex *v);
int (*check_member)(struct btf_verifier_env *env,
const struct btf_type *struct_type,
const struct btf_member *member,
const struct btf_type *member_type);
int (*check_kflag_member)(struct btf_verifier_env *env,
const struct btf_type *struct_type,
const struct btf_member *member,
const struct btf_type *member_type);
void (*log_details)(struct btf_verifier_env *env,
const struct btf_type *t);
void (*show)(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offsets,
struct btf_show *show);
};
static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
static struct btf_type btf_void;
static int btf_resolve(struct btf_verifier_env *env,
const struct btf_type *t, u32 type_id);
static bool btf_type_is_modifier(const struct btf_type *t)
{
/* Some of them is not strictly a C modifier
* but they are grouped into the same bucket
* for BTF concern:
* A type (t) that refers to another
* type through t->type AND its size cannot
* be determined without following the t->type.
*
* ptr does not fall into this bucket
* because its size is always sizeof(void *).
*/
switch (BTF_INFO_KIND(t->info)) {
case BTF_KIND_TYPEDEF:
case BTF_KIND_VOLATILE:
case BTF_KIND_CONST:
case BTF_KIND_RESTRICT:
case BTF_KIND_TYPE_TAG:
return true;
}
return false;
}
bool btf_type_is_void(const struct btf_type *t)
{
return t == &btf_void;
}
static bool btf_type_is_fwd(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
}
static bool btf_type_nosize(const struct btf_type *t)
{
return btf_type_is_void(t) || btf_type_is_fwd(t) ||
btf_type_is_func(t) || btf_type_is_func_proto(t);
}
static bool btf_type_nosize_or_null(const struct btf_type *t)
{
return !t || btf_type_nosize(t);
}
static bool __btf_type_is_struct(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
}
static bool btf_type_is_array(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
}
static bool btf_type_is_datasec(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
}
static bool btf_type_is_decl_tag(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG;
}
static bool btf_type_is_decl_tag_target(const struct btf_type *t)
{
return btf_type_is_func(t) || btf_type_is_struct(t) ||
btf_type_is_var(t) || btf_type_is_typedef(t);
}
u32 btf_nr_types(const struct btf *btf)
{
u32 total = 0;
while (btf) {
total += btf->nr_types;
btf = btf->base_btf;
}
return total;
}
s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
{
const struct btf_type *t;
const char *tname;
u32 i, total;
total = btf_nr_types(btf);
for (i = 1; i < total; i++) {
t = btf_type_by_id(btf, i);
if (BTF_INFO_KIND(t->info) != kind)
continue;
tname = btf_name_by_offset(btf, t->name_off);
if (!strcmp(tname, name))
return i;
}
return -ENOENT;
}
const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
u32 id, u32 *res_id)
{
const struct btf_type *t = btf_type_by_id(btf, id);
while (btf_type_is_modifier(t)) {
id = t->type;
t = btf_type_by_id(btf, t->type);
}
if (res_id)
*res_id = id;
return t;
}
const struct btf_type *btf_type_resolve_ptr(const struct btf *btf,
u32 id, u32 *res_id)
{
const struct btf_type *t;
t = btf_type_skip_modifiers(btf, id, NULL);
if (!btf_type_is_ptr(t))
return NULL;
return btf_type_skip_modifiers(btf, t->type, res_id);
}
const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
u32 id, u32 *res_id)
{
const struct btf_type *ptype;
ptype = btf_type_resolve_ptr(btf, id, res_id);
if (ptype && btf_type_is_func_proto(ptype))
return ptype;
return NULL;
}
/* Types that act only as a source, not sink or intermediate
* type when resolving.
*/
static bool btf_type_is_resolve_source_only(const struct btf_type *t)
{
return btf_type_is_var(t) ||
btf_type_is_decl_tag(t) ||
btf_type_is_datasec(t);
}
/* What types need to be resolved?
*
* btf_type_is_modifier() is an obvious one.
*
* btf_type_is_struct() because its member refers to
* another type (through member->type).
*
* btf_type_is_var() because the variable refers to
* another type. btf_type_is_datasec() holds multiple
* btf_type_is_var() types that need resolving.
*
* btf_type_is_array() because its element (array->type)
* refers to another type. Array can be thought of a
* special case of struct while array just has the same
* member-type repeated by array->nelems of times.
*/
static bool btf_type_needs_resolve(const struct btf_type *t)
{
return btf_type_is_modifier(t) ||
btf_type_is_ptr(t) ||
btf_type_is_struct(t) ||
btf_type_is_array(t) ||
btf_type_is_var(t) ||
btf_type_is_decl_tag(t) ||
btf_type_is_datasec(t);
}
/* t->size can be used */
static bool btf_type_has_size(const struct btf_type *t)
{
switch (BTF_INFO_KIND(t->info)) {
case BTF_KIND_INT:
case BTF_KIND_STRUCT:
case BTF_KIND_UNION:
case BTF_KIND_ENUM:
case BTF_KIND_DATASEC:
case BTF_KIND_FLOAT:
return true;
}
return false;
}
static const char *btf_int_encoding_str(u8 encoding)
{
if (encoding == 0)
return "(none)";
else if (encoding == BTF_INT_SIGNED)
return "SIGNED";
else if (encoding == BTF_INT_CHAR)
return "CHAR";
else if (encoding == BTF_INT_BOOL)
return "BOOL";
else
return "UNKN";
}
static u32 btf_type_int(const struct btf_type *t)
{
return *(u32 *)(t + 1);
}
static const struct btf_array *btf_type_array(const struct btf_type *t)
{
return (const struct btf_array *)(t + 1);
}
static const struct btf_enum *btf_type_enum(const struct btf_type *t)
{
return (const struct btf_enum *)(t + 1);
}
static const struct btf_var *btf_type_var(const struct btf_type *t)
{
return (const struct btf_var *)(t + 1);
}
static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t)
{
return (const struct btf_decl_tag *)(t + 1);
}
static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
{
return kind_ops[BTF_INFO_KIND(t->info)];
}
static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
{
if (!BTF_STR_OFFSET_VALID(offset))
return false;
while (offset < btf->start_str_off)
btf = btf->base_btf;
offset -= btf->start_str_off;
return offset < btf->hdr.str_len;
}
static bool __btf_name_char_ok(char c, bool first, bool dot_ok)
{
if ((first ? !isalpha(c) :
!isalnum(c)) &&
c != '_' &&
((c == '.' && !dot_ok) ||
c != '.'))
return false;
return true;
}
static const char *btf_str_by_offset(const struct btf *btf, u32 offset)
{
while (offset < btf->start_str_off)
btf = btf->base_btf;
offset -= btf->start_str_off;
if (offset < btf->hdr.str_len)
return &btf->strings[offset];
return NULL;
}
static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok)
{
/* offset must be valid */
const char *src = btf_str_by_offset(btf, offset);
const char *src_limit;
if (!__btf_name_char_ok(*src, true, dot_ok))
return false;
/* set a limit on identifier length */
src_limit = src + KSYM_NAME_LEN;
src++;
while (*src && src < src_limit) {
if (!__btf_name_char_ok(*src, false, dot_ok))
return false;
src++;
}
return !*src;
}
/* Only C-style identifier is permitted. This can be relaxed if
* necessary.
*/
static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
{
return __btf_name_valid(btf, offset, false);
}
static bool btf_name_valid_section(const struct btf *btf, u32 offset)
{
return __btf_name_valid(btf, offset, true);
}
static const char *__btf_name_by_offset(const struct btf *btf, u32 offset)
{
const char *name;
if (!offset)
return "(anon)";
name = btf_str_by_offset(btf, offset);
return name ?: "(invalid-name-offset)";
}
const char *btf_name_by_offset(const struct btf *btf, u32 offset)
{
return btf_str_by_offset(btf, offset);
}
const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
{
while (type_id < btf->start_id)
btf = btf->base_btf;
type_id -= btf->start_id;
if (type_id >= btf->nr_types)
return NULL;
return btf->types[type_id];
}
/*
* Regular int is not a bit field and it must be either
* u8/u16/u32/u64 or __int128.
*/
static bool btf_type_int_is_regular(const struct btf_type *t)
{
u8 nr_bits, nr_bytes;
u32 int_data;
int_data = btf_type_int(t);
nr_bits = BTF_INT_BITS(int_data);
nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
if (BITS_PER_BYTE_MASKED(nr_bits) ||
BTF_INT_OFFSET(int_data) ||
(nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64) &&
nr_bytes != (2 * sizeof(u64)))) {
return false;
}
return true;
}
/*
* Check that given struct member is a regular int with expected
* offset and size.
*/
bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
const struct btf_member *m,
u32 expected_offset, u32 expected_size)
{
const struct btf_type *t;
u32 id, int_data;
u8 nr_bits;
id = m->type;
t = btf_type_id_size(btf, &id, NULL);
if (!t || !btf_type_is_int(t))
return false;
int_data = btf_type_int(t);
nr_bits = BTF_INT_BITS(int_data);
if (btf_type_kflag(s)) {
u32 bitfield_size = BTF_MEMBER_BITFIELD_SIZE(m->offset);
u32 bit_offset = BTF_MEMBER_BIT_OFFSET(m->offset);
/* if kflag set, int should be a regular int and
* bit offset should be at byte boundary.
*/
return !bitfield_size &&
BITS_ROUNDUP_BYTES(bit_offset) == expected_offset &&
BITS_ROUNDUP_BYTES(nr_bits) == expected_size;
}
if (BTF_INT_OFFSET(int_data) ||
BITS_PER_BYTE_MASKED(m->offset) ||
BITS_ROUNDUP_BYTES(m->offset) != expected_offset ||
BITS_PER_BYTE_MASKED(nr_bits) ||
BITS_ROUNDUP_BYTES(nr_bits) != expected_size)
return false;
return true;
}
/* Similar to btf_type_skip_modifiers() but does not skip typedefs. */
static const struct btf_type *btf_type_skip_qualifiers(const struct btf *btf,
u32 id)
{
const struct btf_type *t = btf_type_by_id(btf, id);
while (btf_type_is_modifier(t) &&
BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) {
t = btf_type_by_id(btf, t->type);
}
return t;
}
#define BTF_SHOW_MAX_ITER 10
#define BTF_KIND_BIT(kind) (1ULL << kind)
/*
* Populate show->state.name with type name information.
* Format of type name is
*
* [.member_name = ] (type_name)
*/
static const char *btf_show_name(struct btf_show *show)
{
/* BTF_MAX_ITER array suffixes "[]" */
const char *array_suffixes = "[][][][][][][][][][]";
const char *array_suffix = &array_suffixes[strlen(array_suffixes)];
/* BTF_MAX_ITER pointer suffixes "*" */
const char *ptr_suffixes = "**********";
const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)];
const char *name = NULL, *prefix = "", *parens = "";
const struct btf_member *m = show->state.member;
const struct btf_type *t;
const struct btf_array *array;
u32 id = show->state.type_id;
const char *member = NULL;
bool show_member = false;
u64 kinds = 0;
int i;
show->state.name[0] = '\0';
/*
* Don't show type name if we're showing an array member;
* in that case we show the array type so don't need to repeat
* ourselves for each member.
*/
if (show->state.array_member)
return "";
/* Retrieve member name, if any. */
if (m) {
member = btf_name_by_offset(show->btf, m->name_off);
show_member = strlen(member) > 0;
id = m->type;
}
/*
* Start with type_id, as we have resolved the struct btf_type *
* via btf_modifier_show() past the parent typedef to the child
* struct, int etc it is defined as. In such cases, the type_id
* still represents the starting type while the struct btf_type *
* in our show->state points at the resolved type of the typedef.
*/
t = btf_type_by_id(show->btf, id);
if (!t)
return "";
/*
* The goal here is to build up the right number of pointer and
* array suffixes while ensuring the type name for a typedef
* is represented. Along the way we accumulate a list of
* BTF kinds we have encountered, since these will inform later
* display; for example, pointer types will not require an
* opening "{" for struct, we will just display the pointer value.
*
* We also want to accumulate the right number of pointer or array
* indices in the format string while iterating until we get to
* the typedef/pointee/array member target type.
*
* We start by pointing at the end of pointer and array suffix
* strings; as we accumulate pointers and arrays we move the pointer
* or array string backwards so it will show the expected number of
* '*' or '[]' for the type. BTF_SHOW_MAX_ITER of nesting of pointers
* and/or arrays and typedefs are supported as a precaution.
*
* We also want to get typedef name while proceeding to resolve
* type it points to so that we can add parentheses if it is a
* "typedef struct" etc.
*/
for (i = 0; i < BTF_SHOW_MAX_ITER; i++) {
switch (BTF_INFO_KIND(t->info)) {
case BTF_KIND_TYPEDEF:
if (!name)
name = btf_name_by_offset(show->btf,
t->name_off);
kinds |= BTF_KIND_BIT(BTF_KIND_TYPEDEF);
id = t->type;
break;
case BTF_KIND_ARRAY:
kinds |= BTF_KIND_BIT(BTF_KIND_ARRAY);
parens = "[";
if (!t)
return "";
array = btf_type_array(t);
if (array_suffix > array_suffixes)
array_suffix -= 2;
id = array->type;
break;
case BTF_KIND_PTR:
kinds |= BTF_KIND_BIT(BTF_KIND_PTR);
if (ptr_suffix > ptr_suffixes)
ptr_suffix -= 1;
id = t->type;
break;
default:
id = 0;
break;
}
if (!id)
break;
t = btf_type_skip_qualifiers(show->btf, id);
}
/* We may not be able to represent this type; bail to be safe */
if (i == BTF_SHOW_MAX_ITER)
return "";
if (!name)
name = btf_name_by_offset(show->btf, t->name_off);
switch (BTF_INFO_KIND(t->info)) {
case BTF_KIND_STRUCT:
case BTF_KIND_UNION:
prefix = BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT ?
"struct" : "union";
/* if it's an array of struct/union, parens is already set */
if (!(kinds & (BTF_KIND_BIT(BTF_KIND_ARRAY))))
parens = "{";
break;
case BTF_KIND_ENUM:
prefix = "enum";
break;
default:
break;
}
/* pointer does not require parens */
if (kinds & BTF_KIND_BIT(BTF_KIND_PTR))
parens = "";
/* typedef does not require struct/union/enum prefix */
if (kinds & BTF_KIND_BIT(BTF_KIND_TYPEDEF))
prefix = "";
if (!name)
name = "";
/* Even if we don't want type name info, we want parentheses etc */
if (show->flags & BTF_SHOW_NONAME)
snprintf(show->state.name, sizeof(show->state.name), "%s",
parens);
else
snprintf(show->state.name, sizeof(show->state.name),
"%s%s%s(%s%s%s%s%s%s)%s",
/* first 3 strings comprise ".member = " */
show_member ? "." : "",
show_member ? member : "",
show_member ? " = " : "",
/* ...next is our prefix (struct, enum, etc) */
prefix,
strlen(prefix) > 0 && strlen(name) > 0 ? " " : "",
/* ...this is the type name itself */
name,
/* ...suffixed by the appropriate '*', '[]' suffixes */
strlen(ptr_suffix) > 0 ? " " : "", ptr_suffix,
array_suffix, parens);
return show->state.name;
}
static const char *__btf_show_indent(struct btf_show *show)
{
const char *indents = " ";
const char *indent = &indents[strlen(indents)];
if ((indent - show->state.depth) >= indents)
return indent - show->state.depth;
return indents;
}
static const char *btf_show_indent(struct btf_show *show)
{
return show->flags & BTF_SHOW_COMPACT ? "" : __btf_show_indent(show);
}
static const char *btf_show_newline(struct btf_show *show)
{
return show->flags & BTF_SHOW_COMPACT ? "" : "\n";
}
static const char *btf_show_delim(struct btf_show *show)
{
if (show->state.depth == 0)
return "";
if ((show->flags & BTF_SHOW_COMPACT) && show->state.type &&
BTF_INFO_KIND(show->state.type->info) == BTF_KIND_UNION)
return "|";
return ",";
}
__printf(2, 3) static void btf_show(struct btf_show *show, const char *fmt, ...)
{
va_list args;
if (!show->state.depth_check) {
va_start(args, fmt);
show->showfn(show, fmt, args);
va_end(args);
}
}
/* Macros are used here as btf_show_type_value[s]() prepends and appends
* format specifiers to the format specifier passed in; these do the work of
* adding indentation, delimiters etc while the caller simply has to specify
* the type value(s) in the format specifier + value(s).
*/
#define btf_show_type_value(show, fmt, value) \
do { \
if ((value) != 0 || (show->flags & BTF_SHOW_ZERO) || \
show->state.depth == 0) { \
btf_show(show, "%s%s" fmt "%s%s", \
btf_show_indent(show), \
btf_show_name(show), \
value, btf_show_delim(show), \
btf_show_newline(show)); \
if (show->state.depth > show->state.depth_to_show) \
show->state.depth_to_show = show->state.depth; \
} \
} while (0)
#define btf_show_type_values(show, fmt, ...) \
do { \
btf_show(show, "%s%s" fmt "%s%s", btf_show_indent(show), \
btf_show_name(show), \
__VA_ARGS__, btf_show_delim(show), \
btf_show_newline(show)); \
if (show->state.depth > show->state.depth_to_show) \
show->state.depth_to_show = show->state.depth; \
} while (0)
/* How much is left to copy to safe buffer after @data? */
static int btf_show_obj_size_left(struct btf_show *show, void *data)
{
return show->obj.head + show->obj.size - data;
}
/* Is object pointed to by @data of @size already copied to our safe buffer? */
static bool btf_show_obj_is_safe(struct btf_show *show, void *data, int size)
{
return data >= show->obj.data &&
(data + size) < (show->obj.data + BTF_SHOW_OBJ_SAFE_SIZE);
}
/*
* If object pointed to by @data of @size falls within our safe buffer, return
* the equivalent pointer to the same safe data. Assumes
* copy_from_kernel_nofault() has already happened and our safe buffer is
* populated.
*/
static void *__btf_show_obj_safe(struct btf_show *show, void *data, int size)
{
if (btf_show_obj_is_safe(show, data, size))
return show->obj.safe + (data - show->obj.data);
return NULL;
}
/*
* Return a safe-to-access version of data pointed to by @data.
* We do this by copying the relevant amount of information
* to the struct btf_show obj.safe buffer using copy_from_kernel_nofault().
*
* If BTF_SHOW_UNSAFE is specified, just return data as-is; no
* safe copy is needed.
*
* Otherwise we need to determine if we have the required amount
* of data (determined by the @data pointer and the size of the
* largest base type we can encounter (represented by
* BTF_SHOW_OBJ_BASE_TYPE_SIZE). Having that much data ensures
* that we will be able to print some of the current object,
* and if more is needed a copy will be triggered.
* Some objects such as structs will not fit into the buffer;
* in such cases additional copies when we iterate over their
* members may be needed.
*
* btf_show_obj_safe() is used to return a safe buffer for
* btf_show_start_type(); this ensures that as we recurse into
* nested types we always have safe data for the given type.
* This approach is somewhat wasteful; it's possible for example
* that when iterating over a large union we'll end up copying the
* same data repeatedly, but the goal is safety not performance.
* We use stack data as opposed to per-CPU buffers because the
* iteration over a type can take some time, and preemption handling
* would greatly complicate use of the safe buffer.
*/
static void *btf_show_obj_safe(struct btf_show *show,
const struct btf_type *t,
void *data)
{
const struct btf_type *rt;
int size_left, size;
void *safe = NULL;
if (show->flags & BTF_SHOW_UNSAFE)
return data;
rt = btf_resolve_size(show->btf, t, &size);
if (IS_ERR(rt)) {
show->state.status = PTR_ERR(rt);
return NULL;
}
/*
* Is this toplevel object? If so, set total object size and
* initialize pointers. Otherwise check if we still fall within
* our safe object data.
*/
if (show->state.depth == 0) {
show->obj.size = size;
show->obj.head = data;
} else {
/*
* If the size of the current object is > our remaining
* safe buffer we _may_ need to do a new copy. However
* consider the case of a nested struct; it's size pushes
* us over the safe buffer limit, but showing any individual
* struct members does not. In such cases, we don't need
* to initiate a fresh copy yet; however we definitely need
* at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes left
* in our buffer, regardless of the current object size.
* The logic here is that as we resolve types we will
* hit a base type at some point, and we need to be sure
* the next chunk of data is safely available to display
* that type info safely. We cannot rely on the size of
* the current object here because it may be much larger
* than our current buffer (e.g. task_struct is 8k).
* All we want to do here is ensure that we can print the
* next basic type, which we can if either
* - the current type size is within the safe buffer; or
* - at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes are left in
* the safe buffer.
*/
safe = __btf_show_obj_safe(show, data,
min(size,
BTF_SHOW_OBJ_BASE_TYPE_SIZE));
}
/*
* We need a new copy to our safe object, either because we haven't
* yet copied and are initializing safe data, or because the data
* we want falls outside the boundaries of the safe object.
*/
if (!safe) {
size_left = btf_show_obj_size_left(show, data);
if (size_left > BTF_SHOW_OBJ_SAFE_SIZE)
size_left = BTF_SHOW_OBJ_SAFE_SIZE;
show->state.status = copy_from_kernel_nofault(show->obj.safe,
data, size_left);
if (!show->state.status) {
show->obj.data = data;
safe = show->obj.safe;
}
}
return safe;
}
/*
* Set the type we are starting to show and return a safe data pointer
* to be used for showing the associated data.
*/
static void *btf_show_start_type(struct btf_show *show,
const struct btf_type *t,
u32 type_id, void *data)
{
show->state.type = t;
show->state.type_id = type_id;
show->state.name[0] = '\0';
return btf_show_obj_safe(show, t, data);
}
static void btf_show_end_type(struct btf_show *show)
{
show->state.type = NULL;
show->state.type_id = 0;
show->state.name[0] = '\0';
}
static void *btf_show_start_aggr_type(struct btf_show *show,
const struct btf_type *t,
u32 type_id, void *data)
{
void *safe_data = btf_show_start_type(show, t, type_id, data);
if (!safe_data)
return safe_data;
btf_show(show, "%s%s%s", btf_show_indent(show),
btf_show_name(show),
btf_show_newline(show));
show->state.depth++;
return safe_data;
}
static void btf_show_end_aggr_type(struct btf_show *show,
const char *suffix)
{
show->state.depth--;
btf_show(show, "%s%s%s%s", btf_show_indent(show), suffix,
btf_show_delim(show), btf_show_newline(show));
btf_show_end_type(show);
}
static void btf_show_start_member(struct btf_show *show,
const struct btf_member *m)
{
show->state.member = m;
}
static void btf_show_start_array_member(struct btf_show *show)
{
show->state.array_member = 1;
btf_show_start_member(show, NULL);
}
static void btf_show_end_member(struct btf_show *show)
{
show->state.member = NULL;
}
static void btf_show_end_array_member(struct btf_show *show)
{
show->state.array_member = 0;
btf_show_end_member(show);
}
static void *btf_show_start_array_type(struct btf_show *show,
const struct btf_type *t,
u32 type_id,
u16 array_encoding,
void *data)
{
show->state.array_encoding = array_encoding;
show->state.array_terminated = 0;
return btf_show_start_aggr_type(show, t, type_id, data);
}
static void btf_show_end_array_type(struct btf_show *show)
{
show->state.array_encoding = 0;
show->state.array_terminated = 0;
btf_show_end_aggr_type(show, "]");
}
static void *btf_show_start_struct_type(struct btf_show *show,
const struct btf_type *t,
u32 type_id,
void *data)
{
return btf_show_start_aggr_type(show, t, type_id, data);
}
static void btf_show_end_struct_type(struct btf_show *show)
{
btf_show_end_aggr_type(show, "}");
}
__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
const char *fmt, ...)
{
va_list args;
va_start(args, fmt);
bpf_verifier_vlog(log, fmt, args);
va_end(args);
}
__printf(2, 3) static void btf_verifier_log(struct btf_verifier_env *env,
const char *fmt, ...)
{
struct bpf_verifier_log *log = &env->log;
va_list args;
if (!bpf_verifier_log_needed(log))
return;
va_start(args, fmt);
bpf_verifier_vlog(log, fmt, args);
va_end(args);
}
__printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
const struct btf_type *t,
bool log_details,
const char *fmt, ...)
{
struct bpf_verifier_log *log = &env->log;
u8 kind = BTF_INFO_KIND(t->info);
struct btf *btf = env->btf;
va_list args;
if (!bpf_verifier_log_needed(log))
return;
/* btf verifier prints all types it is processing via
* btf_verifier_log_type(..., fmt = NULL).
* Skip those prints for in-kernel BTF verification.
*/
if (log->level == BPF_LOG_KERNEL && !fmt)
return;
__btf_verifier_log(log, "[%u] %s %s%s",
env->log_type_id,
btf_kind_str[kind],
__btf_name_by_offset(btf, t->name_off),
log_details ? " " : "");
if (log_details)
btf_type_ops(t)->log_details(env, t);
if (fmt && *fmt) {
__btf_verifier_log(log, " ");
va_start(args, fmt);
bpf_verifier_vlog(log, fmt, args);
va_end(args);
}
__btf_verifier_log(log, "\n");
}
#define btf_verifier_log_type(env, t, ...) \
__btf_verifier_log_type((env), (t), true, __VA_ARGS__)
#define btf_verifier_log_basic(env, t, ...) \
__btf_verifier_log_type((env), (t), false, __VA_ARGS__)
__printf(4, 5)
static void btf_verifier_log_member(struct btf_verifier_env *env,
const struct btf_type *struct_type,
const struct btf_member *member,
const char *fmt, ...)
{
struct bpf_verifier_log *log = &env->log;
struct btf *btf = env->btf;
va_list args;
if (!bpf_verifier_log_needed(log))
return;
if (log->level == BPF_LOG_KERNEL && !fmt)
return;
/* The CHECK_META phase already did a btf dump.
*
* If member is logged again, it must hit an error in
* parsing this member. It is useful to print out which
* struct this member belongs to.
*/
if (env->phase != CHECK_META)
btf_verifier_log_type(env, struct_type, NULL);
if (btf_type_kflag(struct_type))
__btf_verifier_log(log,
"\t%s type_id=%u bitfield_size=%u bits_offset=%u",
__btf_name_by_offset(btf, member->name_off),
member->type,
BTF_MEMBER_BITFIELD_SIZE(member->offset),
BTF_MEMBER_BIT_OFFSET(member->offset));
else
__btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
__btf_name_by_offset(btf, member->name_off),
member->type, member->offset);
if (fmt && *fmt) {
__btf_verifier_log(log, " ");
va_start(args, fmt);
bpf_verifier_vlog(log, fmt, args);
va_end(args);
}
__btf_verifier_log(log, "\n");
}
__printf(4, 5)
static void btf_verifier_log_vsi(struct btf_verifier_env *env,
const struct btf_type *datasec_type,
const struct btf_var_secinfo *vsi,
const char *fmt, ...)
{
struct bpf_verifier_log *log = &env->log;
va_list args;
if (!bpf_verifier_log_needed(log))
return;
if (log->level == BPF_LOG_KERNEL && !fmt)
return;
if (env->phase != CHECK_META)
btf_verifier_log_type(env, datasec_type, NULL);
__btf_verifier_log(log, "\t type_id=%u offset=%u size=%u",
vsi->type, vsi->offset, vsi->size);
if (fmt && *fmt) {
__btf_verifier_log(log, " ");
va_start(args, fmt);
bpf_verifier_vlog(log, fmt, args);
va_end(args);
}
__btf_verifier_log(log, "\n");
}
static void btf_verifier_log_hdr(struct btf_verifier_env *env,
u32 btf_data_size)
{
struct bpf_verifier_log *log = &env->log;
const struct btf *btf = env->btf;
const struct btf_header *hdr;
if (!bpf_verifier_log_needed(log))
return;
if (log->level == BPF_LOG_KERNEL)
return;
hdr = &btf->hdr;
__btf_verifier_log(log, "magic: 0x%x\n", hdr->magic);
__btf_verifier_log(log, "version: %u\n", hdr->version);
__btf_verifier_log(log, "flags: 0x%x\n", hdr->flags);
__btf_verifier_log(log, "hdr_len: %u\n", hdr->hdr_len);
__btf_verifier_log(log, "type_off: %u\n", hdr->type_off);
__btf_verifier_log(log, "type_len: %u\n", hdr->type_len);
__btf_verifier_log(log, "str_off: %u\n", hdr->str_off);
__btf_verifier_log(log, "str_len: %u\n", hdr->str_len);
__btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size);
}
static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
{
struct btf *btf = env->btf;
if (btf->types_size == btf->nr_types) {
/* Expand 'types' array */
struct btf_type **new_types;
u32 expand_by, new_size;
if (btf->start_id + btf->types_size == BTF_MAX_TYPE) {
btf_verifier_log(env, "Exceeded max num of types");
return -E2BIG;
}
expand_by = max_t(u32, btf->types_size >> 2, 16);
new_size = min_t(u32, BTF_MAX_TYPE,
btf->types_size + expand_by);
new_types = kvcalloc(new_size, sizeof(*new_types),
GFP_KERNEL | __GFP_NOWARN);
if (!new_types)
return -ENOMEM;
if (btf->nr_types == 0) {
if (!btf->base_btf) {
/* lazily init VOID type */
new_types[0] = &btf_void;
btf->nr_types++;
}
} else {
memcpy(new_types, btf->types,
sizeof(*btf->types) * btf->nr_types);
}
kvfree(btf->types);
btf->types = new_types;
btf->types_size = new_size;
}
btf->types[btf->nr_types++] = t;
return 0;
}
static int btf_alloc_id(struct btf *btf)
{
int id;
idr_preload(GFP_KERNEL);
spin_lock_bh(&btf_idr_lock);
id = idr_alloc_cyclic(&btf_idr, btf, 1, INT_MAX, GFP_ATOMIC);
if (id > 0)
btf->id = id;
spin_unlock_bh(&btf_idr_lock);
idr_preload_end();
if (WARN_ON_ONCE(!id))
return -ENOSPC;
return id > 0 ? 0 : id;
}
static void btf_free_id(struct btf *btf)
{
unsigned long flags;
/*
* In map-in-map, calling map_delete_elem() on outer
* map will call bpf_map_put on the inner map.
* It will then eventually call btf_free_id()
* on the inner map. Some of the map_delete_elem()
* implementation may have irq disabled, so
* we need to use the _irqsave() version instead
* of the _bh() version.
*/
spin_lock_irqsave(&btf_idr_lock, flags);
idr_remove(&btf_idr, btf->id);
spin_unlock_irqrestore(&btf_idr_lock, flags);
}
static void btf_free(struct btf *btf)
{
kvfree(btf->types);
kvfree(btf->resolved_sizes);
kvfree(btf->resolved_ids);
kvfree(btf->data);
kfree(btf);
}
static void btf_free_rcu(struct rcu_head *rcu)
{
struct btf *btf = container_of(rcu, struct btf, rcu);
btf_free(btf);
}
void btf_get(struct btf *btf)
{
refcount_inc(&btf->refcnt);
}
void btf_put(struct btf *btf)
{
if (btf && refcount_dec_and_test(&btf->refcnt)) {
btf_free_id(btf);
call_rcu(&btf->rcu, btf_free_rcu);
}
}
static int env_resolve_init(struct btf_verifier_env *env)
{
struct btf *btf = env->btf;
u32 nr_types = btf->nr_types;
u32 *resolved_sizes = NULL;
u32 *resolved_ids = NULL;
u8 *visit_states = NULL;
resolved_sizes = kvcalloc(nr_types, sizeof(*resolved_sizes),
GFP_KERNEL | __GFP_NOWARN);
if (!resolved_sizes)
goto nomem;
resolved_ids = kvcalloc(nr_types, sizeof(*resolved_ids),
GFP_KERNEL | __GFP_NOWARN);
if (!resolved_ids)
goto nomem;
visit_states = kvcalloc(nr_types, sizeof(*visit_states),
GFP_KERNEL | __GFP_NOWARN);
if (!visit_states)
goto nomem;
btf->resolved_sizes = resolved_sizes;
btf->resolved_ids = resolved_ids;
env->visit_states = visit_states;
return 0;
nomem:
kvfree(resolved_sizes);
kvfree(resolved_ids);
kvfree(visit_states);
return -ENOMEM;
}
static void btf_verifier_env_free(struct btf_verifier_env *env)
{
kvfree(env->visit_states);
kfree(env);
}
static bool env_type_is_resolve_sink(const struct btf_verifier_env *env,
const struct btf_type *next_type)
{
switch (env->resolve_mode) {
case RESOLVE_TBD:
/* int, enum or void is a sink */
return !btf_type_needs_resolve(next_type);
case RESOLVE_PTR:
/* int, enum, void, struct, array, func or func_proto is a sink
* for ptr
*/
return !btf_type_is_modifier(next_type) &&
!btf_type_is_ptr(next_type);
case RESOLVE_STRUCT_OR_ARRAY:
/* int, enum, void, ptr, func or func_proto is a sink
* for struct and array
*/
return !btf_type_is_modifier(next_type) &&
!btf_type_is_array(next_type) &&
!btf_type_is_struct(next_type);
default:
BUG();
}
}
static bool env_type_is_resolved(const struct btf_verifier_env *env,
u32 type_id)
{
/* base BTF types should be resolved by now */
if (type_id < env->btf->start_id)
return true;
return env->visit_states[type_id - env->btf->start_id] == RESOLVED;
}
static int env_stack_push(struct btf_verifier_env *env,
const struct btf_type *t, u32 type_id)
{
const struct btf *btf = env->btf;
struct resolve_vertex *v;
if (env->top_stack == MAX_RESOLVE_DEPTH)
return -E2BIG;
if (type_id < btf->start_id
|| env->visit_states[type_id - btf->start_id] != NOT_VISITED)
return -EEXIST;
env->visit_states[type_id - btf->start_id] = VISITED;
v = &env->stack[env->top_stack++];
v->t = t;
v->type_id = type_id;
v->next_member = 0;
if (env->resolve_mode == RESOLVE_TBD) {
if (btf_type_is_ptr(t))
env->resolve_mode = RESOLVE_PTR;
else if (btf_type_is_struct(t) || btf_type_is_array(t))
env->resolve_mode = RESOLVE_STRUCT_OR_ARRAY;
}
return 0;
}
static void env_stack_set_next_member(struct btf_verifier_env *env,
u16 next_member)
{
env->stack[env->top_stack - 1].next_member = next_member;
}
static void env_stack_pop_resolved(struct btf_verifier_env *env,
u32 resolved_type_id,
u32 resolved_size)
{
u32 type_id = env->stack[--(env->top_stack)].type_id;
struct btf *btf = env->btf;
type_id -= btf->start_id; /* adjust to local type id */
btf->resolved_sizes[type_id] = resolved_size;
btf->resolved_ids[type_id] = resolved_type_id;
env->visit_states[type_id] = RESOLVED;
}
static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env)
{
return env->top_stack ? &env->stack[env->top_stack - 1] : NULL;
}
/* Resolve the size of a passed-in "type"
*
* type: is an array (e.g. u32 array[x][y])
* return type: type "u32[x][y]", i.e. BTF_KIND_ARRAY,
* *type_size: (x * y * sizeof(u32)). Hence, *type_size always
* corresponds to the return type.
* *elem_type: u32
* *elem_id: id of u32
* *total_nelems: (x * y). Hence, individual elem size is
* (*type_size / *total_nelems)
* *type_id: id of type if it's changed within the function, 0 if not
*
* type: is not an array (e.g. const struct X)
* return type: type "struct X"
* *type_size: sizeof(struct X)
* *elem_type: same as return type ("struct X")
* *elem_id: 0
* *total_nelems: 1
* *type_id: id of type if it's changed within the function, 0 if not
*/
static const struct btf_type *
__btf_resolve_size(const struct btf *btf, const struct btf_type *type,
u32 *type_size, const struct btf_type **elem_type,
u32 *elem_id, u32 *total_nelems, u32 *type_id)
{
const struct btf_type *array_type = NULL;
const struct btf_array *array = NULL;
u32 i, size, nelems = 1, id = 0;
for (i = 0; i < MAX_RESOLVE_DEPTH; i++) {
switch (BTF_INFO_KIND(type->info)) {
/* type->size can be used */
case BTF_KIND_INT:
case BTF_KIND_STRUCT:
case BTF_KIND_UNION:
case BTF_KIND_ENUM:
case BTF_KIND_FLOAT:
size = type->size;
goto resolved;
case BTF_KIND_PTR:
size = sizeof(void *);
goto resolved;
/* Modifiers */
case BTF_KIND_TYPEDEF:
case BTF_KIND_VOLATILE:
case BTF_KIND_CONST:
case BTF_KIND_RESTRICT:
case BTF_KIND_TYPE_TAG:
id = type->type;
type = btf_type_by_id(btf, type->type);
break;
case BTF_KIND_ARRAY:
if (!array_type)
array_type = type;
array = btf_type_array(type);
if (nelems && array->nelems > U32_MAX / nelems)
return ERR_PTR(-EINVAL);
nelems *= array->nelems;
type = btf_type_by_id(btf, array->type);
break;
/* type without size */
default:
return ERR_PTR(-EINVAL);
}
}
return ERR_PTR(-EINVAL);
resolved:
if (nelems && size > U32_MAX / nelems)
return ERR_PTR(-EINVAL);
*type_size = nelems * size;
if (total_nelems)
*total_nelems = nelems;
if (elem_type)
*elem_type = type;
if (elem_id)
*elem_id = array ? array->type : 0;
if (type_id && id)
*type_id = id;
return array_type ? : type;
}
const struct btf_type *
btf_resolve_size(const struct btf *btf, const struct btf_type *type,
u32 *type_size)
{
return __btf_resolve_size(btf, type, type_size, NULL, NULL, NULL, NULL);
}
static u32 btf_resolved_type_id(const struct btf *btf, u32 type_id)
{
while (type_id < btf->start_id)
btf = btf->base_btf;
return btf->resolved_ids[type_id - btf->start_id];
}
/* The input param "type_id" must point to a needs_resolve type */
static const struct btf_type *btf_type_id_resolve(const struct btf *btf,
u32 *type_id)
{
*type_id = btf_resolved_type_id(btf, *type_id);
return btf_type_by_id(btf, *type_id);
}
static u32 btf_resolved_type_size(const struct btf *btf, u32 type_id)
{
while (type_id < btf->start_id)
btf = btf->base_btf;
return btf->resolved_sizes[type_id - btf->start_id];
}
const struct btf_type *btf_type_id_size(const struct btf *btf,
u32 *type_id, u32 *ret_size)
{
const struct btf_type *size_type;
u32 size_type_id = *type_id;
u32 size = 0;
size_type = btf_type_by_id(btf, size_type_id);
if (btf_type_nosize_or_null(size_type))
return NULL;
if (btf_type_has_size(size_type)) {
size = size_type->size;
} else if (btf_type_is_array(size_type)) {
size = btf_resolved_type_size(btf, size_type_id);
} else if (btf_type_is_ptr(size_type)) {
size = sizeof(void *);
} else {
if (WARN_ON_ONCE(!btf_type_is_modifier(size_type) &&
!btf_type_is_var(size_type)))
return NULL;
size_type_id = btf_resolved_type_id(btf, size_type_id);
size_type = btf_type_by_id(btf, size_type_id);
if (btf_type_nosize_or_null(size_type))
return NULL;
else if (btf_type_has_size(size_type))
size = size_type->size;
else if (btf_type_is_array(size_type))
size = btf_resolved_type_size(btf, size_type_id);
else if (btf_type_is_ptr(size_type))
size = sizeof(void *);
else
return NULL;
}
*type_id = size_type_id;
if (ret_size)
*ret_size = size;
return size_type;
}
static int btf_df_check_member(struct btf_verifier_env *env,
const struct btf_type *struct_type,
const struct btf_member *member,
const struct btf_type *member_type)
{
btf_verifier_log_basic(env, struct_type,
"Unsupported check_member");
return -EINVAL;
}
static int btf_df_check_kflag_member(struct btf_verifier_env *env,
const struct btf_type *struct_type,
const struct btf_member *member,
const struct btf_type *member_type)
{
btf_verifier_log_basic(env, struct_type,
"Unsupported check_kflag_member");
return -EINVAL;
}
/* Used for ptr, array struct/union and float type members.
* int, enum and modifier types have their specific callback functions.
*/
static int btf_generic_check_kflag_member(struct btf_verifier_env *env,
const struct btf_type *struct_type,
const struct btf_member *member,
const struct btf_type *member_type)
{
if (BTF_MEMBER_BITFIELD_SIZE(member->offset)) {
btf_verifier_log_member(env, struct_type, member,
"Invalid member bitfield_size");
return -EINVAL;
}
/* bitfield size is 0, so member->offset represents bit offset only.
* It is safe to call non kflag check_member variants.
*/
return btf_type_ops(member_type)->check_member(env, struct_type,
member,
member_type);
}
static int btf_df_resolve(struct btf_verifier_env *env,
const struct resolve_vertex *v)
{
btf_verifier_log_basic(env, v->t, "Unsupported resolve");
return -EINVAL;
}
static void btf_df_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offsets,
struct btf_show *show)
{
btf_show(show, "<unsupported kind:%u>", BTF_INFO_KIND(t->info));
}
static int btf_int_check_member(struct btf_verifier_env *env,
const struct btf_type *struct_type,
const struct btf_member *member,
const struct btf_type *member_type)
{
u32 int_data = btf_type_int(member_type);
u32 struct_bits_off = member->offset;
u32 struct_size = struct_type->size;
u32 nr_copy_bits;
u32 bytes_offset;
if (U32_MAX - struct_bits_off < BTF_INT_OFFSET(int_data)) {
btf_verifier_log_member(env, struct_type, member,
"bits_offset exceeds U32_MAX");
return -EINVAL;
}
struct_bits_off += BTF_INT_OFFSET(int_data);
bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
nr_copy_bits = BTF_INT_BITS(int_data) +
BITS_PER_BYTE_MASKED(struct_bits_off);
if (nr_copy_bits > BITS_PER_U128) {
btf_verifier_log_member(env, struct_type, member,
"nr_copy_bits exceeds 128");
return -EINVAL;
}
if (struct_size < bytes_offset ||
struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) {
btf_verifier_log_member(env, struct_type, member,
"Member exceeds struct_size");
return -EINVAL;
}
return 0;
}
static int btf_int_check_kflag_member(struct btf_verifier_env *env,
const struct btf_type *struct_type,
const struct btf_member *member,
const struct btf_type *member_type)
{
u32 struct_bits_off, nr_bits, nr_int_data_bits, bytes_offset;
u32 int_data = btf_type_int(member_type);
u32 struct_size = struct_type->size;
u32 nr_copy_bits;
/* a regular int type is required for the kflag int member */
if (!btf_type_int_is_regular(member_type)) {
btf_verifier_log_member(env, struct_type, member,
"Invalid member base type");
return -EINVAL;
}
/* check sanity of bitfield size */
nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset);
struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset);
nr_int_data_bits = BTF_INT_BITS(int_data);
if (!nr_bits) {
/* Not a bitfield member, member offset must be at byte
* boundary.
*/
if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
btf_verifier_log_member(env, struct_type, member,
"Invalid member offset");
return -EINVAL;
}
nr_bits = nr_int_data_bits;
} else if (nr_bits > nr_int_data_bits) {
btf_verifier_log_member(env, struct_type, member,
"Invalid member bitfield_size");
return -EINVAL;
}
bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off);
if (nr_copy_bits > BITS_PER_U128) {
btf_verifier_log_member(env, struct_type, member,
"nr_copy_bits exceeds 128");
return -EINVAL;
}
if (struct_size < bytes_offset ||
struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) {
btf_verifier_log_member(env, struct_type, member,
"Member exceeds struct_size");
return -EINVAL;
}
return 0;
}
static s32 btf_int_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
u32 int_data, nr_bits, meta_needed = sizeof(int_data);
u16 encoding;
if (meta_left < meta_needed) {
btf_verifier_log_basic(env, t,
"meta_left:%u meta_needed:%u",
meta_left, meta_needed);
return -EINVAL;
}
if (btf_type_vlen(t)) {
btf_verifier_log_type(env, t, "vlen != 0");
return -EINVAL;
}
if (btf_type_kflag(t)) {
btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
return -EINVAL;
}
int_data = btf_type_int(t);
if (int_data & ~BTF_INT_MASK) {
btf_verifier_log_basic(env, t, "Invalid int_data:%x",
int_data);
return -EINVAL;
}
nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data);
if (nr_bits > BITS_PER_U128) {
btf_verifier_log_type(env, t, "nr_bits exceeds %zu",
BITS_PER_U128);
return -EINVAL;
}
if (BITS_ROUNDUP_BYTES(nr_bits) > t->size) {
btf_verifier_log_type(env, t, "nr_bits exceeds type_size");
return -EINVAL;
}
/*
* Only one of the encoding bits is allowed and it
* should be sufficient for the pretty print purpose (i.e. decoding).
* Multiple bits can be allowed later if it is found
* to be insufficient.
*/
encoding = BTF_INT_ENCODING(int_data);
if (encoding &&
encoding != BTF_INT_SIGNED &&
encoding != BTF_INT_CHAR &&
encoding != BTF_INT_BOOL) {
btf_verifier_log_type(env, t, "Unsupported encoding");
return -ENOTSUPP;
}
btf_verifier_log_type(env, t, NULL);
return meta_needed;
}
static void btf_int_log(struct btf_verifier_env *env,
const struct btf_type *t)
{
int int_data = btf_type_int(t);
btf_verifier_log(env,
"size=%u bits_offset=%u nr_bits=%u encoding=%s",
t->size, BTF_INT_OFFSET(int_data),
BTF_INT_BITS(int_data),
btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
}
static void btf_int128_print(struct btf_show *show, void *data)
{
/* data points to a __int128 number.
* Suppose
* int128_num = *(__int128 *)data;
* The below formulas shows what upper_num and lower_num represents:
* upper_num = int128_num >> 64;
* lower_num = int128_num & 0xffffffffFFFFFFFFULL;
*/
u64 upper_num, lower_num;
#ifdef __BIG_ENDIAN_BITFIELD
upper_num = *(u64 *)data;
lower_num = *(u64 *)(data + 8);
#else
upper_num = *(u64 *)(data + 8);
lower_num = *(u64 *)data;
#endif
if (upper_num == 0)
btf_show_type_value(show, "0x%llx", lower_num);
else
btf_show_type_values(show, "0x%llx%016llx", upper_num,
lower_num);
}
static void btf_int128_shift(u64 *print_num, u16 left_shift_bits,
u16 right_shift_bits)
{
u64 upper_num, lower_num;
#ifdef __BIG_ENDIAN_BITFIELD
upper_num = print_num[0];
lower_num = print_num[1];
#else
upper_num = print_num[1];
lower_num = print_num[0];
#endif
/* shake out un-needed bits by shift/or operations */
if (left_shift_bits >= 64) {
upper_num = lower_num << (left_shift_bits - 64);
lower_num = 0;
} else {
upper_num = (upper_num << left_shift_bits) |
(lower_num >> (64 - left_shift_bits));
lower_num = lower_num << left_shift_bits;
}
if (right_shift_bits >= 64) {
lower_num = upper_num >> (right_shift_bits - 64);
upper_num = 0;
} else {
lower_num = (lower_num >> right_shift_bits) |
(upper_num << (64 - right_shift_bits));
upper_num = upper_num >> right_shift_bits;
}
#ifdef __BIG_ENDIAN_BITFIELD
print_num[0] = upper_num;
print_num[1] = lower_num;
#else
print_num[0] = lower_num;
print_num[1] = upper_num;
#endif
}
static void btf_bitfield_show(void *data, u8 bits_offset,
u8 nr_bits, struct btf_show *show)
{
u16 left_shift_bits, right_shift_bits;
u8 nr_copy_bytes;
u8 nr_copy_bits;
u64 print_num[2] = {};
nr_copy_bits = nr_bits + bits_offset;
nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits);
memcpy(print_num, data, nr_copy_bytes);
#ifdef __BIG_ENDIAN_BITFIELD
left_shift_bits = bits_offset;
#else
left_shift_bits = BITS_PER_U128 - nr_copy_bits;
#endif
right_shift_bits = BITS_PER_U128 - nr_bits;
btf_int128_shift(print_num, left_shift_bits, right_shift_bits);
btf_int128_print(show, print_num);
}
static void btf_int_bits_show(const struct btf *btf,
const struct btf_type *t,
void *data, u8 bits_offset,
struct btf_show *show)
{
u32 int_data = btf_type_int(t);
u8 nr_bits = BTF_INT_BITS(int_data);
u8 total_bits_offset;
/*
* bits_offset is at most 7.
* BTF_INT_OFFSET() cannot exceed 128 bits.
*/
total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset);
btf_bitfield_show(data, bits_offset, nr_bits, show);
}
static void btf_int_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct btf_show *show)
{
u32 int_data = btf_type_int(t);
u8 encoding = BTF_INT_ENCODING(int_data);
bool sign = encoding & BTF_INT_SIGNED;
u8 nr_bits = BTF_INT_BITS(int_data);
void *safe_data;
safe_data = btf_show_start_type(show, t, type_id, data);
if (!safe_data)
return;
if (bits_offset || BTF_INT_OFFSET(int_data) ||
BITS_PER_BYTE_MASKED(nr_bits)) {
btf_int_bits_show(btf, t, safe_data, bits_offset, show);
goto out;
}
switch (nr_bits) {
case 128:
btf_int128_print(show, safe_data);
break;
case 64:
if (sign)
btf_show_type_value(show, "%lld", *(s64 *)safe_data);
else
btf_show_type_value(show, "%llu", *(u64 *)safe_data);
break;
case 32:
if (sign)
btf_show_type_value(show, "%d", *(s32 *)safe_data);
else
btf_show_type_value(show, "%u", *(u32 *)safe_data);
break;
case 16:
if (sign)
btf_show_type_value(show, "%d", *(s16 *)safe_data);
else
btf_show_type_value(show, "%u", *(u16 *)safe_data);
break;
case 8:
if (show->state.array_encoding == BTF_INT_CHAR) {
/* check for null terminator */
if (show->state.array_terminated)
break;
if (*(char *)data == '\0') {
show->state.array_terminated = 1;
break;
}
if (isprint(*(char *)data)) {
btf_show_type_value(show, "'%c'",
*(char *)safe_data);
break;
}
}
if (sign)
btf_show_type_value(show, "%d", *(s8 *)safe_data);
else
btf_show_type_value(show, "%u", *(u8 *)safe_data);
break;
default:
btf_int_bits_show(btf, t, safe_data, bits_offset, show);
break;
}
out:
btf_show_end_type(show);
}
static const struct btf_kind_operations int_ops = {
.check_meta = btf_int_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_int_check_member,
.check_kflag_member = btf_int_check_kflag_member,
.log_details = btf_int_log,
.show = btf_int_show,
};
static int btf_modifier_check_member(struct btf_verifier_env *env,
const struct btf_type *struct_type,
const struct btf_member *member,
const struct btf_type *member_type)
{
const struct btf_type *resolved_type;
u32 resolved_type_id = member->type;
struct btf_member resolved_member;
struct btf *btf = env->btf;
resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL);
if (!resolved_type) {
btf_verifier_log_member(env, struct_type, member,
"Invalid member");
return -EINVAL;
}
resolved_member = *member;
resolved_member.type = resolved_type_id;
return btf_type_ops(resolved_type)->check_member(env, struct_type,
&resolved_member,
resolved_type);
}
static int btf_modifier_check_kflag_member(struct btf_verifier_env *env,
const struct btf_type *struct_type,
const struct btf_member *member,
const struct btf_type *member_type)
{
const struct btf_type *resolved_type;
u32 resolved_type_id = member->type;
struct btf_member resolved_member;
struct btf *btf = env->btf;
resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL);
if (!resolved_type) {
btf_verifier_log_member(env, struct_type, member,
"Invalid member");
return -EINVAL;
}
resolved_member = *member;
resolved_member.type = resolved_type_id;
return btf_type_ops(resolved_type)->check_kflag_member(env, struct_type,
&resolved_member,
resolved_type);
}
static int btf_ptr_check_member(struct btf_verifier_env *env,
const struct btf_type *struct_type,
const struct btf_member *member,
const struct btf_type *member_type)
{
u32 struct_size, struct_bits_off, bytes_offset;
struct_size = struct_type->size;
struct_bits_off = member->offset;
bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
btf_verifier_log_member(env, struct_type, member,
"Member is not byte aligned");
return -EINVAL;
}
if (struct_size - bytes_offset < sizeof(void *)) {
btf_verifier_log_member(env, struct_type, member,
"Member exceeds struct_size");
return -EINVAL;
}
return 0;
}
static int btf_ref_type_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
const char *value;
if (btf_type_vlen(t)) {
btf_verifier_log_type(env, t, "vlen != 0");
return -EINVAL;
}
if (btf_type_kflag(t)) {
btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
return -EINVAL;
}
if (!BTF_TYPE_ID_VALID(t->type)) {
btf_verifier_log_type(env, t, "Invalid type_id");
return -EINVAL;
}
/* typedef/type_tag type must have a valid name, and other ref types,
* volatile, const, restrict, should have a null name.
*/
if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) {
if (!t->name_off ||
!btf_name_valid_identifier(env->btf, t->name_off)) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
} else if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG) {
value = btf_name_by_offset(env->btf, t->name_off);
if (!value || !value[0]) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
} else {
if (t->name_off) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
}
btf_verifier_log_type(env, t, NULL);
return 0;
}
static int btf_modifier_resolve(struct btf_verifier_env *env,
const struct resolve_vertex *v)
{
const struct btf_type *t = v->t;
const struct btf_type *next_type;
u32 next_type_id = t->type;
struct btf *btf = env->btf;
next_type = btf_type_by_id(btf, next_type_id);
if (!next_type || btf_type_is_resolve_source_only(next_type)) {
btf_verifier_log_type(env, v->t, "Invalid type_id");
return -EINVAL;
}
if (!env_type_is_resolve_sink(env, next_type) &&
!env_type_is_resolved(env, next_type_id))
return env_stack_push(env, next_type, next_type_id);
/* Figure out the resolved next_type_id with size.
* They will be stored in the current modifier's
* resolved_ids and resolved_sizes such that it can
* save us a few type-following when we use it later (e.g. in
* pretty print).
*/
if (!btf_type_id_size(btf, &next_type_id, NULL)) {
if (env_type_is_resolved(env, next_type_id))
next_type = btf_type_id_resolve(btf, &next_type_id);
/* "typedef void new_void", "const void"...etc */
if (!btf_type_is_void(next_type) &&
!btf_type_is_fwd(next_type) &&
!btf_type_is_func_proto(next_type)) {
btf_verifier_log_type(env, v->t, "Invalid type_id");
return -EINVAL;
}
}
env_stack_pop_resolved(env, next_type_id, 0);
return 0;
}
static int btf_var_resolve(struct btf_verifier_env *env,
const struct resolve_vertex *v)
{
const struct btf_type *next_type;
const struct btf_type *t = v->t;
u32 next_type_id = t->type;
struct btf *btf = env->btf;
next_type = btf_type_by_id(btf, next_type_id);
if (!next_type || btf_type_is_resolve_source_only(next_type)) {
btf_verifier_log_type(env, v->t, "Invalid type_id");
return -EINVAL;
}
if (!env_type_is_resolve_sink(env, next_type) &&
!env_type_is_resolved(env, next_type_id))
return env_stack_push(env, next_type, next_type_id);
if (btf_type_is_modifier(next_type)) {
const struct btf_type *resolved_type;
u32 resolved_type_id;
resolved_type_id = next_type_id;
resolved_type = btf_type_id_resolve(btf, &resolved_type_id);
if (btf_type_is_ptr(resolved_type) &&
!env_type_is_resolve_sink(env, resolved_type) &&
!env_type_is_resolved(env, resolved_type_id))
return env_stack_push(env, resolved_type,
resolved_type_id);
}
/* We must resolve to something concrete at this point, no
* forward types or similar that would resolve to size of
* zero is allowed.
*/
if (!btf_type_id_size(btf, &next_type_id, NULL)) {
btf_verifier_log_type(env, v->t, "Invalid type_id");
return -EINVAL;
}
env_stack_pop_resolved(env, next_type_id, 0);
return 0;
}
static int btf_ptr_resolve(struct btf_verifier_env *env,
const struct resolve_vertex *v)
{
const struct btf_type *next_type;
const struct btf_type *t = v->t;
u32 next_type_id = t->type;
struct btf *btf = env->btf;
next_type = btf_type_by_id(btf, next_type_id);
if (!next_type || btf_type_is_resolve_source_only(next_type)) {
btf_verifier_log_type(env, v->t, "Invalid type_id");
return -EINVAL;
}
if (!env_type_is_resolve_sink(env, next_type) &&
!env_type_is_resolved(env, next_type_id))
return env_stack_push(env, next_type, next_type_id);
/* If the modifier was RESOLVED during RESOLVE_STRUCT_OR_ARRAY,
* the modifier may have stopped resolving when it was resolved
* to a ptr (last-resolved-ptr).
*
* We now need to continue from the last-resolved-ptr to
* ensure the last-resolved-ptr will not referring back to
* the currenct ptr (t).
*/
if (btf_type_is_modifier(next_type)) {
const struct btf_type *resolved_type;
u32 resolved_type_id;
resolved_type_id = next_type_id;
resolved_type = btf_type_id_resolve(btf, &resolved_type_id);
if (btf_type_is_ptr(resolved_type) &&
!env_type_is_resolve_sink(env, resolved_type) &&
!env_type_is_resolved(env, resolved_type_id))
return env_stack_push(env, resolved_type,
resolved_type_id);
}
if (!btf_type_id_size(btf, &next_type_id, NULL)) {
if (env_type_is_resolved(env, next_type_id))
next_type = btf_type_id_resolve(btf, &next_type_id);
if (!btf_type_is_void(next_type) &&
!btf_type_is_fwd(next_type) &&
!btf_type_is_func_proto(next_type)) {
btf_verifier_log_type(env, v->t, "Invalid type_id");
return -EINVAL;
}
}
env_stack_pop_resolved(env, next_type_id, 0);
return 0;
}
static void btf_modifier_show(const struct btf *btf,
const struct btf_type *t,
u32 type_id, void *data,
u8 bits_offset, struct btf_show *show)
{
if (btf->resolved_ids)
t = btf_type_id_resolve(btf, &type_id);
else
t = btf_type_skip_modifiers(btf, type_id, NULL);
btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show);
}
static void btf_var_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct btf_show *show)
{
t = btf_type_id_resolve(btf, &type_id);
btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show);
}
static void btf_ptr_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct btf_show *show)
{
void *safe_data;
safe_data = btf_show_start_type(show, t, type_id, data);
if (!safe_data)
return;
/* It is a hashed value unless BTF_SHOW_PTR_RAW is specified */
if (show->flags & BTF_SHOW_PTR_RAW)
btf_show_type_value(show, "0x%px", *(void **)safe_data);
else
btf_show_type_value(show, "0x%p", *(void **)safe_data);
btf_show_end_type(show);
}
static void btf_ref_type_log(struct btf_verifier_env *env,
const struct btf_type *t)
{
btf_verifier_log(env, "type_id=%u", t->type);
}
static struct btf_kind_operations modifier_ops = {
.check_meta = btf_ref_type_check_meta,
.resolve = btf_modifier_resolve,
.check_member = btf_modifier_check_member,
.check_kflag_member = btf_modifier_check_kflag_member,
.log_details = btf_ref_type_log,
.show = btf_modifier_show,
};
static struct btf_kind_operations ptr_ops = {
.check_meta = btf_ref_type_check_meta,
.resolve = btf_ptr_resolve,
.check_member = btf_ptr_check_member,
.check_kflag_member = btf_generic_check_kflag_member,
.log_details = btf_ref_type_log,
.show = btf_ptr_show,
};
static s32 btf_fwd_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
if (btf_type_vlen(t)) {
btf_verifier_log_type(env, t, "vlen != 0");
return -EINVAL;
}
if (t->type) {
btf_verifier_log_type(env, t, "type != 0");
return -EINVAL;
}
/* fwd type must have a valid name */
if (!t->name_off ||
!btf_name_valid_identifier(env->btf, t->name_off)) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
btf_verifier_log_type(env, t, NULL);
return 0;
}
static void btf_fwd_type_log(struct btf_verifier_env *env,
const struct btf_type *t)
{
btf_verifier_log(env, "%s", btf_type_kflag(t) ? "union" : "struct");
}
static struct btf_kind_operations fwd_ops = {
.check_meta = btf_fwd_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_fwd_type_log,
.show = btf_df_show,
};
static int btf_array_check_member(struct btf_verifier_env *env,
const struct btf_type *struct_type,
const struct btf_member *member,
const struct btf_type *member_type)
{
u32 struct_bits_off = member->offset;
u32 struct_size, bytes_offset;
u32 array_type_id, array_size;
struct btf *btf = env->btf;
if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
btf_verifier_log_member(env, struct_type, member,
"Member is not byte aligned");
return -EINVAL;
}
array_type_id = member->type;
btf_type_id_size(btf, &array_type_id, &array_size);
struct_size = struct_type->size;
bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
if (struct_size - bytes_offset < array_size) {
btf_verifier_log_member(env, struct_type, member,
"Member exceeds struct_size");
return -EINVAL;
}
return 0;
}
static s32 btf_array_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
const struct btf_array *array = btf_type_array(t);
u32 meta_needed = sizeof(*array);
if (meta_left < meta_needed) {
btf_verifier_log_basic(env, t,
"meta_left:%u meta_needed:%u",
meta_left, meta_needed);
return -EINVAL;
}
/* array type should not have a name */
if (t->name_off) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
if (btf_type_vlen(t)) {
btf_verifier_log_type(env, t, "vlen != 0");
return -EINVAL;
}
if (btf_type_kflag(t)) {
btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
return -EINVAL;
}
if (t->size) {
btf_verifier_log_type(env, t, "size != 0");
return -EINVAL;
}
/* Array elem type and index type cannot be in type void,
* so !array->type and !array->index_type are not allowed.
*/
if (!array->type || !BTF_TYPE_ID_VALID(array->type)) {
btf_verifier_log_type(env, t, "Invalid elem");
return -EINVAL;
}
if (!array->index_type || !BTF_TYPE_ID_VALID(array->index_type)) {
btf_verifier_log_type(env, t, "Invalid index");
return -EINVAL;
}
btf_verifier_log_type(env, t, NULL);
return meta_needed;
}
static int btf_array_resolve(struct btf_verifier_env *env,
const struct resolve_vertex *v)
{
const struct btf_array *array = btf_type_array(v->t);
const struct btf_type *elem_type, *index_type;
u32 elem_type_id, index_type_id;
struct btf *btf = env->btf;
u32 elem_size;
/* Check array->index_type */
index_type_id = array->index_type;
index_type = btf_type_by_id(btf, index_type_id);
if (btf_type_nosize_or_null(index_type) ||
btf_type_is_resolve_source_only(index_type)) {
btf_verifier_log_type(env, v->t, "Invalid index");
return -EINVAL;
}
if (!env_type_is_resolve_sink(env, index_type) &&
!env_type_is_resolved(env, index_type_id))
return env_stack_push(env, index_type, index_type_id);
index_type = btf_type_id_size(btf, &index_type_id, NULL);
if (!index_type || !btf_type_is_int(index_type) ||
!btf_type_int_is_regular(index_type)) {
btf_verifier_log_type(env, v->t, "Invalid index");
return -EINVAL;
}
/* Check array->type */
elem_type_id = array->type;
elem_type = btf_type_by_id(btf, elem_type_id);
if (btf_type_nosize_or_null(elem_type) ||
btf_type_is_resolve_source_only(elem_type)) {
btf_verifier_log_type(env, v->t,
"Invalid elem");
return -EINVAL;
}
if (!env_type_is_resolve_sink(env, elem_type) &&
!env_type_is_resolved(env, elem_type_id))
return env_stack_push(env, elem_type, elem_type_id);
elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
if (!elem_type) {
btf_verifier_log_type(env, v->t, "Invalid elem");
return -EINVAL;
}
if (btf_type_is_int(elem_type) && !btf_type_int_is_regular(elem_type)) {
btf_verifier_log_type(env, v->t, "Invalid array of int");
return -EINVAL;
}
if (array->nelems && elem_size > U32_MAX / array->nelems) {
btf_verifier_log_type(env, v->t,
"Array size overflows U32_MAX");
return -EINVAL;
}
env_stack_pop_resolved(env, elem_type_id, elem_size * array->nelems);
return 0;
}
static void btf_array_log(struct btf_verifier_env *env,
const struct btf_type *t)
{
const struct btf_array *array = btf_type_array(t);
btf_verifier_log(env, "type_id=%u index_type_id=%u nr_elems=%u",
array->type, array->index_type, array->nelems);
}
static void __btf_array_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct btf_show *show)
{
const struct btf_array *array = btf_type_array(t);
const struct btf_kind_operations *elem_ops;
const struct btf_type *elem_type;
u32 i, elem_size = 0, elem_type_id;
u16 encoding = 0;
elem_type_id = array->type;
elem_type = btf_type_skip_modifiers(btf, elem_type_id, NULL);
if (elem_type && btf_type_has_size(elem_type))
elem_size = elem_type->size;
if (elem_type && btf_type_is_int(elem_type)) {
u32 int_type = btf_type_int(elem_type);
encoding = BTF_INT_ENCODING(int_type);
/*
* BTF_INT_CHAR encoding never seems to be set for
* char arrays, so if size is 1 and element is
* printable as a char, we'll do that.
*/
if (elem_size == 1)
encoding = BTF_INT_CHAR;
}
if (!btf_show_start_array_type(show, t, type_id, encoding, data))
return;
if (!elem_type)
goto out;
elem_ops = btf_type_ops(elem_type);
for (i = 0; i < array->nelems; i++) {
btf_show_start_array_member(show);
elem_ops->show(btf, elem_type, elem_type_id, data,
bits_offset, show);
data += elem_size;
btf_show_end_array_member(show);
if (show->state.array_terminated)
break;
}
out:
btf_show_end_array_type(show);
}
static void btf_array_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct btf_show *show)
{
const struct btf_member *m = show->state.member;
/*
* First check if any members would be shown (are non-zero).
* See comments above "struct btf_show" definition for more
* details on how this works at a high-level.
*/
if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) {
if (!show->state.depth_check) {
show->state.depth_check = show->state.depth + 1;
show->state.depth_to_show = 0;
}
__btf_array_show(btf, t, type_id, data, bits_offset, show);
show->state.member = m;
if (show->state.depth_check != show->state.depth + 1)
return;
show->state.depth_check = 0;
if (show->state.depth_to_show <= show->state.depth)
return;
/*
* Reaching here indicates we have recursed and found
* non-zero array member(s).
*/
}
__btf_array_show(btf, t, type_id, data, bits_offset, show);
}
static struct btf_kind_operations array_ops = {
.check_meta = btf_array_check_meta,
.resolve = btf_array_resolve,
.check_member = btf_array_check_member,
.check_kflag_member = btf_generic_check_kflag_member,
.log_details = btf_array_log,
.show = btf_array_show,
};
static int btf_struct_check_member(struct btf_verifier_env *env,
const struct btf_type *struct_type,
const struct btf_member *member,
const struct btf_type *member_type)
{
u32 struct_bits_off = member->offset;
u32 struct_size, bytes_offset;
if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
btf_verifier_log_member(env, struct_type, member,
"Member is not byte aligned");
return -EINVAL;
}
struct_size = struct_type->size;
bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
if (struct_size - bytes_offset < member_type->size) {
btf_verifier_log_member(env, struct_type, member,
"Member exceeds struct_size");
return -EINVAL;
}
return 0;
}
static s32 btf_struct_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION;
const struct btf_member *member;
u32 meta_needed, last_offset;
struct btf *btf = env->btf;
u32 struct_size = t->size;
u32 offset;
u16 i;
meta_needed = btf_type_vlen(t) * sizeof(*member);
if (meta_left < meta_needed) {
btf_verifier_log_basic(env, t,
"meta_left:%u meta_needed:%u",
meta_left, meta_needed);
return -EINVAL;
}
/* struct type either no name or a valid one */
if (t->name_off &&
!btf_name_valid_identifier(env->btf, t->name_off)) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
btf_verifier_log_type(env, t, NULL);
last_offset = 0;
for_each_member(i, t, member) {
if (!btf_name_offset_valid(btf, member->name_off)) {
btf_verifier_log_member(env, t, member,
"Invalid member name_offset:%u",
member->name_off);
return -EINVAL;
}
/* struct member either no name or a valid one */
if (member->name_off &&
!btf_name_valid_identifier(btf, member->name_off)) {
btf_verifier_log_member(env, t, member, "Invalid name");
return -EINVAL;
}
/* A member cannot be in type void */
if (!member->type || !BTF_TYPE_ID_VALID(member->type)) {
btf_verifier_log_member(env, t, member,
"Invalid type_id");
return -EINVAL;
}
offset = __btf_member_bit_offset(t, member);
if (is_union && offset) {
btf_verifier_log_member(env, t, member,
"Invalid member bits_offset");
return -EINVAL;
}
/*
* ">" instead of ">=" because the last member could be
* "char a[0];"
*/
if (last_offset > offset) {
btf_verifier_log_member(env, t, member,
"Invalid member bits_offset");
return -EINVAL;
}
if (BITS_ROUNDUP_BYTES(offset) > struct_size) {
btf_verifier_log_member(env, t, member,
"Member bits_offset exceeds its struct size");
return -EINVAL;
}
btf_verifier_log_member(env, t, member, NULL);
last_offset = offset;
}
return meta_needed;
}
static int btf_struct_resolve(struct btf_verifier_env *env,
const struct resolve_vertex *v)
{
const struct btf_member *member;
int err;
u16 i;
/* Before continue resolving the next_member,
* ensure the last member is indeed resolved to a
* type with size info.
*/
if (v->next_member) {
const struct btf_type *last_member_type;
const struct btf_member *last_member;
u16 last_member_type_id;
last_member = btf_type_member(v->t) + v->next_member - 1;
last_member_type_id = last_member->type;
if (WARN_ON_ONCE(!env_type_is_resolved(env,
last_member_type_id)))
return -EINVAL;
last_member_type = btf_type_by_id(env->btf,
last_member_type_id);
if (btf_type_kflag(v->t))
err = btf_type_ops(last_member_type)->check_kflag_member(env, v->t,
last_member,
last_member_type);
else
err = btf_type_ops(last_member_type)->check_member(env, v->t,
last_member,
last_member_type);
if (err)
return err;
}
for_each_member_from(i, v->next_member, v->t, member) {
u32 member_type_id = member->type;
const struct btf_type *member_type = btf_type_by_id(env->btf,
member_type_id);
if (btf_type_nosize_or_null(member_type) ||
btf_type_is_resolve_source_only(member_type)) {
btf_verifier_log_member(env, v->t, member,
"Invalid member");
return -EINVAL;
}
if (!env_type_is_resolve_sink(env, member_type) &&
!env_type_is_resolved(env, member_type_id)) {
env_stack_set_next_member(env, i + 1);
return env_stack_push(env, member_type, member_type_id);
}
if (btf_type_kflag(v->t))
err = btf_type_ops(member_type)->check_kflag_member(env, v->t,
member,
member_type);
else
err = btf_type_ops(member_type)->check_member(env, v->t,
member,
member_type);
if (err)
return err;
}
env_stack_pop_resolved(env, 0, 0);
return 0;
}
static void btf_struct_log(struct btf_verifier_env *env,
const struct btf_type *t)
{
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t,
const char *name, int sz, int align)
{
const struct btf_member *member;
u32 i, off = -ENOENT;
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
if (!__btf_type_is_struct(member_type))
continue;
if (member_type->size != sz)
continue;
if (strcmp(__btf_name_by_offset(btf, member_type->name_off), name))
continue;
if (off != -ENOENT)
/* only one such field is allowed */
return -E2BIG;
off = __btf_member_bit_offset(t, member);
if (off % 8)
/* valid C code cannot generate such BTF */
return -EINVAL;
off /= 8;
if (off % align)
return -EINVAL;
}
return off;
}
static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
const char *name, int sz, int align)
{
const struct btf_var_secinfo *vsi;
u32 i, off = -ENOENT;
for_each_vsi(i, t, vsi) {
const struct btf_type *var = btf_type_by_id(btf, vsi->type);
const struct btf_type *var_type = btf_type_by_id(btf, var->type);
if (!__btf_type_is_struct(var_type))
continue;
if (var_type->size != sz)
continue;
if (vsi->size != sz)
continue;
if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name))
continue;
if (off != -ENOENT)
/* only one such field is allowed */
return -E2BIG;
off = vsi->offset;
if (off % align)
return -EINVAL;
}
return off;
}
static int btf_find_field(const struct btf *btf, const struct btf_type *t,
const char *name, int sz, int align)
{
if (__btf_type_is_struct(t))
return btf_find_struct_field(btf, t, name, sz, align);
else if (btf_type_is_datasec(t))
return btf_find_datasec_var(btf, t, name, sz, align);
return -EINVAL;
}
/* find 'struct bpf_spin_lock' in map value.
* return >= 0 offset if found
* and < 0 in case of error
*/
int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
{
return btf_find_field(btf, t, "bpf_spin_lock",
sizeof(struct bpf_spin_lock),
__alignof__(struct bpf_spin_lock));
}
int btf_find_timer(const struct btf *btf, const struct btf_type *t)
{
return btf_find_field(btf, t, "bpf_timer",
sizeof(struct bpf_timer),
__alignof__(struct bpf_timer));
}
static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct btf_show *show)
{
const struct btf_member *member;
void *safe_data;
u32 i;
safe_data = btf_show_start_struct_type(show, t, type_id, data);
if (!safe_data)
return;
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
const struct btf_kind_operations *ops;
u32 member_offset, bitfield_size;
u32 bytes_offset;
u8 bits8_offset;
btf_show_start_member(show, member);
member_offset = __btf_member_bit_offset(t, member);
bitfield_size = __btf_member_bitfield_size(t, member);
bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
if (bitfield_size) {
safe_data = btf_show_start_type(show, member_type,
member->type,
data + bytes_offset);
if (safe_data)
btf_bitfield_show(safe_data,
bits8_offset,
bitfield_size, show);
btf_show_end_type(show);
} else {
ops = btf_type_ops(member_type);
ops->show(btf, member_type, member->type,
data + bytes_offset, bits8_offset, show);
}
btf_show_end_member(show);
}
btf_show_end_struct_type(show);
}
static void btf_struct_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct btf_show *show)
{
const struct btf_member *m = show->state.member;
/*
* First check if any members would be shown (are non-zero).
* See comments above "struct btf_show" definition for more
* details on how this works at a high-level.
*/
if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) {
if (!show->state.depth_check) {
show->state.depth_check = show->state.depth + 1;
show->state.depth_to_show = 0;
}
__btf_struct_show(btf, t, type_id, data, bits_offset, show);
/* Restore saved member data here */
show->state.member = m;
if (show->state.depth_check != show->state.depth + 1)
return;
show->state.depth_check = 0;
if (show->state.depth_to_show <= show->state.depth)
return;
/*
* Reaching here indicates we have recursed and found
* non-zero child values.
*/
}
__btf_struct_show(btf, t, type_id, data, bits_offset, show);
}
static struct btf_kind_operations struct_ops = {
.check_meta = btf_struct_check_meta,
.resolve = btf_struct_resolve,
.check_member = btf_struct_check_member,
.check_kflag_member = btf_generic_check_kflag_member,
.log_details = btf_struct_log,
.show = btf_struct_show,
};
static int btf_enum_check_member(struct btf_verifier_env *env,
const struct btf_type *struct_type,
const struct btf_member *member,
const struct btf_type *member_type)
{
u32 struct_bits_off = member->offset;
u32 struct_size, bytes_offset;
if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
btf_verifier_log_member(env, struct_type, member,
"Member is not byte aligned");
return -EINVAL;
}
struct_size = struct_type->size;
bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
if (struct_size - bytes_offset < member_type->size) {
btf_verifier_log_member(env, struct_type, member,
"Member exceeds struct_size");
return -EINVAL;
}
return 0;
}
static int btf_enum_check_kflag_member(struct btf_verifier_env *env,
const struct btf_type *struct_type,
const struct btf_member *member,
const struct btf_type *member_type)
{
u32 struct_bits_off, nr_bits, bytes_end, struct_size;
u32 int_bitsize = sizeof(int) * BITS_PER_BYTE;
struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset);
nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset);
if (!nr_bits) {
if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
btf_verifier_log_member(env, struct_type, member,
"Member is not byte aligned");
return -EINVAL;
}
nr_bits = int_bitsize;
} else if (nr_bits > int_bitsize) {
btf_verifier_log_member(env, struct_type, member,
"Invalid member bitfield_size");
return -EINVAL;
}
struct_size = struct_type->size;
bytes_end = BITS_ROUNDUP_BYTES(struct_bits_off + nr_bits);
if (struct_size < bytes_end) {
btf_verifier_log_member(env, struct_type, member,
"Member exceeds struct_size");
return -EINVAL;
}
return 0;
}
static s32 btf_enum_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
const struct btf_enum *enums = btf_type_enum(t);
struct btf *btf = env->btf;
u16 i, nr_enums;
u32 meta_needed;
nr_enums = btf_type_vlen(t);
meta_needed = nr_enums * sizeof(*enums);
if (meta_left < meta_needed) {
btf_verifier_log_basic(env, t,
"meta_left:%u meta_needed:%u",
meta_left, meta_needed);
return -EINVAL;
}
if (btf_type_kflag(t)) {
btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
return -EINVAL;
}
if (t->size > 8 || !is_power_of_2(t->size)) {
btf_verifier_log_type(env, t, "Unexpected size");
return -EINVAL;
}
/* enum type either no name or a valid one */
if (t->name_off &&
!btf_name_valid_identifier(env->btf, t->name_off)) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
btf_verifier_log_type(env, t, NULL);
for (i = 0; i < nr_enums; i++) {
if (!btf_name_offset_valid(btf, enums[i].name_off)) {
btf_verifier_log(env, "\tInvalid name_offset:%u",
enums[i].name_off);
return -EINVAL;
}
/* enum member must have a valid name */
if (!enums[i].name_off ||
!btf_name_valid_identifier(btf, enums[i].name_off)) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
if (env->log.level == BPF_LOG_KERNEL)
continue;
btf_verifier_log(env, "\t%s val=%d\n",
__btf_name_by_offset(btf, enums[i].name_off),
enums[i].val);
}
return meta_needed;
}
static void btf_enum_log(struct btf_verifier_env *env,
const struct btf_type *t)
{
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
static void btf_enum_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct btf_show *show)
{
const struct btf_enum *enums = btf_type_enum(t);
u32 i, nr_enums = btf_type_vlen(t);
void *safe_data;
int v;
safe_data = btf_show_start_type(show, t, type_id, data);
if (!safe_data)
return;
v = *(int *)safe_data;
for (i = 0; i < nr_enums; i++) {
if (v != enums[i].val)
continue;
btf_show_type_value(show, "%s",
__btf_name_by_offset(btf,
enums[i].name_off));
btf_show_end_type(show);
return;
}
btf_show_type_value(show, "%d", v);
btf_show_end_type(show);
}
static struct btf_kind_operations enum_ops = {
.check_meta = btf_enum_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_enum_check_member,
.check_kflag_member = btf_enum_check_kflag_member,
.log_details = btf_enum_log,
.show = btf_enum_show,
};
static s32 btf_func_proto_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
u32 meta_needed = btf_type_vlen(t) * sizeof(struct btf_param);
if (meta_left < meta_needed) {
btf_verifier_log_basic(env, t,
"meta_left:%u meta_needed:%u",
meta_left, meta_needed);
return -EINVAL;
}
if (t->name_off) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
if (btf_type_kflag(t)) {
btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
return -EINVAL;
}
btf_verifier_log_type(env, t, NULL);
return meta_needed;
}
static void btf_func_proto_log(struct btf_verifier_env *env,
const struct btf_type *t)
{
const struct btf_param *args = (const struct btf_param *)(t + 1);
u16 nr_args = btf_type_vlen(t), i;
btf_verifier_log(env, "return=%u args=(", t->type);
if (!nr_args) {
btf_verifier_log(env, "void");
goto done;
}
if (nr_args == 1 && !args[0].type) {
/* Only one vararg */
btf_verifier_log(env, "vararg");
goto done;
}
btf_verifier_log(env, "%u %s", args[0].type,
__btf_name_by_offset(env->btf,
args[0].name_off));
for (i = 1; i < nr_args - 1; i++)
btf_verifier_log(env, ", %u %s", args[i].type,
__btf_name_by_offset(env->btf,
args[i].name_off));
if (nr_args > 1) {
const struct btf_param *last_arg = &args[nr_args - 1];
if (last_arg->type)
btf_verifier_log(env, ", %u %s", last_arg->type,
__btf_name_by_offset(env->btf,
last_arg->name_off));
else
btf_verifier_log(env, ", vararg");
}
done:
btf_verifier_log(env, ")");
}
static struct btf_kind_operations func_proto_ops = {
.check_meta = btf_func_proto_check_meta,
.resolve = btf_df_resolve,
/*
* BTF_KIND_FUNC_PROTO cannot be directly referred by
* a struct's member.
*
* It should be a function pointer instead.
* (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO)
*
* Hence, there is no btf_func_check_member().
*/
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_func_proto_log,
.show = btf_df_show,
};
static s32 btf_func_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
if (!t->name_off ||
!btf_name_valid_identifier(env->btf, t->name_off)) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
if (btf_type_vlen(t) > BTF_FUNC_GLOBAL) {
btf_verifier_log_type(env, t, "Invalid func linkage");
return -EINVAL;
}
if (btf_type_kflag(t)) {
btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
return -EINVAL;
}
btf_verifier_log_type(env, t, NULL);
return 0;
}
static struct btf_kind_operations func_ops = {
.check_meta = btf_func_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_ref_type_log,
.show = btf_df_show,
};
static s32 btf_var_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
const struct btf_var *var;
u32 meta_needed = sizeof(*var);
if (meta_left < meta_needed) {
btf_verifier_log_basic(env, t,
"meta_left:%u meta_needed:%u",
meta_left, meta_needed);
return -EINVAL;
}
if (btf_type_vlen(t)) {
btf_verifier_log_type(env, t, "vlen != 0");
return -EINVAL;
}
if (btf_type_kflag(t)) {
btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
return -EINVAL;
}
if (!t->name_off ||
!__btf_name_valid(env->btf, t->name_off, true)) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
/* A var cannot be in type void */
if (!t->type || !BTF_TYPE_ID_VALID(t->type)) {
btf_verifier_log_type(env, t, "Invalid type_id");
return -EINVAL;
}
var = btf_type_var(t);
if (var->linkage != BTF_VAR_STATIC &&
var->linkage != BTF_VAR_GLOBAL_ALLOCATED) {
btf_verifier_log_type(env, t, "Linkage not supported");
return -EINVAL;
}
btf_verifier_log_type(env, t, NULL);
return meta_needed;
}
static void btf_var_log(struct btf_verifier_env *env, const struct btf_type *t)
{
const struct btf_var *var = btf_type_var(t);
btf_verifier_log(env, "type_id=%u linkage=%u", t->type, var->linkage);
}
static const struct btf_kind_operations var_ops = {
.check_meta = btf_var_check_meta,
.resolve = btf_var_resolve,
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_var_log,
.show = btf_var_show,
};
static s32 btf_datasec_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
const struct btf_var_secinfo *vsi;
u64 last_vsi_end_off = 0, sum = 0;
u32 i, meta_needed;
meta_needed = btf_type_vlen(t) * sizeof(*vsi);
if (meta_left < meta_needed) {
btf_verifier_log_basic(env, t,
"meta_left:%u meta_needed:%u",
meta_left, meta_needed);
return -EINVAL;
}
if (!t->size) {
btf_verifier_log_type(env, t, "size == 0");
return -EINVAL;
}
if (btf_type_kflag(t)) {
btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
return -EINVAL;
}
if (!t->name_off ||
!btf_name_valid_section(env->btf, t->name_off)) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
btf_verifier_log_type(env, t, NULL);
for_each_vsi(i, t, vsi) {
/* A var cannot be in type void */
if (!vsi->type || !BTF_TYPE_ID_VALID(vsi->type)) {
btf_verifier_log_vsi(env, t, vsi,
"Invalid type_id");
return -EINVAL;
}
if (vsi->offset < last_vsi_end_off || vsi->offset >= t->size) {
btf_verifier_log_vsi(env, t, vsi,
"Invalid offset");
return -EINVAL;
}
if (!vsi->size || vsi->size > t->size) {
btf_verifier_log_vsi(env, t, vsi,
"Invalid size");
return -EINVAL;
}
last_vsi_end_off = vsi->offset + vsi->size;
if (last_vsi_end_off > t->size) {
btf_verifier_log_vsi(env, t, vsi,
"Invalid offset+size");
return -EINVAL;
}
btf_verifier_log_vsi(env, t, vsi, NULL);
sum += vsi->size;
}
if (t->size < sum) {
btf_verifier_log_type(env, t, "Invalid btf_info size");
return -EINVAL;
}
return meta_needed;
}
static int btf_datasec_resolve(struct btf_verifier_env *env,
const struct resolve_vertex *v)
{
const struct btf_var_secinfo *vsi;
struct btf *btf = env->btf;
u16 i;
for_each_vsi_from(i, v->next_member, v->t, vsi) {
u32 var_type_id = vsi->type, type_id, type_size = 0;
const struct btf_type *var_type = btf_type_by_id(env->btf,
var_type_id);
if (!var_type || !btf_type_is_var(var_type)) {
btf_verifier_log_vsi(env, v->t, vsi,
"Not a VAR kind member");
return -EINVAL;
}
if (!env_type_is_resolve_sink(env, var_type) &&
!env_type_is_resolved(env, var_type_id)) {
env_stack_set_next_member(env, i + 1);
return env_stack_push(env, var_type, var_type_id);
}
type_id = var_type->type;
if (!btf_type_id_size(btf, &type_id, &type_size)) {
btf_verifier_log_vsi(env, v->t, vsi, "Invalid type");
return -EINVAL;
}
if (vsi->size < type_size) {
btf_verifier_log_vsi(env, v->t, vsi, "Invalid size");
return -EINVAL;
}
}
env_stack_pop_resolved(env, 0, 0);
return 0;
}
static void btf_datasec_log(struct btf_verifier_env *env,
const struct btf_type *t)
{
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
static void btf_datasec_show(const struct btf *btf,
const struct btf_type *t, u32 type_id,
void *data, u8 bits_offset,
struct btf_show *show)
{
const struct btf_var_secinfo *vsi;
const struct btf_type *var;
u32 i;
if (!btf_show_start_type(show, t, type_id, data))
return;
btf_show_type_value(show, "section (\"%s\") = {",
__btf_name_by_offset(btf, t->name_off));
for_each_vsi(i, t, vsi) {
var = btf_type_by_id(btf, vsi->type);
if (i)
btf_show(show, ",");
btf_type_ops(var)->show(btf, var, vsi->type,
data + vsi->offset, bits_offset, show);
}
btf_show_end_type(show);
}
static const struct btf_kind_operations datasec_ops = {
.check_meta = btf_datasec_check_meta,
.resolve = btf_datasec_resolve,
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_datasec_log,
.show = btf_datasec_show,
};
static s32 btf_float_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
if (btf_type_vlen(t)) {
btf_verifier_log_type(env, t, "vlen != 0");
return -EINVAL;
}
if (btf_type_kflag(t)) {
btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
return -EINVAL;
}
if (t->size != 2 && t->size != 4 && t->size != 8 && t->size != 12 &&
t->size != 16) {
btf_verifier_log_type(env, t, "Invalid type_size");
return -EINVAL;
}
btf_verifier_log_type(env, t, NULL);
return 0;
}
static int btf_float_check_member(struct btf_verifier_env *env,
const struct btf_type *struct_type,
const struct btf_member *member,
const struct btf_type *member_type)
{
u64 start_offset_bytes;
u64 end_offset_bytes;
u64 misalign_bits;
u64 align_bytes;
u64 align_bits;
/* Different architectures have different alignment requirements, so
* here we check only for the reasonable minimum. This way we ensure
* that types after CO-RE can pass the kernel BTF verifier.
*/
align_bytes = min_t(u64, sizeof(void *), member_type->size);
align_bits = align_bytes * BITS_PER_BYTE;
div64_u64_rem(member->offset, align_bits, &misalign_bits);
if (misalign_bits) {
btf_verifier_log_member(env, struct_type, member,
"Member is not properly aligned");
return -EINVAL;
}
start_offset_bytes = member->offset / BITS_PER_BYTE;
end_offset_bytes = start_offset_bytes + member_type->size;
if (end_offset_bytes > struct_type->size) {
btf_verifier_log_member(env, struct_type, member,
"Member exceeds struct_size");
return -EINVAL;
}
return 0;
}
static void btf_float_log(struct btf_verifier_env *env,
const struct btf_type *t)
{
btf_verifier_log(env, "size=%u", t->size);
}
static const struct btf_kind_operations float_ops = {
.check_meta = btf_float_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_float_check_member,
.check_kflag_member = btf_generic_check_kflag_member,
.log_details = btf_float_log,
.show = btf_df_show,
};
static s32 btf_decl_tag_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
const struct btf_decl_tag *tag;
u32 meta_needed = sizeof(*tag);
s32 component_idx;
const char *value;
if (meta_left < meta_needed) {
btf_verifier_log_basic(env, t,
"meta_left:%u meta_needed:%u",
meta_left, meta_needed);
return -EINVAL;
}
value = btf_name_by_offset(env->btf, t->name_off);
if (!value || !value[0]) {
btf_verifier_log_type(env, t, "Invalid value");
return -EINVAL;
}
if (btf_type_vlen(t)) {
btf_verifier_log_type(env, t, "vlen != 0");
return -EINVAL;
}
if (btf_type_kflag(t)) {
btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
return -EINVAL;
}
component_idx = btf_type_decl_tag(t)->component_idx;
if (component_idx < -1) {
btf_verifier_log_type(env, t, "Invalid component_idx");
return -EINVAL;
}
btf_verifier_log_type(env, t, NULL);
return meta_needed;
}
static int btf_decl_tag_resolve(struct btf_verifier_env *env,
const struct resolve_vertex *v)
{
const struct btf_type *next_type;
const struct btf_type *t = v->t;
u32 next_type_id = t->type;
struct btf *btf = env->btf;
s32 component_idx;
u32 vlen;
next_type = btf_type_by_id(btf, next_type_id);
if (!next_type || !btf_type_is_decl_tag_target(next_type)) {
btf_verifier_log_type(env, v->t, "Invalid type_id");
return -EINVAL;
}
if (!env_type_is_resolve_sink(env, next_type) &&
!env_type_is_resolved(env, next_type_id))
return env_stack_push(env, next_type, next_type_id);
component_idx = btf_type_decl_tag(t)->component_idx;
if (component_idx != -1) {
if (btf_type_is_var(next_type) || btf_type_is_typedef(next_type)) {
btf_verifier_log_type(env, v->t, "Invalid component_idx");
return -EINVAL;
}
if (btf_type_is_struct(next_type)) {
vlen = btf_type_vlen(next_type);
} else {
/* next_type should be a function */
next_type = btf_type_by_id(btf, next_type->type);
vlen = btf_type_vlen(next_type);
}
if ((u32)component_idx >= vlen) {
btf_verifier_log_type(env, v->t, "Invalid component_idx");
return -EINVAL;
}
}
env_stack_pop_resolved(env, next_type_id, 0);
return 0;
}
static void btf_decl_tag_log(struct btf_verifier_env *env, const struct btf_type *t)
{
btf_verifier_log(env, "type=%u component_idx=%d", t->type,
btf_type_decl_tag(t)->component_idx);
}
static const struct btf_kind_operations decl_tag_ops = {
.check_meta = btf_decl_tag_check_meta,
.resolve = btf_decl_tag_resolve,
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_decl_tag_log,
.show = btf_df_show,
};
static int btf_func_proto_check(struct btf_verifier_env *env,
const struct btf_type *t)
{
const struct btf_type *ret_type;
const struct btf_param *args;
const struct btf *btf;
u16 nr_args, i;
int err;
btf = env->btf;
args = (const struct btf_param *)(t + 1);
nr_args = btf_type_vlen(t);
/* Check func return type which could be "void" (t->type == 0) */
if (t->type) {
u32 ret_type_id = t->type;
ret_type = btf_type_by_id(btf, ret_type_id);
if (!ret_type) {
btf_verifier_log_type(env, t, "Invalid return type");
return -EINVAL;
}
if (btf_type_needs_resolve(ret_type) &&
!env_type_is_resolved(env, ret_type_id)) {
err = btf_resolve(env, ret_type, ret_type_id);
if (err)
return err;
}
/* Ensure the return type is a type that has a size */
if (!btf_type_id_size(btf, &ret_type_id, NULL)) {
btf_verifier_log_type(env, t, "Invalid return type");
return -EINVAL;
}
}
if (!nr_args)
return 0;
/* Last func arg type_id could be 0 if it is a vararg */
if (!args[nr_args - 1].type) {
if (args[nr_args - 1].name_off) {
btf_verifier_log_type(env, t, "Invalid arg#%u",
nr_args);
return -EINVAL;
}
nr_args--;
}
err = 0;
for (i = 0; i < nr_args; i++) {
const struct btf_type *arg_type;
u32 arg_type_id;
arg_type_id = args[i].type;
arg_type = btf_type_by_id(btf, arg_type_id);
if (!arg_type) {
btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
err = -EINVAL;
break;
}
if (args[i].name_off &&
(!btf_name_offset_valid(btf, args[i].name_off) ||
!btf_name_valid_identifier(btf, args[i].name_off))) {
btf_verifier_log_type(env, t,
"Invalid arg#%u", i + 1);
err = -EINVAL;
break;
}
if (btf_type_needs_resolve(arg_type) &&
!env_type_is_resolved(env, arg_type_id)) {
err = btf_resolve(env, arg_type, arg_type_id);
if (err)
break;
}
if (!btf_type_id_size(btf, &arg_type_id, NULL)) {
btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
err = -EINVAL;
break;
}
}
return err;
}
static int btf_func_check(struct btf_verifier_env *env,
const struct btf_type *t)
{
const struct btf_type *proto_type;
const struct btf_param *args;
const struct btf *btf;
u16 nr_args, i;
btf = env->btf;
proto_type = btf_type_by_id(btf, t->type);
if (!proto_type || !btf_type_is_func_proto(proto_type)) {
btf_verifier_log_type(env, t, "Invalid type_id");
return -EINVAL;
}
args = (const struct btf_param *)(proto_type + 1);
nr_args = btf_type_vlen(proto_type);
for (i = 0; i < nr_args; i++) {
if (!args[i].name_off && args[i].type) {
btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
return -EINVAL;
}
}
return 0;
}
static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_INT] = &int_ops,
[BTF_KIND_PTR] = &ptr_ops,
[BTF_KIND_ARRAY] = &array_ops,
[BTF_KIND_STRUCT] = &struct_ops,
[BTF_KIND_UNION] = &struct_ops,
[BTF_KIND_ENUM] = &enum_ops,
[BTF_KIND_FWD] = &fwd_ops,
[BTF_KIND_TYPEDEF] = &modifier_ops,
[BTF_KIND_VOLATILE] = &modifier_ops,
[BTF_KIND_CONST] = &modifier_ops,
[BTF_KIND_RESTRICT] = &modifier_ops,
[BTF_KIND_FUNC] = &func_ops,
[BTF_KIND_FUNC_PROTO] = &func_proto_ops,
[BTF_KIND_VAR] = &var_ops,
[BTF_KIND_DATASEC] = &datasec_ops,
[BTF_KIND_FLOAT] = &float_ops,
[BTF_KIND_DECL_TAG] = &decl_tag_ops,
[BTF_KIND_TYPE_TAG] = &modifier_ops,
};
static s32 btf_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
u32 saved_meta_left = meta_left;
s32 var_meta_size;
if (meta_left < sizeof(*t)) {
btf_verifier_log(env, "[%u] meta_left:%u meta_needed:%zu",
env->log_type_id, meta_left, sizeof(*t));
return -EINVAL;
}
meta_left -= sizeof(*t);
if (t->info & ~BTF_INFO_MASK) {
btf_verifier_log(env, "[%u] Invalid btf_info:%x",
env->log_type_id, t->info);
return -EINVAL;
}
if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX ||
BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) {
btf_verifier_log(env, "[%u] Invalid kind:%u",
env->log_type_id, BTF_INFO_KIND(t->info));
return -EINVAL;
}
if (!btf_name_offset_valid(env->btf, t->name_off)) {
btf_verifier_log(env, "[%u] Invalid name_offset:%u",
env->log_type_id, t->name_off);
return -EINVAL;
}
var_meta_size = btf_type_ops(t)->check_meta(env, t, meta_left);
if (var_meta_size < 0)
return var_meta_size;
meta_left -= var_meta_size;
return saved_meta_left - meta_left;
}
static int btf_check_all_metas(struct btf_verifier_env *env)
{
struct btf *btf = env->btf;
struct btf_header *hdr;
void *cur, *end;
hdr = &btf->hdr;
cur = btf->nohdr_data + hdr->type_off;
end = cur + hdr->type_len;
env->log_type_id = btf->base_btf ? btf->start_id : 1;
while (cur < end) {
struct btf_type *t = cur;
s32 meta_size;
meta_size = btf_check_meta(env, t, end - cur);
if (meta_size < 0)
return meta_size;
btf_add_type(env, t);
cur += meta_size;
env->log_type_id++;
}
return 0;
}
static bool btf_resolve_valid(struct btf_verifier_env *env,
const struct btf_type *t,
u32 type_id)
{
struct btf *btf = env->btf;
if (!env_type_is_resolved(env, type_id))
return false;
if (btf_type_is_struct(t) || btf_type_is_datasec(t))
return !btf_resolved_type_id(btf, type_id) &&
!btf_resolved_type_size(btf, type_id);
if (btf_type_is_decl_tag(t))
return btf_resolved_type_id(btf, type_id) &&
!btf_resolved_type_size(btf, type_id);
if (btf_type_is_modifier(t) || btf_type_is_ptr(t) ||
btf_type_is_var(t)) {
t = btf_type_id_resolve(btf, &type_id);
return t &&
!btf_type_is_modifier(t) &&
!btf_type_is_var(t) &&
!btf_type_is_datasec(t);
}
if (btf_type_is_array(t)) {
const struct btf_array *array = btf_type_array(t);
const struct btf_type *elem_type;
u32 elem_type_id = array->type;
u32 elem_size;
elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
return elem_type && !btf_type_is_modifier(elem_type) &&
(array->nelems * elem_size ==
btf_resolved_type_size(btf, type_id));
}
return false;
}
static int btf_resolve(struct btf_verifier_env *env,
const struct btf_type *t, u32 type_id)
{
u32 save_log_type_id = env->log_type_id;
const struct resolve_vertex *v;
int err = 0;
env->resolve_mode = RESOLVE_TBD;
env_stack_push(env, t, type_id);
while (!err && (v = env_stack_peak(env))) {
env->log_type_id = v->type_id;
err = btf_type_ops(v->t)->resolve(env, v);
}
env->log_type_id = type_id;
if (err == -E2BIG) {
btf_verifier_log_type(env, t,
"Exceeded max resolving depth:%u",
MAX_RESOLVE_DEPTH);
} else if (err == -EEXIST) {
btf_verifier_log_type(env, t, "Loop detected");
}
/* Final sanity check */
if (!err && !btf_resolve_valid(env, t, type_id)) {
btf_verifier_log_type(env, t, "Invalid resolve state");
err = -EINVAL;
}
env->log_type_id = save_log_type_id;
return err;
}
static int btf_check_all_types(struct btf_verifier_env *env)
{
struct btf *btf = env->btf;
const struct btf_type *t;
u32 type_id, i;
int err;
err = env_resolve_init(env);
if (err)
return err;
env->phase++;
for (i = btf->base_btf ? 0 : 1; i < btf->nr_types; i++) {
type_id = btf->start_id + i;
t = btf_type_by_id(btf, type_id);
env->log_type_id = type_id;
if (btf_type_needs_resolve(t) &&
!env_type_is_resolved(env, type_id)) {
err = btf_resolve(env, t, type_id);
if (err)
return err;
}
if (btf_type_is_func_proto(t)) {
err = btf_func_proto_check(env, t);
if (err)
return err;
}
if (btf_type_is_func(t)) {
err = btf_func_check(env, t);
if (err)
return err;
}
}
return 0;
}
static int btf_parse_type_sec(struct btf_verifier_env *env)
{
const struct btf_header *hdr = &env->btf->hdr;
int err;
/* Type section must align to 4 bytes */
if (hdr->type_off & (sizeof(u32) - 1)) {
btf_verifier_log(env, "Unaligned type_off");
return -EINVAL;
}
if (!env->btf->base_btf && !hdr->type_len) {
btf_verifier_log(env, "No type found");
return -EINVAL;
}
err = btf_check_all_metas(env);
if (err)
return err;
return btf_check_all_types(env);
}
static int btf_parse_str_sec(struct btf_verifier_env *env)
{
const struct btf_header *hdr;
struct btf *btf = env->btf;
const char *start, *end;
hdr = &btf->hdr;
start = btf->nohdr_data + hdr->str_off;
end = start + hdr->str_len;
if (end != btf->data + btf->data_size) {
btf_verifier_log(env, "String section is not at the end");
return -EINVAL;
}
btf->strings = start;
if (btf->base_btf && !hdr->str_len)
return 0;
if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || end[-1]) {
btf_verifier_log(env, "Invalid string section");
return -EINVAL;
}
if (!btf->base_btf && start[0]) {
btf_verifier_log(env, "Invalid string section");
return -EINVAL;
}
return 0;
}
static const size_t btf_sec_info_offset[] = {
offsetof(struct btf_header, type_off),
offsetof(struct btf_header, str_off),
};
static int btf_sec_info_cmp(const void *a, const void *b)
{
const struct btf_sec_info *x = a;
const struct btf_sec_info *y = b;
return (int)(x->off - y->off) ? : (int)(x->len - y->len);
}
static int btf_check_sec_info(struct btf_verifier_env *env,
u32 btf_data_size)
{
struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)];
u32 total, expected_total, i;
const struct btf_header *hdr;
const struct btf *btf;
btf = env->btf;
hdr = &btf->hdr;
/* Populate the secs from hdr */
for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++)
secs[i] = *(struct btf_sec_info *)((void *)hdr +
btf_sec_info_offset[i]);
sort(secs, ARRAY_SIZE(btf_sec_info_offset),
sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL);
/* Check for gaps and overlap among sections */
total = 0;
expected_total = btf_data_size - hdr->hdr_len;
for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) {
if (expected_total < secs[i].off) {
btf_verifier_log(env, "Invalid section offset");
return -EINVAL;
}
if (total < secs[i].off) {
/* gap */
btf_verifier_log(env, "Unsupported section found");
return -EINVAL;
}
if (total > secs[i].off) {
btf_verifier_log(env, "Section overlap found");
return -EINVAL;
}
if (expected_total - total < secs[i].len) {
btf_verifier_log(env,
"Total section length too long");
return -EINVAL;
}
total += secs[i].len;
}
/* There is data other than hdr and known sections */
if (expected_total != total) {
btf_verifier_log(env, "Unsupported section found");
return -EINVAL;
}
return 0;
}
static int btf_parse_hdr(struct btf_verifier_env *env)
{
u32 hdr_len, hdr_copy, btf_data_size;
const struct btf_header *hdr;
struct btf *btf;
int err;
btf = env->btf;
btf_data_size = btf->data_size;
if (btf_data_size <
offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) {
btf_verifier_log(env, "hdr_len not found");
return -EINVAL;
}
hdr = btf->data;
hdr_len = hdr->hdr_len;
if (btf_data_size < hdr_len) {
btf_verifier_log(env, "btf_header not found");
return -EINVAL;
}
/* Ensure the unsupported header fields are zero */
if (hdr_len > sizeof(btf->hdr)) {
u8 *expected_zero = btf->data + sizeof(btf->hdr);
u8 *end = btf->data + hdr_len;
for (; expected_zero < end; expected_zero++) {
if (*expected_zero) {
btf_verifier_log(env, "Unsupported btf_header");
return -E2BIG;
}
}
}
hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr));
memcpy(&btf->hdr, btf->data, hdr_copy);
hdr = &btf->hdr;
btf_verifier_log_hdr(env, btf_data_size);
if (hdr->magic != BTF_MAGIC) {
btf_verifier_log(env, "Invalid magic");
return -EINVAL;
}
if (hdr->version != BTF_VERSION) {
btf_verifier_log(env, "Unsupported version");
return -ENOTSUPP;
}
if (hdr->flags) {
btf_verifier_log(env, "Unsupported flags");
return -ENOTSUPP;
}
if (!btf->base_btf && btf_data_size == hdr->hdr_len) {
btf_verifier_log(env, "No data");
return -EINVAL;
}
err = btf_check_sec_info(env, btf_data_size);
if (err)
return err;
return 0;
}
static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
u32 log_level, char __user *log_ubuf, u32 log_size)
{
struct btf_verifier_env *env = NULL;
struct bpf_verifier_log *log;
struct btf *btf = NULL;
u8 *data;
int err;
if (btf_data_size > BTF_MAX_SIZE)
return ERR_PTR(-E2BIG);
env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
if (!env)
return ERR_PTR(-ENOMEM);
log = &env->log;
if (log_level || log_ubuf || log_size) {
/* user requested verbose verifier output
* and supplied buffer to store the verification trace
*/
log->level = log_level;
log->ubuf = log_ubuf;
log->len_total = log_size;
/* log attributes have to be sane */
if (!bpf_verifier_log_attr_valid(log)) {
err = -EINVAL;
goto errout;
}
}
btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
if (!btf) {
err = -ENOMEM;
goto errout;
}
env->btf = btf;
data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
if (!data) {
err = -ENOMEM;
goto errout;
}
btf->data = data;
btf->data_size = btf_data_size;
if (copy_from_bpfptr(data, btf_data, btf_data_size)) {
err = -EFAULT;
goto errout;
}
err = btf_parse_hdr(env);
if (err)
goto errout;
btf->nohdr_data = btf->data + btf->hdr.hdr_len;
err = btf_parse_str_sec(env);
if (err)
goto errout;
err = btf_parse_type_sec(env);
if (err)
goto errout;
if (log->level && bpf_verifier_log_full(log)) {
err = -ENOSPC;
goto errout;
}
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
return btf;
errout:
btf_verifier_env_free(env);
if (btf)
btf_free(btf);
return ERR_PTR(err);
}
extern char __weak __start_BTF[];
extern char __weak __stop_BTF[];
extern struct btf *btf_vmlinux;
#define BPF_MAP_TYPE(_id, _ops)
#define BPF_LINK_TYPE(_id, _name)
static union {
struct bpf_ctx_convert {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
prog_ctx_type _id##_prog; \
kern_ctx_type _id##_kern;
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
} *__t;
/* 't' is written once under lock. Read many times. */
const struct btf_type *t;
} bpf_ctx_convert;
enum {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
__ctx_convert##_id,
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
__ctx_convert_unused, /* to avoid empty enum in extreme .config */
};
static u8 bpf_ctx_convert_map[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
[_id] = __ctx_convert##_id,
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
0, /* avoid empty array */
};
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
static const struct btf_member *
btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, enum bpf_prog_type prog_type,
int arg)
{
const struct btf_type *conv_struct;
const struct btf_type *ctx_struct;
const struct btf_member *ctx_type;
const char *tname, *ctx_tname;
conv_struct = bpf_ctx_convert.t;
if (!conv_struct) {
bpf_log(log, "btf_vmlinux is malformed\n");
return NULL;
}
t = btf_type_by_id(btf, t->type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_struct(t)) {
/* Only pointer to struct is supported for now.
* That means that BPF_PROG_TYPE_TRACEPOINT with BTF
* is not supported yet.
* BPF_PROG_TYPE_RAW_TRACEPOINT is fine.
*/
return NULL;
}
tname = btf_name_by_offset(btf, t->name_off);
if (!tname) {
bpf_log(log, "arg#%d struct doesn't have a name\n", arg);
return NULL;
}
/* prog_type is valid bpf program type. No need for bounds check. */
ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2;
/* ctx_struct is a pointer to prog_ctx_type in vmlinux.
* Like 'struct __sk_buff'
*/
ctx_struct = btf_type_by_id(btf_vmlinux, ctx_type->type);
if (!ctx_struct)
/* should not happen */
return NULL;
ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off);
if (!ctx_tname) {
/* should not happen */
bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n");
return NULL;
}
/* only compare that prog's ctx type name is the same as
* kernel expects. No need to compare field by field.
* It's ok for bpf prog to do:
* struct __sk_buff {};
* int socket_filter_bpf_prog(struct __sk_buff *skb)
* { // no fields of skb are ever used }
*/
if (strcmp(ctx_tname, tname))
return NULL;
return ctx_type;
}
static const struct bpf_map_ops * const btf_vmlinux_map_ops[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_LINK_TYPE(_id, _name)
#define BPF_MAP_TYPE(_id, _ops) \
[_id] = &_ops,
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_LINK_TYPE
#undef BPF_MAP_TYPE
};
static int btf_vmlinux_map_ids_init(const struct btf *btf,
struct bpf_verifier_log *log)
{
const struct bpf_map_ops *ops;
int i, btf_id;
for (i = 0; i < ARRAY_SIZE(btf_vmlinux_map_ops); ++i) {
ops = btf_vmlinux_map_ops[i];
if (!ops || (!ops->map_btf_name && !ops->map_btf_id))
continue;
if (!ops->map_btf_name || !ops->map_btf_id) {
bpf_log(log, "map type %d is misconfigured\n", i);
return -EINVAL;
}
btf_id = btf_find_by_name_kind(btf, ops->map_btf_name,
BTF_KIND_STRUCT);
if (btf_id < 0)
return btf_id;
*ops->map_btf_id = btf_id;
}
return 0;
}
static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
struct btf *btf,
const struct btf_type *t,
enum bpf_prog_type prog_type,
int arg)
{
const struct btf_member *prog_ctx_type, *kern_ctx_type;
prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type, arg);
if (!prog_ctx_type)
return -ENOENT;
kern_ctx_type = prog_ctx_type + 1;
return kern_ctx_type->type;
}
BTF_ID_LIST(bpf_ctx_convert_btf_id)
BTF_ID(struct, bpf_ctx_convert)
struct btf *btf_parse_vmlinux(void)
{
struct btf_verifier_env *env = NULL;
struct bpf_verifier_log *log;
struct btf *btf = NULL;
int err;
env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
if (!env)
return ERR_PTR(-ENOMEM);
log = &env->log;
log->level = BPF_LOG_KERNEL;
btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
if (!btf) {
err = -ENOMEM;
goto errout;
}
env->btf = btf;
btf->data = __start_BTF;
btf->data_size = __stop_BTF - __start_BTF;
btf->kernel_btf = true;
snprintf(btf->name, sizeof(btf->name), "vmlinux");
err = btf_parse_hdr(env);
if (err)
goto errout;
btf->nohdr_data = btf->data + btf->hdr.hdr_len;
err = btf_parse_str_sec(env);
if (err)
goto errout;
err = btf_check_all_metas(env);
if (err)
goto errout;
/* btf_parse_vmlinux() runs under bpf_verifier_lock */
bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
/* find bpf map structs for map_ptr access checking */
err = btf_vmlinux_map_ids_init(btf, log);
if (err < 0)
goto errout;
bpf_struct_ops_init(btf, log);
refcount_set(&btf->refcnt, 1);
err = btf_alloc_id(btf);
if (err)
goto errout;
btf_verifier_env_free(env);
return btf;
errout:
btf_verifier_env_free(env);
if (btf) {
kvfree(btf->types);
kfree(btf);
}
return ERR_PTR(err);
}
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size)
{
struct btf_verifier_env *env = NULL;
struct bpf_verifier_log *log;
struct btf *btf = NULL, *base_btf;
int err;
base_btf = bpf_get_btf_vmlinux();
if (IS_ERR(base_btf))
return base_btf;
if (!base_btf)
return ERR_PTR(-EINVAL);
env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
if (!env)
return ERR_PTR(-ENOMEM);
log = &env->log;
log->level = BPF_LOG_KERNEL;
btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
if (!btf) {
err = -ENOMEM;
goto errout;
}
env->btf = btf;
btf->base_btf = base_btf;
btf->start_id = base_btf->nr_types;
btf->start_str_off = base_btf->hdr.str_len;
btf->kernel_btf = true;
snprintf(btf->name, sizeof(btf->name), "%s", module_name);
btf->data = kvmalloc(data_size, GFP_KERNEL | __GFP_NOWARN);
if (!btf->data) {
err = -ENOMEM;
goto errout;
}
memcpy(btf->data, data, data_size);
btf->data_size = data_size;
err = btf_parse_hdr(env);
if (err)
goto errout;
btf->nohdr_data = btf->data + btf->hdr.hdr_len;
err = btf_parse_str_sec(env);
if (err)
goto errout;
err = btf_check_all_metas(env);
if (err)
goto errout;
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
return btf;
errout:
btf_verifier_env_free(env);
if (btf) {
kvfree(btf->data);
kvfree(btf->types);
kfree(btf);
}
return ERR_PTR(err);
}
#endif /* CONFIG_DEBUG_INFO_BTF_MODULES */
struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
{
struct bpf_prog *tgt_prog = prog->aux->dst_prog;
if (tgt_prog)
return tgt_prog->aux->btf;
else
return prog->aux->attach_btf;
}
static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
{
/* t comes in already as a pointer */
t = btf_type_by_id(btf, t->type);
/* allow const */
if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST)
t = btf_type_by_id(btf, t->type);
return btf_type_is_int(t);
}
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
const struct btf_type *t = prog->aux->attach_func_proto;
struct bpf_prog *tgt_prog = prog->aux->dst_prog;
struct btf *btf = bpf_prog_get_target_btf(prog);
const char *tname = prog->aux->attach_func_name;
struct bpf_verifier_log *log = info->log;
const struct btf_param *args;
u32 nr_args, arg;
int i, ret;
if (off % 8) {
bpf_log(log, "func '%s' offset %d is not multiple of 8\n",
tname, off);
return false;
}
arg = off / 8;
args = (const struct btf_param *)(t + 1);
/* if (t == NULL) Fall back to default BPF prog with
* MAX_BPF_FUNC_REG_ARGS u64 arguments.
*/
nr_args = t ? btf_type_vlen(t) : MAX_BPF_FUNC_REG_ARGS;
if (prog->aux->attach_btf_trace) {
/* skip first 'void *__data' argument in btf_trace_##name typedef */
args++;
nr_args--;
}
if (arg > nr_args) {
bpf_log(log, "func '%s' doesn't have %d-th argument\n",
tname, arg + 1);
return false;
}
if (arg == nr_args) {
switch (prog->expected_attach_type) {
case BPF_LSM_MAC:
case BPF_TRACE_FEXIT:
/* When LSM programs are attached to void LSM hooks
* they use FEXIT trampolines and when attached to
* int LSM hooks, they use MODIFY_RETURN trampolines.
*
* While the LSM programs are BPF_MODIFY_RETURN-like
* the check:
*
* if (ret_type != 'int')
* return -EINVAL;
*
* is _not_ done here. This is still safe as LSM hooks
* have only void and int return types.
*/
if (!t)
return true;
t = btf_type_by_id(btf, t->type);
break;
case BPF_MODIFY_RETURN:
/* For now the BPF_MODIFY_RETURN can only be attached to
* functions that return an int.
*/
if (!t)
return false;
t = btf_type_skip_modifiers(btf, t->type, NULL);
if (!btf_type_is_small_int(t)) {
bpf_log(log,
"ret type %s not allowed for fmod_ret\n",
btf_kind_str[BTF_INFO_KIND(t->info)]);
return false;
}
break;
default:
bpf_log(log, "func '%s' doesn't have %d-th argument\n",
tname, arg + 1);
return false;
}
} else {
if (!t)
/* Default prog with MAX_BPF_FUNC_REG_ARGS args */
return true;
t = btf_type_by_id(btf, args[arg].type);
}
/* skip modifiers */
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
if (btf_type_is_small_int(t) || btf_type_is_enum(t))
/* accessing a scalar */
return true;
if (!btf_type_is_ptr(t)) {
bpf_log(log,
"func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n",
tname, arg,
__btf_name_by_offset(btf, t->name_off),
btf_kind_str[BTF_INFO_KIND(t->info)]);
return false;
}
/* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */
for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
u32 type, flag;
type = base_type(ctx_arg_info->reg_type);
flag = type_flag(ctx_arg_info->reg_type);
if (ctx_arg_info->offset == off && type == PTR_TO_BUF &&
(flag & PTR_MAYBE_NULL)) {
info->reg_type = ctx_arg_info->reg_type;
return true;
}
}
if (t->type == 0)
/* This is a pointer to void.
* It is the same as scalar from the verifier safety pov.
* No further pointer walking is allowed.
*/
return true;
if (is_int_ptr(btf, t))
return true;
/* this is a pointer to another type */
for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
if (ctx_arg_info->offset == off) {
if (!ctx_arg_info->btf_id) {
bpf_log(log,"invalid btf_id for context argument offset %u\n", off);
return false;
}
info->reg_type = ctx_arg_info->reg_type;
info->btf = btf_vmlinux;
info->btf_id = ctx_arg_info->btf_id;
return true;
}
}
info->reg_type = PTR_TO_BTF_ID;
if (tgt_prog) {
enum bpf_prog_type tgt_type;
if (tgt_prog->type == BPF_PROG_TYPE_EXT)
tgt_type = tgt_prog->aux->saved_dst_prog_type;
else
tgt_type = tgt_prog->type;
ret = btf_translate_to_vmlinux(log, btf, t, tgt_type, arg);
if (ret > 0) {
info->btf = btf_vmlinux;
info->btf_id = ret;
return true;
} else {
return false;
}
}
info->btf = btf;
info->btf_id = t->type;
t = btf_type_by_id(btf, t->type);
/* skip modifiers */
while (btf_type_is_modifier(t)) {
info->btf_id = t->type;
t = btf_type_by_id(btf, t->type);
}
if (!btf_type_is_struct(t)) {
bpf_log(log,
"func '%s' arg%d type %s is not a struct\n",
tname, arg, btf_kind_str[BTF_INFO_KIND(t->info)]);
return false;
}
bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n",
tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)],
__btf_name_by_offset(btf, t->name_off));
return true;
}
enum bpf_struct_walk_result {
/* < 0 error */
WALK_SCALAR = 0,
WALK_PTR,
WALK_STRUCT,
};
static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, int off, int size,
u32 *next_btf_id)
{
u32 i, moff, mtrue_end, msize = 0, total_nelems = 0;
const struct btf_type *mtype, *elem_type = NULL;
const struct btf_member *member;
const char *tname, *mname;
u32 vlen, elem_id, mid;
again:
tname = __btf_name_by_offset(btf, t->name_off);
if (!btf_type_is_struct(t)) {
bpf_log(log, "Type '%s' is not a struct\n", tname);
return -EINVAL;
}
vlen = btf_type_vlen(t);
if (off + size > t->size) {
/* If the last element is a variable size array, we may
* need to relax the rule.
*/
struct btf_array *array_elem;
if (vlen == 0)
goto error;
member = btf_type_member(t) + vlen - 1;
mtype = btf_type_skip_modifiers(btf, member->type,
NULL);
if (!btf_type_is_array(mtype))
goto error;
array_elem = (struct btf_array *)(mtype + 1);
if (array_elem->nelems != 0)
goto error;
moff = __btf_member_bit_offset(t, member) / 8;
if (off < moff)
goto error;
/* Only allow structure for now, can be relaxed for
* other types later.
*/
t = btf_type_skip_modifiers(btf, array_elem->type,
NULL);
if (!btf_type_is_struct(t))
goto error;
off = (off - moff) % t->size;
goto again;
error:
bpf_log(log, "access beyond struct %s at off %u size %u\n",
tname, off, size);
return -EACCES;
}
for_each_member(i, t, member) {
/* offset of the field in bytes */
moff = __btf_member_bit_offset(t, member) / 8;
if (off + size <= moff)
/* won't find anything, field is already too far */
break;
if (__btf_member_bitfield_size(t, member)) {
u32 end_bit = __btf_member_bit_offset(t, member) +
__btf_member_bitfield_size(t, member);
/* off <= moff instead of off == moff because clang
* does not generate a BTF member for anonymous
* bitfield like the ":16" here:
* struct {
* int :16;
* int x:8;
* };
*/
if (off <= moff &&
BITS_ROUNDUP_BYTES(end_bit) <= off + size)
return WALK_SCALAR;
/* off may be accessing a following member
*
* or
*
* Doing partial access at either end of this
* bitfield. Continue on this case also to
* treat it as not accessing this bitfield
* and eventually error out as field not
* found to keep it simple.
* It could be relaxed if there was a legit
* partial access case later.
*/
continue;
}
/* In case of "off" is pointing to holes of a struct */
if (off < moff)
break;
/* type of the field */
mid = member->type;
mtype = btf_type_by_id(btf, member->type);
mname = __btf_name_by_offset(btf, member->name_off);
mtype = __btf_resolve_size(btf, mtype, &msize,
&elem_type, &elem_id, &total_nelems,
&mid);
if (IS_ERR(mtype)) {
bpf_log(log, "field %s doesn't have size\n", mname);
return -EFAULT;
}
mtrue_end = moff + msize;
if (off >= mtrue_end)
/* no overlap with member, keep iterating */
continue;
if (btf_type_is_array(mtype)) {
u32 elem_idx;
/* __btf_resolve_size() above helps to
* linearize a multi-dimensional array.
*
* The logic here is treating an array
* in a struct as the following way:
*
* struct outer {
* struct inner array[2][2];
* };
*
* looks like:
*
* struct outer {
* struct inner array_elem0;
* struct inner array_elem1;
* struct inner array_elem2;
* struct inner array_elem3;
* };
*
* When accessing outer->array[1][0], it moves
* moff to "array_elem2", set mtype to
* "struct inner", and msize also becomes
* sizeof(struct inner). Then most of the
* remaining logic will fall through without
* caring the current member is an array or
* not.
*
* Unlike mtype/msize/moff, mtrue_end does not
* change. The naming difference ("_true") tells
* that it is not always corresponding to
* the current mtype/msize/moff.
* It is the true end of the current
* member (i.e. array in this case). That
* will allow an int array to be accessed like
* a scratch space,
* i.e. allow access beyond the size of
* the array's element as long as it is
* within the mtrue_end boundary.
*/
/* skip empty array */
if (moff == mtrue_end)
continue;
msize /= total_nelems;
elem_idx = (off - moff) / msize;
moff += elem_idx * msize;
mtype = elem_type;
mid = elem_id;
}
/* the 'off' we're looking for is either equal to start
* of this field or inside of this struct
*/
if (btf_type_is_struct(mtype)) {
/* our field must be inside that union or struct */
t = mtype;
/* return if the offset matches the member offset */
if (off == moff) {
*next_btf_id = mid;
return WALK_STRUCT;
}
/* adjust offset we're looking for */
off -= moff;
goto again;
}
if (btf_type_is_ptr(mtype)) {
const struct btf_type *stype;
u32 id;
if (msize != size || off != moff) {
bpf_log(log,
"cannot access ptr member %s with moff %u in struct %s with off %u size %u\n",
mname, moff, tname, off, size);
return -EACCES;
}
stype = btf_type_skip_modifiers(btf, mtype->type, &id);
if (btf_type_is_struct(stype)) {
*next_btf_id = id;
return WALK_PTR;
}
}
/* Allow more flexible access within an int as long as
* it is within mtrue_end.
* Since mtrue_end could be the end of an array,
* that also allows using an array of int as a scratch
* space. e.g. skb->cb[].
*/
if (off + size > mtrue_end) {
bpf_log(log,
"access beyond the end of member %s (mend:%u) in struct %s with off %u size %u\n",
mname, mtrue_end, tname, off, size);
return -EACCES;
}
return WALK_SCALAR;
}
bpf_log(log, "struct %s doesn't have field at offset %d\n", tname, off);
return -EINVAL;
}
int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, int off, int size,
enum bpf_access_type atype __maybe_unused,
u32 *next_btf_id)
{
int err;
u32 id;
do {
err = btf_struct_walk(log, btf, t, off, size, &id);
switch (err) {
case WALK_PTR:
/* If we found the pointer or scalar on t+off,
* we're done.
*/
*next_btf_id = id;
return PTR_TO_BTF_ID;
case WALK_SCALAR:
return SCALAR_VALUE;
case WALK_STRUCT:
/* We found nested struct, so continue the search
* by diving in it. At this point the offset is
* aligned with the new type, so set it to 0.
*/
t = btf_type_by_id(btf, id);
off = 0;
break;
default:
/* It's either error or unknown return value..
* scream and leave.
*/
if (WARN_ONCE(err > 0, "unknown btf_struct_walk return value"))
return -EINVAL;
return err;
}
} while (t);
return -EINVAL;
}
/* Check that two BTF types, each specified as an BTF object + id, are exactly
* the same. Trivial ID check is not enough due to module BTFs, because we can
* end up with two different module BTFs, but IDs point to the common type in
* vmlinux BTF.
*/
static bool btf_types_are_same(const struct btf *btf1, u32 id1,
const struct btf *btf2, u32 id2)
{
if (id1 != id2)
return false;
if (btf1 == btf2)
return true;
return btf_type_by_id(btf1, id1) == btf_type_by_id(btf2, id2);
}
bool btf_struct_ids_match(struct bpf_verifier_log *log,
const struct btf *btf, u32 id, int off,
const struct btf *need_btf, u32 need_type_id)
{
const struct btf_type *type;
int err;
/* Are we already done? */
if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id))
return true;
again:
type = btf_type_by_id(btf, id);
if (!type)
return false;
err = btf_struct_walk(log, btf, type, off, 1, &id);
if (err != WALK_STRUCT)
return false;
/* We found nested struct object. If it matches
* the requested ID, we're done. Otherwise let's
* continue the search with offset 0 in the new
* type.
*/
if (!btf_types_are_same(btf, id, need_btf, need_type_id)) {
off = 0;
goto again;
}
return true;
}
static int __get_type_size(struct btf *btf, u32 btf_id,
const struct btf_type **bad_type)
{
const struct btf_type *t;
if (!btf_id)
/* void */
return 0;
t = btf_type_by_id(btf, btf_id);
while (t && btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
if (!t) {
*bad_type = btf_type_by_id(btf, 0);
return -EINVAL;
}
if (btf_type_is_ptr(t))
/* kernel size of pointer. Not BPF's size of pointer*/
return sizeof(void *);
if (btf_type_is_int(t) || btf_type_is_enum(t))
return t->size;
*bad_type = t;
return -EINVAL;
}
int btf_distill_func_proto(struct bpf_verifier_log *log,
struct btf *btf,
const struct btf_type *func,
const char *tname,
struct btf_func_model *m)
{
const struct btf_param *args;
const struct btf_type *t;
u32 i, nargs;
int ret;
if (!func) {
/* BTF function prototype doesn't match the verifier types.
* Fall back to MAX_BPF_FUNC_REG_ARGS u64 args.
*/
for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
m->arg_size[i] = 8;
m->ret_size = 8;
m->nr_args = MAX_BPF_FUNC_REG_ARGS;
return 0;
}
args = (const struct btf_param *)(func + 1);
nargs = btf_type_vlen(func);
if (nargs >= MAX_BPF_FUNC_ARGS) {
bpf_log(log,
"The function %s has %d arguments. Too many.\n",
tname, nargs);
return -EINVAL;
}
ret = __get_type_size(btf, func->type, &t);
if (ret < 0) {
bpf_log(log,
"The function %s return type %s is unsupported.\n",
tname, btf_kind_str[BTF_INFO_KIND(t->info)]);
return -EINVAL;
}
m->ret_size = ret;
for (i = 0; i < nargs; i++) {
if (i == nargs - 1 && args[i].type == 0) {
bpf_log(log,
"The function %s with variable args is unsupported.\n",
tname);
return -EINVAL;
}
ret = __get_type_size(btf, args[i].type, &t);
if (ret < 0) {
bpf_log(log,
"The function %s arg%d type %s is unsupported.\n",
tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
return -EINVAL;
}
if (ret == 0) {
bpf_log(log,
"The function %s has malformed void argument.\n",
tname);
return -EINVAL;
}
m->arg_size[i] = ret;
}
m->nr_args = nargs;
return 0;
}
/* Compare BTFs of two functions assuming only scalars and pointers to context.
* t1 points to BTF_KIND_FUNC in btf1
* t2 points to BTF_KIND_FUNC in btf2
* Returns:
* EINVAL - function prototype mismatch
* EFAULT - verifier bug
* 0 - 99% match. The last 1% is validated by the verifier.
*/
static int btf_check_func_type_match(struct bpf_verifier_log *log,
struct btf *btf1, const struct btf_type *t1,
struct btf *btf2, const struct btf_type *t2)
{
const struct btf_param *args1, *args2;
const char *fn1, *fn2, *s1, *s2;
u32 nargs1, nargs2, i;
fn1 = btf_name_by_offset(btf1, t1->name_off);
fn2 = btf_name_by_offset(btf2, t2->name_off);
if (btf_func_linkage(t1) != BTF_FUNC_GLOBAL) {
bpf_log(log, "%s() is not a global function\n", fn1);
return -EINVAL;
}
if (btf_func_linkage(t2) != BTF_FUNC_GLOBAL) {
bpf_log(log, "%s() is not a global function\n", fn2);
return -EINVAL;
}
t1 = btf_type_by_id(btf1, t1->type);
if (!t1 || !btf_type_is_func_proto(t1))
return -EFAULT;
t2 = btf_type_by_id(btf2, t2->type);
if (!t2 || !btf_type_is_func_proto(t2))
return -EFAULT;
args1 = (const struct btf_param *)(t1 + 1);
nargs1 = btf_type_vlen(t1);
args2 = (const struct btf_param *)(t2 + 1);
nargs2 = btf_type_vlen(t2);
if (nargs1 != nargs2) {
bpf_log(log, "%s() has %d args while %s() has %d args\n",
fn1, nargs1, fn2, nargs2);
return -EINVAL;
}
t1 = btf_type_skip_modifiers(btf1, t1->type, NULL);
t2 = btf_type_skip_modifiers(btf2, t2->type, NULL);
if (t1->info != t2->info) {
bpf_log(log,
"Return type %s of %s() doesn't match type %s of %s()\n",
btf_type_str(t1), fn1,
btf_type_str(t2), fn2);
return -EINVAL;
}
for (i = 0; i < nargs1; i++) {
t1 = btf_type_skip_modifiers(btf1, args1[i].type, NULL);
t2 = btf_type_skip_modifiers(btf2, args2[i].type, NULL);
if (t1->info != t2->info) {
bpf_log(log, "arg%d in %s() is %s while %s() has %s\n",
i, fn1, btf_type_str(t1),
fn2, btf_type_str(t2));
return -EINVAL;
}
if (btf_type_has_size(t1) && t1->size != t2->size) {
bpf_log(log,
"arg%d in %s() has size %d while %s() has %d\n",
i, fn1, t1->size,
fn2, t2->size);
return -EINVAL;
}
/* global functions are validated with scalars and pointers
* to context only. And only global functions can be replaced.
* Hence type check only those types.
*/
if (btf_type_is_int(t1) || btf_type_is_enum(t1))
continue;
if (!btf_type_is_ptr(t1)) {
bpf_log(log,
"arg%d in %s() has unrecognized type\n",
i, fn1);
return -EINVAL;
}
t1 = btf_type_skip_modifiers(btf1, t1->type, NULL);
t2 = btf_type_skip_modifiers(btf2, t2->type, NULL);
if (!btf_type_is_struct(t1)) {
bpf_log(log,
"arg%d in %s() is not a pointer to context\n",
i, fn1);
return -EINVAL;
}
if (!btf_type_is_struct(t2)) {
bpf_log(log,
"arg%d in %s() is not a pointer to context\n",
i, fn2);
return -EINVAL;
}
/* This is an optional check to make program writing easier.
* Compare names of structs and report an error to the user.
* btf_prepare_func_args() already checked that t2 struct
* is a context type. btf_prepare_func_args() will check
* later that t1 struct is a context type as well.
*/
s1 = btf_name_by_offset(btf1, t1->name_off);
s2 = btf_name_by_offset(btf2, t2->name_off);
if (strcmp(s1, s2)) {
bpf_log(log,
"arg%d %s(struct %s *) doesn't match %s(struct %s *)\n",
i, fn1, s1, fn2, s2);
return -EINVAL;
}
}
return 0;
}
/* Compare BTFs of given program with BTF of target program */
int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
struct btf *btf2, const struct btf_type *t2)
{
struct btf *btf1 = prog->aux->btf;
const struct btf_type *t1;
u32 btf_id = 0;
if (!prog->aux->func_info) {
bpf_log(log, "Program extension requires BTF\n");
return -EINVAL;
}
btf_id = prog->aux->func_info[0].type_id;
if (!btf_id)
return -EFAULT;
t1 = btf_type_by_id(btf1, btf_id);
if (!t1 || !btf_type_is_func(t1))
return -EFAULT;
return btf_check_func_type_match(log, btf1, t1, btf2, t2);
}
static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
#ifdef CONFIG_NET
[PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
[PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
[PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
#endif
};
/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
const struct btf *btf,
const struct btf_type *t, int rec)
{
const struct btf_type *member_type;
const struct btf_member *member;
u32 i;
if (!btf_type_is_struct(t))
return false;
for_each_member(i, t, member) {
const struct btf_array *array;
member_type = btf_type_skip_modifiers(btf, member->type, NULL);
if (btf_type_is_struct(member_type)) {
if (rec >= 3) {
bpf_log(log, "max struct nesting depth exceeded\n");
return false;
}
if (!__btf_type_is_scalar_struct(log, btf, member_type, rec + 1))
return false;
continue;
}
if (btf_type_is_array(member_type)) {
array = btf_type_array(member_type);
if (!array->nelems)
return false;
member_type = btf_type_skip_modifiers(btf, array->type, NULL);
if (!btf_type_is_scalar(member_type))
return false;
continue;
}
if (!btf_type_is_scalar(member_type))
return false;
}
return true;
}
static int btf_check_func_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
struct bpf_reg_state *regs,
bool ptr_to_mem_ok)
{
struct bpf_verifier_log *log = &env->log;
bool is_kfunc = btf_is_kernel(btf);
const char *func_name, *ref_tname;
const struct btf_type *t, *ref_t;
const struct btf_param *args;
u32 i, nargs, ref_id;
t = btf_type_by_id(btf, func_id);
if (!t || !btf_type_is_func(t)) {
/* These checks were already done by the verifier while loading
* struct bpf_func_info or in add_kfunc_call().
*/
bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n",
func_id);
return -EFAULT;
}
func_name = btf_name_by_offset(btf, t->name_off);
t = btf_type_by_id(btf, t->type);
if (!t || !btf_type_is_func_proto(t)) {
bpf_log(log, "Invalid BTF of func %s\n", func_name);
return -EFAULT;
}
args = (const struct btf_param *)(t + 1);
nargs = btf_type_vlen(t);
if (nargs > MAX_BPF_FUNC_REG_ARGS) {
bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs,
MAX_BPF_FUNC_REG_ARGS);
return -EINVAL;
}
/* check that BTF function arguments match actual types that the
* verifier sees.
*/
for (i = 0; i < nargs; i++) {
u32 regno = i + 1;
struct bpf_reg_state *reg = &regs[regno];
t = btf_type_skip_modifiers(btf, args[i].type, NULL);
if (btf_type_is_scalar(t)) {
if (reg->type == SCALAR_VALUE)
continue;
bpf_log(log, "R%d is not a scalar\n", regno);
return -EINVAL;
}
if (!btf_type_is_ptr(t)) {
bpf_log(log, "Unrecognized arg#%d type %s\n",
i, btf_type_str(t));
return -EINVAL;
}
ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
ref_tname = btf_name_by_offset(btf, ref_t->name_off);
if (btf_get_prog_ctx_type(log, btf, t,
env->prog->type, i)) {
/* If function expects ctx type in BTF check that caller
* is passing PTR_TO_CTX.
*/
if (reg->type != PTR_TO_CTX) {
bpf_log(log,
"arg#%d expected pointer to ctx, but got %s\n",
i, btf_type_str(t));
return -EINVAL;
}
if (check_ptr_off_reg(env, reg, regno))
return -EINVAL;
} else if (is_kfunc && (reg->type == PTR_TO_BTF_ID ||
(reg2btf_ids[base_type(reg->type)] && !type_flag(reg->type)))) {
const struct btf_type *reg_ref_t;
const struct btf *reg_btf;
const char *reg_ref_tname;
u32 reg_ref_id;
if (!btf_type_is_struct(ref_t)) {
bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n",
func_name, i, btf_type_str(ref_t),
ref_tname);
return -EINVAL;
}
if (reg->type == PTR_TO_BTF_ID) {
reg_btf = reg->btf;
reg_ref_id = reg->btf_id;
} else {
reg_btf = btf_vmlinux;
reg_ref_id = *reg2btf_ids[base_type(reg->type)];
}
reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id,
&reg_ref_id);
reg_ref_tname = btf_name_by_offset(reg_btf,
reg_ref_t->name_off);
if (!btf_struct_ids_match(log, reg_btf, reg_ref_id,
reg->off, btf, ref_id)) {
bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
func_name, i,
btf_type_str(ref_t), ref_tname,
regno, btf_type_str(reg_ref_t),
reg_ref_tname);
return -EINVAL;
}
} else if (ptr_to_mem_ok) {
const struct btf_type *resolve_ret;
u32 type_size;
if (is_kfunc) {
/* Permit pointer to mem, but only when argument
* type is pointer to scalar, or struct composed
* (recursively) of scalars.
*/
if (!btf_type_is_scalar(ref_t) &&
!__btf_type_is_scalar_struct(log, btf, ref_t, 0)) {
bpf_log(log,
"arg#%d pointer type %s %s must point to scalar or struct with scalar\n",
i, btf_type_str(ref_t), ref_tname);
return -EINVAL;
}
}
resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
if (IS_ERR(resolve_ret)) {
bpf_log(log,
"arg#%d reference type('%s %s') size cannot be determined: %ld\n",
i, btf_type_str(ref_t), ref_tname,
PTR_ERR(resolve_ret));
return -EINVAL;
}
if (check_mem_reg(env, reg, regno, type_size))
return -EINVAL;
} else {
bpf_log(log, "reg type unsupported for arg#%d %sfunction %s#%d\n", i,
is_kfunc ? "kernel " : "", func_name, func_id);
return -EINVAL;
}
}
return 0;
}
/* Compare BTF of a function with given bpf_reg_state.
* Returns:
* EFAULT - there is a verifier bug. Abort verification.
* EINVAL - there is a type mismatch or BTF is not available.
* 0 - BTF matches with what bpf_reg_state expects.
* Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
*/
int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
struct bpf_reg_state *regs)
{
struct bpf_prog *prog = env->prog;
struct btf *btf = prog->aux->btf;
bool is_global;
u32 btf_id;
int err;
if (!prog->aux->func_info)
return -EINVAL;
btf_id = prog->aux->func_info[subprog].type_id;
if (!btf_id)
return -EFAULT;
if (prog->aux->func_info_aux[subprog].unreliable)
return -EINVAL;
is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global);
/* Compiler optimizations can remove arguments from static functions
* or mismatched type can be passed into a global function.
* In such cases mark the function as unreliable from BTF point of view.
*/
if (err)
prog->aux->func_info_aux[subprog].unreliable = true;
return err;
}
int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
struct bpf_reg_state *regs)
{
return btf_check_func_arg_match(env, btf, func_id, regs, true);
}
/* Convert BTF of a function into bpf_reg_state if possible
* Returns:
* EFAULT - there is a verifier bug. Abort verification.
* EINVAL - cannot convert BTF.
* 0 - Successfully converted BTF into bpf_reg_state
* (either PTR_TO_CTX or SCALAR_VALUE).
*/
int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
struct bpf_reg_state *regs)
{
struct bpf_verifier_log *log = &env->log;
struct bpf_prog *prog = env->prog;
enum bpf_prog_type prog_type = prog->type;
struct btf *btf = prog->aux->btf;
const struct btf_param *args;
const struct btf_type *t, *ref_t;
u32 i, nargs, btf_id;
const char *tname;
if (!prog->aux->func_info ||
prog->aux->func_info_aux[subprog].linkage != BTF_FUNC_GLOBAL) {
bpf_log(log, "Verifier bug\n");
return -EFAULT;
}
btf_id = prog->aux->func_info[subprog].type_id;
if (!btf_id) {
bpf_log(log, "Global functions need valid BTF\n");
return -EFAULT;
}
t = btf_type_by_id(btf, btf_id);
if (!t || !btf_type_is_func(t)) {
/* These checks were already done by the verifier while loading
* struct bpf_func_info
*/
bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n",
subprog);
return -EFAULT;
}
tname = btf_name_by_offset(btf, t->name_off);
if (log->level & BPF_LOG_LEVEL)
bpf_log(log, "Validating %s() func#%d...\n",
tname, subprog);
if (prog->aux->func_info_aux[subprog].unreliable) {
bpf_log(log, "Verifier bug in function %s()\n", tname);
return -EFAULT;
}
if (prog_type == BPF_PROG_TYPE_EXT)
prog_type = prog->aux->dst_prog->type;
t = btf_type_by_id(btf, t->type);
if (!t || !btf_type_is_func_proto(t)) {
bpf_log(log, "Invalid type of function %s()\n", tname);
return -EFAULT;
}
args = (const struct btf_param *)(t + 1);
nargs = btf_type_vlen(t);
if (nargs > MAX_BPF_FUNC_REG_ARGS) {
bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n",
tname, nargs, MAX_BPF_FUNC_REG_ARGS);
return -EINVAL;
}
/* check that function returns int */
t = btf_type_by_id(btf, t->type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
if (!btf_type_is_int(t) && !btf_type_is_enum(t)) {
bpf_log(log,
"Global function %s() doesn't return scalar. Only those are supported.\n",
tname);
return -EINVAL;
}
/* Convert BTF function arguments into verifier types.
* Only PTR_TO_CTX and SCALAR are supported atm.
*/
for (i = 0; i < nargs; i++) {
struct bpf_reg_state *reg = &regs[i + 1];
t = btf_type_by_id(btf, args[i].type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
if (btf_type_is_int(t) || btf_type_is_enum(t)) {
reg->type = SCALAR_VALUE;
continue;
}
if (btf_type_is_ptr(t)) {
if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
reg->type = PTR_TO_CTX;
continue;
}
t = btf_type_skip_modifiers(btf, t->type, NULL);
ref_t = btf_resolve_size(btf, t, &reg->mem_size);
if (IS_ERR(ref_t)) {
bpf_log(log,
"arg#%d reference type('%s %s') size cannot be determined: %ld\n",
i, btf_type_str(t), btf_name_by_offset(btf, t->name_off),
PTR_ERR(ref_t));
return -EINVAL;
}
reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
reg->id = ++env->id_gen;
continue;
}
bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
i, btf_kind_str[BTF_INFO_KIND(t->info)], tname);
return -EINVAL;
}
return 0;
}
static void btf_type_show(const struct btf *btf, u32 type_id, void *obj,
struct btf_show *show)
{
const struct btf_type *t = btf_type_by_id(btf, type_id);
show->btf = btf;
memset(&show->state, 0, sizeof(show->state));
memset(&show->obj, 0, sizeof(show->obj));
btf_type_ops(t)->show(btf, t, type_id, obj, 0, show);
}
static void btf_seq_show(struct btf_show *show, const char *fmt,
va_list args)
{
seq_vprintf((struct seq_file *)show->target, fmt, args);
}
int btf_type_seq_show_flags(const struct btf *btf, u32 type_id,
void *obj, struct seq_file *m, u64 flags)
{
struct btf_show sseq;
sseq.target = m;
sseq.showfn = btf_seq_show;
sseq.flags = flags;
btf_type_show(btf, type_id, obj, &sseq);
return sseq.state.status;
}
void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
struct seq_file *m)
{
(void) btf_type_seq_show_flags(btf, type_id, obj, m,
BTF_SHOW_NONAME | BTF_SHOW_COMPACT |
BTF_SHOW_ZERO | BTF_SHOW_UNSAFE);
}
struct btf_show_snprintf {
struct btf_show show;
int len_left; /* space left in string */
int len; /* length we would have written */
};
static void btf_snprintf_show(struct btf_show *show, const char *fmt,
va_list args)
{
struct btf_show_snprintf *ssnprintf = (struct btf_show_snprintf *)show;
int len;
len = vsnprintf(show->target, ssnprintf->len_left, fmt, args);
if (len < 0) {
ssnprintf->len_left = 0;
ssnprintf->len = len;
} else if (len > ssnprintf->len_left) {
/* no space, drive on to get length we would have written */
ssnprintf->len_left = 0;
ssnprintf->len += len;
} else {
ssnprintf->len_left -= len;
ssnprintf->len += len;
show->target += len;
}
}
int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj,
char *buf, int len, u64 flags)
{
struct btf_show_snprintf ssnprintf;
ssnprintf.show.target = buf;
ssnprintf.show.flags = flags;
ssnprintf.show.showfn = btf_snprintf_show;
ssnprintf.len_left = len;
ssnprintf.len = 0;
btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf);
/* If we encontered an error, return it. */
if (ssnprintf.show.state.status)
return ssnprintf.show.state.status;
/* Otherwise return length we would have written */
return ssnprintf.len;
}
#ifdef CONFIG_PROC_FS
static void bpf_btf_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct btf *btf = filp->private_data;
seq_printf(m, "btf_id:\t%u\n", btf->id);
}
#endif
static int btf_release(struct inode *inode, struct file *filp)
{
btf_put(filp->private_data);
return 0;
}
const struct file_operations btf_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_btf_show_fdinfo,
#endif
.release = btf_release,
};
static int __btf_new_fd(struct btf *btf)
{
return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
}
int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr)
{
struct btf *btf;
int ret;
btf = btf_parse(make_bpfptr(attr->btf, uattr.is_kernel),
attr->btf_size, attr->btf_log_level,
u64_to_user_ptr(attr->btf_log_buf),
attr->btf_log_size);
if (IS_ERR(btf))
return PTR_ERR(btf);
ret = btf_alloc_id(btf);
if (ret) {
btf_free(btf);
return ret;
}
/*
* The BTF ID is published to the userspace.
* All BTF free must go through call_rcu() from
* now on (i.e. free by calling btf_put()).
*/
ret = __btf_new_fd(btf);
if (ret < 0)
btf_put(btf);
return ret;
}
struct btf *btf_get_by_fd(int fd)
{
struct btf *btf;
struct fd f;
f = fdget(fd);
if (!f.file)
return ERR_PTR(-EBADF);
if (f.file->f_op != &btf_fops) {
fdput(f);
return ERR_PTR(-EINVAL);
}
btf = f.file->private_data;
refcount_inc(&btf->refcnt);
fdput(f);
return btf;
}
int btf_get_info_by_fd(const struct btf *btf,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
struct bpf_btf_info __user *uinfo;
struct bpf_btf_info info;
u32 info_copy, btf_copy;
void __user *ubtf;
char __user *uname;
u32 uinfo_len, uname_len, name_len;
int ret = 0;
uinfo = u64_to_user_ptr(attr->info.info);
uinfo_len = attr->info.info_len;
info_copy = min_t(u32, uinfo_len, sizeof(info));
memset(&info, 0, sizeof(info));
if (copy_from_user(&info, uinfo, info_copy))
return -EFAULT;
info.id = btf->id;
ubtf = u64_to_user_ptr(info.btf);
btf_copy = min_t(u32, btf->data_size, info.btf_size);
if (copy_to_user(ubtf, btf->data, btf_copy))
return -EFAULT;
info.btf_size = btf->data_size;
info.kernel_btf = btf->kernel_btf;
uname = u64_to_user_ptr(info.name);
uname_len = info.name_len;
if (!uname ^ !uname_len)
return -EINVAL;
name_len = strlen(btf->name);
info.name_len = name_len;
if (uname) {
if (uname_len >= name_len + 1) {
if (copy_to_user(uname, btf->name, name_len + 1))
return -EFAULT;
} else {
char zero = '\0';
if (copy_to_user(uname, btf->name, uname_len - 1))
return -EFAULT;
if (put_user(zero, uname + uname_len - 1))
return -EFAULT;
/* let user-space know about too short buffer */
ret = -ENOSPC;
}
}
if (copy_to_user(uinfo, &info, info_copy) ||
put_user(info_copy, &uattr->info.info_len))
return -EFAULT;
return ret;
}
int btf_get_fd_by_id(u32 id)
{
struct btf *btf;
int fd;
rcu_read_lock();
btf = idr_find(&btf_idr, id);
if (!btf || !refcount_inc_not_zero(&btf->refcnt))
btf = ERR_PTR(-ENOENT);
rcu_read_unlock();
if (IS_ERR(btf))
return PTR_ERR(btf);
fd = __btf_new_fd(btf);
if (fd < 0)
btf_put(btf);
return fd;
}
u32 btf_obj_id(const struct btf *btf)
{
return btf->id;
}
bool btf_is_kernel(const struct btf *btf)
{
return btf->kernel_btf;
}
bool btf_is_module(const struct btf *btf)
{
return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0;
}
static int btf_id_cmp_func(const void *a, const void *b)
{
const int *pa = a, *pb = b;
return *pa - *pb;
}
bool btf_id_set_contains(const struct btf_id_set *set, u32 id)
{
return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL;
}
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
struct btf_module {
struct list_head list;
struct module *module;
struct btf *btf;
struct bin_attribute *sysfs_attr;
};
static LIST_HEAD(btf_modules);
static DEFINE_MUTEX(btf_module_mutex);
static ssize_t
btf_module_read(struct file *file, struct kobject *kobj,
struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t len)
{
const struct btf *btf = bin_attr->private;
memcpy(buf, btf->data + off, len);
return len;
}
static void purge_cand_cache(struct btf *btf);
static int btf_module_notify(struct notifier_block *nb, unsigned long op,
void *module)
{
struct btf_module *btf_mod, *tmp;
struct module *mod = module;
struct btf *btf;
int err = 0;
if (mod->btf_data_size == 0 ||
(op != MODULE_STATE_COMING && op != MODULE_STATE_GOING))
goto out;
switch (op) {
case MODULE_STATE_COMING:
btf_mod = kzalloc(sizeof(*btf_mod), GFP_KERNEL);
if (!btf_mod) {
err = -ENOMEM;
goto out;
}
btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size);
if (IS_ERR(btf)) {
pr_warn("failed to validate module [%s] BTF: %ld\n",
mod->name, PTR_ERR(btf));
kfree(btf_mod);
err = PTR_ERR(btf);
goto out;
}
err = btf_alloc_id(btf);
if (err) {
btf_free(btf);
kfree(btf_mod);
goto out;
}
purge_cand_cache(NULL);
mutex_lock(&btf_module_mutex);
btf_mod->module = module;
btf_mod->btf = btf;
list_add(&btf_mod->list, &btf_modules);
mutex_unlock(&btf_module_mutex);
if (IS_ENABLED(CONFIG_SYSFS)) {
struct bin_attribute *attr;
attr = kzalloc(sizeof(*attr), GFP_KERNEL);
if (!attr)
goto out;
sysfs_bin_attr_init(attr);
attr->attr.name = btf->name;
attr->attr.mode = 0444;
attr->size = btf->data_size;
attr->private = btf;
attr->read = btf_module_read;
err = sysfs_create_bin_file(btf_kobj, attr);
if (err) {
pr_warn("failed to register module [%s] BTF in sysfs: %d\n",
mod->name, err);
kfree(attr);
err = 0;
goto out;
}
btf_mod->sysfs_attr = attr;
}
break;
case MODULE_STATE_GOING:
mutex_lock(&btf_module_mutex);
list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
if (btf_mod->module != module)
continue;
list_del(&btf_mod->list);
if (btf_mod->sysfs_attr)
sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr);
purge_cand_cache(btf_mod->btf);
btf_put(btf_mod->btf);
kfree(btf_mod->sysfs_attr);
kfree(btf_mod);
break;
}
mutex_unlock(&btf_module_mutex);
break;
}
out:
return notifier_from_errno(err);
}
static struct notifier_block btf_module_nb = {
.notifier_call = btf_module_notify,
};
static int __init btf_module_init(void)
{
register_module_notifier(&btf_module_nb);
return 0;
}
fs_initcall(btf_module_init);
#endif /* CONFIG_DEBUG_INFO_BTF_MODULES */
struct module *btf_try_get_module(const struct btf *btf)
{
struct module *res = NULL;
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
struct btf_module *btf_mod, *tmp;
mutex_lock(&btf_module_mutex);
list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
if (btf_mod->btf != btf)
continue;
if (try_module_get(btf_mod->module))
res = btf_mod->module;
break;
}
mutex_unlock(&btf_module_mutex);
#endif
return res;
}
BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags)
{
struct btf *btf;
long ret;
if (flags)
return -EINVAL;
if (name_sz <= 1 || name[name_sz - 1])
return -EINVAL;
btf = bpf_get_btf_vmlinux();
if (IS_ERR(btf))
return PTR_ERR(btf);
ret = btf_find_by_name_kind(btf, name, kind);
/* ret is never zero, since btf_find_by_name_kind returns
* positive btf_id or negative error.
*/
if (ret < 0) {
struct btf *mod_btf;
int id;
/* If name is not found in vmlinux's BTF then search in module's BTFs */
spin_lock_bh(&btf_idr_lock);
idr_for_each_entry(&btf_idr, mod_btf, id) {
if (!btf_is_module(mod_btf))
continue;
/* linear search could be slow hence unlock/lock
* the IDR to avoiding holding it for too long
*/
btf_get(mod_btf);
spin_unlock_bh(&btf_idr_lock);
ret = btf_find_by_name_kind(mod_btf, name, kind);
if (ret > 0) {
int btf_obj_fd;
btf_obj_fd = __btf_new_fd(mod_btf);
if (btf_obj_fd < 0) {
btf_put(mod_btf);
return btf_obj_fd;
}
return ret | (((u64)btf_obj_fd) << 32);
}
spin_lock_bh(&btf_idr_lock);
btf_put(mod_btf);
}
spin_unlock_bh(&btf_idr_lock);
}
return ret;
}
const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
.func = bpf_btf_find_by_name_kind,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_ANYTHING,
};
BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
#define BTF_TRACING_TYPE(name, type) BTF_ID(struct, type)
BTF_TRACING_TYPE_xxx
#undef BTF_TRACING_TYPE
/* BTF ID set registration API for modules */
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
struct kfunc_btf_id_set *s)
{
mutex_lock(&l->mutex);
list_add(&s->list, &l->list);
mutex_unlock(&l->mutex);
}
EXPORT_SYMBOL_GPL(register_kfunc_btf_id_set);
void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
struct kfunc_btf_id_set *s)
{
mutex_lock(&l->mutex);
list_del_init(&s->list);
mutex_unlock(&l->mutex);
}
EXPORT_SYMBOL_GPL(unregister_kfunc_btf_id_set);
bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id,
struct module *owner)
{
struct kfunc_btf_id_set *s;
mutex_lock(&klist->mutex);
list_for_each_entry(s, &klist->list, list) {
if (s->owner == owner && btf_id_set_contains(s->set, kfunc_id)) {
mutex_unlock(&klist->mutex);
return true;
}
}
mutex_unlock(&klist->mutex);
return false;
}
#define DEFINE_KFUNC_BTF_ID_LIST(name) \
struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list), \
__MUTEX_INITIALIZER(name.mutex) }; \
EXPORT_SYMBOL_GPL(name)
DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list);
DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list);
#endif
int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
const struct btf *targ_btf, __u32 targ_id)
{
return -EOPNOTSUPP;
}
static bool bpf_core_is_flavor_sep(const char *s)
{
/* check X___Y name pattern, where X and Y are not underscores */
return s[0] != '_' && /* X */
s[1] == '_' && s[2] == '_' && s[3] == '_' && /* ___ */
s[4] != '_'; /* Y */
}
size_t bpf_core_essential_name_len(const char *name)
{
size_t n = strlen(name);
int i;
for (i = n - 5; i >= 0; i--) {
if (bpf_core_is_flavor_sep(name + i))
return i + 1;
}
return n;
}
struct bpf_cand_cache {
const char *name;
u32 name_len;
u16 kind;
u16 cnt;
struct {
const struct btf *btf;
u32 id;
} cands[];
};
static void bpf_free_cands(struct bpf_cand_cache *cands)
{
if (!cands->cnt)
/* empty candidate array was allocated on stack */
return;
kfree(cands);
}
static void bpf_free_cands_from_cache(struct bpf_cand_cache *cands)
{
kfree(cands->name);
kfree(cands);
}
#define VMLINUX_CAND_CACHE_SIZE 31
static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE];
#define MODULE_CAND_CACHE_SIZE 31
static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE];
static DEFINE_MUTEX(cand_cache_mutex);
static void __print_cand_cache(struct bpf_verifier_log *log,
struct bpf_cand_cache **cache,
int cache_size)
{
struct bpf_cand_cache *cc;
int i, j;
for (i = 0; i < cache_size; i++) {
cc = cache[i];
if (!cc)
continue;
bpf_log(log, "[%d]%s(", i, cc->name);
for (j = 0; j < cc->cnt; j++) {
bpf_log(log, "%d", cc->cands[j].id);
if (j < cc->cnt - 1)
bpf_log(log, " ");
}
bpf_log(log, "), ");
}
}
static void print_cand_cache(struct bpf_verifier_log *log)
{
mutex_lock(&cand_cache_mutex);
bpf_log(log, "vmlinux_cand_cache:");
__print_cand_cache(log, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
bpf_log(log, "\nmodule_cand_cache:");
__print_cand_cache(log, module_cand_cache, MODULE_CAND_CACHE_SIZE);
bpf_log(log, "\n");
mutex_unlock(&cand_cache_mutex);
}
static u32 hash_cands(struct bpf_cand_cache *cands)
{
return jhash(cands->name, cands->name_len, 0);
}
static struct bpf_cand_cache *check_cand_cache(struct bpf_cand_cache *cands,
struct bpf_cand_cache **cache,
int cache_size)
{
struct bpf_cand_cache *cc = cache[hash_cands(cands) % cache_size];
if (cc && cc->name_len == cands->name_len &&
!strncmp(cc->name, cands->name, cands->name_len))
return cc;
return NULL;
}
static size_t sizeof_cands(int cnt)
{
return offsetof(struct bpf_cand_cache, cands[cnt]);
}
static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands,
struct bpf_cand_cache **cache,
int cache_size)
{
struct bpf_cand_cache **cc = &cache[hash_cands(cands) % cache_size], *new_cands;
if (*cc) {
bpf_free_cands_from_cache(*cc);
*cc = NULL;
}
new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL);
if (!new_cands) {
bpf_free_cands(cands);
return ERR_PTR(-ENOMEM);
}
/* strdup the name, since it will stay in cache.
* the cands->name points to strings in prog's BTF and the prog can be unloaded.
*/
new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL);
bpf_free_cands(cands);
if (!new_cands->name) {
kfree(new_cands);
return ERR_PTR(-ENOMEM);
}
*cc = new_cands;
return new_cands;
}
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
static void __purge_cand_cache(struct btf *btf, struct bpf_cand_cache **cache,
int cache_size)
{
struct bpf_cand_cache *cc;
int i, j;
for (i = 0; i < cache_size; i++) {
cc = cache[i];
if (!cc)
continue;
if (!btf) {
/* when new module is loaded purge all of module_cand_cache,
* since new module might have candidates with the name
* that matches cached cands.
*/
bpf_free_cands_from_cache(cc);
cache[i] = NULL;
continue;
}
/* when module is unloaded purge cache entries
* that match module's btf
*/
for (j = 0; j < cc->cnt; j++)
if (cc->cands[j].btf == btf) {
bpf_free_cands_from_cache(cc);
cache[i] = NULL;
break;
}
}
}
static void purge_cand_cache(struct btf *btf)
{
mutex_lock(&cand_cache_mutex);
__purge_cand_cache(btf, module_cand_cache, MODULE_CAND_CACHE_SIZE);
mutex_unlock(&cand_cache_mutex);
}
#endif
static struct bpf_cand_cache *
bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf,
int targ_start_id)
{
struct bpf_cand_cache *new_cands;
const struct btf_type *t;
const char *targ_name;
size_t targ_essent_len;
int n, i;
n = btf_nr_types(targ_btf);
for (i = targ_start_id; i < n; i++) {
t = btf_type_by_id(targ_btf, i);
if (btf_kind(t) != cands->kind)
continue;
targ_name = btf_name_by_offset(targ_btf, t->name_off);
if (!targ_name)
continue;
/* the resched point is before strncmp to make sure that search
* for non-existing name will have a chance to schedule().
*/
cond_resched();
if (strncmp(cands->name, targ_name, cands->name_len) != 0)
continue;
targ_essent_len = bpf_core_essential_name_len(targ_name);
if (targ_essent_len != cands->name_len)
continue;
/* most of the time there is only one candidate for a given kind+name pair */
new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL);
if (!new_cands) {
bpf_free_cands(cands);
return ERR_PTR(-ENOMEM);
}
memcpy(new_cands, cands, sizeof_cands(cands->cnt));
bpf_free_cands(cands);
cands = new_cands;
cands->cands[cands->cnt].btf = targ_btf;
cands->cands[cands->cnt].id = i;
cands->cnt++;
}
return cands;
}
static struct bpf_cand_cache *
bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
{
struct bpf_cand_cache *cands, *cc, local_cand = {};
const struct btf *local_btf = ctx->btf;
const struct btf_type *local_type;
const struct btf *main_btf;
size_t local_essent_len;
struct btf *mod_btf;
const char *name;
int id;
main_btf = bpf_get_btf_vmlinux();
if (IS_ERR(main_btf))
return ERR_CAST(main_btf);
local_type = btf_type_by_id(local_btf, local_type_id);
if (!local_type)
return ERR_PTR(-EINVAL);
name = btf_name_by_offset(local_btf, local_type->name_off);
if (str_is_empty(name))
return ERR_PTR(-EINVAL);
local_essent_len = bpf_core_essential_name_len(name);
cands = &local_cand;
cands->name = name;
cands->kind = btf_kind(local_type);
cands->name_len = local_essent_len;
cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
/* cands is a pointer to stack here */
if (cc) {
if (cc->cnt)
return cc;
goto check_modules;
}
/* Attempt to find target candidates in vmlinux BTF first */
cands = bpf_core_add_cands(cands, main_btf, 1);
if (IS_ERR(cands))
return ERR_CAST(cands);
/* cands is a pointer to kmalloced memory here if cands->cnt > 0 */
/* populate cache even when cands->cnt == 0 */
cc = populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
if (IS_ERR(cc))
return ERR_CAST(cc);
/* if vmlinux BTF has any candidate, don't go for module BTFs */
if (cc->cnt)
return cc;
check_modules:
/* cands is a pointer to stack here and cands->cnt == 0 */
cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
if (cc)
/* if cache has it return it even if cc->cnt == 0 */
return cc;
/* If candidate is not found in vmlinux's BTF then search in module's BTFs */
spin_lock_bh(&btf_idr_lock);
idr_for_each_entry(&btf_idr, mod_btf, id) {
if (!btf_is_module(mod_btf))
continue;
/* linear search could be slow hence unlock/lock
* the IDR to avoiding holding it for too long
*/
btf_get(mod_btf);
spin_unlock_bh(&btf_idr_lock);
cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
if (IS_ERR(cands)) {
btf_put(mod_btf);
return ERR_CAST(cands);
}
spin_lock_bh(&btf_idr_lock);
btf_put(mod_btf);
}
spin_unlock_bh(&btf_idr_lock);
/* cands is a pointer to kmalloced memory here if cands->cnt > 0
* or pointer to stack if cands->cnd == 0.
* Copy it into the cache even when cands->cnt == 0 and
* return the result.
*/
return populate_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
}
int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
int relo_idx, void *insn)
{
bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL;
struct bpf_core_cand_list cands = {};
struct bpf_core_spec *specs;
int err;
/* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5"
* into arrays of btf_ids of struct fields and array indices.
*/
specs = kcalloc(3, sizeof(*specs), GFP_KERNEL);
if (!specs)
return -ENOMEM;
if (need_cands) {
struct bpf_cand_cache *cc;
int i;
mutex_lock(&cand_cache_mutex);
cc = bpf_core_find_cands(ctx, relo->type_id);
if (IS_ERR(cc)) {
bpf_log(ctx->log, "target candidate search failed for %d\n",
relo->type_id);
err = PTR_ERR(cc);
goto out;
}
if (cc->cnt) {
cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL);
if (!cands.cands) {
err = -ENOMEM;
goto out;
}
}
for (i = 0; i < cc->cnt; i++) {
bpf_log(ctx->log,
"CO-RE relocating %s %s: found target candidate [%d]\n",
btf_kind_str[cc->kind], cc->name, cc->cands[i].id);
cands.cands[i].btf = cc->cands[i].btf;
cands.cands[i].id = cc->cands[i].id;
}
cands.len = cc->cnt;
/* cand_cache_mutex needs to span the cache lookup and
* copy of btf pointer into bpf_core_cand_list,
* since module can be unloaded while bpf_core_apply_relo_insn
* is working with module's btf.
*/
}
err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8,
relo, relo_idx, ctx->btf, &cands, specs);
out:
kfree(specs);
if (need_cands) {
kfree(cands.cands);
mutex_unlock(&cand_cache_mutex);
if (ctx->log->level & BPF_LOG_LEVEL2)
print_cand_cache(ctx->log);
}
return err;
}
// SPDX-License-Identifier: GPL-2.0
/*
* Performance events callchain code, extracted from core.c:
*
* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
*/
#include <linux/perf_event.h>
#include <linux/slab.h>
#include <linux/sched/task_stack.h>
#include "internal.h"
struct callchain_cpus_entries {
struct rcu_head rcu_head;
struct perf_callchain_entry *cpu_entries[];
};
int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK;
static inline size_t perf_callchain_entry__sizeof(void)
{
return (sizeof(struct perf_callchain_entry) +
sizeof(__u64) * (sysctl_perf_event_max_stack +
sysctl_perf_event_max_contexts_per_stack));
}
static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
static atomic_t nr_callchain_events;
static DEFINE_MUTEX(callchain_mutex);
static struct callchain_cpus_entries *callchain_cpus_entries;
__weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
struct pt_regs *regs)
{
}
__weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
struct pt_regs *regs)
{
}
static void release_callchain_buffers_rcu(struct rcu_head *head)
{
struct callchain_cpus_entries *entries;
int cpu;
entries = container_of(head, struct callchain_cpus_entries, rcu_head);
for_each_possible_cpu(cpu)
kfree(entries->cpu_entries[cpu]);
kfree(entries);
}
static void release_callchain_buffers(void)
{
struct callchain_cpus_entries *entries;
entries = callchain_cpus_entries;
RCU_INIT_POINTER(callchain_cpus_entries, NULL);
call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
}
static int alloc_callchain_buffers(void)
{
int cpu;
int size;
struct callchain_cpus_entries *entries;
/*
* We can't use the percpu allocation API for data that can be
* accessed from NMI. Use a temporary manual per cpu allocation
* until that gets sorted out.
*/
size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
entries = kzalloc(size, GFP_KERNEL);
if (!entries)
return -ENOMEM;
size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
for_each_possible_cpu(cpu) {
entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
cpu_to_node(cpu));
if (!entries->cpu_entries[cpu])
goto fail;
}
rcu_assign_pointer(callchain_cpus_entries, entries);
return 0;
fail:
for_each_possible_cpu(cpu)
kfree(entries->cpu_entries[cpu]);
kfree(entries);
return -ENOMEM;
}
int get_callchain_buffers(int event_max_stack)
{
int err = 0;
int count;
mutex_lock(&callchain_mutex);
count = atomic_inc_return(&nr_callchain_events);
if (WARN_ON_ONCE(count < 1)) {
err = -EINVAL;
goto exit;
}
/*
* If requesting per event more than the global cap,
* return a different error to help userspace figure
* this out.
*
* And also do it here so that we have &callchain_mutex held.
*/
if (event_max_stack > sysctl_perf_event_max_stack) {
err = -EOVERFLOW;
goto exit;
}
if (count == 1)
err = alloc_callchain_buffers();
exit:
if (err)
atomic_dec(&nr_callchain_events);
mutex_unlock(&callchain_mutex);
return err;
}
void put_callchain_buffers(void)
{
if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
release_callchain_buffers();
mutex_unlock(&callchain_mutex);
}
}
struct perf_callchain_entry *get_callchain_entry(int *rctx)
{
int cpu;
struct callchain_cpus_entries *entries;
*rctx = get_recursion_context(this_cpu_ptr(callchain_recursion));
if (*rctx == -1)
return NULL;
entries = rcu_dereference(callchain_cpus_entries);
if (!entries) {
put_recursion_context(this_cpu_ptr(callchain_recursion), *rctx);
return NULL;
}
cpu = smp_processor_id();
return (((void *)entries->cpu_entries[cpu]) +
(*rctx * perf_callchain_entry__sizeof()));
}
void
put_callchain_entry(int rctx)
{
put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
}
struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
u32 max_stack, bool crosstask, bool add_mark)
{
struct perf_callchain_entry *entry;
struct perf_callchain_entry_ctx ctx;
int rctx;
entry = get_callchain_entry(&rctx);
if (!entry)
return NULL;
ctx.entry = entry;
ctx.max_stack = max_stack;
ctx.nr = entry->nr = init_nr;
ctx.contexts = 0;
ctx.contexts_maxed = false;
if (kernel && !user_mode(regs)) {
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL);
perf_callchain_kernel(&ctx, regs);
}
if (user) {
if (!user_mode(regs)) {
if (current->mm)
regs = task_pt_regs(current);
else
regs = NULL;
}
if (regs) {
mm_segment_t fs;
if (crosstask)
goto exit_put;
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
fs = force_uaccess_begin();
perf_callchain_user(&ctx, regs);
force_uaccess_end(fs);
}
}
exit_put:
put_callchain_entry(rctx);
return entry;
}
/*
* Used for sysctl_perf_event_max_stack and
* sysctl_perf_event_max_contexts_per_stack.
*/
int perf_event_max_stack_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int *value = table->data;
int new_value = *value, ret;
struct ctl_table new_table = *table;
new_table.data = &new_value;
ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
mutex_lock(&callchain_mutex);
if (atomic_read(&nr_callchain_events))
ret = -EBUSY;
else
*value = new_value;
mutex_unlock(&callchain_mutex);
return ret;
}
// SPDX-License-Identifier: GPL-2.0
/*
* linux/kernel/capability.c
*
* Copyright (C) 1997 Andrew Main <zefram@fysh.org>
*
* Integrated into 2.1.97+, Andrew G. Morgan <morgan@kernel.org>
* 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/uaccess.h>
/*
* Leveraged for setting/resetting capabilities
*/
const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
EXPORT_SYMBOL(__cap_empty_set);
int file_caps_enabled = 1;
static int __init file_caps_disable(char *str)
{
file_caps_enabled = 0;
return 1;
}
__setup("no_file_caps", file_caps_disable);
#ifdef CONFIG_MULTIUSER
/*
* More recent versions of libcap are available from:
*
* http://www.kernel.org/pub/linux/libs/security/linux-privs/
*/
static void warn_legacy_capability_use(void)
{
char name[sizeof(current->comm)];
pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
get_task_comm(name, current));
}
/*
* Version 2 capabilities worked fine, but the linux/capability.h file
* that accompanied their introduction encouraged their use without
* the necessary user-space source code changes. As such, we have
* created a version 3 with equivalent functionality to version 2, but
* with a header change to protect legacy source code from using
* version 2 when it wanted to use version 1. If your system has code
* that trips the following warning, it is using version 2 specific
* capabilities and may be doing so insecurely.
*
* The remedy is to either upgrade your version of libcap (to 2.10+,
* if the application is linked against it), or recompile your
* application with modern kernel headers and this warning will go
* away.
*/
static void warn_deprecated_v2(void)
{
char name[sizeof(current->comm)];
pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n",
get_task_comm(name, current));
}
/*
* Version check. Return the number of u32s in each capability flag
* array, or a negative value on error.
*/
static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
{
__u32 version;
if (get_user(version, &header->version))
return -EFAULT;
switch (version) {
case _LINUX_CAPABILITY_VERSION_1:
warn_legacy_capability_use();
*tocopy = _LINUX_CAPABILITY_U32S_1;
break;
case _LINUX_CAPABILITY_VERSION_2:
warn_deprecated_v2();
fallthrough; /* v3 is otherwise equivalent to v2 */
case _LINUX_CAPABILITY_VERSION_3:
*tocopy = _LINUX_CAPABILITY_U32S_3;
break;
default:
if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version))
return -EFAULT;
return -EINVAL;
}
return 0;
}
/*
* The only thing that can change the capabilities of the current
* process is the current process. As such, we can't be in this code
* at the same time as we are in the process of setting capabilities
* in this process. The net result is that we can limit our use of
* locks to when we are reading the caps of another process.
*/
static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
kernel_cap_t *pIp, kernel_cap_t *pPp)
{
int ret;
if (pid && (pid != task_pid_vnr(current))) {
struct task_struct *target;
rcu_read_lock();
target = find_task_by_vpid(pid);
if (!target)
ret = -ESRCH;
else
ret = security_capget(target, pEp, pIp, pPp);
rcu_read_unlock();
} else
ret = security_capget(current, pEp, pIp, pPp);
return ret;
}
/**
* sys_capget - get the capabilities of a given process.
* @header: pointer to struct that contains capability version and
* target pid data
* @dataptr: pointer to struct that contains the effective, permitted,
* and inheritable capabilities that are returned
*
* Returns 0 on success and < 0 on error.
*/
SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
{
int ret = 0;
pid_t pid;
unsigned tocopy;
kernel_cap_t pE, pI, pP;
ret = cap_validate_magic(header, &tocopy);
if ((dataptr == NULL) || (ret != 0))
return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret;
if (get_user(pid, &header->pid))
return -EFAULT;
if (pid < 0)
return -EINVAL;
ret = cap_get_target_pid(pid, &pE, &pI, &pP);
if (!ret) {
struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
unsigned i;
for (i = 0; i < tocopy; i++) {
kdata[i].effective = pE.cap[i];
kdata[i].permitted = pP.cap[i];
kdata[i].inheritable = pI.cap[i];
}
/*
* Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
* we silently drop the upper capabilities here. This
* has the effect of making older libcap
* implementations implicitly drop upper capability
* bits when they perform a: capget/modify/capset
* sequence.
*
* This behavior is considered fail-safe
* behavior. Upgrading the application to a newer
* version of libcap will enable access to the newer
* capabilities.
*
* An alternative would be to return an error here
* (-ERANGE), but that causes legacy applications to
* unexpectedly fail; the capget/modify/capset aborts
* before modification is attempted and the application
* fails.
*/
if (copy_to_user(dataptr, kdata, tocopy
* sizeof(struct __user_cap_data_struct))) {
return -EFAULT;
}
}
return ret;
}
/**
* sys_capset - set capabilities for a process or (*) a group of processes
* @header: pointer to struct that contains capability version and
* target pid data
* @data: pointer to struct that contains the effective, permitted,
* and inheritable capabilities
*
* Set capabilities for the current process only. The ability to any other
* process(es) has been deprecated and removed.
*
* The restrictions on setting capabilities are specified as:
*
* I: any raised capabilities must be a subset of the old permitted
* P: any raised capabilities must be a subset of the old permitted
* E: must be set to a subset of new permitted
*
* Returns 0 on success and < 0 on error.
*/
SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
{
struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
unsigned i, tocopy, copybytes;
kernel_cap_t inheritable, permitted, effective;
struct cred *new;
int ret;
pid_t pid;
ret = cap_validate_magic(header, &tocopy);
if (ret != 0)
return ret;
if (get_user(pid, &header->pid))
return -EFAULT;
/* may only affect current now */
if (pid != 0 && pid != task_pid_vnr(current))
return -EPERM;
copybytes = tocopy * sizeof(struct __user_cap_data_struct);
if (copybytes > sizeof(kdata))
return -EFAULT;
if (copy_from_user(&kdata, data, copybytes))
return -EFAULT;
for (i = 0; i < tocopy; i++) {
effective.cap[i] = kdata[i].effective;
permitted.cap[i] = kdata[i].permitted;
inheritable.cap[i] = kdata[i].inheritable;
}
while (i < _KERNEL_CAPABILITY_U32S) {
effective.cap[i] = 0;
permitted.cap[i] = 0;
inheritable.cap[i] = 0;
i++;
}
effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
new = prepare_creds();
if (!new)
return -ENOMEM;
ret = security_capset(new, current_cred(),
&effective, &inheritable, &permitted);
if (ret < 0)
goto error;
audit_log_capset(new, current_cred());
return commit_creds(new);
error:
abort_creds(new);
return ret;
}
/**
* has_ns_capability - Does a task have a capability in a specific user ns
* @t: The task in question
* @ns: target user namespace
* @cap: The capability to be tested for
*
* Return true if the specified task has the given superior capability
* currently in effect to the specified user namespace, false if not.
*
* Note that this does not set PF_SUPERPRIV on the task.
*/
bool has_ns_capability(struct task_struct *t,
struct user_namespace *ns, int cap)
{
int ret;
rcu_read_lock();
ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE);
rcu_read_unlock();
return (ret == 0);
}
/**
* has_capability - Does a task have a capability in init_user_ns
* @t: The task in question
* @cap: The capability to be tested for
*
* Return true if the specified task has the given superior capability
* currently in effect to the initial user namespace, false if not.
*
* Note that this does not set PF_SUPERPRIV on the task.
*/
bool has_capability(struct task_struct *t, int cap)
{
return has_ns_capability(t, &init_user_ns, cap);
}
EXPORT_SYMBOL(has_capability);
/**
* has_ns_capability_noaudit - Does a task have a capability (unaudited)
* in a specific user ns.
* @t: The task in question
* @ns: target user namespace
* @cap: The capability to be tested for
*
* Return true if the specified task has the given superior capability
* currently in effect to the specified user namespace, false if not.
* Do not write an audit message for the check.
*
* Note that this does not set PF_SUPERPRIV on the task.
*/
bool has_ns_capability_noaudit(struct task_struct *t,
struct user_namespace *ns, int cap)
{
int ret;
rcu_read_lock();
ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT);
rcu_read_unlock();
return (ret == 0);
}
/**
* has_capability_noaudit - Does a task have a capability (unaudited) in the
* initial user ns
* @t: The task in question
* @cap: The capability to be tested for
*
* Return true if the specified task has the given superior capability
* currently in effect to init_user_ns, false if not. Don't write an
* audit message for the check.
*
* Note that this does not set PF_SUPERPRIV on the task.
*/
bool has_capability_noaudit(struct task_struct *t, int cap)
{
return has_ns_capability_noaudit(t, &init_user_ns, cap);
}
static bool ns_capable_common(struct user_namespace *ns,
int cap,
unsigned int opts)
{
int capable;
if (unlikely(!cap_valid(cap))) {
pr_crit("capable() called with invalid cap=%u\n", cap);
BUG();
}
capable = security_capable(current_cred(), ns, cap, opts);
if (capable == 0) {
current->flags |= PF_SUPERPRIV;
return true;
}
return false;
}
/**
* ns_capable - Determine if the current task has a superior capability in effect
* @ns: The usernamespace we want the capability in
* @cap: The capability to be tested for
*
* Return true if the current task has the given superior capability currently
* available for use, false if not.
*
* This sets PF_SUPERPRIV on the task if the capability is available on the
* assumption that it's about to be used.
*/
bool ns_capable(struct user_namespace *ns, int cap)
{
return ns_capable_common(ns, cap, CAP_OPT_NONE);
}
EXPORT_SYMBOL(ns_capable);
/**
* ns_capable_noaudit - Determine if the current task has a superior capability
* (unaudited) in effect
* @ns: The usernamespace we want the capability in
* @cap: The capability to be tested for
*
* Return true if the current task has the given superior capability currently
* available for use, false if not.
*
* This sets PF_SUPERPRIV on the task if the capability is available on the
* assumption that it's about to be used.
*/
bool ns_capable_noaudit(struct user_namespace *ns, int cap)
{
return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT);
}
EXPORT_SYMBOL(ns_capable_noaudit);
/**
* ns_capable_setid - Determine if the current task has a superior capability
* in effect, while signalling that this check is being done from within a
* setid or setgroups syscall.
* @ns: The usernamespace we want the capability in
* @cap: The capability to be tested for
*
* Return true if the current task has the given superior capability currently
* available for use, false if not.
*
* This sets PF_SUPERPRIV on the task if the capability is available on the
* assumption that it's about to be used.
*/
bool ns_capable_setid(struct user_namespace *ns, int cap)
{
return ns_capable_common(ns, cap, CAP_OPT_INSETID);
}
EXPORT_SYMBOL(ns_capable_setid);
/**
* capable - Determine if the current task has a superior capability in effect
* @cap: The capability to be tested for
*
* Return true if the current task has the given superior capability currently
* available for use, false if not.
*
* This sets PF_SUPERPRIV on the task if the capability is available on the
* assumption that it's about to be used.
*/
bool capable(int cap)
{
return ns_capable(&init_user_ns, cap);
}
EXPORT_SYMBOL(capable);
#endif /* CONFIG_MULTIUSER */
/**
* file_ns_capable - Determine if the file's opener had a capability in effect
* @file: The file we want to check
* @ns: The usernamespace we want the capability in
* @cap: The capability to be tested for
*
* Return true if task that opened the file had a capability in effect
* when the file was opened.
*
* This does not set PF_SUPERPRIV because the caller may not
* actually be privileged.
*/
bool file_ns_capable(const struct file *file, struct user_namespace *ns,
int cap)
{
if (WARN_ON_ONCE(!cap_valid(cap)))
return false;
if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0)
return true;
return false;
}
EXPORT_SYMBOL(file_ns_capable);
/**
* privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode?
* @ns: The user namespace in question
* @inode: The inode in question
*
* Return true if the inode uid and gid are within the namespace.
*/
bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
struct user_namespace *mnt_userns,
const struct inode *inode)
{
return kuid_has_mapping(ns, i_uid_into_mnt(mnt_userns, inode)) &&
kgid_has_mapping(ns, i_gid_into_mnt(mnt_userns, inode));
}
/**
* capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
* @inode: The inode in question
* @cap: The capability in question
*
* Return true if the current task has the given capability targeted at
* its own user namespace and that the given inode's uid and gid are
* mapped into the current user namespace.
*/
bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns,
const struct inode *inode, int cap)
{
struct user_namespace *ns = current_user_ns();
return ns_capable(ns, cap) &&
privileged_wrt_inode_uidgid(ns, mnt_userns, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);
/**
* ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace
* @tsk: The task that may be ptraced
* @ns: The user namespace to search for CAP_SYS_PTRACE in
*
* Return true if the task that is ptracing the current task had CAP_SYS_PTRACE
* in the specified user namespace.
*/
bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
{
int ret = 0; /* An absent tracer adds no restrictions */
const struct cred *cred;
rcu_read_lock();
cred = rcu_dereference(tsk->ptracer_cred);
if (cred)
ret = security_capable(cred, ns, CAP_SYS_PTRACE,
CAP_OPT_NOAUDIT);
rcu_read_unlock();
return (ret == 0);
}
// SPDX-License-Identifier: GPL-2.0
/*
* Clang Control Flow Integrity (CFI) error and slowpath handling.
*
* Copyright (C) 2021 Google LLC
*/
#include <linux/hardirq.h>
#include <linux/kallsyms.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/printk.h>
#include <linux/ratelimit.h>
#include <linux/rcupdate.h>
#include <linux/vmalloc.h>
#include <asm/cacheflush.h>
#include <asm/set_memory.h>
/* Compiler-defined handler names */
#ifdef CONFIG_CFI_PERMISSIVE
#define cfi_failure_handler __ubsan_handle_cfi_check_fail
#else
#define cfi_failure_handler __ubsan_handle_cfi_check_fail_abort
#endif
static inline void handle_cfi_failure(void *ptr)
{
if (IS_ENABLED(CONFIG_CFI_PERMISSIVE))
WARN_RATELIMIT(1, "CFI failure (target: %pS):\n", ptr);
else
panic("CFI failure (target: %pS)\n", ptr);
}
#ifdef CONFIG_MODULES
#ifdef CONFIG_CFI_CLANG_SHADOW
/*
* Index type. A 16-bit index can address at most (2^16)-2 pages (taking
* into account SHADOW_INVALID), i.e. ~256M with 4k pages.
*/
typedef u16 shadow_t;
#define SHADOW_INVALID ((shadow_t)~0UL)
struct cfi_shadow {
/* Page index for the beginning of the shadow */
unsigned long base;
/* An array of __cfi_check locations (as indices to the shadow) */
shadow_t shadow[1];
} __packed;
/*
* The shadow covers ~128M from the beginning of the module region. If
* the region is larger, we fall back to __module_address for the rest.
*/
#define __SHADOW_RANGE (_UL(SZ_128M) >> PAGE_SHIFT)
/* The in-memory size of struct cfi_shadow, always at least one page */
#define __SHADOW_PAGES ((__SHADOW_RANGE * sizeof(shadow_t)) >> PAGE_SHIFT)
#define SHADOW_PAGES max(1UL, __SHADOW_PAGES)
#define SHADOW_SIZE (SHADOW_PAGES << PAGE_SHIFT)
/* The actual size of the shadow array, minus metadata */
#define SHADOW_ARR_SIZE (SHADOW_SIZE - offsetof(struct cfi_shadow, shadow))
#define SHADOW_ARR_SLOTS (SHADOW_ARR_SIZE / sizeof(shadow_t))
static DEFINE_MUTEX(shadow_update_lock);
static struct cfi_shadow __rcu *cfi_shadow __read_mostly;
/* Returns the index in the shadow for the given address */
static inline int ptr_to_shadow(const struct cfi_shadow *s, unsigned long ptr)
{
unsigned long index;
unsigned long page = ptr >> PAGE_SHIFT;
if (unlikely(page < s->base))
return -1; /* Outside of module area */
index = page - s->base;
if (index >= SHADOW_ARR_SLOTS)
return -1; /* Cannot be addressed with shadow */
return (int)index;
}
/* Returns the page address for an index in the shadow */
static inline unsigned long shadow_to_ptr(const struct cfi_shadow *s,
int index)
{
if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS))
return 0;
return (s->base + index) << PAGE_SHIFT;
}
/* Returns the __cfi_check function address for the given shadow location */
static inline unsigned long shadow_to_check_fn(const struct cfi_shadow *s,
int index)
{
if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS))
return 0;
if (unlikely(s->shadow[index] == SHADOW_INVALID))
return 0;
/* __cfi_check is always page aligned */
return (s->base + s->shadow[index]) << PAGE_SHIFT;
}
static void prepare_next_shadow(const struct cfi_shadow __rcu *prev,
struct cfi_shadow *next)
{
int i, index, check;
/* Mark everything invalid */
memset(next->shadow, 0xFF, SHADOW_ARR_SIZE);
if (!prev)
return; /* No previous shadow */
/* If the base address didn't change, an update is not needed */
if (prev->base == next->base) {
memcpy(next->shadow, prev->shadow, SHADOW_ARR_SIZE);
return;
}
/* Convert the previous shadow to the new address range */
for (i = 0; i < SHADOW_ARR_SLOTS; ++i) {
if (prev->shadow[i] == SHADOW_INVALID)
continue;
index = ptr_to_shadow(next, shadow_to_ptr(prev, i));
if (index < 0)
continue;
check = ptr_to_shadow(next,
shadow_to_check_fn(prev, prev->shadow[i]));
if (check < 0)
continue;
next->shadow[index] = (shadow_t)check;
}
}
static void add_module_to_shadow(struct cfi_shadow *s, struct module *mod,
unsigned long min_addr, unsigned long max_addr)
{
int check_index;
unsigned long check = (unsigned long)mod->cfi_check;
unsigned long ptr;
if (unlikely(!PAGE_ALIGNED(check))) {
pr_warn("cfi: not using shadow for module %s\n", mod->name);
return;
}
check_index = ptr_to_shadow(s, check);
if (check_index < 0)
return; /* Module not addressable with shadow */
/* For each page, store the check function index in the shadow */
for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) {
int index = ptr_to_shadow(s, ptr);
if (index >= 0) {
/* Each page must only contain one module */
WARN_ON_ONCE(s->shadow[index] != SHADOW_INVALID);
s->shadow[index] = (shadow_t)check_index;
}
}
}
static void remove_module_from_shadow(struct cfi_shadow *s, struct module *mod,
unsigned long min_addr, unsigned long max_addr)
{
unsigned long ptr;
for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) {
int index = ptr_to_shadow(s, ptr);
if (index >= 0)
s->shadow[index] = SHADOW_INVALID;
}
}
typedef void (*update_shadow_fn)(struct cfi_shadow *, struct module *,
unsigned long min_addr, unsigned long max_addr);
static void update_shadow(struct module *mod, unsigned long base_addr,
update_shadow_fn fn)
{
struct cfi_shadow *prev;
struct cfi_shadow *next;
unsigned long min_addr, max_addr;
next = vmalloc(SHADOW_SIZE);
mutex_lock(&shadow_update_lock);
prev = rcu_dereference_protected(cfi_shadow,
mutex_is_locked(&shadow_update_lock));
if (next) {
next->base = base_addr >> PAGE_SHIFT;
prepare_next_shadow(prev, next);
min_addr = (unsigned long)mod->core_layout.base;
max_addr = min_addr + mod->core_layout.text_size;
fn(next, mod, min_addr & PAGE_MASK, max_addr & PAGE_MASK);
set_memory_ro((unsigned long)next, SHADOW_PAGES);
}
rcu_assign_pointer(cfi_shadow, next);
mutex_unlock(&shadow_update_lock);
synchronize_rcu();
if (prev) {
set_memory_rw((unsigned long)prev, SHADOW_PAGES);
vfree(prev);
}
}
void cfi_module_add(struct module *mod, unsigned long base_addr)
{
update_shadow(mod, base_addr, add_module_to_shadow);
}
void cfi_module_remove(struct module *mod, unsigned long base_addr)
{
update_shadow(mod, base_addr, remove_module_from_shadow);
}
static inline cfi_check_fn ptr_to_check_fn(const struct cfi_shadow __rcu *s,
unsigned long ptr)
{
int index;
if (unlikely(!s))
return NULL; /* No shadow available */
index = ptr_to_shadow(s, ptr);
if (index < 0)
return NULL; /* Cannot be addressed with shadow */
return (cfi_check_fn)shadow_to_check_fn(s, index);
}
static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
{
cfi_check_fn fn;
rcu_read_lock_sched_notrace();
fn = ptr_to_check_fn(rcu_dereference_sched(cfi_shadow), ptr);
rcu_read_unlock_sched_notrace();
return fn;
}
#else /* !CONFIG_CFI_CLANG_SHADOW */
static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
{
return NULL;
}
#endif /* CONFIG_CFI_CLANG_SHADOW */
static inline cfi_check_fn find_module_check_fn(unsigned long ptr)
{
cfi_check_fn fn = NULL;
struct module *mod;
rcu_read_lock_sched_notrace();
mod = __module_address(ptr);
if (mod)
fn = mod->cfi_check;
rcu_read_unlock_sched_notrace();
return fn;
}
static inline cfi_check_fn find_check_fn(unsigned long ptr)
{
cfi_check_fn fn = NULL;
if (is_kernel_text(ptr))
return __cfi_check;
/*
* Indirect call checks can happen when RCU is not watching. Both
* the shadow and __module_address use RCU, so we need to wake it
* up if necessary.
*/
RCU_NONIDLE({
if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW))
fn = find_shadow_check_fn(ptr);
if (!fn)
fn = find_module_check_fn(ptr);
});
return fn;
}
void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
{
cfi_check_fn fn = find_check_fn((unsigned long)ptr);
if (likely(fn))
fn(id, ptr, diag);
else /* Don't allow unchecked modules */
handle_cfi_failure(ptr);
}
EXPORT_SYMBOL(__cfi_slowpath_diag);
#else /* !CONFIG_MODULES */
void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
{
handle_cfi_failure(ptr); /* No modules */
}
EXPORT_SYMBOL(__cfi_slowpath_diag);
#endif /* CONFIG_MODULES */
void cfi_failure_handler(void *data, void *ptr, void *vtable)
{
handle_cfi_failure(ptr);
}
EXPORT_SYMBOL(cfi_failure_handler);
// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/sort.h>
#include <linux/delay.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/delayacct.h>
#include <linux/pid_namespace.h>
#include <linux/cgroupstats.h>
#include <linux/fs_parser.h>
#include <trace/events/cgroup.h>
/*
* pidlists linger the following amount before being destroyed. The goal
* is avoiding frequent destruction in the middle of consecutive read calls
* Expiring in the middle is a performance problem not a correctness one.
* 1 sec should be enough.
*/
#define CGROUP_PIDLIST_DESTROY_DELAY HZ
/* Controllers blocked by the commandline in v1 */
static u16 cgroup_no_v1_mask;
/* disable named v1 mounts */
static bool cgroup_no_v1_named;
/*
* pidlist destructions need to be flushed on cgroup destruction. Use a
* separate workqueue as flush domain.
*/
static struct workqueue_struct *cgroup_pidlist_destroy_wq;
/* protects cgroup_subsys->release_agent_path */
static DEFINE_SPINLOCK(release_agent_path_lock);
bool cgroup1_ssid_disabled(int ssid)
{
return cgroup_no_v1_mask & (1 << ssid);
}
/**
* cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
* @from: attach to all cgroups of a given task
* @tsk: the task to be attached
*
* Return: %0 on success or a negative errno code on failure
*/
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
struct cgroup_root *root;
int retval = 0;
mutex_lock(&cgroup_mutex);
percpu_down_write(&cgroup_threadgroup_rwsem);
for_each_root(root) {
struct cgroup *from_cgrp;
spin_lock_irq(&css_set_lock);
from_cgrp = task_cgroup_from_root(from, root);
spin_unlock_irq(&css_set_lock);
retval = cgroup_attach_task(from_cgrp, tsk, false);
if (retval)
break;
}
percpu_up_write(&cgroup_threadgroup_rwsem);
mutex_unlock(&cgroup_mutex);
return retval;
}
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
/**
* cgroup_transfer_tasks - move tasks from one cgroup to another
* @to: cgroup to which the tasks will be moved
* @from: cgroup in which the tasks currently reside
*
* Locking rules between cgroup_post_fork() and the migration path
* guarantee that, if a task is forking while being migrated, the new child
* is guaranteed to be either visible in the source cgroup after the
* parent's migration is complete or put into the target cgroup. No task
* can slip out of migration through forking.
*
* Return: %0 on success or a negative errno code on failure
*/
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
DEFINE_CGROUP_MGCTX(mgctx);
struct cgrp_cset_link *link;
struct css_task_iter it;
struct task_struct *task;
int ret;
if (cgroup_on_dfl(to))
return -EINVAL;
ret = cgroup_migrate_vet_dst(to);
if (ret)
return ret;
mutex_lock(&cgroup_mutex);
percpu_down_write(&cgroup_threadgroup_rwsem);
/* all tasks in @from are being moved, all csets are source */
spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &from->cset_links, cset_link)
cgroup_migrate_add_src(link->cset, to, &mgctx);
spin_unlock_irq(&css_set_lock);
ret = cgroup_migrate_prepare_dst(&mgctx);
if (ret)
goto out_err;
/*
* Migrate tasks one-by-one until @from is empty. This fails iff
* ->can_attach() fails.
*/
do {
css_task_iter_start(&from->self, 0, &it);
do {
task = css_task_iter_next(&it);
} while (task && (task->flags & PF_EXITING));
if (task)
get_task_struct(task);
css_task_iter_end(&it);
if (task) {
ret = cgroup_migrate(task, false, &mgctx);
if (!ret)
TRACE_CGROUP_PATH(transfer_tasks, to, task, false);
put_task_struct(task);
}
} while (task && !ret);
out_err:
cgroup_migrate_finish(&mgctx);
percpu_up_write(&cgroup_threadgroup_rwsem);
mutex_unlock(&cgroup_mutex);
return ret;
}
/*
* Stuff for reading the 'tasks'/'procs' files.
*
* Reading this file can return large amounts of data if a cgroup has
* *lots* of attached tasks. So it may need several calls to read(),
* but we cannot guarantee that the information we produce is correct
* unless we produce it entirely atomically.
*
*/
/* which pidlist file are we talking about? */
enum cgroup_filetype {
CGROUP_FILE_PROCS,
CGROUP_FILE_TASKS,
};
/*
* A pidlist is a list of pids that virtually represents the contents of one
* of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
* a pair (one each for procs, tasks) for each pid namespace that's relevant
* to the cgroup.
*/
struct cgroup_pidlist {
/*
* used to find which pidlist is wanted. doesn't change as long as
* this particular list stays in the list.
*/
struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
/* array of xids */
pid_t *list;
/* how many elements the above list has */
int length;
/* each of these stored in a list by its cgroup */
struct list_head links;
/* pointer to the cgroup we belong to, for list removal purposes */
struct cgroup *owner;
/* for delayed destruction */
struct delayed_work destroy_dwork;
};
/*
* Used to destroy all pidlists lingering waiting for destroy timer. None
* should be left afterwards.
*/
void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
{
struct cgroup_pidlist *l, *tmp_l;
mutex_lock(&cgrp->pidlist_mutex);
list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
mutex_unlock(&cgrp->pidlist_mutex);
flush_workqueue(cgroup_pidlist_destroy_wq);
BUG_ON(!list_empty(&cgrp->pidlists));
}
static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
{
struct delayed_work *dwork = to_delayed_work(work);
struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
destroy_dwork);
struct cgroup_pidlist *tofree = NULL;
mutex_lock(&l->owner->pidlist_mutex);
/*
* Destroy iff we didn't get queued again. The state won't change
* as destroy_dwork can only be queued while locked.
*/
if (!delayed_work_pending(dwork)) {
list_del(&l->links);
kvfree(l->list);
put_pid_ns(l->key.ns);
tofree = l;
}
mutex_unlock(&l->owner->pidlist_mutex);
kfree(tofree);
}
/*
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
* Returns the number of unique elements.
*/
static int pidlist_uniq(pid_t *list, int length)
{
int src, dest = 1;
/*
* we presume the 0th element is unique, so i starts at 1. trivial
* edge cases first; no work needs to be done for either
*/
if (length == 0 || length == 1)
return length;
/* src and dest walk down the list; dest counts unique elements */
for (src = 1; src < length; src++) {
/* find next unique element */
while (list[src] == list[src-1]) {
src++;
if (src == length)
goto after;
}
/* dest always points to where the next unique element goes */
list[dest] = list[src];
dest++;
}
after:
return dest;
}
/*
* The two pid files - task and cgroup.procs - guaranteed that the result
* is sorted, which forced this whole pidlist fiasco. As pid order is
* different per namespace, each namespace needs differently sorted list,
* making it impossible to use, for example, single rbtree of member tasks
* sorted by task pointer. As pidlists can be fairly large, allocating one
* per open file is dangerous, so cgroup had to implement shared pool of
* pidlists keyed by cgroup and namespace.
*/
static int cmppid(const void *a, const void *b)
{
return *(pid_t *)a - *(pid_t *)b;
}
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
enum cgroup_filetype type)
{
struct cgroup_pidlist *l;
/* don't need task_nsproxy() if we're looking at ourself */
struct pid_namespace *ns = task_active_pid_ns(current);
lockdep_assert_held(&cgrp->pidlist_mutex);
list_for_each_entry(l, &cgrp->pidlists, links)
if (l->key.type == type && l->key.ns == ns)
return l;
return NULL;
}
/*
* find the appropriate pidlist for our purpose (given procs vs tasks)
* returns with the lock on that pidlist already held, and takes care
* of the use count, or returns NULL with no locks held if we're out of
* memory.
*/
static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
enum cgroup_filetype type)
{
struct cgroup_pidlist *l;
lockdep_assert_held(&cgrp->pidlist_mutex);
l = cgroup_pidlist_find(cgrp, type);
if (l)
return l;
/* entry not found; create a new one */
l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
if (!l)
return l;
INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
l->key.type = type;
/* don't need task_nsproxy() if we're looking at ourself */
l->key.ns = get_pid_ns(task_active_pid_ns(current));
l->owner = cgrp;
list_add(&l->links, &cgrp->pidlists);
return l;
}
/*
* Load a cgroup's pidarray with either procs' tgids or tasks' pids
*/
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
struct cgroup_pidlist **lp)
{
pid_t *array;
int length;
int pid, n = 0; /* used for populating the array */
struct css_task_iter it;
struct task_struct *tsk;
struct cgroup_pidlist *l;
lockdep_assert_held(&cgrp->pidlist_mutex);
/*
* If cgroup gets more users after we read count, we won't have
* enough space - tough. This race is indistinguishable to the
* caller from the case that the additional cgroup users didn't
* show up until sometime later on.
*/
length = cgroup_task_count(cgrp);
array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL);
if (!array)
return -ENOMEM;
/* now, populate the array */
css_task_iter_start(&cgrp->self, 0, &it);
while ((tsk = css_task_iter_next(&it))) {
if (unlikely(n == length))
break;
/* get tgid or pid for procs or tasks file respectively */
if (type == CGROUP_FILE_PROCS)
pid = task_tgid_vnr(tsk);
else
pid = task_pid_vnr(tsk);
if (pid > 0) /* make sure to only use valid results */
array[n++] = pid;
}
css_task_iter_end(&it);
length = n;
/* now sort & (if procs) strip out duplicates */
sort(array, length, sizeof(pid_t), cmppid, NULL);
if (type == CGROUP_FILE_PROCS)
length = pidlist_uniq(array, length);
l = cgroup_pidlist_find_create(cgrp, type);
if (!l) {
kvfree(array);
return -ENOMEM;
}
/* store array, freeing old if necessary */
kvfree(l->list);
l->list = array;
l->length = length;
*lp = l;
return 0;
}
/*
* seq_file methods for the tasks/procs files. The seq_file position is the
* next pid to display; the seq_file iterator is a pointer to the pid
* in the cgroup->l->list array.
*/
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
{
/*
* Initially we receive a position value that corresponds to
* one more than the last pid shown (or 0 on the first call or
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
struct kernfs_open_file *of = s->private;
struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *cgrp = seq_css(s)->cgroup;
struct cgroup_pidlist *l;
enum cgroup_filetype type = seq_cft(s)->private;
int index = 0, pid = *pos;
int *iter, ret;
mutex_lock(&cgrp->pidlist_mutex);
/*
* !NULL @ctx->procs1.pidlist indicates that this isn't the first
* start() after open. If the matching pidlist is around, we can use
* that. Look for it. Note that @ctx->procs1.pidlist can't be used
* directly. It could already have been destroyed.
*/
if (ctx->procs1.pidlist)
ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type);
/*
* Either this is the first start() after open or the matching
* pidlist has been destroyed inbetween. Create a new one.
*/
if (!ctx->procs1.pidlist) {
ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist);
if (ret)
return ERR_PTR(ret);
}
l = ctx->procs1.pidlist;
if (pid) {
int end = l->length;
while (index < end) {
int mid = (index + end) / 2;
if (l->list[mid] == pid) {
index = mid;
break;
} else if (l->list[mid] <= pid)
index = mid + 1;
else
end = mid;
}
}
/* If we're off the end of the array, we're done */
if (index >= l->length)
return NULL;
/* Update the abstract position to be the actual pid that we found */
iter = l->list + index;
*pos = *iter;
return iter;
}
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
struct kernfs_open_file *of = s->private;
struct cgroup_file_ctx *ctx = of->priv;
struct cgroup_pidlist *l = ctx->procs1.pidlist;
if (l)
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
CGROUP_PIDLIST_DESTROY_DELAY);
mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
}
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
struct cgroup_file_ctx *ctx = of->priv;
struct cgroup_pidlist *l = ctx->procs1.pidlist;
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
* Advance to the next pid in the array. If this goes off the
* end, we're done
*/
p++;
if (p >= end) {
(*pos)++;
return NULL;
} else {
*pos = *p;
return p;
}
}
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
seq_printf(s, "%d\n", *(int *)v);
return 0;
}
static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off,
bool threadgroup)
{
struct cgroup *cgrp;
struct task_struct *task;
const struct cred *cred, *tcred;
ssize_t ret;
bool locked;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
task = cgroup_procs_write_start(buf, threadgroup, &locked);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
/*
* Even if we're attaching all tasks in the thread group, we only need
* to check permissions on one of them. Check permissions using the
* credentials from file open to protect against inherited fd attacks.
*/
cred = of->file->f_cred;
tcred = get_task_cred(task);
if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
!uid_eq(cred->euid, tcred->uid) &&
!uid_eq(cred->euid, tcred->suid))
ret = -EACCES;
put_cred(tcred);
if (ret)
goto out_finish;
ret = cgroup_attach_task(cgrp, task, threadgroup);
out_finish:
cgroup_procs_write_finish(task, locked);
out_unlock:
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
}
static ssize_t cgroup1_procs_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
return __cgroup1_procs_write(of, buf, nbytes, off, true);
}
static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
return __cgroup1_procs_write(of, buf, nbytes, off, false);
}
static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct cgroup *cgrp;
struct cgroup_file_ctx *ctx;
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
/*
* Release agent gets called with all capabilities,
* require capabilities to set release agent.
*/
ctx = of->priv;
if ((ctx->ns->user_ns != &init_user_ns) ||
!file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN))
return -EPERM;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
spin_lock(&release_agent_path_lock);
strlcpy(cgrp->root->release_agent_path, strstrip(buf),
sizeof(cgrp->root->release_agent_path));
spin_unlock(&release_agent_path_lock);
cgroup_kn_unlock(of->kn);
return nbytes;
}
static int cgroup_release_agent_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
spin_lock(&release_agent_path_lock);
seq_puts(seq, cgrp->root->release_agent_path);
spin_unlock(&release_agent_path_lock);
seq_putc(seq, '\n');
return 0;
}
static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
seq_puts(seq, "0\n");
return 0;
}
static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft)
{
return notify_on_release(css->cgroup);
}
static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
if (val)
set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
else
clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
return 0;
}
static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
}
static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
if (val)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
else
clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
return 0;
}
/* cgroup core interface files for the legacy hierarchies */
struct cftype cgroup1_base_files[] = {
{
.name = "cgroup.procs",
.seq_start = cgroup_pidlist_start,
.seq_next = cgroup_pidlist_next,
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_PROCS,
.write = cgroup1_procs_write,
},
{
.name = "cgroup.clone_children",
.read_u64 = cgroup_clone_children_read,
.write_u64 = cgroup_clone_children_write,
},
{
.name = "cgroup.sane_behavior",
.flags = CFTYPE_ONLY_ON_ROOT,
.seq_show = cgroup_sane_behavior_show,
},
{
.name = "tasks",
.seq_start = cgroup_pidlist_start,
.seq_next = cgroup_pidlist_next,
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_TASKS,
.write = cgroup1_tasks_write,
},
{
.name = "notify_on_release",
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
},
{
.name = "release_agent",
.flags = CFTYPE_ONLY_ON_ROOT,
.seq_show = cgroup_release_agent_show,
.write = cgroup_release_agent_write,
.max_write_len = PATH_MAX - 1,
},
{ } /* terminate */
};
/* Display information about each subsystem and each hierarchy */
int proc_cgroupstats_show(struct seq_file *m, void *v)
{
struct cgroup_subsys *ss;
int i;
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
/*
* Grab the subsystems state racily. No need to add avenue to
* cgroup_mutex contention.
*/
for_each_subsys(ss, i)
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->legacy_name, ss->root->hierarchy_id,
atomic_read(&ss->root->nr_cgrps),
cgroup_ssid_enabled(i));
return 0;
}
/**
* cgroupstats_build - build and fill cgroupstats
* @stats: cgroupstats to fill information into
* @dentry: A dentry entry belonging to the cgroup for which stats have
* been requested.
*
* Build and fill cgroupstats so that taskstats can export it to user
* space.
*
* Return: %0 on success or a negative errno code on failure
*/
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
struct cgroup *cgrp;
struct css_task_iter it;
struct task_struct *tsk;
/* it should be kernfs_node belonging to cgroupfs and is a directory */
if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
kernfs_type(kn) != KERNFS_DIR)
return -EINVAL;
/*
* We aren't being called from kernfs and there's no guarantee on
* @kn->priv's validity. For this and css_tryget_online_from_dir(),
* @kn->priv is RCU safe. Let's do the RCU dancing.
*/
rcu_read_lock();
cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
if (!cgrp || !cgroup_tryget(cgrp)) {
rcu_read_unlock();
return -ENOENT;
}
rcu_read_unlock();
css_task_iter_start(&cgrp->self, 0, &it);
while ((tsk = css_task_iter_next(&it))) {
switch (READ_ONCE(tsk->__state)) {
case TASK_RUNNING:
stats->nr_running++;
break;
case TASK_INTERRUPTIBLE:
stats->nr_sleeping++;
break;
case TASK_UNINTERRUPTIBLE:
stats->nr_uninterruptible++;
break;
case TASK_STOPPED:
stats->nr_stopped++;
break;
default:
if (tsk->in_iowait)
stats->nr_io_wait++;
break;
}
}
css_task_iter_end(&it);
cgroup_put(cgrp);
return 0;
}
void cgroup1_check_for_release(struct cgroup *cgrp)
{
if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
!css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
schedule_work(&cgrp->release_agent_work);
}
/*
* Notify userspace when a cgroup is released, by running the
* configured release agent with the name of the cgroup (path
* relative to the root of cgroup file system) as the argument.
*
* Most likely, this user command will try to rmdir this cgroup.
*
* This races with the possibility that some other task will be
* attached to this cgroup before it is removed, or that some other
* user task will 'mkdir' a child cgroup of this cgroup. That's ok.
* The presumed 'rmdir' will fail quietly if this cgroup is no longer
* unused, and this cgroup will be reprieved from its death sentence,
* to continue to serve a useful existence. Next time it's released,
* we will get notified again, if it still has 'notify_on_release' set.
*
* The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
* means only wait until the task is successfully execve()'d. The
* separate release agent task is forked by call_usermodehelper(),
* then control in this thread returns here, without waiting for the
* release agent task. We don't bother to wait because the caller of
* this routine has no use for the exit status of the release agent
* task, so no sense holding our caller up for that.
*/
void cgroup1_release_agent(struct work_struct *work)
{
struct cgroup *cgrp =
container_of(work, struct cgroup, release_agent_work);
char *pathbuf, *agentbuf;
char *argv[3], *envp[3];
int ret;
/* snoop agent path and exit early if empty */
if (!cgrp->root->release_agent_path[0])
return;
/* prepare argument buffers */
pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
agentbuf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!pathbuf || !agentbuf)
goto out_free;
spin_lock(&release_agent_path_lock);
strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
spin_unlock(&release_agent_path_lock);
if (!agentbuf[0])
goto out_free;
ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
if (ret < 0 || ret >= PATH_MAX)
goto out_free;
argv[0] = agentbuf;
argv[1] = pathbuf;
argv[2] = NULL;
/* minimal command environment */
envp[0] = "HOME=/";
envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
envp[2] = NULL;
call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
out_free:
kfree(agentbuf);
kfree(pathbuf);
}
/*
* cgroup_rename - Only allow simple rename of directories in place.
*/
static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
const char *new_name_str)
{
struct cgroup *cgrp = kn->priv;
int ret;
/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
if (strchr(new_name_str, '\n'))
return -EINVAL;
if (kernfs_type(kn) != KERNFS_DIR)
return -ENOTDIR;
if (kn->parent != new_parent)
return -EIO;
/*
* We're gonna grab cgroup_mutex which nests outside kernfs
* active_ref. kernfs_rename() doesn't require active_ref
* protection. Break them before grabbing cgroup_mutex.
*/
kernfs_break_active_protection(new_parent);
kernfs_break_active_protection(kn);
mutex_lock(&cgroup_mutex);
ret = kernfs_rename(kn, new_parent, new_name_str);
if (!ret)
TRACE_CGROUP_PATH(rename, cgrp);
mutex_unlock(&cgroup_mutex);
kernfs_unbreak_active_protection(kn);
kernfs_unbreak_active_protection(new_parent);
return ret;
}
static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
{
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_subsys *ss;
int ssid;
for_each_subsys(ss, ssid)
if (root->subsys_mask & (1 << ssid))
seq_show_option(seq, ss->legacy_name, NULL);
if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
seq_puts(seq, ",xattr");
if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
seq_puts(seq, ",cpuset_v2_mode");
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
seq_show_option(seq, "release_agent",
root->release_agent_path);
spin_unlock(&release_agent_path_lock);
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
seq_puts(seq, ",clone_children");
if (strlen(root->name))
seq_show_option(seq, "name", root->name);
return 0;
}
enum cgroup1_param {
Opt_all,
Opt_clone_children,
Opt_cpuset_v2_mode,
Opt_name,
Opt_none,
Opt_noprefix,
Opt_release_agent,
Opt_xattr,
};
const struct fs_parameter_spec cgroup1_fs_parameters[] = {
fsparam_flag ("all", Opt_all),
fsparam_flag ("clone_children", Opt_clone_children),
fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode),
fsparam_string("name", Opt_name),
fsparam_flag ("none", Opt_none),
fsparam_flag ("noprefix", Opt_noprefix),
fsparam_string("release_agent", Opt_release_agent),
fsparam_flag ("xattr", Opt_xattr),
{}
};
int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
struct cgroup_subsys *ss;
struct fs_parse_result result;
int opt, i;
opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
if (opt == -ENOPARAM) {
int ret;
ret = vfs_parse_fs_param_source(fc, param);
if (ret != -ENOPARAM)
return ret;
for_each_subsys(ss, i) {
if (strcmp(param->key, ss->legacy_name))
continue;
if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i))
return invalfc(fc, "Disabled controller '%s'",
param->key);
ctx->subsys_mask |= (1 << i);
return 0;
}
return invalfc(fc, "Unknown subsys name '%s'", param->key);
}
if (opt < 0)
return opt;
switch (opt) {
case Opt_none:
/* Explicitly have no subsystems */
ctx->none = true;
break;
case Opt_all:
ctx->all_ss = true;
break;
case Opt_noprefix:
ctx->flags |= CGRP_ROOT_NOPREFIX;
break;
case Opt_clone_children:
ctx->cpuset_clone_children = true;
break;
case Opt_cpuset_v2_mode:
ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
break;
case Opt_xattr:
ctx->flags |= CGRP_ROOT_XATTR;
break;
case Opt_release_agent:
/* Specifying two release agents is forbidden */
if (ctx->release_agent)
return invalfc(fc, "release_agent respecified");
/*
* Release agent gets called with all capabilities,
* require capabilities to set release agent.
*/
if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))
return invalfc(fc, "Setting release_agent not allowed");
ctx->release_agent = param->string;
param->string = NULL;
break;
case Opt_name:
/* blocked by boot param? */
if (cgroup_no_v1_named)
return -ENOENT;
/* Can't specify an empty name */
if (!param->size)
return invalfc(fc, "Empty name");
if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)
return invalfc(fc, "Name too long");
/* Must match [\w.-]+ */
for (i = 0; i < param->size; i++) {
char c = param->string[i];
if (isalnum(c))
continue;
if ((c == '.') || (c == '-') || (c == '_'))
continue;
return invalfc(fc, "Invalid name");
}
/* Specifying two names is forbidden */
if (ctx->name)
return invalfc(fc, "name respecified");
ctx->name = param->string;
param->string = NULL;
break;
}
return 0;
}
static int check_cgroupfs_options(struct fs_context *fc)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
u16 mask = U16_MAX;
u16 enabled = 0;
struct cgroup_subsys *ss;
int i;
#ifdef CONFIG_CPUSETS
mask = ~((u16)1 << cpuset_cgrp_id);
#endif
for_each_subsys(ss, i)
if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
enabled |= 1 << i;
ctx->subsys_mask &= enabled;
/*
* In absence of 'none', 'name=' and subsystem name options,
* let's default to 'all'.
*/
if (!ctx->subsys_mask && !ctx->none && !ctx->name)
ctx->all_ss = true;
if (ctx->all_ss) {
/* Mutually exclusive option 'all' + subsystem name */
if (ctx->subsys_mask)
return invalfc(fc, "subsys name conflicts with all");
/* 'all' => select all the subsystems */
ctx->subsys_mask = enabled;
}
/*
* We either have to specify by name or by subsystems. (So all
* empty hierarchies must have a name).
*/
if (!ctx->subsys_mask && !ctx->name)
return invalfc(fc, "Need name or subsystem set");
/*
* Option noprefix was introduced just for backward compatibility
* with the old cpuset, so we allow noprefix only if mounting just
* the cpuset subsystem.
*/
if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))
return invalfc(fc, "noprefix used incorrectly");
/* Can't specify "none" and some subsystems */
if (ctx->subsys_mask && ctx->none)
return invalfc(fc, "none used incorrectly");
return 0;
}
int cgroup1_reconfigure(struct fs_context *fc)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
int ret = 0;
u16 added_mask, removed_mask;
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
/* See what subsystems are wanted */
ret = check_cgroupfs_options(fc);
if (ret)
goto out_unlock;
if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent)
pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
task_tgid_nr(current), current->comm);
added_mask = ctx->subsys_mask & ~root->subsys_mask;
removed_mask = root->subsys_mask & ~ctx->subsys_mask;
/* Don't allow flags or name to change at remount */
if ((ctx->flags ^ root->flags) ||
(ctx->name && strcmp(ctx->name, root->name))) {
errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
ctx->flags, ctx->name ?: "", root->flags, root->name);
ret = -EINVAL;
goto out_unlock;
}
/* remounting is not allowed for populated hierarchies */
if (!list_empty(&root->cgrp.self.children)) {
ret = -EBUSY;
goto out_unlock;
}
ret = rebind_subsystems(root, added_mask);
if (ret)
goto out_unlock;
WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
if (ctx->release_agent) {
spin_lock(&release_agent_path_lock);
strcpy(root->release_agent_path, ctx->release_agent);
spin_unlock(&release_agent_path_lock);
}
trace_cgroup_remount(root);
out_unlock:
mutex_unlock(&cgroup_mutex);
return ret;
}
struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
.rename = cgroup1_rename,
.show_options = cgroup1_show_options,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
.show_path = cgroup_show_path,
};
/*
* The guts of cgroup1 mount - find or create cgroup_root to use.
* Called with cgroup_mutex held; returns 0 on success, -E... on
* error and positive - in case when the candidate is busy dying.
* On success it stashes a reference to cgroup_root into given
* cgroup_fs_context; that reference is *NOT* counting towards the
* cgroup_root refcount.
*/
static int cgroup1_root_to_use(struct fs_context *fc)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
struct cgroup_root *root;
struct cgroup_subsys *ss;
int i, ret;
/* First find the desired set of subsystems */
ret = check_cgroupfs_options(fc);
if (ret)
return ret;
/*
* Destruction of cgroup root is asynchronous, so subsystems may
* still be dying after the previous unmount. Let's drain the
* dying subsystems. We just need to ensure that the ones
* unmounted previously finish dying and don't care about new ones
* starting. Testing ref liveliness is good enough.
*/
for_each_subsys(ss, i) {
if (!(ctx->subsys_mask & (1 << i)) ||
ss->root == &cgrp_dfl_root)
continue;
if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt))
return 1; /* restart */
cgroup_put(&ss->root->cgrp);
}
for_each_root(root) {
bool name_match = false;
if (root == &cgrp_dfl_root)
continue;
/*
* If we asked for a name then it must match. Also, if
* name matches but sybsys_mask doesn't, we should fail.
* Remember whether name matched.
*/
if (ctx->name) {
if (strcmp(ctx->name, root->name))
continue;
name_match = true;
}
/*
* If we asked for subsystems (or explicitly for no
* subsystems) then they must match.
*/
if ((ctx->subsys_mask || ctx->none) &&
(ctx->subsys_mask != root->subsys_mask)) {
if (!name_match)
continue;
return -EBUSY;
}
if (root->flags ^ ctx->flags)
pr_warn("new mount options do not match the existing superblock, will be ignored\n");
ctx->root = root;
return 0;
}
/*
* No such thing, create a new one. name= matching without subsys
* specification is allowed for already existing hierarchies but we
* can't create new one without subsys specification.
*/
if (!ctx->subsys_mask && !ctx->none)
return invalfc(fc, "No subsys list or none specified");
/* Hierarchies may only be created in the initial cgroup namespace. */
if (ctx->ns != &init_cgroup_ns)
return -EPERM;
root = kzalloc(sizeof(*root), GFP_KERNEL);
if (!root)
return -ENOMEM;
ctx->root = root;
init_cgroup_root(ctx);
ret = cgroup_setup_root(root, ctx->subsys_mask);
if (ret)
cgroup_free_root(root);
return ret;
}
int cgroup1_get_tree(struct fs_context *fc)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
int ret;
/* Check if the caller has permission to mount. */
if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
ret = cgroup1_root_to_use(fc);
if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
ret = 1; /* restart */
mutex_unlock(&cgroup_mutex);
if (!ret)
ret = cgroup_do_get_tree(fc);
if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
fc_drop_locked(fc);
ret = 1;
}
if (unlikely(ret > 0)) {
msleep(10);
return restart_syscall();
}
return ret;
}
static int __init cgroup1_wq_init(void)
{
/*
* Used to destroy pidlists and separate to serve as flush domain.
* Cap @max_active to 1 too.
*/
cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
0, 1);
BUG_ON(!cgroup_pidlist_destroy_wq);
return 0;
}
core_initcall(cgroup1_wq_init);
static int __init cgroup_no_v1(char *str)
{
struct cgroup_subsys *ss;
char *token;
int i;
while ((token = strsep(&str, ",")) != NULL) {
if (!*token)
continue;
if (!strcmp(token, "all")) {
cgroup_no_v1_mask = U16_MAX;
continue;
}
if (!strcmp(token, "named")) {
cgroup_no_v1_named = true;
continue;
}
for_each_subsys(ss, i) {
if (strcmp(token, ss->name) &&
strcmp(token, ss->legacy_name))
continue;
cgroup_no_v1_mask |= 1 << i;
}
}
return 1;
}
__setup("cgroup_no_v1=", cgroup_no_v1);
This file has been truncated, but you can view the full file.
/*
* Generic process-grouping system.
*
* Based originally on the cpuset system, extracted by Paul Menage
* Copyright (C) 2006 Google, Inc
*
* Notifications support
* Copyright (C) 2009 Nokia Corporation
* Author: Kirill A. Shutemov
*
* Copyright notices from the original cpuset code:
* --------------------------------------------------
* Copyright (C) 2003 BULL SA.
* Copyright (C) 2004-2006 Silicon Graphics, Inc.
*
* Portions derived from Patrick Mochel's sysfs code.
* sysfs is Copyright (c) 2001-3 Patrick Mochel
*
* 2003-10-10 Written by Simon Derr.
* 2003-10-22 Updates by Stephen Hemminger.
* 2004 May-July Rework by Paul Jackson.
* ---------------------------------------------------
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file COPYING in the main directory of the Linux
* distribution for more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "cgroup-internal.h"
#include <linux/bpf-cgroup.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/hashtable.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <linux/fs_parser.h>
#include <linux/sched/cputime.h>
#include <linux/psi.h>
#include <net/sock.h>
#define CREATE_TRACE_POINTS
#include <trace/events/cgroup.h>
#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
MAX_CFTYPE_NAME + 2)
/* let's not notify more than 100 times per second */
#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
/*
* To avoid confusing the compiler (and generating warnings) with code
* that attempts to access what would be a 0-element array (i.e. sized
* to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this
* constant expression can be added.
*/
#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0)
/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
*
* css_set_lock protects task->cgroups pointer, the list of css_set
* objects, and the chain of tasks off each css_set.
*
* These locks are exported if CONFIG_PROVE_RCU so that accessors in
* cgroup.h can use them for lockdep annotations.
*/
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);
#ifdef CONFIG_PROVE_RCU
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
bool cgroup_debug __read_mostly;
/*
* Protects cgroup_idr and css_idr so that IDs can be released without
* grabbing cgroup_mutex.
*/
static DEFINE_SPINLOCK(cgroup_idr_lock);
/*
* Protects cgroup_file->kn for !self csses. It synchronizes notifications
* against file removal/re-creation across css hiding.
*/
static DEFINE_SPINLOCK(cgroup_file_kn_lock);
DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
#define cgroup_assert_mutex_or_rcu_locked() \
RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
!lockdep_is_held(&cgroup_mutex), \
"cgroup_mutex or RCU read lock required");
/*
* cgroup destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup
* destruction work items don't end up filling up max_active of system_wq
* which may lead to deadlock.
*/
static struct workqueue_struct *cgroup_destroy_wq;
/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
/* array of cgroup subsystem names */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
#define SUBSYS(_x) \
DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \
DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \
EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \
EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
/* the default hierarchy */
struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
/*
* The default hierarchy always exists but is hidden until mounted for the
* first time. This is for backward compatibility.
*/
static bool cgrp_dfl_visible;
/* some controllers are not supported in the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;
/* some controllers are implicitly enabled on the default hierarchy */
static u16 cgrp_dfl_implicit_ss_mask;
/* some controllers can be threaded on the default hierarchy */
static u16 cgrp_dfl_threaded_ss_mask;
/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);
/*
* Assign a monotonically increasing serial number to csses. It guarantees
* cgroups with bigger numbers are newer than those with smaller numbers.
* Also, as csses are always appended to the parent's ->children list, it
* guarantees that sibling csses are always sorted in the ascending serial
* number order on the list. Protected by cgroup_mutex.
*/
static u64 css_serial_nr_next = 1;
/*
* These bitmasks identify subsystems with specific features to avoid
* having to do iterative checks repeatedly.
*/
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
.ns.count = REFCOUNT_INIT(2),
.user_ns = &init_user_ns,
.ns.ops = &cgroupns_operations,
.ns.inum = PROC_CGROUP_INIT_INO,
.root_cset = &init_css_set,
};
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];
/* cgroup optional features */
enum cgroup_opt_features {
#ifdef CONFIG_PSI
OPT_FEATURE_PRESSURE,
#endif
OPT_FEATURE_COUNT
};
static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
#ifdef CONFIG_PSI
"pressure",
#endif
};
static u16 cgroup_feature_disable_mask __read_mostly;
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
/**
* cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
* @ssid: subsys ID of interest
*
* cgroup_subsys_enabled() can only be used with literal subsys names which
* is fine for individual subsystems but unsuitable for cgroup core. This
* is slower static_key_enabled() based test indexed by @ssid.
*/
bool cgroup_ssid_enabled(int ssid)
{
if (!CGROUP_HAS_SUBSYS_CONFIG)
return false;
return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}
/**
* cgroup_on_dfl - test whether a cgroup is on the default hierarchy
* @cgrp: the cgroup of interest
*
* The default hierarchy is the v2 interface of cgroup and this function
* can be used to test whether a cgroup is on the default hierarchy for
* cases where a subsystem should behave differently depending on the
* interface version.
*
* List of changed behaviors:
*
* - Mount options "noprefix", "xattr", "clone_children", "release_agent"
* and "name" are disallowed.
*
* - When mounting an existing superblock, mount options should match.
*
* - Remount is disallowed.
*
* - rename(2) is disallowed.
*
* - "tasks" is removed. Everything should be at process granularity. Use
* "cgroup.procs" instead.
*
* - "cgroup.procs" is not sorted. pids will be unique unless they got
* recycled in-between reads.
*
* - "release_agent" and "notify_on_release" are removed. Replacement
* notification mechanism will be implemented.
*
* - "cgroup.clone_children" is removed.
*
* - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
* and its descendants contain no task; otherwise, 1. The file also
* generates kernfs notification which can be monitored through poll and
* [di]notify when the value of the file changes.
*
* - cpuset: tasks will be kept in empty cpusets when hotplug happens and
* take masks of ancestors with non-empty cpus/mems, instead of being
* moved to an ancestor.
*
* - cpuset: a task can be moved into an empty cpuset, and again it takes
* masks of ancestors.
*
* - blkcg: blk-throttle becomes properly hierarchical.
*
* - debug: disallowed on the default hierarchy.
*/
bool cgroup_on_dfl(const struct cgroup *cgrp)
{
return cgrp->root == &cgrp_dfl_root;
}
/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
gfp_t gfp_mask)
{
int ret;
idr_preload(gfp_mask);
spin_lock_bh(&cgroup_idr_lock);
ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
spin_unlock_bh(&cgroup_idr_lock);
idr_preload_end();
return ret;
}
static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
{
void *ret;
spin_lock_bh(&cgroup_idr_lock);
ret = idr_replace(idr, ptr, id);
spin_unlock_bh(&cgroup_idr_lock);
return ret;
}
static void cgroup_idr_remove(struct idr *idr, int id)
{
spin_lock_bh(&cgroup_idr_lock);
idr_remove(idr, id);
spin_unlock_bh(&cgroup_idr_lock);
}
static bool cgroup_has_tasks(struct cgroup *cgrp)
{
return cgrp->nr_populated_csets;
}
bool cgroup_is_threaded(struct cgroup *cgrp)
{
return cgrp->dom_cgrp != cgrp;
}
/* can @cgrp host both domain and threaded children? */
static bool cgroup_is_mixable(struct cgroup *cgrp)
{
/*
* Root isn't under domain level resource control exempting it from
* the no-internal-process constraint, so it can serve as a thread
* root and a parent of resource domains at the same time.
*/
return !cgroup_parent(cgrp);
}
/* can @cgrp become a thread root? Should always be true for a thread root */
static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
{
/* mixables don't care */
if (cgroup_is_mixable(cgrp))
return true;
/* domain roots can't be nested under threaded */
if (cgroup_is_threaded(cgrp))
return false;
/* can only have either domain or threaded children */
if (cgrp->nr_populated_domain_children)
return false;
/* and no domain controllers can be enabled */
if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
return false;
return true;
}
/* is @cgrp root of a threaded subtree? */
bool cgroup_is_thread_root(struct cgroup *cgrp)
{
/* thread root should be a domain */
if (cgroup_is_threaded(cgrp))
return false;
/* a domain w/ threaded children is a thread root */
if (cgrp->nr_threaded_children)
return true;
/*
* A domain which has tasks and explicit threaded controllers
* enabled is a thread root.
*/
if (cgroup_has_tasks(cgrp) &&
(cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
return true;
return false;
}
/* a domain which isn't connected to the root w/o brekage can't be used */
static bool cgroup_is_valid_domain(struct cgroup *cgrp)
{
/* the cgroup itself can be a thread root */
if (cgroup_is_threaded(cgrp))
return false;
/* but the ancestors can't be unless mixable */
while ((cgrp = cgroup_parent(cgrp))) {
if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
return false;
if (cgroup_is_threaded(cgrp))
return false;
}
return true;
}
/* subsystems visibly enabled on a cgroup */
static u16 cgroup_control(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
u16 root_ss_mask = cgrp->root->subsys_mask;
if (parent) {
u16 ss_mask = parent->subtree_control;
/* threaded cgroups can only have threaded controllers */
if (cgroup_is_threaded(cgrp))
ss_mask &= cgrp_dfl_threaded_ss_mask;
return ss_mask;
}
if (cgroup_on_dfl(cgrp))
root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
cgrp_dfl_implicit_ss_mask);
return root_ss_mask;
}
/* subsystems enabled on a cgroup */
static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
if (parent) {
u16 ss_mask = parent->subtree_ss_mask;
/* threaded cgroups can only have threaded controllers */
if (cgroup_is_threaded(cgrp))
ss_mask &= cgrp_dfl_threaded_ss_mask;
return ss_mask;
}
return cgrp->root->subsys_mask;
}
/**
* cgroup_css - obtain a cgroup's css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
*
* Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
* function must be called either under cgroup_mutex or rcu_read_lock() and
* the caller is responsible for pinning the returned css if it wants to
* keep accessing it outside the said locks. This function may return
* %NULL if @cgrp doesn't have @subsys_id enabled.
*/
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
if (CGROUP_HAS_SUBSYS_CONFIG && ss)
return rcu_dereference_check(cgrp->subsys[ss->id],
lockdep_is_held(&cgroup_mutex));
else
return &cgrp->self;
}
/**
* cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest
*
* Find and get @cgrp's css associated with @ss. If the css doesn't exist
* or is offline, %NULL is returned.
*/
static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
rcu_read_lock();
css = cgroup_css(cgrp, ss);
if (css && !css_tryget_online(css))
css = NULL;
rcu_read_unlock();
return css;
}
/**
* cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
*
* Similar to cgroup_css() but returns the effective css, which is defined
* as the matching css of the nearest ancestor including self which has @ss
* enabled. If @ss is associated with the hierarchy @cgrp is on, this
* function is guaranteed to return non-NULL css.
*/
static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
lockdep_assert_held(&cgroup_mutex);
if (!ss)
return &cgrp->self;
/*
* This function is used while updating css associations and thus
* can't test the csses directly. Test ss_mask.
*/
while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
cgrp = cgroup_parent(cgrp);
if (!cgrp)
return NULL;
}
return cgroup_css(cgrp, ss);
}
/**
* cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest
*
* Find and get the effective css of @cgrp for @ss. The effective css is
* defined as the matching css of the nearest ancestor including self which
* has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
* the root css is returned, so this function always returns a valid css.
*
* The returned css is not guaranteed to be online, and therefore it is the
* callers responsibility to try get a reference for it.
*/
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
if (!CGROUP_HAS_SUBSYS_CONFIG)
return NULL;
do {
css = cgroup_css(cgrp, ss);
if (css)
return css;
cgrp = cgroup_parent(cgrp);
} while (cgrp);
return init_css_set.subsys[ss->id];
}
/**
* cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest
*
* Find and get the effective css of @cgrp for @ss. The effective css is
* defined as the matching css of the nearest ancestor including self which
* has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
* the root css is returned, so this function always returns a valid css.
* The returned css must be put using css_put().
*/
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
if (!CGROUP_HAS_SUBSYS_CONFIG)
return NULL;
rcu_read_lock();
do {
css = cgroup_css(cgrp, ss);
if (css && css_tryget_online(css))
goto out_unlock;
cgrp = cgroup_parent(cgrp);
} while (cgrp);
css = init_css_set.subsys[ss->id];
css_get(css);
out_unlock:
rcu_read_unlock();
return css;
}
EXPORT_SYMBOL_GPL(cgroup_get_e_css);
static void cgroup_get_live(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
css_get(&cgrp->self);
}
/**
* __cgroup_task_count - count the number of tasks in a cgroup. The caller
* is responsible for taking the css_set_lock.
* @cgrp: the cgroup in question
*/
int __cgroup_task_count(const struct cgroup *cgrp)
{
int count = 0;
struct cgrp_cset_link *link;
lockdep_assert_held(&css_set_lock);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
count += link->cset->nr_tasks;
return count;
}
/**
* cgroup_task_count - count the number of tasks in a cgroup.
* @cgrp: the cgroup in question
*/
int cgroup_task_count(const struct cgroup *cgrp)
{
int count;
spin_lock_irq(&css_set_lock);
count = __cgroup_task_count(cgrp);
spin_unlock_irq(&css_set_lock);
return count;
}
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
struct cftype *cft = of_cft(of);
/*
* This is open and unprotected implementation of cgroup_css().
* seq_css() is only called from a kernfs file operation which has
* an active reference on the file. Because all the subsystem
* files are drained before a css is disassociated with a cgroup,
* the matching css from the cgroup's subsys table is guaranteed to
* be and stay valid until the enclosing operation is complete.
*/
if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss)
return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
else
return &cgrp->self;
}
EXPORT_SYMBOL_GPL(of_css);
/**
* for_each_css - iterate all css's of a cgroup
* @css: the iteration cursor
* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
* @cgrp: the target cgroup to iterate css's of
*
* Should be called under cgroup_[tree_]mutex.
*/
#define for_each_css(css, ssid, cgrp) \
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
if (!((css) = rcu_dereference_check( \
(cgrp)->subsys[(ssid)], \
lockdep_is_held(&cgroup_mutex)))) { } \
else
/**
* for_each_e_css - iterate all effective css's of a cgroup
* @css: the iteration cursor
* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
* @cgrp: the target cgroup to iterate css's of
*
* Should be called under cgroup_[tree_]mutex.
*/
#define for_each_e_css(css, ssid, cgrp) \
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
if (!((css) = cgroup_e_css_by_mask(cgrp, \
cgroup_subsys[(ssid)]))) \
; \
else
/**
* do_each_subsys_mask - filter for_each_subsys with a bitmask
* @ss: the iteration cursor
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
* @ss_mask: the bitmask
*
* The block will only run for cases where the ssid-th bit (1 << ssid) of
* @ss_mask is set.
*/
#define do_each_subsys_mask(ss, ssid, ss_mask) do { \
unsigned long __ss_mask = (ss_mask); \
if (!CGROUP_HAS_SUBSYS_CONFIG) { \
(ssid) = 0; \
break; \
} \
for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \
(ss) = cgroup_subsys[ssid]; \
{
#define while_each_subsys_mask() \
} \
} \
} while (false)
/* iterate over child cgrps, lock should be held throughout iteration */
#define cgroup_for_each_live_child(child, cgrp) \
list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
if (({ lockdep_assert_held(&cgroup_mutex); \
cgroup_is_dead(child); })) \
; \
else
/* walk live descendants in pre order */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \
css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \
if (({ lockdep_assert_held(&cgroup_mutex); \
(dsct) = (d_css)->cgroup; \
cgroup_is_dead(dsct); })) \
; \
else
/* walk live descendants in postorder */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \
css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
if (({ lockdep_assert_held(&cgroup_mutex); \
(dsct) = (d_css)->cgroup; \
cgroup_is_dead(dsct); })) \
; \
else
/*
* The default css_set - used by init and its children prior to any
* hierarchies being mounted. It contains a pointer to the root state
* for each subsystem. Also used to anchor the list of css_sets. Not
* reference-counted, to improve performance when child cgroups
* haven't been created.
*/
struct css_set init_css_set = {
.refcount = REFCOUNT_INIT(1),
.dom_cset = &init_css_set,
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
.dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
/*
* The following field is re-initialized when this cset gets linked
* in cgroup_init(). However, let's initialize the field
* statically too so that the default cgroup can be accessed safely
* early during boot.
*/
.dfl_cgrp = &cgrp_dfl_root.cgrp,
};
static int css_set_count = 1; /* 1 for init_css_set */
static bool css_set_threaded(struct css_set *cset)
{
return cset->dom_cset != cset;
}
/**
* css_set_populated - does a css_set contain any tasks?
* @cset: target css_set
*
* css_set_populated() should be the same as !!cset->nr_tasks at steady
* state. However, css_set_populated() can be called while a task is being
* added to or removed from the linked list before the nr_tasks is
* properly updated. Hence, we can't just look at ->nr_tasks here.
*/
static bool css_set_populated(struct css_set *cset)
{
lockdep_assert_held(&css_set_lock);
return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
}
/**
* cgroup_update_populated - update the populated count of a cgroup
* @cgrp: the target cgroup
* @populated: inc or dec populated count
*
* One of the css_sets associated with @cgrp is either getting its first
* task or losing the last. Update @cgrp->nr_populated_* accordingly. The
* count is propagated towards root so that a given cgroup's
* nr_populated_children is zero iff none of its descendants contain any
* tasks.
*
* @cgrp's interface file "cgroup.populated" is zero if both
* @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
* 1 otherwise. When the sum changes from or to zero, userland is notified
* that the content of the interface file has changed. This can be used to
* detect when @cgrp and its descendants become populated or empty.
*/
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
struct cgroup *child = NULL;
int adj = populated ? 1 : -1;
lockdep_assert_held(&css_set_lock);
do {
bool was_populated = cgroup_is_populated(cgrp);
if (!child) {
cgrp->nr_populated_csets += adj;
} else {
if (cgroup_is_threaded(child))
cgrp->nr_populated_threaded_children += adj;
else
cgrp->nr_populated_domain_children += adj;
}
if (was_populated == cgroup_is_populated(cgrp))
break;
cgroup1_check_for_release(cgrp);
TRACE_CGROUP_PATH(notify_populated, cgrp,
cgroup_is_populated(cgrp));
cgroup_file_notify(&cgrp->events_file);
child = cgrp;
cgrp = cgroup_parent(cgrp);
} while (cgrp);
}
/**
* css_set_update_populated - update populated state of a css_set
* @cset: target css_set
* @populated: whether @cset is populated or depopulated
*
* @cset is either getting the first task or losing the last. Update the
* populated counters of all associated cgroups accordingly.
*/
static void css_set_update_populated(struct css_set *cset, bool populated)
{
struct cgrp_cset_link *link;
lockdep_assert_held(&css_set_lock);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
cgroup_update_populated(link->cgrp, populated);
}
/*
* @task is leaving, advance task iterators which are pointing to it so
* that they can resume at the next position. Advancing an iterator might
* remove it from the list, use safe walk. See css_task_iter_skip() for
* details.
*/
static void css_set_skip_task_iters(struct css_set *cset,
struct task_struct *task)
{
struct css_task_iter *it, *pos;
list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
css_task_iter_skip(it, task);
}
/**
* css_set_move_task - move a task from one css_set to another
* @task: task being moved
* @from_cset: css_set @task currently belongs to (may be NULL)
* @to_cset: new css_set @task is being moved to (may be NULL)
* @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
*
* Move @task from @from_cset to @to_cset. If @task didn't belong to any
* css_set, @from_cset can be NULL. If @task is being disassociated
* instead of moved, @to_cset can be NULL.
*
* This function automatically handles populated counter updates and
* css_task_iter adjustments but the caller is responsible for managing
* @from_cset and @to_cset's reference counts.
*/
static void css_set_move_task(struct task_struct *task,
struct css_set *from_cset, struct css_set *to_cset,
bool use_mg_tasks)
{
lockdep_assert_held(&css_set_lock);
if (to_cset && !css_set_populated(to_cset))
css_set_update_populated(to_cset, true);
if (from_cset) {
WARN_ON_ONCE(list_empty(&task->cg_list));
css_set_skip_task_iters(from_cset, task);
list_del_init(&task->cg_list);
if (!css_set_populated(from_cset))
css_set_update_populated(from_cset, false);
} else {
WARN_ON_ONCE(!list_empty(&task->cg_list));
}
if (to_cset) {
/*
* We are synchronized through cgroup_threadgroup_rwsem
* against PF_EXITING setting such that we can't race
* against cgroup_exit()/cgroup_free() dropping the css_set.
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
cgroup_move_task(task, to_cset);
list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
&to_cset->tasks);
}
}
/*
* hash table for cgroup groups. This improves the performance to find
* an existing css_set. This hash doesn't (currently) take into
* account cgroups in empty hierarchies.
*/
#define CSS_SET_HASH_BITS 7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
unsigned long key = 0UL;
struct cgroup_subsys *ss;
int i;
for_each_subsys(ss, i)
key += (unsigned long)css[i];
key = (key >> 16) ^ key;
return key;
}
void put_css_set_locked(struct css_set *cset)
{
struct cgrp_cset_link *link, *tmp_link;
struct cgroup_subsys *ss;
int ssid;
lockdep_assert_held(&css_set_lock);
if (!refcount_dec_and_test(&cset->refcount))
return;
WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
/* This css_set is dead. Unlink it and release cgroup and css refs */
for_each_subsys(ss, ssid) {
list_del(&cset->e_cset_node[ssid]);
css_put(cset->subsys[ssid]);
}
hash_del(&cset->hlist);
css_set_count--;
list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
list_del(&link->cset_link);
list_del(&link->cgrp_link);
if (cgroup_parent(link->cgrp))
cgroup_put(link->cgrp);
kfree(link);
}
if (css_set_threaded(cset)) {
list_del(&cset->threaded_csets_node);
put_css_set_locked(cset->dom_cset);
}
kfree_rcu(cset, rcu_head);
}
/**
* compare_css_sets - helper function for find_existing_css_set().
* @cset: candidate css_set being tested
* @old_cset: existing css_set for a task
* @new_cgrp: cgroup that's being entered by the task
* @template: desired set of css pointers in css_set (pre-calculated)
*
* Returns true if "cset" matches "old_cset" except for the hierarchy
* which "new_cgrp" belongs to, for which it should match "new_cgrp".
*/
static bool compare_css_sets(struct css_set *cset,
struct css_set *old_cset,
struct cgroup *new_cgrp,
struct cgroup_subsys_state *template[])
{
struct cgroup *new_dfl_cgrp;
struct list_head *l1, *l2;
/*
* On the default hierarchy, there can be csets which are
* associated with the same set of cgroups but different csses.
* Let's first ensure that csses match.
*/
if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
return false;
/* @cset's domain should match the default cgroup's */
if (cgroup_on_dfl(new_cgrp))
new_dfl_cgrp = new_cgrp;
else
new_dfl_cgrp = old_cset->dfl_cgrp;
if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
return false;
/*
* Compare cgroup pointers in order to distinguish between
* different cgroups in hierarchies. As different cgroups may
* share the same effective css, this comparison is always
* necessary.
*/
l1 = &cset->cgrp_links;
l2 = &old_cset->cgrp_links;
while (1) {
struct cgrp_cset_link *link1, *link2;
struct cgroup *cgrp1, *cgrp2;
l1 = l1->next;
l2 = l2->next;
/* See if we reached the end - both lists are equal length. */
if (l1 == &cset->cgrp_links) {
BUG_ON(l2 != &old_cset->cgrp_links);
break;
} else {
BUG_ON(l2 == &old_cset->cgrp_links);
}
/* Locate the cgroups associated with these links. */
link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
cgrp1 = link1->cgrp;
cgrp2 = link2->cgrp;
/* Hierarchies should be linked in the same order. */
BUG_ON(cgrp1->root != cgrp2->root);
/*
* If this hierarchy is the hierarchy of the cgroup
* that's changing, then we need to check that this
* css_set points to the new cgroup; if it's any other
* hierarchy, then this css_set should point to the
* same cgroup as the old css_set.
*/
if (cgrp1->root == new_cgrp->root) {
if (cgrp1 != new_cgrp)
return false;
} else {
if (cgrp1 != cgrp2)
return false;
}
}
return true;
}
/**
* find_existing_css_set - init css array and find the matching css_set
* @old_cset: the css_set that we're using before the cgroup transition
* @cgrp: the cgroup that we're moving into
* @template: out param for the new set of csses, should be clear on entry
*/
static struct css_set *find_existing_css_set(struct css_set *old_cset,
struct cgroup *cgrp,
struct cgroup_subsys_state *template[])
{
struct cgroup_root *root = cgrp->root;
struct cgroup_subsys *ss;
struct css_set *cset;
unsigned long key;
int i;
/*
* Build the set of subsystem state objects that we want to see in the
* new css_set. While subsystems can change globally, the entries here
* won't change, so no need for locking.
*/
for_each_subsys(ss, i) {
if (root->subsys_mask & (1UL << i)) {
/*
* @ss is in this hierarchy, so we want the
* effective css from @cgrp.
*/
template[i] = cgroup_e_css_by_mask(cgrp, ss);
} else {
/*
* @ss is not in this hierarchy, so we don't want
* to change the css.
*/
template[i] = old_cset->subsys[i];
}
}
key = css_set_hash(template);
hash_for_each_possible(css_set_table, cset, hlist, key) {
if (!compare_css_sets(cset, old_cset, cgrp, template))
continue;
/* This css_set matches what we need */
return cset;
}
/* No existing cgroup group matched */
return NULL;
}
static void free_cgrp_cset_links(struct list_head *links_to_free)
{
struct cgrp_cset_link *link, *tmp_link;
list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
list_del(&link->cset_link);
kfree(link);
}
}
/**
* allocate_cgrp_cset_links - allocate cgrp_cset_links
* @count: the number of links to allocate
* @tmp_links: list_head the allocated links are put on
*
* Allocate @count cgrp_cset_link structures and chain them on @tmp_links
* through ->cset_link. Returns 0 on success or -errno.
*/
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
struct cgrp_cset_link *link;
int i;
INIT_LIST_HEAD(tmp_links);
for (i = 0; i < count; i++) {
link = kzalloc(sizeof(*link), GFP_KERNEL);
if (!link) {
free_cgrp_cset_links(tmp_links);
return -ENOMEM;
}
list_add(&link->cset_link, tmp_links);
}
return 0;
}
/**
* link_css_set - a helper function to link a css_set to a cgroup
* @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
* @cset: the css_set to be linked
* @cgrp: the destination cgroup
*/
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
struct cgroup *cgrp)
{
struct cgrp_cset_link *link;
BUG_ON(list_empty(tmp_links));
if (cgroup_on_dfl(cgrp))
cset->dfl_cgrp = cgrp;
link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
link->cset = cset;
link->cgrp = cgrp;
/*
* Always add links to the tail of the lists so that the lists are
* in chronological order.
*/
list_move_tail(&link->cset_link, &cgrp->cset_links);
list_add_tail(&link->cgrp_link, &cset->cgrp_links);
if (cgroup_parent(cgrp))
cgroup_get_live(cgrp);
}
/**
* find_css_set - return a new css_set with one cgroup updated
* @old_cset: the baseline css_set
* @cgrp: the cgroup to be updated
*
* Return a new css_set that's equivalent to @old_cset, but with @cgrp
* substituted into the appropriate hierarchy.
*/
static struct css_set *find_css_set(struct css_set *old_cset,
struct cgroup *cgrp)
{
struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
struct css_set *cset;
struct list_head tmp_links;
struct cgrp_cset_link *link;
struct cgroup_subsys *ss;
unsigned long key;
int ssid;
lockdep_assert_held(&cgroup_mutex);
/* First see if we already have a cgroup group that matches
* the desired set */
spin_lock_irq(&css_set_lock);
cset = find_existing_css_set(old_cset, cgrp, template);
if (cset)
get_css_set(cset);
spin_unlock_irq(&css_set_lock);
if (cset)
return cset;
cset = kzalloc(sizeof(*cset), GFP_KERNEL);
if (!cset)
return NULL;
/* Allocate all the cgrp_cset_link objects that we'll need */
if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
kfree(cset);
return NULL;
}
refcount_set(&cset->refcount, 1);
cset->dom_cset = cset;
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
INIT_LIST_HEAD(&cset->dying_tasks);
INIT_LIST_HEAD(&cset->task_iters);
INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
INIT_LIST_HEAD(&cset->cgrp_links);
INIT_LIST_HEAD(&cset->mg_preload_node);
INIT_LIST_HEAD(&cset->mg_node);
/* Copy the set of subsystem state objects generated in
* find_existing_css_set() */
memcpy(cset->subsys, template, sizeof(cset->subsys));
spin_lock_irq(&css_set_lock);
/* Add reference counts and links from the new css_set. */
list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
if (c->root == cgrp->root)
c = cgrp;
link_css_set(&tmp_links, cset, c);
}
BUG_ON(!list_empty(&tmp_links));
css_set_count++;
/* Add @cset to the hash table */
key = css_set_hash(cset->subsys);
hash_add(css_set_table, &cset->hlist, key);
for_each_subsys(ss, ssid) {
struct cgroup_subsys_state *css = cset->subsys[ssid];
list_add_tail(&cset->e_cset_node[ssid],
&css->cgroup->e_csets[ssid]);
css_get(css);
}
spin_unlock_irq(&css_set_lock);
/*
* If @cset should be threaded, look up the matching dom_cset and
* link them up. We first fully initialize @cset then look for the
* dom_cset. It's simpler this way and safe as @cset is guaranteed
* to stay empty until we return.
*/
if (cgroup_is_threaded(cset->dfl_cgrp)) {
struct css_set *dcset;
dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
if (!dcset) {
put_css_set(cset);
return NULL;
}
spin_lock_irq(&css_set_lock);
cset->dom_cset = dcset;
list_add_tail(&cset->threaded_csets_node,
&dcset->threaded_csets);
spin_unlock_irq(&css_set_lock);
}
return cset;
}
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
struct cgroup *root_cgrp = kf_root->kn->priv;
return root_cgrp->root;
}
static int cgroup_init_root_id(struct cgroup_root *root)
{
int id;
lockdep_assert_held(&cgroup_mutex);
id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
if (id < 0)
return id;
root->hierarchy_id = id;
return 0;
}
static void cgroup_exit_root_id(struct cgroup_root *root)
{
lockdep_assert_held(&cgroup_mutex);
idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
}
void cgroup_free_root(struct cgroup_root *root)
{
kfree(root);
}
static void cgroup_destroy_root(struct cgroup_root *root)
{
struct cgroup *cgrp = &root->cgrp;
struct cgrp_cset_link *link, *tmp_link;
trace_cgroup_destroy_root(root);
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
BUG_ON(atomic_read(&root->nr_cgrps));
BUG_ON(!list_empty(&cgrp->self.children));
/* Rebind all subsystems back to the default hierarchy */
WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
/*
* Release all the links from cset_links to this hierarchy's
* root cgroup
*/
spin_lock_irq(&css_set_lock);
list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
list_del(&link->cset_link);
list_del(&link->cgrp_link);
kfree(link);
}
spin_unlock_irq(&css_set_lock);
if (!list_empty(&root->root_list)) {
list_del(&root->root_list);
cgroup_root_count--;
}
cgroup_exit_root_id(root);
mutex_unlock(&cgroup_mutex);
cgroup_rstat_exit(cgrp);
kernfs_destroy_root(root->kf_root);
cgroup_free_root(root);
}
/*
* look up cgroup associated with current task's cgroup namespace on the
* specified hierarchy
*/
static struct cgroup *
current_cgns_cgroup_from_root(struct cgroup_root *root)
{
struct cgroup *res = NULL;
struct css_set *cset;
lockdep_assert_held(&css_set_lock);
rcu_read_lock();
cset = current->nsproxy->cgroup_ns->root_cset;
if (cset == &init_css_set) {
res = &root->cgrp;
} else if (root == &cgrp_dfl_root) {
res = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
if (c->root == root) {
res = c;
break;
}
}
}
rcu_read_unlock();
BUG_ON(!res);
return res;
}
/* look up cgroup associated with given css_set on the specified hierarchy */
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
struct cgroup_root *root)
{
struct cgroup *res = NULL;
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
if (cset == &init_css_set) {
res = &root->cgrp;
} else if (root == &cgrp_dfl_root) {
res = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
if (c->root == root) {
res = c;
break;
}
}
}
BUG_ON(!res);
return res;
}
/*
* Return the cgroup for "task" from the given hierarchy. Must be
* called with cgroup_mutex and css_set_lock held.
*/
struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct cgroup_root *root)
{
/*
* No need to lock the task - since we hold css_set_lock the
* task can't change groups.
*/
return cset_cgroup_from_root(task_css_set(task), root);
}
/*
* A task must hold cgroup_mutex to modify cgroups.
*
* Any task can increment and decrement the count field without lock.
* So in general, code holding cgroup_mutex can't rely on the count
* field not changing. However, if the count goes to zero, then only
* cgroup_attach_task() can increment it again. Because a count of zero
* means that no tasks are currently attached, therefore there is no
* way a task attached to that cgroup can fork (the other way to
* increment the count). So code holding cgroup_mutex can safely
* assume that if the count is zero, it will stay zero. Similarly, if
* a task holds cgroup_mutex on a cgroup with zero count, it
* knows that the cgroup won't be removed, as cgroup_rmdir()
* needs that mutex.
*
* A cgroup can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cgroups is empty. Since all
* tasks in the system use _some_ cgroup, and since there is always at
* least one task in the system (init, pid == 1), therefore, root cgroup
* always has either children cgroups and/or using tasks. So we don't
* need a special hack to ensure that root cgroup cannot be deleted.
*
* P.S. One more locking exception. RCU is used to guard the
* update of a tasks cgroup pointer by cgroup_attach_task()
*/
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
char *buf)
{
struct cgroup_subsys *ss = cft->ss;
if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
!(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
cft->name);
} else {
strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
}
return buf;
}
/**
* cgroup_file_mode - deduce file mode of a control file
* @cft: the control file in question
*
* S_IRUGO for read, S_IWUSR for write.
*/
static umode_t cgroup_file_mode(const struct cftype *cft)
{
umode_t mode = 0;
if (cft->read_u64 || cft->read_s64 || cft->seq_show)
mode |= S_IRUGO;
if (cft->write_u64 || cft->write_s64 || cft->write) {
if (cft->flags & CFTYPE_WORLD_WRITABLE)
mode |= S_IWUGO;
else
mode |= S_IWUSR;
}
return mode;
}
/**
* cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
* @subtree_control: the new subtree_control mask to consider
* @this_ss_mask: available subsystems
*
* On the default hierarchy, a subsystem may request other subsystems to be
* enabled together through its ->depends_on mask. In such cases, more
* subsystems than specified in "cgroup.subtree_control" may be enabled.
*
* This function calculates which subsystems need to be enabled if
* @subtree_control is to be applied while restricted to @this_ss_mask.
*/
static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
{
u16 cur_ss_mask = subtree_control;
struct cgroup_subsys *ss;
int ssid;
lockdep_assert_held(&cgroup_mutex);
cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
while (true) {
u16 new_ss_mask = cur_ss_mask;
do_each_subsys_mask(ss, ssid, cur_ss_mask) {
new_ss_mask |= ss->depends_on;
} while_each_subsys_mask();
/*
* Mask out subsystems which aren't available. This can
* happen only if some depended-upon subsystems were bound
* to non-default hierarchies.
*/
new_ss_mask &= this_ss_mask;
if (new_ss_mask == cur_ss_mask)
break;
cur_ss_mask = new_ss_mask;
}
return cur_ss_mask;
}
/**
* cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced
*
* This helper undoes cgroup_kn_lock_live() and should be invoked before
* the method finishes if locking succeeded. Note that once this function
* returns the cgroup returned by cgroup_kn_lock_live() may become
* inaccessible any time. If the caller intends to continue to access the
* cgroup, it should pin it before invoking this function.
*/
void cgroup_kn_unlock(struct kernfs_node *kn)
{
struct cgroup *cgrp;
if (kernfs_type(kn) == KERNFS_DIR)
cgrp = kn->priv;
else
cgrp = kn->parent->priv;
mutex_unlock(&cgroup_mutex);
kernfs_unbreak_active_protection(kn);
cgroup_put(cgrp);
}
/**
* cgroup_kn_lock_live - locking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced
* @drain_offline: perform offline draining on the cgroup
*
* This helper is to be used by a cgroup kernfs method currently servicing
* @kn. It breaks the active protection, performs cgroup locking and
* verifies that the associated cgroup is alive. Returns the cgroup if
* alive; otherwise, %NULL. A successful return should be undone by a
* matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the
* cgroup is drained of offlining csses before return.
*
* Any cgroup kernfs method implementation which requires locking the
* associated cgroup should use this helper. It avoids nesting cgroup
* locking under kernfs active protection and allows all kernfs operations
* including self-removal.
*/
struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
{
struct cgroup *cgrp;
if (kernfs_type(kn) == KERNFS_DIR)
cgrp = kn->priv;
else
cgrp = kn->parent->priv;
/*
* We're gonna grab cgroup_mutex which nests outside kernfs
* active_ref. cgroup liveliness check alone provides enough
* protection against removal. Ensure @cgrp stays accessible and
* break the active_ref protection.
*/
if (!cgroup_tryget(cgrp))
return NULL;
kernfs_break_active_protection(kn);
if (drain_offline)
cgroup_lock_and_drain_offline(cgrp);
else
mutex_lock(&cgroup_mutex);
if (!cgroup_is_dead(cgrp))
return cgrp;
cgroup_kn_unlock(kn);
return NULL;
}
static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
char name[CGROUP_FILE_NAME_MAX];
lockdep_assert_held(&cgroup_mutex);
if (cft->file_offset) {
struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
struct cgroup_file *cfile = (void *)css + cft->file_offset;
spin_lock_irq(&cgroup_file_kn_lock);
cfile->kn = NULL;
spin_unlock_irq(&cgroup_file_kn_lock);
del_timer_sync(&cfile->notify_timer);
}
kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
}
/**
* css_clear_dir - remove subsys files in a cgroup directory
* @css: target css
*/
static void css_clear_dir(struct cgroup_subsys_state *css)
{
struct cgroup *cgrp = css->cgroup;
struct cftype *cfts;
if (!(css->flags & CSS_VISIBLE))
return;
css->flags &= ~CSS_VISIBLE;
if (!css->ss) {
if (cgroup_on_dfl(cgrp))
cfts = cgroup_base_files;
else
cfts = cgroup1_base_files;
cgroup_addrm_files(css, cgrp, cfts, false);
} else {
list_for_each_entry(cfts, &css->ss->cfts, node)
cgroup_addrm_files(css, cgrp, cfts, false);
}
}
/**
* css_populate_dir - create subsys files in a cgroup directory
* @css: target css
*
* On failure, no file is added.
*/
static int css_populate_dir(struct cgroup_subsys_state *css)
{
struct cgroup *cgrp = css->cgroup;
struct cftype *cfts, *failed_cfts;
int ret;
if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
return 0;
if (!css->ss) {
if (cgroup_on_dfl(cgrp))
cfts = cgroup_base_files;
else
cfts = cgroup1_base_files;
ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
if (ret < 0)
return ret;
} else {
list_for_each_entry(cfts, &css->ss->cfts, node) {
ret = cgroup_addrm_files(css, cgrp, cfts, true);
if (ret < 0) {
failed_cfts = cfts;
goto err;
}
}
}
css->flags |= CSS_VISIBLE;
return 0;
err:
list_for_each_entry(cfts, &css->ss->cfts, node) {
if (cfts == failed_cfts)
break;
cgroup_addrm_files(css, cgrp, cfts, false);
}
return ret;
}
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
int ssid, i, ret;
u16 dfl_disable_ss_mask = 0;
lockdep_assert_held(&cgroup_mutex);
do_each_subsys_mask(ss, ssid, ss_mask) {
/*
* If @ss has non-root csses attached to it, can't move.
* If @ss is an implicit controller, it is exempt from this
* rule and can be stolen.
*/
if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
!ss->implicit_on_dfl)
return -EBUSY;
/* can't move between two non-dummy roots either */
if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
return -EBUSY;
/*
* Collect ssid's that need to be disabled from default
* hierarchy.
*/
if (ss->root == &cgrp_dfl_root)
dfl_disable_ss_mask |= 1 << ssid;
} while_each_subsys_mask();
if (dfl_disable_ss_mask) {
struct cgroup *scgrp = &cgrp_dfl_root.cgrp;
/*
* Controllers from default hierarchy that need to be rebound
* are all disabled together in one go.
*/
cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
WARN_ON(cgroup_apply_control(scgrp));
cgroup_finalize_control(scgrp, 0);
}
do_each_subsys_mask(ss, ssid, ss_mask) {
struct cgroup_root *src_root = ss->root;
struct cgroup *scgrp = &src_root->cgrp;
struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
struct css_set *cset;
WARN_ON(!css || cgroup_css(dcgrp, ss));
if (src_root != &cgrp_dfl_root) {
/* disable from the source */
src_root->subsys_mask &= ~(1 << ssid);
WARN_ON(cgroup_apply_control(scgrp));
cgroup_finalize_control(scgrp, 0);
}
/* rebind */
RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
rcu_assign_pointer(dcgrp->subsys[ssid], css);
ss->root = dst_root;
css->cgroup = dcgrp;
spin_lock_irq(&css_set_lock);
hash_for_each(css_set_table, i, cset, hlist)
list_move_tail(&cset->e_cset_node[ss->id],
&dcgrp->e_csets[ss->id]);
spin_unlock_irq(&css_set_lock);
if (ss->css_rstat_flush) {
list_del_rcu(&css->rstat_css_node);
list_add_rcu(&css->rstat_css_node,
&dcgrp->rstat_css_list);
}
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
if (dst_root == &cgrp_dfl_root) {
static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
} else {
dcgrp->subtree_control |= 1 << ssid;
static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
}
ret = cgroup_apply_control(dcgrp);
if (ret)
pr_warn("partial failure to rebind %s controller (err=%d)\n",
ss->name, ret);
if (ss->bind)
ss->bind(css);
} while_each_subsys_mask();
kernfs_activate(dcgrp->kn);
return 0;
}
int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
struct kernfs_root *kf_root)
{
int len = 0;
char *buf = NULL;
struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
struct cgroup *ns_cgroup;
buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!buf)
return -ENOMEM;
spin_lock_irq(&css_set_lock);
ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
spin_unlock_irq(&css_set_lock);
if (len >= PATH_MAX)
len = -ERANGE;
else if (len > 0) {
seq_escape(sf, buf, " \t\n\\");
len = 0;
}
kfree(buf);
return len;
}
enum cgroup2_param {
Opt_nsdelegate,
Opt_memory_localevents,
Opt_memory_recursiveprot,
nr__cgroup2_params
};
static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("nsdelegate", Opt_nsdelegate),
fsparam_flag("memory_localevents", Opt_memory_localevents),
fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
{}
};
static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
struct fs_parse_result result;
int opt;
opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
if (opt < 0)
return opt;
switch (opt) {
case Opt_nsdelegate:
ctx->flags |= CGRP_ROOT_NS_DELEGATE;
return 0;
case Opt_memory_localevents:
ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
return 0;
case Opt_memory_recursiveprot:
ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
return 0;
}
return -EINVAL;
}
static void apply_cgroup_root_flags(unsigned int root_flags)
{
if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
if (root_flags & CGRP_ROOT_NS_DELEGATE)
cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
}
}
static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
{
if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
seq_puts(seq, ",nsdelegate");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
seq_puts(seq, ",memory_localevents");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
seq_puts(seq, ",memory_recursiveprot");
return 0;
}
static int cgroup_reconfigure(struct fs_context *fc)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
apply_cgroup_root_flags(ctx->flags);
return 0;
}
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
int ssid;
INIT_LIST_HEAD(&cgrp->self.sibling);
INIT_LIST_HEAD(&cgrp->self.children);
INIT_LIST_HEAD(&cgrp->cset_links);
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
cgrp->self.cgroup = cgrp;
cgrp->self.flags |= CSS_ONLINE;
cgrp->dom_cgrp = cgrp;
cgrp->max_descendants = INT_MAX;
cgrp->max_depth = INT_MAX;
INIT_LIST_HEAD(&cgrp->rstat_css_list);
prev_cputime_init(&cgrp->prev_cputime);
for_each_subsys(ss, ssid)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
init_waitqueue_head(&cgrp->offline_waitq);
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
void init_cgroup_root(struct cgroup_fs_context *ctx)
{
struct cgroup_root *root = ctx->root;
struct cgroup *cgrp = &root->cgrp;
INIT_LIST_HEAD(&root->root_list);
atomic_set(&root->nr_cgrps, 1);
cgrp->root = root;
init_cgroup_housekeeping(cgrp);
root->flags = ctx->flags;
if (ctx->release_agent)
strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
if (ctx->name)
strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
if (ctx->cpuset_clone_children)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
struct kernfs_syscall_ops *kf_sops;
struct css_set *cset;
int i, ret;
lockdep_assert_held(&cgroup_mutex);
ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
0, GFP_KERNEL);
if (ret)
goto out;
/*
* We're accessing css_set_count without locking css_set_lock here,
* but that's OK - it can only be increased by someone holding
* cgroup_lock, and that's us. Later rebinding may disable
* controllers on the default hierarchy and thus create new csets,
* which can't be more than the existing ones. Allocate 2x.
*/
ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
if (ret)
goto cancel_ref;
ret = cgroup_init_root_id(root);
if (ret)
goto cancel_ref;
kf_sops = root == &cgrp_dfl_root ?
&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
root->kf_root = kernfs_create_root(kf_sops,
KERNFS_ROOT_CREATE_DEACTIVATED |
KERNFS_ROOT_SUPPORT_EXPORTOP |
KERNFS_ROOT_SUPPORT_USER_XATTR,
root_cgrp);
if (IS_ERR(root->kf_root)) {
ret = PTR_ERR(root->kf_root);
goto exit_root_id;
}
root_cgrp->kn = root->kf_root->kn;
WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
ret = css_populate_dir(&root_cgrp->self);
if (ret)
goto destroy_root;
ret = cgroup_rstat_init(root_cgrp);
if (ret)
goto destroy_root;
ret = rebind_subsystems(root, ss_mask);
if (ret)
goto exit_stats;
ret = cgroup_bpf_inherit(root_cgrp);
WARN_ON_ONCE(ret);
trace_cgroup_setup_root(root);
/*
* There must be no failure case after here, since rebinding takes
* care of subsystems' refcounts, which are explicitly dropped in
* the failure exit path.
*/
list_add(&root->root_list, &cgroup_roots);
cgroup_root_count++;
/*
* Link the root cgroup in this hierarchy into all the css_set
* objects.
*/
spin_lock_irq(&css_set_lock);
hash_for_each(css_set_table, i, cset, hlist) {
link_css_set(&tmp_links, cset, root_cgrp);
if (css_set_populated(cset))
cgroup_update_populated(root_cgrp, true);
}
spin_unlock_irq(&css_set_lock);
BUG_ON(!list_empty(&root_cgrp->self.children));
BUG_ON(atomic_read(&root->nr_cgrps) != 1);
ret = 0;
goto out;
exit_stats:
cgroup_rstat_exit(root_cgrp);
destroy_root:
kernfs_destroy_root(root->kf_root);
root->kf_root = NULL;
exit_root_id:
cgroup_exit_root_id(root);
cancel_ref:
percpu_ref_exit(&root_cgrp->self.refcnt);
out:
free_cgrp_cset_links(&tmp_links);
return ret;
}
int cgroup_do_get_tree(struct fs_context *fc)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
int ret;
ctx->kfc.root = ctx->root->kf_root;
if (fc->fs_type == &cgroup2_fs_type)
ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
else
ctx->kfc.magic = CGROUP_SUPER_MAGIC;
ret = kernfs_get_tree(fc);
/*
* In non-init cgroup namespace, instead of root cgroup's dentry,
* we return the dentry corresponding to the cgroupns->root_cgrp.
*/
if (!ret && ctx->ns != &init_cgroup_ns) {
struct dentry *nsdentry;
struct super_block *sb = fc->root->d_sb;
struct cgroup *cgrp;
mutex_lock(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
nsdentry = kernfs_node_dentry(cgrp->kn, sb);
dput(fc->root);
if (IS_ERR(nsdentry)) {
deactivate_locked_super(sb);
ret = PTR_ERR(nsdentry);
nsdentry = NULL;
}
fc->root = nsdentry;
}
if (!ctx->kfc.new_sb_created)
cgroup_put(&ctx->root->cgrp);
return ret;
}
/*
* Destroy a cgroup filesystem context.
*/
static void cgroup_fs_context_free(struct fs_context *fc)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
kfree(ctx->name);
kfree(ctx->release_agent);
put_cgroup_ns(ctx->ns);
kernfs_free_fs_context(fc);
kfree(ctx);
}
static int cgroup_get_tree(struct fs_context *fc)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
int ret;
cgrp_dfl_visible = true;
cgroup_get_live(&cgrp_dfl_root.cgrp);
ctx->root = &cgrp_dfl_root;
ret = cgroup_do_get_tree(fc);
if (!ret)
apply_cgroup_root_flags(ctx->flags);
return ret;
}
static const struct fs_context_operations cgroup_fs_context_ops = {
.free = cgroup_fs_context_free,
.parse_param = cgroup2_parse_param,
.get_tree = cgroup_get_tree,
.reconfigure = cgroup_reconfigure,
};
static const struct fs_context_operations cgroup1_fs_context_ops = {
.free = cgroup_fs_context_free,
.parse_param = cgroup1_parse_param,
.get_tree = cgroup1_get_tree,
.reconfigure = cgroup1_reconfigure,
};
/*
* Initialise the cgroup filesystem creation/reconfiguration context. Notably,
* we select the namespace we're going to use.
*/
static int cgroup_init_fs_context(struct fs_context *fc)
{
struct cgroup_fs_context *ctx;
ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
if (!ctx)
return -ENOMEM;
ctx->ns = current->nsproxy->cgroup_ns;
get_cgroup_ns(ctx->ns);
fc->fs_private = &ctx->kfc;
if (fc->fs_type == &cgroup2_fs_type)
fc->ops = &cgroup_fs_context_ops;
else
fc->ops = &cgroup1_fs_context_ops;
put_user_ns(fc->user_ns);
fc->user_ns = get_user_ns(ctx->ns->user_ns);
fc->global = true;
return 0;
}
static void cgroup_kill_sb(struct super_block *sb)
{
struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
/*
* If @root doesn't have any children, start killing it.
* This prevents new mounts by disabling percpu_ref_tryget_live().
*
* And don't kill the default root.
*/
if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
!percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
cgroup_bpf_offline(&root->cgrp);
percpu_ref_kill(&root->cgrp.self.refcnt);
}
cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
struct file_system_type cgroup_fs_type = {
.name = "cgroup",
.init_fs_context = cgroup_init_fs_context,
.parameters = cgroup1_fs_parameters,
.kill_sb = cgroup_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
};
static struct file_system_type cgroup2_fs_type = {
.name = "cgroup2",
.init_fs_context = cgroup_init_fs_context,
.parameters = cgroup2_fs_parameters,
.kill_sb = cgroup_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
};
#ifdef CONFIG_CPUSETS
static const struct fs_context_operations cpuset_fs_context_ops = {
.get_tree = cgroup1_get_tree,
.free = cgroup_fs_context_free,
};
/*
* This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead
*/
static int cpuset_init_fs_context(struct fs_context *fc)
{
char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
struct cgroup_fs_context *ctx;
int err;
err = cgroup_init_fs_context(fc);
if (err) {
kfree(agent);
return err;
}
fc->ops = &cpuset_fs_context_ops;
ctx = cgroup_fc2context(fc);
ctx->subsys_mask = 1 << cpuset_cgrp_id;
ctx->flags |= CGRP_ROOT_NOPREFIX;
ctx->release_agent = agent;
get_filesystem(&cgroup_fs_type);
put_filesystem(fc->fs_type);
fc->fs_type = &cgroup_fs_type;
return 0;
}
static struct file_system_type cpuset_fs_type = {
.name = "cpuset",
.init_fs_context = cpuset_init_fs_context,
.fs_flags = FS_USERNS_MOUNT,
};
#endif
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns)
{
struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
}
int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns)
{
int ret;
mutex_lock(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(cgroup_path_ns);
/**
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
* @task: target task
* @buf: the buffer to write the path into
* @buflen: the length of the buffer
*
* Determine @task's cgroup on the first (the one with the lowest non-zero
* hierarchy_id) cgroup hierarchy and copy its path into @buf. This
* function grabs cgroup_mutex and shouldn't be used inside locks used by
* cgroup controller callbacks.
*
* Return value is the same as kernfs_path().
*/
int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
{
struct cgroup_root *root;
struct cgroup *cgrp;
int hierarchy_id = 1;
int ret;
mutex_lock(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
if (root) {
cgrp = task_cgroup_from_root(task, root);
ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
} else {
/* if no hierarchy exists, everyone is in "/" */
ret = strlcpy(buf, "/", buflen);
}
spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
/**
* cgroup_migrate_add_task - add a migration target task to a migration context
* @task: target task
* @mgctx: target migration context
*
* Add @task, which is a migration target, to @mgctx->tset. This function
* becomes noop if @task doesn't need to be migrated. @task's css_set
* should have been added as a migration source and @task->cg_list will be
* moved from the css_set's tasks list to mg_tasks one.
*/
static void cgroup_migrate_add_task(struct task_struct *task,
struct cgroup_mgctx *mgctx)
{
struct css_set *cset;
lockdep_assert_held(&css_set_lock);
/* @task either already exited or can't exit until the end */
if (task->flags & PF_EXITING)
return;
/* cgroup_threadgroup_rwsem protects racing against forks */
WARN_ON_ONCE(list_empty(&task->cg_list));
cset = task_css_set(task);
if (!cset->mg_src_cgrp)
return;
mgctx->tset.nr_tasks++;
list_move_tail(&task->cg_list, &cset->mg_tasks);
if (list_empty(&cset->mg_node))
list_add_tail(&cset->mg_node,
&mgctx->tset.src_csets);
if (list_empty(&cset->mg_dst_cset->mg_node))
list_add_tail(&cset->mg_dst_cset->mg_node,
&mgctx->tset.dst_csets);
}
/**
* cgroup_taskset_first - reset taskset and return the first task
* @tset: taskset of interest
* @dst_cssp: output variable for the destination css
*
* @tset iteration is initialized and the first task is returned.
*/
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
struct cgroup_subsys_state **dst_cssp)
{
tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
tset->cur_task = NULL;
return cgroup_taskset_next(tset, dst_cssp);
}
/**
* cgroup_taskset_next - iterate to the next task in taskset
* @tset: taskset of interest
* @dst_cssp: output variable for the destination css
*
* Return the next task in @tset. Iteration must have been initialized
* with cgroup_taskset_first().
*/
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
struct cgroup_subsys_state **dst_cssp)
{
struct css_set *cset = tset->cur_cset;
struct task_struct *task = tset->cur_task;
while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) {
if (!task)
task = list_first_entry(&cset->mg_tasks,
struct task_struct, cg_list);
else
task = list_next_entry(task, cg_list);
if (&task->cg_list != &cset->mg_tasks) {
tset->cur_cset = cset;
tset->cur_task = task;
/*
* This function may be called both before and
* after cgroup_taskset_migrate(). The two cases
* can be distinguished by looking at whether @cset
* has its ->mg_dst_cset set.
*/
if (cset->mg_dst_cset)
*dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
else
*dst_cssp = cset->subsys[tset->ssid];
return task;
}
cset = list_next_entry(cset, mg_node);
task = NULL;
}
return NULL;
}
/**
* cgroup_migrate_execute - migrate a taskset
* @mgctx: migration context
*
* Migrate tasks in @mgctx as setup by migration preparation functions.
* This function fails iff one of the ->can_attach callbacks fails and
* guarantees that either all or none of the tasks in @mgctx are migrated.
* @mgctx is consumed regardless of success.
*/
static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
{
struct cgroup_taskset *tset = &mgctx->tset;
struct cgroup_subsys *ss;
struct task_struct *task, *tmp_task;
struct css_set *cset, *tmp_cset;
int ssid, failed_ssid, ret;
/* check that we can legitimately attach to the cgroup */
if (tset->nr_tasks) {
do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
if (ss->can_attach) {
tset->ssid = ssid;
ret = ss->can_attach(tset);
if (ret) {
failed_ssid = ssid;
goto out_cancel_attach;
}
}
} while_each_subsys_mask();
}
/*
* Now that we're guaranteed success, proceed to move all tasks to
* the new cgroup. There are no failure cases after here, so this
* is the commit point.
*/
spin_lock_irq(&css_set_lock);
list_for_each_entry(cset, &tset->src_csets, mg_node) {
list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
struct css_set *from_cset = task_css_set(task);
struct css_set *to_cset = cset->mg_dst_cset;
get_css_set(to_cset);
to_cset->nr_tasks++;
css_set_move_task(task, from_cset, to_cset, true);
from_cset->nr_tasks--;
/*
* If the source or destination cgroup is frozen,
* the task might require to change its state.
*/
cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
to_cset->dfl_cgrp);
put_css_set_locked(from_cset);
}
}
spin_unlock_irq(&css_set_lock);
/*
* Migration is committed, all target tasks are now on dst_csets.
* Nothing is sensitive to fork() after this point. Notify
* controllers that migration is complete.
*/
tset->csets = &tset->dst_csets;
if (tset->nr_tasks) {
do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
if (ss->attach) {
tset->ssid = ssid;
ss->attach(tset);
}
} while_each_subsys_mask();
}
ret = 0;
goto out_release_tset;
out_cancel_attach:
if (tset->nr_tasks) {
do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
if (ssid == failed_ssid)
break;
if (ss->cancel_attach) {
tset->ssid = ssid;
ss->cancel_attach(tset);
}
} while_each_subsys_mask();
}
out_release_tset:
spin_lock_irq(&css_set_lock);
list_splice_init(&tset->dst_csets, &tset->src_csets);
list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
list_del_init(&cset->mg_node);
}
spin_unlock_irq(&css_set_lock);
/*
* Re-initialize the cgroup_taskset structure in case it is reused
* again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
* iteration.
*/
tset->nr_tasks = 0;
tset->csets = &tset->src_csets;
return ret;
}
/**
* cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
* @dst_cgrp: destination cgroup to test
*
* On the default hierarchy, except for the mixable, (possible) thread root
* and threaded cgroups, subtree_control must be zero for migration
* destination cgroups with tasks so that child cgroups don't compete
* against tasks.
*/
int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
{
/* v1 doesn't have any restriction */
if (!cgroup_on_dfl(dst_cgrp))
return 0;
/* verify @dst_cgrp can host resources */
if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
return -EOPNOTSUPP;
/* mixables don't care */
if (cgroup_is_mixable(dst_cgrp))
return 0;
/*
* If @dst_cgrp is already or can become a thread root or is
* threaded, it doesn't matter.
*/
if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
return 0;
/* apply no-internal-process constraint */
if (dst_cgrp->subtree_control)
return -EBUSY;
return 0;
}
/**
* cgroup_migrate_finish - cleanup after attach
* @mgctx: migration context
*
* Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
* those functions for details.
*/
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
LIST_HEAD(preloaded);
struct css_set *cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
cset->mg_src_cgrp = NULL;
cset->mg_dst_cgrp = NULL;
cset->mg_dst_cset = NULL;
list_del_init(&cset->mg_preload_node);
put_css_set_locked(cset);
}
spin_unlock_irq(&css_set_lock);
}
/**
* cgroup_migrate_add_src - add a migration source css_set
* @src_cset: the source css_set to add
* @dst_cgrp: the destination cgroup
* @mgctx: migration context
*
* Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
* @src_cset and add it to @mgctx->src_csets, which should later be cleaned
* up by cgroup_migrate_finish().
*
* This function may be called without holding cgroup_threadgroup_rwsem
* even if the target is a process. Threads may be created and destroyed
* but as long as cgroup_mutex is not dropped, no new css_set can be put
* into play and the preloaded css_sets are guaranteed to cover all
* migrations.
*/
void cgroup_migrate_add_src(struct css_set *src_cset,
struct cgroup *dst_cgrp,
struct cgroup_mgctx *mgctx)
{
struct cgroup *src_cgrp;
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
/*
* If ->dead, @src_set is associated with one or more dead cgroups
* and doesn't contain any migratable tasks. Ignore it early so
* that the rest of migration path doesn't get confused by it.
*/
if (src_cset->dead)
return;
if (!list_empty(&src_cset->mg_preload_node))
return;
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
WARN_ON(src_cset->mg_src_cgrp);
WARN_ON(src_cset->mg_dst_cgrp);
WARN_ON(!list_empty(&src_cset->mg_tasks));
WARN_ON(!list_empty(&src_cset->mg_node));
src_cset->mg_src_cgrp = src_cgrp;
src_cset->mg_dst_cgrp = dst_cgrp;
get_css_set(src_cset);
list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
}
/**
* cgroup_migrate_prepare_dst - prepare destination css_sets for migration
* @mgctx: migration context
*
* Tasks are about to be moved and all the source css_sets have been
* preloaded to @mgctx->preloaded_src_csets. This function looks up and
* pins all destination css_sets, links each to its source, and append them
* to @mgctx->preloaded_dst_csets.
*
* This function must be called after cgroup_migrate_add_src() has been
* called on each migration source css_set. After migration is performed
* using cgroup_migrate(), cgroup_migrate_finish() must be called on
* @mgctx.
*/
int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
{
struct css_set *src_cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
/* look up the dst cset for each src cset and link it to src */
list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
mg_preload_node) {
struct css_set *dst_cset;
struct cgroup_subsys *ss;
int ssid;
dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
if (!dst_cset)
return -ENOMEM;
WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
/*
* If src cset equals dst, it's noop. Drop the src.
* cgroup_migrate() will skip the cset too. Note that we
* can't handle src == dst as some nodes are used by both.
*/
if (src_cset == dst_cset) {
src_cset->mg_src_cgrp = NULL;
src_cset->mg_dst_cgrp = NULL;
list_del_init(&src_cset->mg_preload_node);
put_css_set(src_cset);
put_css_set(dst_cset);
continue;
}
src_cset->mg_dst_cset = dst_cset;
if (list_empty(&dst_cset->mg_preload_node))
list_add_tail(&dst_cset->mg_preload_node,
&mgctx->preloaded_dst_csets);
else
put_css_set(dst_cset);
for_each_subsys(ss, ssid)
if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
mgctx->ss_mask |= 1 << ssid;
}
return 0;
}
/**
* cgroup_migrate - migrate a process or task to a cgroup
* @leader: the leader of the process or the task to migrate
* @threadgroup: whether @leader points to the whole process or a single task
* @mgctx: migration context
*
* Migrate a process or task denoted by @leader. If migrating a process,
* the caller must be holding cgroup_threadgroup_rwsem. The caller is also
* responsible for invoking cgroup_migrate_add_src() and
* cgroup_migrate_prepare_dst() on the targets before invoking this
* function and following up with cgroup_migrate_finish().
*
* As long as a controller's ->can_attach() doesn't fail, this function is
* guaranteed to succeed. This means that, excluding ->can_attach()
* failure, when migrating multiple targets, the success or failure can be
* decided for all targets by invoking group_migrate_prepare_dst() before
* actually starting migrating.
*/
int cgroup_migrate(struct task_struct *leader, bool threadgroup,
struct cgroup_mgctx *mgctx)
{
struct task_struct *task;
/*
* Prevent freeing of tasks while we take a snapshot. Tasks that are
* already PF_EXITING could be freed from underneath us unless we
* take an rcu_read_lock.
*/
spin_lock_irq(&css_set_lock);
rcu_read_lock();
task = leader;
do {
cgroup_migrate_add_task(task, mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
return cgroup_migrate_execute(mgctx);
}
/**
* cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
* @dst_cgrp: the cgroup to attach to
* @leader: the task or the leader of the threadgroup to be attached
* @threadgroup: attach the whole threadgroup?
*
* Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
*/
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup)
{
DEFINE_CGROUP_MGCTX(mgctx);
struct task_struct *task;
int ret = 0;
/* look up all src csets */
spin_lock_irq(&css_set_lock);
rcu_read_lock();
task = leader;
do {
cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
/* prepare dst csets and commit */
ret = cgroup_migrate_prepare_dst(&mgctx);
if (!ret)
ret = cgroup_migrate(leader, threadgroup, &mgctx);
cgroup_migrate_finish(&mgctx);
if (!ret)
TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup);
return ret;
}
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
bool *locked)
__acquires(&cgroup_threadgroup_rwsem)
{
struct task_struct *tsk;
pid_t pid;
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return ERR_PTR(-EINVAL);
/*
* If we migrate a single thread, we don't care about threadgroup
* stability. If the thread is `current`, it won't exit(2) under our
* hands or change PID through exec(2). We exclude
* cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
* callers by cgroup_mutex.
* Therefore, we can skip the global lock.
*/
lockdep_assert_held(&cgroup_mutex);
if (pid || threadgroup) {
percpu_down_write(&cgroup_threadgroup_rwsem);
*locked = true;
} else {
*locked = false;
}
rcu_read_lock();
if (pid) {
tsk = find_task_by_vpid(pid);
if (!tsk) {
tsk = ERR_PTR(-ESRCH);
goto out_unlock_threadgroup;
}
} else {
tsk = current;
}
if (threadgroup)
tsk = tsk->group_leader;
/*
* kthreads may acquire PF_NO_SETAFFINITY during initialization.
* If userland migrates such a kthread to a non-root cgroup, it can
* become trapped in a cpuset, or RT kthread may be born in a
* cgroup with no rt_runtime allocated. Just say no.
*/
if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
tsk = ERR_PTR(-EINVAL);
goto out_unlock_threadgroup;
}
get_task_struct(tsk);
goto out_unlock_rcu;
out_unlock_threadgroup:
if (*locked) {
percpu_up_write(&cgroup_threadgroup_rwsem);
*locked = false;
}
out_unlock_rcu:
rcu_read_unlock();
return tsk;
}
void cgroup_procs_write_finish(struct task_struct *task, bool locked)
__releases(&cgroup_threadgroup_rwsem)
{
struct cgroup_subsys *ss;
int ssid;
/* release reference from cgroup_procs_write_start() */
put_task_struct(task);
if (locked)
percpu_up_write(&cgroup_threadgroup_rwsem);
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
}
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
{
struct cgroup_subsys *ss;
bool printed = false;
int ssid;
do_each_subsys_mask(ss, ssid, ss_mask) {
if (printed)
seq_putc(seq, ' ');
seq_puts(seq, ss->name);
printed = true;
} while_each_subsys_mask();
if (printed)
seq_putc(seq, '\n');
}
/* show controllers which are enabled from the parent */
static int cgroup_controllers_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
cgroup_print_ss_mask(seq, cgroup_control(cgrp));
return 0;
}
/* show controllers which are enabled for a given cgroup's children */
static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
cgroup_print_ss_mask(seq, cgrp->subtree_control);
return 0;
}
/**
* cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
* @cgrp: root of the subtree to update csses for
*
* @cgrp's control masks have changed and its subtree's css associations
* need to be updated accordingly. This function looks up all css_sets
* which are attached to the subtree, creates the matching updated css_sets
* and migrates the tasks to the new ones.
*/
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
DEFINE_CGROUP_MGCTX(mgctx);
struct cgroup_subsys_state *d_css;
struct cgroup *dsct;
struct css_set *src_cset;
int ret;
lockdep_assert_held(&cgroup_mutex);
percpu_down_write(&cgroup_threadgroup_rwsem);
/* look up all csses currently attached to @cgrp's subtree */
spin_lock_irq(&css_set_lock);
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
struct cgrp_cset_link *link;
list_for_each_entry(link, &dsct->cset_links, cset_link)
cgroup_migrate_add_src(link->cset, dsct, &mgctx);
}
spin_unlock_irq(&css_set_lock);
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx);
if (ret)
goto out_finish;
spin_lock_irq(&css_set_lock);
list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
struct task_struct *task, *ntask;
/* all tasks in src_csets need to be migrated */
list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
cgroup_migrate_add_task(task, &mgctx);
}
spin_unlock_irq(&css_set_lock);
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
percpu_up_write(&cgroup_threadgroup_rwsem);
return ret;
}
/**
* cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
* @cgrp: root of the target subtree
*
* Because css offlining is asynchronous, userland may try to re-enable a
* controller while the previous css is still around. This function grabs
* cgroup_mutex and drains the previous css instances of @cgrp's subtree.
*/
void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
__acquires(&cgroup_mutex)
{
struct cgroup *dsct;
struct cgroup_subsys_state *d_css;
struct cgroup_subsys *ss;
int ssid;
restart:
mutex_lock(&cgroup_mutex);
cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
for_each_subsys(ss, ssid) {
struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
DEFINE_WAIT(wait);
if (!css || !percpu_ref_is_dying(&css->refcnt))
continue;
cgroup_get_live(dsct);
prepare_to_wait(&dsct->offline_waitq, &wait,
TASK_UNINTERRUPTIBLE);
mutex_unlock(&cgroup_mutex);
schedule();
finish_wait(&dsct->offline_waitq, &wait);
cgroup_put(dsct);
goto restart;
}
}
}
/**
* cgroup_save_control - save control masks and dom_cgrp of a subtree
* @cgrp: root of the target subtree
*
* Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the
* respective old_ prefixed fields for @cgrp's subtree including @cgrp
* itself.
*/
static void cgroup_save_control(struct cgroup *cgrp)
{
struct cgroup *dsct;
struct cgroup_subsys_state *d_css;
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
dsct->old_subtree_control = dsct->subtree_control;
dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
dsct->old_dom_cgrp = dsct->dom_cgrp;
}
}
/**
* cgroup_propagate_control - refresh control masks of a subtree
* @cgrp: root of the target subtree
*
* For @cgrp and its subtree, ensure ->subtree_ss_mask matches
* ->subtree_control and propagate controller availability through the
* subtree so that descendants don't have unavailable controllers enabled.
*/
static void cgroup_propagate_control(struct cgroup *cgrp)
{
struct cgroup *dsct;
struct cgroup_subsys_state *d_css;
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
dsct->subtree_control &= cgroup_control(dsct);
dsct->subtree_ss_mask =
cgroup_calc_subtree_ss_mask(dsct->subtree_control,
cgroup_ss_mask(dsct));
}
}
/**
* cgroup_restore_control - restore control masks and dom_cgrp of a subtree
* @cgrp: root of the target subtree
*
* Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the
* respective old_ prefixed fields for @cgrp's subtree including @cgrp
* itself.
*/
static void cgroup_restore_control(struct cgroup *cgrp)
{
struct cgroup *dsct;
struct cgroup_subsys_state *d_css;
cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
dsct->subtree_control = dsct->old_subtree_control;
dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
dsct->dom_cgrp = dsct->old_dom_cgrp;
}
}
static bool css_visible(struct cgroup_subsys_state *css)
{
struct cgroup_subsys *ss = css->ss;
struct cgroup *cgrp = css->cgroup;
if (cgroup_control(cgrp) & (1 << ss->id))
return true;
if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
return false;
return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
}
/**
* cgroup_apply_control_enable - enable or show csses according to control
* @cgrp: root of the target subtree
*
* Walk @cgrp's subtree and create new csses or make the existing ones
* visible. A css is created invisible if it's being implicitly enabled
* through dependency. An invisible css is made visible when the userland
* explicitly enables it.
*
* Returns 0 on success, -errno on failure. On failure, csses which have
* been processed already aren't cleaned up. The caller is responsible for
* cleaning up with cgroup_apply_control_disable().
*/
static int cgroup_apply_control_enable(struct cgroup *cgrp)
{
struct cgroup *dsct;
struct cgroup_subsys_state *d_css;
struct cgroup_subsys *ss;
int ssid, ret;
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
for_each_subsys(ss, ssid) {
struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
continue;
if (!css) {
css = css_create(dsct, ss);
if (IS_ERR(css))
return PTR_ERR(css);
}
WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
if (css_visible(css)) {
ret = css_populate_dir(css);
if (ret)
return ret;
}
}
}
return 0;
}
/**
* cgroup_apply_control_disable - kill or hide csses according to control
* @cgrp: root of the target subtree
*
* Walk @cgrp's subtree and kill and hide csses so that they match
* cgroup_ss_mask() and cgroup_visible_mask().
*
* A css is hidden when the userland requests it to be disabled while other
* subsystems are still depending on it. The css must not actively control
* resources and be in the vanilla state if it's made visible again later.
* Controllers which may be depended upon should provide ->css_reset() for
* this purpose.
*/
static void cgroup_apply_control_disable(struct cgroup *cgrp)
{
struct cgroup *dsct;
struct cgroup_subsys_state *d_css;
struct cgroup_subsys *ss;
int ssid;
cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
for_each_subsys(ss, ssid) {
struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
if (!css)
continue;
WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
if (css->parent &&
!(cgroup_ss_mask(dsct) & (1 << ss->id))) {
kill_css(css);
} else if (!css_visible(css)) {
css_clear_dir(css);
if (ss->css_reset)
ss->css_reset(css);
}
}
}
}
/**
* cgroup_apply_control - apply control mask updates to the subtree
* @cgrp: root of the target subtree
*
* subsystems can be enabled and disabled in a subtree using the following
* steps.
*
* 1. Call cgroup_save_control() to stash the current state.
* 2. Update ->subtree_control masks in the subtree as desired.
* 3. Call cgroup_apply_control() to apply the changes.
* 4. Optionally perform other related operations.
* 5. Call cgroup_finalize_control() to finish up.
*
* This function implements step 3 and propagates the mask changes
* throughout @cgrp's subtree, updates csses accordingly and perform
* process migrations.
*/
static int cgroup_apply_control(struct cgroup *cgrp)
{
int ret;
cgroup_propagate_control(cgrp);
ret = cgroup_apply_control_enable(cgrp);
if (ret)
return ret;
/*
* At this point, cgroup_e_css_by_mask() results reflect the new csses
* making the following cgroup_update_dfl_csses() properly update
* css associations of all tasks in the subtree.
*/
ret = cgroup_update_dfl_csses(cgrp);
if (ret)
return ret;
return 0;
}
/**
* cgroup_finalize_control - finalize control mask update
* @cgrp: root of the target subtree
* @ret: the result of the update
*
* Finalize control mask update. See cgroup_apply_control() for more info.
*/
static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
{
if (ret) {
cgroup_restore_control(cgrp);
cgroup_propagate_control(cgrp);
}
cgroup_apply_control_disable(cgrp);
}
static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
{
u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
/* if nothing is getting enabled, nothing to worry about */
if (!enable)
return 0;
/* can @cgrp host any resources? */
if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
return -EOPNOTSUPP;
/* mixables don't care */
if (cgroup_is_mixable(cgrp))
return 0;
if (domain_enable) {
/* can't enable domain controllers inside a thread subtree */
if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
return -EOPNOTSUPP;
} else {
/*
* Threaded controllers can handle internal competitions
* and are always allowed inside a (prospective) thread
* subtree.
*/
if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
return 0;
}
/*
* Controllers can't be enabled for a cgroup with tasks to avoid
* child cgroups competing against tasks.
*/
if (cgroup_has_tasks(cgrp))
return -EBUSY;
return 0;
}
/* change the enabled child controllers for a cgroup in the default hierarchy */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
u16 enable = 0, disable = 0;
struct cgroup *cgrp, *child;
struct cgroup_subsys *ss;
char *tok;
int ssid, ret;
/*
* Parse input - space separated list of subsystem names prefixed
* with either + or -.
*/
buf = strstrip(buf);
while ((tok = strsep(&buf, " "))) {
if (tok[0] == '\0')
continue;
do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
if (!cgroup_ssid_enabled(ssid) ||
strcmp(tok + 1, ss->name))
continue;
if (*tok == '+') {
enable |= 1 << ssid;
disable &= ~(1 << ssid);
} else if (*tok == '-') {
disable |= 1 << ssid;
enable &= ~(1 << ssid);
} else {
return -EINVAL;
}
break;
} while_each_subsys_mask();
if (ssid == CGROUP_SUBSYS_COUNT)
return -EINVAL;
}
cgrp = cgroup_kn_lock_live(of->kn, true);
if (!cgrp)
return -ENODEV;
for_each_subsys(ss, ssid) {
if (enable & (1 << ssid)) {
if (cgrp->subtree_control & (1 << ssid)) {
enable &= ~(1 << ssid);
continue;
}
if (!(cgroup_control(cgrp) & (1 << ssid))) {
ret = -ENOENT;
goto out_unlock;
}
} else if (disable & (1 << ssid)) {
if (!(cgrp->subtree_control & (1 << ssid))) {
disable &= ~(1 << ssid);
continue;
}
/* a child has it enabled? */
cgroup_for_each_live_child(child, cgrp) {
if (child->subtree_control & (1 << ssid)) {
ret = -EBUSY;
goto out_unlock;
}
}
}
}
if (!enable && !disable) {
ret = 0;
goto out_unlock;
}
ret = cgroup_vet_subtree_control_enable(cgrp, enable);
if (ret)
goto out_unlock;
/* save and update control masks and prepare csses */
cgroup_save_control(cgrp);
cgrp->subtree_control |= enable;
cgrp->subtree_control &= ~disable;
ret = cgroup_apply_control(cgrp);
cgroup_finalize_control(cgrp, ret);
if (ret)
goto out_unlock;
kernfs_activate(cgrp->kn);
out_unlock:
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
}
/**
* cgroup_enable_threaded - make @cgrp threaded
* @cgrp: the target cgroup
*
* Called when "threaded" is written to the cgroup.type interface file and
* tries to make @cgrp threaded and join the parent's resource domain.
* This function is never called on the root cgroup as cgroup.type doesn't
* exist on it.
*/
static int cgroup_enable_threaded(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup *dom_cgrp = parent->dom_cgrp;
struct cgroup *dsct;
struct cgroup_subsys_state *d_css;
int ret;
lockdep_assert_held(&cgroup_mutex);
/* noop if already threaded */
if (cgroup_is_threaded(cgrp))
return 0;
/*
* If @cgroup is populated or has domain controllers enabled, it
* can't be switched. While the below cgroup_can_be_thread_root()
* test can catch the same conditions, that's only when @parent is
* not mixable, so let's check it explicitly.
*/
if (cgroup_is_populated(cgrp) ||
cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
return -EOPNOTSUPP;
/* we're joining the parent's domain, ensure its validity */
if (!cgroup_is_valid_domain(dom_cgrp) ||
!cgroup_can_be_thread_root(dom_cgrp))
return -EOPNOTSUPP;
/*
* The following shouldn't cause actual migrations and should
* always succeed.
*/
cgroup_save_control(cgrp);
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
if (dsct == cgrp || cgroup_is_threaded(dsct))
dsct->dom_cgrp = dom_cgrp;
ret = cgroup_apply_control(cgrp);
if (!ret)
parent->nr_threaded_children++;
cgroup_finalize_control(cgrp, ret);
return ret;
}
static int cgroup_type_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
if (cgroup_is_threaded(cgrp))
seq_puts(seq, "threaded\n");
else if (!cgroup_is_valid_domain(cgrp))
seq_puts(seq, "domain invalid\n");
else if (cgroup_is_thread_root(cgrp))
seq_puts(seq, "domain threaded\n");
else
seq_puts(seq, "domain\n");
return 0;
}
static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct cgroup *cgrp;
int ret;
/* only switching to threaded mode is supported */
if (strcmp(strstrip(buf), "threaded"))
return -EINVAL;
/* drain dying csses before we re-apply (threaded) subtree control */
cgrp = cgroup_kn_lock_live(of->kn, true);
if (!cgrp)
return -ENOENT;
/* threaded can only be enabled */
ret = cgroup_enable_threaded(cgrp);
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
}
static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
int descendants = READ_ONCE(cgrp->max_descendants);
if (descendants == INT_MAX)
seq_puts(seq, "max\n");
else
seq_printf(seq, "%d\n", descendants);
return 0;
}
static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct cgroup *cgrp;
int descendants;
ssize_t ret;
buf = strstrip(buf);
if (!strcmp(buf, "max")) {
descendants = INT_MAX;
} else {
ret = kstrtoint(buf, 0, &descendants);
if (ret)
return ret;
}
if (descendants < 0)
return -ERANGE;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENOENT;
cgrp->max_descendants = descendants;
cgroup_kn_unlock(of->kn);
return nbytes;
}
static int cgroup_max_depth_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
int depth = READ_ONCE(cgrp->max_depth);
if (depth == INT_MAX)
seq_puts(seq, "max\n");
else
seq_printf(seq, "%d\n", depth);
return 0;
}
static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct cgroup *cgrp;
ssize_t ret;
int depth;
buf = strstrip(buf);
if (!strcmp(buf, "max")) {
depth = INT_MAX;
} else {
ret = kstrtoint(buf, 0, &depth);
if (ret)
return ret;
}
if (depth < 0)
return -ERANGE;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENOENT;
cgrp->max_depth = depth;
cgroup_kn_unlock(of->kn);
return nbytes;
}
static int cgroup_events_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));
seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));
return 0;
}
static int cgroup_stat_show(struct seq_file *seq, void *v)
{
struct cgroup *cgroup = seq_css(seq)->cgroup;
seq_printf(seq, "nr_descendants %d\n",
cgroup->nr_descendants);
seq_printf(seq, "nr_dying_descendants %d\n",
cgroup->nr_dying_descendants);
return 0;
}
static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
struct cgroup *cgrp, int ssid)
{
struct cgroup_subsys *ss = cgroup_subsys[ssid];
struct cgroup_subsys_state *css;
int ret;
if (!ss->css_extra_stat_show)
return 0;
css = cgroup_tryget_css(cgrp, ss);
if (!css)
return 0;
ret = ss->css_extra_stat_show(seq, css);
css_put(css);
return ret;
}
static int cpu_stat_show(struct seq_file *seq, void *v)
{
struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
int ret = 0;
cgroup_base_stat_cputime_show(seq);
#ifdef CONFIG_CGROUP_SCHED
ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
#endif
return ret;
}
#ifdef CONFIG_PSI
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
return psi_show(seq, psi, PSI_CPU);
}
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, enum psi_res res)
{
struct cgroup_file_ctx *ctx = of->priv;
struct psi_trigger *new;
struct cgroup *cgrp;
struct psi_group *psi;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
cgroup_get(cgrp);
cgroup_kn_unlock(of->kn);
/* Allow only one trigger per file descriptor */
if (ctx->psi.trigger) {
cgroup_put(cgrp);
return -EBUSY;
}
psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
new = psi_trigger_create(psi, buf, nbytes, res);
if (IS_ERR(new)) {
cgroup_put(cgrp);
return PTR_ERR(new);
}
smp_store_release(&ctx->psi.trigger, new);
cgroup_put(cgrp);
return nbytes;
}
static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
}
static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
}
static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
}
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
poll_table *pt)
{
struct cgroup_file_ctx *ctx = of->priv;
return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
}
static void cgroup_pressure_release(struct kernfs_open_file *of)
{
struct cgroup_file_ctx *ctx = of->priv;
psi_trigger_destroy(ctx->psi.trigger);
}
bool cgroup_psi_enabled(void)
{
return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
}
#else /* CONFIG_PSI */
bool cgroup_psi_enabled(void)
{
return false;
}
#endif /* CONFIG_PSI */
static int cgroup_freeze_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
seq_printf(seq, "%d\n", cgrp->freezer.freeze);
return 0;
}
static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct cgroup *cgrp;
ssize_t ret;
int freeze;
ret = kstrtoint(strstrip(buf), 0, &freeze);
if (ret)
return ret;
if (freeze < 0 || freeze > 1)
return -ERANGE;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENOENT;
cgroup_freeze(cgrp, freeze);
cgroup_kn_unlock(of->kn);
return nbytes;
}
static void __cgroup_kill(struct cgroup *cgrp)
{
struct css_task_iter it;
struct task_struct *task;
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
set_bit(CGRP_KILL, &cgrp->flags);
spin_unlock_irq(&css_set_lock);
css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
while ((task = css_task_iter_next(&it))) {
/* Ignore kernel threads here. */
if (task->flags & PF_KTHREAD)
continue;
/* Skip tasks that are already dying. */
if (__fatal_signal_pending(task))
continue;
send_sig(SIGKILL, task, 0);
}
css_task_iter_end(&it);
spin_lock_irq(&css_set_lock);
clear_bit(CGRP_KILL, &cgrp->flags);
spin_unlock_irq(&css_set_lock);
}
static void cgroup_kill(struct cgroup *cgrp)
{
struct cgroup_subsys_state *css;
struct cgroup *dsct;
lockdep_assert_held(&cgroup_mutex);
cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
__cgroup_kill(dsct);
}
static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
ssize_t ret = 0;
int kill;
struct cgroup *cgrp;
ret = kstrtoint(strstrip(buf), 0, &kill);
if (ret)
return ret;
if (kill != 1)
return -ERANGE;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENOENT;
/*
* Killing is a process directed operation, i.e. the whole thread-group
* is taken down so act like we do for cgroup.procs and only make this
* writable in non-threaded cgroups.
*/
if (cgroup_is_threaded(cgrp))
ret = -EOPNOTSUPP;
else
cgroup_kill(cgrp);
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
}
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of_cft(of);
struct cgroup_file_ctx *ctx;
int ret;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return -ENOMEM;
ctx->ns = current->nsproxy->cgroup_ns;
get_cgroup_ns(ctx->ns);
of->priv = ctx;
if (!cft->open)
return 0;
ret = cft->open(of);
if (ret) {
put_cgroup_ns(ctx->ns);
kfree(ctx);
}
return ret;
}
static void cgroup_file_release(struct kernfs_open_file *of)
{
struct cftype *cft = of_cft(of);
struct cgroup_file_ctx *ctx = of->priv;
if (cft->release)
cft->release(of);
put_cgroup_ns(ctx->ns);
kfree(ctx);
}
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *cgrp = of->kn->parent->priv;
struct cftype *cft = of_cft(of);
struct cgroup_subsys_state *css;
int ret;
if (!nbytes)
return 0;
/*
* If namespaces are delegation boundaries, disallow writes to
* files in an non-init namespace root from inside the namespace
* except for the files explicitly marked delegatable -
* cgroup.procs and cgroup.subtree_control.
*/
if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
return -EPERM;
if (cft->write)
return cft->write(of, buf, nbytes, off);
/*
* kernfs guarantees that a file isn't deleted with operations in
* flight, which means that the matching css is and stays alive and
* doesn't need to be pinned. The RCU locking is not necessary
* either. It's just for the convenience of using cgroup_css().
*/
rcu_read_lock();
css = cgroup_css(cgrp, cft->ss);
rcu_read_unlock();
if (cft->write_u64) {
unsigned long long v;
ret = kstrtoull(buf, 0, &v);
if (!ret)
ret = cft->write_u64(css, cft, v);
} else if (cft->write_s64) {
long long v;
ret = kstrtoll(buf, 0, &v);
if (!ret)
ret = cft->write_s64(css, cft, v);
} else {
ret = -EINVAL;
}
return ret ?: nbytes;
}
static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
{
struct cftype *cft = of_cft(of);
if (cft->poll)
return cft->poll(of, pt);
return kernfs_generic_poll(of, pt);
}
static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
return seq_cft(seq)->seq_start(seq, ppos);
}
static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
{
return seq_cft(seq)->seq_next(seq, v, ppos);
}
static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
{
if (seq_cft(seq)->seq_stop)
seq_cft(seq)->seq_stop(seq, v);
}
static int cgroup_seqfile_show(struct seq_file *m, void *arg)
{
struct cftype *cft = seq_cft(m);
struct cgroup_subsys_state *css = seq_css(m);
if (cft->seq_show)
return cft->seq_show(m, arg);
if (cft->read_u64)
seq_printf(m, "%llu\n", cft->read_u64(css, cft));
else if (cft->read_s64)
seq_printf(m, "%lld\n", cft->read_s64(css, cft));
else
return -EINVAL;
return 0;
}
static struct kernfs_ops cgroup_kf_single_ops = {
.atomic_write_len = PAGE_SIZE,
.open = cgroup_file_open,
.release = cgroup_file_release,
.write = cgroup_file_write,
.poll = cgroup_file_poll,
.seq_show = cgroup_seqfile_show,
};
static struct kernfs_ops cgroup_kf_ops = {
.atomic_write_len = PAGE_SIZE,
.open = cgroup_file_open,
.release = cgroup_file_release,
.write = cgroup_file_write,
.poll = cgroup_file_poll,
.seq_start = cgroup_seqfile_start,
.seq_next = cgroup_seqfile_next,
.seq_stop = cgroup_seqfile_stop,
.seq_show = cgroup_seqfile_show,
};
/* set uid and gid of cgroup dirs and files to that of the creator */
static int cgroup_kn_set_ugid(struct kernfs_node *kn)
{
struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = current_fsuid(),
.ia_gid = current_fsgid(), };
if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
return 0;
return kernfs_setattr(kn, &iattr);
}
static void cgroup_file_notify_timer(struct timer_list *timer)
{
cgroup_file_notify(container_of(timer, struct cgroup_file,
notify_timer));
}
static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
struct cftype *cft)
{
char name[CGROUP_FILE_NAME_MAX];
struct kernfs_node *kn;
struct lock_class_key *key = NULL;
int ret;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
key = &cft->lockdep_key;
#endif
kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
cgroup_file_mode(cft),
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
0, cft->kf_ops, cft,
NULL, key);
if (IS_ERR(kn))
return PTR_ERR(kn);
ret = cgroup_kn_set_ugid(kn);
if (ret) {
kernfs_remove(kn);
return ret;
}
if (cft->file_offset) {
struct cgroup_file *cfile = (void *)css + cft->file_offset;
timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
spin_lock_irq(&cgroup_file_kn_lock);
cfile->kn = kn;
spin_unlock_irq(&cgroup_file_kn_lock);
}
return 0;
}
/**
* cgroup_addrm_files - add or remove files to a cgroup directory
* @css: the target css
* @cgrp: the target cgroup (usually css->cgroup)
* @cfts: array of cftypes to be added
* @is_add: whether to add or remove
*
* Depending on @is_add, add or remove files defined by @cfts on @cgrp.
* For removals, this function never fails.
*/
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
bool is_add)
{
struct cftype *cft, *cft_end = NULL;
int ret = 0;
lockdep_assert_held(&cgroup_mutex);
restart:
for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
continue;
if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
continue;
if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
continue;
if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
continue;
if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
continue;
if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
continue;
if (is_add) {
ret = cgroup_add_file(css, cgrp, cft);
if (ret) {
pr_warn("%s: failed to add %s, err=%d\n",
__func__, cft->name, ret);
cft_end = cft;
is_add = false;
goto restart;
}
} else {
cgroup_rm_file(cgrp, cft);
}
}
return ret;
}
static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
struct cgroup_subsys *ss = cfts[0].ss;
struct cgroup *root = &ss->root->cgrp;
struct cgroup_subsys_state *css;
int ret = 0;
lockdep_assert_held(&cgroup_mutex);
/* add/rm files for all cgroups created before */
css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
struct cgroup *cgrp = css->cgroup;
if (!(css->flags & CSS_VISIBLE))
continue;
ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
if (ret)
break;
}
if (is_add && !ret)
kernfs_activate(root->kn);
return ret;
}
static void cgroup_exit_cftypes(struct cftype *cfts)
{
struct cftype *cft;
for (cft = cfts; cft->name[0] != '\0'; cft++) {
/* free copy for custom atomic_write_len, see init_cftypes() */
if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
kfree(cft->kf_ops);
cft->kf_ops = NULL;
cft->ss = NULL;
/* revert flags set by cgroup core while adding @cfts */
cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
}
}
static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
struct cftype *cft;
for (cft = cfts; cft->name[0] != '\0'; cft++) {
struct kernfs_ops *kf_ops;
WARN_ON(cft->ss || cft->kf_ops);
if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
continue;
if (cft->seq_start)
kf_ops = &cgroup_kf_ops;
else
kf_ops = &cgroup_kf_single_ops;
/*
* Ugh... if @cft wants a custom max_write_len, we need to
* make a copy of kf_ops to set its atomic_write_len.
*/
if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
if (!kf_ops) {
cgroup_exit_cftypes(cfts);
return -ENOMEM;
}
kf_ops->atomic_write_len = cft->max_write_len;
}
cft->kf_ops = kf_ops;
cft->ss = ss;
}
return 0;
}
static int cgroup_rm_cftypes_locked(struct cftype *cfts)
{
lockdep_assert_held(&cgroup_mutex);
if (!cfts || !cfts[0].ss)
return -ENOENT;
list_del(&cfts->node);
cgroup_apply_cftypes(cfts, false);
cgroup_exit_cftypes(cfts);
return 0;
}
/**
* cgroup_rm_cftypes - remove an array of cftypes from a subsystem
* @cfts: zero-length name terminated array of cftypes
*
* Unregister @cfts. Files described by @cfts are removed from all
* existing cgroups and all future cgroups won't have them either. This
* function can be called anytime whether @cfts' subsys is attached or not.
*
* Returns 0 on successful unregistration, -ENOENT if @cfts is not
* registered.
*/
int cgroup_rm_cftypes(struct cftype *cfts)
{
int ret;
mutex_lock(&cgroup_mutex);
ret = cgroup_rm_cftypes_locked(cfts);
mutex_unlock(&cgroup_mutex);
return ret;
}
/**
* cgroup_add_cftypes - add an array of cftypes to a subsystem
* @ss: target cgroup subsystem
* @cfts: zero-length name terminated array of cftypes
*
* Register @cfts to @ss. Files described by @cfts are created for all
* existing cgroups to which @ss is attached and all future cgroups will
* have them too. This function can be called anytime whether @ss is
* attached or not.
*
* Returns 0 on successful registration, -errno on failure. Note that this
* function currently returns 0 as long as @cfts registration is successful
* even if some file creation attempts on existing cgroups fail.
*/
static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
int ret;
if (!cgroup_ssid_enabled(ss->id))
return 0;
if (!cfts || cfts[0].name[0] == '\0')
return 0;
ret = cgroup_init_cftypes(ss, cfts);
if (ret)
return ret;
mutex_lock(&cgroup_mutex);
list_add_tail(&cfts->node, &ss->cfts);
ret = cgroup_apply_cftypes(cfts, true);
if (ret)
cgroup_rm_cftypes_locked(cfts);
mutex_unlock(&cgroup_mutex);
return ret;
}
/**
* cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
* @ss: target cgroup subsystem
* @cfts: zero-length name terminated array of cftypes
*
* Similar to cgroup_add_cftypes() but the added files are only used for
* the default hierarchy.
*/
int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
struct cftype *cft;
for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
cft->flags |= __CFTYPE_ONLY_ON_DFL;
return cgroup_add_cftypes(ss, cfts);
}
/**
* cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
* @ss: target cgroup subsystem
* @cfts: zero-length name terminated array of cftypes
*
* Similar to cgroup_add_cftypes() but the added files are only used for
* the legacy hierarchies.
*/
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
struct cftype *cft;
for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
cft->flags |= __CFTYPE_NOT_ON_DFL;
return cgroup_add_cftypes(ss, cfts);
}
/**
* cgroup_file_notify - generate a file modified event for a cgroup_file
* @cfile: target cgroup_file
*
* @cfile must have been obtained by setting cftype->file_offset.
*/
void cgroup_file_notify(struct cgroup_file *cfile)
{
unsigned long flags;
spin_lock_irqsave(&cgroup_file_kn_lock, flags);
if (cfile->kn) {
unsigned long last = cfile->notified_at;
unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
if (time_in_range(jiffies, last, next)) {
timer_reduce(&cfile->notify_timer, next);
} else {
kernfs_notify(cfile->kn);
cfile->notified_at = jiffies;
}
}
spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
}
/**
* css_next_child - find the next child of a given css
* @pos: the current position (%NULL to initiate traversal)
* @parent: css whose children to walk
*
* This function returns the next child of @parent and should be called
* under either cgroup_mutex or RCU read lock. The only requirement is
* that @parent and @pos are accessible. The next sibling is guaranteed to
* be returned regardless of their states.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
* future iterations and will stay visible until the last reference is put.
* A css which hasn't finished ->css_online() or already finished
* ->css_offline() may show up during traversal. It's each subsystem's
* responsibility to synchronize against on/offlining.
*/
struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
struct cgroup_subsys_state *parent)
{
struct cgroup_subsys_state *next;
cgroup_assert_mutex_or_rcu_locked();
/*
* @pos could already have been unlinked from the sibling list.
* Once a cgroup is removed, its ->sibling.next is no longer
* updated when its next sibling changes. CSS_RELEASED is set when
* @pos is taken off list, at which time its next pointer is valid,
* and, as releases are serialized, the one pointed to by the next
* pointer is guaranteed to not have started release yet. This
* implies that if we observe !CSS_RELEASED on @pos in this RCU
* critical section, the one pointed to by its next pointer is
* guaranteed to not have finished its RCU grace period even if we
* have dropped rcu_read_lock() in-between iterations.
*
* If @pos has CSS_RELEASED set, its next pointer can't be
* dereferenced; however, as each css is given a monotonically
* increasing unique serial number and always appended to the
* sibling list, the next one can be found by walking the parent's
* children until the first css with higher serial number than
* @pos's. While this path can be slower, it happens iff iteration
* races against release and the race window is very small.
*/
if (!pos) {
next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
} else if (likely(!(pos->flags & CSS_RELEASED))) {
next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
} else {
list_for_each_entry_rcu(next, &parent->children, sibling,
lockdep_is_held(&cgroup_mutex))
if (next->serial_nr > pos->serial_nr)
break;
}
/*
* @next, if not pointing to the head, can be dereferenced and is
* the next sibling.
*/
if (&next->sibling != &parent->children)
return next;
return NULL;
}
/**
* css_next_descendant_pre - find the next descendant for pre-order walk
* @pos: the current position (%NULL to initiate traversal)
* @root: css whose descendants to walk
*
* To be used by css_for_each_descendant_pre(). Find the next descendant
* to visit for pre-order traversal of @root's descendants. @root is
* included in the iteration and the first node to be visited.
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
* section. This function will return the correct next descendant as long
* as both @pos and @root are accessible and @pos is a descendant of @root.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
* future iterations and will stay visible until the last reference is put.
* A css which hasn't finished ->css_online() or already finished
* ->css_offline() may show up during traversal. It's each subsystem's
* responsibility to synchronize against on/offlining.
*/
struct cgroup_subsys_state *
css_next_descendant_pre(struct cgroup_subsys_state *pos,
struct cgroup_subsys_state *root)
{
struct cgroup_subsys_state *next;
cgroup_assert_mutex_or_rcu_locked();
/* if first iteration, visit @root */
if (!pos)
return root;
/* visit the first child if exists */
next = css_next_child(NULL, pos);
if (next)
return next;
/* no child, visit my or the closest ancestor's next sibling */
while (pos != root) {
next = css_next_child(pos, pos->parent);
if (next)
return next;
pos = pos->parent;
}
return NULL;
}
EXPORT_SYMBOL_GPL(css_next_descendant_pre);
/**
* css_rightmost_descendant - return the rightmost descendant of a css
* @pos: css of interest
*
* Return the rightmost descendant of @pos. If there's no descendant, @pos
* is returned. This can be used during pre-order traversal to skip
* subtree of @pos.
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
* section. This function will return the correct rightmost descendant as
* long as @pos is accessible.
*/
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
{
struct cgroup_subsys_state *last, *tmp;
cgroup_assert_mutex_or_rcu_locked();
do {
last = pos;
/* ->prev isn't RCU safe, walk ->next till the end */
pos = NULL;
css_for_each_child(tmp, last)
pos = tmp;
} while (pos);
return last;
}
static struct cgroup_subsys_state *
css_leftmost_descendant(struct cgroup_subsys_state *pos)
{
struct cgroup_subsys_state *last;
do {
last = pos;
pos = css_next_child(NULL, pos);
} while (pos);
return last;
}
/**
* css_next_descendant_post - find the next de
View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

View raw

(Sorry about that, but we can’t show files that are this big right now.)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment