@akshithg
Created March 7, 2023 05:41
Cases of non-determinism in the kernel
---#
arch/x86/kernel/process.c:119
---
static int set_new_tls(struct task_struct *p, unsigned long tls)
{
struct user_desc __user *utls = (struct user_desc __user *)tls;
if (in_ia32_syscall()) ## x ##
return do_set_thread_area(p, -1, utls, 0);
else
return do_set_thread_area_64(p, ARCH_SET_FS, tls);
}
---
`in_ia32_syscall()` checks whether the current process is in a 32-bit syscall.
If so, `do_set_thread_area()` is used to set the `tls` (Thread Local Storage).
Otherwise, `do_set_thread_area_64()` is used to set the `tls`.
---#
arch/x86/kernel/cpu/mce/core.c:1519
---
static void mce_timer_fn(struct timer_list *t)
{
struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
unsigned long iv;
WARN_ON(cpu_t != t);
iv = __this_cpu_read(mce_next_interval);
if (mce_available(this_cpu_ptr(&cpu_info))) {
machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
if (mce_intel_cmci_poll()) {
iv = mce_adjust_timer(iv);
goto done;
}
}
/*
* Alert userspace if needed. If we logged an MCE, reduce the polling
* interval, otherwise increase the polling interval.
*/
if (mce_notify_irq())
iv = max(iv / 2, (unsigned long) HZ/100); ## x ##
else
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
done:
__this_cpu_write(mce_next_interval, iv);
__start_timer(t, iv);
}
---
`mce_notify_irq()` checks whether an MCE (Machine Check Exception) was logged. If
so, the polling interval is halved (poll more often), clamped at HZ/100. Otherwise
it is doubled (poll less often), clamped at check_interval * HZ.
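A rough stand-alone sketch of that clamped halve/double adjustment (the HZ and
check_interval values below are illustrative stand-ins, not read from a kernel
config):

#include <stdio.h>

/* Hypothetical stand-ins for the kernel's HZ and check_interval. */
#define HZ             250UL
#define CHECK_INTERVAL 300UL   /* seconds */

static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }
static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }

int main(void)
{
    unsigned long iv = CHECK_INTERVAL * HZ;       /* start at the slow rate */

    /* an MCE was logged: poll twice as often, but no faster than HZ/100 */
    iv = max_ul(iv / 2, HZ / 100);
    printf("after an event: %lu jiffies\n", iv);

    /* quiet period: back off, but never slower than check_interval seconds */
    iv = min_ul(iv * 2, CHECK_INTERVAL * HZ);
    printf("after a quiet tick: %lu jiffies\n", iv);
    return 0;
}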
---#
arch/x86/kernel/hpet.c:699
---
static u64 read_hpet(struct clocksource *cs)
{
unsigned long flags;
union hpet_lock old, new;
BUILD_BUG_ON(sizeof(union hpet_lock) != 8);
/*
* Read HPET directly if in NMI.
*/
if (in_nmi())
return (u64)hpet_readl(HPET_COUNTER);
/*
* Read the current state of the lock and HPET value atomically.
*/
old.lockval = READ_ONCE(hpet.lockval); ## x ##
if (arch_spin_is_locked(&old.lock))
goto contended;
local_irq_save(flags);
if (arch_spin_trylock(&hpet.lock)) {
new.value = hpet_readl(HPET_COUNTER);
/*
* Use WRITE_ONCE() to prevent store tearing.
*/
WRITE_ONCE(hpet.value, new.value);
arch_spin_unlock(&hpet.lock);
local_irq_restore(flags);
return (u64)new.value;
}
local_irq_restore(flags);
contended:
/*
* Contended case
* --------------
* Wait until the HPET value change or the lock is free to indicate
* its value is up-to-date.
*
* It is possible that old.value has already contained the latest
* HPET value while the lock holder was in the process of releasing
* the lock. Checking for lock state change will enable us to return
* the value immediately instead of waiting for the next HPET reader
* to come along.
*/
do {
cpu_relax();
new.lockval = READ_ONCE(hpet.lockval);
} while ((new.value == old.value) && arch_spin_is_locked(&new.lock));
return (u64)new.value;
}
---
`READ_ONCE()` reads a variable with a single volatile access, so the compiler may
not merge, refetch, or tear the load.
https://www.kernel.org/doc/Documentation/memory-barriers.txt
If `in_nmi()` is true, the HPET counter is read directly. Otherwise the value is
read under `arch_spin_trylock()`; if the lock is contended, the reader spins until
the cached value changes or the lock is released.
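A minimal user-space sketch of the idea behind `READ_ONCE()`/`WRITE_ONCE()` (not
the kernel's exact definitions, which also handle odd sizes and instrumentation):
the volatile cast forces a single real access that the compiler may not merge,
refetch, or tear.

#include <stdio.h>

/* Simplified stand-ins for the kernel macros: force one volatile access. */
#define READ_ONCE(x)      (*(const volatile typeof(x) *)&(x))
#define WRITE_ONCE(x, v)  (*(volatile typeof(x) *)&(x) = (v))

static unsigned long shared;

int main(void)
{
    WRITE_ONCE(shared, 42);
    unsigned long snap = READ_ONCE(shared);   /* one load, no compiler caching */
    printf("%lu\n", snap);
    return 0;
}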
---#
arch/x86/kernel/process.h:26
---
/*
* This needs to be inline to optimize for the common case where no extra
* work needs to be done.
*/
static inline void switch_to_extra(struct task_struct *prev,
struct task_struct *next)
{
unsigned long next_tif = task_thread_info(next)->flags;
unsigned long prev_tif = task_thread_info(prev)->flags;
if (IS_ENABLED(CONFIG_SMP)) {
/*
* Avoid __switch_to_xtra() invocation when conditional
* STIBP is disabled and the only different bit is
* TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not
* in the TIF_WORK_CTXSW masks.
*/
if (!static_branch_likely(&switch_to_cond_stibp)) { ## x ##
prev_tif &= ~_TIF_SPEC_IB;
next_tif &= ~_TIF_SPEC_IB;
}
}
/*
* __switch_to_xtra() handles debug registers, i/o bitmaps,
* speculation mitigations etc.
*/
if (unlikely(next_tif & _TIF_WORK_CTXSW_NEXT ||
prev_tif & _TIF_WORK_CTXSW_PREV))
__switch_to_xtra(prev, next);
}
---
`static_branch_likely()` checks whether a static branch (static key) is enabled.
If conditional STIBP is disabled, the `_TIF_SPEC_IB` (Indirect Branch Speculation)
bit is masked out of both `prev_tif` and `next_tif`, so a difference in that bit
alone does not force a call to `__switch_to_xtra()`.
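A tiny stand-alone illustration of that masking (flag values below are made up,
not the real TIF bit assignments): once the bit is cleared from both words, two
flag words that differed only in that bit no longer trigger the extra work.

#include <stdio.h>

/* Hypothetical flag bits for illustration only. */
#define _TIF_SPEC_IB     (1UL << 9)
#define _TIF_WORK_MASK   (_TIF_SPEC_IB | (1UL << 3))  /* pretend i/o-bitmap bit */

int main(void)
{
    unsigned long prev_tif = _TIF_SPEC_IB;   /* differ only in the STIBP bit */
    unsigned long next_tif = 0;

    /* conditional STIBP disabled: ignore that bit entirely */
    prev_tif &= ~_TIF_SPEC_IB;
    next_tif &= ~_TIF_SPEC_IB;

    if ((prev_tif & _TIF_WORK_MASK) || (next_tif & _TIF_WORK_MASK))
        printf("__switch_to_xtra() would run\n");
    else
        printf("fast path: no extra work\n");
    return 0;
}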
---#
arch/x86/kernel/process_64.c:213
---
/*
* Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
* not available. The goal is to be reasonably fast on non-FSGSBASE systems.
* It's forcibly inlined because it'll generate better code and this function
* is hot.
*/
static __always_inline void save_base_legacy(struct task_struct *prev_p,
unsigned short selector,
enum which_selector which)
{
if (likely(selector == 0)) { ## x ##
/*
* On Intel (without X86_BUG_NULL_SEG), the segment base could
* be the pre-existing saved base or it could be zero. On AMD
* (with X86_BUG_NULL_SEG), the segment base could be almost
* anything.
*
* This branch is very hot (it's hit twice on almost every
* context switch between 64-bit programs), and avoiding
* the RDMSR helps a lot, so we just assume that whatever
* value is already saved is correct. This matches historical
* Linux behavior, so it won't break existing applications.
*
* To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
* report that the base is zero, it needs to actually be zero:
* see the corresponding logic in load_seg_legacy.
*/
} else {
/*
* If the selector is 1, 2, or 3, then the base is zero on
* !X86_BUG_NULL_SEG CPUs and could be anything on
* X86_BUG_NULL_SEG CPUs. In the latter case, Linux
* has never attempted to preserve the base across context
* switches.
*
* If selector > 3, then it refers to a real segment, and
* saving the base isn't necessary.
*/
if (which == FS)
prev_p->thread.fsbase = 0;
else
prev_p->thread.gsbase = 0;
}
}
---
This is related to branch prediction and saving the FS/GS base registers.
If `selector` is 0 (the `likely()` hot path), nothing happens and whatever base was
previously saved is assumed correct; otherwise `fsbase` or `gsbase` is reset to 0.
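`likely()`/`unlikely()` are essentially wrappers around GCC's `__builtin_expect()`;
a minimal user-space sketch of the hint used in the `likely(selector == 0)` test
(the hint only influences code layout, both paths still behave the same):

#include <stdio.h>

#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

static void save_base(unsigned short selector)
{
    if (likely(selector == 0)) {
        /* hot, predicted-taken path: keep the previously saved base */
        printf("selector 0: keep saved base\n");
    } else {
        /* cold path */
        printf("selector %u: reset base to 0\n", selector);
    }
}

int main(void)
{
    save_base(0);
    save_base(3);
    return 0;
}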
---#
arch/x86/kernel/process_64.c:629
---
/*
* switch_to(x,y) should switch tasks from x to y.
*
* This could still be optimized:
* - fold all the options into a flag word and test it with a single test.
* - could test fs/gs bitsliced
*
* Kprobes not supported here. Set the probe on schedule instead.
* Function graph tracer not supported too.
*/
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread;
struct thread_struct *next = &next_p->thread;
struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
# ...
/*
* Switch the PDA and FPU contexts.
*/
this_cpu_write(current_task, next_p);
this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
switch_fpu_finish(next_fpu);
/* Reload sp0. */
update_task_stack(next_p);
switch_to_extra(prev_p, next_p);
if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
/*
* AMD CPUs have a misfeature: SYSRET sets the SS selector but
* does not update the cached descriptor. As a result, if we
* do SYSRET while SS is NULL, we'll end up in user mode with
* SS apparently equal to __USER_DS but actually unusable.
*
* The straightforward workaround would be to fix it up just
* before SYSRET, but that would slow down the system call
* fast paths. Instead, we ensure that SS is never NULL in
* system call context. We do this by replacing NULL SS
* selectors at every context switch. SYSCALL sets up a valid
* SS, so the only way to get NULL is to re-enter the kernel
* from CPL 3 through an interrupt. Since that can't happen
* in the same task as a running syscall, we are guaranteed to
* context switch between every interrupt vector entry and a
* subsequent SYSRET.
*
* We read SS first because SS reads are much faster than
* writes. Out of caution, we force SS to __KERNEL_DS even if
* it previously had a different non-NULL value.
*/
unsigned short ss_sel;
savesegment(ss, ss_sel);
if (ss_sel != __KERNEL_DS) ## x ##
loadsegment(ss, __KERNEL_DS);
}
/* Load the Intel cache allocation PQR MSR. */
resctrl_sched_in();
return prev_p;
}
---
`__switch_to` is the function that switches from one task to another. This line is
part of the workaround for a known AMD misfeature: SYSRET does not refresh the
cached SS descriptor. SS reads are much faster than writes, so SS is read first
and, out of caution, forced to __KERNEL_DS whenever it holds any other value.
---#
arch/x86/kernel/signal.c:91
---
static int restore_sigcontext(struct pt_regs *regs,
struct sigcontext __user *usc,
unsigned long uc_flags)
{
struct sigcontext sc;
/* Always make any pending restarted system calls return -EINTR */
current->restart_block.fn = do_no_restart_syscall;
if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE)) ## x ##
return -EFAULT;
#...
/* Get CS/SS and force CPL3 */
regs->cs = sc.cs | 0x03;
regs->ss = sc.ss | 0x03;
regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS);
/* disable syscall checks */
regs->orig_ax = -1;
#ifdef CONFIG_X86_64
/*
* Fix up SS if needed for the benefit of old DOSEMU and
* CRIU.
*/
if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs)))
force_valid_ss(regs);
#endif
return fpu__restore_sig((void __user *)sc.fpstate,
IS_ENABLED(CONFIG_X86_32));
}
---
`restore_sigcontext` is the function that restores register state from a signal
frame. `copy_from_user` copies `CONTEXT_COPY_SIZE` bytes of the user-space
`struct sigcontext` into the kernel; if the copy fails, -EFAULT is returned.
---#
arch/x86/kernel/signal.c:469
---
static int __setup_rt_frame(int sig, struct ksignal *ksig,
sigset_t *set, struct pt_regs *regs)
{
struct rt_sigframe __user *frame;
void __user *fp = NULL;
unsigned long uc_flags;
/* x86-64 should always use SA_RESTORER. */
if (!(ksig->ka.sa.sa_flags & SA_RESTORER))
return -EFAULT;
frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp);
uc_flags = frame_uc_flags(regs);
if (!user_access_begin(frame, sizeof(*frame)))
return -EFAULT;
/* Create the ucontext. */
unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault);
unsafe_put_user(0, &frame->uc.uc_link, Efault);
unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
/* Set up to return from userspace. If provided, use a stub
already in userspace. */
unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault);
unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
unsafe_put_sigmask(set, frame, Efault);
user_access_end();
if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
if (copy_siginfo_to_user(&frame->info, &ksig->info)) ## x ##
return -EFAULT;
}
/* Set up registers for signal handler */
regs->di = sig;
/* In case the signal handler was declared without prototypes */
regs->ax = 0;
/* This also works for non SA_SIGINFO handlers because they expect the
next argument after the signal number on the stack. */
regs->si = (unsigned long)&frame->info;
regs->dx = (unsigned long)&frame->uc;
regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
regs->sp = (unsigned long)frame;
/*
* Set up the CS and SS registers to run signal handlers in
* 64-bit mode, even if the handler happens to be interrupting
* 32-bit or 16-bit code.
*
* SS is subtle. In 64-bit mode, we don't need any particular
* SS descriptor, but we do need SS to be valid. It's possible
* that the old SS is entirely bogus -- this can happen if the
* signal we're trying to deliver is #GP or #SS caused by a bad
* SS value. We also have a compatbility issue here: DOSEMU
* relies on the contents of the SS register indicating the
* SS value at the time of the signal, even though that code in
* DOSEMU predates sigreturn's ability to restore SS. (DOSEMU
* avoids relying on sigreturn to restore SS; instead it uses
* a trampoline.) So we do our best: if the old SS was valid,
* we keep it. Otherwise we replace it.
*/
regs->cs = __USER_CS;
if (unlikely(regs->ss != __USER_DS))
force_valid_ss(regs);
return 0;
Efault:
user_access_end();
return -EFAULT;
}
---
`__setup_rt_frame` is the function that sets up the signal frame on the user stack.
`copy_siginfo_to_user` copies the kernel's siginfo into the user-space frame; if
that copy fails, `__setup_rt_frame` returns -EFAULT.
---#
arch/x86/lib/insn.c:156
---
/**
* insn_get_prefixes - scan x86 instruction prefix bytes
* @insn: &struct insn containing instruction
*
* Populates the @insn->prefixes bitmap, and updates @insn->next_byte
* to point to the (first) opcode. No effect if @insn->prefixes.got
* is already set.
*/
void insn_get_prefixes(struct insn *insn)
{
struct insn_field *prefixes = &insn->prefixes;
insn_attr_t attr;
insn_byte_t b, lb;
int i, nb;
if (prefixes->got)
return;
# ...
/* Decode REX prefix */
if (insn->x86_64) { ## x ##
b = peek_next(insn_byte_t, insn);
attr = inat_get_opcode_attribute(b);
if (inat_is_rex_prefix(attr)) {
insn->rex_prefix.value = b;
insn->rex_prefix.nbytes = 1;
insn->next_byte++;
if (X86_REX_W(b))
/* REX.W overrides opnd_size */
insn->opnd_bytes = 8;
}
}
insn->rex_prefix.got = 1;
# ...
vex_end:
insn->vex_prefix.got = 1;
prefixes->got = 1;
err_out:
return;
}
---
`insn_get_prefixes` is the function that decodes the prefix bytes of an instruction.
If `insn->x86_64` is true, it also tries to decode a REX prefix:
`inat_get_opcode_attribute` returns the attribute entry for the next byte,
`inat_is_rex_prefix` checks whether that attribute marks a REX prefix, and
`X86_REX_W` tests the REX.W bit. If REX.W is set, `insn->opnd_bytes = 8` overrides
the operand size to 64 bits.
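A hedged stand-alone sketch of just the REX check (the real decoder goes through
its inat attribute tables instead of open-coding the ranges): in 64-bit mode a REX
prefix is any byte 0x40-0x4F, and bit 3 of it (REX.W) selects a 64-bit operand size.

#include <stdio.h>
#include <stdbool.h>

/* REX prefixes occupy 0x40..0x4F in 64-bit mode; bit 3 is REX.W. */
static bool is_rex_prefix(unsigned char b) { return (b & 0xf0) == 0x40; }
static bool rex_w(unsigned char b)         { return b & 0x08; }

int main(void)
{
    /* 48 89 c3 = mov rbx, rax (REX.W); without the prefix, 89 c3 = mov ebx, eax */
    unsigned char insn[] = { 0x48, 0x89, 0xc3 };
    int opnd_bytes = 4;

    if (is_rex_prefix(insn[0]) && rex_w(insn[0]))
        opnd_bytes = 8;   /* REX.W overrides the default operand size */

    printf("operand size: %d bytes\n", opnd_bytes);
    return 0;
}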
---#
arch/x86/mm/fault.c:1101
---
static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma)
{
/* This is only called for the current mm, so: */
bool foreign = false;
/*
* Read or write was blocked by protection keys. This is
* always an unconditional error and can never result in
* a follow-up action to resolve the fault, like a COW.
*/
if (error_code & X86_PF_PK) ## x ##
return 1;
# ...
return 0;
}
---
`access_error` is the function that checks if the access is allowed. If the
access is not allowed, `access_error` returns 1. `error_code` is the error
code of the fault. `X86_PF_PK` is the bit that indicates if the access is
blocked by protection keys.
---#
arch/x86/mm/fault.c:1121
---
static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma)
{
/* This is only called for the current mm, so: */
bool foreign = false;
# ...
/* read, present: */
if (unlikely(error_code & X86_PF_PROT))
return 1;
/* read, not present: */
if (unlikely(!vma_is_accessible(vma)))
return 1;
return 0;
}
---
`access_error` is the function that checks if the access is allowed. If the
access is not allowed, `access_error` returns 1. `error_code` is the hardware
page-fault error code. `X86_PF_PROT` indicates the fault occurred on a present page
(a protection violation), so a read fault with this bit set is treated as an error.
---#
arch/x86/mm/fault.c:1131
---
bool fault_in_kernel_space(unsigned long address)
{
/*
* On 64-bit systems, the vsyscall page is at an address above
* TASK_SIZE_MAX, but is not considered part of the kernel
* address space.
*/
if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address)) ## x ##
return false;
return address >= TASK_SIZE_MAX;
}
---
`fault_in_kernel_space` is the function that checks if the faulting address lies in
kernel space. `TASK_SIZE_MAX` is the upper bound of the user address space. On
64-bit, `is_vsyscall_vaddr` checks for the legacy vsyscall page, which sits above
`TASK_SIZE_MAX` but is user-accessible, so the function returns false for it.
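A stand-alone sketch of the check, assuming simplified constants (the real
`TASK_SIZE_MAX` depends on the paging mode; `VSYSCALL_ADDR` is the fixed legacy
vsyscall page on x86-64):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* Illustrative constants; the kernel derives TASK_SIZE_MAX from the paging mode. */
#define TASK_SIZE_MAX  0x00007ffffffff000ULL
#define VSYSCALL_ADDR  0xffffffffff600000ULL
#define PAGE_MASK      (~0xfffULL)

static bool is_vsyscall_vaddr(uint64_t addr)
{
    return (addr & PAGE_MASK) == VSYSCALL_ADDR;
}

static bool fault_in_kernel_space(uint64_t address)
{
    if (is_vsyscall_vaddr(address))
        return false;             /* user-visible despite its high address */
    return address >= TASK_SIZE_MAX;
}

int main(void)
{
    printf("%d\n", fault_in_kernel_space(0x00007f0000000000ULL)); /* 0: user */
    printf("%d\n", fault_in_kernel_space(VSYSCALL_ADDR + 0x400)); /* 0: vsyscall */
    printf("%d\n", fault_in_kernel_space(0xffffffff81000000ULL)); /* 1: kernel */
    return 0;
}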
---#
arch/x86/mm/fault.c:1340
---
/* Handle faults in the user portion of the address space */
static inline
void do_user_addr_fault(struct pt_regs *regs,
unsigned long hw_error_code,
unsigned long address)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
vm_fault_t fault;
unsigned int flags = FAULT_FLAG_DEFAULT;
tsk = current;
mm = tsk->mm;
# ...
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { ## x ##
bad_area(regs, hw_error_code, address);
return;
}
# ...
check_v8086_mode(regs, address, tsk);
}
---
`do_user_addr_fault` is the function that handles faults in the user portion of the
address space. `VM_GROWSDOWN` marks a VMA that may grow downward (a stack). If the
VMA cannot grow down to cover the faulting address, `bad_area` is called.
---#
arch/x86/mm/pat/memtype.c:1085
---
/*
* untrack_pfn is called while unmapping a pfnmap for a region.
* untrack can be called for a specific region indicated by pfn and size or
* can be for the entire vma (in which case pfn, size are zero).
*/
void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
unsigned long size)
{
resource_size_t paddr;
unsigned long prot;
if (vma && !(vma->vm_flags & VM_PAT)) ## x ##
return;
/* free the chunk starting from pfn or the whole chunk */
paddr = (resource_size_t)pfn << PAGE_SHIFT;
if (!paddr && !size) {
if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
WARN_ON_ONCE(1);
return;
}
size = vma->vm_end - vma->vm_start;
}
free_pfn_range(paddr, size);
if (vma)
vma->vm_flags &= ~VM_PAT;
}
---
`untrack_pfn` is the function that untracks a pfn (page frame number) range.
`VM_PAT` is the flag that indicates the VMA has a PAT-tracked pfn range. If the VMA
is not tracked, `untrack_pfn` returns early.
---#
arch/x86/mm/tlb.c:559
---
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
bool need_flush;
u16 new_asid;
# ...
if (need_flush) { ## x ##
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, true);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
load_new_mm_cr3(next->pgd, new_asid, false);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
/* Make sure we write CR3 before loaded_mm. */
barrier();
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
if (next != real_prev) {
cr4_update_pce_mm(next);
switch_ldt(real_prev, next);
}
}
---
`switch_mm_irqs_off` is the function that switches the mm. `need_flush` is the
variable that indicates if the TLB needs to be flushed. If `need_flush` is
true, the TLB is flushed.
---#
arch/x86/mm/tlb.c:598
---
/*
* Please ignore the name of this function. It should be called
* switch_to_kernel_thread().
*
* enter_lazy_tlb() is a hint from the scheduler that we are entering a
* kernel thread or other context without an mm. Acceptable implementations
* include doing nothing whatsoever, switching to init_mm, or various clever
* lazy tricks to try to minimize TLB flushes.
*
* The scheduler reserves the right to call enter_lazy_tlb() several times
* in a row. It will notify us that we're going back to a real mm by
* calling switch_mm_irqs_off().
*/
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) ## x ##
return;
this_cpu_write(cpu_tlbstate.is_lazy, true);
}
---
This function is called when the scheduler switches to a kernel thread or another
context without an mm. If the currently loaded mm is already `init_mm`, the function
just returns; otherwise `cpu_tlbstate.is_lazy` is set to true.
---#
arch/x86/mm/tlb.c:818
---
STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (info->end == TLB_FLUSH_ALL)
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
else
trace_tlb_flush(TLB_REMOTE_SEND_IPI,
(info->end - info->start) >> PAGE_SHIFT);
/*
* If no page tables were freed, we can skip sending IPIs to
* CPUs in lazy TLB mode. They will flush the CPU themselves
* at the next context switch.
*
* However, if page tables are getting freed, we need to send the
* IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
if (info->freed_tables) ## x ##
smp_call_function_many(cpumask, flush_tlb_func_remote,
(void *)info, 1);
else
on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
(void *)info, 1, cpumask);
}
---
`native_flush_tlb_others` is the function that flushes the TLB of other CPUs.
If `info->freed_tables` is true, the TLB of all CPUs is flushed. Else, the
TLB of CPUs that are not in lazy mode is flushed.
---#
block/bio.c:225
---
struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
mempool_t *pool)
{
struct bio_vec *bvl;
# ...
/*
* Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM
* is set, retry with the 1-entry mempool
*/
bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { ## x ##
*idx = BVEC_POOL_MAX;
goto fallback;
}
}
(*idx)++;
return bvl;
}
---
`bvec_alloc` is the function that allocates the array of bio_vecs for a bio (each
bio_vec describes one page segment: page, offset, length).
ref:http://books.gigatux.nl/mirror/kerneldevelopment/0672327201/ch13lev1sec3.html
If the slab allocation fails and `__GFP_DIRECT_RECLAIM` is set, the allocation is
retried from the 1-entry mempool.
---#
block/bio.c:503
---
/**
* bio_alloc_bioset - allocate a bio for I/O
* @gfp_mask: the GFP_* mask given to the slab allocator
* @nr_iovecs: number of iovecs to pre-allocate
* @bs: the bio_set to allocate from.
*
* ...
*/
struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
struct bio_set *bs)
{
gfp_t saved_gfp = gfp_mask;
unsigned front_pad;
unsigned inline_vecs;
struct bio_vec *bvl = NULL;
struct bio *bio;
void *p;
# ...
if (nr_iovecs > inline_vecs) { ## x ##
unsigned long idx = 0;
bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
if (!bvl && gfp_mask != saved_gfp) {
punt_bios_to_rescuer(bs);
gfp_mask = saved_gfp;
bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
}
if (unlikely(!bvl))
goto err_free;
bio->bi_flags |= idx << BVEC_POOL_OFFSET;
} else if (nr_iovecs) {
bvl = bio->bi_inline_vecs;
}
bio->bi_pool = bs;
bio->bi_max_vecs = nr_iovecs;
bio->bi_io_vec = bvl;
return bio;
err_free:
mempool_free(p, &bs->bio_pool);
return NULL;
}
---
`bio_alloc_bioset` is the function that allocates a bio from a bio_set. If
`nr_iovecs` is greater than `inline_vecs`, the bio_vec array is allocated with
`bvec_alloc`; if that fails under a restricted gfp_mask, pending bios are punted to
the rescuer and the allocation is retried with the original mask. Else if
`nr_iovecs` is not 0, `bvl` points at the bio's inline vectors
(`bio->bi_inline_vecs`).
---#
block/bio.c:880
---
/**
* __bio_try_merge_page - try appending data to an existing bvec.
* @bio: destination bio
* @page: start page to add
* @len: length of the data to add
* @off: offset of the data relative to @page
* @same_page: return if the segment has been merged inside the same page
*
* Try to add the data at @page + @off to the last bvec of @bio. This is a
* useful optimisation for file systems with a block size smaller than the
* page size.
*
* Warn if (@len, @off) crosses pages in case that @same_page is true.
*
* Return %true on success or %false on failure.
*/
bool __bio_try_merge_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off, bool *same_page)
{
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return false;
if (bio->bi_vcnt > 0) {
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (page_is_mergeable(bv, page, len, off, same_page)) {
if (bio->bi_iter.bi_size > UINT_MAX - len) { ## x ##
*same_page = false;
return false;
}
bv->bv_len += len;
bio->bi_iter.bi_size += len;
return true;
}
}
return false;
}
---
`__bio_try_merge_page` is the function that tries to append data to an existing
bvec. If `bio->bi_vcnt` is greater than 0, the last bvec of `bio` is retrieved and,
if the page is mergeable with it, the merge is refused when `bio->bi_iter.bi_size`
would exceed `UINT_MAX - len` (an overflow guard); otherwise the bvec length and
the bio size are both increased by `len`.
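The `bi_size > UINT_MAX - len` test is the usual idiom for refusing an unsigned
addition that would wrap around; a small stand-alone sketch:

#include <stdio.h>
#include <stdbool.h>
#include <limits.h>

/* Return true only if size + len fits in an unsigned int without wrapping. */
static bool add_would_fit(unsigned int size, unsigned int len)
{
    return size <= UINT_MAX - len;
}

int main(void)
{
    printf("%d\n", add_would_fit(100, 200));          /* 1: fine */
    printf("%d\n", add_would_fit(UINT_MAX - 5, 10));  /* 0: would wrap */
    return 0;
}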
---#
block/bio.c:918
---
/**
* __bio_add_page - add page(s) to a bio in a new segment
* @bio: destination bio
* @page: start page to add
* @len: length of the data to add, may cross pages
* @off: offset of the data relative to @page, may cross pages
*
* Add the data at @page + @off to @bio as a new bvec. The caller must ensure
* that @bio has space for another bvec.
*/
void __bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off)
{
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
WARN_ON_ONCE(bio_full(bio, len));
bv->bv_page = page;
bv->bv_offset = off;
bv->bv_len = len;
bio->bi_iter.bi_size += len;
bio->bi_vcnt++;
if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page))) ## x ##
bio_set_flag(bio, BIO_WORKINGSET);
}
EXPORT_SYMBOL_GPL(__bio_add_page);
---
`__bio_add_page` is the function that adds a page to a bio in a new segment. The
caller must ensure that the bio has space for another bvec. The page is added
to the bio and the size of the bio is increased by `len`. If the page is in the
workingset, the bio is flagged as being in the workingset (a workingset is a
set of pages that are frequently accessed).
---#
block/blk-core.c:832
---
static noinline_for_stack bool submit_bio_checks(struct bio *bio)
{
struct request_queue *q = bio->bi_disk->queue;
blk_status_t status = BLK_STS_IOERR;
struct blk_plug *plug;
might_sleep();
plug = blk_mq_plug(q, bio);
if (plug && plug->nowait)
bio->bi_opf |= REQ_NOWAIT;
/*
* For a REQ_NOWAIT based request, return -EOPNOTSUPP
* if queue does not support NOWAIT.
*/
if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q))
goto not_supported;
if (should_fail_bio(bio))
goto end_io;
if (bio->bi_partno) {
if (unlikely(blk_partition_remap(bio)))
goto end_io;
} else {
if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0))) ## x ##
goto end_io;
if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk))))
goto end_io;
}
# ...
not_supported:
status = BLK_STS_NOTSUPP;
end_io:
bio->bi_status = status;
bio_endio(bio);
return false;
}
---
`submit_bio_checks` is the function that checks if a bio can be submitted. For the
whole-disk case, `bio_check_ro` fails writes aimed at a read-only device; in that
case the bio is ended and `false` is returned.
---#
block/blk-core.c:1269
---
static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
{
unsigned long stamp;
again:
stamp = READ_ONCE(part->stamp);
if (unlikely(stamp != now)) { ## x ##
if (likely(cmpxchg(&part->stamp, stamp, now) == stamp))
__part_stat_add(part, io_ticks, end ? now - stamp : 1);
}
if (part->partno) {
part = &part_to_disk(part)->part0;
goto again;
}
}
---
`update_io_ticks` is the function that updates the io ticks of a partition.
`part->stamp` is the jiffies timestamp of the last update. If `part->stamp` differs
from `now`, a `cmpxchg` tries to advance it to `now`; the caller that wins the race
adds `now - stamp` to `io_ticks` when `end` is true, or 1 otherwise.
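`cmpxchg()` only stores the new stamp if it still holds the value read above, so of
several concurrent callers exactly one accounts the elapsed ticks. A user-space
sketch of the same pattern with C11 atomics (names reused for illustration, this is
not the kernel code):

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long stamp;
static unsigned long io_ticks;

/* Account elapsed ticks once per timestamp change, even with concurrent callers. */
static void update_io_ticks(unsigned long now, int end)
{
    unsigned long old = atomic_load(&stamp);

    if (old != now) {
        /* Succeeds for exactly one caller that saw the stale stamp. */
        if (atomic_compare_exchange_strong(&stamp, &old, now))
            io_ticks += end ? now - old : 1;
    }
}

int main(void)
{
    update_io_ticks(100, 0);
    update_io_ticks(105, 1);
    printf("io_ticks = %lu\n", io_ticks);
    return 0;
}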
---#
block/blk-core.c:1272
---
static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
{
unsigned long stamp;
again:
stamp = READ_ONCE(part->stamp);
if (unlikely(stamp != now)) {
if (likely(cmpxchg(&part->stamp, stamp, now) == stamp))
__part_stat_add(part, io_ticks, end ? now - stamp : 1);
}
if (part->partno) { ## x ##
part = &part_to_disk(part)->part0;
goto again;
}
}
---
`update_io_ticks` is the function that updates the io ticks of a partition.
`part->partno` is the partition number. If it is not 0, `part` is a partition, so
the whole-disk `part0` is retrieved and the loop repeats to account the ticks at
the disk level as well.
---#
block/blk-core.c:1775
---
/**
* blk_finish_plug - mark the end of a batch of submitted I/O
* @plug: The &struct blk_plug passed to blk_start_plug()
*
* Description:
* Indicate that a batch of I/O submissions is complete. This function
* must be paired with an initial call to blk_start_plug(). The intent
* is to allow the block layer to optimize I/O submission. See the
* documentation for blk_start_plug() for more information.
*/
void blk_finish_plug(struct blk_plug *plug)
{
if (plug != current->plug) ## x ##
return;
blk_flush_plug_list(plug, false);
current->plug = NULL;
}
---
`blk_finish_plug` is the function that marks the end of a batch of submitted
io. If `plug` is not the current plug, the function returns. Else the plug list
is flushed and the current plug is set to `NULL`.
---#
block/blk-mq-sched.c:341
---
/*
* Only SCSI implements .get_budget and .put_budget, and SCSI restarts
* its queue by itself in its completion handler, so we don't need to
* restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
*
* Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
* be run again. This is necessary to avoid starving flushes.
*/
static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
bool multi_hctxs = false, run_queue = false;
bool dispatched = false, busy = false;
unsigned int max_dispatch;
LIST_HEAD(rq_list);
int count = 0;
# ...
if (busy) ## x ##
return -EAGAIN;
return !!dispatched;
}
---
`__blk_mq_do_dispatch_sched` is the function that dispatches requests from the
scheduler. If the hctx (hardware context) was found busy, -EAGAIN is returned so
the dispatch is rerun. Otherwise `!!dispatched` is returned: 1 if anything was
dispatched, 0 if not.
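The `!!dispatched` idiom normalizes any non-zero value to 1, so the return value is
1 when at least one request was dispatched and 0 otherwise:

#include <stdio.h>

int main(void)
{
    int dispatched = 7;
    printf("%d\n", !!dispatched);  /* 1 */
    printf("%d\n", !!0);           /* 0 */
    return 0;
}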
---#
block/blk-mq.c:605
---
static void blk_mq_trigger_softirq(struct request *rq)
{
struct list_head *list;
unsigned long flags;
local_irq_save(flags);
list = this_cpu_ptr(&blk_cpu_done);
list_add_tail(&rq->ipi_list, list);
/*
* If the list only contains our just added request, signal a raise of
* the softirq. If there are already entries there, someone already
* raised the irq but it hasn't run yet.
*/
if (list->next == &rq->ipi_list)
raise_softirq_irqoff(BLOCK_SOFTIRQ); ## x ##
local_irq_restore(flags);
}
---
`blk_mq_trigger_softirq` is the function that queues a completed request on the
per-CPU `blk_cpu_done` list and triggers the softirq.
If `list->next == &rq->ipi_list`, the just-added request is the only entry on the
list, so BLOCK_SOFTIRQ is raised. Else, someone already raised it and it simply
hasn't run yet.
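The `list->next == &rq->ipi_list` test works because the per-CPU done list is a
circular doubly-linked list: immediately after adding the first entry to an empty
list, the head's `next` pointer is that entry. A minimal user-space sketch of that
property (simplified list, not the kernel's full list.h):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *new, struct list_head *head)
{
    new->prev = head->prev;
    new->next = head;
    head->prev->next = new;
    head->prev = new;
}

int main(void)
{
    struct list_head done, req_a, req_b;

    INIT_LIST_HEAD(&done);
    list_add_tail(&req_a, &done);
    printf("a is only entry? %s\n", done.next == &req_a ? "yes" : "no");  /* yes */

    list_add_tail(&req_b, &done);
    printf("b is only entry? %s\n", done.next == &req_b ? "yes" : "no");  /* no */
    return 0;
}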
---#
block/bounce.c:377
---
void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
{
mempool_t *pool;
/*
* Data-less bio, nothing to bounce
*/
if (!bio_has_data(*bio_orig))
return;
/*
* for non-isa bounce case, just check if the bounce pfn is equal
* to or bigger than the highest pfn in the system -- in that case,
* don't waste time iterating over bio segments
*/
if (!(q->bounce_gfp & GFP_DMA)) {
if (q->limits.bounce_pfn >= blk_max_pfn) ## x ##
return;
pool = &page_pool;
} else {
BUG_ON(!mempool_initialized(&isa_page_pool));
pool = &isa_page_pool;
}
/*
* slow path
*/
__blk_queue_bounce(q, bio_orig, pool);
}
---
`blk_queue_bounce` is the function that bounces a bio. If the bounce pfn is
equal to or bigger than the highest pfn in the system, the function returns.
---#
fs/exec.c:449
---
/**
* count() counts the number of strings in array ARGV.
*/
static int count(struct user_arg_ptr argv, int max)
{
int i = 0;
if (argv.ptr.native != NULL) {
for (;;) {
const char __user *p = get_user_arg_ptr(argv, i);
if (!p)
break;
if (IS_ERR(p))
return -EFAULT;
if (i >= max)
return -E2BIG;
++i;
if (fatal_signal_pending(current))
return -ERESTARTNOHAND;
cond_resched(); ## x ##
}
}
return i;
}
---
`count` is the function that counts the number of strings in an array.
`p` is the string at index `i` in the array.
`cond_resched` is called so the scheduler can run another task if needed while a
large argv is being walked.
---#
fs/exec.c:1022
---
/*
* Maps the mm_struct mm into the current task struct.
* On success, this function returns with the mutex
* exec_update_mutex locked.
*/
static int exec_mmap(struct mm_struct *mm)
{
struct task_struct *tsk;
struct mm_struct *old_mm, *active_mm;
int ret;
# ...
if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
activate_mm(active_mm, mm);
if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
tsk->mm->vmacache_seqnum = 0; ## x ##
vmacache_flush(tsk);
task_unlock(tsk);
if (old_mm) {
mmap_read_unlock(old_mm);
BUG_ON(active_mm != old_mm);
setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
mm_update_next_owner(old_mm);
mmput(old_mm);
return 0;
}
mmdrop(active_mm);
return 0;
}
---
`exec_mmap` is the function that maps the mm_struct `mm` into the current task
struct. Interrupts are re-enabled either before or after `activate_mm`, depending
on `CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM`. Resetting `vmacache_seqnum` and calling
`vmacache_flush` then invalidates the task's per-thread VMA cache so stale entries
from the old mm cannot be reused.
---#
fs/exec.c:1850
---
/*
* sys_execve() executes a new program.
*/
static int bprm_execve(struct linux_binprm *bprm,
int fd, struct filename *filename, int flags)
{
struct file *file;
struct files_struct *displaced;
int retval;
/*
* Cancel any io_uring activity across execve
*/
io_uring_task_cancel();
retval = unshare_files(&displaced);
if (retval)
return retval;
# ...
out_files:
if (displaced)
reset_files_struct(displaced); ## x ##
return retval;
}
---
`bprm_execve` is the function that executes a new program.
`displaced` is the old files struct that was unshared at the start of execve.
On the error path, `reset_files_struct` is called to restore it.
---#
---#
source file : line number
---
code line a
code line b ## x ## <--------- marks the specific line noted above
code line c
---
description of the code
an example:
---#
arch/x86/kernel/process.c:119
---
static int set_new_tls(struct task_struct *p, unsigned long tls)
{
struct user_desc __user *utls = (struct user_desc __user *)tls;
if (in_ia32_syscall()) ## x ##
return do_set_thread_area(p, -1, utls, 0);
else
return do_set_thread_area_64(p, ARCH_SET_FS, tls);
}
---
`in_ia32_syscall()` checks whether the current process is in a 32-bit syscall.
If so, `do_set_thread_area()` is used to set the `tls` (Thread Local Storage).
Otherwise, `do_set_thread_area_64()` is used to set the `tls`.
---#
fs/exec.c:196
---
/*
* The nascent bprm->mm is not visible until exec_mmap() but it can
* use a lot of memory, account these pages in current->mm temporary
* for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
* change the counter back via acct_arg_size(0).
*/
static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
{
struct mm_struct *mm = current->mm;
long diff = (long)(pages - bprm->vma_pages);
if (!mm || !diff)
return;
bprm->vma_pages = pages; ## x ##
add_mm_counter(mm, MM_ANONPAGES, diff);
}
---
`acct_arg_size` is the function that accounts the argument pages of the nascent
bprm against the current mm. If `mm` is NULL or `diff` is zero, the function
returns before touching the counter; otherwise `add_mm_counter` adds `diff` to the
MM_ANONPAGES counter.
---#
block/bio.c:258
---
static void bio_free(struct bio *bio)
{
struct bio_set *bs = bio->bi_pool;
void *p;
bio_uninit(bio);
if (bs) {
bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio)); ## x ##
/*
* If we have front padding, adjust the bio pointer before freeing
*/
p = bio;
p -= bs->front_pad;
mempool_free(p, &bs->bio_pool);
} else {
/* Bio was allocated by bio_kmalloc() */
kfree(bio);
}
}
---
`bio_free` is the function that frees the bio.
`bs` is the bio_set that the bio belongs to. if `bs` is null, then the bio was
allocated by `bio_kmalloc` and is freed by `kfree`. Otherwise, the bio is freed
by `mempool_free`.
---#
block/bio.c:876
---
/**
* __bio_try_merge_page - try appending data to an existing bvec.
* @bio: destination bio
* @page: start page to add
* @len: length of the data to add
* @off: offset of the data relative to @page
* @same_page: return if the segment has been merged inside the same page
*
* Try to add the data at @page + @off to the last bvec of @bio. This is a
* useful optimisation for file systems with a block size smaller than the
* page size.
*
* Warn if (@len, @off) crosses pages in case that @same_page is true.
*
* Return %true on success or %false on failure.
*/
bool __bio_try_merge_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off, bool *same_page)
{
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return false;
if (bio->bi_vcnt > 0) { ## x ##
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (page_is_mergeable(bv, page, len, off, same_page)) {
if (bio->bi_iter.bi_size > UINT_MAX - len) {
*same_page = false;
return false;
}
bv->bv_len += len;
bio->bi_iter.bi_size += len;
return true;
}
}
return false;
}
---
`__bio_try_merge_page` is the function that tries to merge the page to the bio.
`bio->bi_vcnt` is the number of bio_vecs in the bio. if `bio->bi_vcnt` is
greater than 0, then the bio has at least one bio_vec. `bv` is the last bio_vec
in the bio.
---#
mm/memory.c:488
---
static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
int i;
if (current->mm == mm)
sync_mm_rss(mm); ## x ##
for (i = 0; i < NR_MM_COUNTERS; i++)
if (rss[i])
add_mm_counter(mm, i, rss[i]);
}
---
`add_mm_rss_vec` is the function that adds cached rss deltas (Resident Set Size:
the portion of a process's memory that is resident in RAM) to the mm counters. If
the current task owns `mm`, `sync_mm_rss` first flushes the task's own cached rss
counts into it.
---#
mm/memory.c:1444
---
static void unmap_single_vma(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr,
struct zap_details *details)
{
unsigned long start = max(vma->vm_start, start_addr);
unsigned long end;
if (start >= vma->vm_end)
return; ## x ##
# ...
}
---
`unmap_single_vma` is the function that unmaps a single vma (virtual memory area)
from the tlb (Translation Lookaside Buffer).
`start` is the maximum of the vma start and the start address. if `start` is
greater than or equal to the vma end, then the function returns.
---#

How do I filter the trace down to just the application?

  • The longest-running process in the QEMU instance is the application I'm tracing.
  • So the CR3 value with the largest count corresponds to the application (a small counting sketch follows the listings below).
all pc
      1 CR3=0000000000000000
      1 CR3=0000000002d58000
      1 CR3=0000000100150000
      2 CR3=0000000101572000
      3 CR3=00000001015ba000
      6 CR3=000000010152a000
     11 CR3=000000000331f000
     27 CR3=000000010157e000
     28 CR3=0000000101608000
     93 CR3=00000001015fe000
    181 CR3=0000000101570000
    250 CR3=00000001014ea000
   1812 CR3=000000000260c000
 188926 CR3=000000010157c000 <-------- interested application

kernel pc
      1 CR3=0000000002d58000
      1 CR3=0000000100150000
      2 CR3=0000000101572000
      3 CR3=00000001015ba000
      4 CR3=000000010152a000
     18 CR3=000000010157e000
     19 CR3=00000001015fe000
     27 CR3=0000000101608000
     45 CR3=0000000101570000
     94 CR3=000000010157c000 <-------- interested application
    155 CR3=00000001014ea000
   1812 CR3=000000000260c000

user pc
      1 CR3=0000000000000000
      1 CR3=0000000101608000
      2 CR3=000000010152a000
      9 CR3=000000010157e000
     11 CR3=000000000331f000
     74 CR3=00000001015fe000
     95 CR3=00000001014ea000
    136 CR3=0000000101570000
 188832 CR3=000000010157c000 <-------- interested application
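
A minimal sketch of how counts like the ones above can be produced, assuming the
trace lines carry tokens of the form CR3=000000010157c000 (the token format and
reading from stdin are assumptions about the trace output, not part of it):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Count occurrences of each distinct CR3=... token on stdin and print them
 * sorted by count; the entry with the largest count is the traced application. */
#define MAX_CR3 1024

struct cr3_count { char cr3[32]; unsigned long count; };

static struct cr3_count table[MAX_CR3];
static int nr;

static int cmp(const void *a, const void *b)
{
    const struct cr3_count *x = a, *y = b;
    return (x->count > y->count) - (x->count < y->count);
}

int main(void)
{
    char line[1024];

    while (fgets(line, sizeof(line), stdin)) {
        char *p = strstr(line, "CR3=");
        if (!p)
            continue;

        char val[32];
        if (sscanf(p, "CR3=%31[0-9a-fA-F]", val) != 1)
            continue;

        int i;
        for (i = 0; i < nr; i++)
            if (!strcmp(table[i].cr3, val))
                break;
        if (i == nr) {
            if (nr == MAX_CR3)
                continue;
            strcpy(table[nr].cr3, val);
            table[nr].count = 0;
            nr++;
        }
        table[i].count++;
    }

    qsort(table, nr, sizeof(table[0]), cmp);
    for (int i = 0; i < nr; i++)
        printf("%7lu CR3=%s\n", table[i].count, table[i].cr3);
    return 0;
}

Piping the raw trace through this prints one line per CR3 value, smallest count
first; the last line is the traced application.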